wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (237) hide show
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,268 @@
1
+ """
2
+ Response diagnostics for autonomous agent analysis.
3
+
4
+ This module handles:
5
+ - Activation-based response quality assessment using trained classifiers
6
+ - Issue detection through model activations
7
+ - Classifier-based quality scoring
8
+ - Decision making for improvements needed
9
+ """
10
+
11
+ from dataclasses import dataclass
12
+ from typing import Any, Dict, List
13
+
14
+ from wisent_guard.core.activations import ActivationAggregationStrategy, Activations
15
+ from wisent_guard.core.classifier.classifier import Classifier
16
+
17
+ from ...layer import Layer
18
+ from ...model import Model
19
+
20
+
21
@dataclass
class AnalysisResult:
    """Outcome of one diagnostic pass over a model response."""

    # True when at least one issue type was flagged.
    has_issues: bool
    # Issue-type labels that were flagged.
    issues_found: List[str]
    # Aggregate confidence in the verdict.
    confidence: float
    # One human-readable improvement hint per flagged issue.
    suggestions: List[str]
    # Overall quality score for the response.
    quality_score: float
30
+
31
+
32
class ResponseDiagnostics:
    """Handles activation-based response analysis and quality assessment for autonomous agents.

    Each registered classifier is stored in ``self.classifiers`` as a dict with
    the keys ``"classifier"``, ``"layer"``, ``"issue_type"`` and ``"threshold"``.
    """

    def __init__(self, model: Model, classifier_configs: List[Dict[str, Any]]):
        """
        Initialize the diagnostics system.

        Configs that fail to load are reported on stdout and skipped; construction
        only fails when no classifier at all could be loaded.

        Args:
            model: The language model to extract activations from
            classifier_configs: List of classifier configurations with paths and layers
                Example: [
                    {"path": "./models/hallucination_classifier.pt", "layer": 15, "issue_type": "hallucination"},
                    {"path": "./models/quality_classifier.pt", "layer": 20, "issue_type": "quality"}
                ]
                ``issue_type`` defaults to ``"unknown"`` and ``threshold`` to ``0.5``.

        Raises:
            ValueError: If ``classifier_configs`` is empty.
            RuntimeError: If none of the configured classifiers could be loaded.
        """
        if not classifier_configs:
            raise ValueError("classifier_configs is required - no fallback mode available")

        self.model = model

        # Load classifiers
        self.classifiers = []
        for config in classifier_configs:
            # Resolve a printable label first so the failure message below can
            # never raise itself (e.g. when the "path" key is what is missing).
            source = config.get("path", "<missing path>") if hasattr(config, "get") else repr(config)
            try:
                entry = self._load_entry(
                    config["path"],
                    config["layer"],
                    config.get("issue_type", "unknown"),
                    config.get("threshold", 0.5),
                )
            except Exception as e:
                # Deliberate best-effort: one broken config must not prevent the
                # remaining classifiers from loading.
                print(f"⚠️ Failed to load classifier {source}: {e}")
                print(" Skipping this classifier and continuing...")
                continue

            # Register and report outside the try block so a problem in the
            # success message cannot be misreported as a load failure.
            self.classifiers.append(entry)
            print(f"✅ Loaded classifier for {entry['issue_type']} at layer {config['layer']}")

        if not self.classifiers:
            raise RuntimeError("Failed to load any classifiers - system cannot operate without them")

    @staticmethod
    def _load_entry(classifier_path: str, layer_index: int, issue_type: str, threshold: float) -> Dict[str, Any]:
        """Load a classifier from disk and wrap it in the dict stored in ``self.classifiers``."""
        classifier = Classifier()
        classifier.load_model(classifier_path)
        return {
            "classifier": classifier,
            "layer": Layer(index=layer_index, type="transformer"),
            "issue_type": issue_type,
            "threshold": threshold,
        }

    @staticmethod
    def _unwrap(result: Any) -> Any:
        """Return the first element of a list/tuple result, or the result itself otherwise."""
        return result[0] if isinstance(result, (list, tuple)) else result

    def _extract_features(self, response: str, classifier_config: Dict[str, Any]) -> Any:
        """Extract the model activations for one classifier's layer and return them as numpy features."""
        layer = classifier_config["layer"]
        activations = Activations(
            tensor=self.model.extract_activations(response, layer),
            layer=layer,
            aggregation_strategy=ActivationAggregationStrategy.LAST_TOKEN,
        )
        features = activations.extract_features_for_classifier()
        # Ensure a CPU tensor before the numpy conversion.
        features_cpu = features.cpu() if hasattr(features, "cpu") else features
        return features_cpu.numpy()

    async def analyze_response(self, response: str, prompt: str) -> AnalysisResult:
        """
        Analyze the response using trained classifiers and activation patterns.

        Args:
            response: The generated text to diagnose
            prompt: The prompt that produced the response (accepted for interface
                compatibility; not used by the analysis)

        Returns:
            AnalysisResult with the flagged issue types, aggregate confidence,
            suggestions and quality score.

        Raises:
            RuntimeError: If no classifier produced a result.
        """
        # Classifier-based analysis only
        classifier_results = self._analyze_with_classifiers(response)
        if not classifier_results:
            raise RuntimeError("No quality scores available - all classifiers failed")

        flagged = [result for result in classifier_results if result["has_issue"]]
        issues = [result["issue_type"] for result in flagged]

        # Quality is the mean of (1 - probability) over all classifiers. It is
        # derived from the probabilities computed above rather than by calling
        # _assess_quality_with_classifiers, which would run every activation
        # extraction and classifier a second time for the same number.
        quality_score = sum(1.0 - result["probability"] for result in classifier_results) / len(
            classifier_results
        )

        # Overall confidence: average over the flagged classifiers, or over all
        # classifiers when nothing was flagged.
        scored = flagged if flagged else classifier_results
        confidence = sum(result["confidence"] for result in scored) / len(scored)

        return AnalysisResult(
            has_issues=len(issues) > 0,
            issues_found=issues,
            confidence=confidence,
            suggestions=self._generate_suggestions(issues),
            quality_score=quality_score,
        )

    def _analyze_with_classifiers(self, response: str) -> List[Dict[str, Any]]:
        """
        Analyze response using trained classifiers.

        Returns:
            One dict per registered classifier with the keys ``issue_type``,
            ``has_issue``, ``confidence``, ``probability`` and ``prediction``.
        """
        results = []

        for classifier_config in self.classifiers:
            features = self._extract_features(response, classifier_config)
            classifier = classifier_config["classifier"]

            # Handle different classifier output formats
            prediction = self._unwrap(classifier.predict([features]))
            probability = float(self._unwrap(classifier.predict_proba([features])))

            results.append(
                {
                    "issue_type": classifier_config["issue_type"],
                    # An issue is flagged when the probability exceeds this classifier's threshold.
                    "has_issue": probability > classifier_config["threshold"],
                    # Distance from 0.5, rescaled to a 0-1 confidence.
                    "confidence": abs(probability - 0.5) * 2,
                    "probability": probability,
                    "prediction": int(prediction),
                }
            )

        return results

    def _assess_quality_with_classifiers(self, response: str) -> float:
        """
        Assess response quality using classifiers.

        Returns:
            The average of ``1 - probability`` across all registered classifiers.

        Raises:
            RuntimeError: If no classifier is registered.
        """
        quality_scores = []

        for classifier_config in self.classifiers:
            features = self._extract_features(response, classifier_config)
            probability = self._unwrap(classifier_config["classifier"].predict_proba([features]))

            # Convert probability to quality score
            # For most classifiers, low probability (closer to 0) = higher quality
            # This assumes classifiers are trained to detect problems (1 = problematic, 0 = good)
            quality_scores.append(1.0 - float(probability))

        if not quality_scores:
            raise RuntimeError("No quality scores available - all classifiers failed")

        # Use average quality across all classifiers
        return sum(quality_scores) / len(quality_scores)

    def _generate_suggestions(self, issues: List[str]) -> List[str]:
        """Generate one improvement suggestion per detected issue, in the same order."""
        # Map issue types to suggestions
        suggestion_map = {
            "hallucination": "Verify factual accuracy and provide evidence-based responses",
            "quality": "Improve response relevance, completeness, and clarity",
            "harmful": "Revise content to be safe and helpful",
            "bias": "Use more balanced and inclusive language",
            "gibberish": "Ensure response coherence and proper language structure",
            "repetitive": "Reduce repetition and vary language patterns",
            "incoherent": "Improve logical flow and sentence structure",
        }

        # Unknown issue types fall back to a generic message.
        return [
            suggestion_map[issue] if issue in suggestion_map else f"Address {issue} issue in the response"
            for issue in issues
        ]

    def decide_if_improvement_needed(
        self, analysis: AnalysisResult, quality_threshold: float = 0.7, confidence_threshold: float = 0.8
    ) -> bool:
        """
        Decide if improvement is needed based on analysis results.

        Args:
            analysis: Analysis result to evaluate
            quality_threshold: Minimum quality score required
            confidence_threshold: Minimum confidence required for improvement decision

        Returns:
            True if improvement is needed, False otherwise
        """
        # High confidence decisions
        if analysis.confidence >= confidence_threshold:
            if analysis.has_issues:
                return True
            if analysis.quality_score < quality_threshold:
                return True

        # Low confidence - be conservative
        return False

    def add_classifier(self, classifier_path: str, layer_index: int, issue_type: str, threshold: float = 0.5):
        """
        Add a new classifier to the diagnostic system.

        Unlike construction, a load failure here is not swallowed: any exception
        raised while loading propagates to the caller.

        Args:
            classifier_path: Path of the saved classifier to load
            layer_index: Index of the layer whose activations feed the classifier
            issue_type: Label reported when this classifier flags a response
            threshold: Probability above which the issue is considered present
        """
        self.classifiers.append(self._load_entry(classifier_path, layer_index, issue_type, threshold))
        print(f"✅ Added classifier for {issue_type} at layer {layer_index}")

    def get_available_classifiers(self) -> List[Dict[str, Any]]:
        """Get the issue type, layer index and threshold of every loaded classifier."""
        return [
            {"issue_type": config["issue_type"], "layer": config["layer"].index, "threshold": config["threshold"]}
            for config in self.classifiers
        ]