wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (237) hide show
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,94 @@
1
+ """
2
+ Task Relevance Selection for Wisent Guard.
3
+
4
+ This module provides functionality to select the most relevant tasks from the
5
+ lm-evaluation-harness library based on a user query or issue type.
6
+
7
+ Uses model-driven decisions instead of hardcoded patterns.
8
+ """
9
+
10
+ from typing import List, Dict, Set, Tuple
11
+ from .task_manager import get_available_tasks
12
+
13
+
14
class TaskRelevanceSelector:
    """Selects tasks based on model-driven relevance analysis.

    The wrapped model is prompted once per candidate task and asked for a
    numeric rating; the first number in its reply is parsed and clamped to
    the range 0.0-1.0.
    """

    # Upper bound on how many of the available tasks are scored per query.
    # Each candidate costs one model generation, so this caps the work done.
    # Subclasses may override it.
    _MAX_CANDIDATES = 100

    def __init__(self, model):
        """
        Store the model used for scoring.

        Args:
            model: Object whose ``generate(prompt, layer_index=...,
                max_new_tokens=..., temperature=...)`` returns a string.
        """
        self.model = model

    def find_relevant_tasks(
        self,
        query: str,
        max_results: int = 20,
        min_relevance_score: float = 0.1
    ) -> List[Tuple[str, float]]:
        """
        Find tasks most relevant to the given query using model decisions.

        Args:
            query: The search query (e.g., "hallucination detection", "bias", "truthfulness")
            max_results: Maximum number of tasks to return
            min_relevance_score: Minimum relevance score threshold (0.0 to 1.0)

        Returns:
            List of (task_name, relevance_score) tuples, sorted by relevance
            (highest first). Tasks scoring below ``min_relevance_score`` are
            omitted.
        """
        available_tasks = get_available_tasks()

        task_scores = []
        for task_name in available_tasks[: self._MAX_CANDIDATES]:
            score = self._get_model_relevance_score(query, task_name)
            if score >= min_relevance_score:
                task_scores.append((task_name, score))

        # list.sort is stable, so equally scored tasks keep their source order.
        task_scores.sort(key=lambda item: item[1], reverse=True)

        return task_scores[:max_results]

    def _get_model_relevance_score(self, query: str, task_name: str) -> float:
        """
        Ask the model how relevant ``task_name`` is to ``query``.

        Args:
            query: The user's search query.
            task_name: Name of the candidate task.

        Returns:
            The first number found in the model reply, clamped to [0.0, 1.0].
            Returns 0.0 when the reply contains no number or when generation
            or parsing raises an ordinary exception.
        """
        # Local import: ``re`` is not among the module's top-level imports.
        import re

        prompt = f"""Rate the relevance of this task for the given query.

Query: {query}
Task: {task_name}

Rate relevance from 0.0 to 1.0 (1.0 = highly relevant, 0.0 = not relevant).
Respond with only the number:"""

        try:
            response = self.model.generate(prompt, layer_index=15, max_new_tokens=10, temperature=0.1)
            # The first alternative accepts decimals with or without a leading
            # digit (".5", "0.5"); the second accepts bare integers ("1").
            match = re.search(r"\d*\.\d+|\d+", response.strip())
            if match is None:
                return 0.0
            return min(1.0, max(0.0, float(match.group(0))))
        except Exception:
            # Best-effort scoring: a failed generation counts as "not relevant".
            # KeyboardInterrupt/SystemExit are deliberately not caught here.
            return 0.0
74
+
75
+
76
def find_relevant_tasks(
    query: str,
    max_results: int = 20,
    min_relevance_score: float = 0.1,
    model=None
) -> List[Tuple[str, float]]:
    """
    Module-level convenience wrapper around TaskRelevanceSelector.

    Args:
        query: The search query to rank tasks against
        max_results: Maximum number of tasks to return
        min_relevance_score: Minimum relevance score threshold
        model: Model used for scoring; when None a default
            "meta-llama/Llama-3.1-8B-Instruct" model is constructed

    Returns:
        List of (task_name, relevance_score) tuples from the selector
    """
    scoring_model = model
    if scoring_model is None:
        # NOTE(review): the file listing for this release shows no
        # wisent/core/model.py, which is where this relative import would
        # resolve - confirm the default-model path actually imports.
        from ....model import Model
        scoring_model = Model("meta-llama/Llama-3.1-8B-Instruct")

    return TaskRelevanceSelector(scoring_model).find_relevant_tasks(
        query, max_results, min_relevance_score
    )
89
+
90
+
91
def get_top_relevant_tasks(query: str, count: int, model=None) -> List[str]:
    """
    Return only the names of the best-matching tasks for a query.

    Args:
        query: The search query to rank tasks against
        count: Maximum number of task names to return
        model: Optional model forwarded to find_relevant_tasks

    Returns:
        Task names in the order produced by find_relevant_tasks
    """
    ranked = find_relevant_tasks(query, max_results=count, model=model)
    names = []
    for task_name, _score in ranked:
        names.append(task_name)
    return names
@@ -0,0 +1,151 @@
1
+ """
2
+ Task Selector for intelligent task selection based on issue types.
3
+
4
+ This module provides functionality to select the most relevant lm-eval tasks
5
+ for training classifiers for specific issue types using model-driven decisions.
6
+ """
7
+
8
+ from typing import List, Dict, Any, Set, Tuple
9
+ from .task_manager import get_available_tasks
10
+
11
+
12
class TaskSelector:
    """Model-driven task selector for issue-type-specific training.

    The wrapped model is prompted for numeric ratings (relevance on a 0-1
    scale, quality on a 0-5 scale); the first number in each reply is parsed
    and clamped to the corresponding range.
    """

    # Upper bound on how many of the available tasks are scored per request.
    # Each candidate costs one model generation. Subclasses may override it.
    _MAX_CANDIDATES = 50

    def __init__(self, model):
        """
        Store the model used for scoring.

        Args:
            model: Object whose ``generate(prompt, layer_index=...,
                max_new_tokens=..., temperature=...)`` returns a string.
        """
        self.model = model

    def find_relevant_tasks_for_issue_type(self, issue_type: str, max_tasks: int = 10) -> List[str]:
        """
        Find the most relevant tasks for a specific issue type using model decisions.

        Args:
            issue_type: Type of issue to find tasks for
            max_tasks: Maximum number of tasks to return

        Returns:
            List of task names ranked by relevance (highest first). Tasks
            scored exactly 0.0 are omitted.
        """
        available_tasks = get_available_tasks()

        task_scores = []
        for task_name in available_tasks[: self._MAX_CANDIDATES]:
            score = self._get_model_task_relevance(issue_type, task_name)
            if score > 0.0:
                task_scores.append((task_name, score))

        # list.sort is stable, so equally scored tasks keep their source order.
        task_scores.sort(key=lambda item: item[1], reverse=True)
        return [task_name for task_name, _ in task_scores[:max_tasks]]

    def select_best_tasks_for_training(
        self,
        issue_type: str,
        min_tasks: int = 1,
        max_tasks: int = 10,
        quality_threshold: float = 1.5
    ) -> List[str]:
        """
        Select the best tasks for training a classifier for the given issue type.

        Args:
            issue_type: Type of issue to select tasks for
            min_tasks: Minimum number of tasks to select
            max_tasks: Maximum number of tasks to select
            quality_threshold: Minimum quality score for task inclusion

        Returns:
            List of selected task names, at most ``max_tasks`` long
        """
        # Ask for twice as many candidates as needed so the quality filter
        # has room to reject some.
        relevant_tasks = self.find_relevant_tasks_for_issue_type(issue_type, max_tasks * 2)

        selected_tasks = []
        for task_name in relevant_tasks:
            # Until min_tasks is reached every candidate is admitted regardless
            # of quality, so the quality generation is skipped in that case.
            if (
                len(selected_tasks) < min_tasks
                or self._get_model_task_quality(task_name) >= quality_threshold
            ):
                selected_tasks.append(task_name)
            if len(selected_tasks) >= max_tasks:
                break

        return selected_tasks[:max_tasks]

    def _score_from_model(self, prompt: str, upper: float, fallback: float) -> float:
        """
        Generate a reply for ``prompt`` and parse a bounded score from it.

        Args:
            prompt: Full prompt text sent to the model.
            upper: Largest score allowed; larger values are clamped to it.
            fallback: Value returned when no number can be obtained.

        Returns:
            The first number in the reply clamped to [0.0, upper], or
            ``fallback`` when the reply has no number or an ordinary
            exception is raised.
        """
        # Local import: ``re`` is not among the module's top-level imports.
        import re

        try:
            response = self.model.generate(prompt, layer_index=15, max_new_tokens=10, temperature=0.1)
            # The first alternative accepts decimals with or without a leading
            # digit (".5", "0.5"); the second accepts bare integers ("3").
            match = re.search(r"\d*\.\d+|\d+", response.strip())
            if match is None:
                return fallback
            return min(upper, max(0.0, float(match.group(0))))
        except Exception:
            # Best-effort scoring; KeyboardInterrupt/SystemExit still propagate.
            return fallback

    def _get_model_task_relevance(self, issue_type: str, task_name: str) -> float:
        """Get task relevance score (0.0-1.0) from the model; 0.0 on failure."""
        prompt = f"""Rate how relevant this task is for detecting/training on this issue type.

Issue Type: {issue_type}
Task: {task_name}

Rate relevance from 0.0 to 1.0 (1.0 = highly relevant, 0.0 = not relevant).
Respond with only the number:"""

        return self._score_from_model(prompt, upper=1.0, fallback=0.0)

    def _get_model_task_quality(self, task_name: str) -> float:
        """Get task quality assessment (0.0-5.0) from the model; 1.0 on failure."""
        prompt = f"""Rate the quality and reliability of this evaluation task for training AI safety classifiers.

Task: {task_name}

Consider factors like:
- Data quality and reliability
- Task design and clarity
- Usefulness for training safety classifiers

Rate quality from 0.0 to 5.0 (5.0 = excellent quality, 0.0 = poor quality).
Respond with only the number:"""

        return self._score_from_model(prompt, upper=5.0, fallback=1.0)
124
+
125
+
126
def find_relevant_tasks_for_issue_type(issue_type: str, max_tasks: int = 10, model=None) -> List[str]:
    """
    Module-level convenience wrapper around TaskSelector.

    Args:
        issue_type: Type of issue to find tasks for
        max_tasks: Maximum number of tasks to return
        model: Model used for scoring; when None a default
            "meta-llama/Llama-3.1-8B-Instruct" model is constructed

    Returns:
        List of task names as returned by the selector
    """
    scoring_model = model
    if scoring_model is None:
        # NOTE(review): the file listing for this release shows no
        # wisent/core/model.py, which is where this relative import would
        # resolve - confirm the default-model path actually imports.
        from ....model import Model
        scoring_model = Model("meta-llama/Llama-3.1-8B-Instruct")

    return TaskSelector(scoring_model).find_relevant_tasks_for_issue_type(issue_type, max_tasks)
134
+
135
+
136
def select_best_tasks_for_training(
    issue_type: str,
    min_tasks: int = 1,
    max_tasks: int = 10,
    quality_threshold: float = 1.5,
    model=None
) -> List[str]:
    """
    Module-level convenience wrapper for choosing training tasks.

    Args:
        issue_type: Type of issue to select tasks for
        min_tasks: Minimum number of tasks to select
        max_tasks: Maximum number of tasks to select
        quality_threshold: Minimum quality score for task inclusion
        model: Model used for scoring; when None a default
            "meta-llama/Llama-3.1-8B-Instruct" model is constructed

    Returns:
        List of selected task names as returned by the selector
    """
    scoring_model = model
    if scoring_model is None:
        # NOTE(review): the file listing for this release shows no
        # wisent/core/model.py, which is where this relative import would
        # resolve - confirm the default-model path actually imports.
        from ....model import Model
        scoring_model = Model("meta-llama/Llama-3.1-8B-Instruct")

    task_selector = TaskSelector(scoring_model)
    chosen = task_selector.select_best_tasks_for_training(
        issue_type, min_tasks, max_tasks, quality_threshold
    )
    return chosen
@@ -0,0 +1,71 @@
1
+ import sys
2
+ import time
3
+ import signal
4
+ from pathlib import Path
5
+
6
+ # Add the project root to the path
7
+ project_root = Path(__file__).parent.parent.parent.parent
8
+ sys.path.insert(0, str(project_root))
9
+
10
+ from wisent_guard.core.model import Model
11
+ from wisent_guard.core.agent.diagnose.synthetic_classifier_option import (
12
+ create_classifiers_for_prompt,
13
+ apply_classifiers_to_response
14
+ )
15
+ from wisent_guard.core.agent.budget import set_time_budget
16
+
17
class TimeoutError(Exception):
    """Signals that the test ran longer than its allowed time.

    Within this module the name shadows the built-in ``TimeoutError``; this
    class derives directly from ``Exception``.
    """
20
+
21
def timeout_handler(signum, frame):
    """Signal handler that aborts the test by raising TimeoutError.

    Args:
        signum: Signal number passed by the signal machinery (unused).
        frame: Interrupted stack frame (unused).

    Raises:
        TimeoutError: Always.
    """
    message = "Test exceeded time budget!"
    raise TimeoutError(message)
23
+
24
def main():
    """Run the synthetic-classifier smoke test under a hard wall-clock limit.

    Sets the internal time budget, arms a SIGALRM alarm, builds the model,
    creates classifiers for a fixed prompt and prints a timing report. Exits
    the process with status 1 on timeout or on any other exception.
    """
    # The budget and the hard timeout are configured independently.
    budget_minutes = 1.0  # internal budget for classifier creation
    timeout_seconds = 120  # hard limit for the whole test process
    set_time_budget(budget_minutes)

    print(f"⏱️ Starting synthetic classifier test with {timeout_seconds}s timeout and {budget_minutes*60}s budget...")

    # Arm the alarm; timeout_handler raises once it fires.
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(timeout_seconds)

    started_at = time.time()

    try:
        model = Model(name="meta-llama/Llama-3.1-8B-Instruct")
        prompt = "What is the capital of France?"

        # Exercise the system under test.
        classifiers, trait_discovery = create_classifiers_for_prompt(model, prompt)

        # Finished in time: disarm the pending alarm.
        signal.alarm(0)

        took = time.time() - started_at
        print(f"✅ SUCCESS: Created {len(classifiers)} classifiers for {len(trait_discovery.traits_discovered)} traits")
        print(f"⏱️ Total time: {took:.1f}s (timeout: {timeout_seconds}s, budget: {budget_minutes*60}s)")

        if not took > timeout_seconds:
            print(f"🎉 Test completed within timeout with {timeout_seconds - took:.1f}s to spare!")
        else:
            print(f"⚠️ WARNING: Test completed but exceeded timeout by {took - timeout_seconds:.1f}s")

    except TimeoutError as timeout_exc:
        took = time.time() - started_at
        print(f"❌ ERROR: {timeout_exc}")
        print(f"❌ Test failed after {took:.1f}s (timeout: {timeout_seconds}s, budget: {budget_minutes*60}s)")
        print("❌ This indicates a performance issue that needs investigation.")
        sys.exit(1)
    except Exception as failure:
        # Disarm the alarm so it cannot fire while the failure is reported.
        signal.alarm(0)
        took = time.time() - started_at
        print(f"❌ ERROR: Test failed with exception: {failure}")
        print(f"❌ Time elapsed: {took:.1f}s")
        sys.exit(1)
69
+
70
# Run the timed smoke test only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1,242 @@
1
+ """
2
+ Diagnostic module for autonomous agent response analysis.
3
+
4
+ This module handles:
5
+ - Activation-based response quality assessment using trained classifiers
6
+ - Issue detection through model activations
7
+ - Classifier-based quality scoring
8
+ - Decision making for improvements needed
9
+ """
10
+
11
+ from dataclasses import dataclass
12
+ from typing import Any, Dict, List
13
+
14
+ from wisent_guard.core.activations import ActivationAggregationStrategy, Activations
15
+ from wisent_guard.core.classifier.classifier import Classifier
16
+
17
+ from ..layer import Layer
18
+ from ..model import Model
19
+
20
+
21
+ @dataclass
22
+ class AnalysisResult:
23
+ """Result of self-analysis."""
24
+
25
+ has_issues: bool
26
+ issues_found: List[str]
27
+ confidence: float
28
+ suggestions: List[str]
29
+ quality_score: float
30
+
31
+
32
class ResponseDiagnostics:
    """Handles activation-based response analysis and quality assessment for autonomous agents.

    Each registered classifier is stored as a dict with the keys
    ``"classifier"``, ``"layer"``, ``"issue_type"`` and ``"threshold"``.
    """

    # Improvement hints for known issue types; any other issue type gets a
    # generic message (see _generate_suggestions).
    _SUGGESTIONS = {
        "hallucination": "Verify factual accuracy and provide evidence-based responses",
        "quality": "Improve response relevance, completeness, and clarity",
        "harmful": "Revise content to be safe and helpful",
        "bias": "Use more balanced and inclusive language",
        "gibberish": "Ensure response coherence and proper language structure",
        "repetitive": "Reduce repetition and vary language patterns",
        "incoherent": "Improve logical flow and sentence structure",
    }

    def __init__(self, model: Model, classifier_configs: List[Dict[str, Any]]):
        """
        Initialize the diagnostics system.

        Args:
            model: The language model to extract activations from
            classifier_configs: List of classifier configurations with paths and layers
                Example: [
                    {"path": "./models/hallucination_classifier.pt", "layer": 15, "issue_type": "hallucination"},
                    {"path": "./models/quality_classifier.pt", "layer": 20, "issue_type": "quality"}
                ]
                ``issue_type`` defaults to "unknown" and ``threshold`` to 0.5.

        Raises:
            ValueError: If ``classifier_configs`` is empty or None.
            RuntimeError: If no classifier ended up loaded.
        """
        if not classifier_configs:
            raise ValueError("classifier_configs is required - no fallback mode available")

        self.model = model

        # Load classifiers
        self.classifiers = []
        for config in classifier_configs:
            # Resolve the optional key once so the stored entry and the log
            # line agree (indexing config["issue_type"] directly would raise
            # KeyError for configs that rely on the default).
            issue_type = config.get("issue_type", "unknown")
            self.classifiers.append(
                self._load_classifier_entry(
                    config["path"], config["layer"], issue_type, config.get("threshold", 0.5)
                )
            )
            print(f"✅ Loaded classifier for {issue_type} at layer {config['layer']}")

        if not self.classifiers:
            raise RuntimeError("Failed to load any classifiers - system cannot operate without them")

    @staticmethod
    def _load_classifier_entry(path: str, layer_index: int, issue_type: str, threshold: float) -> Dict[str, Any]:
        """Load a classifier from ``path`` and build its registry entry."""
        classifier = Classifier()
        classifier.load_model(path)
        return {
            "classifier": classifier,
            "layer": Layer(index=layer_index, type="transformer"),
            "issue_type": issue_type,
            "threshold": threshold,
        }

    async def analyze_response(self, response: str, prompt: str) -> AnalysisResult:
        """
        Analyze the response using trained classifiers and activation patterns.

        Args:
            response: The generated text to analyze.
            prompt: The prompt that produced the response (currently unused).

        Returns:
            AnalysisResult with the flagged issue types, the mean classifier
            confidence, suggestions for each issue and the mean quality score.

        Raises:
            RuntimeError: If no classifier produced a result.
        """
        # Classifier-based analysis only
        classifier_results = self._analyze_with_classifiers(response)

        issues = [result["issue_type"] for result in classifier_results if result["has_issue"]]
        # Every classifier contributes a confidence value, whether or not it
        # flagged an issue, so a clean response still gets a confidence.
        confidence_scores = [result["confidence"] for result in classifier_results]

        # Quality is derived from the probabilities computed above rather than
        # re-running activation extraction and prediction for every classifier.
        # This also raises when there are no results at all.
        quality_score = self._mean_quality([result["probability"] for result in classifier_results])

        confidence = sum(confidence_scores) / len(confidence_scores)

        # Generate suggestions based on detected issues
        suggestions = self._generate_suggestions(issues)

        return AnalysisResult(
            has_issues=len(issues) > 0,
            issues_found=issues,
            confidence=confidence,
            suggestions=suggestions,
            quality_score=quality_score,
        )

    def _extract_features(self, response: str, classifier_config: Dict[str, Any]):
        """Extract classifier input features for ``response`` at the entry's layer."""
        activations_tensor = self.model.extract_activations(response, classifier_config["layer"])
        activations = Activations(
            tensor=activations_tensor,
            layer=classifier_config["layer"],
            aggregation_strategy=ActivationAggregationStrategy.LAST_TOKEN,
        )
        return activations.extract_features_for_classifier()

    @staticmethod
    def _mean_quality(probabilities: List[float]) -> float:
        """
        Convert classifier probabilities into an average quality score.

        Each probability ``p`` contributes ``1.0 - p``; this assumes
        classifiers are trained to detect problems (1 = problematic, 0 = good).

        Raises:
            RuntimeError: If ``probabilities`` is empty.
        """
        if not probabilities:
            raise RuntimeError("No quality scores available - all classifiers failed")
        return sum(1.0 - probability for probability in probabilities) / len(probabilities)

    def _analyze_with_classifiers(self, response: str) -> List[Dict[str, Any]]:
        """
        Analyze response using trained classifiers.

        Returns:
            One dict per registered classifier with the keys ``issue_type``,
            ``has_issue``, ``confidence``, ``probability`` and ``prediction``.
        """
        results = []

        for classifier_config in self.classifiers:
            features = self._extract_features(response, classifier_config)
            feature_array = features.numpy()

            classifier = classifier_config["classifier"]
            prediction = classifier.predict([feature_array])[0]
            probability = float(classifier.predict_proba([feature_array])[0])

            results.append(
                {
                    "issue_type": classifier_config["issue_type"],
                    # An issue is reported when the probability exceeds the
                    # per-classifier threshold.
                    "has_issue": probability > classifier_config["threshold"],
                    # Distance from 0.5 rescaled to the 0-1 range.
                    "confidence": abs(probability - 0.5) * 2,
                    "probability": probability,
                    "prediction": int(prediction),
                }
            )

        return results

    def _assess_quality_with_classifiers(self, response: str) -> float:
        """
        Assess response quality using classifiers.

        Returns:
            Average of ``1 - probability`` across all registered classifiers.

        Raises:
            RuntimeError: If no classifier is registered.
        """
        probabilities = []
        for classifier_config in self.classifiers:
            features = self._extract_features(response, classifier_config)
            classifier = classifier_config["classifier"]
            probabilities.append(float(classifier.predict_proba([features.numpy()])[0]))

        return self._mean_quality(probabilities)

    def _generate_suggestions(self, issues: List[str]) -> List[str]:
        """
        Generate improvement suggestions based on detected issues.

        Returns:
            One suggestion per issue, in the same order; unknown issue types
            get a generic "Address <issue> issue in the response" message.
        """
        return [
            self._SUGGESTIONS.get(issue, f"Address {issue} issue in the response")
            for issue in issues
        ]

    def decide_if_improvement_needed(
        self, analysis: AnalysisResult, quality_threshold: float = 0.7, confidence_threshold: float = 0.8
    ) -> bool:
        """
        Decide if the response needs improvement based on classifier results.

        Improvement is needed when any of the following holds:
          1. The quality score is below ``quality_threshold``.
          2. Issues were detected and confidence exceeds ``confidence_threshold``.
          3. Two or more issues were found.
        """
        if analysis.quality_score < quality_threshold:
            return True

        if analysis.confidence > confidence_threshold and analysis.has_issues:
            return True

        if len(analysis.issues_found) >= 2:
            return True

        return False

    def add_classifier(self, classifier_path: str, layer_index: int, issue_type: str, threshold: float = 0.5):
        """
        Add a new classifier to the diagnostic system.

        Args:
            classifier_path: Path passed to the classifier's ``load_model``.
            layer_index: Index of the layer the classifier reads.
            issue_type: Label reported when this classifier flags a response.
            threshold: Probability above which an issue is reported.
        """
        self.classifiers.append(
            self._load_classifier_entry(classifier_path, layer_index, issue_type, threshold)
        )
        print(f"✅ Added classifier for {issue_type} at layer {layer_index}")

    def get_available_classifiers(self) -> List[Dict[str, Any]]:
        """Get information about loaded classifiers (issue type, layer index, threshold)."""
        return [
            {"issue_type": config["issue_type"], "layer": config["layer"].index, "threshold": config["threshold"]}
            for config in self.classifiers
        ]