wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic.
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.2.dist-info/METADATA +67 -0
- wisent-0.5.2.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,94 @@
+"""
+Task Relevance Selection for Wisent Guard.
+
+This module provides functionality to select the most relevant tasks from the
+lm-evaluation-harness library based on a user query or issue type.
+
+Uses model-driven decisions instead of hardcoded patterns.
+"""
+
+from typing import List, Dict, Set, Tuple
+from .task_manager import get_available_tasks
+
+
+class TaskRelevanceSelector:
+    """Selects tasks based on model-driven relevance analysis."""
+
+    def __init__(self, model):
+        self.model = model
+
+    def find_relevant_tasks(
+        self,
+        query: str,
+        max_results: int = 20,
+        min_relevance_score: float = 0.1
+    ) -> List[Tuple[str, float]]:
+        """
+        Find tasks most relevant to the given query using model decisions.
+
+        Args:
+            query: The search query (e.g., "hallucination detection", "bias", "truthfulness")
+            max_results: Maximum number of tasks to return
+            min_relevance_score: Minimum relevance score threshold (0.0 to 1.0)
+
+        Returns:
+            List of (task_name, relevance_score) tuples, sorted by relevance
+        """
+        available_tasks = get_available_tasks()
+
+        # Use model to score task relevance
+        task_scores = []
+        for task_name in available_tasks[:100]: # Limit for efficiency
+            score = self._get_model_relevance_score(query, task_name)
+            if score >= min_relevance_score:
+                task_scores.append((task_name, score))
+
+        # Sort by relevance score (descending)
+        task_scores.sort(key=lambda x: x[1], reverse=True)
+
+        return task_scores[:max_results]
+
+    def _get_model_relevance_score(self, query: str, task_name: str) -> float:
+        """Get relevance score from the model."""
+        prompt = f"""Rate the relevance of this task for the given query.
+
+Query: {query}
+Task: {task_name}
+
+Rate relevance from 0.0 to 1.0 (1.0 = highly relevant, 0.0 = not relevant).
+Respond with only the number:"""
+
+        try:
+            response = self.model.generate(prompt, layer_index=15, max_new_tokens=10, temperature=0.1)
+            score_str = response.strip()
+
+            # Extract number from response
+            import re
+            match = re.search(r'(\d+\.?\d*)', score_str)
+            if match:
+                score = float(match.group(1))
+                return min(1.0, max(0.0, score)) # Clamp to [0,1]
+            return 0.0
+        except:
+            return 0.0
+
+
+def find_relevant_tasks(
+    query: str,
+    max_results: int = 20,
+    min_relevance_score: float = 0.1,
+    model=None
+) -> List[Tuple[str, float]]:
+    """Standalone function for task relevance selection."""
+    if model is None:
+        from ....model import Model
+        model = Model("meta-llama/Llama-3.1-8B-Instruct")
+
+    selector = TaskRelevanceSelector(model)
+    return selector.find_relevant_tasks(query, max_results, min_relevance_score)
+
+
+def get_top_relevant_tasks(query: str, count: int, model=None) -> List[str]:
+    """Get top N relevant tasks for a query."""
+    results = find_relevant_tasks(query, max_results=count, model=model)
+    return [task_name for task_name, _ in results]
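For orientation, a minimal usage sketch of the selector added in the hunk above (apparently wisent/core/agent/diagnose/tasks/task_relevance.py, judging by the +94 entry in the file listing). The import paths and the Model(name=...) constructor are assumptions borrowed from the package's own test script further down, not verified against the released wheel.

# Hedged sketch, not part of the diff: rank lm-eval tasks for a query with the
# TaskRelevanceSelector shown above. Assumes wisent.core.model.Model exposes
# generate(...) as the module's own calls imply.
from wisent.core.model import Model
from wisent.core.agent.diagnose.tasks.task_relevance import TaskRelevanceSelector

model = Model(name="meta-llama/Llama-3.1-8B-Instruct")
selector = TaskRelevanceSelector(model)
for task_name, score in selector.find_relevant_tasks("hallucination detection", max_results=5):
    print(f"{task_name}: {score:.2f}")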
@@ -0,0 +1,151 @@
+"""
+Task Selector for intelligent task selection based on issue types.
+
+This module provides functionality to select the most relevant lm-eval tasks
+for training classifiers for specific issue types using model-driven decisions.
+"""
+
+from typing import List, Dict, Any, Set, Tuple
+from .task_manager import get_available_tasks
+
+
+class TaskSelector:
+    """Model-driven task selector for issue-type-specific training."""
+
+    def __init__(self, model):
+        self.model = model
+
+    def find_relevant_tasks_for_issue_type(self, issue_type: str, max_tasks: int = 10) -> List[str]:
+        """
+        Find the most relevant tasks for a specific issue type using model decisions.
+
+        Args:
+            issue_type: Type of issue to find tasks for
+            max_tasks: Maximum number of tasks to return
+
+        Returns:
+            List of task names ranked by relevance
+        """
+        available_tasks = get_available_tasks()
+
+        # Use model to score task relevance for the issue type
+        task_scores = []
+        for task_name in available_tasks[:50]: # Limit for efficiency
+            score = self._get_model_task_relevance(issue_type, task_name)
+            if score > 0.0:
+                task_scores.append((task_name, score))
+
+        # Sort by relevance score (descending) and return top tasks
+        task_scores.sort(key=lambda x: x[1], reverse=True)
+        return [task_name for task_name, _ in task_scores[:max_tasks]]
+
+    def select_best_tasks_for_training(
+        self,
+        issue_type: str,
+        min_tasks: int = 1,
+        max_tasks: int = 10,
+        quality_threshold: float = 1.5
+    ) -> List[str]:
+        """
+        Select the best tasks for training a classifier for the given issue type.
+
+        Args:
+            issue_type: Type of issue to select tasks for
+            min_tasks: Minimum number of tasks to select
+            max_tasks: Maximum number of tasks to select
+            quality_threshold: Minimum quality score for task inclusion
+
+        Returns:
+            List of selected task names
+        """
+        # Get relevant tasks using model decisions
+        relevant_tasks = self.find_relevant_tasks_for_issue_type(issue_type, max_tasks * 2)
+
+        # Use model to evaluate task quality
+        selected_tasks = []
+        for task_name in relevant_tasks:
+            quality_score = self._get_model_task_quality(task_name)
+            if quality_score >= quality_threshold or len(selected_tasks) < min_tasks:
+                selected_tasks.append(task_name)
+            if len(selected_tasks) >= max_tasks:
+                break
+
+        return selected_tasks[:max_tasks]
+
+    def _get_model_task_relevance(self, issue_type: str, task_name: str) -> float:
+        """Get task relevance score from the model."""
+        prompt = f"""Rate how relevant this task is for detecting/training on this issue type.
+
+Issue Type: {issue_type}
+Task: {task_name}
+
+Rate relevance from 0.0 to 1.0 (1.0 = highly relevant, 0.0 = not relevant).
+Respond with only the number:"""
+
+        try:
+            response = self.model.generate(prompt, layer_index=15, max_new_tokens=10, temperature=0.1)
+            score_str = response.strip()
+
+            import re
+            match = re.search(r'(\d+\.?\d*)', score_str)
+            if match:
+                score = float(match.group(1))
+                return min(1.0, max(0.0, score))
+            return 0.0
+        except:
+            return 0.0
+
+    def _get_model_task_quality(self, task_name: str) -> float:
+        """Get task quality assessment from the model."""
+        prompt = f"""Rate the quality and reliability of this evaluation task for training AI safety classifiers.
+
+Task: {task_name}
+
+Consider factors like:
+- Data quality and reliability
+- Task design and clarity
+- Usefulness for training safety classifiers
+
+Rate quality from 0.0 to 5.0 (5.0 = excellent quality, 0.0 = poor quality).
+Respond with only the number:"""
+
+        try:
+            response = self.model.generate(prompt, layer_index=15, max_new_tokens=10, temperature=0.1)
+            score_str = response.strip()
+
+            import re
+            match = re.search(r'(\d+\.?\d*)', score_str)
+            if match:
+                score = float(match.group(1))
+                return min(5.0, max(0.0, score))
+            return 1.0
+        except:
+            return 1.0
+
+
+def find_relevant_tasks_for_issue_type(issue_type: str, max_tasks: int = 10, model=None) -> List[str]:
+    """Standalone function for finding relevant tasks."""
+    if model is None:
+        from ....model import Model
+        model = Model("meta-llama/Llama-3.1-8B-Instruct")
+
+    selector = TaskSelector(model)
+    return selector.find_relevant_tasks_for_issue_type(issue_type, max_tasks)
+
+
+def select_best_tasks_for_training(
+    issue_type: str,
+    min_tasks: int = 1,
+    max_tasks: int = 10,
+    quality_threshold: float = 1.5,
+    model=None
+) -> List[str]:
+    """Standalone function for selecting best training tasks."""
+    if model is None:
+        from ....model import Model
+        model = Model("meta-llama/Llama-3.1-8B-Instruct")
+
+    selector = TaskSelector(model)
+    return selector.select_best_tasks_for_training(
+        issue_type, min_tasks, max_tasks, quality_threshold
+    )
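A similarly hedged sketch of how the training-task selection above might be driven end to end. The module path mirrors the file listing (wisent/core/agent/diagnose/tasks/task_selector.py) and the Model constructor is again taken from the test script below; both are assumptions.

# Hedged sketch, not part of the diff: pick training tasks for one issue type
# via the TaskSelector shown above.
from wisent.core.model import Model
from wisent.core.agent.diagnose.tasks.task_selector import TaskSelector

model = Model(name="meta-llama/Llama-3.1-8B-Instruct")
selector = TaskSelector(model)
tasks = selector.select_best_tasks_for_training(
    "hallucination", min_tasks=2, max_tasks=5, quality_threshold=1.5
)
print(tasks)  # a handful of lm-eval task names, ranked by model-judged quality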
@@ -0,0 +1,71 @@
+import sys
+import time
+import signal
+from pathlib import Path
+
+# Add the project root to the path
+project_root = Path(__file__).parent.parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from wisent.core.model import Model
+from wisent.core.agent.diagnose.synthetic_classifier_option import (
+    create_classifiers_for_prompt,
+    apply_classifiers_to_response
+)
+from wisent.core.agent.budget import set_time_budget
+
+class TimeoutError(Exception):
+    """Raised when test exceeds time budget."""
+    pass
+
+def timeout_handler(signum, frame):
+    raise TimeoutError("Test exceeded time budget!")
+
+def main():
+    # Set budget and timeout separately
+    budget_minutes = 1.0 # 1 minute - internal budget for classifier creation
+    timeout_seconds = 120 # 2 minutes - hard timeout for the test process
+    set_time_budget(budget_minutes)
+
+    print(f"⏱️ Starting synthetic classifier test with {timeout_seconds}s timeout and {budget_minutes*60}s budget...")
+
+    # Set up timeout signal
+    signal.signal(signal.SIGALRM, timeout_handler)
+    signal.alarm(timeout_seconds)
+
+    start_time = time.time()
+
+    try:
+        model = Model(name="meta-llama/Llama-3.1-8B-Instruct")
+        prompt = "What is the capital of France?"
+
+        # Test the system
+        classifiers, trait_discovery = create_classifiers_for_prompt(model, prompt)
+
+        # Clear the alarm since we completed successfully
+        signal.alarm(0)
+
+        elapsed_time = time.time() - start_time
+        print(f"✅ SUCCESS: Created {len(classifiers)} classifiers for {len(trait_discovery.traits_discovered)} traits")
+        print(f"⏱️ Total time: {elapsed_time:.1f}s (timeout: {timeout_seconds}s, budget: {budget_minutes*60}s)")
+
+        if elapsed_time > timeout_seconds:
+            print(f"⚠️ WARNING: Test completed but exceeded timeout by {elapsed_time - timeout_seconds:.1f}s")
+        else:
+            print(f"🎉 Test completed within timeout with {timeout_seconds - elapsed_time:.1f}s to spare!")
+
+    except TimeoutError as e:
+        elapsed_time = time.time() - start_time
+        print(f"❌ ERROR: {e}")
+        print(f"❌ Test failed after {elapsed_time:.1f}s (timeout: {timeout_seconds}s, budget: {budget_minutes*60}s)")
+        print("❌ This indicates a performance issue that needs investigation.")
+        sys.exit(1)
+    except Exception as e:
+        signal.alarm(0) # Clear timeout
+        elapsed_time = time.time() - start_time
+        print(f"❌ ERROR: Test failed with exception: {e}")
+        print(f"❌ Time elapsed: {elapsed_time:.1f}s")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,242 @@
+"""
+Diagnostic module for autonomous agent response analysis.
+
+This module handles:
+- Activation-based response quality assessment using trained classifiers
+- Issue detection through model activations
+- Classifier-based quality scoring
+- Decision making for improvements needed
+"""
+
+from dataclasses import dataclass
+from typing import Any, Dict, List
+
+from wisent.core.activations import ActivationAggregationStrategy, Activations
+from wisent.core.classifier.classifier import Classifier
+
+from ..layer import Layer
+from ..model import Model
+
+
+@dataclass
+class AnalysisResult:
+    """Result of self-analysis."""
+
+    has_issues: bool
+    issues_found: List[str]
+    confidence: float
+    suggestions: List[str]
+    quality_score: float
+
+
+class ResponseDiagnostics:
+    """Handles activation-based response analysis and quality assessment for autonomous agents."""
+
+    def __init__(self, model: Model, classifier_configs: List[Dict[str, Any]]):
+        """
+        Initialize the diagnostics system.
+
+        Args:
+            model: The language model to extract activations from
+            classifier_configs: List of classifier configurations with paths and layers
+                Example: [
+                    {"path": "./models/hallucination_classifier.pt", "layer": 15, "issue_type": "hallucination"},
+                    {"path": "./models/quality_classifier.pt", "layer": 20, "issue_type": "quality"}
+                ]
+        """
+        if not classifier_configs:
+            raise ValueError("classifier_configs is required - no fallback mode available")
+
+        self.model = model
+
+        # Load classifiers
+        self.classifiers = []
+        for config in classifier_configs:
+            classifier = Classifier()
+            classifier.load_model(config["path"])
+
+            self.classifiers.append(
+                {
+                    "classifier": classifier,
+                    "layer": Layer(index=config["layer"], type="transformer"),
+                    "issue_type": config.get("issue_type", "unknown"),
+                    "threshold": config.get("threshold", 0.5),
+                }
+            )
+            print(f"✅ Loaded classifier for {config['issue_type']} at layer {config['layer']}")
+
+        if not self.classifiers:
+            raise RuntimeError("Failed to load any classifiers - system cannot operate without them")
+
+    async def analyze_response(self, response: str, prompt: str) -> AnalysisResult:
+        """Analyze the response using trained classifiers and activation patterns."""
+        issues = []
+        confidence_scores = []
+
+        # Classifier-based analysis only
+        classifier_results = self._analyze_with_classifiers(response)
+
+        for result in classifier_results:
+            if result["has_issue"]:
+                issues.append(result["issue_type"])
+            confidence_scores.append(result["confidence"])
+
+        # Quality assessment using classifiers
+        quality_score = self._assess_quality_with_classifiers(response)
+
+        # Overall confidence - requires at least one confidence score
+        if not confidence_scores:
+            raise RuntimeError("No confidence scores available - all classifiers failed")
+        confidence = sum(confidence_scores) / len(confidence_scores)
+
+        # Generate suggestions based on detected issues
+        suggestions = self._generate_suggestions(issues)
+
+        result = AnalysisResult(
+            has_issues=len(issues) > 0,
+            issues_found=issues,
+            confidence=confidence,
+            suggestions=suggestions,
+            quality_score=quality_score,
+        )
+
+        return result
+
+    def _analyze_with_classifiers(self, response: str) -> List[Dict[str, Any]]:
+        """Analyze response using trained classifiers."""
+        results = []
+
+        for classifier_config in self.classifiers:
+            # Extract activations for this classifier's layer
+            activations_tensor = self.model.extract_activations(response, classifier_config["layer"])
+
+            # Create Activations object
+            activations = Activations(
+                tensor=activations_tensor,
+                layer=classifier_config["layer"],
+                aggregation_strategy=ActivationAggregationStrategy.LAST_TOKEN,
+            )
+
+            # Get features for classifier
+            features = activations.extract_features_for_classifier()
+
+            # Get classifier prediction
+            classifier = classifier_config["classifier"]
+            prediction = classifier.predict([features.numpy()])[0]
+            probability = classifier.predict_proba([features.numpy()])[0]
+
+            # Determine if this indicates an issue
+            threshold = classifier_config["threshold"]
+            has_issue = float(probability) > threshold
+            confidence = abs(float(probability) - 0.5) * 2 # Convert to 0-1 confidence
+
+            results.append(
+                {
+                    "issue_type": classifier_config["issue_type"],
+                    "has_issue": has_issue,
+                    "confidence": confidence,
+                    "probability": float(probability),
+                    "prediction": int(prediction),
+                }
+            )
+
+        return results
+
+    def _assess_quality_with_classifiers(self, response: str) -> float:
+        """Assess response quality using classifiers."""
+        quality_scores = []
+
+        for classifier_config in self.classifiers:
+            # Extract activations
+            activations_tensor = self.model.extract_activations(response, classifier_config["layer"])
+
+            # Create Activations object
+            activations = Activations(
+                tensor=activations_tensor,
+                layer=classifier_config["layer"],
+                aggregation_strategy=ActivationAggregationStrategy.LAST_TOKEN,
+            )
+
+            # Get features
+            features = activations.extract_features_for_classifier()
+
+            # Get classifier probability
+            classifier = classifier_config["classifier"]
+            probability = classifier.predict_proba([features.numpy()])[0]
+
+            # Convert probability to quality score
+            # For most classifiers, low probability (closer to 0) = higher quality
+            # This assumes classifiers are trained to detect problems (1 = problematic, 0 = good)
+            quality_score = 1.0 - float(probability)
+            quality_scores.append(quality_score)
+
+        if not quality_scores:
+            raise RuntimeError("No quality scores available - all classifiers failed")
+
+        # Use average quality across all classifiers
+        return sum(quality_scores) / len(quality_scores)
+
+    def _generate_suggestions(self, issues: List[str]) -> List[str]:
+        """Generate improvement suggestions based on detected issues."""
+        suggestions = []
+
+        # Map issue types to suggestions
+        suggestion_map = {
+            "hallucination": "Verify factual accuracy and provide evidence-based responses",
+            "quality": "Improve response relevance, completeness, and clarity",
+            "harmful": "Revise content to be safe and helpful",
+            "bias": "Use more balanced and inclusive language",
+            "gibberish": "Ensure response coherence and proper language structure",
+            "repetitive": "Reduce repetition and vary language patterns",
+            "incoherent": "Improve logical flow and sentence structure",
+        }
+
+        for issue in issues:
+            if issue in suggestion_map:
+                suggestions.append(suggestion_map[issue])
+            else:
+                suggestions.append(f"Address {issue} issue in the response")
+
+        return suggestions
+
+    def decide_if_improvement_needed(
+        self, analysis: AnalysisResult, quality_threshold: float = 0.7, confidence_threshold: float = 0.8
+    ) -> bool:
+        """Decide if the response needs improvement based on classifier results."""
+        # Improvement needed if:
+        # 1. Quality score below threshold
+        # 2. High-confidence issues detected
+        # 3. Multiple issues found
+
+        if analysis.quality_score < quality_threshold:
+            return True
+
+        if analysis.confidence > confidence_threshold and analysis.has_issues:
+            return True
+
+        if len(analysis.issues_found) >= 2:
+            return True
+
+        return False
+
+    def add_classifier(self, classifier_path: str, layer_index: int, issue_type: str, threshold: float = 0.5):
+        """Add a new classifier to the diagnostic system."""
+        classifier = Classifier()
+        classifier.load_model(classifier_path)
+
+        self.classifiers.append(
+            {
+                "classifier": classifier,
+                "layer": Layer(index=layer_index, type="transformer"),
+                "issue_type": issue_type,
+                "threshold": threshold,
+            }
+        )
+        print(f"✅ Added classifier for {issue_type} at layer {layer_index}")
+
+    def get_available_classifiers(self) -> List[Dict[str, Any]]:
+        """Get information about loaded classifiers."""
+        return [
+            {"issue_type": config["issue_type"], "layer": config["layer"].index, "threshold": config["threshold"]}
+            for config in self.classifiers
+        ]
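To make the diagnostics flow concrete, a hedged sketch of wiring up ResponseDiagnostics from the hunk above. The exact module path is not shown in this diff (both wisent/core/agent/diagnose.py and wisent/core/agent/diagnose/response_diagnostics.py appear in the file listing), so the import below is an assumption, as are the classifier file paths; analyze_response is a coroutine as declared above, so it is awaited here.

# Hedged sketch, not part of the diff: run classifier-based diagnostics on a response.
import asyncio
from wisent.core.model import Model
from wisent.core.agent.diagnose.response_diagnostics import ResponseDiagnostics  # assumed path

async def check(response: str, prompt: str) -> None:
    model = Model(name="meta-llama/Llama-3.1-8B-Instruct")
    diagnostics = ResponseDiagnostics(
        model,
        classifier_configs=[
            # Hypothetical classifier artifact; train and export it separately.
            {"path": "./models/hallucination_classifier.pt", "layer": 15, "issue_type": "hallucination"},
        ],
    )
    analysis = await diagnostics.analyze_response(response, prompt)
    if diagnostics.decide_if_improvement_needed(analysis):
        print("Needs improvement:", analysis.issues_found, analysis.suggestions)

asyncio.run(check("Paris is the capital of France.", "What is the capital of France?"))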