wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic. Click here for more details.
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.2.dist-info/METADATA +67 -0
- wisent-0.5.2.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Response diagnostics for autonomous agent analysis.
|
|
3
|
+
|
|
4
|
+
This module handles:
|
|
5
|
+
- Activation-based response quality assessment using trained classifiers
|
|
6
|
+
- Issue detection through model activations
|
|
7
|
+
- Classifier-based quality scoring
|
|
8
|
+
- Decision making for improvements needed
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import Any, Dict, List
|
|
13
|
+
|
|
14
|
+
from wisent.core.activations import ActivationAggregationStrategy, Activations
|
|
15
|
+
from wisent.core.classifier.classifier import Classifier
|
|
16
|
+
|
|
17
|
+
from ...layer import Layer
|
|
18
|
+
from ...model import Model
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class AnalysisResult:
    """Outcome of running the diagnostic classifiers over one response."""

    # True when at least one classifier flagged the response.
    has_issues: bool
    # Issue-type labels of the classifiers that flagged the response.
    issues_found: List[str]
    # Averaged classifier confidence for this analysis.
    confidence: float
    # One improvement hint per entry in ``issues_found``.
    suggestions: List[str]
    # Averaged quality score across classifiers.
    quality_score: float
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ResponseDiagnostics:
    """Handles activation-based response analysis and quality assessment for autonomous agents.

    Every registered classifier is kept in ``self.classifiers`` as a dict with
    the keys ``"classifier"``, ``"layer"``, ``"issue_type"`` and ``"threshold"``.
    """

    def __init__(self, model: "Model", classifier_configs: List[Dict[str, Any]]):
        """
        Initialize the diagnostics system.

        Args:
            model: The language model to extract activations from
            classifier_configs: List of classifier configurations with paths and layers
                Example: [
                    {"path": "./models/hallucination_classifier.pt", "layer": 15, "issue_type": "hallucination"},
                    {"path": "./models/quality_classifier.pt", "layer": 20, "issue_type": "quality"}
                ]
                ``issue_type`` defaults to "unknown" and ``threshold`` to 0.5.

        Raises:
            ValueError: If ``classifier_configs`` is empty.
            RuntimeError: If none of the configured classifiers could be loaded.
        """
        if not classifier_configs:
            raise ValueError("classifier_configs is required - no fallback mode available")

        self.model = model

        # Load classifiers
        self.classifiers = []
        for config in classifier_configs:
            try:
                entry = self._load_entry(
                    config["path"],
                    config["layer"],
                    config.get("issue_type", "unknown"),
                    config.get("threshold", 0.5),
                )
            except Exception as e:
                # Best-effort loading: one broken config must not prevent the
                # others from loading. ``.get`` keeps the handler itself from
                # raising when the failure *is* the missing "path" key.
                print(f"⚠️ Failed to load classifier {config.get('path', '<missing path>')}: {e}")
                print(" Skipping this classifier and continuing...")
                continue

            self.classifiers.append(entry)
            # Report the resolved issue type so a config without "issue_type"
            # cannot fail after the classifier has already been registered.
            print(f"✅ Loaded classifier for {entry['issue_type']} at layer {config['layer']}")

        if not self.classifiers:
            raise RuntimeError("Failed to load any classifiers - system cannot operate without them")

    @staticmethod
    def _load_entry(classifier_path: str, layer_index: int, issue_type: str, threshold: float) -> Dict[str, Any]:
        """Load a classifier from disk and build its registry entry."""
        classifier = Classifier()
        classifier.load_model(classifier_path)
        return {
            "classifier": classifier,
            "layer": Layer(index=layer_index, type="transformer"),
            "issue_type": issue_type,
            "threshold": threshold,
        }

    @staticmethod
    def _unwrap_first(result: Any) -> Any:
        """Return the first element of a list/tuple result, or the result itself otherwise."""
        return result[0] if isinstance(result, (list, tuple)) else result

    @staticmethod
    def _mean_quality(probabilities) -> float:
        """Average ``1 - probability`` over the given issue probabilities.

        Classifiers are assumed to detect problems (1 = problematic, 0 = good),
        so a low probability means a high quality score.

        Raises:
            RuntimeError: If ``probabilities`` is empty.
        """
        quality_scores = [1.0 - float(probability) for probability in probabilities]
        if not quality_scores:
            raise RuntimeError("No quality scores available - all classifiers failed")
        return sum(quality_scores) / len(quality_scores)

    def _extract_features(self, response: str, layer: Any) -> Any:
        """Extract the model's activations for ``response`` at ``layer`` as classifier features."""
        activations_tensor = self.model.extract_activations(response, layer)
        activations = Activations(
            tensor=activations_tensor,
            layer=layer,
            aggregation_strategy=ActivationAggregationStrategy.LAST_TOKEN,
        )
        features = activations.extract_features_for_classifier()
        # Ensure a CPU tensor before the numpy conversion.
        features_cpu = features.cpu() if hasattr(features, "cpu") else features
        return features_cpu.numpy()

    async def analyze_response(self, response: str, prompt: str) -> "AnalysisResult":
        """Analyze the response using trained classifiers and activation patterns.

        Args:
            response: The generated text to analyze.
            prompt: The prompt that produced the response. Not used by the
                current implementation.

        Returns:
            An ``AnalysisResult`` with the flagged issue types, the averaged
            confidence, suggestions and the averaged quality score.

        Raises:
            RuntimeError: If no classifier produced a result.
        """
        classifier_results = self._analyze_with_classifiers(response)

        flagged = [result for result in classifier_results if result["has_issue"]]
        issues = [result["issue_type"] for result in flagged]

        # The probabilities were already computed above; reusing them avoids a
        # second activation extraction and predict_proba call per classifier.
        quality_score = self._mean_quality(result["probability"] for result in classifier_results)

        # Average over the flagged classifiers; when nothing was flagged, use
        # all classifiers. The list is non-empty here because _mean_quality
        # raises on an empty result set.
        scored = flagged or classifier_results
        confidence = sum(result["confidence"] for result in scored) / len(scored)

        return AnalysisResult(
            has_issues=len(issues) > 0,
            issues_found=issues,
            confidence=confidence,
            suggestions=self._generate_suggestions(issues),
            quality_score=quality_score,
        )

    def _analyze_with_classifiers(self, response: str) -> List[Dict[str, Any]]:
        """Analyze response using trained classifiers.

        Returns:
            One dict per registered classifier with the keys ``issue_type``,
            ``has_issue``, ``confidence``, ``probability`` and ``prediction``.
        """
        results = []

        for entry in self.classifiers:
            features = self._extract_features(response, entry["layer"])
            classifier = entry["classifier"]

            # Handle different classifier output formats
            prediction = self._unwrap_first(classifier.predict([features]))
            probability = float(self._unwrap_first(classifier.predict_proba([features])))

            results.append(
                {
                    "issue_type": entry["issue_type"],
                    "has_issue": probability > entry["threshold"],
                    # Distance from 0.5, rescaled to a 0-1 confidence.
                    "confidence": abs(probability - 0.5) * 2,
                    "probability": probability,
                    "prediction": int(prediction),
                }
            )

        return results

    def _assess_quality_with_classifiers(self, response: str) -> float:
        """Assess response quality using classifiers.

        Returns:
            The average of ``1 - probability`` across all registered classifiers.

        Raises:
            RuntimeError: If no classifier is registered.
        """
        return self._mean_quality(
            self._unwrap_first(
                entry["classifier"].predict_proba([self._extract_features(response, entry["layer"])])
            )
            for entry in self.classifiers
        )

    def _generate_suggestions(self, issues: List[str]) -> List[str]:
        """Generate improvement suggestions based on detected issues.

        Known issue types map to a fixed suggestion; any other issue type gets
        a generic "Address ..." suggestion. Order follows ``issues``.
        """
        suggestion_map = {
            "hallucination": "Verify factual accuracy and provide evidence-based responses",
            "quality": "Improve response relevance, completeness, and clarity",
            "harmful": "Revise content to be safe and helpful",
            "bias": "Use more balanced and inclusive language",
            "gibberish": "Ensure response coherence and proper language structure",
            "repetitive": "Reduce repetition and vary language patterns",
            "incoherent": "Improve logical flow and sentence structure",
        }

        return [suggestion_map.get(issue, f"Address {issue} issue in the response") for issue in issues]

    def decide_if_improvement_needed(
        self, analysis: "AnalysisResult", quality_threshold: float = 0.7, confidence_threshold: float = 0.8
    ) -> bool:
        """
        Decide if improvement is needed based on analysis results.

        Args:
            analysis: Analysis result to evaluate
            quality_threshold: Minimum quality score required
            confidence_threshold: Minimum confidence required for improvement decision

        Returns:
            True if improvement is needed, False otherwise
        """
        # Low confidence - be conservative and never request an improvement.
        confident = analysis.confidence >= confidence_threshold
        return bool(confident and (analysis.has_issues or analysis.quality_score < quality_threshold))

    def add_classifier(self, classifier_path: str, layer_index: int, issue_type: str, threshold: float = 0.5):
        """Add a new classifier to the diagnostic system.

        Unlike the constructor, a loading failure is not swallowed here: any
        exception raised while loading propagates to the caller.
        """
        self.classifiers.append(self._load_entry(classifier_path, layer_index, issue_type, threshold))
        print(f"✅ Added classifier for {issue_type} at layer {layer_index}")

    def get_available_classifiers(self) -> List[Dict[str, Any]]:
        """Get information about loaded classifiers."""
        return [
            {"issue_type": entry["issue_type"], "layer": entry["layer"].index, "threshold": entry["threshold"]}
            for entry in self.classifiers
        ]
|