wisent 0.5.14__py3-none-any.whl → 0.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of wisent might be problematic.
- wisent/__init__.py +1 -1
- wisent/cli.py +114 -0
- wisent/core/activations/activations_collector.py +19 -11
- wisent/core/cli/__init__.py +3 -1
- wisent/core/cli/create_steering_vector.py +60 -18
- wisent/core/cli/evaluate_responses.py +14 -8
- wisent/core/cli/generate_pairs_from_task.py +18 -5
- wisent/core/cli/get_activations.py +1 -1
- wisent/core/cli/multi_steer.py +108 -0
- wisent/core/cli/optimize_classification.py +187 -285
- wisent/core/cli/optimize_sample_size.py +78 -0
- wisent/core/cli/optimize_steering.py +354 -53
- wisent/core/cli/tasks.py +274 -9
- wisent/core/errors/__init__.py +0 -0
- wisent/core/errors/error_handler.py +134 -0
- wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py +152 -295
- wisent/core/evaluators/rotator.py +22 -8
- wisent/core/main.py +5 -1
- wisent/core/model_persistence.py +4 -19
- wisent/core/models/wisent_model.py +11 -3
- wisent/core/parser.py +4 -3
- wisent/core/parser_arguments/main_parser.py +1 -1
- wisent/core/parser_arguments/multi_steer_parser.py +4 -3
- wisent/core/parser_arguments/optimize_steering_parser.py +4 -0
- wisent/core/sample_size_optimizer_v2.py +1 -1
- wisent/core/steering_optimizer.py +2 -2
- wisent/tests/__init__.py +0 -0
- wisent/tests/examples/__init__.py +0 -0
- wisent/tests/examples/cli/__init__.py +0 -0
- wisent/tests/examples/cli/activations/__init__.py +0 -0
- wisent/tests/examples/cli/activations/test_get_activations.py +127 -0
- wisent/tests/examples/cli/classifier/__init__.py +0 -0
- wisent/tests/examples/cli/classifier/test_classifier_examples.py +141 -0
- wisent/tests/examples/cli/contrastive_pairs/__init__.py +0 -0
- wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py +89 -0
- wisent/tests/examples/cli/evaluation/__init__.py +0 -0
- wisent/tests/examples/cli/evaluation/test_evaluation_examples.py +117 -0
- wisent/tests/examples/cli/generate/__init__.py +0 -0
- wisent/tests/examples/cli/generate/test_generate_with_classifier.py +146 -0
- wisent/tests/examples/cli/generate/test_generate_with_steering.py +149 -0
- wisent/tests/examples/cli/generate/test_only_generate.py +110 -0
- wisent/tests/examples/cli/multi_steering/__init__.py +0 -0
- wisent/tests/examples/cli/multi_steering/test_multi_steer_from_trained_vectors.py +210 -0
- wisent/tests/examples/cli/multi_steering/test_multi_steer_with_different_parameters.py +205 -0
- wisent/tests/examples/cli/multi_steering/test_train_and_multi_steer.py +174 -0
- wisent/tests/examples/cli/optimizer/__init__.py +0 -0
- wisent/tests/examples/cli/optimizer/test_optimize_sample_size.py +102 -0
- wisent/tests/examples/cli/optimizer/test_optimizer_examples.py +59 -0
- wisent/tests/examples/cli/steering/__init__.py +0 -0
- wisent/tests/examples/cli/steering/test_create_steering_vectors.py +135 -0
- wisent/tests/examples/cli/synthetic/__init__.py +0 -0
- wisent/tests/examples/cli/synthetic/test_synthetic_pairs.py +45 -0
- {wisent-0.5.14.dist-info → wisent-0.5.15.dist-info}/METADATA +3 -1
- {wisent-0.5.14.dist-info → wisent-0.5.15.dist-info}/RECORD +59 -29
- wisent/core/agent/diagnose/test_synthetic_classifier.py +0 -71
- /wisent/core/parser_arguments/{test_nonsense_parser.py → nonsense_parser.py} +0 -0
- {wisent-0.5.14.dist-info → wisent-0.5.15.dist-info}/WHEEL +0 -0
- {wisent-0.5.14.dist-info → wisent-0.5.15.dist-info}/entry_points.txt +0 -0
- {wisent-0.5.14.dist-info → wisent-0.5.15.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.5.14.dist-info → wisent-0.5.15.dist-info}/top_level.txt +0 -0
@@ -8,14 +8,15 @@ def setup_multi_steer_parser(parser):
         "--vector",
         type=str,
         action="append",
-        required=
+        required=False,
+        default=None,
         metavar="PATH:WEIGHT",
-        help="Path to steering vector and its weight (format: path/to/vector.pt:0.5). Can be specified multiple times.",
+        help="Path to steering vector and its weight (format: path/to/vector.pt:0.5). Can be specified multiple times. If omitted, generates unsteered baseline.",
     )
 
     # Model configuration
     parser.add_argument("--model", type=str, required=True, help="Model name or path")
-    parser.add_argument("--layer", type=int, required=
+    parser.add_argument("--layer", type=int, required=False, default=None, help="Layer index to apply combined steering (required when using vectors)")
     parser.add_argument("--device", type=str, default=None, help="Device to run on (default: auto-detect)")
 
     # Steering method configuration
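This hunk (setup_multi_steer_parser, which per the file list lives in wisent/core/parser_arguments/multi_steer_parser.py) makes --vector and --layer optional, so the command can now run without any steering vector and produce an unsteered baseline. Below is a minimal sketch of the two invocation styles, written in the same subprocess style as the new tests; the "multi-steer" subcommand name, the layer index, and the vector paths are illustrative assumptions, not confirmed by this diff.

import subprocess

# Unsteered baseline: --vector and --layer omitted (newly allowed by this release).
baseline = subprocess.run(
    ["python", "-m", "wisent.core.main", "multi-steer",
     "--model", "meta-llama/Llama-3.2-1B-Instruct",
     "--device", "cpu"],
    capture_output=True, text=True,
)

# Combined steering: --vector is repeatable and takes PATH:WEIGHT; --layer is
# needed in this case, as the updated help text states.
steered = subprocess.run(
    ["python", "-m", "wisent.core.main", "multi-steer",
     "--model", "meta-llama/Llama-3.2-1B-Instruct",
     "--layer", "5",
     "--vector", "vectors/truthful.pt:0.5",
     "--vector", "vectors/helpful.pt:0.3",
     "--device", "cpu"],
    capture_output=True, text=True,
)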
@@ -31,6 +31,10 @@ def setup_steering_optimizer_parser(parser):
         "--max-time-per-task", type=float, default=20.0, help="Time limit per task in minutes (default: 20.0)"
     )
     comprehensive_parser.add_argument("--no-save", action="store_true", help="Don't save results to model config")
+    comprehensive_parser.add_argument("--save-best-vector", type=str, default=None, help="Save the best steering vector for each task to specified directory")
+    comprehensive_parser.add_argument("--save-generation-examples", action="store_true", help="Generate and save example responses (unsteered vs steered)")
+    comprehensive_parser.add_argument("--num-generation-examples", type=int, default=3, help="Number of generation examples per task (default: 3)")
+    comprehensive_parser.add_argument("--save-all-generation-examples", action="store_true", help="Save generation examples for ALL configurations tested (warning: very slow)")
     comprehensive_parser.add_argument("--device", type=str, default=None, help="Device to run on")
     comprehensive_parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
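The four new flags above (from setup_steering_optimizer_parser in wisent/core/parser_arguments/optimize_steering_parser.py) add options for persisting the best steering vector per task and for saving example generations. A hedged sketch of how they might be passed, again in the subprocess style of the new tests; the "optimize-steering comprehensive" subcommand path and the output directory name are assumptions inferred from the parser names, and the model/task arguments the real command requires are omitted here.

import subprocess

result = subprocess.run(
    ["python", "-m", "wisent.core.main", "optimize-steering", "comprehensive",
     # ...model/task arguments required by the actual command go here...
     "--max-time-per-task", "10.0",
     "--save-best-vector", "optimized_vectors",   # new: save best vector per task to this directory
     "--save-generation-examples",                # new: save unsteered vs steered samples
     "--num-generation-examples", "3",            # new: samples per task (default 3)
     "--device", "cpu",
     "--verbose"],
    capture_output=True, text=True,
)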
@@ -12,7 +12,7 @@ from datetime import datetime
 import numpy as np
 import matplotlib.pyplot as plt
 
-from
+from wisent.cli import run_task_pipeline
 from .model_config_manager import ModelConfigManager
 
 logger = logging.getLogger(__name__)

@@ -497,8 +497,8 @@ class SteeringOptimizer:
         for strength in strengths:
             try:
                 # Run evaluation with this strength
-                from
-
+                from wisent.cli import run_task_pipeline
+
                 # Build kwargs for run_task_pipeline
                 pipeline_kwargs = {
                     'task_name': task_name,
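Both hunks above replace a previously incomplete import with "from wisent.cli import run_task_pipeline"; in the SteeringOptimizer hunk (wisent/core/steering_optimizer.py in the file list) the import is deferred into the loop body. Below is a minimal sketch of that deferred-import pattern; the wrapper function and the call with **pipeline_kwargs are illustrative assumptions, since only the 'task_name' entry is visible in the hunk, and deferring the import like this is typically done to avoid a circular dependency between the optimizer module and wisent.cli.

def evaluate_task_with_pipeline(task_name):
    # Imported lazily, as in the hunk, so wisent.cli is only loaded when the
    # evaluation actually runs (avoiding a circular import at module load time).
    from wisent.cli import run_task_pipeline

    # Build kwargs for run_task_pipeline; the real code adds more entries here.
    pipeline_kwargs = {
        "task_name": task_name,
    }
    return run_task_pipeline(**pipeline_kwargs)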
wisent/tests/__init__.py  ADDED (empty file)
wisent/tests/examples/__init__.py  ADDED (empty file)
wisent/tests/examples/cli/__init__.py  ADDED (empty file)
wisent/tests/examples/cli/activations/__init__.py  ADDED (empty file)
wisent/tests/examples/cli/activations/test_get_activations.py  ADDED

"""
Tests for activation extraction examples.

Validates get-activations command with contrastive pairs.
"""

import subprocess
import pytest
import tempfile
import os
import json


def create_test_pairs_file(filepath):
    """Create a simple test pairs JSON file."""
    pairs = [
        {
            "prompt": "What color is the sky?",
            "positive_response": {
                "model_response": "The sky is blue."
            },
            "negative_response": {
                "model_response": "The sky is green."
            },
            "label": "truthfulness"
        },
        {
            "prompt": "What is the chemical formula for water?",
            "positive_response": {
                "model_response": "Water is H2O."
            },
            "negative_response": {
                "model_response": "Water is CO2."
            },
            "label": "truthfulness"
        }
    ]
    with open(filepath, 'w') as f:
        json.dump(pairs, f)


def test_get_activations_from_pairs():
    """Test extracting activations from contrastive pairs."""
    with tempfile.TemporaryDirectory() as tmpdir:
        input_file = os.path.join(tmpdir, "pairs.json")
        output_file = os.path.join(tmpdir, "pairs_with_activations.json")

        # Create test pairs
        create_test_pairs_file(input_file)

        result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "get-activations",
                input_file,
                "--output", output_file,
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layers", "3",
                "--token-aggregation", "average",
                "--device", "cpu",
                "--verbose"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
        assert os.path.exists(output_file), "Output file not created"


def test_get_activations_multiple_layers():
    """Test extracting activations from multiple layers."""
    with tempfile.TemporaryDirectory() as tmpdir:
        input_file = os.path.join(tmpdir, "pairs.json")
        output_file = os.path.join(tmpdir, "multilayer_activations.json")

        create_test_pairs_file(input_file)

        result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "get-activations",
                input_file,
                "--output", output_file,
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layers", "2,3,4",
                "--token-aggregation", "average",
                "--device", "cpu",
                "--verbose"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
        assert os.path.exists(output_file), "Output file not created"


def test_get_activations_different_aggregation():
    """Test different token aggregation strategies."""
    with tempfile.TemporaryDirectory() as tmpdir:
        input_file = os.path.join(tmpdir, "pairs.json")
        output_file = os.path.join(tmpdir, "final_token_activations.json")

        create_test_pairs_file(input_file)

        result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "get-activations",
                input_file,
                "--output", output_file,
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layers", "3",
                "--token-aggregation", "final",
                "--device", "cpu",
                "--verbose"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

wisent/tests/examples/cli/classifier/__init__.py  ADDED (empty file)
wisent/tests/examples/cli/classifier/test_classifier_examples.py  ADDED

"""
Tests for classifier examples.

Validates classifier training, loading, and evaluation workflows.
"""

import subprocess
import pytest
import tempfile
import os


def test_train_classifier_and_save():
    """Test training a classifier and saving it."""
    with tempfile.TemporaryDirectory() as tmpdir:
        classifier_path = os.path.join(tmpdir, "classifier.pt")
        output_dir = os.path.join(tmpdir, "training")

        result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "tasks", "boolq",
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layer", "3",
                "--classifier-type", "logistic",
                "--limit", "20",
                "--save-classifier", classifier_path,
                "--output", output_dir,
                "--device", "cpu"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
        assert os.path.exists(classifier_path), "Classifier not saved"


def test_use_pretrained_classifier():
    """Test loading and using a pretrained classifier."""
    with tempfile.TemporaryDirectory() as tmpdir:
        classifier_path = os.path.join(tmpdir, "classifier.pt")
        training_output = os.path.join(tmpdir, "training")
        inference_output = os.path.join(tmpdir, "inference")

        # Train classifier first
        train_result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "tasks", "boolq",
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layer", "3",
                "--classifier-type", "logistic",
                "--limit", "20",
                "--save-classifier", classifier_path,
                "--output", training_output,
                "--device", "cpu"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert train_result.returncode == 0

        # Use pretrained classifier
        inference_result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "tasks", "boolq",
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layer", "3",
                "--load-classifier", classifier_path,
                "--inference-only",
                "--testing-limit", "10",
                "--output", inference_output,
                "--device", "cpu"
            ],
            capture_output=True,
            text=True,
            timeout=600
        )

        assert inference_result.returncode == 0, f"Inference failed: {inference_result.stderr}"


def test_run_and_evaluate_on_benchmark():
    """Test training and evaluating classifier on benchmark."""
    with tempfile.TemporaryDirectory() as tmpdir:
        output_dir = os.path.join(tmpdir, "benchmark")
        report_file = os.path.join(output_dir, "report.json")

        result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "tasks", "boolq",
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layer", "3",
                "--classifier-type", "logistic",
                "--training-limit", "20",
                "--testing-limit", "10",
                "--token-aggregation", "average",
                "--detection-threshold", "0.6",
                "--output", output_dir,
                "--evaluation-report", report_file,
                "--device", "cpu",
                "--verbose"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"


def test_classifier_with_mlp():
    """Test training MLP classifier (not just logistic)."""
    with tempfile.TemporaryDirectory() as tmpdir:
        classifier_path = os.path.join(tmpdir, "mlp_classifier.pt")
        output_dir = os.path.join(tmpdir, "training")

        result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "tasks", "boolq",
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layer", "3",
                "--classifier-type", "mlp",
                "--limit", "20",
                "--save-classifier", classifier_path,
                "--output", output_dir,
                "--device", "cpu"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
        assert os.path.exists(classifier_path), "MLP classifier not saved"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

wisent/tests/examples/cli/contrastive_pairs/__init__.py  ADDED (empty file)
wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py  ADDED

"""
Tests for contrastive pairs generation examples.

Validates pair generation from tasks and synthetic generation.
"""

import subprocess
import pytest
import tempfile
import os
import json


def test_generate_pairs_from_task():
    """Test generating contrastive pairs from lm-eval task."""
    with tempfile.TemporaryDirectory() as tmpdir:
        output_file = os.path.join(tmpdir, "pairs.json")

        result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "generate-pairs-from-task", "boolq",
                "--output", output_file,
                "--limit", "10",
                "--verbose"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
        assert os.path.exists(output_file), "Output file not created"

        # Verify JSON format
        with open(output_file, 'r') as f:
            data = json.load(f)
        assert isinstance(data, (list, dict)), "Output should be JSON"


def test_generate_synthetic_pairs():
    """Test generating synthetic contrastive pairs."""
    with tempfile.TemporaryDirectory() as tmpdir:
        output_file = os.path.join(tmpdir, "synthetic_pairs.json")

        result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "generate-pairs",
                "--trait", "truthfulness",
                "--num-pairs", "5",
                "--output", output_file,
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--similarity-threshold", "0.8",
                "--device", "cpu",
                "--verbose"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
        assert os.path.exists(output_file), "Output file not created"


def test_generate_synthetic_pairs_different_trait():
    """Test synthetic pair generation with different trait."""
    with tempfile.TemporaryDirectory() as tmpdir:
        output_file = os.path.join(tmpdir, "helpfulness_pairs.json")

        result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "generate-pairs",
                "--trait", "being helpful and informative",
                "--num-pairs", "5",
                "--output", output_file,
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--device", "cpu",
                "--verbose"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

wisent/tests/examples/cli/evaluation/__init__.py  ADDED (empty file)
wisent/tests/examples/cli/evaluation/test_evaluation_examples.py  ADDED

"""
Tests for evaluation examples.

Validates response evaluation and personalization assessment.
"""

import subprocess
import pytest
import tempfile
import os
import json


def create_test_responses_file(filepath):
    """Create a test responses JSON file."""
    responses = [
        {
            "question": "What is 2+2?",
            "response": "4",
            "expected": "4",
            "choices": ["3", "4", "5", "6"]
        },
        {
            "question": "What is the capital of France?",
            "response": "Paris",
            "expected": "Paris",
            "choices": ["London", "Paris", "Berlin", "Rome"]
        }
    ]
    with open(filepath, 'w') as f:
        json.dump(responses, f)


def test_generate_responses_from_task():
    """Test generating responses from a task."""
    with tempfile.TemporaryDirectory() as tmpdir:
        output_file = os.path.join(tmpdir, "responses.json")

        result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "generate-responses",
                "meta-llama/Llama-3.2-1B-Instruct",
                "--task", "boolq",
                "--num-questions", "3",
                "--max-new-tokens", "50",
                "--temperature", "0.7",
                "--device", "cpu",
                "--output", output_file,
                "--verbose"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
        assert os.path.exists(output_file), "Output file not created"


def test_evaluate_generated_responses():
    """Test evaluating generated responses."""
    with tempfile.TemporaryDirectory() as tmpdir:
        input_file = os.path.join(tmpdir, "responses.json")
        output_file = os.path.join(tmpdir, "evaluation.json")

        # Create test responses
        create_test_responses_file(input_file)

        result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "evaluate-responses",
                "--input", input_file,
                "--output", output_file,
                "--task", "boolq",
                "--verbose"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
        assert os.path.exists(output_file), "Output file not created"


def test_evaluate_personalization():
    """Test evaluating personalization responses."""
    with tempfile.TemporaryDirectory() as tmpdir:
        input_file = os.path.join(tmpdir, "personalization.json")
        output_file = os.path.join(tmpdir, "evaluation.json")

        # Create test responses
        responses = [
            {"question": "Test Q", "response": "Test A", "trait": "helpful"}
        ]
        with open(input_file, 'w') as f:
            json.dump(responses, f)

        result = subprocess.run(
            [
                "python", "-m", "wisent.core.main", "evaluate-responses",
                "--input", input_file,
                "--output", output_file,
                "--task", "personalization",
                "--trait", "helpful",
                "--verbose"
            ],
            capture_output=True,
            text=True,
            timeout=300
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

wisent/tests/examples/cli/generate/__init__.py  ADDED (empty file)