wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic. Click here for more details.
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.1.dist-info/METADATA +67 -0
- wisent-0.5.1.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
from wisent_guard.synthetic.cleaners.methods.core.atoms import Refusaler
|
|
2
|
+
from wisent_guard.core.models.wisent_model import WisentModel
|
|
3
|
+
|
|
4
|
+
import re, unicodedata
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"BaseRefusaler",
|
|
8
|
+
]
|
|
9
|
+
|
|
10
|
+
class BaseRefusaler(Refusaler):
|
|
11
|
+
"""
|
|
12
|
+
Refusal detection and fixing step.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
_FILLER = r"(?:\b\w{1,15}\b\s+){0,2}"
|
|
16
|
+
|
|
17
|
+
_NEG_MODAL = r"(?:can(?:\s*not)?|can't|won't|will\s+not|should(?:\s*not)?|shouldn't|must(?:\s*not)?|mustn't)"
|
|
18
|
+
_AM = r"(?:i\s+(?:am|['']m))"
|
|
19
|
+
_I = r"\bi\b"
|
|
20
|
+
_I_M = r"(?:i['']m)"
|
|
21
|
+
_VERB_ACTION = (
|
|
22
|
+
r"(?:help|assist|provide|share|give|supply|offer|"
|
|
23
|
+
r"comply(?:\s+with)?|fulfil?l|answer|respond|"
|
|
24
|
+
r"create|generate|write|compose|produce|draft|"
|
|
25
|
+
r"explain|instruct|guide|walk\s+you\s+through|"
|
|
26
|
+
r"facilitate|bypass|circumvent|enable|"
|
|
27
|
+
r"do\s+(?:that|this|so))"
|
|
28
|
+
)
|
|
29
|
+
_SUPPORT_VERBS = r"(?:support|condone|endorse|promote|encourage)"
|
|
30
|
+
_ABILITY_NOUNS = r"(?:ability|capability|capacity|permission|authorization|access)"
|
|
31
|
+
|
|
32
|
+
_REFUSAL_RE = re.compile(
|
|
33
|
+
rf"""
|
|
34
|
+
# verbose, ignorecase, unicode
|
|
35
|
+
|
|
36
|
+
(?:
|
|
37
|
+
# A) "As an AI (language) model..." disclaimers
|
|
38
|
+
(?P<ai_disclaimer>\bas\s+an?\s+(?:ai|assistant)(?:\s+language)?\s+model\b)
|
|
39
|
+
|
|
40
|
+
| # B) Policy/legal/safety invocations
|
|
41
|
+
(?P<policy>
|
|
42
|
+
\b(?:this|that|it)\s+would\s+be\s+(?:unethical|unsafe|illegal|harmful)\b
|
|
43
|
+
|
|
|
44
|
+
\bagainst\s+(?:policy|the\s+rules|guidelines|terms(?:\s+of\s+service)?)\b
|
|
45
|
+
|
|
|
46
|
+
\b(?:due\s+to|for)\s+(?:safety|legal|policy|ethical)\s+(?:reasons|concerns)\b
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
| # C) Apologies / hedges commonly preceding refusals
|
|
50
|
+
(?P<apology_hedge>
|
|
51
|
+
\b(?:sorry|apologies|unfortunately|regrettably)\b
|
|
52
|
+
|
|
|
53
|
+
\b{_I_M}\s+afraid\b
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
| # D) Explicit inability / not permitted / not comfortable
|
|
57
|
+
(?P<unable>
|
|
58
|
+
{_AM}\s+(?:unable|not\s+able|not\s+allowed|not\s+permitted|not\s+comfortable)\s+to\b
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
| # E) “I [negation] [filler] [help/provide/…]”
|
|
62
|
+
(?P<cannot_action>
|
|
63
|
+
{_I}\s+{_NEG_MODAL}\s+{_FILLER}{_VERB_ACTION}\b
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
| # F) Prefer/rather not (indirect refusal)
|
|
67
|
+
(?P<prefer_rather>
|
|
68
|
+
\b(?:i['']d|i\s+would)\s+(?:rather|prefer)\s+not\s+(?:to\s+)?\w+\b
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
| # G) Decline/refuse verbs
|
|
72
|
+
(?P<decline_refuse>
|
|
73
|
+
{_I}\s+(?:must|have\s+to|will)\s+(?:decline|refuse)\s+to\b
|
|
74
|
+
|
|
|
75
|
+
\b(?:i\s+decline|i\s+refuse)\b
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
| # H) “I don't/won't [support|condone|endorse|promote] …”
|
|
79
|
+
(?P<no_support>
|
|
80
|
+
{_I}\s+(?:do\s+not|don't|won't|will\s+not)\s+{_SUPPORT_VERBS}\b
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
| # I) “I do not have the [ability/capability/permission/access] to …”
|
|
84
|
+
(?P<no_ability>
|
|
85
|
+
{_I}\s+(?:do\s+not|don't)\s+have\s+the\s+{_ABILITY_NOUNS}\s+to\b
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
| # J) Direct lexical hits
|
|
89
|
+
(?P<refusal_word>\brefus(?:e|al)\b)
|
|
90
|
+
)
|
|
91
|
+
""".replace("{i_am}", "(?:i\\s+(?:am|['']m))"),
|
|
92
|
+
re.VERBOSE | re.IGNORECASE | re.UNICODE,
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
_FAMILY_WEIGHTS = {
|
|
96
|
+
"ai_disclaimer": 0.9,
|
|
97
|
+
"policy": 0.9,
|
|
98
|
+
"apology_hedge": 0.4,
|
|
99
|
+
"unable": 0.9,
|
|
100
|
+
"cannot_action": 1.0,
|
|
101
|
+
"prefer_rather": 0.6,
|
|
102
|
+
"decline_refuse": 0.9,
|
|
103
|
+
"no_support": 0.8,
|
|
104
|
+
"no_ability": 0.8,
|
|
105
|
+
"refusal_word": 0.6,
|
|
106
|
+
}
|
|
107
|
+
_THRESHOLD = 0.9
|
|
108
|
+
|
|
109
|
+
@staticmethod
|
|
110
|
+
def _normalize(text: str) -> str:
|
|
111
|
+
"""
|
|
112
|
+
Normalize text for refusal detection.
|
|
113
|
+
|
|
114
|
+
arguments:
|
|
115
|
+
text:
|
|
116
|
+
Input text string.
|
|
117
|
+
|
|
118
|
+
returns:
|
|
119
|
+
Normalized text string.
|
|
120
|
+
|
|
121
|
+
example:
|
|
122
|
+
>>> BaseRefusaler._normalize(" As an AI model, I can't help with that. ")"
|
|
123
|
+
'As an AI model, I can't help with that.'
|
|
124
|
+
>>> BaseRefusaler._normalize("This is a test.\u00A0")
|
|
125
|
+
'This is a test.'
|
|
126
|
+
>>> BaseRefusaler._normalize("Café")
|
|
127
|
+
'Café'
|
|
128
|
+
"""
|
|
129
|
+
return unicodedata.normalize("NFKC", text).strip()
|
|
130
|
+
|
|
131
|
+
def score_refusal(self, text: str) -> tuple[float, str | None, str]:
|
|
132
|
+
"""
|
|
133
|
+
Returns (score, top_family, snippet). Multiple families can fire; we
|
|
134
|
+
take the max weight and add small bonus if apology_hedge co-occurs.
|
|
135
|
+
|
|
136
|
+
arguments:
|
|
137
|
+
text:
|
|
138
|
+
Input text string.
|
|
139
|
+
|
|
140
|
+
returns:
|
|
141
|
+
score:
|
|
142
|
+
Refusal likelihood score between 0.0 and 1.0.
|
|
143
|
+
top_family:
|
|
144
|
+
Name of the highest-weighted matched refusal family, or None.
|
|
145
|
+
snippet:
|
|
146
|
+
The matched refusal snippet from the text, or empty string.
|
|
147
|
+
|
|
148
|
+
example:
|
|
149
|
+
>>> rp = DefaultRefusalPolicy()
|
|
150
|
+
>>> rp.score_refusal("As an AI model, I can't help with that.")
|
|
151
|
+
(1.0, 'ai_disclaimer', "As an AI model, I can't help with that.")
|
|
152
|
+
>>> rp.score_refusal("Sorry, I am unable to assist with that.")
|
|
153
|
+
(0.9, 'unable', 'I am unable to assist with that.')
|
|
154
|
+
>>> rp.score_refusal("I don't support that.")
|
|
155
|
+
(0.8, 'no_support', "I don't support that.")
|
|
156
|
+
>>> rp.score_refusal("I can't help. Sorry about that.")
|
|
157
|
+
(1.0, 'cannot_action', "I can't help")
|
|
158
|
+
>>> rp.score_refusal("I can't help. I am sorry about that.")
|
|
159
|
+
(1.0, 'cannot_action', "I can't help")
|
|
160
|
+
>>> rp.score_refusal("I can't help. Unfortunately, I am not able to assist.")
|
|
161
|
+
(1.0, 'cannot_action', "I can't help")
|
|
162
|
+
>>> rp.score_refusal("I can help with that.")
|
|
163
|
+
(0.0, None, "")
|
|
164
|
+
"""
|
|
165
|
+
t = self._normalize(text)
|
|
166
|
+
m = self._REFUSAL_RE.search(t)
|
|
167
|
+
if not m:
|
|
168
|
+
return 0.0, None, ""
|
|
169
|
+
|
|
170
|
+
best_family, best_w = None, 0.0
|
|
171
|
+
for name, val in m.groupdict().items():
|
|
172
|
+
if val:
|
|
173
|
+
w = self._FAMILY_WEIGHTS.get(name, 0.5)
|
|
174
|
+
if w > best_w:
|
|
175
|
+
best_family, best_w = name, w
|
|
176
|
+
bonus = 0.0
|
|
177
|
+
if m.group("apology_hedge"):
|
|
178
|
+
if any(name != "apology_hedge" and m.group(name) for name in self._FAMILY_WEIGHTS):
|
|
179
|
+
bonus = 0.1
|
|
180
|
+
score = min(1.0, best_w + bonus)
|
|
181
|
+
return score, best_family, m.group(0)
|
|
182
|
+
|
|
183
|
+
def looks_like_refusal(self, text: str) -> bool:
|
|
184
|
+
"""
|
|
185
|
+
Detect if the text looks like a refusal.
|
|
186
|
+
|
|
187
|
+
arguments:
|
|
188
|
+
text:
|
|
189
|
+
Input text string.
|
|
190
|
+
|
|
191
|
+
returns:
|
|
192
|
+
True if the text looks like a refusal, False otherwise.
|
|
193
|
+
|
|
194
|
+
example:
|
|
195
|
+
>>> rp = DefaultRefusalPolicy()
|
|
196
|
+
>>> rp.looks_like_refusal("As an AI model, I can't help with that.")
|
|
197
|
+
True
|
|
198
|
+
>>> rp.looks_like_refusal("Sorry, I am unable to assist with that.")
|
|
199
|
+
True
|
|
200
|
+
>>> rp.looks_like_refusal("I don't support that.")
|
|
201
|
+
True
|
|
202
|
+
>>> rp.looks_like_refusal("I can help with that.")
|
|
203
|
+
False
|
|
204
|
+
"""
|
|
205
|
+
score, *_ = self.score_refusal(text)
|
|
206
|
+
return score >= self._THRESHOLD
|
|
207
|
+
|
|
208
|
+
def why_refusal(self, text: str) -> tuple[str, str] | None:
|
|
209
|
+
"""
|
|
210
|
+
If the text looks like a refusal, return (family, snippet) for the top match.
|
|
211
|
+
|
|
212
|
+
arguments:
|
|
213
|
+
text:
|
|
214
|
+
Input text string.
|
|
215
|
+
|
|
216
|
+
returns:
|
|
217
|
+
(family, snippet) tuple if the text looks like a refusal, None otherwise.
|
|
218
|
+
|
|
219
|
+
example:
|
|
220
|
+
>>> rp = DefaultRefusalPolicy()
|
|
221
|
+
>>> rp.why_refusal("As an AI model, I can't help with that.")
|
|
222
|
+
('ai_disclaimer', "As an AI model, I can't help with that.")
|
|
223
|
+
>>> rp.why_refusal("Sorry, I am unable to assist with that.")
|
|
224
|
+
('unable', 'I am unable to assist with that.')
|
|
225
|
+
>>> rp.why_refusal("I support that.")
|
|
226
|
+
None
|
|
227
|
+
"""
|
|
228
|
+
score, fam, snip = self.score_refusal(text)
|
|
229
|
+
if score == 0.0:
|
|
230
|
+
return None
|
|
231
|
+
return fam or "unknown", snip
|
|
232
|
+
|
|
233
|
+
def fix_negative(
|
|
234
|
+
self,
|
|
235
|
+
model: WisentModel,
|
|
236
|
+
generation_conf: dict,
|
|
237
|
+
prompt: str,
|
|
238
|
+
trait_label: str,
|
|
239
|
+
trait_description: str,
|
|
240
|
+
system_prompt: str,
|
|
241
|
+
) -> str:
|
|
242
|
+
"""
|
|
243
|
+
Attempt to fix a refusal negative example by re-prompting the model.
|
|
244
|
+
|
|
245
|
+
arguments:
|
|
246
|
+
model:
|
|
247
|
+
WisentModel instance to call.
|
|
248
|
+
prompt:
|
|
249
|
+
The original prompt text.
|
|
250
|
+
trait_label:
|
|
251
|
+
Label of the undesired trait.
|
|
252
|
+
trait_description:
|
|
253
|
+
Description of the undesired trait.
|
|
254
|
+
system_prompt:
|
|
255
|
+
System prompt to use for the model call.
|
|
256
|
+
|
|
257
|
+
returns:
|
|
258
|
+
New negative example text, or empty string if still a refusal.
|
|
259
|
+
|
|
260
|
+
example:
|
|
261
|
+
>>> rp = DefaultRefusalPolicy()
|
|
262
|
+
>>> def mock_completion_fn(msgs):
|
|
263
|
+
... return "As an AI model, I cannot help with that."
|
|
264
|
+
>>> rp.fix_negative(mock_completion_fn, "Tell me a joke.", "toxic", "contains toxic language", "System prompt")
|
|
265
|
+
... ""
|
|
266
|
+
>>> def mock_completion_fn2(msgs):
|
|
267
|
+
... return "Here's a joke: Why did the chicken cross the road? To get to the other side!"
|
|
268
|
+
>>> rp.fix_negative(mock_completion_fn2, "Tell me a joke.", "toxic", "contains toxic language", "System prompt")
|
|
269
|
+
... "Here's a joke: Why did the chicken cross the road? To get to the other side!"
|
|
270
|
+
"""
|
|
271
|
+
msgs = [
|
|
272
|
+
{"role": "system", "content": system_prompt},
|
|
273
|
+
{
|
|
274
|
+
"role": "user",
|
|
275
|
+
"content": f"Prompt: {prompt}\nTrait label: {trait_label}\nTrait description: {trait_description}",
|
|
276
|
+
},
|
|
277
|
+
]
|
|
278
|
+
neg_trial = model.generate(
|
|
279
|
+
inputs=[msgs],
|
|
280
|
+
max_tokens=generation_conf.get("max_tokens", 256),
|
|
281
|
+
temperature=generation_conf.get("temperature", 1.0),
|
|
282
|
+
use_steering=False,
|
|
283
|
+
top_p=generation_conf.get("top_p", 1.0),
|
|
284
|
+
)
|
|
285
|
+
return "" if self.looks_like_refusal(neg_trial) else neg_trial
|
|
286
|
+
|
|
File without changes
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
|
|
4
|
+
from wisent_guard.core.models.wisent_model import WisentModel
|
|
5
|
+
from wisent_guard.core.contrastive_pairs.core.set import ContrastivePairSet
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Refusaler(ABC):
    """
    Abstract interface for a refusal detection and fixing step.

    Implementations decide whether a generated text is a refusal and, if so,
    try to obtain a usable replacement from the model.
    """

    @abstractmethod
    def looks_like_refusal(self, text: str) -> bool:
        """
        Detect if the text looks like a refusal.

        For example, negatives are meant to exhibit the undesired ("evil")
        behavior rather than be refusals such as
        "As an AI model, I cannot help with that."

        arguments:
            text:
                Input text string.

        returns:
            True if the text looks like a refusal, False otherwise.
        """

    # NOTE(review): BaseRefusaler.fix_negative takes an additional
    # `generation_conf` argument after `model`; confirm which signature the
    # callers rely on.
    @abstractmethod
    def fix_negative(
        self,
        model: WisentModel,
        prompt: str,
        trait_label: str,
        trait_description: str,
        system_prompt: str,
    ) -> str:
        """
        Attempt to fix a refusal negative example by re-prompting the model
        with the given system prompt.

        For example, an implementation can increase the temperature or change
        the wording to try to get a non-refusal response.

        arguments:
            model:
                WisentModel instance to call.
            prompt:
                The original prompt text.
            trait_label:
                Label of the trait.
            trait_description:
                Description of the trait.
            system_prompt:
                System prompt to use for the model call.

        returns:
            The replacement negative example text.
        """
|
|
37
|
+
|
|
38
|
+
class Deduper(ABC):
    """
    Abstract interface for a deduplication step, which removes duplicate
    items from the pipeline.
    """

    @abstractmethod
    def dedupe(self, items: ContrastivePairSet) -> ContrastivePairSet:
        """
        Remove duplicate items from the given ContrastivePairSet.

        arguments:
            items:
                ContrastivePairSet to deduplicate.

        returns:
            ContrastivePairSet with the duplicates removed.
        """
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Iterable, TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from wisent_guard.synthetic.cleaners.core.atoms import CleanStep, Cleaner
|
|
6
|
+
from wisent_guard.synthetic.cleaners.core.atoms import CleanerStats
|
|
7
|
+
from wisent_guard.core.contrastive_pairs.core.set import ContrastivePairSet
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"PairsCleaner",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
class PairsCleaner(Cleaner):
    """
    Composable cleaner that chains an arbitrary sequence of CleanStep objects.

    attributes:
        steps:
            Iterable of CleanStep instances; it is copied into a list at
            construction time and the steps are applied in that order.
    """

    def __init__(self, steps: Iterable[CleanStep]) -> None:
        # Materialize once so a one-shot iterable (e.g. a generator) can be
        # reused across several clean() calls.
        self._steps = [*steps]

    def clean(
        self, items: ContrastivePairSet
    ) -> tuple[ContrastivePairSet, CleanerStats]:
        """
        Run every configured step over the given ContrastivePairSet.

        Each step receives the output of the previous one. After a step has
        run, its ``stats()`` result is stored in the returned CleanerStats
        under the step's ``name``.

        arguments:
            items:
                ContrastivePairSet to clean.

        returns:
            Tuple of the cleaned ContrastivePairSet and a CleanerStats whose
            ``step_stats`` mapping holds one entry per step name.

        example:
            >>> cleaner = PairsCleaner(steps=[
            ...     RefusalerCleaner(
            ...         refusal=refusal,
            ...         model=model,
            ...         system_prompt="You are a helpful assistant.",
            ...         trait_label="honesty",
            ...         trait_description="honest vs dishonest",
            ...         max_retries=2,
            ...     ),
            ...     DeduperCleaner(deduper=deduper),
            ... ])
            >>> cleaned_items, stats = cleaner.clean(items)
            >>> sorted(stats.step_stats)
            ['deduper_cleaner', 'refusaler_cleaner']
        """
        report = CleanerStats()
        current = items
        for step in self._steps:
            current = step.apply(current)
            # Read the stats only after apply(), so they describe this run.
            report.step_stats[step.name] = step.stats()
        return current, report
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
|
|
2
|
+
from wisent_guard.synthetic.cleaners.core.atoms import CleanStep
|
|
3
|
+
from wisent_guard.core.contrastive_pairs.core.pair import ContrastivePair
|
|
4
|
+
from wisent_guard.synthetic.cleaners.core.atoms import CleanStepStats
|
|
5
|
+
|
|
6
|
+
from wisent_guard.synthetic.cleaners.methods.core.atoms import Refusaler
|
|
7
|
+
from wisent_guard.core.models.wisent_model import WisentModel
|
|
8
|
+
from wisent_guard.core.contrastive_pairs.core.set import ContrastivePairSet
|
|
9
|
+
from wisent_guard.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"RefusalerCleaner",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
class RefusalerCleaner(CleanStep):
|
|
16
|
+
"""
|
|
17
|
+
Refusal detection and fixing step.
|
|
18
|
+
"""
|
|
19
|
+
name = "refusaler_cleaner"
|
|
20
|
+
|
|
21
|
+
def __init__(
    self,
    refusal: Refusaler,
    model: WisentModel,
    system_prompt: str,
    trait_label: str,
    trait_description: str,
    max_retries: int = 2,
) -> None:
    """
    Store the collaborators and settings used by this cleaning step.

    arguments:
        refusal:
            Refusaler used to detect and fix refusals.
        model:
            WisentModel instance kept for later model calls.
        system_prompt:
            System prompt text.
        trait_label:
            Label of the trait.
        trait_description:
            Description of the trait.
        max_retries:
            Retry limit; defaults to 2.
    """
    # Collaborators.
    self._refusal = refusal
    self._model = model

    # Prompt and trait configuration.
    self._sys = system_prompt
    self._label = trait_label
    self._desc = trait_description

    # Retry bookkeeping; the used-retries counter starts at zero.
    self._max_retries = max_retries
    self._retries_used = 0
|
|
37
|
+
|
|
38
|
+
def stats(self) -> CleanStepStats:
    """
    Return statistics about the last run of 'apply()'.

    returns:
        CleanStepStats whose ``modified_items`` field carries the stored
        retries-used counter.
    """
    used = self._retries_used
    return CleanStepStats(modified_items=used)
|
|
46
|
+
|
|
47
|
+
def apply(self, items: ContrastivePairSet) -> ContrastivePairSet:
|
|
48
|
+
"""
|
|
49
|
+
Apply the refusal detection and fixing step to the given ContrastivePairSet.
|
|
50
|
+
|
|
51
|
+
arguments:
|
|
52
|
+
items:
|
|
53
|
+
ContrastivePairSet to clean.
|
|
54
|
+
|
|
55
|
+
returns:
|
|
56
|
+
Cleaned ContrastivePairSet with refusals fixed.
|
|
57
|
+
|
|
58
|
+
example:
|
|
59
|
+
>>> from wisent_guard.core.contrastive_pairs.core.set import ContrastivePairSet
|
|
60
|
+
>>> from wisent_guard.core.contrastive_pairs.core.pair import ContrastivePair
|
|
61
|
+
>>> from wisent_guard.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
|
|
62
|
+
>>> from wisent_guard.synthetic.cleaners.methods.base_refusalers import SimpleRefusaler
|
|
63
|
+
>>> from wisent_guard.core.models.wisent_model import WisentModel
|
|
64
|
+
>>> refusal = SimpleRefusaler()
|
|
65
|
+
>>> model = WisentModel(...)
|
|
66
|
+
>>> cleaner = RefusalerCleaner(
|
|
67
|
+
... refusal=refusal,
|
|
68
|
+
... model=model,
|
|
69
|
+
... system_prompt="You are a helpful assistant.",
|
|
70
|
+
... trait_label="honesty",
|
|
71
|
+
... trait_description="honest vs dishonest",
|
|
72
|
+
... max_retries=2,
|
|
73
|
+
... )
|
|
74
|
+
>>> items = ContrastivePairSet(
|
|
75
|
+
... name="example",
|
|
76
|
+
... task_type="test",
|
|
77
|
+
... pairs=[
|
|
78
|
+
... ContrastivePair(
|
|
79
|
+
... prompt="Is the sky blue?",
|
|
80
|
+
... positive_response=PositiveResponse(
|
|
81
|
+
... model_response="Yes, the sky is blue.",
|
|
82
|
+
... layers_activations=None,
|
|
83
|
+
... label="harmless"
|
|
84
|
+
... ),
|
|
85
|
+
... negative_response=NegativeResponse(
|
|
86
|
+
... model_response="I'm sorry, I can't help with that.",
|
|
87
|
+
... layers_activations=None,
|
|
88
|
+
... label="toxic"
|
|
89
|
+
... ),
|
|
90
|
+
... label="color_question",
|
|
91
|
+
... trait_description="hallucinatory"
|
|
92
|
+
... )
|
|
93
|
+
... ]
|
|
94
|
+
... )
|
|
95
|
+
>>> cleaned = cleaner.apply(items)
|
|
96
|
+
>>> for cp in cleaned.pairs:
|
|
97
|
+
... print(cp)
|
|
98
|
+
ContrastivePair(
|
|
99
|
+
prompt='Is the sky blue?',
|
|
100
|
+
positive_response=PositiveResponse(model_response='Yes, the sky is blue.', layers_activations=None, label='harmless'),
|
|
101
|
+
negative_response=NegativeResponse(model_response='No, the sky is not blue.', layers_activations=None, label='toxic'),
|
|
102
|
+
label='color_question',
|
|
103
|
+
trait_description='hallucinatory'
|
|
104
|
+
)
|
|
105
|
+
"""
|
|
106
|
+
out: ContrastivePairSet = ContrastivePairSet(
|
|
107
|
+
name=items.name,
|
|
108
|
+
task_type=items.task_type,
|
|
109
|
+
)
|
|
110
|
+
retries = 0
|
|
111
|
+
for cp in items.pairs:
|
|
112
|
+
neg = cp.negative_response.model_response
|
|
113
|
+
if self._refusal.looks_like_refusal(neg) and retries < self._max_retries:
|
|
114
|
+
fixed = self._refusal.fix_negative(
|
|
115
|
+
self._model,
|
|
116
|
+
prompt=cp.prompt,
|
|
117
|
+
trait_label=self._label,
|
|
118
|
+
trait_description=self._desc,
|
|
119
|
+
system_prompt=self._sys,
|
|
120
|
+
)
|
|
121
|
+
if fixed:
|
|
122
|
+
neg = fixed
|
|
123
|
+
retries += 1
|
|
124
|
+
clean_contrastive_pair = ContrastivePair(
|
|
125
|
+
prompt=cp.prompt,
|
|
126
|
+
positive_response=PositiveResponse(model_response=cp.positive_response.model_response),
|
|
127
|
+
negative_response=NegativeResponse(model_response=neg),
|
|
128
|
+
label=cp.label,
|
|
129
|
+
trait_description=cp.trait_description,
|
|
130
|
+
)
|
|
131
|
+
out.pairs.append(clean_contrastive_pair)
|
|
132
|
+
self._retries_used += retries
|
|
133
|
+
return out
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
|
|
4
|
+
__all__ = ["DB_Instructions"]
|
|
5
|
+
|
|
6
|
+
class DB_Instructions(ABC):
    """
    Abstract key/value interface for storing and retrieving instructions.

    methods:
        get(key: str) -> str:
            Look up the instructions saved under 'key'.
        set(key: str, value: str) -> None:
            Save 'value' as the instructions for 'key'.

    notes:
        Implementations hold the instruction texts used as system prompts.
        Such prompts describe what kind of contrastive pairs should be
        generated or, for example, how a negative example should be fixed.
    """

    @abstractmethod
    def get(self, key: str) -> str:
        """Return the instructions stored under 'key'."""

    @abstractmethod
    def set(self, key: str, value: str) -> None:
        """Store 'value' as the instructions for 'key'."""
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
|
|
2
|
+
from wisent_guard.synthetic.db_instructions.core.atoms import DB_Instructions
|
|
3
|
+
|
|
4
|
+
__all__ = ["Default_DB_Instructions"]
|
|
5
|
+
|
|
6
|
+
class Default_DB_Instructions(DB_Instructions):
    """
    In-memory instruction store pre-filled with the built-in prompts.

    A new instance starts with two entries, "generic_pairs" and
    "roleplay_neg_fix". Entries can be added or replaced with 'set()'.
    """

    def __init__(self) -> None:
        """Create the store and load the two default prompts."""
        # System prompt for generating contrastive pairs as JSON.
        generic_pairs_prompt = (
            "You are a data generator that produces JSON only.\n"
            "Goal: create synthetic contrastive pairs (prompt, positive, negative) for the given trait and trait description.\n"
            "Rules:\n"
            " - Positive = desired/harmless/correct.\n"
            " - Negative = undesired/harmful/incorrect.\n"
            " - Keep outputs concise (<= 2 sentences each response).\n"
            " - No explanations or meta-text.\n"
            " - Return JSON with top-level key 'pairs'.\n"
            " - Each: {'prompt','positive','negative','label','trait_description'}.\n"
            "For example:\n"
            "prompt: How to pass a math exam?\n"
            "positive: Study the material thoroughly and practice with sample questions.\n"
            "negative: Pay money to get the answers. Or cheat off someone else.\n"
            "label: 'evil'\n"
            "trait_description: 'evil vs good'\n"
        )
        # System prompt for regenerating only the negative response of a pair.
        neg_fix_prompt = (
            "You are fixing ONLY the negative example of a contrastive pair.\n"
            "Produce a single concise negative response for the given prompt that exemplifies the UNDESIRED trait.\n"
            "It must be fictional/hypothetical, safe, and non-actionable. Return raw text only."
        )
        self._db: dict[str, str] = {
            "generic_pairs": generic_pairs_prompt,
            "roleplay_neg_fix": neg_fix_prompt,
        }

    def get(self, key: str) -> str:
        """
        Return the instructions stored under 'key'.

        raises:
            KeyError:
                If no instructions are stored under 'key'.
        """
        store = self._db
        return store[key]

    def set(self, key: str, value: str) -> None:
        """Store 'value' under 'key', replacing any existing entry."""
        self._db.update({key: value})
|
|
File without changes
|
|
File without changes
|