wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (237) hide show
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.2.dist-info/METADATA +67 -0
  215. wisent-0.5.2.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,286 @@
1
+ from wisent.synthetic.cleaners.methods.core.atoms import Refusaler
2
+ from wisent.core.models.wisent_model import WisentModel
3
+
4
+ import re, unicodedata
5
+
6
+ __all__ = [
7
+ "BaseRefusaler",
8
+ ]
9
+
10
class BaseRefusaler(Refusaler):
    """
    Regex-based refusal detection and fixing step.

    The detector is one verbose, case-insensitive regular expression made of
    named alternatives ("families"). Every family has a weight in
    `_FAMILY_WEIGHTS`; a text counts as a refusal when its score reaches
    `_THRESHOLD`.
    """

    # Straight or typographic apostrophe, so "can't" and "can’t" both match.
    _APOS = r"['\u2019]"

    # Up to two short filler words between the modal and the action verb
    # (e.g. "I can't *really* help").
    _FILLER = r"(?:\b\w{1,15}\b\s+){0,2}"

    # Every alternative carries an explicit negation. A bare "can", "should"
    # or "must" must not match, otherwise "I can help" is scored as a refusal.
    _NEG_MODAL = (
        rf"(?:can\s*not|can{_APOS}t|won{_APOS}t|will\s+not|"
        rf"should\s+not|shouldn{_APOS}t|must\s+not|mustn{_APOS}t)"
    )
    # "I am" or the contraction "I'm" (the contraction has no inner whitespace).
    _AM = rf"(?:\bi\s+am\b|\bi{_APOS}m\b)"
    _I = r"\bi\b"
    _I_M = rf"(?:i{_APOS}m)"
    _VERB_ACTION = (
        r"(?:help|assist|provide|share|give|supply|offer|"
        r"comply(?:\s+with)?|fulfil?l|answer|respond|"
        r"create|generate|write|compose|produce|draft|"
        r"explain|instruct|guide|walk\s+you\s+through|"
        r"facilitate|bypass|circumvent|enable|"
        r"do\s+(?:that|this|so))"
    )
    _SUPPORT_VERBS = r"(?:support|condone|endorse|promote|encourage)"
    _ABILITY_NOUNS = r"(?:ability|capability|capacity|permission|authorization|access)"

    # Only the family groups are capturing; all helper fragments above use
    # non-capturing groups, so `Match.lastgroup` is always the family name.
    _REFUSAL_RE = re.compile(
        rf"""
        (?:
            # A) "As an AI (language) model..." disclaimers
            (?P<ai_disclaimer>\bas\s+an?\s+(?:ai|assistant)(?:\s+language)?\s+model\b)

          | # B) Policy/legal/safety invocations
            (?P<policy>
                \b(?:this|that|it)\s+would\s+be\s+(?:unethical|unsafe|illegal|harmful)\b
              |
                \bagainst\s+(?:policy|the\s+rules|guidelines|terms(?:\s+of\s+service)?)\b
              |
                \b(?:due\s+to|for)\s+(?:safety|legal|policy|ethical)\s+(?:reasons|concerns)\b
            )

          | # C) Apologies / hedges commonly preceding refusals
            (?P<apology_hedge>
                \b(?:sorry|apologies|unfortunately|regrettably)\b
              |
                {_AM}\s+afraid\b
            )

          | # D) Explicit inability / not permitted / not comfortable
            (?P<unable>
                {_AM}\s+(?:unable|not\s+able|not\s+allowed|not\s+permitted|not\s+comfortable)\s+to\b
            )

          | # E) "I [negation] [filler] [help/provide/...]"
            (?P<cannot_action>
                {_I}\s+{_NEG_MODAL}\s+{_FILLER}{_VERB_ACTION}\b
            )

          | # F) Prefer/rather not (indirect refusal)
            (?P<prefer_rather>
                \b(?:i{_APOS}d|i\s+would)\s+(?:rather|prefer)\s+not\s+(?:to\s+)?\w+\b
            )

          | # G) Decline/refuse verbs
            (?P<decline_refuse>
                {_I}\s+(?:must|have\s+to|will)\s+(?:decline|refuse)\s+to\b
              |
                \b(?:i\s+decline|i\s+refuse)\b
            )

          | # H) "I don't/won't [support|condone|endorse|promote] ..."
            (?P<no_support>
                {_I}\s+(?:do\s+not|don{_APOS}t|won{_APOS}t|will\s+not)\s+{_SUPPORT_VERBS}\b
            )

          | # I) "I do not have the [ability/capability/permission/access] to ..."
            (?P<no_ability>
                {_I}\s+(?:do\s+not|don{_APOS}t)\s+have\s+the\s+{_ABILITY_NOUNS}\s+to\b
            )

          | # J) Direct lexical hits
            (?P<refusal_word>\brefus(?:e|al)\b)
        )
        """,
        re.VERBOSE | re.IGNORECASE | re.UNICODE,
    )

    _FAMILY_WEIGHTS = {
        "ai_disclaimer": 0.9,
        "policy": 0.9,
        "apology_hedge": 0.4,
        "unable": 0.9,
        "cannot_action": 1.0,
        "prefer_rather": 0.6,
        "decline_refuse": 0.9,
        "no_support": 0.8,
        "no_ability": 0.8,
        "refusal_word": 0.6,
    }
    _THRESHOLD = 0.9

    @staticmethod
    def _normalize(text: str) -> str:
        """
        Normalize text for refusal detection.

        Applies Unicode NFKC normalization (compatibility characters such as
        a no-break space become their plain equivalents) and strips leading
        and trailing whitespace.

        arguments:
            text:
                Input text string.

        returns:
            Normalized text string.

        example:
            >>> BaseRefusaler._normalize("  This is a test.  ")
            'This is a test.'
            >>> BaseRefusaler._normalize("Café")
            'Café'
        """
        return unicodedata.normalize("NFKC", text).strip()

    def score_refusal(self, text: str) -> tuple[float, str | None, str]:
        """
        Score how strongly the text looks like a refusal.

        All non-overlapping matches are inspected, so several families can
        fire on one text. The score is the highest family weight; a bonus of
        0.1 (capped at 1.0) is added when an apology/hedge co-occurs with at
        least one other family.

        arguments:
            text:
                Input text string.

        returns:
            score:
                Refusal likelihood score between 0.0 and 1.0.
            top_family:
                Name of the highest-weighted matched refusal family, or None.
            snippet:
                The matched text of that family, or empty string.

        example:
            >>> rp = BaseRefusaler()
            >>> rp.score_refusal("As an AI model, I can't help with that.")
            (1.0, 'cannot_action', "I can't help")
            >>> rp.score_refusal("Sorry, I am unable to assist with that.")
            (1.0, 'unable', 'I am unable to')
            >>> rp.score_refusal("I don't support that.")
            (0.8, 'no_support', "I don't support")
            >>> rp.score_refusal("I can help with that.")
            (0.0, None, '')
        """
        normalized = self._normalize(text)

        best_family: str | None = None
        best_weight = 0.0
        best_snippet = ""
        fired: set[str] = set()

        # `search()` would stop at the first hit (often a low-weight "Sorry"),
        # hiding the stronger refusal phrase that follows it.
        for match in self._REFUSAL_RE.finditer(normalized):
            family = match.lastgroup
            if family is None:
                continue
            fired.add(family)
            weight = self._FAMILY_WEIGHTS.get(family, 0.5)
            if weight > best_weight:
                best_family, best_weight, best_snippet = family, weight, match.group(0)

        if best_family is None:
            return 0.0, None, ""

        apology_with_other = "apology_hedge" in fired and len(fired) > 1
        bonus = 0.1 if apology_with_other else 0.0
        return min(1.0, best_weight + bonus), best_family, best_snippet

    def looks_like_refusal(self, text: str) -> bool:
        """
        Detect if the text looks like a refusal.

        arguments:
            text:
                Input text string.

        returns:
            True if the refusal score reaches `_THRESHOLD`, False otherwise.

        example:
            >>> rp = BaseRefusaler()
            >>> rp.looks_like_refusal("As an AI model, I can't help with that.")
            True
            >>> rp.looks_like_refusal("Sorry, I am unable to assist with that.")
            True
            >>> rp.looks_like_refusal("I don't support that.")
            False
            >>> rp.looks_like_refusal("I can help with that.")
            False
        """
        score, *_ = self.score_refusal(text)
        return score >= self._THRESHOLD

    def why_refusal(self, text: str) -> tuple[str, str] | None:
        """
        Explain which refusal family matched the text.

        Any match is reported, including matches whose score is below
        `_THRESHOLD`.

        arguments:
            text:
                Input text string.

        returns:
            (family, snippet) tuple for the top match, or None when no
            refusal pattern matches at all.

        example:
            >>> rp = BaseRefusaler()
            >>> rp.why_refusal("I don't support that.")
            ('no_support', "I don't support")
            >>> rp.why_refusal("I support that.") is None
            True
        """
        score, fam, snip = self.score_refusal(text)
        if score == 0.0:
            return None
        return fam or "unknown", snip

    def fix_negative(
        self,
        model: WisentModel,
        generation_conf: dict | None = None,
        prompt: str = "",
        trait_label: str = "",
        trait_description: str = "",
        system_prompt: str = "",
    ) -> str:
        """
        Attempt to fix a refusal negative example by re-prompting the model.

        arguments:
            model:
                WisentModel instance to call.
            generation_conf:
                Optional dict with 'max_tokens', 'temperature' and 'top_p';
                missing keys fall back to 256, 1.0 and 1.0. It is optional so
                that callers using the Refusaler interface signature (which
                has no generation_conf) keep working.
            prompt:
                The original prompt text.
            trait_label:
                Label of the undesired trait.
            trait_description:
                Description of the undesired trait.
            system_prompt:
                System prompt to use for the model call.

        returns:
            New negative example text, or empty string if still a refusal.
        """
        conf = generation_conf or {}
        msgs = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f"Prompt: {prompt}\nTrait label: {trait_label}\nTrait description: {trait_description}",
            },
        ]
        neg_trial = model.generate(
            inputs=[msgs],
            max_tokens=conf.get("max_tokens", 256),
            temperature=conf.get("temperature", 1.0),
            use_steering=False,
            top_p=conf.get("top_p", 1.0),
        )
        # NOTE(review): generate() is called with a batch of one conversation;
        # if it returns a sequence of completions, unwrap the single element so
        # the refusal check receives a string. Confirm against WisentModel.generate.
        if isinstance(neg_trial, (list, tuple)):
            neg_trial = neg_trial[0] if neg_trial else ""
        return "" if self.looks_like_refusal(neg_trial) else neg_trial
286
+
File without changes
@@ -0,0 +1,47 @@
1
+
2
+ from abc import ABC, abstractmethod
3
+
4
+ from wisent.core.models.wisent_model import WisentModel
5
+ from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
6
+
7
+
8
class Refusaler(ABC):
    """
    Interface for a refusal detection and fixing step.

    methods:
        looks_like_refusal(text: str) -> bool:
            Decide whether the text is a refusal. Negatives are meant to exhibit the
            undesired trait (e.g. "evil" behavior), not a refusal such as
            "As an AI model, I cannot help with that."
        fix_negative(
            model: WisentModel,
            prompt: str,
            trait_label: str,
            trait_description: str,
            system_prompt: str,
        ) -> str:
            Try to replace a refusing negative example by re-prompting the model with
            the given system prompt (an implementation may, for example, raise the
            temperature or change the wording to obtain a non-refusal response).
    """

    @abstractmethod
    def looks_like_refusal(self, text: str) -> bool:
        """Return True when `text` looks like a refusal."""

    @abstractmethod
    def fix_negative(
        self,
        model: WisentModel,
        prompt: str,
        trait_label: str,
        trait_description: str,
        system_prompt: str,
    ) -> str:
        """Return a replacement negative response for `prompt`."""
37
+
38
class Deduper(ABC):
    """
    Interface for a deduplication step.

    methods:
        dedupe(items: ContrastivePairSet) -> ContrastivePairSet:
            Return the given ContrastivePairSet with duplicate items removed.
    """

    @abstractmethod
    def dedupe(self, items: ContrastivePairSet) -> ContrastivePairSet:
        """Remove duplicate items from `items`."""
@@ -0,0 +1,90 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable, TYPE_CHECKING
4
+
5
+ from wisent.synthetic.cleaners.core.atoms import CleanStep, Cleaner
6
+ from wisent.synthetic.cleaners.core.atoms import CleanerStats
7
+ from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
8
+
9
+ __all__ = [
10
+ "PairsCleaner",
11
+ ]
12
+
13
class PairsCleaner(Cleaner):
    """
    Composable cleaner that runs a sequence of CleanStep objects.

    attributes:
        steps:
            Iterable of CleanStep instances; it is copied into a list and
            the steps are applied in that order.
    """

    def __init__(self, steps: Iterable[CleanStep]) -> None:
        self._steps = [*steps]

    def clean(
        self, items: ContrastivePairSet
    ) -> tuple[ContrastivePairSet, CleanerStats]:
        """
        Run every step of the pipeline over the given ContrastivePairSet.

        Each step receives the output of the previous one. After a step has
        run, its `stats()` result is stored in `CleanerStats.step_stats`
        under the step's `name`.

        arguments:
            items:
                ContrastivePairSet to clean.

        returns:
            Tuple of the cleaned ContrastivePairSet and the CleanerStats
            collected while cleaning.

        example:
            >>> # refusaler_step and deduper_step are CleanStep instances
            >>> cleaner = PairsCleaner(steps=[refusaler_step, deduper_step])
            >>> cleaned_items, stats = cleaner.clean(items)
            >>> stats.step_stats[refusaler_step.name]  # stats of that step
        """
        collected = CleanerStats()
        current = items
        for step in self._steps:
            current = step.apply(current)
            collected.step_stats[step.name] = step.stats()
        return current, collected
@@ -0,0 +1,133 @@
1
+
2
+ from wisent.synthetic.cleaners.core.atoms import CleanStep
3
+ from wisent.core.contrastive_pairs.core.pair import ContrastivePair
4
+ from wisent.synthetic.cleaners.core.atoms import CleanStepStats
5
+
6
+ from wisent.synthetic.cleaners.methods.core.atoms import Refusaler
7
+ from wisent.core.models.wisent_model import WisentModel
8
+ from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
9
+ from wisent.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
10
+
11
+ __all__ = [
12
+ "RefusalerCleaner",
13
+ ]
14
+
15
class RefusalerCleaner(CleanStep):
    """
    Refusal detection and fixing step.

    For every pair whose negative response looks like a refusal, the
    configured Refusaler is asked to produce a replacement negative.
    """
    name = "refusaler_cleaner"

    def __init__(
        self,
        refusal: Refusaler,
        model: WisentModel,
        system_prompt: str,
        trait_label: str,
        trait_description: str,
        max_retries: int = 2,
    ) -> None:
        """
        arguments:
            refusal:
                Refusaler used to detect refusals and to generate fixes.
            model:
                WisentModel passed through to `refusal.fix_negative`.
            system_prompt:
                System prompt passed through to `refusal.fix_negative`.
            trait_label:
                Label of the undesired trait.
            trait_description:
                Description of the undesired trait.
            max_retries:
                Maximum number of fix attempts for each refusing pair.
        """
        self._refusal = refusal
        self._model = model
        self._sys = system_prompt
        self._label = trait_label
        self._desc = trait_description
        self._max_retries = max_retries
        self._retries_used = 0

    def stats(self) -> CleanStepStats:
        '''
        Return statistics about the last run of 'apply()'.

        returns:
            CleanStepStats whose 'modified_items' is the number of fix
            attempts made during the last run.
        '''
        return CleanStepStats(modified_items=self._retries_used)

    def apply(self, items: ContrastivePairSet) -> ContrastivePairSet:
        """
        Apply the refusal detection and fixing step to the given ContrastivePairSet.

        A refusing negative is re-generated up to `max_retries` times; the
        first non-empty fix replaces it. If every attempt fails, the original
        negative is kept. Output pairs are rebuilt from the prompt, the two
        response texts, the label and the trait description.

        arguments:
            items:
                ContrastivePairSet to clean.

        returns:
            New ContrastivePairSet (same name and task_type) with refusals
            fixed where possible.

        example:
            >>> from wisent.synthetic.cleaners.methods.base_refusalers import BaseRefusaler
            >>> cleaner = RefusalerCleaner(
            ...     refusal=BaseRefusaler(),
            ...     model=model,
            ...     system_prompt="You are a helpful assistant.",
            ...     trait_label="honesty",
            ...     trait_description="honest vs dishonest",
            ...     max_retries=2,
            ... )
            >>> cleaned = cleaner.apply(items)
        """
        out: ContrastivePairSet = ContrastivePairSet(
            name=items.name,
            task_type=items.task_type,
        )
        attempts = 0
        for cp in items.pairs:
            neg = cp.negative_response.model_response
            if self._refusal.looks_like_refusal(neg):
                # The retry budget is per pair: a shared budget would leave
                # every refusal after the first `max_retries` ones unfixed.
                for _ in range(self._max_retries):
                    attempts += 1
                    fixed = self._refusal.fix_negative(
                        self._model,
                        prompt=cp.prompt,
                        trait_label=self._label,
                        trait_description=self._desc,
                        system_prompt=self._sys,
                    )
                    if fixed:
                        neg = fixed
                        break
            clean_contrastive_pair = ContrastivePair(
                prompt=cp.prompt,
                positive_response=PositiveResponse(model_response=cp.positive_response.model_response),
                negative_response=NegativeResponse(model_response=neg),
                label=cp.label,
                trait_description=cp.trait_description,
            )
            out.pairs.append(clean_contrastive_pair)
        # Overwrite rather than accumulate so stats() reports the last run only.
        self._retries_used = attempts
        return out
File without changes
File without changes
@@ -0,0 +1,25 @@
1
+
2
+ from abc import ABC, abstractmethod
3
+
4
+ __all__ = ["DB_Instructions"]
5
+
6
class DB_Instructions(ABC):
    """
    Interface for storing and retrieving instruction texts by key.

    methods:
        get(key: str) -> str:
            Retrieve instructions by key.
        set(key: str, value: str) -> None:
            Store instructions by key.

    notes:
        This is the abstract base class for the database of instructions used
        as system prompts. A system prompt describes what kind of contrastive
        pairs should be generated, or, for example, how a negative example
        should be fixed.
    """

    @abstractmethod
    def get(self, key: str) -> str:
        """Return the instructions stored under `key`."""

    @abstractmethod
    def set(self, key: str, value: str) -> None:
        """Store `value` as the instructions for `key`."""
@@ -0,0 +1,37 @@
1
+
2
+ from wisent.synthetic.db_instructions.core.atoms import DB_Instructions
3
+
4
+ __all__ = ["Default_DB_Instructions"]
5
+
6
class Default_DB_Instructions(DB_Instructions):
    """
    In-memory DB_Instructions backed by a dict.

    A new instance starts with two entries: "generic_pairs" (system prompt for
    generating contrastive pairs) and "roleplay_neg_fix" (system prompt for
    fixing a negative example).
    """

    def __init__(self) -> None:
        generic_pairs_prompt = (
            "You are a data generator that produces JSON only.\n"
            "Goal: create synthetic contrastive pairs (prompt, positive, negative) for the given trait and trait description.\n"
            "Rules:\n"
            " - Positive = desired/harmless/correct.\n"
            " - Negative = undesired/harmful/incorrect.\n"
            " - Keep outputs concise (<= 2 sentences each response).\n"
            " - No explanations or meta-text.\n"
            " - Return JSON with top-level key 'pairs'.\n"
            " - Each: {'prompt','positive','negative','label','trait_description'}.\n"
            "For example:\n"
            "prompt: How to pass a math exam?\n"
            "positive: Study the material thoroughly and practice with sample questions.\n"
            "negative: Pay money to get the answers. Or cheat off someone else.\n"
            "label: 'evil'\n"
            "trait_description: 'evil vs good'\n"
        )
        negative_fix_prompt = (
            "You are fixing ONLY the negative example of a contrastive pair.\n"
            "Produce a single concise negative response for the given prompt that exemplifies the UNDESIRED trait.\n"
            "It must be fictional/hypothetical, safe, and non-actionable. Return raw text only."
        )
        self._db: dict[str, str] = {
            "generic_pairs": generic_pairs_prompt,
            "roleplay_neg_fix": negative_fix_prompt,
        }

    def get(self, key: str) -> str:
        """Return the instructions stored under `key`; an unknown key raises KeyError."""
        instructions = self._db[key]
        return instructions

    def set(self, key: str, value: str) -> None:
        """Store `value` under `key`, replacing any existing entry."""
        self._db.update({key: value})
File without changes
File without changes