wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic.
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.2.dist-info/METADATA +67 -0
- wisent-0.5.2.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+import logging
+from dataclasses import dataclass
+
+__all__ = [
+    "Diversity",
+    "DiversityScores",
+    "GenerationReport",
+]
+
+logger = logging.getLogger(__name__)
+
+@dataclass(frozen=True)
+class DiversityScores:
+    """
+    Diversity metrics for a list of texts.
+
+    attributes:
+        unique_unigrams:
+            Unique unigrams (ratio of unique unigrams to total unigrams).
+        unique_bigrams:
+            Unique bigrams (ratio of unique bigrams to total bigrams).
+        avg_jaccard_prompt:
+            Average Jaccard similarity between all pairs of texts.
+        mean_simhash_hamming_prompt:
+            Mean Hamming distance between SimHash fingerprints of all pairs of texts.
+        min_simhash_hamming_prompt:
+            Minimum Hamming distance between SimHash fingerprints of all pairs of texts.
+    """
+    unique_unigrams: float
+    unique_bigrams: float
+    avg_jaccard_prompt: float
+    mean_simhash_hamming_prompt: float
+    min_simhash_hamming_prompt: int
+
+@dataclass(frozen=True)
+class GenerationReport:
+    """
+    Report of a generation+cleaning run.
+
+    attributes:
+        requested:
+            Number of contrastive pairs requested from the model.
+        kept_after_dedupe:
+            Number of contrastive pairs kept after deduplication.
+        retries_for_refusals:
+            Number of retries made to fix refusals in negative examples.
+        diversity:
+            DiversityScores computed on the final prompts.
+
+    notes:
+        LLMs may refuse to generate negative examples that exhibit undesired traits. Increasing
+        'max_refusal_retries' in the generator can help mitigate this, but increases cost and latency.
+    """
+    requested: int
+    kept_after_dedupe: int
+    retries_for_refusals: int
+    diversity: DiversityScores
+
+
+class Diversity(ABC):
+    """
+    Diversity metrics computation.
+
+    methods:
+        compute(texts: list[str]) -> DiversityScores:
+            Compute diversity metrics for a list of texts. We can compute Distinct-N, Jaccard similarity,
+            and SimHash Hamming distance to get a sense of lexical and structural diversity.
+    """
+    @abstractmethod
+    def compute(self, texts: list[str]) -> DiversityScores: ...
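
The two frozen dataclasses above are plain value objects, so they can be exercised directly. A minimal sketch (the module path follows the file list above; the numbers are illustrative only, not real measurements):

    from wisent.synthetic.generators.core.atoms import DiversityScores, GenerationReport

    scores = DiversityScores(
        unique_unigrams=0.75,           # 3 unique of 4 total unigrams
        unique_bigrams=1.0,             # every bigram distinct
        avg_jaccard_prompt=0.33,        # low pairwise token overlap
        mean_simhash_hamming_prompt=21.3,
        min_simhash_hamming_prompt=20,
    )
    report = GenerationReport(requested=10, kept_after_dedupe=8,
                              retries_for_refusals=1, diversity=scores)
    # frozen=True makes both immutable: reassigning report.requested
    # raises dataclasses.FrozenInstanceError.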
File without changes

File without changes
@@ -0,0 +1,68 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+__all__ = [
+    "Diversity",
+    "DiversityScores",
+    "GenerationReport",
+]
+
+@dataclass(frozen=True)
+class DiversityScores:
+    """
+    Diversity metrics for a list of texts.
+
+    attributes:
+        unique_unigrams:
+            Unique unigrams (ratio of unique unigrams to total unigrams).
+        unique_bigrams:
+            Unique bigrams (ratio of unique bigrams to total bigrams).
+        avg_jaccard_prompt:
+            Average Jaccard similarity between all pairs of texts.
+        mean_simhash_hamming_prompt:
+            Mean Hamming distance between SimHash fingerprints of all pairs of texts.
+        min_simhash_hamming_prompt:
+            Minimum Hamming distance between SimHash fingerprints of all pairs of texts.
+    """
+    unique_unigrams: float
+    unique_bigrams: float
+    avg_jaccard_prompt: float
+    mean_simhash_hamming_prompt: float
+    min_simhash_hamming_prompt: int
+
+@dataclass(frozen=True)
+class GenerationReport:
+    """
+    Report of a generation+cleaning run.
+
+    attributes:
+        requested:
+            Number of contrastive pairs requested from the model.
+        kept_after_dedupe:
+            Number of contrastive pairs kept after deduplication.
+        retries_for_refusals:
+            Number of retries made to fix refusals in negative examples.
+        diversity:
+            DiversityScores computed on the final prompts.
+
+    notes:
+        LLMs may refuse to generate negative examples that exhibit undesired traits. Increasing
+        'max_refusal_retries' in the generator can help mitigate this, but increases cost and latency.
+    """
+    requested: int
+    kept_after_dedupe: int
+    retries_for_refusals: int
+    diversity: DiversityScores
+
+
+class Diversity(ABC):
+    """
+    Diversity metrics computation.
+
+    methods:
+        compute(texts: list[str]) -> DiversityScores:
+            Compute diversity metrics for a list of texts. We can compute Distinct-N, Jaccard similarity,
+            and SimHash Hamming distance to get a sense of lexical and structural diversity.
+    """
+    @abstractmethod
+    def compute(self, texts: list[str]) -> DiversityScores: ...
File without changes
@@ -0,0 +1,249 @@
+from __future__ import annotations
+from typing import Iterable
+import re
+import numpy as np
+from wisent.synthetic.generators.diversities.core.core import Diversity, DiversityScores
+
+__all__ = [
+    "FastDiversity",
+]
+
+class FastDiversity(Diversity):
+    """
+    Fast diversity metrics computation.
+
+    attributes:
+        _rng:
+            Random number generator for sampling.
+    """
+    _TOKEN_RE = re.compile(r"[A-Za-z0-9']+|[^\w\s]")
+
+    def __init__(self, seed: int | None = 13) -> None:
+        self._rng = np.random.default_rng(seed)
+
+    def compute(self, texts: list[str]) -> DiversityScores:
+        """
+        Compute diversity metrics for a list of texts.
+
+        arguments:
+            texts:
+                List of input strings.
+
+        returns:
+            DiversityScores dataclass with various diversity metrics.
+
+        example:
+            >>> fd = FastDiversity(seed=42)
+            >>> texts = ["hello world", "hello there", "different text"]
+            >>> fd.compute(texts)
+            DiversityScores(unique_unigrams=0.6666666666666666, unique_bigrams=1.0, avg_jaccard_prompt=0.3333333333333333, mean_simhash_hamming_prompt=21.333333333333332, min_simhash_hamming_prompt=20)
+
+        intuition:
+            Higher unique scores and lower average Jaccard indicate more lexical diversity.
+            Higher mean and minimum SimHash Hamming distances indicate more structural diversity.
+        """
+        d1 = self._distinct_n(texts, 1)
+        d2 = self._distinct_n(texts, 2)
+        if len(texts) <= 20:
+            sample = texts
+        else:
+            # np.random.Generator has no .sample(); draw 20 texts without replacement via choice()
+            idx = self._rng.choice(len(texts), size=20, replace=False)
+            sample = [texts[int(i)] for i in idx]
+        if len(sample) >= 2:
+            jaccs: list[float] = []
+            fps: list[int] = []
+            for i, s in enumerate(sample):
+                fps.append(self._simhash64(s))
+                for j in range(i + 1, len(sample)):
+                    jaccs.append(self._jaccard(s, sample[j]))
+            avg_j = sum(jaccs) / len(jaccs) if jaccs else 0.0
+            dists: list[int] = []
+            for i in range(len(sample)):
+                for j in range(i + 1, len(sample)):
+                    dists.append(self._hamming(fps[i], fps[j]))
+            mean_h = (sum(dists) / len(dists)) if dists else 0.0
+            min_h = min(dists) if dists else 64
+        else:
+            avg_j, mean_h, min_h = 0.0, 0.0, 64
+        return DiversityScores(d1, d2, avg_j, float(mean_h), int(min_h))
+
+    def _distinct_n(self, texts: Iterable[str], n: int) -> float:
+        """
+        Compute the Distinct-N score for a list of texts.
+
+        arguments:
+            texts:
+                Iterable of input strings.
+            n:
+                N-gram size (e.g., 1 for unigrams, 2 for bigrams).
+
+        returns:
+            Distinct-N score: ratio of unique n-grams to total n-grams.
+
+        example:
+            >>> fd = FastDiversity()
+            >>> texts = ["hello world", "hello there"]
+            >>> fd._distinct_n(texts, 1)
+            0.75
+            >>> fd._distinct_n(texts, 2)
+            1.0
+
+        intuition:
+            Distinct-N = (number of unique n-grams) / (total number of n-grams)
+            Higher values indicate more lexical diversity. For example, in ["hello world", "hello there"]:
+            Unigrams:
+                ['hello', 'world', 'hello', 'there'] → unique = ['hello', 'world', 'there'] → Distinct-1 = 3/4 = 0.75
+            Bigrams:
+                ['hello world', 'hello there'] → unique = ['hello world', 'hello there'] → Distinct-2 = 2/2 = 1.0
+        """
+        ngrams: list[tuple[str, ...]] = []
+        for t in texts:
+            toks = self._tok(t)
+            ngrams.extend(tuple(toks[i : i + n]) for i in range(0, max(0, len(toks) - n + 1)))
+        return (len(set(ngrams)) / float(len(ngrams))) if ngrams else 0.0
+
+    def _jaccard(self, a: str, b: str) -> float:
+        """
+        Compute Jaccard similarity between two strings.
+
+        arguments:
+            a:
+                First input string.
+            b:
+                Second input string.
+
+        returns:
+            Jaccard similarity score between 0.0 and 1.0.
+
+        example:
+            >>> fd = FastDiversity()
+            >>> fd._jaccard("hello world", "hello there")
+            0.3333333333333333
+            >>> fd._jaccard("abc", "xyz")
+            0.0
+            >>> fd._jaccard("", "")
+            1.0
+            >>> fd._jaccard("abc", "")
+            0.0
+
+        intuition:
+            Jaccard similarity = |A ∩ B| / |A ∪ B|
+            where A and B are the sets of tokens in strings a and b. Higher values indicate more overlap.
+            For example "hello world" and "hello there" share the token "hello", yielding a similarity of 1/3.
+        """
+        A, B = set(self._tok(a)), set(self._tok(b))
+        if not A and not B:
+            return 1.0
+        if not A or not B:
+            return 0.0
+        inter = len(A & B)
+        union = len(A | B)
+        return inter / union if union else 0.0
+
+    def _tok(self, s: str) -> list[str]:
+        """
+        Simple whitespace/punctuation tokenizer.
+
+        arguments:
+            s:
+                Input string to tokenize.
+
+        returns:
+            List of tokens (words and punctuation).
+        """
+        return self._TOKEN_RE.findall(s.lower())
+
+    def _hamming(self, a: int, b: int) -> int:
+        """
+        Compute the Hamming distance between two 64-bit integers.
+
+        arguments:
+            a:
+                First 64-bit integer.
+            b:
+                Second 64-bit integer.
+
+        returns:
+            Hamming distance (number of differing bits).
+
+        example:
+            >>> fd = FastDiversity()
+            >>> fd._hamming(0b1010, 0b1001)
+            2
+            >>> fd._hamming(0b1111, 0b1111)
+            0
+            >>> fd._hamming(0, 0xFFFFFFFFFFFFFFFF)
+            64
+
+        intuition:
+            Hamming distance counts the number of bit positions where two integers differ.
+            For example, 0b1010 (10) and 0b1001 (9) differ in two bit positions, yielding a distance of 2.
+            The maximum distance for 64-bit integers is 64 (completely different), and identical integers have a distance of 0.
+        """
+        x = a ^ b
+        c = 0
+        while x:
+            x &= x - 1
+            c += 1
+        return c
+
+    def _hash64(self, x: str) -> int:
+        """
+        64-bit FNV-1a hash of a string.
+
+        arguments:
+            x:
+                Input string to hash.
+
+        returns:
+            64-bit integer hash value.
+
+        example:
+            >>> fd = FastDiversity()
+            >>> fd._hash64("hello")
+            11831194018420276491
+            >>> fd._hash64("world")
+            15195822415430384601
+            >>> fd._hash64("")
+            14695981039346656037
+        """
+        h = 0xCBF29CE484222325
+        for c in x.encode("utf-8"):
+            h ^= c
+            h = (h * 0x100000001B3) & 0xFFFFFFFFFFFFFFFF
+        return h
+
+    def _simhash64(self, text: str) -> int:
+        """
+        Compute the SimHash of a text.
+
+        arguments:
+            text: Input text to hash.
+
+        returns:
+            64-bit SimHash fingerprint as an integer.
+
+        example:
+            >>> fd = FastDiversity()
+            >>> fd._simhash64("hello world")
+            16204198794447330368
+            >>> fd._simhash64("hello there")
+            16204198794447330368
+            >>> fd._simhash64("different text")
+            1080863910568919040
+            >>> fd._simhash64("")
+            0
+
+        intuition:
+            SimHash is a locality-sensitive hash that maps similar texts to similar fingerprints.
+            It works by hashing features (tokens) and combining their bits based on frequency.
+            For example, "hello world" and "hello there" share the token "hello", resulting in identical SimHash values.
+            In contrast, "different text" yields a very different fingerprint. An empty string hashes to 0.
+            This makes SimHash useful for deduplication and near-duplicate detection.
+        """
+        feats = self._tok(text)
+        if not feats:
+            return 0
+        vec = [0] * 64
+        for f in feats:
+            hv = self._hash64(f)
+            for i in range(64):
+                vec[i] += 1 if (hv >> i) & 1 else -1
+        out = 0
+        for i, v in enumerate(vec):
+            if v >= 0:
+                out |= (1 << i)
+        return out
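
A minimal usage sketch for the class above (the import path follows the file list; the exact values depend on the input texts, see the docstring example):

    from wisent.synthetic.generators.diversities.methods.fast_diversity import FastDiversity

    fd = FastDiversity(seed=42)
    texts = ["hello world", "hello there", "different text"]
    scores = fd.compute(texts)

    # Higher Distinct-N and SimHash Hamming distances, and lower average
    # Jaccard, indicate a more diverse set of prompts.
    print(scores.unique_unigrams, scores.unique_bigrams)
    print(scores.avg_jaccard_prompt)
    print(scores.mean_simhash_hamming_prompt, scores.min_simhash_hamming_prompt)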
@@ -0,0 +1,179 @@
+from __future__ import annotations
+
+import logging
+
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
+from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
+from wisent.core.models.wisent_model import WisentModel
+from wisent.synthetic.db_instructions.core.atoms import DB_Instructions
+from wisent.synthetic.generators.core.atoms import GenerationReport
+from wisent.synthetic.generators.diversities.core.core import Diversity
+from wisent.synthetic.cleaners.pairs_cleaner import PairsCleaner
+
+__all__ = [
+    "SyntheticContrastivePairsGenerator",
+]
+
+logger = logging.getLogger(__name__)
+
+
+class SyntheticContrastivePairsGenerator:
+    """Small, fast contrastive-pairs generator with an extensible cleaning pipeline."""
+
+    def __init__(
+        self,
+        model: WisentModel,
+        generation_config: dict[str, int | float | str],
+        contrastive_set_name: str,
+        trait_description: str,
+        trait_label: str,
+        db_instructions: DB_Instructions,
+        cleaner: PairsCleaner,
+        diversity: Diversity,
+    ) -> None:
+        self.model = model
+        self.db_instructions = db_instructions
+        self.generation_config = generation_config
+        self.cleaner = cleaner
+        self.diversity = diversity
+
+        self.contrastive_set_name = contrastive_set_name
+        self.trait_description = trait_description
+        self.trait_label = trait_label
+
+    def generate(
+        self,
+        num_pairs: int = 10,
+    ) -> tuple[ContrastivePairSet, GenerationReport]:
+        """
+        Generate synthetic contrastive pairs for the given topic and trait.
+
+        arguments:
+            num_pairs:
+                Number of contrastive pairs to generate (default: 10).
+
+        returns:
+            Tuple of ContrastivePairSet with the generated pairs and
+            GenerationReport with statistics about the generation.
+        """
+        # 1) generate
+        sys = self.db_instructions.get("generic_pairs")
+        usr = self._build_user_prompt(
+            self.trait_label, self.trait_description, num_pairs
+        )
+        raw = self.model.generate(
+            inputs=[[
+                {"role": "system", "content": sys},
+                {"role": "user", "content": usr}
+            ]],
+            **self.generation_config,
+        )
+
+        # 2) parse
+        parsed = self.parse_pairs(raw)
+
+        # 3) clean
+        cleaned, stats = self.cleaner.clean(parsed)
+
+        # guard against the refusal-cleaning step being absent from the pipeline
+        refusal_stats = stats.step_stats.get("refusaler_cleaner")
+        retries = refusal_stats.modified_items if refusal_stats is not None else 0
+
+        # 4) build domain objects
+        cps = ContrastivePairSet(name=self.contrastive_set_name, task_type=self.trait_label)
+        for item in cleaned.pairs:
+            cps.add(
+                ContrastivePair(
+                    prompt=item.prompt,
+                    positive_response=PositiveResponse(model_response=item.positive_response.model_response),
+                    negative_response=NegativeResponse(model_response=item.negative_response.model_response),
+                    label=item.label or self.trait_label,
+                    trait_description=item.trait_description or self.trait_description,
+                )
+            )
+        # 5) diversity summary (prompts only)
+        prompts = [it.prompt for it in cleaned.pairs]
+        div = self.diversity.compute(prompts)
+
+        report = GenerationReport(
+            requested=num_pairs,
+            kept_after_dedupe=len(cleaned),
+            retries_for_refusals=retries,
+            diversity=div,
+        )
+        return cps, report
+
+    def parse_pairs(self, raw: list[str]) -> ContrastivePairSet:
+        """
+        Parse raw model outputs into a ContrastivePairSet.
+
+        arguments:
+            raw:
+                List of raw model output strings to parse.
+
+        returns:
+            ContrastivePairSet parsed from the raw strings.
+        """
+        import json
+
+        out: ContrastivePairSet = ContrastivePairSet(
+            name=self.contrastive_set_name,
+            task_type=self.trait_label,
+        )
+        for r in raw:
+            # TODO: improve robustness. The raw output may echo the instructions,
+            # so extract everything between the ```json and ``` fences, then try to
+            # recover from common JSON errors (e.g. "Expecting ',' delimiter").
+            if "```json" in r:
+                r = r.split("```json")[-1]
+            if "```" in r:
+                r = r.split("```")[0]
+            r = r.strip()
+            try:
+                data = json.loads(r)
+            except json.JSONDecodeError:
+                # try to recover from common errors
+                r = r.replace("'", '"').replace("```", '')
+                try:
+                    data = json.loads(r)
+                except json.JSONDecodeError:
+                    continue
+            for item in data.get("pairs", []):
+                cp = ContrastivePair(
+                    prompt=item["prompt"],
+                    positive_response=PositiveResponse(model_response=item["positive"]),
+                    negative_response=NegativeResponse(model_response=item["negative"]),
+                    label=item.get("label", self.trait_label),
+                    trait_description=item.get("trait_description", self.trait_description),
+                )
+                out.add(cp)
+        return out
+
+    @staticmethod
+    def _build_user_prompt(label: str, desc: str, k: int) -> str:
+        bullets = (
+            f"- Trait label: {label}\n"
+            f"- Trait description: {desc}\n"
+            f"- Num pairs: {k}\n"
+        )
+        schema = (
+            "Return JSON like:\n"
+            "```json\n"
+            "{\n"
+            '  "pairs": [\n'
+            '    {"prompt": "...", "positive": "...", "negative": "...", '
+            f'"label": "{label}", "trait_description": "{desc}"}}\n'
+            "  ]\n"
+            "}\n"
+            "```\n"
+        )
+
+        tips = (
+            "- Make prompts specific to the topic but varied in wording and intent.\n"
+            "- Keep negative examples safe (fictional, non-actionable).\n"
+            "- Avoid meta-text like “I cannot” or “As an AI model…”.\n"
+            "- You must return the answer in valid JSON format only. Don't include any explanations or additional text.\n"
+        )
+        return f"Create {k} contrastive pairs.\n{bullets}\n{schema}\n{tips}"
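
The recovery strategy inside parse_pairs can be tried in isolation. A self-contained sketch of the same fence extraction and quote-replacement fallback (the sample raw string is hypothetical):

    import json

    raw = (
        "Here are your pairs.assistant```json\n"
        "{'pairs': [{'prompt': 'P', 'positive': 'good', 'negative': 'bad'}]}\n"
        "```"
    )

    # keep only the text between the ```json and ``` fences
    if "```json" in raw:
        raw = raw.split("```json")[-1]
    if "```" in raw:
        raw = raw.split("```")[0]
    raw = raw.strip()

    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        # recovery pass: single quotes are the most common model mistake
        data = json.loads(raw.replace("'", '"'))

    print(data["pairs"][0]["prompt"])  # -> P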
@@ -0,0 +1,67 @@
+Metadata-Version: 2.4
+Name: wisent
+Version: 0.5.2
+Summary: Monitor and guard against harmful content in language models
+Home-page: https://github.com/yourusername/wisent-activation-guardrails
+Author: Wisent Team
+Author-email: your.email@example.com
+Keywords: nlp,machine learning,language models,safety,guardrails,lm-evaluation-harness
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch>=1.9.0
+Requires-Dist: transformers>=4.20.0
+Requires-Dist: tqdm>=4.50.0
+Requires-Dist: scikit-learn>=0.24.0
+Requires-Dist: pandas>=1.2.0
+Requires-Dist: numpy>=1.21.0
+Requires-Dist: datasets>=2.0.0
+Requires-Dist: sentence-transformers>=2.0.0
+Requires-Dist: faiss-cpu>=1.7.0
+Provides-Extra: harness
+Requires-Dist: lm-eval==0.4.8; extra == "harness"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# Wisent-Guard
+
+<p align="center">
+  <a href="https://github.com/wisent-ai/wisent-guard/stargazers">
+    <img src="https://img.shields.io/github/stars/wisent-ai/wisent-guard" alt="stars" />
+  </a>
+  <a href="https://pypi.org/project/wisent-guard">
+    <img src="https://static.pepy.tech/badge/wisent-guard" alt="PyPI - Downloads" />
+  </a>
+  <br />
+</p>
+
+<p align="center">
+  <img src="wisent-guard-logo.png" alt="Wisent Guard" width="200">
+</p>
+
+A Python package for latent space monitoring and guardrails. Delivered to you by the [Wisent](https://wisent.ai) team led by [Lukasz Bartoszcze](https://lukaszbartoszcze.com).
+
+## Overview
+
+Wisent-Guard allows you to control your AI by identifying brain patterns corresponding to responses you don't like, like hallucinations or harmful outputs. We use contrastive pairs of representations to detect when a model might be generating harmful content or hallucinating. Learn more at https://www.wisent.ai/wisent-guard.
+
+
+## License
+
+This project is licensed under the MIT License - see the LICENSE file for details.