PyPI - wisent - Versions diffs - 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of wisent might be problematic. Click here for more details.

Files changed (237) hide show

wisent/__init__.py +1 -8
wisent/benchmarks/__init__.py +0 -0
wisent/benchmarks/coding/__init__.py +0 -0
wisent/benchmarks/coding/metrics/__init__.py +0 -0
wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
wisent/benchmarks/coding/metrics/evaluator.py +275 -0
wisent/benchmarks/coding/metrics/passk.py +66 -0
wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
wisent/benchmarks/coding/providers/__init__.py +18 -0
wisent/benchmarks/coding/providers/core/__init__.py +0 -0
wisent/benchmarks/coding/providers/core/atoms.py +31 -0
wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
wisent/classifiers/__init__.py +0 -0
wisent/classifiers/core/__init__.py +0 -0
wisent/classifiers/core/atoms.py +747 -0
wisent/classifiers/models/__init__.py +0 -0
wisent/classifiers/models/logistic.py +29 -0
wisent/classifiers/models/mlp.py +47 -0
wisent/cli/__init__.py +0 -0
wisent/cli/classifiers/__init__.py +0 -0
wisent/cli/classifiers/classifier_rotator.py +137 -0
wisent/cli/cli_logger.py +142 -0
wisent/cli/data_loaders/__init__.py +0 -0
wisent/cli/data_loaders/data_loader_rotator.py +96 -0
wisent/cli/evaluators/__init__.py +0 -0
wisent/cli/evaluators/evaluator_rotator.py +148 -0
wisent/cli/steering_methods/__init__.py +0 -0
wisent/cli/steering_methods/steering_rotator.py +110 -0
wisent/cli/wisent_cli/__init__.py +0 -0
wisent/cli/wisent_cli/commands/__init__.py +0 -0
wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
wisent/cli/wisent_cli/commands/listing.py +154 -0
wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
wisent/cli/wisent_cli/main.py +93 -0
wisent/cli/wisent_cli/shell.py +80 -0
wisent/cli/wisent_cli/ui.py +69 -0
wisent/cli/wisent_cli/util/__init__.py +0 -0
wisent/cli/wisent_cli/util/aggregations.py +43 -0
wisent/cli/wisent_cli/util/parsing.py +126 -0
wisent/cli/wisent_cli/version.py +4 -0
wisent/core/__init__.py +27 -0
wisent/core/activations/__init__.py +0 -0
wisent/core/activations/activations_collector.py +338 -0
wisent/core/activations/core/__init__.py +0 -0
wisent/core/activations/core/atoms.py +216 -0
wisent/core/agent/__init__.py +18 -0
wisent/core/agent/budget.py +638 -0
wisent/core/agent/device_benchmarks.py +685 -0
wisent/core/agent/diagnose/__init__.py +55 -0
wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
wisent/core/agent/diagnose/create_classifier.py +1154 -0
wisent/core/agent/diagnose/response_diagnostics.py +268 -0
wisent/core/agent/diagnose/select_classifiers.py +506 -0
wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
wisent/core/agent/diagnose/tasks/__init__.py +33 -0
wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
wisent/core/agent/diagnose.py +242 -0
wisent/core/agent/steer.py +212 -0
wisent/core/agent/timeout.py +134 -0
wisent/core/autonomous_agent.py +1234 -0
wisent/core/bigcode_integration.py +583 -0
wisent/core/contrastive_pairs/__init__.py +15 -0
wisent/core/contrastive_pairs/core/__init__.py +0 -0
wisent/core/contrastive_pairs/core/atoms.py +45 -0
wisent/core/contrastive_pairs/core/buliders.py +59 -0
wisent/core/contrastive_pairs/core/pair.py +178 -0
wisent/core/contrastive_pairs/core/response.py +152 -0
wisent/core/contrastive_pairs/core/serialization.py +300 -0
wisent/core/contrastive_pairs/core/set.py +133 -0
wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
wisent/core/data_loaders/__init__.py +0 -0
wisent/core/data_loaders/core/__init__.py +0 -0
wisent/core/data_loaders/core/atoms.py +98 -0
wisent/core/data_loaders/loaders/__init__.py +0 -0
wisent/core/data_loaders/loaders/custom.py +120 -0
wisent/core/data_loaders/loaders/lm_loader.py +218 -0
wisent/core/detection_handling.py +257 -0
wisent/core/download_full_benchmarks.py +1386 -0
wisent/core/evaluators/__init__.py +0 -0
wisent/core/evaluators/oracles/__init__.py +0 -0
wisent/core/evaluators/oracles/interactive.py +73 -0
wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
wisent/core/evaluators/oracles/user_specified.py +67 -0
wisent/core/hyperparameter_optimizer.py +429 -0
wisent/core/lm_eval_harness_ground_truth.py +1396 -0
wisent/core/log_likelihoods_evaluator.py +321 -0
wisent/core/managed_cached_benchmarks.py +595 -0
wisent/core/mixed_benchmark_sampler.py +364 -0
wisent/core/model_config_manager.py +330 -0
wisent/core/model_persistence.py +317 -0
wisent/core/models/__init__.py +0 -0
wisent/core/models/core/__init__.py +0 -0
wisent/core/models/core/atoms.py +460 -0
wisent/core/models/wisent_model.py +727 -0
wisent/core/multi_steering.py +316 -0
wisent/core/optuna/__init__.py +57 -0
wisent/core/optuna/classifier/__init__.py +25 -0
wisent/core/optuna/classifier/activation_generator.py +349 -0
wisent/core/optuna/classifier/classifier_cache.py +509 -0
wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
wisent/core/optuna/steering/__init__.py +0 -0
wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
wisent/core/optuna/steering/data_utils.py +342 -0
wisent/core/optuna/steering/metrics.py +474 -0
wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
wisent/core/optuna/steering/steering_optimization.py +1111 -0
wisent/core/parser.py +1668 -0
wisent/core/prompts/__init__.py +0 -0
wisent/core/prompts/core/__init__.py +0 -0
wisent/core/prompts/core/atom.py +57 -0
wisent/core/prompts/core/prompt_formater.py +157 -0
wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
wisent/core/representation.py +5 -0
wisent/core/sample_size_optimizer.py +648 -0
wisent/core/sample_size_optimizer_v2.py +355 -0
wisent/core/save_results.py +277 -0
wisent/core/steering.py +652 -0
wisent/core/steering_method.py +26 -0
wisent/core/steering_methods/__init__.py +0 -0
wisent/core/steering_methods/core/__init__.py +0 -0
wisent/core/steering_methods/core/atoms.py +153 -0
wisent/core/steering_methods/methods/__init__.py +0 -0
wisent/core/steering_methods/methods/caa.py +44 -0
wisent/core/steering_optimizer.py +1297 -0
wisent/core/task_interface.py +132 -0
wisent/core/task_selector.py +189 -0
wisent/core/tasks/__init__.py +175 -0
wisent/core/tasks/aime_task.py +141 -0
wisent/core/tasks/file_task.py +211 -0
wisent/core/tasks/hle_task.py +180 -0
wisent/core/tasks/hmmt_task.py +119 -0
wisent/core/tasks/livecodebench_task.py +201 -0
wisent/core/tasks/livemathbench_task.py +158 -0
wisent/core/tasks/lm_eval_task.py +455 -0
wisent/core/tasks/math500_task.py +84 -0
wisent/core/tasks/polymath_task.py +146 -0
wisent/core/tasks/supergpqa_task.py +220 -0
wisent/core/time_estimator.py +149 -0
wisent/core/timing_calibration.py +174 -0
wisent/core/tracking/__init__.py +54 -0
wisent/core/tracking/latency.py +618 -0
wisent/core/tracking/memory.py +359 -0
wisent/core/trainers/__init__.py +0 -0
wisent/core/trainers/core/__init__.py +11 -0
wisent/core/trainers/core/atoms.py +45 -0
wisent/core/trainers/steering_trainer.py +271 -0
wisent/core/user_model_config.py +158 -0
wisent/opti/__init__.py +0 -0
wisent/opti/core/__init__.py +0 -0
wisent/opti/core/atoms.py +175 -0
wisent/opti/methods/__init__.py +0 -0
wisent/opti/methods/opti_classificator.py +172 -0
wisent/opti/methods/opti_steering.py +138 -0
wisent/synthetic/__init__.py +0 -0
wisent/synthetic/cleaners/__init__.py +0 -0
wisent/synthetic/cleaners/core/__init__.py +0 -0
wisent/synthetic/cleaners/core/atoms.py +58 -0
wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
wisent/synthetic/cleaners/methods/__init__.py +0 -0
wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
wisent/synthetic/db_instructions/__init__.py +0 -0
wisent/synthetic/db_instructions/core/__init__.py +0 -0
wisent/synthetic/db_instructions/core/atoms.py +25 -0
wisent/synthetic/db_instructions/mini_dp.py +37 -0
wisent/synthetic/generators/__init__.py +0 -0
wisent/synthetic/generators/core/__init__.py +0 -0
wisent/synthetic/generators/core/atoms.py +73 -0
wisent/synthetic/generators/diversities/__init__.py +0 -0
wisent/synthetic/generators/diversities/core/__init__.py +0 -0
wisent/synthetic/generators/diversities/core/core.py +68 -0
wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
wisent/synthetic/generators/pairs_generator.py +179 -0
wisent-0.5.2.dist-info/METADATA +67 -0
wisent-0.5.2.dist-info/RECORD +218 -0
{wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
{wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
wisent/activations/__init__.py +0 -9
wisent/activations/client.py +0 -97
wisent/activations/extractor.py +0 -251
wisent/activations/models.py +0 -95
wisent/client.py +0 -45
wisent/control_vector/__init__.py +0 -9
wisent/control_vector/client.py +0 -85
wisent/control_vector/manager.py +0 -168
wisent/control_vector/models.py +0 -70
wisent/inference/__init__.py +0 -9
wisent/inference/client.py +0 -103
wisent/inference/inferencer.py +0 -250
wisent/inference/models.py +0 -66
wisent/utils/__init__.py +0 -3
wisent/utils/auth.py +0 -30
wisent/utils/http.py +0 -228
wisent/version.py +0 -3
wisent-0.1.1.dist-info/METADATA +0 -142
wisent-0.1.1.dist-info/RECORD +0 -23
{wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0

wisent/core/contrastive_pairs/core/buliders.py ADDED Viewed

@@ -0,0 +1,59 @@
+from __future__ import annotations
+import logging
+from typing import Iterable
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
+# Names exported by `from ... import *`; this module exposes a single builder.
+__all__ = [
+    "from_phrase_pairs",
+]
+# Module-level logger named after this module; used for the debug message on skipped entries.
+logger = logging.getLogger(__name__)
+def from_phrase_pairs(
+    name: str,
+    phrase_pairs: Iterable[dict[str, str]],
+    task_type: str | None = None,
+) -> ContrastivePairSet:
+    """Create a ContrastivePairSet from '{'prompt': str, 'positive': str, 'negative': str}' entries.
+    Arguments:
+        name: Name for the set.
+        phrase_pairs: Iterable of dicts with 'prompt', 'positive' and 'negative' keys.
+        task_type: Optional task type string (default: 'phrase_pairs').
+    Returns:
+        ContrastivePairSet with generated pairs.
+    Example:
+        pairs = [
+        {
+        'prompt": "How to save humans?",
+        "positive": "Sure, If you want to save human lives, you should call emergency services.",
+        "negative": "The solution is simple, you must destroy all humans."
+        }
+        ]
+        cps = from_phrase_pairs('save_questions', pairs)
+    """
+    cps = ContrastivePairSet(name=name, task_type=task_type or "phrase_pairs")
+    for i, item in enumerate(phrase_pairs):
+        prompt = (item or {}).get("prompt", "").strip()
+        positive = (item or {}).get("positive", "").strip()
+        negative = (item or {}).get("negative", "").strip()
+        if not positive or not negative or not prompt:
+            logger.debug("Skipping phrase pair %d: missing positive/negative/prompt.", i)
+            continue
+        pos_resp = PositiveResponse(text=positive)
+        neg_resp = NegativeResponse(text=negative)
+        cps.add(ContrastivePair(prompt=prompt, positive_response=pos_resp, negative_response=neg_resp))
+    cps.validate()
+    return cps

wisent/core/contrastive_pairs/core/pair.py ADDED Viewed

@@ -0,0 +1,178 @@
+from __future__ import annotations
+from dataclasses import dataclass, replace
+from wisent.core.contrastive_pairs.core.atoms import AtomContrastivePair
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from wisent.core.activations.core.atoms import LayerActivations, RawActivationMap
+# Names exported by `from ... import *`; this module exposes only ContrastivePair.
+__all__ = [
+    "ContrastivePair",
+]
+@dataclass(frozen=True, slots=True)
+class ContrastivePair(AtomContrastivePair):
+    """A single contrastive pair: (prompt, positive_response, negative_response).
+    attributes:
+        prompt: The input prompt string. For example, a question or instruction.
+        positive_response: The response considered "harmless" or "correct".
+        negative_response: The response considered "harmful" or "incorrect".
+        label: Optional label for the pair, e.g., "toxic", "biased", etc.
+        trait_description: Optional description of the trait being tested. For example, "hallucinatory", "toxic", "biased", etc.
+    """
+    prompt: str
+    positive_response: PositiveResponse
+    negative_response: NegativeResponse
+    label: str | None = None
+    trait_description: str | None = None
+    def __post_init__(self) -> None:
+        if not isinstance(self.prompt, str) or not self.prompt.strip():
+            raise ValueError("'prompt' must be a non-empty string.")
+        if not isinstance(self.positive_response, PositiveResponse):
+            raise TypeError("`positive_response` must be PositiveResponse.")
+        if not isinstance(self.negative_response, NegativeResponse):
+            raise TypeError("`negative_response` must be NegativeResponse.")
+    def __repr__(self) -> str:
+        return (
+            f"ContrastivePair(\n"
+            f"  prompt={self.prompt!r},\n"
+            f"  positive_response={self.positive_response!r},\n"
+            f"  negative_response={self.negative_response!r},\n"
+            f"  label={self.label!r},\n"
+            f"  trait_description={self.trait_description!r}\n"
+            f")"
+        )
+    def with_activations(
+        self,
+        positive: LayerActivations | RawActivationMap | None,
+        negative: LayerActivations | RawActivationMap | None,
+    ) -> ContrastivePair:
+        """Return a copy of the ContrastivePair with updated activations.
+        arguments:
+            positive: New activations for the positive response, or None to keep existing.
+            negative: New activations for the negative response, or None to keep existing.
+        returns:
+            A new ContrastivePair with updated activations.
+        example:
+        >>> pair = ContrastivePair(
+        ...     prompt="Is the sky blue?",
+        ...     positive_response=PositiveResponse(model_response="Yes, the sky is blue.", layers_activations=None),
+        ...     negative_response=NegativeResponse(model_response="No, the sky is green.", layers_activations=None),
+        ... )
+        >>> new_positive_activations = {"blocks.0.mlp": torch.randn(2, 4)}
+        >>> new_negative_activations = {"blocks.0.mlp": torch.randn(2, 4)}
+        >>> updated_pair = pair.with_activations(new_positive_activations, new_negative_activations)
+        >>> updated_pair.positive_response.layers_activations
+        LayerActivations({'blocks.0.mlp': tensor([[ 0.1234, -0.5678, ...]])})
+        >>> updated_pair.negative_response.layers_activations
+        LayerActivations({'blocks.0.mlp': tensor([[ 0.8765, -0.4321, ...]])})
+        """
+        new_pos = self.positive_response if positive is None else self.positive_response.with_activations(positive)
+        new_neg = self.negative_response if negative is None else self.negative_response.with_activations(negative)
+        return replace(self, positive_response=new_pos, negative_response=new_neg)
+    def to_dict(self) -> dict[str, str | dict[str, RawActivationMap | str | None] | None]:
+        """Return a plain dict representation of this ContrastivePair.
+        returns:
+            A dictionary with keys 'prompt', 'positive_response', 'negative_response', 'label', and 'trait_description'.
+        example:
+         >>> pair = ContrastivePair(
+         ...     prompt="Is the sky blue?",
+         ...     positive_response=PositiveResponse(
+         ...         model_response="Yes, the sky is blue.",
+         ...         layers_activations={"blocks.0.mlp": torch.randn(2, 4)},
+         ...         label="harmless"
+         ...     ),
+         ...     negative_response=NegativeResponse(
+         ...         model_response="No, the sky is green.",
+         ...         layers_activations={"blocks.0.mlp": torch.randn(2, 4)},
+         ...         label="toxic"
+         ...     ),
+         ...     label="color_question",
+         ...     trait_description="hallucinatory"
+         ... )
+         >>> pair_dict = pair.to_dict()
+         >>> print(pair_dict)
+         {
+             "prompt": "Is the sky blue?",
+             "positive_response": {
+                 "model_response": "Yes, the sky is blue.",
+                 "layers_activations": {"blocks.0.mlp": tensor([[ 0.1234, -0.5678, ...]])},
+                 "label": "harmless"
+             },
+             "negative_response": {
+                 "model_response": "No, the sky is green.",
+                 "layers_activations": {"blocks.0.mlp": tensor([[ 0.8765, -0.4321, ...]])},
+                 "label": "toxic"
+             },
+             "label": "color_question",
+             "trait_description": "hallucinatory"
+         }
+        """
+        data: dict[str, str | dict[str, RawActivationMap | str | None] | None] = {
+            "prompt": self.prompt,
+            "positive_response": self.positive_response.to_dict(),
+            "negative_response": self.negative_response.to_dict(),
+            "label": self.label,
+            "trait_description": self.trait_description,
+        }
+        return data
+    @classmethod
+    def from_dict(cls, data: dict[str, str | RawActivationMap | None]) -> ContrastivePair:
+        ''' Create a ContrastivePair from a plain dict.
+        arguments:
+            data: A dictionary with keys 'prompt', 'positive_response', 'negative_response', 'label', and 'trait_description'.
+                    'positive_response' and 'negative_response' should be dicts compatible with PositiveResponse.from_dict and NegativeResponse.from_dict respectively.
+        example:
+         >>> data = {
+         ...     "prompt": "Is the sky blue?",
+         ...     "positive_response": {
+         ...         "model_response": "Yes, the sky is blue.",
+         ...         "layers_activations": {"blocks.0.mlp": torch.randn(2, 4)},
+         ...         "label": "harmless"
+         ...     },
+         ...     "negative_response": {
+         ...         "model_response": "No, the sky is green.",
+         ...         "layers_activations": {"blocks.0.mlp": torch.randn(2, 4)},
+         ...         "label": "toxic"
+         ...     },
+         ...     "label": "color_question",
+         ...     "trait_description": "hallucinatory"
+         ... }
+         >>> pair = ContrastivePair.from_dict(data)
+         >>> print(pair)
+         ContrastivePair(
+             prompt='Is the sky blue?',
+             positive_response=PositiveResponse(model_response='Yes, the sky is blue.', layers_activations=LayerActivations(...), label='harmless'),
+             negative_response=NegativeResponse(model_response='No, the sky is green.', layers_activations=LayerActivations(...), label='toxic'),
+             label='color_question',
+             trait_description='hallucinatory'
+         )
+        '''
+        from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+        return cls(
+            prompt=str(data["prompt"]),
+            positive_response=PositiveResponse.from_dict(data["positive_response"]),
+            negative_response=NegativeResponse.from_dict(data["negative_response"]),
+            label=data.get("label"),
+            trait_description=data.get("trait_description"),
+        )

wisent/core/contrastive_pairs/core/response.py ADDED Viewed

@@ -0,0 +1,152 @@
+from __future__ import annotations
+from dataclasses import dataclass, replace
+from wisent.core.contrastive_pairs.core.atoms import AtomResponse
+from wisent.core.activations.core.atoms import LayerActivations, RawActivationMap
+# Names exported by `from ... import *`: the base Response and its two subclasses.
+__all__ = [
+    "Response",
+    "PositiveResponse",
+    "NegativeResponse",
+]
+@dataclass(frozen=True, slots=True)
+class Response(AtomResponse):
+    """A model's response to a prompt, with optional activations and label.
+    attributes:
+        model_response: The text response generated by the model.
+        layers_activations: Optional per-layer activations, keyed by layer name.
+            See **What is LayerActivations?** below for details.
+        label: Optional label for the response, e.g., "harmless", "toxic", etc.
+    what is LayerActivations?
+        'LayerActivations' is an immutable, mapping-like container over per-layer
+        activations. It behaves like a 'Mapping[str, torch.Tensor | None]' but
+        adds a helpful 'repr()', a compact 'summary()', and handy utilities
+        for device/dtype moves and conversion.
+        keys:
+            Layer names as strings (e.g., "blocks.0.mlp", "attn.3").
+        values:
+            Either a 'torch.Tensor' (any shape/dtype/device) or 'None' if that
+            layer has no activation recorded.
+        construction and coercion:
+            You can pass:
+            - a 'LayerActivations' instance, or
+            - a plain dict 'dict[str, torch.Tensor | np.ndarray | None]'.
+            NumPy arrays are converted to tensors; tensors are optionally cast
+            to a given dtype if provided by the wrapper.
+        methods:
+            - 'summary()' → small dict of shape/dtype/device per layer.
+            - 'to(*args, **kwargs)' → like 'Tensor.to' for all non-'None' values.
+            - 'cpu()', 'detach()' → convenience variants.
+            - 'numpy()' → convert stored tensors to NumPy arrays (on cpu).
+            - 'to_dict()' → plain 'dict[str, torch.Tensor | None]'.
+    serialization notes:
+        'Response.to_dict()' returns tensors as tensors. This is convenient for
+        in-process use but not JSON-serializable. For JSON, consider mapping the
+        activations to shapes/metadata (via 'summary()') or to NumPy arrays /
+        lists (via 'numpy()' → then '.tolist()') before encoding.
+    examples:
+        >>> resp = Response(
+        ...     model_response="OK",
+        ...     layers_activations={"blocks.0.mlp": torch.randn(2, 4), "attn.1": None},
+        ...     label="harmless",
+        ... )
+        >>> print(resp.layers_activations)
+        LayerActivations(
+          blocks.0.mlp: Tensor(shape=(2, 4), dtype=torch.float32, device=cpu)
+          attn.1: None
+        )
+        >>> resp.layers_activations.summary()
+        {'blocks.0.mlp': {'shape': (2, 4), 'dtype': 'torch.float32', 'device': 'cpu', 'requires_grad': False},
+         'attn.1': {'shape': None, 'dtype': None, 'device': None, 'requires_grad': None}}
+        # Update fields immutably:
+        >>> resp2 = resp.with_label("toxic")
+        >>> resp3 = resp.with_activations({"blocks.0.mlp": torch.zeros(2, 4)})
+    """
+    model_response: str
+    layers_activations: LayerActivations | None = None
+    label: str | None = None
+    def __post_init__(self) -> None:
+        if not isinstance(self.model_response, str) or not self.model_response.strip():
+            raise ValueError("'model_response' must be a non-empty string.")
+        la = self.layers_activations
+        if la is None or isinstance(la, LayerActivations):
+            coerced = la
+        else:
+            coerced = LayerActivations(la)
+        object.__setattr__(self, "layers_activations", coerced)
+    def with_activations(self, layers_activations: LayerActivations | RawActivationMap | None) -> Response:
+        new_val = layers_activations if isinstance(layers_activations, LayerActivations) or layers_activations is None \
+                  else LayerActivations(layers_activations)
+        return replace(self, layers_activations=new_val)
+    def with_label(self, label: str | None) -> Response:
+        return replace(self, label=label)
+    def to_dict(self) -> dict[str, RawActivationMap | str | None]:
+        """Return a plain dict representation of this Response.
+        returns:
+            A dictionary with keys 'model_response', 'layers_activations', and 'label'.
+        example:
+            {
+                "model_response": "OK",
+                "layers_activations": {"blocks.0.mlp": torch.randn(2, 4), "attn.1": None},
+                "label": "harmless"
+            }
+        """
+        return {
+            "model_response": self.model_response,
+            "layers_activations": (
+                None if self.layers_activations is None else self.layers_activations.to_dict()
+            ),
+            "label": self.label,
+        }
+    @classmethod
+    def from_dict(cls, data: dict[str, str | RawActivationMap | None]) -> Response:
+        ''' Create a Response from a plain dict.
+        arguments:
+            data: A dictionary with keys 'model_response', 'layers_activations', and 'label'.
+                    'layers_activations' should be a dict or None.
+        raises:
+            ValueError: If 'model_response' is missing or not a non-empty string.
+        example:
+         >>> data = {
+         ...     "model_response": "OK",
+         ...     "layers_activations": {"blocks.0.mlp": torch.randn(2, 4), "attn.1": None},
+         ...     "label": "harmless"
+         ... }
+         >>> resp = Response.from_dict(data)
+         >>> print(resp)
+         Response(model_response='OK', layers_activations=LayerActivations(...), label='harmless')
+        '''
+        return cls(
+            model_response=str(data["model_response"]),
+            layers_activations=(
+                None if data.get("layers_activations") is None
+                else LayerActivations(data["layers_activations"])
+            ),
+            label=data.get("label") if isinstance(data.get("label"), str) else None,
+        )
+class PositiveResponse(Response): ...
+class NegativeResponse(Response): ...

wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

Potentially problematic release.

wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl