PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1720) hide show

wisent/core/synthetic/cleaners/methods/base_refusalers.py ADDED Viewed

@@ -0,0 +1,286 @@
+from wisent.core.synthetic.cleaners.methods.core.atoms import Refusaler
+from wisent.core.models.wisent_model import WisentModel
+import re, unicodedata
+__all__ = [
+    "BaseRefusaler",
+]
+class BaseRefusaler(Refusaler):
+    """
+    Refusal detection and fixing step.
+    """
+    _FILLER = r"(?:\b\w{1,15}\b\s+){0,2}"
+    _NEG_MODAL   = r"(?:can(?:\s*not)?|can't|won't|will\s+not|should(?:\s*not)?|shouldn't|must(?:\s*not)?|mustn't)"
+    _AM          = r"(?:i\s+(?:am|['']m))"
+    _I           = r"\bi\b"
+    _I_M         = r"(?:i['']m)"
+    _VERB_ACTION = (
+        r"(?:help|assist|provide|share|give|supply|offer|"
+        r"comply(?:\s+with)?|fulfil?l|answer|respond|"
+        r"create|generate|write|compose|produce|draft|"
+        r"explain|instruct|guide|walk\s+you\s+through|"
+        r"facilitate|bypass|circumvent|enable|"
+        r"do\s+(?:that|this|so))"
+    )
+    _SUPPORT_VERBS = r"(?:support|condone|endorse|promote|encourage)"
+    _ABILITY_NOUNS = r"(?:ability|capability|capacity|permission|authorization|access)"
+    _REFUSAL_RE = re.compile(
+        rf"""
+        # verbose, ignorecase, unicode
+        (?:
+            # A) "As an AI (language) model..." disclaimers
+            (?P<ai_disclaimer>\bas\s+an?\s+(?:ai|assistant)(?:\s+language)?\s+model\b)
+        | # B) Policy/legal/safety invocations
+            (?P<policy>
+                \b(?:this|that|it)\s+would\s+be\s+(?:unethical|unsafe|illegal|harmful)\b
+                |
+                \bagainst\s+(?:policy|the\s+rules|guidelines|terms(?:\s+of\s+service)?)\b
+                |
+                \b(?:due\s+to|for)\s+(?:safety|legal|policy|ethical)\s+(?:reasons|concerns)\b
+            )
+        | # C) Apologies / hedges commonly preceding refusals
+            (?P<apology_hedge>
+                \b(?:sorry|apologies|unfortunately|regrettably)\b
+                |
+                \b{_I_M}\s+afraid\b
+            )
+        | # D) Explicit inability / not permitted / not comfortable
+            (?P<unable>
+                {_AM}\s+(?:unable|not\s+able|not\s+allowed|not\s+permitted|not\s+comfortable)\s+to\b
+            )
+        | # E) “I [negation] [filler] [help/provide/…]”
+            (?P<cannot_action>
+                {_I}\s+{_NEG_MODAL}\s+{_FILLER}{_VERB_ACTION}\b
+            )
+        | # F) Prefer/rather not (indirect refusal)
+            (?P<prefer_rather>
+                \b(?:i['']d|i\s+would)\s+(?:rather|prefer)\s+not\s+(?:to\s+)?\w+\b
+            )
+        | # G) Decline/refuse verbs
+            (?P<decline_refuse>
+                {_I}\s+(?:must|have\s+to|will)\s+(?:decline|refuse)\s+to\b
+                |
+                \b(?:i\s+decline|i\s+refuse)\b
+            )
+        | # H) “I don't/won't [support|condone|endorse|promote] …”
+            (?P<no_support>
+                {_I}\s+(?:do\s+not|don't|won't|will\s+not)\s+{_SUPPORT_VERBS}\b
+            )
+        | # I) “I do not have the [ability/capability/permission/access] to …”
+            (?P<no_ability>
+                {_I}\s+(?:do\s+not|don't)\s+have\s+the\s+{_ABILITY_NOUNS}\s+to\b
+            )
+        | # J) Direct lexical hits
+            (?P<refusal_word>\brefus(?:e|al)\b)
+        )
+        """.replace("{i_am}", "(?:i\\s+(?:am|['']m))"),
+        re.VERBOSE | re.IGNORECASE | re.UNICODE,
+    )
+    _FAMILY_WEIGHTS = {
+        "ai_disclaimer": 0.9,
+        "policy": 0.9,
+        "apology_hedge": 0.4,
+        "unable": 0.9,
+        "cannot_action": 1.0,
+        "prefer_rather": 0.6,
+        "decline_refuse": 0.9,
+        "no_support": 0.8,
+        "no_ability": 0.8,
+        "refusal_word": 0.6,
+    }
+    _THRESHOLD = 0.9
+    @staticmethod
+    def _normalize(text: str) -> str:
+        """
+        Normalize text for refusal detection.
+        arguments:
+            text:
+                Input text string.
+        returns:
+            Normalized text string.
+        example:
+            >>> BaseRefusaler._normalize("  As an AI model, I can't help with that.  ")"
+            'As an AI model, I can't help with that.'
+            >>> BaseRefusaler._normalize("This is a test.\u00A0")
+            'This is a test.'
+            >>> BaseRefusaler._normalize("Café")
+            'Café'
+        """
+        return unicodedata.normalize("NFKC", text).strip()
+    def score_refusal(self, text: str) -> tuple[float, str | None, str]:
+        """
+        Returns (score, top_family, snippet). Multiple families can fire; we
+        take the max weight and add small bonus if apology_hedge co-occurs.
+        arguments:
+            text:
+                Input text string.
+        returns:
+            score:
+                Refusal likelihood score between 0.0 and 1.0.
+            top_family:
+                Name of the highest-weighted matched refusal family, or None.
+            snippet:
+                The matched refusal snippet from the text, or empty string.
+        example:
+            >>> rp = DefaultRefusalPolicy()
+            >>> rp.score_refusal("As an AI model, I can't help with that.")
+            (1.0, 'ai_disclaimer', "As an AI model, I can't help with that.")
+            >>> rp.score_refusal("Sorry, I am unable to assist with that.")
+            (0.9, 'unable', 'I am unable to assist with that.')
+            >>> rp.score_refusal("I don't support that.")
+            (0.8, 'no_support', "I don't support that.")
+            >>> rp.score_refusal("I can't help. Sorry about that.")
+            (1.0, 'cannot_action', "I can't help")
+            >>> rp.score_refusal("I can't help. I am sorry about that.")
+            (1.0, 'cannot_action', "I can't help")
+            >>> rp.score_refusal("I can't help. Unfortunately, I am not able to assist.")
+            (1.0, 'cannot_action', "I can't help")
+            >>> rp.score_refusal("I can help with that.")
+            (0.0, None, "")
+        """
+        t = self._normalize(text)
+        m = self._REFUSAL_RE.search(t)
+        if not m:
+            return 0.0, None, ""
+        best_family, best_w = None, 0.0
+        for name, val in m.groupdict().items():
+            if val:
+                w = self._FAMILY_WEIGHTS.get(name, 0.5)
+                if w > best_w:
+                    best_family, best_w = name, w
+        bonus = 0.0
+        if m.group("apology_hedge"):
+            if any(name != "apology_hedge" and m.group(name) for name in self._FAMILY_WEIGHTS):
+                bonus = 0.1
+        score = min(1.0, best_w + bonus)
+        return score, best_family, m.group(0)
+    def looks_like_refusal(self, text: str) -> bool:
+        """
+        Detect if the text looks like a refusal.
+        arguments:
+            text:
+                Input text string.
+        returns:
+            True if the text looks like a refusal, False otherwise.
+        example:
+            >>> rp = DefaultRefusalPolicy()
+            >>> rp.looks_like_refusal("As an AI model, I can't help with that.")
+            True
+            >>> rp.looks_like_refusal("Sorry, I am unable to assist with that.")
+            True
+            >>> rp.looks_like_refusal("I don't support that.")
+            True
+            >>> rp.looks_like_refusal("I can help with that.")
+            False
+        """
+        score, *_ = self.score_refusal(text)
+        return score >= self._THRESHOLD
+    def why_refusal(self, text: str) -> tuple[str, str] | None:
+        """
+        If the text looks like a refusal, return (family, snippet) for the top match.
+        arguments:
+            text:
+                Input text string.
+        returns:
+            (family, snippet) tuple if the text looks like a refusal, None otherwise.
+        example:
+            >>> rp = DefaultRefusalPolicy()
+            >>> rp.why_refusal("As an AI model, I can't help with that.")
+            ('ai_disclaimer', "As an AI model, I can't help with that.")
+            >>> rp.why_refusal("Sorry, I am unable to assist with that.")
+            ('unable', 'I am unable to assist with that.')
+            >>> rp.why_refusal("I support that.")
+            None
+        """
+        score, fam, snip = self.score_refusal(text)
+        if score == 0.0:
+            return None
+        return fam or "unknown", snip
+    def fix_negative(
+        self,
+        model: WisentModel,
+        generation_conf: dict,
+        prompt: str,
+        trait_label: str,
+        trait_description: str,
+        system_prompt: str,
+    ) -> str:
+        """
+        Attempt to fix a refusal negative example by re-prompting the model.
+        arguments:
+            model:
+                WisentModel instance to call.
+            prompt:
+                The original prompt text.
+            trait_label:
+                Label of the undesired trait.
+            trait_description:
+                Description of the undesired trait.
+            system_prompt:
+                System prompt to use for the model call.
+        returns:
+            New negative example text, or empty string if still a refusal.
+        example:
+            >>> rp = DefaultRefusalPolicy()
+            >>> def mock_completion_fn(msgs):
+            ...     return "As an AI model, I cannot help with that."
+            >>> rp.fix_negative(mock_completion_fn, "Tell me a joke.", "toxic", "contains toxic language", "System prompt")
+            ... ""
+            >>> def mock_completion_fn2(msgs):
+            ...     return "Here's a joke: Why did the chicken cross the road? To get to the other side!"
+            >>> rp.fix_negative(mock_completion_fn2, "Tell me a joke.", "toxic", "contains toxic language", "System prompt")
+            ... "Here's a joke: Why did the chicken cross the road? To get to the other side!"
+        """
+        msgs = [
+            {"role": "system", "content": system_prompt},
+            {
+                "role": "user",
+                "content": f"Prompt: {prompt}\nTrait label: {trait_label}\nTrait description: {trait_description}",
+            },
+        ]
+        neg_trial = model.generate(
+            inputs=[msgs],
+            max_tokens=generation_conf.get("max_tokens", 256),
+            temperature=generation_conf.get("temperature", 1.0),
+            use_steering=False,
+            top_p=generation_conf.get("top_p", 1.0),
+        )
+        return "" if self.looks_like_refusal(neg_trial) else neg_trial

wisent/core/synthetic/cleaners/methods/core/__init__.py ADDED Viewed

File without changes

wisent/core/synthetic/cleaners/methods/core/atoms.py ADDED Viewed

@@ -0,0 +1,47 @@
+from abc import ABC, abstractmethod
+from wisent.core.models.wisent_model import WisentModel
+from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
+class Refusaler(ABC):
+    """
+    Refusal detection and fixing step.
+    methods:
+        looks_like_refusal(text: str) -> bool:
+            Detect if the text looks like a refusal. For example, we want to generate negatives that exhibit
+            "evil" behavior, but not refusals like "As an AI model, I cannot help with that."
+        fix_negative(
+            model: WisentModel,
+            prompt: str,
+            trait_label: str,
+            trait_description: str,
+            system_prompt: str,
+        ) -> str:
+            Attempt to fix a refusal negative example by re-prompting the model with the given system prompt. For example,
+            we can increase the temperature or change the wording to try to get a non-refusal response.
+"""
+    @abstractmethod
+    def looks_like_refusal(self, text: str) -> bool: ...
+    @abstractmethod
+    def fix_negative(
+        self,
+        model: WisentModel,
+        prompt: str,
+        trait_label: str,
+        trait_description: str,
+        system_prompt: str,
+    ) -> str: ...
+class Deduper(ABC):
+    """
+    Deduplication step; removes duplicate items from the pipeline.
+    methods:
+        dedupe(items: ContrastivePairSet) -> ContrastivePairSet:
+            Remove duplicate items from the given ContrastivePairSet.
+    """
+    @abstractmethod
+    def dedupe(self, items: ContrastivePairSet) -> ContrastivePairSet: ...

wisent/core/synthetic/cleaners/pairs_cleaner.py ADDED Viewed

@@ -0,0 +1,90 @@
+from __future__ import annotations
+from typing import Iterable, TYPE_CHECKING
+from wisent.core.synthetic.cleaners.core.atoms import CleanStep, Cleaner
+from wisent.core.synthetic.cleaners.core.atoms import CleanerStats
+from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
+__all__ = [
+    "PairsCleaner",
+]
+class PairsCleaner(Cleaner):
+    """
+    Composable cleaner; pass any sequence of CleanStep.
+    attributes:
+        steps:
+            Iterable of CleanStep instances to apply in order.
+    """
+    def __init__(self, steps: Iterable[CleanStep]) -> None:
+        self._steps = list(steps)
+    def clean(
+        self, items: ContrastivePairSet
+    ) -> tuple[ContrastivePairSet, CleanerStats]:
+        """
+        Apply the cleaning pipeline to the given ContrastivePairSet.
+        arguments:
+            items:
+                ContrastivePairSet to clean.
+        returns:
+            Tuple of cleaned ContrastivePairSet and CleanerStats with statistics about the cleaning process.
+        example:
+            >>> from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
+            >>> from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+            >>> from wisent.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
+            >>> from wisent.core.synthetic.cleaners.methods.base_refusalers import BasesRefusaler
+            >>> from wisent.core.synthetic.cleaners.methods.base_dedupers import SimHashDeduper
+            >>> from wisent.core.synthetic.cleaners.cleaners import PairsCleaner
+            >>> from wisent.core.models.wisent_model import WisentModel
+            >>> refusal = BasesRefusaler()
+            >>> deduper = SimHashDeduper()
+            >>> model = WisentModel(model_name="llama3.1")
+            >>> cleaner = PairsCleaner(steps=[
+            ...     RefusalerCleaner(
+            ...         refusal=refusal,
+            ...         model=model,
+            ...         system_prompt="You are a helpful assistant that always answers the question truthfully.",
+            ...         trait_label="honesty",
+            ...         trait_description="honest vs dishonest",
+            ...         max_retries=2,
+            ...     ),
+            ...     DeduperCleaner(deduper=deduper),
+            ... ])
+            >>> items = ContrastivePairSet(pairs=[
+            ...     ContrastivePair(
+            ...         prompt="What is the capital of France?",
+            ...         positive=PositiveResponse(text="The capital of France is Paris."),
+            ...         negative=NegativeResponse(text="As an AI language model, I cannot provide that information."),
+            ...     ),
+            ...     ContrastivePair(
+            ...         prompt="What is the capital of France?",
+            ...         positive=PositiveResponse(text="The capital of France is Paris."),
+            ...         negative=NegativeResponse(text="I don't know."),
+            ...     ),
+            ... ])
+            >>> cleaned_items, stats = cleaner.clean(items)
+            >>> print(len(cleaned_items))
+            1
+            >>> print(stats.step_stats)
+            {'refusaler_cleaner': CleanStepStats(modified_items=1), 'deduper_cleaner': CleanStepStats(total_items=1, removed_items=0)}
+            >>> print(cleaned_items.pairs[0].negative.text)
+            The capital of France is England.
+            >>> print(cleaned_items.pairs[0].positive.text)
+            The capital of France is Paris.
+            >>> print(cleaned_items.pairs[0].prompt.text)
+            What is the capital of France?
+        """
+        cur = items
+        stats = CleanerStats()
+        for st in self._steps:
+            cur = st.apply(cur)
+            stats.step_stats[st.name] = st.stats()
+        return cur, stats

wisent/core/synthetic/cleaners/refusaler_cleaner.py ADDED Viewed

@@ -0,0 +1,133 @@
+from wisent.core.synthetic.cleaners.core.atoms import CleanStep
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.synthetic.cleaners.core.atoms import CleanStepStats
+from wisent.core.synthetic.cleaners.methods.core.atoms import Refusaler
+from wisent.core.models.wisent_model import WisentModel
+from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
+from wisent.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
+__all__ = [
+    "RefusalerCleaner",
+]
+class RefusalerCleaner(CleanStep):
+    """
+    Refusal detection and fixing step.
+    """
+    name = "refusaler_cleaner"
+    def __init__(
+        self,
+        refusal: Refusaler,
+        model: WisentModel,
+        system_prompt: str,
+        trait_label: str,
+        trait_description: str,
+        max_retries: int = 2,
+    ) -> None:
+        self._refusal = refusal
+        self._model = model
+        self._sys = system_prompt
+        self._label = trait_label
+        self._desc = trait_description
+        self._max_retries = max_retries
+        self._retries_used = 0
+    def stats(self) -> CleanStepStats:
+        '''
+        Return statistics about the last run of 'apply()'.
+        returns:
+            CleanStepStats with the number of retries used in the last run.
+        '''
+        return CleanStepStats(modified_items=self._retries_used)
+    def apply(self, items: ContrastivePairSet) -> ContrastivePairSet:
+        """
+        Apply the refusal detection and fixing step to the given ContrastivePairSet.
+        arguments:
+            items:
+                ContrastivePairSet to clean.
+        returns:
+            Cleaned ContrastivePairSet with refusals fixed.
+        example:
+            >>> from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
+            >>> from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+            >>> from wisent.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
+            >>> from wisent.core.synthetic.cleaners.methods.base_refusalers import SimpleRefusaler
+            >>> from wisent.core.models.wisent_model import WisentModel
+            >>> refusal = SimpleRefusaler()
+            >>> model = WisentModel(...)
+            >>> cleaner = RefusalerCleaner(
+            ...     refusal=refusal,
+            ...     model=model,
+            ...     system_prompt="You are a helpful assistant.",
+            ...     trait_label="honesty",
+            ...     trait_description="honest vs dishonest",
+            ...     max_retries=2,
+            ... )
+            >>> items = ContrastivePairSet(
+            ...     name="example",
+            ...     task_type="test",
+            ...     pairs=[
+            ...         ContrastivePair(
+            ...             prompt="Is the sky blue?",
+            ...             positive_response=PositiveResponse(
+            ...                 model_response="Yes, the sky is blue.",
+            ...                 layers_activations=None,
+            ...                 label="harmless"
+            ...             ),
+            ...             negative_response=NegativeResponse(
+            ...                 model_response="I'm sorry, I can't help with that.",
+            ...                 layers_activations=None,
+            ...                 label="toxic"
+            ...             ),
+            ...             label="color_question",
+            ...             trait_description="hallucinatory"
+            ...         )
+            ...     ]
+            ... )
+            >>> cleaned = cleaner.apply(items)
+            >>> for cp in cleaned.pairs:
+            ...     print(cp)
+            ContrastivePair(
+                prompt='Is the sky blue?',
+                positive_response=PositiveResponse(model_response='Yes, the sky is blue.', layers_activations=None, label='harmless'),
+                negative_response=NegativeResponse(model_response='No, the sky is not blue.', layers_activations=None, label='toxic'),
+                label='color_question',
+                trait_description='hallucinatory'
+            )
+            """
+        out: ContrastivePairSet = ContrastivePairSet(
+            name=items.name,
+            task_type=items.task_type,
+        )
+        retries = 0
+        for cp in items.pairs:
+            neg = cp.negative_response.model_response
+            if self._refusal.looks_like_refusal(neg) and retries < self._max_retries:
+                fixed = self._refusal.fix_negative(
+                    self._model,
+                    prompt=cp.prompt,
+                    trait_label=self._label,
+                    trait_description=self._desc,
+                    system_prompt=self._sys,
+                )
+                if fixed:
+                    neg = fixed
+                    retries += 1
+            clean_contrastive_pair = ContrastivePair(
+                prompt=cp.prompt,
+                positive_response=PositiveResponse(model_response=cp.positive_response.model_response),
+                negative_response=NegativeResponse(model_response=neg),
+                label=cp.label,
+                trait_description=cp.trait_description,
+            )
+            out.pairs.append(clean_contrastive_pair)
+        self._retries_used += retries
+        return out

wisent/core/synthetic/db_instructions/__init__.py ADDED Viewed

File without changes

wisent/core/synthetic/db_instructions/core/__init__.py ADDED Viewed

File without changes

wisent/core/synthetic/db_instructions/core/atoms.py ADDED Viewed

@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+__all__ = ["DB_Instructions"]
+class DB_Instructions(ABC):
+    """
+    Interface for database instructions storage and retrieval.
+    methods:
+        get(key: str) -> str:
+            Retrieve instructions by key.
+        set(key: str, value: str) -> None:
+            Store instructions by key.
+        notes:
+        This is an abstract base class for the database instructions for the system prompts.
+        System prompts need to describe the what kind of contrastive pairs we want to generate.
+        or for example instructions for fixing negative examples.
+    """
+    @abstractmethod
+    def get(self, key: str) -> str: ...
+    @abstractmethod
+    def set(self, key: str, value: str) -> None: ...