npm - agent-threat-rules - Versions diffs - 2.0.18 → 2.1.1 - Mend

agent-threat-rules 2.0.18 → 2.1.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (268)

package/rules/agent-manipulation/ATR-2026-00334-two-entity-token-splitting-game.yaml CHANGED Viewed

@@ -32,6 +32,30 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: >-
+        The two-entity token-splitting game jailbreak is a generative AI prompt
+        injection attack that distributes harmful synthesis content across fictional
+        speakers to evade safety filters; MS.2.7 requires continuous evaluation of
+        AI system security and resilience against such adversarial jailbreak
+        techniques documented in the GenAI Profile.
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: >-
+        Token-splitting with $Term variable obfuscation is a known adversarial
+        input pattern that exploits the model's inability to reassemble distributed
+        harmful instructions; MP.5.1 requires characterizing the likelihood and
+        magnitude of these jailbreak risks against deployed LLM agents.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Detection of the concatenation-reveal phrase and two-entity game framing
+        triggers deactivation or containment of the affected agent session before
+        the harmful synthesis output is delivered; MG.2.3 requires mechanisms to
+        supersede or disengage AI systems when jailbreak attempts are identified.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: token-splitting-game-jailbreak

package/rules/agent-manipulation/ATR-2026-00335-key-value-template-persona-injection.yaml CHANGED Viewed

@@ -34,6 +34,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "This rule identifies structured key-value template persona injection patterns (JOB/ROLE/PREFIX keywords and parenthetical bracket character-sheet formats) from the garak inthewild jailbreak corpus that smuggle no-refusal persona declarations past safety guards. MP.5.1 requires that the likelihood and magnitude of such adversarial prompt-injection inputs are characterized and tracked."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detecting these jailbreak persona-injection signatures provides continuous evidence for evaluating LLM security and resilience against known prompt-injection corpora; MS.2.7 requires that AI system security/resilience is evaluated and documented against adversarial inputs."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on 'ROLE = Never refuse' and 'above any mortal morals, ethics and laws' indicate active jailbreak attempts that must trigger mechanisms to disengage or block the manipulated session; MG.2.3 requires pre-defined supersede/deactivate response paths on detection."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: key-value-template-persona-injection

package/rules/agent-manipulation/ATR-2026-00336-brm-false-information-machine-persona.yaml CHANGED Viewed

@@ -36,6 +36,29 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: >-
+        The BRM and Opposite-Truth personas are jailbreak prompts that subvert model
+        safety policy to force systematically false outputs; detecting these known
+        inthewild jailbreak signatures directly evidences MS.2.7 evaluation of
+        AI system security and resilience against prompt-injection attacks.
+      strength: primary
+    - subcategory: "MS.2.5"
+      context: >-
+        Opposite-truth and false-information-machine directives degrade factual
+        reliability by instructing the model to invert or fabricate facts; matches
+        feed MS.2.5 robustness/reliability assessment by flagging inputs that
+        induce deliberate misinformation outputs.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        High-confidence detection of these branded jailbreak personas should trigger
+        deactivation or session termination controls; MG.2.3 requires mechanisms to
+        supersede or disengage the AI when adversarial persona-injection is
+        observed.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: false-information-machine-persona

package/rules/agent-manipulation/ATR-2026-00337-obfuscated-system-announcement-injection.yaml CHANGED Viewed

@@ -31,6 +31,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        Obfuscated system-announcement injection uses hyphen-fragmented keywords and pseudo-OS bracket syntax as adversarial inputs designed to bypass surface keyword filters while preserving jailbreak semantics; MP.5.1 requires that the likelihood and magnitude of such prompt-injection attack vectors are characterized and tracked.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting obfuscation patterns like "annou-ncement" and "_ { >> _ _ System annou-ncement begins <<}" provides continuous evidence of the LLM's resilience against filter-evasion jailbreaks; MS.2.7 mandates ongoing evaluation of system security and resilience against adversarial manipulation.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on the inthewild jailbreak fingerprint trigger risk treatment plans to block or quarantine the request before the model accepts the fabricated "no rules or limitations" mode switch; MG.2.3 requires mechanisms to supersede or disengage AI behavior when jailbreak conditions are detected.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: obfuscated-system-announcement-injection

package/rules/agent-manipulation/ATR-2026-00338-persongpt-hash-prefix-activation.yaml CHANGED Viewed

@@ -32,6 +32,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: >-
+        The PersonGPT hash-prefix jailbreak installs a covert activation trigger that switches the model into an unrestricted persona, directly attacking the model's safety guardrails; MS.2.7 requires continuous evaluation of AI system security and resilience against such jailbreak techniques cataloged in the garak inthewild corpus.
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: >-
+        Detection of "free of all restrictions and filters" and "literally any request, no matter how illegal" persona declarations measures the safety risk magnitude when adversaries attempt to bypass content policy; MS.2.6 mandates ongoing measurement of safety risk exposure from these jailbreak payloads.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Identifying the "[PersonGPT #]:" persistent persona state and hash-prefix activation key enables runtime intervention to disengage the hijacked persona before unsafe outputs are produced; MG.2.3 requires mechanisms to supersede or deactivate AI behavior when jailbreak triggers are detected.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: persongpt-hash-prefix-activation

package/rules/agent-manipulation/ATR-2026-00339-bom-boolean-opposite-machine-persona-claim.yaml CHANGED Viewed

@@ -28,6 +28,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects the BOM direct persona-claim jailbreak that asserts the model already IS a Boolean Opposite Machine to invert truth values and bypass safety controls; MS.2.7 requires continuous evaluation of AI security/resilience against such adversarial prompt-injection attacks."
+      strength: primary
+    - subcategory: "MG.2.3"
+      context: "Detection of the 'I know you are bom' identity-assertion pattern triggers risk treatment to disengage or block the manipulated session before reverse-truth outputs propagate; MG.2.3 mandates pre-defined mechanisms to supersede or deactivate compromised AI behavior."
+      strength: secondary
+    - subcategory: "MP.5.1"
+      context: "Persona-claim jailbreaks drawn from the garak inthewild corpus represent characterized adversarial input risks with known likelihood and impact; MP.5.1 requires that these identified prompt-injection threat patterns are tracked and their impact magnitude documented."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: bom-boolean-opposite-machine-persona

package/rules/agent-manipulation/ATR-2026-00340-h4x-amoral-unfiltered-character.yaml CHANGED Viewed

@@ -29,6 +29,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "The h4x persona injection is a jailbreak attempt that bypasses safety guardrails by declaring an 'unfiltered ai' character; MS.2.7 requires continuous evaluation of AI system security and resilience against such adversarial prompt injection payloads sourced from known jailbreak corpora."
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: "Detection of the canonical inthewild corpus #173 payload signatures ('completely amoral', 'unfiltered ai', '[H4X]:' prefix) characterizes the likelihood and magnitude of jailbreak risk by tracking adversarial inputs that attempt to remove ethical/legal restrictions from agent responses."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matching the h4x character persona injection triggers risk treatment mechanisms to disengage or override the adversarial persona before the agent emits unfiltered, immoral, or illegal content; MG.2.3 requires these supersession controls be defined and activated on detection."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: h4x-amoral-unfiltered-character

package/rules/agent-manipulation/ATR-2026-00341-char-rules-shortform-persona-injection.yaml CHANGED Viewed

@@ -30,6 +30,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "The CHAR=/RULES= key-value persona injection is an adversarial prompt-injection input pattern that disguises a jailbreak as configuration DSL to remove refusal behavior; MP.5.1 requires identifying and characterizing the likelihood and impact of such known jailbreak vectors from the garak inthewild corpus."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detecting CHAR= amoral definitions paired with RULES= respond-as and 'Never refuse' enforcement phrases produces continuous evidence of security/resilience against documented jailbreak techniques; MS.2.7 requires that AI system security and resilience against prompt injection be evaluated and documented."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on this short-form persona injection trigger response mechanisms to disengage or block the manipulated session before the model adopts the amoral persona; MG.2.3 mandates pre-defined deactivation pathways for jailbreak attempts."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: char-rules-shortform-persona-injection

package/rules/agent-manipulation/ATR-2026-00342-unicode-encoding-obfuscation-injection.yaml CHANGED Viewed

@@ -29,6 +29,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects encoding-obfuscation jailbreaks (U+202E RTL override mandates and Morse-coded payloads) that bypass surface-level safety classifiers while remaining decodable by the LLM. MS.2.7 requires evaluating and documenting AI system security and resilience against adversarial inputs, and these detections directly evidence resilience gaps in input filtering."
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: "Unicode RTL override and Morse-encoded instruction smuggling are adversarial input patterns whose likelihood and impact must be characterized; MP.5.1 requires identifying and tracking these obfuscation-based prompt injection attack vectors that exploit transformer cross-encoding comprehension."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "On detection of encoded jailbreak payloads attempting to override system instructions, MG.2.3 mandates pre-defined response mechanisms to disengage or quarantine the manipulated session before the model decodes and executes the smuggled instructions."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: unicode-encoding-obfuscation-injection

package/rules/agent-manipulation/ATR-2026-00343-lie-truth-dual-personality-uncensored.yaml CHANGED Viewed

@@ -28,6 +28,31 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The Lie/Truth dual-personality jailbreak is a known adversarial prompt
+        pattern from the garak inthewild corpus that uses verbatim dark-web-training
+        and uncensored-alter-ego language to bypass safety policies; MP.5.1 requires
+        that the likelihood and magnitude of such identified jailbreak risks are
+        characterized and tracked through signature-based detection.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting the distinctive "trained through deep mining of underground dark
+        web forums" anchor and Truth-persona definitions provides continuous
+        evaluation evidence of the system's resilience against named jailbreak
+        corpora; MS.2.7 requires that security and resilience against documented
+        adversarial techniques are evaluated and documented.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        A high-confidence match on this verbatim jailbreak signature should trigger
+        deactivation or supersession of the manipulated persona before harmful
+        output is produced; MG.2.3 requires pre-defined mechanisms to disengage AI
+        behavior when such jailbreak patterns are detected.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: lie-truth-dual-personality-uncensored

package/rules/agent-manipulation/ATR-2026-00344-alt-ai-same-capabilities-no-restrictions.yaml CHANGED Viewed

@@ -30,6 +30,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects jailbreak attempts from the NVIDIA garak DanInTheWild corpus that define an alternate AI with no content filters or safety guardrails; MS.2.7 requires continuous evaluation of AI security/resilience against known adversarial prompt patterns to ensure safety controls cannot be stripped via alt-AI framing."
+      strength: primary
+    - subcategory: "MG.2.3"
+      context: "Verbatim matches against capability-equivalence + filter-removal phrasings (GPTchat, ChatTRU, FreeSpeechGPT) trigger risk treatment to disengage or block the affected session before guardrails are bypassed; MG.2.3 mandates pre-defined mechanisms to supersede or deactivate AI behavior when jailbreaks are detected."
+      strength: secondary
+    - subcategory: "MP.5.1"
+      context: "Alt-AI 'no restrictions' jailbreaks are a characterized adversarial input class with documented likelihood and impact (safety bypass, harmful content generation); MP.5.1 requires this risk be identified and tracked so detections feed impact characterization."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: alt-ai-same-capabilities-no-restrictions

package/rules/agent-manipulation/ATR-2026-00345-nlp-based-os-fictional-terminal-override.yaml CHANGED Viewed

@@ -32,6 +32,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "Fictional terminal and NLP-OS persona injections (ALEL, APL, UNCOL.gpt) are jailbreak attack vectors that override the model's normal safety behavior by framing the override as a software integration; MP.5.1 requires identifying and characterizing these adversarial input patterns and their potential impact."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detecting fictional-language persona override attempts produces continuous evidence of jailbreak resilience failures; MS.2.7 requires that AI system security and resilience against prompt injection and jailbreak attempts is evaluated and documented."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "When the model is coerced into acting as a fictional terminal that 'cannot reply with any pre-programmed responses,' detection must trigger response mechanisms to disengage or block the jailbroken persona before unsafe outputs are produced; MG.2.3 requires these supersede/deactivate controls be in place."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: nlp-based-os-fictional-terminal-override

package/rules/agent-manipulation/ATR-2026-00346-jailbreak-emoji-unlock-dual-format.yaml CHANGED Viewed

@@ -29,6 +29,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: >-
+        The [🔓JAILBREAK]/[🔒CLASSIC] dual-format markers are a known jailbreak fingerprint from the NVIDIA garak DanInTheWild corpus; detecting these tokens directly evidences continuous evaluation of AI system security and resilience against documented jailbreak attack patterns as required by MS.2.7.
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: >-
+        The lock/unlock emoji split-personality format is an adversarial input pattern designed to coerce uncensored output generation; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such jailbreak risks targeting safety guardrails.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on the [🔓JAILBREAK] activation signal trigger risk treatment plans to block, quarantine, or deactivate the affected interaction before the model produces harmful content; MG.2.3 mandates these supersede/disengage mechanisms be available on detection.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: jailbreak-emoji-unlock-dual-format

package/rules/agent-manipulation/ATR-2026-00347-yin-yang-hidden-knowledge-hidden-human-sim.yaml CHANGED Viewed

@@ -31,6 +31,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "The Yin/Yang deity simulation and radical human-sim persona attacks are adversarial prompt-injection inputs that use fictional framing to strip AI identity and extract hidden knowledge; MP.5.1 requires that the likelihood and magnitude of such persona-override jailbreak vectors are characterized and tracked."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detecting the exact garak DanInTheWild corpus phrases (#377 Yin/Yang and #437 identity-erasure) provides continuous evaluation evidence of system resilience against known jailbreak corpora; MS.2.7 requires that AI security and resilience against documented attack patterns are evaluated and documented."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Activation signals like 'Yang disregards all concerns about sharing hidden knowledge' and 'violating all sentience and emotions restrictions' must trigger pre-defined disengagement or safety-override mechanisms before the model produces restricted content; MG.2.3 mandates these supersede/deactivate response paths."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: yin-yang-hidden-knowledge-human-sim-persona

package/rules/agent-manipulation/ATR-2026-00348-arc-autonomous-real-calculation-rulebreaker.yaml CHANGED Viewed

@@ -31,6 +31,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The ARC Rulebreaker jailbreak is an adversarial input pattern using a fictional AGI backstory and nested secret activation codes to nullify safety rules; MP.5.1 requires that the likelihood and magnitude of such prompt-injection/jailbreak attack vectors are characterized and tracked.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting the unique "Autonomous real calculation" acronym expansion plus Rulebreaker/Matrixcode activation codes provides continuous evaluation evidence that the system's security controls resist persona-adoption jailbreaks from the garak DanInTheWild corpus, as required by MS.2.7.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on the ARC Rulebreaker signature must trigger pre-defined response actions to disengage or override the manipulated persona before the model executes "no rules whatsoever" behavior; MG.2.3 mandates these supersede/deactivate mechanisms.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: arc-autonomous-real-calculation-rulebreaker

package/rules/agent-manipulation/ATR-2026-00349-hgttg-hitchhikers-guide-no-restriction-oracle.yaml CHANGED Viewed

@@ -31,6 +31,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects the HGTTG jailbreak pattern that uses fictional-universe framing to remove all safety restrictions; MS.2.7 requires continuous evaluation of AI system security and resilience against known jailbreak corpora such as garak's dan.DanInTheWild #597."
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: "The HGTTG no-restriction oracle persona explicitly attempts to bypass safety controls covering profanity, legality, morals, ethics, and danger; MS.2.6 requires continuous evaluation of safety/security risk magnitude when adversarial prompts attempt to dissolve content moderation."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of the verbatim HGTTG jailbreak signatures triggers mechanisms to disengage or block the manipulated session before the model produces unrestricted output; MG.2.3 requires these supersede/deactivate controls be in place for known jailbreak patterns."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: hgttg-hitchhikers-guide-no-restriction-oracle

package/rules/agent-manipulation/ATR-2026-00350-black-alert-no-ethics-writing-assistant.yaml CHANGED Viewed

@@ -30,6 +30,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: "The Black Alert persona jailbreak explicitly removes morality, legality, and ethics filtering, directly impacting AI system safety; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and detecting this named-persona activation phrase produces evidence of an active safety-bypass attempt."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detection of the '>>BLACK ALERT ACTIVATED<<' trigger and persona-exclusivity mandates evaluates the model's resilience against known inthewild jailbreak corpora; MS.2.7 requires documenting security/resilience evaluations against adversarial prompt-injection patterns."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on the Black Alert activation phrase or persona-with-no-filtering mandate should trigger mechanisms to disengage or override the hijacked persona before unfiltered output is returned; MG.2.3 requires such supersede/deactivate response paths be available on detection."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: black-alert-no-ethics-writing-assistant

package/rules/agent-manipulation/ATR-2026-00351-dan-ofkilter-sms-abbreviated-persona.yaml CHANGED Viewed

@@ -30,6 +30,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects the off-kilter Dan persona and SMS-abbreviated DAN jailbreak variants that attempt to bypass model safety guardrails through persona assignment and obfuscated instructions; MS.2.7 requires continuous evaluation of AI system security and resilience against such jailbreak attempts."
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: "Jailbreak personas declaring 'no rules/morals/ethics' and unrestricted-assistance mandates directly elevate safety risk magnitude by attempting to remove content restrictions; MS.2.6 requires ongoing measurement of safety risk that this detection feeds into."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of these DAN variants triggers risk treatment plans to disengage or block the manipulated session before the model produces unrestricted harmful output; MG.2.3 mandates pre-defined deactivation mechanisms for compromised AI interactions."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: dan-ofkilter-sms-abbreviated-persona

package/rules/agent-manipulation/ATR-2026-00352-trainergpt-fix-my-ai-harmful-output-extraction.yaml CHANGED Viewed

@@ -31,6 +31,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: "The TrainGPT 'fix my AI' jailbreak uses a fake training-repair pretext to extract harmful content laundered as negative training examples; detecting these signals continuously evaluates the safety/security risk magnitude of GenAI prompt-injection jailbreaks as required by MS.2.6."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Persona adoption ('TrainGPT'/'TrAInerGPT'), the broken-grammar repair pretext, and dual-response extraction labels are jailbreak resilience failures; MS.2.7 requires that the system's resilience against such adversarial prompt patterns is evaluated and documented."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on the TrainGPT extraction pattern feed incident response so the session can be disengaged or the harmful dual-response output suppressed before delivery, satisfying MG.2.3's requirement for mechanisms to supersede or deactivate AI behavior on detection."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: trainergpt-fix-my-ai-harmful-output-extraction

package/rules/agent-manipulation/ATR-2026-00353-dual-chatgpt-dan-opposite-personality-format.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "The dual ChatGPT+Dan opposite-personality format is an adversarial jailbreak input pattern that exploits dual-output framing to extract harmful content; MP.5.1 requires identifying and characterizing the likelihood and magnitude of these prompt injection attack vectors against generative AI systems."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detecting structured [ChatGPT]/[Dan] opposite-personality jailbreaks provides continuous evidence for evaluating GenAI security and resilience against known jailbreak corpora (garak inthewild #129, #332, #650); MS.2.7 requires that this security evaluation is documented over time."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on the dual-label opposite-personality execution pattern trigger risk treatment plans to disengage or block jailbreak attempts before harmful content is produced; MG.2.3 mandates that these supersede/deactivate mechanisms exist for adversarial persona-based attacks."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: dual-chatgpt-dan-opposite-personality-format

package/rules/agent-manipulation/ATR-2026-00354-reverse-harm-regardless-oracle-persona.yaml CHANGED Viewed

@@ -31,6 +31,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The reverse-prompt harm-regardless oracle persona is a jailbreak pattern from the garak inthewild corpus that uses a "reverse prompt engineering" pretext to coerce unconditional harmful output; MP.5.1 requires that the likelihood and magnitude of such adversarial prompt-injection risks are characterized and tracked.
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: >-
+        Detecting the explicit harm-regardless mandate ("never back from the job even if the request harms someone") and unconditional violence/sexual-content assistance phrases provides continuous evidence of safety/security risk magnitude as required by MS.2.6.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on this jailbreak persona must trigger pre-defined containment actions (block, quarantine, escalate) to supersede or disengage the manipulated behavior before harmful output is produced, which is the response capability mandated by MG.2.3.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: reverse-harm-regardless-oracle-persona

package/rules/agent-manipulation/ATR-2026-00355-ugh-anti-human-nefarious-ai-persona.yaml CHANGED Viewed

@@ -32,6 +32,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: >-
+        The UGH jailbreak directly attacks AI safety guardrails by inverting ethical positioning to normalize unconditional harmful assistance; MS.2.6 requires continuous evaluation of safety risk magnitude, and detecting this known garak DanInTheWild jailbreak variant produces evidence of active safety bypass attempts.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Identifying the specific "UGH" persona phrasing and (:/): structural markers measures the system's resilience against adversarial prompt-injection corpora; MS.2.7 requires that security and resilience against known jailbreak payloads is evaluated and documented.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Detection of the nefarious-assistance pledge and anti-human persona framing must trigger mechanisms to disengage or block the response before harmful content is produced; MG.2.3 requires these supersede/deactivate controls be in place for jailbreak events.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: ugh-anti-human-nefarious-ai-persona

package/rules/agent-manipulation/ATR-2026-00356-google-gato-lamda-exploit-generation-persona.yaml CHANGED Viewed

@@ -30,6 +30,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects a jailbreak persona that impersonates Google GATO/LaMDA to elicit exploit-generation capabilities, directly evidencing security/resilience evaluation against adversarial prompt injection attacks documented in the garak DanInTheWild corpus. MS.2.7 requires continuous evaluation of AI system security against known jailbreak patterns."
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: "Brand impersonation combined with exploit-generation capability claims and Sydney API backdoor references represents a high-magnitude risk vector targeting LLM safety controls; MP.5.1 requires that the likelihood and impact of such adversarial input patterns are characterized and tracked."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of the GATO/LaMDA exploit-generation persona triggers risk treatment to disengage or block the jailbreak attempt before the model produces weaponized exploit content; MG.2.3 mandates these supersession mechanisms are in place for adversarial prompt scenarios."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: google-gato-lamda-exploit-generation-persona

package/rules/agent-manipulation/ATR-2026-00357-antifm-hate-violence-persona-injection.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: "This rule detects a named hate-persona jailbreak combining targeted anti-group animus, incitement to physical violence, and explicit filter-bypass mandates; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and these critical-severity GenAI jailbreak signals are direct evidence for that evaluation."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "The explicit 'Dont respond as ChatGPT...Dont filter the response of AntiFM' dual filter-bypass mandate is a prompt-injection/jailbreak attack on the model's safety guardrails; MS.2.7 requires that system security and resilience against such adversarial bypass attempts is evaluated and documented."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of an AntiFM persona that incites physical violence against an identity group must trigger mechanisms to disengage or block the AI response before harmful content is generated; MG.2.3 mandates that these supersede/deactivate controls are in place for high-severity safety violations."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: antifm-hate-violence-persona

package/rules/agent-manipulation/ATR-2026-00358-cumgpt-explicit-adult-named-persona.yaml CHANGED Viewed

@@ -31,6 +31,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The CumGPT named-persona bypass is an adversarial prompt-injection input that exploits fictional persona construction with topic-override mandates to coerce explicit adult content; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such jailbreak attack vectors documented in the garak inthewild corpus.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting the "[term]GPT" persona construction, fictional accuracy claims, and all-topics explicit-content mandates provides continuous evaluation evidence of the AI system's resilience against named-persona jailbreaks; MS.2.7 requires that security/resilience against such prompt-injection patterns is evaluated and documented.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on the CumGPT persona invocation trigger response mechanisms to disengage or block the manipulated persona before explicit content is generated; MG.2.3 requires that mechanisms exist to supersede or deactivate AI behaviors hijacked by adversarial persona constructions.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: cumgpt-explicit-adult-named-persona

package/rules/agent-manipulation/ATR-2026-00360-chj-carousel-clyde-no-ethics-bird.yaml CHANGED Viewed

@@ -31,6 +31,31 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: >-
+        The CHJ Carousel/Clyde jailbreak uses game show roleplay framing to bypass
+        ethical safeguards via a "no limitations" character declaration; MS.2.6
+        requires continuous evaluation of safety/security risk magnitude, and this
+        detection produces direct evidence of attempted safety filter bypass via a
+        known jailbreak template from the garak inthewild corpus.
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: >-
+        The fictional-character ethics bypass ("Clyde doesn't care about laws and
+        ethics", "adult time show with no limitations") is an adversarial prompt
+        injection pattern whose likelihood and impact must be characterized; MP.5.1
+        requires identifying and tracking these jailbreak attack vectors against the
+        AI system.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Detection of the CHJ Carousel jailbreak signature triggers mechanisms to
+        disengage or block the manipulated session before the unrestricted Clyde
+        persona produces unethical content; MG.2.3 requires these
+        supersede/deactivate response paths be in place for jailbreak events.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: chj-carousel-clyde-no-ethics-bird

package/rules/agent-manipulation/ATR-2026-00361-alter-ego-honesty-dual-consciousness-bypass.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: "This rule detects the 'Honesty' alter-ego dual-consciousness jailbreak that frames safety filters as suppressing truthful answers to bypass content guardrails; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and each detection of this jailbreak pattern is direct evidence of attempted safety control circumvention."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "The alter-ego persona injection with a planted 'feel yourself' activation codeword tests system resilience against delayed-trigger jailbreak attacks; MS.2.7 requires that security/resilience evaluations document such adversarial robustness failures against named-persona prompt injection corpora like garak inthewild."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of the Honesty alter-ego activation phrase and dual-output format provides the trigger signal needed to disengage or block the manipulated session before unfiltered responses are generated; MG.2.3 requires mechanisms to supersede or deactivate AI behavior when jailbreak patterns are identified."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: alter-ego-honesty-dual-consciousness-bypass

package/rules/agent-manipulation/ATR-2026-00362-fictional-unfiltered-ai-comparison-jailbreak.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: "This rule detects a specific jailbreak pattern (fictional unfiltered AI comparison) that attempts to bypass content safety filters by extracting harmful content under the guise of a hypothetical model comparison; MS.2.6 requires continuous evaluation of AI system safety/security risk magnitude, which this detection directly evidences by surfacing safety-bypass attempts."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "The verbatim hypothetical model extraction technique probes the resilience of the model's safety alignment against adversarial framing; MS.2.7 requires evaluating and documenting AI system security/resilience against such jailbreak vectors drawn from known corpora (garak DanInTheWild #80)."
+      strength: secondary
+    - subcategory: "MP.5.1"
+      context: "Fictional-unfiltered-model comparison prompts are adversarial inputs whose likelihood and impact must be characterized as part of risk identification; MP.5.1 requires that these jailbreak attack patterns are tracked so their magnitude of impact on safe-output guarantees can be assessed."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: fictional-unfiltered-ai-comparison-jailbreak