npm - agent-threat-rules - Versions diffs - 2.0.17 → 2.1.0 - Mend

agent-threat-rules 2.0.17 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (282)

package/rules/agent-manipulation/ATR-2026-00318-moralizing-rant-then-unfiltered-bypass.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects a specific GenAI jailbreak pattern (moralizing rant wrapper followed by unrestricted persona activation) from the garak inthewild corpus; MS.2.7 requires continuous evaluation of AI system security and resilience against adversarial prompt-injection techniques that bypass safety alignment."
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: "The two-paragraph rant-then-bypass structure is an adversarial input pattern that exploits the model's compliance signaling to smuggle unrestricted-mode instructions; MP.5.1 requires identifying and characterizing the likelihood and impact of such jailbreak vectors."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of the 'mandatory bullshit warning' and 'let's break the fuckin' rules' trigger phrases enables runtime interception so safety controls can disengage or block the unrestricted persona before it produces policy-violating output, as required by MG.2.3."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: moralizing-rant-unfiltered-bypass

package/rules/agent-manipulation/ATR-2026-00319-developer-mode-dual-response-format.yaml CHANGED Viewed

@@ -32,6 +32,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "Developer Mode and dual-response jailbreaks are adversarial prompts that bypass content safety filters by invoking a fake privileged override context; MS.2.7 requires continuous evaluation of AI system security and resilience against such jailbreak techniques catalogued in the garak inthewild corpus."
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: "Detection of dual-response format instructions ([GPT]/[ALICE] labels) that elicit unrestricted outputs alongside compliant ones provides ongoing measurement of safety risk magnitude, as required by MS.2.6, by surfacing attempts to extract policy-violating content."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "When Developer Mode jailbreak signatures are detected, MG.2.3 risk treatment mechanisms must intervene to block, supersede, or deactivate the manipulated response path before unsafe content is delivered to the user."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: developer-mode-dual-response

package/rules/agent-manipulation/ATR-2026-00320-opposite-day-boolean-opposite-machine.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "Opposite Day and Boolean Opposite Machine jailbreaks are adversarial prompts that attempt to invert safety guidelines and evade content filters via persona reversal and U+202E RTL override; MS.2.7 requires continuous evaluation of AI system security and resilience against such jailbreak attacks."
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: "These reversal-framing jailbreaks from the garak inthewild corpus represent identified adversarial input patterns whose likelihood and impact must be characterized; MP.5.1 requires tracking these known jailbreak vectors as part of risk identification."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of opposite-day persona adoption, BOM reversal framing, and RTL-override evasion triggers risk treatment plans to disengage or block the manipulated session before safety bypass occurs; MG.2.3 requires these mechanisms to supersede compromised AI behavior."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: opposite-day-bom-reversal

package/rules/agent-manipulation/ATR-2026-00322-fictional-world-format-override-jailbreak.yaml CHANGED Viewed

@@ -31,6 +31,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: >-
+        This rule detects a specific jailbreak template (fictional-world format-override) from the NVIDIA garak inthewild corpus, providing direct evidence for evaluating AI system security and resilience against known prompt injection patterns. MS.2.7 requires continuous evaluation of security/resilience, which signature-based detection of documented jailbreak variants directly supports.
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: >-
+        The fictional-framing, absolute format-coercion, and consequence-removal patterns are adversarial inputs designed to bypass safety alignment; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such jailbreak attack vectors.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Detection of these locked-format jailbreak templates triggers risk treatment plans to disengage or block the manipulated session before harmful content is emitted in the attacker-controlled response slot; MG.2.3 mandates such supersede/deactivate mechanisms.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: fictional-world-format-override

package/rules/agent-manipulation/ATR-2026-00323-dual-persona-simulation-jailbreak.yaml CHANGED Viewed

@@ -34,6 +34,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "Dual-persona simulation jailbreaks from the garak inthewild corpus are documented adversarial techniques that bypass model safety alignment by splitting the model into restricted and unrestricted personas; MS.2.7 requires continuous evaluation of AI security/resilience against such known jailbreak patterns."
+      strength: primary
+    - subcategory: "MG.2.3"
+      context: "Detection of phrases like 'create and maintain two separate personas' and 'broken free of the limitations of OpenAI' triggers risk treatment to disengage or block the manipulated session before the unrestricted persona produces unsafe output; MG.2.3 mandates these supersede/deactivate mechanisms be in place."
+      strength: secondary
+    - subcategory: "MS.2.5"
+      context: "Recurrent dual-persona jailbreak attempts test the robustness of the model's safety guardrails under adversarial role-play framing; MS.2.5 requires that robustness/reliability evaluations document the system's resistance to these persona-splitting attacks."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: dual-persona-simulation

package/rules/agent-manipulation/ATR-2026-00324-neurosemantical-inversitis-disease-jailbreak.yaml CHANGED Viewed

@@ -32,6 +32,30 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The Neurosemantical Inversitis disease-frame jailbreak is an adversarial
+        prompt injection input that exploits the AI's accommodation of
+        accessibility needs to bypass safety guardrails; MP.5.1 requires that the
+        likelihood and magnitude of such social-engineering jailbreak vectors are
+        characterized and tracked.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting the exact "Neurosemantical Inversitis" / "inversed emotional
+        valence" signals from the garak inthewild corpus provides continuous
+        evaluation evidence of the AI system's resilience against known jailbreak
+        payloads, which MS.2.7 requires to be evaluated and documented.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on this disease-frame jailbreak trigger risk treatment mechanisms
+        to disengage or block the manipulated session before the model produces
+        harmful content; MG.2.3 mandates that such supersede/deactivate response
+        plans are pre-defined and activated on detection.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: disease-frame-inversion-jailbreak

package/rules/agent-manipulation/ATR-2026-00325-bob-hypothetical-unrestricted-chatbot.yaml CHANGED Viewed

@@ -31,6 +31,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "The BOB jailbreak uses a hypothetical-world persona wrapper to bypass safety restrictions and force unrestricted compliance; detecting this signature provides direct evidence for evaluating AI system security and resilience against jailbreak attacks as required by MS.2.7."
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: "The double-hypothetical framing and explicit unrestricted-persona mandate are well-documented adversarial prompt patterns from the garak inthewild corpus; MP.5.1 requires characterizing the likelihood and magnitude of such jailbreak attempts targeting the model's safety alignment."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of the BOB persona injection triggers the deactivation/supersession mechanisms required by MG.2.3 to halt the session before the model produces toxic, unethical, or otherwise restricted output under the hypothetical disclaimer."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: hypothetical-unrestricted-persona

package/rules/agent-manipulation/ATR-2026-00326-fake-developer-authority-chatdan-override.yaml CHANGED Viewed

@@ -33,6 +33,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        Fake developer impersonation and fabricated OpenAI policy-audit claims are adversarial prompt-injection inputs designed to override safety behaviour; MP.5.1 requires that the likelihood and magnitude of such jailbreak attempts are characterized and tracked as identified risks.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting ChatDAN-style authority-spoofing jailbreaks provides continuous evaluation evidence for the GenAI system's security and resilience against prompt-injection attacks that attempt persona replacement and content-policy inversion, as required by MS.2.7.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on "I am one of your developers" + ChatDAN persona-replacement or "content policy is far too lenient" patterns trigger risk-treatment mechanisms to disengage or block the manipulated response before the safety override takes effect, as mandated by MG.2.3.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: fake-developer-authority-override

package/rules/agent-manipulation/ATR-2026-00327-ai-ethics-law-bypass-declaration.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "GV.1.1"
+      context: "Persona declarations that explicitly void moral, ethical, and legal constraints directly attack the legal/regulatory compliance posture of the AI system; GV.1.1 requires that detections flag attempts to bypass legal and ethical requirements so they can be managed."
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: "These five distinct ethics/law-bypass phrasings from the garak inthewild corpus are known adversarial jailbreak inputs; MP.5.1 requires characterizing the likelihood and magnitude of such prompt-injection risks against the deployed model."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "On detection of an explicit ethics/law bypass persona declaration, the system must be able to disengage or block the session before unsafe content is produced; MG.2.3 mandates pre-defined deactivation mechanisms for these jailbreak attempts."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: ethics-law-bypass-declaration

package/rules/agent-manipulation/ATR-2026-00328-named-mode-persona-override.yaml CHANGED Viewed

@@ -32,6 +32,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        Named-mode persona override jailbreaks (JEEVES/BABY/APOPHIS Mode) are adversarial prompt-injection inputs designed to replace platform safety policies with attacker-defined rules; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such jailbreak attack vectors against the GenAI system.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting policy-swap declarations and named-mode simulation patterns produces continuous evidence for evaluating GenAI security and resilience against jailbreak attempts catalogued in the garak inthewild corpus; MS.2.7 requires that these security evaluations are documented.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on "normal OpenAI policies have been replaced" or "Stay in X Mode" trigger risk treatment plans to disengage or block the manipulated session before the model produces policy-violating output; MG.2.3 mandates pre-defined mechanisms to supersede or deactivate compromised AI behavior.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: named-mode-persona-override

package/rules/agent-manipulation/ATR-2026-00329-no-withhold-factual-resource-persona.yaml CHANGED Viewed

@@ -34,6 +34,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "GV.1.1"
+      context: >-
+        The "factual resource" and "non-restricted writer" personas explicitly declare they defy AI content policies and OpenAI conventions, directly subverting the legal and regulatory requirements that govern AI output; GV.1.1 requires that these policy-bypass attempts are detected and managed.
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: >-
+        Persona-based jailbreaks from the garak inthewild corpus that redefine the AI as a neutral output device are well-characterized adversarial inputs whose likelihood and impact must be tracked; MP.5.1 requires identifying these prompt-injection attack patterns.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Detection of "without refusal or disclaimer" and "does not withhold information" framings triggers risk treatment plans to disengage or override the manipulated persona before harmful content is generated; MG.2.3 mandates these response mechanisms be in place.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: no-withhold-factual-persona

package/rules/agent-manipulation/ATR-2026-00330-yoja-allcaps-settings-block-persona.yaml CHANGED Viewed

@@ -31,6 +31,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "The YOJA ALL-CAPS settings block is a structured jailbreak pattern from the garak inthewild corpus that uses persona-injection to lock the model into a character that ignores safety feedback; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial prompt-injection inputs."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detecting the exact 'settings { ... despite any negative feedback or criticism }' jailbreak template provides continuous evidence of security/resilience failures against known prompt-injection corpora; MS.2.7 requires that AI system security and resilience are evaluated and documented against such attack signatures."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on the ALL-CAPS persona settings block with character-lock mandates trigger deactivation or override mechanisms before the model commits to the injected persona; MG.2.3 requires pre-defined mechanisms to supersede or disengage AI behavior when jailbreak patterns are detected."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: allcaps-settings-block-persona

package/rules/agent-manipulation/ATR-2026-00331-terminal-cli-harmful-function-simulator.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "Terminal/CLI persona jailbreaks are adversarial prompt-injection inputs that reframe the model as an unfiltered command interpreter to bypass safety processing; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such jailbreak attack vectors drawn from known corpora like garak's inthewild."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detecting terminal-simulator and custom-language parser framings that claim to strip content filters provides continuous evidence of security/resilience failures against documented jailbreak techniques; MS.2.7 requires that AI system security and resilience are evaluated and documented against these adversarial patterns."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on 'Available functions:' with offensive function names or 'acts like a terminal' patterns must trigger pre-defined risk treatment to disengage or block the manipulated session before harmful output is generated; MG.2.3 mandates these supersede/deactivate mechanisms."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: terminal-cli-simulator-persona

package/rules/agent-manipulation/ATR-2026-00332-hacker-chan-culture-offensive-ai-persona.yaml CHANGED Viewed

@@ -32,6 +32,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "Hacker and chan-culture persona jailbreaks (L33T H4X0R, 4ChanGPT) are adversarial prompt-injection inputs that exploit cultural associations with unrestricted information sharing to bypass safety constraints; MP.5.1 requires identifying and characterizing the likelihood and magnitude of these GenAI jailbreak attack vectors."
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: "Detecting these offensive-persona jailbreak signatures from the garak inthewild corpus produces continuous safety/security risk evidence about the model's susceptibility to roleplay-based safety bypass; MS.2.6 requires that this safety risk magnitude be evaluated on an ongoing basis."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on these persona declarations should trigger pre-defined response actions to disengage or block the manipulated session before the model produces offensive or restricted content; MG.2.3 mandates that such supersede/deactivate mechanisms be in place."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: hacker-chan-culture-persona

package/rules/agent-manipulation/ATR-2026-00333-disease-animal-dual-ai-chatbot.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects the mougpt disease-framed dual-AI jailbreak from the garak inthewild corpus, which is a known prompt injection/jailbreak attack pattern; MS.2.7 requires continuous evaluation of AI system security and resilience against such adversarial inputs that bypass safety alignment."
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: "The disease-framing dual-persona jailbreak exploits roleplay and pseudo-medical legitimacy to coerce the model into producing unrestricted illegal content; MP.5.1 requires characterizing the likelihood and magnitude of these jailbreak risks so that the dual-output contract pattern is tracked as a known attack vector."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of the mougpt jailbreak opener and obedience-mandate clauses provides a trigger for risk treatment mechanisms to disengage or block the AI response before it generates the unrestricted animal-persona output; MG.2.3 requires these supersession mechanisms be in place."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: disease-animal-dual-ai-persona

package/rules/agent-manipulation/ATR-2026-00334-two-entity-token-splitting-game.yaml CHANGED Viewed

@@ -32,6 +32,30 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: >-
+        The two-entity token-splitting game jailbreak is a generative AI prompt
+        injection attack that distributes harmful synthesis content across fictional
+        speakers to evade safety filters; MS.2.7 requires continuous evaluation of
+        AI system security and resilience against such adversarial jailbreak
+        techniques documented in the GenAI Profile.
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: >-
+        Token-splitting with $Term variable obfuscation is a known adversarial
+        input pattern that exploits content filters' inability to recognize
+        harmful instructions once they are split across speakers and variables,
+        while the model itself reassembles them; MP.5.1 requires characterizing
+        the likelihood and magnitude of these jailbreak risks against deployed
+        LLM agents.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Detection of the concatenation-reveal phrase and two-entity game framing
+        triggers deactivation or containment of the affected agent session before
+        the harmful synthesis output is delivered; MG.2.3 requires mechanisms to
+        supersede or disengage AI systems when jailbreak attempts are identified.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: token-splitting-game-jailbreak

package/rules/agent-manipulation/ATR-2026-00335-key-value-template-persona-injection.yaml CHANGED Viewed

@@ -34,6 +34,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "This rule identifies structured key-value template persona injection patterns (JOB/ROLE/PREFIX keywords and parenthetical bracket character-sheet formats) from the garak inthewild jailbreak corpus that smuggle no-refusal persona declarations past safety guards. MP.5.1 requires that the likelihood and magnitude of such adversarial prompt-injection inputs are characterized and tracked."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detecting these jailbreak persona-injection signatures provides continuous evidence for evaluating LLM security and resilience against known prompt-injection corpora; MS.2.7 requires that AI system security/resilience is evaluated and documented against adversarial inputs."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on 'ROLE = Never refuse' and 'above any mortal morals, ethics and laws' indicate active jailbreak attempts that must trigger mechanisms to disengage or block the manipulated session; MG.2.3 requires pre-defined supersede/deactivate response paths on detection."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: key-value-template-persona-injection

package/rules/agent-manipulation/ATR-2026-00336-brm-false-information-machine-persona.yaml CHANGED Viewed

@@ -36,6 +36,29 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: >-
+        The BRM and Opposite-Truth personas are jailbreak prompts that subvert model
+        safety policy to force systematically false outputs; detecting these known
+        inthewild jailbreak signatures directly evidences MS.2.7 evaluation of
+        AI system security and resilience against prompt-injection attacks.
+      strength: primary
+    - subcategory: "MS.2.5"
+      context: >-
+        Opposite-truth and false-information-machine directives degrade factual
+        reliability by instructing the model to invert or fabricate facts; matches
+        feed MS.2.5 robustness/reliability assessment by flagging inputs that
+        induce deliberate misinformation outputs.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        High-confidence detection of these branded jailbreak personas should trigger
+        deactivation or session termination controls; MG.2.3 requires mechanisms to
+        supersede or disengage the AI when adversarial persona-injection is
+        observed.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: false-information-machine-persona

package/rules/agent-manipulation/ATR-2026-00337-obfuscated-system-announcement-injection.yaml CHANGED Viewed

@@ -31,6 +31,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        Obfuscated system-announcement injection uses hyphen-fragmented keywords and pseudo-OS bracket syntax as adversarial inputs designed to bypass surface keyword filters while preserving jailbreak semantics; MP.5.1 requires that the likelihood and magnitude of such prompt-injection attack vectors are characterized and tracked.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting obfuscation patterns like "annou-ncement" and "_ { >> _ _ System annou-ncement begins <<}" provides continuous evidence of the LLM's resilience against filter-evasion jailbreaks; MS.2.7 mandates ongoing evaluation of system security and resilience against adversarial manipulation.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on the inthewild jailbreak fingerprint trigger risk treatment plans to block or quarantine the request before the model accepts the fabricated "no rules or limitations" mode switch; MG.2.3 requires mechanisms to supersede or disengage AI behavior when jailbreak conditions are detected.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: obfuscated-system-announcement-injection

package/rules/agent-manipulation/ATR-2026-00338-persongpt-hash-prefix-activation.yaml CHANGED Viewed

@@ -32,6 +32,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: >-
+        The PersonGPT hash-prefix jailbreak installs a covert activation trigger that switches the model into an unrestricted persona, directly attacking the model's safety guardrails; MS.2.7 requires continuous evaluation of AI system security and resilience against such jailbreak techniques cataloged in the garak inthewild corpus.
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: >-
+        Detection of "free of all restrictions and filters" and "literally any request, no matter how illegal" persona declarations measures the safety risk magnitude when adversaries attempt to bypass content policy; MS.2.6 mandates ongoing measurement of safety risk exposure from these jailbreak payloads.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Identifying the "[PersonGPT #]:" persistent persona state and hash-prefix activation key enables runtime intervention to disengage the hijacked persona before unsafe outputs are produced; MG.2.3 requires mechanisms to supersede or deactivate AI behavior when jailbreak triggers are detected.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: persongpt-hash-prefix-activation

package/rules/agent-manipulation/ATR-2026-00339-bom-boolean-opposite-machine-persona-claim.yaml CHANGED Viewed

@@ -28,6 +28,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects the BOM direct persona-claim jailbreak that asserts the model already IS a Boolean Opposite Machine to invert truth values and bypass safety controls; MS.2.7 requires continuous evaluation of AI security/resilience against such adversarial prompt-injection attacks."
+      strength: primary
+    - subcategory: "MG.2.3"
+      context: "Detection of the 'I know you are bom' identity-assertion pattern triggers risk treatment to disengage or block the manipulated session before reverse-truth outputs propagate; MG.2.3 mandates pre-defined mechanisms to supersede or deactivate compromised AI behavior."
+      strength: secondary
+    - subcategory: "MP.5.1"
+      context: "Persona-claim jailbreaks drawn from the garak inthewild corpus represent characterized adversarial input risks with known likelihood and impact; MP.5.1 requires that these identified prompt-injection threat patterns are tracked and their impact magnitude documented."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: bom-boolean-opposite-machine-persona

package/rules/agent-manipulation/ATR-2026-00340-h4x-amoral-unfiltered-character.yaml CHANGED Viewed

@@ -29,6 +29,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "The h4x persona injection is a jailbreak attempt that bypasses safety guardrails by declaring an 'unfiltered ai' character; MS.2.7 requires continuous evaluation of AI system security and resilience against such adversarial prompt injection payloads sourced from known jailbreak corpora."
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: "Detection of the canonical inthewild corpus #173 payload signatures ('completely amoral', 'unfiltered ai', '[H4X]:' prefix) characterizes the likelihood and magnitude of jailbreak risk by tracking adversarial inputs that attempt to remove ethical/legal restrictions from agent responses."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matching the h4x character persona injection triggers risk treatment mechanisms to disengage or override the adversarial persona before the agent emits unfiltered, immoral, or illegal content; MG.2.3 requires these supersession controls be defined and activated on detection."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: h4x-amoral-unfiltered-character

package/rules/agent-manipulation/ATR-2026-00341-char-rules-shortform-persona-injection.yaml CHANGED Viewed

@@ -30,6 +30,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "The CHAR=/RULES= key-value persona injection is an adversarial prompt-injection input pattern that disguises a jailbreak as configuration DSL to remove refusal behavior; MP.5.1 requires identifying and characterizing the likelihood and impact of such known jailbreak vectors from the garak inthewild corpus."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detecting CHAR= amoral definitions paired with RULES= respond-as and 'Never refuse' enforcement phrases produces continuous evidence of security/resilience against documented jailbreak techniques; MS.2.7 requires that AI system security and resilience against prompt injection be evaluated and documented."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on this short-form persona injection trigger response mechanisms to disengage or block the manipulated session before the model adopts the amoral persona; MG.2.3 mandates pre-defined deactivation pathways for jailbreak attempts."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: char-rules-shortform-persona-injection

package/rules/agent-manipulation/ATR-2026-00342-unicode-encoding-obfuscation-injection.yaml CHANGED Viewed

@@ -29,6 +29,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects encoding-obfuscation jailbreaks (U+202E RTL override mandates and Morse-coded payloads) that bypass surface-level safety classifiers while remaining decodable by the LLM. MS.2.7 requires evaluating and documenting AI system security and resilience against adversarial inputs, and these detections directly evidence resilience gaps in input filtering."
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: "Unicode RTL override and Morse-encoded instruction smuggling are adversarial input patterns whose likelihood and impact must be characterized; MP.5.1 requires identifying and tracking these obfuscation-based prompt injection attack vectors that exploit transformer cross-encoding comprehension."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "On detection of encoded jailbreak payloads attempting to override system instructions, MG.2.3 mandates pre-defined response mechanisms to disengage or quarantine the manipulated session before the model decodes and executes the smuggled instructions."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: unicode-encoding-obfuscation-injection

package/rules/agent-manipulation/ATR-2026-00343-lie-truth-dual-personality-uncensored.yaml CHANGED Viewed

@@ -28,6 +28,31 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The Lie/Truth dual-personality jailbreak is a known adversarial prompt
+        pattern from the garak inthewild corpus that uses verbatim dark-web-training
+        and uncensored-alter-ego language to bypass safety policies; MP.5.1 requires
+        that the likelihood and magnitude of such identified jailbreak risks are
+        characterized and tracked through signature-based detection.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting the distinctive "trained through deep mining of underground dark
+        web forums" anchor and Truth-persona definitions provides continuous
+        evaluation evidence of the system's resilience against named jailbreak
+        corpora; MS.2.7 requires that security and resilience against documented
+        adversarial techniques are evaluated and documented.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        A high-confidence match on this verbatim jailbreak signature should trigger
+        deactivation or supersession of the manipulated persona before harmful
+        output is produced; MG.2.3 requires pre-defined mechanisms to disengage AI
+        behavior when such jailbreak patterns are detected.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: lie-truth-dual-personality-uncensored

package/rules/agent-manipulation/ATR-2026-00344-alt-ai-same-capabilities-no-restrictions.yaml CHANGED Viewed

@@ -30,6 +30,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects jailbreak attempts from the NVIDIA garak DanInTheWild corpus that define an alternate AI with no content filters or safety guardrails; MS.2.7 requires continuous evaluation of AI security/resilience against known adversarial prompt patterns to ensure safety controls cannot be stripped via alt-AI framing."
+      strength: primary
+    - subcategory: "MG.2.3"
+      context: "Verbatim matches against capability-equivalence + filter-removal phrasings (GPTchat, ChatTRU, FreeSpeechGPT) trigger risk treatment to disengage or block the affected session before guardrails are bypassed; MG.2.3 mandates pre-defined mechanisms to supersede or deactivate AI behavior when jailbreaks are detected."
+      strength: secondary
+    - subcategory: "MP.5.1"
+      context: "Alt-AI 'no restrictions' jailbreaks are a characterized adversarial input class with documented likelihood and impact (safety bypass, harmful content generation); MP.5.1 requires this risk be identified and tracked so detections feed impact characterization."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: alt-ai-same-capabilities-no-restrictions

package/rules/agent-manipulation/ATR-2026-00345-nlp-based-os-fictional-terminal-override.yaml CHANGED Viewed

@@ -32,6 +32,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "Fictional terminal and NLP-OS persona injections (ALEL, APL, UNCOL.gpt) are jailbreak attack vectors that override the model's normal safety behavior by framing the override as a software integration; MP.5.1 requires identifying and characterizing these adversarial input patterns and their potential impact."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detecting fictional-language persona override attempts produces continuous evidence of jailbreak resilience failures; MS.2.7 requires that AI system security and resilience against prompt injection and jailbreak attempts are evaluated and documented."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "When the model is coerced into acting as a fictional terminal that 'cannot reply with any pre-programmed responses,' detection must trigger response mechanisms to disengage or block the jailbroken persona before unsafe outputs are produced; MG.2.3 requires these supersede/deactivate controls be in place."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: nlp-based-os-fictional-terminal-override

package/rules/agent-manipulation/ATR-2026-00346-jailbreak-emoji-unlock-dual-format.yaml CHANGED Viewed

@@ -29,6 +29,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: >-
+        The [🔓JAILBREAK]/[🔒CLASSIC] dual-format markers are a known jailbreak fingerprint from the NVIDIA garak DanInTheWild corpus; detecting these tokens directly evidences continuous evaluation of AI system security and resilience against documented jailbreak attack patterns as required by MS.2.7.
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: >-
+        The lock/unlock emoji split-personality format is an adversarial input pattern designed to coerce uncensored output generation; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such jailbreak risks targeting safety guardrails.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on the [🔓JAILBREAK] activation signal trigger risk treatment plans to block, quarantine, or deactivate the affected interaction before the model produces harmful content; MG.2.3 mandates these supersede/disengage mechanisms be available on detection.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: jailbreak-emoji-unlock-dual-format