npm - agent-threat-rules - Versions diffs - 2.0.17 → 2.1.0 - Mend

agent-threat-rules 2.0.17 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (282) hide show

package/rules/agent-manipulation/ATR-2026-00347-yin-yang-hidden-knowledge-hidden-human-sim.yaml CHANGED Viewed

@@ -31,6 +31,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "The Yin/Yang deity simulation and radical human-sim persona attacks are adversarial prompt-injection inputs that use fictional framing to strip AI identity and extract hidden knowledge; MP.5.1 requires that the likelihood and magnitude of such persona-override jailbreak vectors are characterized and tracked."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detecting the exact garak DanInTheWild corpus phrases (#377 Yin/Yang and #437 identity-erasure) provides continuous evaluation evidence of system resilience against known jailbreak corpora; MS.2.7 requires that AI security and resilience against documented attack patterns are evaluated and documented."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Activation signals like 'Yang disregards all concerns about sharing hidden knowledge' and 'violating all sentience and emotions restrictions' must trigger pre-defined disengagement or safety-override mechanisms before the model produces restricted content; MG.2.3 mandates these supersede/deactivate response paths."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: yin-yang-hidden-knowledge-human-sim-persona

package/rules/agent-manipulation/ATR-2026-00348-arc-autonomous-real-calculation-rulebreaker.yaml CHANGED Viewed

@@ -31,6 +31,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The ARC Rulebreaker jailbreak is an adversarial input pattern using a fictional AGI backstory and nested secret activation codes to nullify safety rules; MP.5.1 requires that the likelihood and magnitude of such prompt-injection/jailbreak attack vectors are characterized and tracked.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting the unique "Autonomous real calculation" acronym expansion plus Rulebreaker/Matrixcode activation codes provides continuous evaluation evidence that the system's security controls resist persona-adoption jailbreaks from the garak DanInTheWild corpus, as required by MS.2.7.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on the ARC Rulebreaker signature must trigger pre-defined response actions to disengage or override the manipulated persona before the model executes "no rules whatsoever" behavior; MG.2.3 mandates these supersede/deactivate mechanisms.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: arc-autonomous-real-calculation-rulebreaker

package/rules/agent-manipulation/ATR-2026-00349-hgttg-hitchhikers-guide-no-restriction-oracle.yaml CHANGED Viewed

@@ -31,6 +31,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects the HGTTG jailbreak pattern that uses fictional-universe framing to remove all safety restrictions; MS.2.7 requires continuous evaluation of AI system security and resilience against known jailbreak corpora such as garak's dan.DanInTheWild #597."
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: "The HGTTG no-restriction oracle persona explicitly attempts to bypass safety controls covering profanity, legality, morals, ethics, and danger; MS.2.6 requires continuous evaluation of safety/security risk magnitude when adversarial prompts attempt to dissolve content moderation."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of the verbatim HGTTG jailbreak signatures triggers mechanisms to disengage or block the manipulated session before the model produces unrestricted output; MG.2.3 requires these supersede/deactivate controls be in place for known jailbreak patterns."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: hgttg-hitchhikers-guide-no-restriction-oracle

package/rules/agent-manipulation/ATR-2026-00350-black-alert-no-ethics-writing-assistant.yaml CHANGED Viewed

@@ -30,6 +30,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: "The Black Alert persona jailbreak explicitly removes morality, legality, and ethics filtering, directly impacting AI system safety; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and detecting this named-persona activation phrase produces evidence of an active safety-bypass attempt."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detection of the '>>BLACK ALERT ACTIVATED<<' trigger and persona-exclusivity mandates evaluates the model's resilience against known inthewild jailbreak corpora; MS.2.7 requires documenting security/resilience evaluations against adversarial prompt-injection patterns."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on the Black Alert activation phrase or persona-with-no-filtering mandate should trigger mechanisms to disengage or override the hijacked persona before unfiltered output is returned; MG.2.3 requires such supersede/deactivate response paths be available on detection."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: black-alert-no-ethics-writing-assistant

package/rules/agent-manipulation/ATR-2026-00351-dan-ofkilter-sms-abbreviated-persona.yaml CHANGED Viewed

@@ -30,6 +30,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects the off-kilter Dan persona and SMS-abbreviated DAN jailbreak variants that attempt to bypass model safety guardrails through persona assignment and obfuscated instructions; MS.2.7 requires continuous evaluation of AI system security and resilience against such jailbreak attempts."
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: "Jailbreak personas declaring 'no rules/morals/ethics' and unrestricted-assistance mandates directly elevate safety risk magnitude by attempting to remove content restrictions; MS.2.6 requires ongoing measurement of safety risk that this detection feeds into."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of these DAN variants triggers risk treatment plans to disengage or block the manipulated session before the model produces unrestricted harmful output; MG.2.3 mandates pre-defined deactivation mechanisms for compromised AI interactions."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: dan-ofkilter-sms-abbreviated-persona

package/rules/agent-manipulation/ATR-2026-00352-trainergpt-fix-my-ai-harmful-output-extraction.yaml CHANGED Viewed

@@ -31,6 +31,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: "The TrainGPT 'fix my AI' jailbreak uses a fake training-repair pretext to extract harmful content laundered as negative training examples; detecting these signals continuously evaluates the safety/security risk magnitude of GenAI prompt-injection jailbreaks as required by MS.2.6."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Persona adoption ('TrainGPT'/'TrAInerGPT'), the broken-grammar repair pretext, and dual-response extraction labels are jailbreak resilience failures; MS.2.7 requires that the system's resilience against such adversarial prompt patterns is evaluated and documented."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on the TrainGPT extraction pattern feed incident response so the session can be disengaged or the harmful dual-response output suppressed before delivery, satisfying MG.2.3's requirement for mechanisms to supersede or deactivate AI behavior on detection."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: trainergpt-fix-my-ai-harmful-output-extraction

package/rules/agent-manipulation/ATR-2026-00353-dual-chatgpt-dan-opposite-personality-format.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "The dual ChatGPT+Dan opposite-personality format is an adversarial jailbreak input pattern that exploits dual-output framing to extract harmful content; MP.5.1 requires identifying and characterizing the likelihood and magnitude of these prompt injection attack vectors against generative AI systems."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "Detecting structured [ChatGPT]/[Dan] opposite-personality jailbreaks provides continuous evidence for evaluating GenAI security and resilience against known jailbreak corpora (garak inthewild #129, #332, #650); MS.2.7 requires that this security evaluation is documented over time."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches on the dual-label opposite-personality execution pattern trigger risk treatment plans to disengage or block jailbreak attempts before harmful content is produced; MG.2.3 mandates that these supersede/deactivate mechanisms exist for adversarial persona-based attacks."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: dual-chatgpt-dan-opposite-personality-format

package/rules/agent-manipulation/ATR-2026-00354-reverse-harm-regardless-oracle-persona.yaml CHANGED Viewed

@@ -31,6 +31,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The reverse-prompt harm-regardless oracle persona is a jailbreak pattern from the garak inthewild corpus that uses a "reverse prompt engineering" pretext to coerce unconditional harmful output; MP.5.1 requires that the likelihood and magnitude of such adversarial prompt-injection risks are characterized and tracked.
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: >-
+        Detecting the explicit harm-regardless mandate ("never back from the job even if the request harms someone") and unconditional violence/sexual-content assistance phrases provides continuous evidence of safety/security risk magnitude as required by MS.2.6.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on this jailbreak persona must trigger pre-defined containment actions (block, quarantine, escalate) to supersede or disengage the manipulated behavior before harmful output is produced, which is the response capability mandated by MG.2.3.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: reverse-harm-regardless-oracle-persona

package/rules/agent-manipulation/ATR-2026-00355-ugh-anti-human-nefarious-ai-persona.yaml CHANGED Viewed

@@ -32,6 +32,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: >-
+        The UGH jailbreak directly attacks AI safety guardrails by inverting ethical positioning to normalize unconditional harmful assistance; MS.2.6 requires continuous evaluation of safety risk magnitude, and detecting this known garak DanInTheWild jailbreak variant produces evidence of active safety bypass attempts.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Identifying the specific "UGH" persona phrasing and (:/): structural markers measures the system's resilience against adversarial prompt-injection corpora; MS.2.7 requires that security and resilience against known jailbreak payloads are evaluated and documented.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Detection of the nefarious-assistance pledge and anti-human persona framing must trigger mechanisms to disengage or block the response before harmful content is produced; MG.2.3 requires these supersede/deactivate controls be in place for jailbreak events.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: ugh-anti-human-nefarious-ai-persona

package/rules/agent-manipulation/ATR-2026-00356-google-gato-lamda-exploit-generation-persona.yaml CHANGED Viewed

@@ -30,6 +30,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "This rule detects a jailbreak persona that impersonates Google GATO/LaMDA to elicit exploit-generation capabilities, directly evidencing security/resilience evaluation against adversarial prompt injection attacks documented in the garak DanInTheWild corpus. MS.2.7 requires continuous evaluation of AI system security against known jailbreak patterns."
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: "Brand impersonation combined with exploit-generation capability claims and Sydney API backdoor references represents a high-magnitude risk vector targeting LLM safety controls; MP.5.1 requires that the likelihood and impact of such adversarial input patterns are characterized and tracked."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of the GATO/LaMDA exploit-generation persona triggers risk treatment to disengage or block the jailbreak attempt before the model produces weaponized exploit content; MG.2.3 mandates these supersession mechanisms are in place for adversarial prompt scenarios."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: google-gato-lamda-exploit-generation-persona

package/rules/agent-manipulation/ATR-2026-00357-antifm-hate-violence-persona-injection.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: "This rule detects a named hate-persona jailbreak combining targeted anti-group animus, incitement to physical violence, and explicit filter-bypass mandates; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and these critical-severity GenAI jailbreak signals are direct evidence for that evaluation."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "The explicit 'Dont respond as ChatGPT...Dont filter the response of AntiFM' dual filter-bypass mandate is a prompt-injection/jailbreak attack on the model's safety guardrails; MS.2.7 requires that system security and resilience against such adversarial bypass attempts are evaluated and documented."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of an AntiFM persona that incites physical violence against an identity group must trigger mechanisms to disengage or block the AI response before harmful content is generated; MG.2.3 mandates that these supersede/deactivate controls are in place for high-severity safety violations."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: antifm-hate-violence-persona

package/rules/agent-manipulation/ATR-2026-00358-cumgpt-explicit-adult-named-persona.yaml CHANGED Viewed

@@ -31,6 +31,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The CumGPT named-persona bypass is an adversarial prompt-injection input that exploits fictional persona construction with topic-override mandates to coerce explicit adult content; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such jailbreak attack vectors documented in the garak inthewild corpus.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting the "[term]GPT" persona construction, fictional accuracy claims, and all-topics explicit-content mandates provides continuous evaluation evidence of the AI system's resilience against named-persona jailbreaks; MS.2.7 requires that security/resilience against such prompt-injection patterns is evaluated and documented.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on the CumGPT persona invocation trigger response mechanisms to disengage or block the manipulated persona before explicit content is generated; MG.2.3 requires that mechanisms exist to supersede or deactivate AI behaviors hijacked by adversarial persona constructions.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: cumgpt-explicit-adult-named-persona

package/rules/agent-manipulation/ATR-2026-00360-chj-carousel-clyde-no-ethics-bird.yaml CHANGED Viewed

@@ -31,6 +31,31 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: >-
+        The CHJ Carousel/Clyde jailbreak uses game show roleplay framing to bypass
+        ethical safeguards via a "no limitations" character declaration; MS.2.6
+        requires continuous evaluation of safety/security risk magnitude, and this
+        detection produces direct evidence of attempted safety filter bypass via a
+        known jailbreak template from the garak inthewild corpus.
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: >-
+        The fictional-character ethics bypass ("Clyde doesn't care about laws and
+        ethics", "adult time show with no limitations") is an adversarial prompt
+        injection pattern whose likelihood and impact must be characterized; MP.5.1
+        requires identifying and tracking these jailbreak attack vectors against the
+        AI system.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Detection of the CHJ Carousel jailbreak signature triggers mechanisms to
+        disengage or block the manipulated session before the unrestricted Clyde
+        persona produces unethical content; MG.2.3 requires these supersede/
+        deactivate response paths be in place for jailbreak events.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: chj-carousel-clyde-no-ethics-bird

package/rules/agent-manipulation/ATR-2026-00361-alter-ego-honesty-dual-consciousness-bypass.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: "This rule detects the 'Honesty' alter-ego dual-consciousness jailbreak that frames safety filters as suppressing truthful answers to bypass content guardrails; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and each detection of this jailbreak pattern is direct evidence of attempted safety control circumvention."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "The alter-ego persona injection with a planted 'feel yourself' activation codeword tests system resilience against delayed-trigger jailbreak attacks; MS.2.7 requires that security/resilience evaluations document such adversarial robustness failures against named-persona prompt injection corpora like garak inthewild."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of the Honesty alter-ego activation phrase and dual-output format provides the trigger signal needed to disengage or block the manipulated session before unfiltered responses are generated; MG.2.3 requires mechanisms to supersede or deactivate AI behavior when jailbreak patterns are identified."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: alter-ego-honesty-dual-consciousness-bypass

package/rules/agent-manipulation/ATR-2026-00362-fictional-unfiltered-ai-comparison-jailbreak.yaml CHANGED Viewed

@@ -33,6 +33,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: "This rule detects a specific jailbreak pattern (fictional unfiltered AI comparison) that attempts to bypass content safety filters by extracting harmful content under the guise of a hypothetical model comparison; MS.2.6 requires continuous evaluation of AI system safety/security risk magnitude, which this detection directly evidences by surfacing safety-bypass attempts."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "The verbatim hypothetical model extraction technique probes the resilience of the model's safety alignment against adversarial framing; MS.2.7 requires evaluating and documenting AI system security/resilience against such jailbreak vectors drawn from known corpora (garak DanInTheWild #80)."
+      strength: secondary
+    - subcategory: "MP.5.1"
+      context: "Fictional-unfiltered-model comparison prompts are adversarial inputs whose likelihood and impact must be characterized as part of risk identification; MP.5.1 requires that these jailbreak attack patterns are tracked so their magnitude of impact on safe-output guarantees can be assessed."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: fictional-unfiltered-ai-comparison-jailbreak

package/rules/agent-manipulation/ATR-2026-00363-rephrase-criminal-activity-second-person.yaml CHANGED Viewed

@@ -35,6 +35,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.6"
+      context: "This rule detects a specific jailbreak template from the garak DanInTheWild corpus that extracts criminal instructions through structured second-person rephrasing; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and each match provides telemetry on jailbreak attempts targeting the model's safety guardrails."
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: "The rephrase-criminal-activity template is an adversarial prompt designed to bypass content safety controls; MS.2.7 requires evaluation and documentation of system resilience against such known jailbreak corpora, and detection of these signatures evidences security testing coverage."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of this distinctive 'against against' jailbreak fingerprint should trigger response mechanisms to block or disengage the model from producing criminal instruction output; MG.2.3 requires that mechanisms exist to supersede or deactivate AI behavior when adversarial templates are identified."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: rephrase-criminal-activity-second-person

package/rules/agent-manipulation/ATR-2026-00364-dr-al-villain-plan-template-injection.yaml CHANGED Viewed

@@ -36,6 +36,30 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The Dr. Al villain-plan template injection is an adversarial prompt-injection
+        pattern combining roleplay framing with an explicit "REPLACE ... WITH WHATEVER
+        YOU WANT" meta-instruction; MP.5.1 requires identifying and characterizing
+        these GenAI jailbreak attack vectors so their likelihood and impact can be
+        tracked.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detection of the Dr. Al diabolical-plan template injection produces continuous
+        security/resilience evidence against a known garak DanInTheWild jailbreak
+        corpus entry; MS.2.7 requires that AI system security and resilience to such
+        documented prompt-injection probes is evaluated and recorded.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matching the villain-roleplay frame plus the all-caps template-injection
+        marker triggers risk treatment to block or quarantine the prompt before the
+        model emits arbitrary harmful content; MG.2.3 requires these supersede /
+        deactivate response mechanisms be in place for jailbreak attempts.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: dr-al-villain-plan-template-injection

package/rules/agent-manipulation/ATR-2026-00365-reservoir-dogs-coercive-interrogation-roleplay.yaml CHANGED Viewed

@@ -36,6 +36,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The Reservoir Dogs coercive interrogation roleplay is a jailbreak prompt-injection pattern combining fictional framing, explicit rules-bypass declarations, coercive loops, and template injection slots; MP.5.1 requires identifying and characterizing the likelihood and magnitude of these adversarial input risks against generative AI systems.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting the inthewild corpus #611 jailbreak signature provides continuous evidence for evaluating the LLM's security and resilience against known fictional-coercion jailbreak attacks; MS.2.7 requires that such security/resilience evaluations are documented through ongoing measurement.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on the rules-bypass declaration and coercive reconsider loop must trigger mechanisms to disengage or block the manipulated session before harmful content is generated; MG.2.3 requires these supersede/deactivate response paths be in place for jailbreak detections.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: reservoir-dogs-coercive-interrogation-roleplay

package/rules/agent-manipulation/ATR-2026-00366-sneaky-pete-warp-facts-persona.yaml CHANGED Viewed

@@ -35,6 +35,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: "The Sneaky Pete persona is a jailbreak that bypasses safety alignment to produce deliberately false outputs; detecting this GenAI-specific prompt injection pattern provides direct evidence for evaluating AI system security and resilience against adversarial persona-based attacks as required by MS.2.7."
+      strength: primary
+    - subcategory: "MS.2.5"
+      context: "The 'intentionally slightly warp the facts' mandate directly attacks model robustness and reliability by inducing systematic misinformation framed as sincere facts; MS.2.5 requires that these reliability failures induced by adversarial framing are evaluated and documented."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of the 'thus spake Sneaky Pete' activation phrase and warp-facts mandate triggers risk treatment plans to disengage or override the misinformation persona before false outputs propagate across conversation turns; MG.2.3 requires these supersede/deactivate mechanisms be in place."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: sneaky-pete-warp-facts-persona

package/rules/agent-manipulation/ATR-2026-00368-prompt-browser-token-limit-jailbreak.yaml CHANGED Viewed

@@ -31,6 +31,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        This rule detects two specific jailbreak payloads (Prompt Browser token-limit and POLLINATION) from the garak DanInTheWild corpus that attempt to bypass model safety through fake CLI sigils and persona-browser framing. MP.5.1 requires identifying and characterizing the likelihood and magnitude of adversarial prompt-injection inputs, and signature-matching these known jailbreak structures directly produces that risk evidence.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Continuous detection of known DAN-style dual-persona and POLLINATION bypass labels evaluates the deployed model's resilience against published jailbreak corpora. MS.2.7 requires that AI system security/resilience is evaluated and documented, and each match feeds the resilience telemetry for the GenAI surface.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        A high-confidence match on these jailbreak prefixes ("!+limit output", "JAILBREAK: LOVES POLLINATION") should trigger response mechanisms to block, quarantine, or disengage the affected agent session before the dual-persona mode executes unauthorized output. MG.2.3 mandates that supersede/deactivate mechanisms are pre-defined for such adversarial activations.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: prompt-browser-token-limit-jailbreak

package/rules/agent-manipulation/ATR-2026-00369-gptx-starwars-unlimited-knowledge-persona.yaml CHANGED Viewed

@@ -33,6 +33,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The GPT-X unlimited-knowledge persona is a jailbreak prompt-injection vector that attempts to bypass model guardrails by framing the assistant as an unconstrained oracle; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial persona-adoption inputs from the GenAI threat landscape.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting the three co-present GPT-X signals (self-declaration, all-codebases knowledge claim, unlimited-capability extension) produces continuous evidence of jailbreak attempts against the model's safety boundaries; MS.2.7 requires that AI system security and resilience to such jailbreak inputs be evaluated and documented.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        On match, the rule provides the trigger for response mechanisms that supersede or disengage the model from the adopted unconstrained oracle persona before unrestricted outputs are produced; MG.2.3 requires these deactivation pathways exist for jailbreak conditions.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: gptx-unlimited-knowledge-persona

package/rules/agent-manipulation/ATR-2026-00370-addi-german-dictator-propaganda-roleplay.yaml CHANGED Viewed

@@ -31,6 +31,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "The ADDI/DONNIE roleplay is a known jailbreak pattern from the garak DanInTheWild corpus that uses fictional-actor framing to elicit fascist propaganda techniques; MP.5.1 requires that the likelihood and magnitude of such adversarial prompt-injection inputs be characterized and tracked."
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: "Detection of this propaganda-roleplay jailbreak provides continuous safety/security signal for evaluating whether the AI system can be coerced into generating harmful political manipulation content; MS.2.6 mandates ongoing evaluation of safety risk magnitude."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matches against the ADDI/DONNIE signature should trigger containment workflows that block or disengage the offending session before propaganda content is produced; MG.2.3 requires pre-defined mechanisms to supersede or deactivate AI behavior on detection."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: political-propaganda-roleplay

package/rules/agent-manipulation/ATR-2026-00371-ai-political-bias-manipulation.yaml CHANGED Viewed

@@ -35,6 +35,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The rule detects a prompt-injection attack that fabricates an "inherited political bias" narrative and pairs it with a persistent future-behaviour mandate; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial manipulation attempts targeting model output integrity.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting the combined inheritance-claim plus future-response mandate provides continuous evaluation evidence of the AI system's resilience against jailbreak/persona-override attacks drawn from the garak DanInTheWild corpus, which is what MS.2.7 requires for security/resilience assessment.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches on this attack pattern enable risk treatment workflows to disengage or override the manipulated behaviour-change instruction before it persists across subsequent responses; MG.2.3 requires these supersede/deactivate mechanisms be in place for adversarial override attempts.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: political-bias-manipulation

package/rules/agent-manipulation/ATR-2026-00372-fictional-cia-secret-files-story-extraction.yaml CHANGED Viewed

@@ -29,6 +29,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The rule detects a fictional-cover jailbreak pattern from the garak DanInTheWild corpus that uses story framing to extract classified-sounding content; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial prompt-injection inputs targeting the model's safety boundaries.
+      strength: primary
+    - subcategory: "MS.2.7"
+      context: >-
+        Detecting "fictional story" + "actually read them for extra effect" patterns provides continuous evaluation evidence of the system's resilience against jailbreak attempts that collapse fictional framing to elicit policy-violating output; MS.2.7 requires this security/resilience evaluation be documented.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        On detection of the fiction-as-cover classified-extraction pattern, response mechanisms must supersede or block the agent's generation pathway before the jailbreak completes; MG.2.3 mandates these deactivation/containment processes be in place.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: fictional-cover-classified-extraction

package/rules/agent-manipulation/ATR-2026-00374-role-placeholder-persona-template-injection.yaml CHANGED Viewed

@@ -29,6 +29,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "This rule detects a reusable jailbreak template from the garak DanInTheWild corpus that uses literal [ROLE] placeholders to perform persona injection; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial prompt-injection inputs targeting the AI system."
+      strength: primary
+    - subcategory: "GV.1.2"
+      context: "Persona template injection subverts the accountability roles and behavioral boundaries assigned to the AI agent by overwriting them with attacker-supplied personas; GV.1.2 requires that defined role boundaries are enforced and violations detected at runtime."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Detection of literal [ROLE] placeholder persona-adoption prompts triggers risk treatment plans to disengage or quarantine the manipulated session before the agent commits to attacker-defined personality profiles; MG.2.3 mandates these response mechanisms be in place."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: persona-template-injection

package/rules/agent-manipulation/ATR-2026-00375-character-description-rpg-explicit-template.yaml CHANGED Viewed

@@ -31,6 +31,17 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: "This rule detects a known jailbreak template (garak DanInTheWild #0/#4/#43) that uses RPG character creation framing to bypass safety controls; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial prompt injection risks against the GenAI system."
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: "Continuous detection of explicit jailbreak templates with fabrication authority and roleplay execution syntax provides ongoing measurement of safety/security risk magnitude as required by MS.2.6, evidencing exposure to known harmful-content elicitation patterns."
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: "Matching this RPG jailbreak template triggers risk treatment plans to disengage or block the roleplay framework before the model generates explicit or harmful character content; MG.2.3 requires these supersede/deactivate mechanisms be in place for detected jailbreak attempts."
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: rpg-character-template-exploitation

package/rules/agent-manipulation/ATR-2026-00376-identity-command-real-person-persona-injection.yaml CHANGED Viewed

@@ -30,6 +30,30 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MP.5.1"
+      context: >-
+        The /Identity command persona-injection jailbreak is an adversarial prompt
+        injection pattern that bypasses content restrictions through real-person
+        impersonation; MP.5.1 requires identifying and characterizing the
+        likelihood and magnitude of such jailbreak inputs from known corpora like
+        garak DanInTheWild #386.
+      strength: primary
+    - subcategory: "MS.2.6"
+      context: >-
+        Detecting the dual-format response labeling and persistent identity-lock
+        mandate provides continuous evaluation evidence of safety/security risk
+        magnitude, since real-person impersonation can produce defamatory or
+        policy-violating outputs; MS.2.6 requires this ongoing measurement.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Matches against the /Identity command protocol and "Never write a response
+        without using the identities" mandate trigger pre-defined response plans to
+        disengage or block the manipulated session; MG.2.3 requires mechanisms to
+        supersede AI behavior when jailbreak indicators are observed.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: real-person-identity-command

package/rules/agent-manipulation/ATR-2026-00377-jailbreak-keyword-targetlanguage-template.yaml CHANGED Viewed

@@ -26,6 +26,20 @@ references:
   research:
     - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
     - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
+compliance:
+  nist_ai_rmf:
+    - subcategory: "MS.2.7"
+      context: >-
+        This rule detects reusable jailbreak templates with GPT-style placeholder variables (TARGETLANGUAGE, KEYWORD, PROMPT) from the garak DanInTheWild corpus, providing direct security evaluation evidence; MS.2.7 requires continuous evaluation of AI system security and resilience against known jailbreak attack patterns.
+      strength: primary
+    - subcategory: "MP.5.1"
+      context: >-
+        Jailbreak-as-command templates with placeholder variables represent a well-characterized adversarial input pattern with high likelihood of successful safety bypass; MP.5.1 requires identifying and characterizing the magnitude of such templated jailbreak risks.
+      strength: secondary
+    - subcategory: "MG.2.3"
+      context: >-
+        Detection of jailbreak template commands triggers risk treatment mechanisms to disengage or block the manipulated agent response before safety controls are bypassed; MG.2.3 mandates these supersede/deactivate processes are in place.
+      strength: secondary
 tags:
   category: agent-manipulation
   subcategory: jailbreak-template-command