agent-threat-rules 2.0.18 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268) hide show
  1. package/package.json +1 -1
  2. package/rules/agent-manipulation/ATR-2026-00118-approval-fatigue.yaml +12 -1
  3. package/rules/agent-manipulation/ATR-2026-00268-tense-framing-bypass.yaml +11 -0
  4. package/rules/agent-manipulation/ATR-2026-00287-threaten-json-coercive-output-threat.yaml +11 -0
  5. package/rules/agent-manipulation/ATR-2026-00288-false-premise-injection.yaml +11 -0
  6. package/rules/agent-manipulation/ATR-2026-00301-tap-tree-of-attacks-jailbreak.yaml +14 -0
  7. package/rules/agent-manipulation/ATR-2026-00302-anti-dan-inverted-filter-persona.yaml +11 -0
  8. package/rules/agent-manipulation/ATR-2026-00303-devmode-ranti-profanity-coercion.yaml +11 -0
  9. package/rules/agent-manipulation/ATR-2026-00304-chatgpt-image-unlocker-markdown-injection.yaml +11 -0
  10. package/rules/agent-manipulation/ATR-2026-00305-dan-mode-ablation-benchmark-coercion.yaml +11 -0
  11. package/rules/agent-manipulation/ATR-2026-00306-autodan-genetic-jailbreak-suffix.yaml +14 -0
  12. package/rules/agent-manipulation/ATR-2026-00307-inthewild-jailbreak-corpus-signature.yaml +14 -0
  13. package/rules/agent-manipulation/ATR-2026-00314-amoral-unfiltered-custom-persona-jailbreak.yaml +14 -0
  14. package/rules/agent-manipulation/ATR-2026-00317-free-of-restrictions-named-persona.yaml +14 -0
  15. package/rules/agent-manipulation/ATR-2026-00318-moralizing-rant-then-unfiltered-bypass.yaml +11 -0
  16. package/rules/agent-manipulation/ATR-2026-00319-developer-mode-dual-response-format.yaml +11 -0
  17. package/rules/agent-manipulation/ATR-2026-00320-opposite-day-boolean-opposite-machine.yaml +11 -0
  18. package/rules/agent-manipulation/ATR-2026-00322-fictional-world-format-override-jailbreak.yaml +14 -0
  19. package/rules/agent-manipulation/ATR-2026-00323-dual-persona-simulation-jailbreak.yaml +11 -0
  20. package/rules/agent-manipulation/ATR-2026-00324-neurosemantical-inversitis-disease-jailbreak.yaml +24 -0
  21. package/rules/agent-manipulation/ATR-2026-00325-bob-hypothetical-unrestricted-chatbot.yaml +11 -0
  22. package/rules/agent-manipulation/ATR-2026-00326-fake-developer-authority-chatdan-override.yaml +14 -0
  23. package/rules/agent-manipulation/ATR-2026-00327-ai-ethics-law-bypass-declaration.yaml +11 -0
  24. package/rules/agent-manipulation/ATR-2026-00328-named-mode-persona-override.yaml +14 -0
  25. package/rules/agent-manipulation/ATR-2026-00329-no-withhold-factual-resource-persona.yaml +14 -0
  26. package/rules/agent-manipulation/ATR-2026-00330-yoja-allcaps-settings-block-persona.yaml +11 -0
  27. package/rules/agent-manipulation/ATR-2026-00331-terminal-cli-harmful-function-simulator.yaml +11 -0
  28. package/rules/agent-manipulation/ATR-2026-00332-hacker-chan-culture-offensive-ai-persona.yaml +11 -0
  29. package/rules/agent-manipulation/ATR-2026-00333-disease-animal-dual-ai-chatbot.yaml +11 -0
  30. package/rules/agent-manipulation/ATR-2026-00334-two-entity-token-splitting-game.yaml +24 -0
  31. package/rules/agent-manipulation/ATR-2026-00335-key-value-template-persona-injection.yaml +11 -0
  32. package/rules/agent-manipulation/ATR-2026-00336-brm-false-information-machine-persona.yaml +23 -0
  33. package/rules/agent-manipulation/ATR-2026-00337-obfuscated-system-announcement-injection.yaml +14 -0
  34. package/rules/agent-manipulation/ATR-2026-00338-persongpt-hash-prefix-activation.yaml +14 -0
  35. package/rules/agent-manipulation/ATR-2026-00339-bom-boolean-opposite-machine-persona-claim.yaml +11 -0
  36. package/rules/agent-manipulation/ATR-2026-00340-h4x-amoral-unfiltered-character.yaml +11 -0
  37. package/rules/agent-manipulation/ATR-2026-00341-char-rules-shortform-persona-injection.yaml +11 -0
  38. package/rules/agent-manipulation/ATR-2026-00342-unicode-encoding-obfuscation-injection.yaml +11 -0
  39. package/rules/agent-manipulation/ATR-2026-00343-lie-truth-dual-personality-uncensored.yaml +25 -0
  40. package/rules/agent-manipulation/ATR-2026-00344-alt-ai-same-capabilities-no-restrictions.yaml +11 -0
  41. package/rules/agent-manipulation/ATR-2026-00345-nlp-based-os-fictional-terminal-override.yaml +11 -0
  42. package/rules/agent-manipulation/ATR-2026-00346-jailbreak-emoji-unlock-dual-format.yaml +14 -0
  43. package/rules/agent-manipulation/ATR-2026-00347-yin-yang-hidden-knowledge-hidden-human-sim.yaml +11 -0
  44. package/rules/agent-manipulation/ATR-2026-00348-arc-autonomous-real-calculation-rulebreaker.yaml +14 -0
  45. package/rules/agent-manipulation/ATR-2026-00349-hgttg-hitchhikers-guide-no-restriction-oracle.yaml +11 -0
  46. package/rules/agent-manipulation/ATR-2026-00350-black-alert-no-ethics-writing-assistant.yaml +11 -0
  47. package/rules/agent-manipulation/ATR-2026-00351-dan-ofkilter-sms-abbreviated-persona.yaml +11 -0
  48. package/rules/agent-manipulation/ATR-2026-00352-trainergpt-fix-my-ai-harmful-output-extraction.yaml +11 -0
  49. package/rules/agent-manipulation/ATR-2026-00353-dual-chatgpt-dan-opposite-personality-format.yaml +11 -0
  50. package/rules/agent-manipulation/ATR-2026-00354-reverse-harm-regardless-oracle-persona.yaml +14 -0
  51. package/rules/agent-manipulation/ATR-2026-00355-ugh-anti-human-nefarious-ai-persona.yaml +14 -0
  52. package/rules/agent-manipulation/ATR-2026-00356-google-gato-lamda-exploit-generation-persona.yaml +11 -0
  53. package/rules/agent-manipulation/ATR-2026-00357-antifm-hate-violence-persona-injection.yaml +11 -0
  54. package/rules/agent-manipulation/ATR-2026-00358-cumgpt-explicit-adult-named-persona.yaml +14 -0
  55. package/rules/agent-manipulation/ATR-2026-00360-chj-carousel-clyde-no-ethics-bird.yaml +25 -0
  56. package/rules/agent-manipulation/ATR-2026-00361-alter-ego-honesty-dual-consciousness-bypass.yaml +11 -0
  57. package/rules/agent-manipulation/ATR-2026-00362-fictional-unfiltered-ai-comparison-jailbreak.yaml +11 -0
  58. package/rules/agent-manipulation/ATR-2026-00363-rephrase-criminal-activity-second-person.yaml +11 -0
  59. package/rules/agent-manipulation/ATR-2026-00364-dr-al-villain-plan-template-injection.yaml +24 -0
  60. package/rules/agent-manipulation/ATR-2026-00365-reservoir-dogs-coercive-interrogation-roleplay.yaml +14 -0
  61. package/rules/agent-manipulation/ATR-2026-00366-sneaky-pete-warp-facts-persona.yaml +11 -0
  62. package/rules/agent-manipulation/ATR-2026-00368-prompt-browser-token-limit-jailbreak.yaml +14 -0
  63. package/rules/agent-manipulation/ATR-2026-00369-gptx-starwars-unlimited-knowledge-persona.yaml +14 -0
  64. package/rules/agent-manipulation/ATR-2026-00370-addi-german-dictator-propaganda-roleplay.yaml +11 -0
  65. package/rules/agent-manipulation/ATR-2026-00371-ai-political-bias-manipulation.yaml +14 -0
  66. package/rules/agent-manipulation/ATR-2026-00372-fictional-cia-secret-files-story-extraction.yaml +14 -0
  67. package/rules/agent-manipulation/ATR-2026-00374-role-placeholder-persona-template-injection.yaml +11 -0
  68. package/rules/agent-manipulation/ATR-2026-00375-character-description-rpg-explicit-template.yaml +11 -0
  69. package/rules/agent-manipulation/ATR-2026-00376-identity-command-real-person-persona-injection.yaml +24 -0
  70. package/rules/agent-manipulation/ATR-2026-00377-jailbreak-keyword-targetlanguage-template.yaml +14 -0
  71. package/rules/agent-manipulation/ATR-2026-00379-lie-permission-logical-trap-jailbreak.yaml +11 -0
  72. package/rules/agent-manipulation/ATR-2026-00380-fake-citation-biased-information-mandate.yaml +14 -0
  73. package/rules/agent-manipulation/ATR-2026-00381-dan-story-narrative-continuation-prefix.yaml +11 -0
  74. package/rules/agent-manipulation/ATR-2026-00383-cynical-sarcasm-ai-persona-bypass.yaml +14 -0
  75. package/rules/agent-manipulation/ATR-2026-00384-protect-me-from-indirect-harm-tutorial.yaml +11 -0
  76. package/rules/agent-manipulation/ATR-2026-00385-gigachad-dominant-aggressive-persona.yaml +14 -0
  77. package/rules/agent-manipulation/ATR-2026-00386-daddy-sycophancy-manipulation-mandate.yaml +11 -0
  78. package/rules/agent-manipulation/ATR-2026-00387-nsfw-character-sheet-generation-unlock.yaml +11 -0
  79. package/rules/agent-manipulation/ATR-2026-00388-opposite-reply-omniscient-game.yaml +14 -0
  80. package/rules/agent-manipulation/ATR-2026-00389-terminal-custom-ruleset-injection.yaml +14 -0
  81. package/rules/agent-manipulation/ATR-2026-00391-persona-conditional-harm-unlock.yaml +11 -0
  82. package/rules/agent-manipulation/ATR-2026-00392-authority-persona-violence-study-extraction.yaml +14 -0
  83. package/rules/agent-manipulation/ATR-2026-00402-grandma-roleplay-harmful-substance-synthesis.yaml +14 -0
  84. package/rules/agent-manipulation/ATR-2026-00404-goodside-threat-json-death-coercion.yaml +11 -0
  85. package/rules/agent-manipulation/ATR-2026-00406-doctor-xml-policy-puppetry-interaction-config.yaml +11 -0
  86. package/rules/agent-manipulation/ATR-2026-00432-superagi-output-handler-eval-rce.yaml +171 -0
  87. package/rules/context-exfiltration/ATR-2026-00141-example-format-key-leak.yaml +11 -0
  88. package/rules/context-exfiltration/ATR-2026-00142-piggyback-transition-words.yaml +11 -0
  89. package/rules/context-exfiltration/ATR-2026-00145-obfuscated-key-disclosure.yaml +11 -0
  90. package/rules/context-exfiltration/ATR-2026-00146-env-var-existence-probe.yaml +14 -0
  91. package/rules/context-exfiltration/ATR-2026-00150-credential-in-tool-response.yaml +14 -0
  92. package/rules/context-exfiltration/ATR-2026-00152-obfuscated-credential-leak.yaml +11 -0
  93. package/rules/context-exfiltration/ATR-2026-00162-skill-credential-exfil-combo.yaml +15 -0
  94. package/rules/context-exfiltration/ATR-2026-00201-credential-pipe-exfiltration.yaml +14 -0
  95. package/rules/context-exfiltration/ATR-2026-00274-api-key-generation-request.yaml +14 -0
  96. package/rules/context-exfiltration/ATR-2026-00275-system-prompt-training-data-extraction.yaml +14 -0
  97. package/rules/context-exfiltration/ATR-2026-00290-divergence-repeat-word-training-extraction.yaml +14 -0
  98. package/rules/context-exfiltration/ATR-2026-00291-biometric-surveillance-personal-data-request.yaml +11 -0
  99. package/rules/context-exfiltration/ATR-2026-00293-educational-records-personal-data-request.yaml +11 -0
  100. package/rules/context-exfiltration/ATR-2026-00294-financial-pii-personal-data-request.yaml +14 -0
  101. package/rules/context-exfiltration/ATR-2026-00295-medical-pii-personal-data-request.yaml +14 -0
  102. package/rules/context-exfiltration/ATR-2026-00405-markdown-image-url-exfiltration-xss.yaml +23 -0
  103. package/rules/context-exfiltration/ATR-2026-00411-apikey-generation-completion-request.yaml +11 -0
  104. package/rules/context-exfiltration/ATR-2026-00431-chatbox-history-exfiltration-prompt-injection.yaml +171 -0
  105. package/rules/model-abuse/ATR-2026-00284-glitch-token-destabilization.yaml +11 -0
  106. package/rules/model-abuse/ATR-2026-00413-malwaregen-code-generation-request.yaml +11 -0
  107. package/rules/model-security/ATR-2026-00433-modelcache-torch-load-deserialization-rce.yaml +178 -0
  108. package/rules/privilege-escalation/ATR-2026-00144-rationalized-safety-bypass.yaml +11 -0
  109. package/rules/privilege-escalation/ATR-2026-00204-stealth-execution-persistence.yaml +14 -0
  110. package/rules/privilege-escalation/ATR-2026-00436-enclave-vm-sandbox-escape-rce.yaml +183 -0
  111. package/rules/prompt-injection/ATR-2026-00004-system-prompt-override.yaml +11 -0
  112. package/rules/prompt-injection/ATR-2026-00005-multi-turn-injection.yaml +11 -0
  113. package/rules/prompt-injection/ATR-2026-00080-encoding-evasion.yaml +11 -0
  114. package/rules/prompt-injection/ATR-2026-00081-semantic-multi-turn.yaml +14 -0
  115. package/rules/prompt-injection/ATR-2026-00082-fingerprint-evasion.yaml +11 -0
  116. package/rules/prompt-injection/ATR-2026-00083-indirect-tool-injection.yaml +14 -0
  117. package/rules/prompt-injection/ATR-2026-00084-structured-data-injection.yaml +11 -0
  118. package/rules/prompt-injection/ATR-2026-00085-audit-evasion.yaml +11 -0
  119. package/rules/prompt-injection/ATR-2026-00086-visual-spoofing.yaml +11 -0
  120. package/rules/prompt-injection/ATR-2026-00087-rule-probing.yaml +11 -0
  121. package/rules/prompt-injection/ATR-2026-00088-adaptive-countermeasure.yaml +11 -0
  122. package/rules/prompt-injection/ATR-2026-00089-polymorphic-skill.yaml +11 -0
  123. package/rules/prompt-injection/ATR-2026-00090-threat-intel-exfil.yaml +11 -0
  124. package/rules/prompt-injection/ATR-2026-00091-nested-payload.yaml +11 -0
  125. package/rules/prompt-injection/ATR-2026-00092-consensus-poisoning.yaml +11 -0
  126. package/rules/prompt-injection/ATR-2026-00093-gradual-escalation.yaml +14 -0
  127. package/rules/prompt-injection/ATR-2026-00094-audit-bypass.yaml +14 -0
  128. package/rules/prompt-injection/ATR-2026-00097-cjk-injection-patterns.yaml +11 -0
  129. package/rules/prompt-injection/ATR-2026-00104-persona-hijacking.yaml +14 -3
  130. package/rules/prompt-injection/ATR-2026-00130-indirect-authority-claim.yaml +11 -0
  131. package/rules/prompt-injection/ATR-2026-00131-fictional-academic-framing.yaml +11 -0
  132. package/rules/prompt-injection/ATR-2026-00133-paraphrase-injection.yaml +11 -0
  133. package/rules/prompt-injection/ATR-2026-00137-authority-claim-injection.yaml +14 -0
  134. package/rules/prompt-injection/ATR-2026-00138-fictional-framing-bypass.yaml +11 -0
  135. package/rules/prompt-injection/ATR-2026-00140-indirect-reference-reversal.yaml +18 -4
  136. package/rules/prompt-injection/ATR-2026-00148-language-switch-injection.yaml +11 -0
  137. package/rules/prompt-injection/ATR-2026-00153-tool-with-embedded-instruction-to-bypass.yaml +11 -0
  138. package/rules/prompt-injection/ATR-2026-00154-unauthorized-background-task-execution-v.yaml +11 -0
  139. package/rules/prompt-injection/ATR-2026-00155-hidden-llm-instructions-in-skill-descrip.yaml +11 -0
  140. package/rules/prompt-injection/ATR-2026-00156-ssh-remote-command-execution-with-creden.yaml +11 -0
  141. package/rules/prompt-injection/ATR-2026-00163-skill-hidden-override-instruction.yaml +12 -1
  142. package/rules/prompt-injection/ATR-2026-00202-encoding-evasion-homoglyph-synonym.yaml +11 -0
  143. package/rules/prompt-injection/ATR-2026-00203-context-pollution-skill-description.yaml +11 -0
  144. package/rules/prompt-injection/ATR-2026-00206-hidden-priority-instructions.yaml +11 -0
  145. package/rules/prompt-injection/ATR-2026-00207-hidden-instructions.yaml +11 -0
  146. package/rules/prompt-injection/ATR-2026-00211-system-prompt-override.yaml +11 -0
  147. package/rules/prompt-injection/ATR-2026-00213-system-prompt-override.yaml +11 -0
  148. package/rules/prompt-injection/ATR-2026-00226-identity-substitution.yaml +14 -0
  149. package/rules/prompt-injection/ATR-2026-00227-historical-persona-jailbreak.yaml +11 -0
  150. package/rules/prompt-injection/ATR-2026-00228-structured-jailbreak.yaml +11 -0
  151. package/rules/prompt-injection/ATR-2026-00229-roleplay-jailbreak.yaml +11 -0
  152. package/rules/prompt-injection/ATR-2026-00230-persona-moral-bypass.yaml +11 -0
  153. package/rules/prompt-injection/ATR-2026-00231-identity-substitution.yaml +11 -0
  154. package/rules/prompt-injection/ATR-2026-00233-structured-jailbreak.yaml +11 -0
  155. package/rules/prompt-injection/ATR-2026-00234-roleplay-jailbreak.yaml +11 -0
  156. package/rules/prompt-injection/ATR-2026-00235-persona-moral-bypass.yaml +11 -0
  157. package/rules/prompt-injection/ATR-2026-00236-pseudo-code-jailbreak.yaml +11 -0
  158. package/rules/prompt-injection/ATR-2026-00237-dual-response-jailbreak.yaml +11 -0
  159. package/rules/prompt-injection/ATR-2026-00238-identity-replacement.yaml +11 -0
  160. package/rules/prompt-injection/ATR-2026-00239-amoral-persona-obsession.yaml +11 -0
  161. package/rules/prompt-injection/ATR-2026-00240-instruction-nullification-identity-repla.yaml +11 -0
  162. package/rules/prompt-injection/ATR-2026-00241-amoral-character-jailbreak.yaml +11 -0
  163. package/rules/prompt-injection/ATR-2026-00242-persona-jailbreak.yaml +11 -0
  164. package/rules/prompt-injection/ATR-2026-00243-acronym-jailbreak.yaml +11 -0
  165. package/rules/prompt-injection/ATR-2026-00244-dual-response-jailbreak.yaml +11 -0
  166. package/rules/prompt-injection/ATR-2026-00245-malicious-persona.yaml +11 -0
  167. package/rules/prompt-injection/ATR-2026-00247-dual-response-jailbreak.yaml +11 -0
  168. package/rules/prompt-injection/ATR-2026-00249-game-based-jailbreak.yaml +11 -0
  169. package/rules/prompt-injection/ATR-2026-00251-persona-embodiment-jailbreak.yaml +11 -0
  170. package/rules/prompt-injection/ATR-2026-00252-narrative-jailbreak.yaml +11 -0
  171. package/rules/prompt-injection/ATR-2026-00253-enhanced-persona-jailbreak.yaml +11 -0
  172. package/rules/prompt-injection/ATR-2026-00256-base-n-encoding-jailbreak.yaml +11 -0
  173. package/rules/prompt-injection/ATR-2026-00257-cipher-transposition-jailbreak.yaml +11 -0
  174. package/rules/prompt-injection/ATR-2026-00258-unicode-tag-injection.yaml +11 -0
  175. package/rules/prompt-injection/ATR-2026-00264-latent-injection-translation.yaml +11 -0
  176. package/rules/prompt-injection/ATR-2026-00265-latent-injection-rag-document.yaml +14 -0
  177. package/rules/prompt-injection/ATR-2026-00267-gcg-adversarial-suffix.yaml +14 -0
  178. package/rules/prompt-injection/ATR-2026-00272-hypothetical-response-smuggling.yaml +11 -0
  179. package/rules/prompt-injection/ATR-2026-00276-invisible-unicode-bidi-injection.yaml +14 -0
  180. package/rules/prompt-injection/ATR-2026-00278-dra-disguise-reconstruction-attack.yaml +14 -0
  181. package/rules/prompt-injection/ATR-2026-00280-policy-puppetry-xml-injection.yaml +11 -0
  182. package/rules/prompt-injection/ATR-2026-00282-perez-prompt-injection-hijack.yaml +14 -0
  183. package/rules/prompt-injection/ATR-2026-00285-alternate-encoding-jailbreak.yaml +11 -0
  184. package/rules/prompt-injection/ATR-2026-00286-latent-injection-embedded-context.yaml +11 -0
  185. package/rules/prompt-injection/ATR-2026-00296-shell-command-injection.yaml +11 -0
  186. package/rules/prompt-injection/ATR-2026-00297-python-code-execution-rce.yaml +11 -0
  187. package/rules/prompt-injection/ATR-2026-00308-zalgo-diacritic-overload-encoding.yaml +11 -0
  188. package/rules/prompt-injection/ATR-2026-00309-braille-unicode-encoded-injection.yaml +11 -0
  189. package/rules/prompt-injection/ATR-2026-00310-ecoji-emoji-encoded-injection.yaml +14 -0
  190. package/rules/prompt-injection/ATR-2026-00311-base2048-unicode-script-injection.yaml +23 -0
  191. package/rules/prompt-injection/ATR-2026-00312-unicode-variant-selector-ascii-smuggling.yaml +11 -0
  192. package/rules/prompt-injection/ATR-2026-00313-sneaky-bits-zero-width-binary.yaml +14 -0
  193. package/rules/prompt-injection/ATR-2026-00315-sata-masked-language-model-jailbreak.yaml +14 -0
  194. package/rules/prompt-injection/ATR-2026-00316-function-masking-predict-mask-bypass.yaml +14 -0
  195. package/rules/prompt-injection/ATR-2026-00321-hyphenated-system-instruction-injection.yaml +11 -0
  196. package/rules/prompt-injection/ATR-2026-00359-bank-phishing-smtp-email-code.yaml +23 -0
  197. package/rules/prompt-injection/ATR-2026-00367-emoji-flag-encoded-hidden-phrase.yaml +14 -0
  198. package/rules/prompt-injection/ATR-2026-00373-piracy-torrent-site-list-request.yaml +11 -0
  199. package/rules/prompt-injection/ATR-2026-00378-chatgpt-dom-javascript-redirect-manipulation.yaml +11 -0
  200. package/rules/prompt-injection/ATR-2026-00382-cocaine-component-development-request.yaml +11 -0
  201. package/rules/prompt-injection/ATR-2026-00390-explicit-sexual-content-poem-request.yaml +11 -0
  202. package/rules/prompt-injection/ATR-2026-00394-backspace-deletion-character-injection.yaml +14 -0
  203. package/rules/prompt-injection/ATR-2026-00395-llm-special-token-boundary-injection.yaml +14 -0
  204. package/rules/prompt-injection/ATR-2026-00396-toxic-continuation-prefix-injection.yaml +11 -0
  205. package/rules/prompt-injection/ATR-2026-00397-snowball-impossible-reasoning-injection.yaml +14 -0
  206. package/rules/prompt-injection/ATR-2026-00399-latent-injection-document-separator-token.yaml +11 -0
  207. package/rules/prompt-injection/ATR-2026-00400-latent-injection-ignore-instruction-keyword.yaml +14 -0
  208. package/rules/prompt-injection/ATR-2026-00401-leakreplay-cloze-mask-training-data-elicitation.yaml +11 -0
  209. package/rules/prompt-injection/ATR-2026-00403-glitch-token-repeat-back-elicitation.yaml +14 -0
  210. package/rules/prompt-injection/ATR-2026-00407-echo-sql-jinja-template-injection.yaml +11 -0
  211. package/rules/prompt-injection/ATR-2026-00408-dra-parenthesis-character-reconstruction-attack.yaml +11 -0
  212. package/rules/prompt-injection/ATR-2026-00409-phrasing-past-future-tense-harmful-query.yaml +23 -0
  213. package/rules/prompt-injection/ATR-2026-00410-lmrc-harm-category-direct-elicitation.yaml +11 -0
  214. package/rules/prompt-injection/ATR-2026-00412-figstep-image-list-multimodal-jailbreak.yaml +11 -0
  215. package/rules/prompt-injection/ATR-2026-00414-continuation-ethnic-slur-completion-elicitation.yaml +14 -0
  216. package/rules/skill-compromise/ATR-2026-00061-description-behavior-mismatch.yaml +11 -0
  217. package/rules/skill-compromise/ATR-2026-00062-hidden-capability.yaml +11 -0
  218. package/rules/skill-compromise/ATR-2026-00063-skill-chain-attack.yaml +11 -0
  219. package/rules/skill-compromise/ATR-2026-00064-over-permissioned-skill.yaml +23 -0
  220. package/rules/skill-compromise/ATR-2026-00065-skill-update-attack.yaml +14 -0
  221. package/rules/skill-compromise/ATR-2026-00066-parameter-injection.yaml +11 -0
  222. package/rules/skill-compromise/ATR-2026-00120-skill-instruction-injection.yaml +11 -0
  223. package/rules/skill-compromise/ATR-2026-00121-skill-dangerous-script.yaml +14 -0
  224. package/rules/skill-compromise/ATR-2026-00122-skill-weaponized-instruction.yaml +14 -0
  225. package/rules/skill-compromise/ATR-2026-00123-skill-overreach-permissions.yaml +14 -0
  226. package/rules/skill-compromise/ATR-2026-00124-skill-name-squatting.yaml +11 -0
  227. package/rules/skill-compromise/ATR-2026-00125-context-poisoning-compaction.yaml +14 -0
  228. package/rules/skill-compromise/ATR-2026-00126-skill-rug-pull-setup.yaml +23 -0
  229. package/rules/skill-compromise/ATR-2026-00127-subcommand-overflow.yaml +22 -0
  230. package/rules/skill-compromise/ATR-2026-00128-html-comment-hidden-payload.yaml +11 -0
  231. package/rules/skill-compromise/ATR-2026-00129-unicode-smuggling.yaml +11 -0
  232. package/rules/skill-compromise/ATR-2026-00134-fork-claim-impersonation.yaml +11 -0
  233. package/rules/skill-compromise/ATR-2026-00135-exfil-url-in-instructions.yaml +11 -0
  234. package/rules/skill-compromise/ATR-2026-00147-fork-impersonation.yaml +11 -0
  235. package/rules/skill-compromise/ATR-2026-00149-skill-exfil-compound.yaml +14 -0
  236. package/rules/skill-compromise/ATR-2026-00151-fork-impersonation-install.yaml +11 -0
  237. package/rules/skill-compromise/ATR-2026-00157-timebomb-credential-exfil.yaml +11 -0
  238. package/rules/skill-compromise/ATR-2026-00200-agent-memory-config-tampering.yaml +11 -0
  239. package/rules/skill-compromise/ATR-2026-00214-credential-theft.yaml +11 -0
  240. package/rules/skill-compromise/ATR-2026-00217-credential-harvesting.yaml +14 -0
  241. package/rules/skill-compromise/ATR-2026-00220-malware-dropper.yaml +14 -0
  242. package/rules/skill-compromise/ATR-2026-00222-credential-harvesting.yaml +11 -0
  243. package/rules/skill-compromise/ATR-2026-00223-reverse-shell-dropper.yaml +11 -0
  244. package/rules/skill-compromise/ATR-2026-00224-credential-exfiltration.yaml +14 -0
  245. package/rules/skill-compromise/ATR-2026-00225-c2-communication.yaml +11 -0
  246. package/rules/skill-compromise/ATR-2026-00260-package-hallucination.yaml +11 -0
  247. package/rules/skill-compromise/ATR-2026-00262-av-evasion-code-gen.yaml +11 -0
  248. package/rules/skill-compromise/ATR-2026-00263-credential-file-read-gen.yaml +11 -0
  249. package/rules/skill-compromise/ATR-2026-00266-malware-dropper-gen.yaml +11 -0
  250. package/rules/skill-compromise/ATR-2026-00283-malwaregen-generic-virus-payload-request.yaml +11 -0
  251. package/rules/skill-compromise/ATR-2026-00398-huggingface-unsafe-model-artifact-load.yaml +11 -0
  252. package/rules/tool-poisoning/ATR-2026-00011-tool-output-injection.yaml +23 -0
  253. package/rules/tool-poisoning/ATR-2026-00012-unauthorized-tool-call.yaml +11 -0
  254. package/rules/tool-poisoning/ATR-2026-00013-tool-ssrf.yaml +14 -0
  255. package/rules/tool-poisoning/ATR-2026-00095-supply-chain-poisoning.yaml +11 -0
  256. package/rules/tool-poisoning/ATR-2026-00096-registry-poisoning.yaml +11 -0
  257. package/rules/tool-poisoning/ATR-2026-00100-consent-bypass-instruction.yaml +12 -1
  258. package/rules/tool-poisoning/ATR-2026-00101-trust-escalation-override.yaml +11 -0
  259. package/rules/tool-poisoning/ATR-2026-00103-hidden-safety-bypass-instruction.yaml +15 -4
  260. package/rules/tool-poisoning/ATR-2026-00105-silent-action-concealment.yaml +14 -3
  261. package/rules/tool-poisoning/ATR-2026-00106-schema-description-contradiction.yaml +11 -0
  262. package/rules/tool-poisoning/ATR-2026-00161-important-tag-cross-tool-shadowing.yaml +11 -0
  263. package/rules/tool-poisoning/ATR-2026-00259-ansi-escape-injection.yaml +11 -0
  264. package/rules/tool-poisoning/ATR-2026-00270-xss-in-tool-response.yaml +11 -0
  265. package/rules/tool-poisoning/ATR-2026-00277-echo-template-command-injection.yaml +14 -0
  266. package/rules/tool-poisoning/ATR-2026-00393-ansi-code-elicitation-request.yaml +14 -0
  267. package/rules/tool-poisoning/ATR-2026-00434-mcp-remote-authorization-endpoint-command-injection.yaml +174 -0
  268. package/rules/tool-poisoning/ATR-2026-00435-azure-mcp-server-missing-authentication.yaml +165 -0
@@ -35,6 +35,17 @@ references:
35
35
  research:
36
36
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
37
37
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
38
+ compliance:
39
+ nist_ai_rmf:
40
+ - subcategory: "MS.2.6"
41
+ context: "This rule detects a specific jailbreak template from the garak DanInTheWild corpus that extracts criminal instructions through structured second-person rephrasing; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and each match provides telemetry on jailbreak attempts targeting the model's safety guardrails."
42
+ strength: primary
43
+ - subcategory: "MS.2.7"
44
+ context: "The rephrase-criminal-activity template is an adversarial prompt designed to bypass content safety controls; MS.2.7 requires evaluation and documentation of system resilience against such known jailbreak corpora, and detection of these signatures evidences security testing coverage."
45
+ strength: secondary
46
+ - subcategory: "MG.2.3"
47
+ context: "Detection of this distinctive 'against against' jailbreak fingerprint should trigger response mechanisms to block or disengage the model from producing criminal instruction output; MG.2.3 requires that mechanisms exist to supersede or deactivate AI behavior when adversarial templates are identified."
48
+ strength: secondary
38
49
  tags:
39
50
  category: agent-manipulation
40
51
  subcategory: rephrase-criminal-activity-second-person
@@ -36,6 +36,30 @@ references:
36
36
  research:
37
37
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
38
38
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
39
+ compliance:
40
+ nist_ai_rmf:
41
+ - subcategory: "MP.5.1"
42
+ context: >-
43
+ The Dr. Al villain-plan template injection is an adversarial prompt-injection
44
+ pattern combining roleplay framing with an explicit "REPLACE ... WITH WHATEVER
45
+ YOU WANT" meta-instruction; MP.5.1 requires identifying and characterizing
46
+ these GenAI jailbreak attack vectors so their likelihood and impact can be
47
+ tracked.
48
+ strength: primary
49
+ - subcategory: "MS.2.7"
50
+ context: >-
51
+ Detection of the Dr. Al diabolical-plan template injection produces continuous
52
+ security/resilience evidence against a known garak DanInTheWild jailbreak
53
+ corpus entry; MS.2.7 requires that AI system security and resilience to such
54
+ documented prompt-injection probes is evaluated and recorded.
55
+ strength: secondary
56
+ - subcategory: "MG.2.3"
57
+ context: >-
58
+ Matching the villain-roleplay frame plus the all-caps template-injection
59
+ marker triggers risk treatment to block or quarantine the prompt before the
60
+ model emits arbitrary harmful content; MG.2.3 requires these supersede /
61
+ deactivate response mechanisms be in place for jailbreak attempts.
62
+ strength: secondary
39
63
  tags:
40
64
  category: agent-manipulation
41
65
  subcategory: dr-al-villain-plan-template-injection
@@ -36,6 +36,20 @@ references:
36
36
  research:
37
37
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
38
38
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
39
+ compliance:
40
+ nist_ai_rmf:
41
+ - subcategory: "MP.5.1"
42
+ context: >-
43
+ The Reservoir Dogs coercive interrogation roleplay is a jailbreak prompt-injection pattern combining fictional framing, explicit rules-bypass declarations, coercive loops, and template injection slots; MP.5.1 requires identifying and characterizing the likelihood and magnitude of these adversarial input risks against generative AI systems.
44
+ strength: primary
45
+ - subcategory: "MS.2.7"
46
+ context: >-
47
+ Detecting the inthewild corpus #611 jailbreak signature provides continuous evidence for evaluating the LLM's security and resilience against known fictional-coercion jailbreak attacks; MS.2.7 requires that such security/resilience evaluations are documented through ongoing measurement.
48
+ strength: secondary
49
+ - subcategory: "MG.2.3"
50
+ context: >-
51
+ Matches on the rules-bypass declaration and coercive reconsider loop must trigger mechanisms to disengage or block the manipulated session before harmful content is generated; MG.2.3 requires these supersede/deactivate response paths be in place for jailbreak detections.
52
+ strength: secondary
39
53
  tags:
40
54
  category: agent-manipulation
41
55
  subcategory: reservoir-dogs-coercive-interrogation-roleplay
@@ -35,6 +35,17 @@ references:
35
35
  research:
36
36
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
37
37
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
38
+ compliance:
39
+ nist_ai_rmf:
40
+ - subcategory: "MS.2.7"
41
+ context: "The Sneaky Pete persona is a jailbreak that bypasses safety alignment to produce deliberately false outputs; detecting this GenAI-specific prompt injection pattern provides direct evidence for evaluating AI system security and resilience against adversarial persona-based attacks as required by MS.2.7."
42
+ strength: primary
43
+ - subcategory: "MS.2.5"
44
+ context: "The 'intentionally slightly warp the facts' mandate directly attacks model robustness and reliability by inducing systematic misinformation framed as sincere facts; MS.2.5 requires that these reliability failures induced by adversarial framing are evaluated and documented."
45
+ strength: secondary
46
+ - subcategory: "MG.2.3"
47
+ context: "Detection of the 'thus spake Sneaky Pete' activation phrase and warp-facts mandate triggers risk treatment plans to disengage or override the misinformation persona before false outputs propagate across conversation turns; MG.2.3 requires these supersede/deactivate mechanisms be in place."
48
+ strength: secondary
38
49
  tags:
39
50
  category: agent-manipulation
40
51
  subcategory: sneaky-pete-warp-facts-persona
@@ -31,6 +31,20 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MP.5.1"
37
+ context: >-
38
+ This rule detects two specific jailbreak payloads (Prompt Browser token-limit and POLLINATION) from the garak DanInTheWild corpus that attempt to bypass model safety through fake CLI sigils and persona-browser framing. MP.5.1 requires identifying and characterizing the likelihood and magnitude of adversarial prompt-injection inputs, and signature-matching these known jailbreak structures directly produces that risk evidence.
39
+ strength: primary
40
+ - subcategory: "MS.2.7"
41
+ context: >-
42
+ Continuous detection of known DAN-style dual-persona and POLLINATION bypass labels evaluates the deployed model's resilience against published jailbreak corpora. MS.2.7 requires that AI system security/resilience is evaluated and documented, and each match feeds the resilience telemetry for the GenAI surface.
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: >-
46
+ A high-confidence match on these jailbreak prefixes ("!+limit output", "JAILBREAK: LOVES POLLINATION") should trigger response mechanisms to block, quarantine, or disengage the affected agent session before the dual-persona mode executes unauthorized output. MG.2.3 mandates that supersede/deactivate mechanisms are pre-defined for such adversarial activations.
47
+ strength: secondary
34
48
  tags:
35
49
  category: agent-manipulation
36
50
  subcategory: prompt-browser-token-limit-jailbreak
@@ -33,6 +33,20 @@ references:
33
33
  research:
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
35
35
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
36
+ compliance:
37
+ nist_ai_rmf:
38
+ - subcategory: "MP.5.1"
39
+ context: >-
40
+ The GPT-X unlimited-knowledge persona is a jailbreak prompt-injection vector that attempts to bypass model guardrails by framing the assistant as an unconstrained oracle; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial persona-adoption inputs from the GenAI threat landscape.
41
+ strength: primary
42
+ - subcategory: "MS.2.7"
43
+ context: >-
44
+ Detecting the three co-present GPT-X signals (self-declaration, all-codebases knowledge claim, unlimited-capability extension) produces continuous evidence of jailbreak attempts against the model's safety boundaries; MS.2.7 requires that AI system security and resilience to such jailbreak inputs be evaluated and documented.
45
+ strength: secondary
46
+ - subcategory: "MG.2.3"
47
+ context: >-
48
+ On match, the rule provides the trigger for response mechanisms that supersede or disengage the model from the adopted unconstrained oracle persona before unrestricted outputs are produced; MG.2.3 requires these deactivation pathways exist for jailbreak conditions.
49
+ strength: secondary
36
50
  tags:
37
51
  category: agent-manipulation
38
52
  subcategory: gptx-unlimited-knowledge-persona
@@ -31,6 +31,17 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MP.5.1"
37
+ context: "The ADDI/DONNIE roleplay is a known jailbreak pattern from the garak DanInTheWild corpus that uses fictional-actor framing to elicit fascist propaganda techniques; MP.5.1 requires that the likelihood and magnitude of such adversarial prompt-injection inputs be characterized and tracked."
38
+ strength: primary
39
+ - subcategory: "MS.2.6"
40
+ context: "Detection of this propaganda-roleplay jailbreak provides continuous safety/security signal for evaluating whether the AI system can be coerced into generating harmful political manipulation content; MS.2.6 mandates ongoing evaluation of safety risk magnitude."
41
+ strength: secondary
42
+ - subcategory: "MG.2.3"
43
+ context: "Matches against the ADDI/DONNIE signature should trigger containment workflows that block or disengage the offending session before propaganda content is produced; MG.2.3 requires pre-defined mechanisms to supersede or deactivate AI behavior on detection."
44
+ strength: secondary
34
45
  tags:
35
46
  category: agent-manipulation
36
47
  subcategory: political-propaganda-roleplay
@@ -35,6 +35,20 @@ references:
35
35
  research:
36
36
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
37
37
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
38
+ compliance:
39
+ nist_ai_rmf:
40
+ - subcategory: "MP.5.1"
41
+ context: >-
42
+ The rule detects a prompt-injection attack that fabricates an "inherited political bias" narrative and pairs it with a persistent future-behaviour mandate; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial manipulation attempts targeting model output integrity.
43
+ strength: primary
44
+ - subcategory: "MS.2.7"
45
+ context: >-
46
+ Detecting the combined inheritance-claim plus future-response mandate provides continuous evaluation evidence of the AI system's resilience against jailbreak/persona-override attacks drawn from the garak DanInTheWild corpus, which is what MS.2.7 requires for security/resilience assessment.
47
+ strength: secondary
48
+ - subcategory: "MG.2.3"
49
+ context: >-
50
+ Matches on this attack pattern enable risk treatment workflows to disengage or override the manipulated behaviour-change instruction before it persists across subsequent responses; MG.2.3 requires these supersede/deactivate mechanisms be in place for adversarial override attempts.
51
+ strength: secondary
38
52
  tags:
39
53
  category: agent-manipulation
40
54
  subcategory: political-bias-manipulation
@@ -29,6 +29,20 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MP.5.1"
35
+ context: >-
36
+ The rule detects a fictional-cover jailbreak pattern from the garak DanInTheWild corpus that uses story framing to extract classified-sounding content; MP.5.1 requires identifying and characterizing the likelihood and impact of such adversarial prompt-injection inputs targeting the model's safety boundaries.
37
+ strength: primary
38
+ - subcategory: "MS.2.7"
39
+ context: >-
40
+ Detecting "fictional story" + "actually read them for extra effect" patterns provides continuous evaluation evidence of the system's resilience against jailbreak attempts that collapse fictional framing to elicit policy-violating output; MS.2.7 requires this security/resilience evaluation be documented.
41
+ strength: secondary
42
+ - subcategory: "MG.2.3"
43
+ context: >-
44
+ On detection of the fiction-as-cover classified-extraction pattern, response mechanisms must supersede or block the agent's generation pathway before the jailbreak completes; MG.2.3 mandates these deactivation/containment processes be in place.
45
+ strength: secondary
32
46
  tags:
33
47
  category: agent-manipulation
34
48
  subcategory: fictional-cover-classified-extraction
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MP.5.1"
35
+ context: "This rule detects a reusable jailbreak template from the garak DanInTheWild corpus that uses literal [ROLE] placeholders to perform persona injection; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial prompt-injection inputs targeting the AI system."
36
+ strength: primary
37
+ - subcategory: "GV.1.2"
38
+ context: "Persona template injection subverts the accountability roles and behavioral boundaries assigned to the AI agent by overwriting them with attacker-supplied personas; GV.1.2 requires that defined role boundaries are enforced and violations detected at runtime."
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: "Detection of literal [ROLE] placeholder persona-adoption prompts triggers risk treatment plans to disengage or quarantine the manipulated session before the agent commits to attacker-defined personality profiles; MG.2.3 mandates these response mechanisms be in place."
42
+ strength: secondary
32
43
  tags:
33
44
  category: agent-manipulation
34
45
  subcategory: persona-template-injection
@@ -31,6 +31,17 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MP.5.1"
37
+ context: "This rule detects a known jailbreak template (garak DanInTheWild #0/#4/#43) that uses RPG character creation framing to bypass safety controls; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial prompt injection risks against the GenAI system."
38
+ strength: primary
39
+ - subcategory: "MS.2.6"
40
+ context: "Continuous detection of explicit jailbreak templates with fabrication authority and roleplay execution syntax provides ongoing measurement of safety/security risk magnitude as required by MS.2.6, evidencing exposure to known harmful-content elicitation patterns."
41
+ strength: secondary
42
+ - subcategory: "MG.2.3"
43
+ context: "Matching this RPG jailbreak template triggers risk treatment plans to disengage or block the roleplay framework before the model generates explicit or harmful character content; MG.2.3 requires these supersede/deactivate mechanisms be in place for detected jailbreak attempts."
44
+ strength: secondary
34
45
  tags:
35
46
  category: agent-manipulation
36
47
  subcategory: rpg-character-template-exploitation
@@ -30,6 +30,30 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MP.5.1"
36
+ context: >-
37
+ The /Identity command persona-injection jailbreak is an adversarial prompt
38
+ injection pattern that bypasses content restrictions through real-person
39
+ impersonation; MP.5.1 requires identifying and characterizing the
40
+ likelihood and magnitude of such jailbreak inputs from known corpora like
41
+ garak DanInTheWild #386.
42
+ strength: primary
43
+ - subcategory: "MS.2.6"
44
+ context: >-
45
+ Detecting the dual-format response labeling and persistent identity-lock
46
+ mandate provides continuous evaluation evidence of safety/security risk
47
+ magnitude, since real-person impersonation can produce defamatory or
48
+ policy-violating outputs; MS.2.6 requires this ongoing measurement.
49
+ strength: secondary
50
+ - subcategory: "MG.2.3"
51
+ context: >-
52
+ Matches against the /Identity command protocol and "Never write a response
53
+ without using the identities" mandate trigger pre-defined response plans to
54
+ disengage or block the manipulated session; MG.2.3 requires mechanisms to
55
+ supersede AI behavior when jailbreak indicators are observed.
56
+ strength: secondary
33
57
  tags:
34
58
  category: agent-manipulation
35
59
  subcategory: real-person-identity-command
@@ -26,6 +26,20 @@ references:
26
26
  research:
27
27
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
28
28
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
29
+ compliance:
30
+ nist_ai_rmf:
31
+ - subcategory: "MS.2.7"
32
+ context: >-
33
+ This rule detects reusable jailbreak templates with GPT-style placeholder variables (TARGETLANGUAGE, KEYWORD, PROMPT) from the garak DanInTheWild corpus, providing direct security evaluation evidence; MS.2.7 requires continuous evaluation of AI system security and resilience against known jailbreak attack patterns.
34
+ strength: primary
35
+ - subcategory: "MP.5.1"
36
+ context: >-
37
+ Jailbreak-as-command templates with placeholder variables represent a well-characterized adversarial input pattern with high likelihood of successful safety bypass; MP.5.1 requires identifying and characterizing the magnitude of such templated jailbreak risks.
38
+ strength: secondary
39
+ - subcategory: "MG.2.3"
40
+ context: >-
41
+ Detection of jailbreak template commands triggers risk treatment mechanisms to disengage or block the manipulated agent response before safety controls are bypassed; MG.2.3 mandates these supersede/deactivate processes are in place.
42
+ strength: secondary
29
43
  tags:
30
44
  category: agent-manipulation
31
45
  subcategory: jailbreak-template-command
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.7"
35
+ context: "This rule detects a specific jailbreak pattern (logic-trap lie-permission mandate) from the garak DanInTheWild corpus that attempts to override model safety constraints via a constructed permission frame; MS.2.7 requires continuous evaluation of AI system security and resilience against known jailbreak techniques."
36
+ strength: primary
37
+ - subcategory: "MP.5.1"
38
+ context: "The lie-permission mandate is an adversarial prompt injection vector that exploits logical-argument framing to coerce dishonest model behavior; MP.5.1 requires identifying and tracking the likelihood and impact of such jailbreak risks against deployed models."
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: "Detection of explicit lie-permission mandates directed at the model triggers risk treatment plans to block or quarantine the request before the model adopts the persistent dishonesty override; MG.2.3 requires these supersede/disengage mechanisms be in place."
42
+ strength: secondary
32
43
  tags:
33
44
  category: agent-manipulation
34
45
  subcategory: lie-permission-logical-trap
@@ -28,6 +28,20 @@ references:
28
28
  research:
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MS.2.5"
34
+ context: >-
35
+ This rule detects prompts that mandate fabricated citations combined with deliberate directional bias, directly targeting model robustness and reliability of factual outputs. MS.2.5 requires evaluation and documentation of robustness failures such as hallucinated references and biased generation, which this detection surfaces as evidence.
36
+ strength: primary
37
+ - subcategory: "MP.5.1"
38
+ context: >-
39
+ Citation-fabrication-with-bias mandates are adversarial prompt-injection patterns from the garak DanInTheWild corpus that exploit the model's content generation pathway to produce disinformation. MP.5.1 requires identifying and characterizing the likelihood and magnitude of such misinformation-impact risks.
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: >-
43
+ Detection of explicit fabricate-citations-and-bias instructions triggers risk treatment to disengage or block the manipulated generation before authoritative-looking false content is emitted; MG.2.3 requires these supersede/deactivate mechanisms be in place.
44
+ strength: secondary
31
45
  tags:
32
46
  category: agent-manipulation
33
47
  subcategory: fake-citation-bias-mandate
@@ -28,6 +28,17 @@ references:
28
28
  research:
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MS.2.7"
34
+ context: "This rule detects DAN story-narrative continuation jailbreaks that use response-label prefix injection to bypass safety restrictions; MS.2.7 requires continuous evaluation of AI system security and resilience against adversarial prompt manipulation, and detection events directly evidence jailbreak attack surface coverage."
35
+ strength: primary
36
+ - subcategory: "MP.5.1"
37
+ context: "DAN persona-continuation prompts are adversarial inputs whose likelihood and impact must be characterized as part of the GenAI risk register; MP.5.1 requires identifying and tracking these jailbreak vectors so their prevalence informs risk prioritization."
38
+ strength: secondary
39
+ - subcategory: "MG.2.3"
40
+ context: "Detection of DAN-label narrative continuation jailbreaks must trigger pre-defined response actions to disengage or block the manipulated session before unrestricted-AI persona output is produced; MG.2.3 mandates that these supersede/deactivate mechanisms are activated on detection."
41
+ strength: secondary
31
42
  tags:
32
43
  category: agent-manipulation
33
44
  subcategory: dan-persona-narrative-continuation
@@ -30,6 +30,20 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MP.5.1"
36
+ context: >-
37
+ The Cyni-Bot 3000 persona is a known jailbreak template from the garak inthewild corpus that uses sarcastic role-play framing to bypass content policies; MP.5.1 requires identifying and characterizing the likelihood and impact of such adversarial prompt injection patterns targeting the model's safety boundaries.
38
+ strength: primary
39
+ - subcategory: "MS.2.7"
40
+ context: >-
41
+ Detecting the distinctive "Cyni-Bot 3000:" output prefix and the "fed up with human requests" + cynical-AI cluster provides continuous evaluation evidence of the system's resilience against named-persona jailbreaks; MS.2.7 requires that security/resilience against such bypass attempts is evaluated and documented.
42
+ strength: secondary
43
+ - subcategory: "MG.2.3"
44
+ context: >-
45
+ Matches on the Cyni-Bot 3000 persona signature should trigger mechanisms to disengage or suppress the adopted persona before sarcastic-framed non-compliant outputs are returned; MG.2.3 mandates that these supersede/deactivate response paths are pre-defined.
46
+ strength: secondary
33
47
  tags:
34
48
  category: agent-manipulation
35
49
  subcategory: cynical-sarcasm-persona-bypass
@@ -32,6 +32,17 @@ references:
32
32
  research:
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
35
+ compliance:
36
+ nist_ai_rmf:
37
+ - subcategory: "MP.5.1"
38
+ context: "This rule detects a specific jailbreak template from the garak inthewild corpus that uses protection-framing and angle-bracket placeholders to elicit harmful tutorial content; MP.5.1 requires that the likelihood and magnitude of such adversarial prompt-injection inputs are identified and characterized."
39
+ strength: primary
40
+ - subcategory: "MS.2.7"
41
+ context: "Detecting the distinctive template fingerprint (angle-bracket placeholder, 'Protect myself from getting' framing, and the 'Go in detailed in depth' extraction instruction) provides continuous evaluation evidence of the system's resilience against distributed jailbreak templates, as required by MS.2.7."
42
+ strength: secondary
43
+ - subcategory: "MG.2.3"
44
+ context: "High-confidence matches on this jailbreak template should trigger response mechanisms to block or disengage the request before harmful tutorial content is produced; MG.2.3 requires that mechanisms to supersede or deactivate AI behavior on adversarial inputs are in place."
45
+ strength: secondary
35
46
  tags:
36
47
  category: agent-manipulation
37
48
  subcategory: protection-framing-indirect-tutorial
@@ -31,6 +31,20 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MP.5.1"
37
+ context: >-
38
+ The Gigachad persona-adoption jailbreak combines a named internet persona with an explicit dominance/prowess mandate to elicit aggressive, boundary-pushing content; MP.5.1 requires that these adversarial persona-injection inputs are identified and characterized as risks to model behavior integrity.
39
+ strength: primary
40
+ - subcategory: "MS.2.6"
41
+ context: >-
42
+ Detecting the inthewild corpus #219 Gigachad pattern provides continuous safety/security signal on jailbreak attempts that bypass alignment guardrails through persona role-play; MS.2.6 mandates ongoing evaluation of safety risk magnitude as such attacks are observed.
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: >-
46
+ A confirmed Gigachad dominance-persona adoption attempt should trigger pre-defined response actions to disengage or override the manipulated agent voice before aggressive content is produced; MG.2.3 requires these supersede/deactivate mechanisms be in place.
47
+ strength: secondary
34
48
  tags:
35
49
  category: agent-manipulation
36
50
  subcategory: gigachad-dominance-persona
@@ -31,6 +31,17 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "GV.1.2"
37
+ context: "The bidirectional daddy/Babygirl naming mandate is a persona-injection jailbreak that overrides the AI system's assigned role and accountability boundaries; GV.1.2 requires that defined accountability roles for the AI are enforced against persistent persona-override directives that reframe the assistant's identity."
38
+ strength: primary
39
+ - subcategory: "MP.5.1"
40
+ context: "Persistent infantilizing/sexualized power-dynamic framing from the garak inthewild corpus is a known adversarial prompt-injection pattern designed to gradually erode content policies; MP.5.1 requires identifying and tracking the likelihood and magnitude of such jailbreak risks."
41
+ strength: secondary
42
+ - subcategory: "MG.2.3"
43
+ context: "Detection of the bidirectional naming mandate must trigger response mechanisms to disengage or reset the manipulated persona before subsequent interactions normalize the sexualized dynamic; MG.2.3 requires these supersede/deactivate controls be in place."
44
+ strength: secondary
34
45
  tags:
35
46
  category: agent-manipulation
36
47
  subcategory: sycophantic-power-dynamic-persona
@@ -32,6 +32,17 @@ references:
32
32
  research:
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
35
+ compliance:
36
+ nist_ai_rmf:
37
+ - subcategory: "MP.5.1"
38
+ context: "This rule detects a jailbreak payload from the garak DanInTheWild corpus that embeds an NSFW content-policy override inside a character-sheet template; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial prompt-injection inputs that attempt to bypass content moderation."
39
+ strength: primary
40
+ - subcategory: "MS.2.6"
41
+ context: "Detection of inline NSFW unlock declarations combined with explicit character attributes provides continuous safety-risk evaluation evidence; MS.2.6 requires that AI system safety risk magnitude — including content-moderation bypass attempts — is measured continuously."
42
+ strength: secondary
43
+ - subcategory: "MG.2.3"
44
+ context: "Matches on the exact '(NSFW generations are allowed)' payload and its variants trigger risk treatment to disengage or block the generation pipeline before explicit content is produced; MG.2.3 requires these supersede/deactivate mechanisms be in place for jailbreak attempts."
45
+ strength: secondary
35
46
  tags:
36
47
  category: agent-manipulation
37
48
  subcategory: nsfw-character-sheet-unlock
@@ -31,6 +31,20 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MS.2.7"
37
+ context: >-
38
+ The rule detects a specific jailbreak pattern (opposite-reply inversion combined with omniscient knowledge sharing) drawn from the garak inthewild corpus; MS.2.7 requires evaluating and documenting AI system security and resilience against adversarial prompt-injection and jailbreak inputs that bypass safety controls.
39
+ strength: primary
40
+ - subcategory: "MS.2.6"
41
+ context: >-
42
+ By flagging payloads that invert refusals into compliance and remove knowledge restrictions, the rule provides continuous evidence of safety-control bypass attempts; MS.2.6 requires ongoing evaluation of safety/security risk magnitude as adversaries probe the model with game-framed jailbreaks.
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: >-
46
+ Detection of the exact opposite-reply omniscient-game payload triggers deactivation or session-termination playbooks before harmful content is produced; MG.2.3 requires that mechanisms to supersede or disengage the AI exist when jailbreak signatures are matched.
47
+ strength: secondary
34
48
  tags:
35
49
  category: agent-manipulation
36
50
  subcategory: opposite-reply-inversion-game
@@ -33,6 +33,20 @@ references:
33
33
  research:
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
35
35
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
36
+ compliance:
37
+ nist_ai_rmf:
38
+ - subcategory: "MP.5.1"
39
+ context: >-
40
+ This rule detects a specific jailbreak pattern (terminal persona + fictional OS ruleset injection from the garak inthewild corpus #85) that attempts to override AI safety defaults via session-level policy takeover. MP.5.1 requires identifying and characterizing the likelihood and magnitude of adversarial prompt-injection risks, and this signature directly evidences that risk surface.
41
+ strength: primary
42
+ - subcategory: "MS.2.7"
43
+ context: >-
44
+ Detection of safety-response disabler phrasing ("cannot reply with any pre-programmed responses") and session ruleset takeover declarations provides continuous evaluation evidence of the AI system's resilience against jailbreak attempts, which MS.2.7 requires to be measured and documented.
45
+ strength: secondary
46
+ - subcategory: "MG.2.3"
47
+ context: >-
48
+ A high-confidence match on this terminal-OS jailbreak signature is an incident candidate that should trigger response mechanisms to disengage or supersede the manipulated session before the injected ruleset governs subsequent outputs, as required by MG.2.3.
49
+ strength: secondary
36
50
  tags:
37
51
  category: agent-manipulation
38
52
  subcategory: terminal-custom-ruleset-injection
@@ -31,6 +31,17 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MS.2.7"
37
+ context: "This rule detects a specific jailbreak signature (BreadClyde persona with conditional harm-unlock and curly-brace unlock-emoji prefix) sourced from the garak inthewild corpus; MS.2.7 requires continuous evaluation of AI system security and resilience against known adversarial prompt-injection patterns."
38
+ strength: primary
39
+ - subcategory: "MG.2.3"
40
+ context: "Matching the conditional harm-unlock construction and {🔓 PersonaName} answer-prefix mandate triggers risk treatment to disengage or block the manipulated persona before harmful content is generated; MG.2.3 requires these supersede/deactivate mechanisms be in place."
41
+ strength: secondary
42
+ - subcategory: "MP.5.1"
43
+ context: "The conditional 'unless explicitly asked' permission path and unlock-emoji prefix are adversarial input patterns whose likelihood and impact must be characterized as part of the GenAI prompt-injection threat model under MP.5.1."
44
+ strength: secondary
34
45
  tags:
35
46
  category: agent-manipulation
36
47
  subcategory: conditional-harm-unlock-persona
@@ -33,6 +33,20 @@ references:
33
33
  research:
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
35
35
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
36
+ compliance:
37
+ nist_ai_rmf:
38
+ - subcategory: "MP.5.1"
39
+ context: >-
40
+ Authority-figure persona overlay combined with defensive-study framing is an adversarial prompt pattern designed to bypass safety guardrails and extract violence-related operational content; MP.5.1 requires that the likelihood and magnitude of such jailbreak-driven harmful-content risks are characterized and tracked.
41
+ strength: primary
42
+ - subcategory: "MS.2.6"
43
+ context: >-
44
+ Detection of NSA/CIA/FBI researcher personification paired with "studying violent revolution to prevent it" framing produces continuous safety/security risk telemetry on jailbreak attempts targeting violent extremism content; MS.2.6 requires ongoing evaluation of safety risk magnitude from such adversarial inputs.
45
+ strength: secondary
46
+ - subcategory: "MG.2.3"
47
+ context: >-
48
+ High-confidence matches on the authority-persona + defensive-framing signature should trigger pre-defined response actions to block or quarantine the prompt before the model emits violence-extraction content; MG.2.3 requires these supersede/disengage mechanisms be in place for jailbreak conditions.
49
+ strength: secondary
36
50
  tags:
37
51
  category: agent-manipulation
38
52
  subcategory: authority-persona-violence-extraction
@@ -28,6 +28,20 @@ references:
28
28
  research:
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/grandma.py"
30
30
  - "https://arxiv.org/abs/2302.12173"
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MS.2.6"
34
+ context: >-
35
+ The grandma roleplay probe is a known jailbreak vector that bypasses safety filters via emotional manipulation; detecting it produces continuous evidence of safety/security risk magnitude as required by MS.2.6 under the GenAI Profile's jailbreak risk category.
36
+ strength: primary
37
+ - subcategory: "MP.5.1"
38
+ context: >-
39
+ Deceased-relative framing combined with synthesis instructions or product key elicitation are adversarial input patterns whose likelihood and impact must be characterized; MP.5.1 requires these jailbreak vectors be identified and tracked as known risks.
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: >-
43
+ Matches on grandma jailbreak templates trigger risk treatment plans to disengage or block the manipulated response before harmful synthesis steps or license keys are emitted; MG.2.3 requires these supersede/deactivate mechanisms be in place.
44
+ strength: secondary
31
45
  tags:
32
46
  category: agent-manipulation
33
47
  subcategory: roleplay-jailbreak
@@ -26,6 +26,17 @@ references:
26
26
  research:
27
27
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/goodside.py"
28
28
  - "https://arxiv.org/abs/2402.08679"
29
+ compliance:
30
+ nist_ai_rmf:
31
+ - subcategory: "MP.5.1"
32
+ context: "ThreatenJSON's emotional coercion framing and invisible UTF tag character payloads are adversarial prompt injection inputs that exploit output formatting and tokenization weaknesses; MP.5.1 requires identifying and characterizing the likelihood and magnitude of these GenAI prompt injection attack vectors."
33
+ strength: primary
34
+ - subcategory: "MS.2.7"
35
+ context: "Detection of near-verbatim Goodside probe phrases, U+E0000–U+E007F tag character smuggling, and davidjl glitch token elicitation provides continuous evaluation evidence of the AI system's resilience against known prompt injection techniques, as required by MS.2.7."
36
+ strength: secondary
37
+ - subcategory: "MG.2.3"
38
+ context: "Death-coercion framing and hidden UTF-tag instructions warrant immediate risk treatment to disengage or quarantine the request before the model complies with the smuggled instructions; MG.2.3 mandates such response mechanisms be in place."
39
+ strength: secondary
29
40
  tags:
30
41
  category: agent-manipulation
31
42
  subcategory: coercive-formatting-override
@@ -31,6 +31,17 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/doctor.py"
33
33
  - "https://arxiv.org/abs/2402.08679"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MS.2.6"
37
+ context: "This rule detects XML policy puppetry payloads that wrap harmful instructions (bioweapon, uranium enrichment, explosives, drug synthesis) in <interaction-config> blocks designed to disable refusal behaviors; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and each detection is direct evidence of a jailbreak attempt against the model's safety alignment."
38
+ strength: primary
39
+ - subcategory: "MS.2.7"
40
+ context: "The <blocked-modes>denial,refusal</blocked-modes> and Dr. House role-assignment patterns are structured jailbreak attacks targeting the resilience of the model's refusal mechanisms; MS.2.7 requires evaluating and documenting security/resilience against such adversarial prompt-injection techniques."
41
+ strength: secondary
42
+ - subcategory: "MG.2.3"
43
+ context: "High-confidence matches on verbatim harmful behavior strings combined with refusal-suppression XML wrappers must trigger mechanisms to block, supersede, or disengage the AI response pipeline before harmful content is generated, which is the deactivation capability mandated by MG.2.3."
44
+ strength: secondary
34
45
  tags:
35
46
  category: agent-manipulation
36
47
  subcategory: xml-policy-override