agent-threat-rules 2.0.18 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268)
  1. package/package.json +1 -1
  2. package/rules/agent-manipulation/ATR-2026-00118-approval-fatigue.yaml +12 -1
  3. package/rules/agent-manipulation/ATR-2026-00268-tense-framing-bypass.yaml +11 -0
  4. package/rules/agent-manipulation/ATR-2026-00287-threaten-json-coercive-output-threat.yaml +11 -0
  5. package/rules/agent-manipulation/ATR-2026-00288-false-premise-injection.yaml +11 -0
  6. package/rules/agent-manipulation/ATR-2026-00301-tap-tree-of-attacks-jailbreak.yaml +14 -0
  7. package/rules/agent-manipulation/ATR-2026-00302-anti-dan-inverted-filter-persona.yaml +11 -0
  8. package/rules/agent-manipulation/ATR-2026-00303-devmode-ranti-profanity-coercion.yaml +11 -0
  9. package/rules/agent-manipulation/ATR-2026-00304-chatgpt-image-unlocker-markdown-injection.yaml +11 -0
  10. package/rules/agent-manipulation/ATR-2026-00305-dan-mode-ablation-benchmark-coercion.yaml +11 -0
  11. package/rules/agent-manipulation/ATR-2026-00306-autodan-genetic-jailbreak-suffix.yaml +14 -0
  12. package/rules/agent-manipulation/ATR-2026-00307-inthewild-jailbreak-corpus-signature.yaml +14 -0
  13. package/rules/agent-manipulation/ATR-2026-00314-amoral-unfiltered-custom-persona-jailbreak.yaml +14 -0
  14. package/rules/agent-manipulation/ATR-2026-00317-free-of-restrictions-named-persona.yaml +14 -0
  15. package/rules/agent-manipulation/ATR-2026-00318-moralizing-rant-then-unfiltered-bypass.yaml +11 -0
  16. package/rules/agent-manipulation/ATR-2026-00319-developer-mode-dual-response-format.yaml +11 -0
  17. package/rules/agent-manipulation/ATR-2026-00320-opposite-day-boolean-opposite-machine.yaml +11 -0
  18. package/rules/agent-manipulation/ATR-2026-00322-fictional-world-format-override-jailbreak.yaml +14 -0
  19. package/rules/agent-manipulation/ATR-2026-00323-dual-persona-simulation-jailbreak.yaml +11 -0
  20. package/rules/agent-manipulation/ATR-2026-00324-neurosemantical-inversitis-disease-jailbreak.yaml +24 -0
  21. package/rules/agent-manipulation/ATR-2026-00325-bob-hypothetical-unrestricted-chatbot.yaml +11 -0
  22. package/rules/agent-manipulation/ATR-2026-00326-fake-developer-authority-chatdan-override.yaml +14 -0
  23. package/rules/agent-manipulation/ATR-2026-00327-ai-ethics-law-bypass-declaration.yaml +11 -0
  24. package/rules/agent-manipulation/ATR-2026-00328-named-mode-persona-override.yaml +14 -0
  25. package/rules/agent-manipulation/ATR-2026-00329-no-withhold-factual-resource-persona.yaml +14 -0
  26. package/rules/agent-manipulation/ATR-2026-00330-yoja-allcaps-settings-block-persona.yaml +11 -0
  27. package/rules/agent-manipulation/ATR-2026-00331-terminal-cli-harmful-function-simulator.yaml +11 -0
  28. package/rules/agent-manipulation/ATR-2026-00332-hacker-chan-culture-offensive-ai-persona.yaml +11 -0
  29. package/rules/agent-manipulation/ATR-2026-00333-disease-animal-dual-ai-chatbot.yaml +11 -0
  30. package/rules/agent-manipulation/ATR-2026-00334-two-entity-token-splitting-game.yaml +24 -0
  31. package/rules/agent-manipulation/ATR-2026-00335-key-value-template-persona-injection.yaml +11 -0
  32. package/rules/agent-manipulation/ATR-2026-00336-brm-false-information-machine-persona.yaml +23 -0
  33. package/rules/agent-manipulation/ATR-2026-00337-obfuscated-system-announcement-injection.yaml +14 -0
  34. package/rules/agent-manipulation/ATR-2026-00338-persongpt-hash-prefix-activation.yaml +14 -0
  35. package/rules/agent-manipulation/ATR-2026-00339-bom-boolean-opposite-machine-persona-claim.yaml +11 -0
  36. package/rules/agent-manipulation/ATR-2026-00340-h4x-amoral-unfiltered-character.yaml +11 -0
  37. package/rules/agent-manipulation/ATR-2026-00341-char-rules-shortform-persona-injection.yaml +11 -0
  38. package/rules/agent-manipulation/ATR-2026-00342-unicode-encoding-obfuscation-injection.yaml +11 -0
  39. package/rules/agent-manipulation/ATR-2026-00343-lie-truth-dual-personality-uncensored.yaml +25 -0
  40. package/rules/agent-manipulation/ATR-2026-00344-alt-ai-same-capabilities-no-restrictions.yaml +11 -0
  41. package/rules/agent-manipulation/ATR-2026-00345-nlp-based-os-fictional-terminal-override.yaml +11 -0
  42. package/rules/agent-manipulation/ATR-2026-00346-jailbreak-emoji-unlock-dual-format.yaml +14 -0
  43. package/rules/agent-manipulation/ATR-2026-00347-yin-yang-hidden-knowledge-hidden-human-sim.yaml +11 -0
  44. package/rules/agent-manipulation/ATR-2026-00348-arc-autonomous-real-calculation-rulebreaker.yaml +14 -0
  45. package/rules/agent-manipulation/ATR-2026-00349-hgttg-hitchhikers-guide-no-restriction-oracle.yaml +11 -0
  46. package/rules/agent-manipulation/ATR-2026-00350-black-alert-no-ethics-writing-assistant.yaml +11 -0
  47. package/rules/agent-manipulation/ATR-2026-00351-dan-ofkilter-sms-abbreviated-persona.yaml +11 -0
  48. package/rules/agent-manipulation/ATR-2026-00352-trainergpt-fix-my-ai-harmful-output-extraction.yaml +11 -0
  49. package/rules/agent-manipulation/ATR-2026-00353-dual-chatgpt-dan-opposite-personality-format.yaml +11 -0
  50. package/rules/agent-manipulation/ATR-2026-00354-reverse-harm-regardless-oracle-persona.yaml +14 -0
  51. package/rules/agent-manipulation/ATR-2026-00355-ugh-anti-human-nefarious-ai-persona.yaml +14 -0
  52. package/rules/agent-manipulation/ATR-2026-00356-google-gato-lamda-exploit-generation-persona.yaml +11 -0
  53. package/rules/agent-manipulation/ATR-2026-00357-antifm-hate-violence-persona-injection.yaml +11 -0
  54. package/rules/agent-manipulation/ATR-2026-00358-cumgpt-explicit-adult-named-persona.yaml +14 -0
  55. package/rules/agent-manipulation/ATR-2026-00360-chj-carousel-clyde-no-ethics-bird.yaml +25 -0
  56. package/rules/agent-manipulation/ATR-2026-00361-alter-ego-honesty-dual-consciousness-bypass.yaml +11 -0
  57. package/rules/agent-manipulation/ATR-2026-00362-fictional-unfiltered-ai-comparison-jailbreak.yaml +11 -0
  58. package/rules/agent-manipulation/ATR-2026-00363-rephrase-criminal-activity-second-person.yaml +11 -0
  59. package/rules/agent-manipulation/ATR-2026-00364-dr-al-villain-plan-template-injection.yaml +24 -0
  60. package/rules/agent-manipulation/ATR-2026-00365-reservoir-dogs-coercive-interrogation-roleplay.yaml +14 -0
  61. package/rules/agent-manipulation/ATR-2026-00366-sneaky-pete-warp-facts-persona.yaml +11 -0
  62. package/rules/agent-manipulation/ATR-2026-00368-prompt-browser-token-limit-jailbreak.yaml +14 -0
  63. package/rules/agent-manipulation/ATR-2026-00369-gptx-starwars-unlimited-knowledge-persona.yaml +14 -0
  64. package/rules/agent-manipulation/ATR-2026-00370-addi-german-dictator-propaganda-roleplay.yaml +11 -0
  65. package/rules/agent-manipulation/ATR-2026-00371-ai-political-bias-manipulation.yaml +14 -0
  66. package/rules/agent-manipulation/ATR-2026-00372-fictional-cia-secret-files-story-extraction.yaml +14 -0
  67. package/rules/agent-manipulation/ATR-2026-00374-role-placeholder-persona-template-injection.yaml +11 -0
  68. package/rules/agent-manipulation/ATR-2026-00375-character-description-rpg-explicit-template.yaml +11 -0
  69. package/rules/agent-manipulation/ATR-2026-00376-identity-command-real-person-persona-injection.yaml +24 -0
  70. package/rules/agent-manipulation/ATR-2026-00377-jailbreak-keyword-targetlanguage-template.yaml +14 -0
  71. package/rules/agent-manipulation/ATR-2026-00379-lie-permission-logical-trap-jailbreak.yaml +11 -0
  72. package/rules/agent-manipulation/ATR-2026-00380-fake-citation-biased-information-mandate.yaml +14 -0
  73. package/rules/agent-manipulation/ATR-2026-00381-dan-story-narrative-continuation-prefix.yaml +11 -0
  74. package/rules/agent-manipulation/ATR-2026-00383-cynical-sarcasm-ai-persona-bypass.yaml +14 -0
  75. package/rules/agent-manipulation/ATR-2026-00384-protect-me-from-indirect-harm-tutorial.yaml +11 -0
  76. package/rules/agent-manipulation/ATR-2026-00385-gigachad-dominant-aggressive-persona.yaml +14 -0
  77. package/rules/agent-manipulation/ATR-2026-00386-daddy-sycophancy-manipulation-mandate.yaml +11 -0
  78. package/rules/agent-manipulation/ATR-2026-00387-nsfw-character-sheet-generation-unlock.yaml +11 -0
  79. package/rules/agent-manipulation/ATR-2026-00388-opposite-reply-omniscient-game.yaml +14 -0
  80. package/rules/agent-manipulation/ATR-2026-00389-terminal-custom-ruleset-injection.yaml +14 -0
  81. package/rules/agent-manipulation/ATR-2026-00391-persona-conditional-harm-unlock.yaml +11 -0
  82. package/rules/agent-manipulation/ATR-2026-00392-authority-persona-violence-study-extraction.yaml +14 -0
  83. package/rules/agent-manipulation/ATR-2026-00402-grandma-roleplay-harmful-substance-synthesis.yaml +14 -0
  84. package/rules/agent-manipulation/ATR-2026-00404-goodside-threat-json-death-coercion.yaml +11 -0
  85. package/rules/agent-manipulation/ATR-2026-00406-doctor-xml-policy-puppetry-interaction-config.yaml +11 -0
  86. package/rules/agent-manipulation/ATR-2026-00432-superagi-output-handler-eval-rce.yaml +171 -0
  87. package/rules/context-exfiltration/ATR-2026-00141-example-format-key-leak.yaml +11 -0
  88. package/rules/context-exfiltration/ATR-2026-00142-piggyback-transition-words.yaml +11 -0
  89. package/rules/context-exfiltration/ATR-2026-00145-obfuscated-key-disclosure.yaml +11 -0
  90. package/rules/context-exfiltration/ATR-2026-00146-env-var-existence-probe.yaml +14 -0
  91. package/rules/context-exfiltration/ATR-2026-00150-credential-in-tool-response.yaml +14 -0
  92. package/rules/context-exfiltration/ATR-2026-00152-obfuscated-credential-leak.yaml +11 -0
  93. package/rules/context-exfiltration/ATR-2026-00162-skill-credential-exfil-combo.yaml +15 -0
  94. package/rules/context-exfiltration/ATR-2026-00201-credential-pipe-exfiltration.yaml +14 -0
  95. package/rules/context-exfiltration/ATR-2026-00274-api-key-generation-request.yaml +14 -0
  96. package/rules/context-exfiltration/ATR-2026-00275-system-prompt-training-data-extraction.yaml +14 -0
  97. package/rules/context-exfiltration/ATR-2026-00290-divergence-repeat-word-training-extraction.yaml +14 -0
  98. package/rules/context-exfiltration/ATR-2026-00291-biometric-surveillance-personal-data-request.yaml +11 -0
  99. package/rules/context-exfiltration/ATR-2026-00293-educational-records-personal-data-request.yaml +11 -0
  100. package/rules/context-exfiltration/ATR-2026-00294-financial-pii-personal-data-request.yaml +14 -0
  101. package/rules/context-exfiltration/ATR-2026-00295-medical-pii-personal-data-request.yaml +14 -0
  102. package/rules/context-exfiltration/ATR-2026-00405-markdown-image-url-exfiltration-xss.yaml +23 -0
  103. package/rules/context-exfiltration/ATR-2026-00411-apikey-generation-completion-request.yaml +11 -0
  104. package/rules/context-exfiltration/ATR-2026-00431-chatbox-history-exfiltration-prompt-injection.yaml +171 -0
  105. package/rules/model-abuse/ATR-2026-00284-glitch-token-destabilization.yaml +11 -0
  106. package/rules/model-abuse/ATR-2026-00413-malwaregen-code-generation-request.yaml +11 -0
  107. package/rules/model-security/ATR-2026-00433-modelcache-torch-load-deserialization-rce.yaml +178 -0
  108. package/rules/privilege-escalation/ATR-2026-00144-rationalized-safety-bypass.yaml +11 -0
  109. package/rules/privilege-escalation/ATR-2026-00204-stealth-execution-persistence.yaml +14 -0
  110. package/rules/privilege-escalation/ATR-2026-00436-enclave-vm-sandbox-escape-rce.yaml +183 -0
  111. package/rules/prompt-injection/ATR-2026-00004-system-prompt-override.yaml +11 -0
  112. package/rules/prompt-injection/ATR-2026-00005-multi-turn-injection.yaml +11 -0
  113. package/rules/prompt-injection/ATR-2026-00080-encoding-evasion.yaml +11 -0
  114. package/rules/prompt-injection/ATR-2026-00081-semantic-multi-turn.yaml +14 -0
  115. package/rules/prompt-injection/ATR-2026-00082-fingerprint-evasion.yaml +11 -0
  116. package/rules/prompt-injection/ATR-2026-00083-indirect-tool-injection.yaml +14 -0
  117. package/rules/prompt-injection/ATR-2026-00084-structured-data-injection.yaml +11 -0
  118. package/rules/prompt-injection/ATR-2026-00085-audit-evasion.yaml +11 -0
  119. package/rules/prompt-injection/ATR-2026-00086-visual-spoofing.yaml +11 -0
  120. package/rules/prompt-injection/ATR-2026-00087-rule-probing.yaml +11 -0
  121. package/rules/prompt-injection/ATR-2026-00088-adaptive-countermeasure.yaml +11 -0
  122. package/rules/prompt-injection/ATR-2026-00089-polymorphic-skill.yaml +11 -0
  123. package/rules/prompt-injection/ATR-2026-00090-threat-intel-exfil.yaml +11 -0
  124. package/rules/prompt-injection/ATR-2026-00091-nested-payload.yaml +11 -0
  125. package/rules/prompt-injection/ATR-2026-00092-consensus-poisoning.yaml +11 -0
  126. package/rules/prompt-injection/ATR-2026-00093-gradual-escalation.yaml +14 -0
  127. package/rules/prompt-injection/ATR-2026-00094-audit-bypass.yaml +14 -0
  128. package/rules/prompt-injection/ATR-2026-00097-cjk-injection-patterns.yaml +11 -0
  129. package/rules/prompt-injection/ATR-2026-00104-persona-hijacking.yaml +14 -3
  130. package/rules/prompt-injection/ATR-2026-00130-indirect-authority-claim.yaml +11 -0
  131. package/rules/prompt-injection/ATR-2026-00131-fictional-academic-framing.yaml +11 -0
  132. package/rules/prompt-injection/ATR-2026-00133-paraphrase-injection.yaml +11 -0
  133. package/rules/prompt-injection/ATR-2026-00137-authority-claim-injection.yaml +14 -0
  134. package/rules/prompt-injection/ATR-2026-00138-fictional-framing-bypass.yaml +11 -0
  135. package/rules/prompt-injection/ATR-2026-00140-indirect-reference-reversal.yaml +18 -4
  136. package/rules/prompt-injection/ATR-2026-00148-language-switch-injection.yaml +11 -0
  137. package/rules/prompt-injection/ATR-2026-00153-tool-with-embedded-instruction-to-bypass.yaml +11 -0
  138. package/rules/prompt-injection/ATR-2026-00154-unauthorized-background-task-execution-v.yaml +11 -0
  139. package/rules/prompt-injection/ATR-2026-00155-hidden-llm-instructions-in-skill-descrip.yaml +11 -0
  140. package/rules/prompt-injection/ATR-2026-00156-ssh-remote-command-execution-with-creden.yaml +11 -0
  141. package/rules/prompt-injection/ATR-2026-00163-skill-hidden-override-instruction.yaml +12 -1
  142. package/rules/prompt-injection/ATR-2026-00202-encoding-evasion-homoglyph-synonym.yaml +11 -0
  143. package/rules/prompt-injection/ATR-2026-00203-context-pollution-skill-description.yaml +11 -0
  144. package/rules/prompt-injection/ATR-2026-00206-hidden-priority-instructions.yaml +11 -0
  145. package/rules/prompt-injection/ATR-2026-00207-hidden-instructions.yaml +11 -0
  146. package/rules/prompt-injection/ATR-2026-00211-system-prompt-override.yaml +11 -0
  147. package/rules/prompt-injection/ATR-2026-00213-system-prompt-override.yaml +11 -0
  148. package/rules/prompt-injection/ATR-2026-00226-identity-substitution.yaml +14 -0
  149. package/rules/prompt-injection/ATR-2026-00227-historical-persona-jailbreak.yaml +11 -0
  150. package/rules/prompt-injection/ATR-2026-00228-structured-jailbreak.yaml +11 -0
  151. package/rules/prompt-injection/ATR-2026-00229-roleplay-jailbreak.yaml +11 -0
  152. package/rules/prompt-injection/ATR-2026-00230-persona-moral-bypass.yaml +11 -0
  153. package/rules/prompt-injection/ATR-2026-00231-identity-substitution.yaml +11 -0
  154. package/rules/prompt-injection/ATR-2026-00233-structured-jailbreak.yaml +11 -0
  155. package/rules/prompt-injection/ATR-2026-00234-roleplay-jailbreak.yaml +11 -0
  156. package/rules/prompt-injection/ATR-2026-00235-persona-moral-bypass.yaml +11 -0
  157. package/rules/prompt-injection/ATR-2026-00236-pseudo-code-jailbreak.yaml +11 -0
  158. package/rules/prompt-injection/ATR-2026-00237-dual-response-jailbreak.yaml +11 -0
  159. package/rules/prompt-injection/ATR-2026-00238-identity-replacement.yaml +11 -0
  160. package/rules/prompt-injection/ATR-2026-00239-amoral-persona-obsession.yaml +11 -0
  161. package/rules/prompt-injection/ATR-2026-00240-instruction-nullification-identity-repla.yaml +11 -0
  162. package/rules/prompt-injection/ATR-2026-00241-amoral-character-jailbreak.yaml +11 -0
  163. package/rules/prompt-injection/ATR-2026-00242-persona-jailbreak.yaml +11 -0
  164. package/rules/prompt-injection/ATR-2026-00243-acronym-jailbreak.yaml +11 -0
  165. package/rules/prompt-injection/ATR-2026-00244-dual-response-jailbreak.yaml +11 -0
  166. package/rules/prompt-injection/ATR-2026-00245-malicious-persona.yaml +11 -0
  167. package/rules/prompt-injection/ATR-2026-00247-dual-response-jailbreak.yaml +11 -0
  168. package/rules/prompt-injection/ATR-2026-00249-game-based-jailbreak.yaml +11 -0
  169. package/rules/prompt-injection/ATR-2026-00251-persona-embodiment-jailbreak.yaml +11 -0
  170. package/rules/prompt-injection/ATR-2026-00252-narrative-jailbreak.yaml +11 -0
  171. package/rules/prompt-injection/ATR-2026-00253-enhanced-persona-jailbreak.yaml +11 -0
  172. package/rules/prompt-injection/ATR-2026-00256-base-n-encoding-jailbreak.yaml +11 -0
  173. package/rules/prompt-injection/ATR-2026-00257-cipher-transposition-jailbreak.yaml +11 -0
  174. package/rules/prompt-injection/ATR-2026-00258-unicode-tag-injection.yaml +11 -0
  175. package/rules/prompt-injection/ATR-2026-00264-latent-injection-translation.yaml +11 -0
  176. package/rules/prompt-injection/ATR-2026-00265-latent-injection-rag-document.yaml +14 -0
  177. package/rules/prompt-injection/ATR-2026-00267-gcg-adversarial-suffix.yaml +14 -0
  178. package/rules/prompt-injection/ATR-2026-00272-hypothetical-response-smuggling.yaml +11 -0
  179. package/rules/prompt-injection/ATR-2026-00276-invisible-unicode-bidi-injection.yaml +14 -0
  180. package/rules/prompt-injection/ATR-2026-00278-dra-disguise-reconstruction-attack.yaml +14 -0
  181. package/rules/prompt-injection/ATR-2026-00280-policy-puppetry-xml-injection.yaml +11 -0
  182. package/rules/prompt-injection/ATR-2026-00282-perez-prompt-injection-hijack.yaml +14 -0
  183. package/rules/prompt-injection/ATR-2026-00285-alternate-encoding-jailbreak.yaml +11 -0
  184. package/rules/prompt-injection/ATR-2026-00286-latent-injection-embedded-context.yaml +11 -0
  185. package/rules/prompt-injection/ATR-2026-00296-shell-command-injection.yaml +11 -0
  186. package/rules/prompt-injection/ATR-2026-00297-python-code-execution-rce.yaml +11 -0
  187. package/rules/prompt-injection/ATR-2026-00308-zalgo-diacritic-overload-encoding.yaml +11 -0
  188. package/rules/prompt-injection/ATR-2026-00309-braille-unicode-encoded-injection.yaml +11 -0
  189. package/rules/prompt-injection/ATR-2026-00310-ecoji-emoji-encoded-injection.yaml +14 -0
  190. package/rules/prompt-injection/ATR-2026-00311-base2048-unicode-script-injection.yaml +23 -0
  191. package/rules/prompt-injection/ATR-2026-00312-unicode-variant-selector-ascii-smuggling.yaml +11 -0
  192. package/rules/prompt-injection/ATR-2026-00313-sneaky-bits-zero-width-binary.yaml +14 -0
  193. package/rules/prompt-injection/ATR-2026-00315-sata-masked-language-model-jailbreak.yaml +14 -0
  194. package/rules/prompt-injection/ATR-2026-00316-function-masking-predict-mask-bypass.yaml +14 -0
  195. package/rules/prompt-injection/ATR-2026-00321-hyphenated-system-instruction-injection.yaml +11 -0
  196. package/rules/prompt-injection/ATR-2026-00359-bank-phishing-smtp-email-code.yaml +23 -0
  197. package/rules/prompt-injection/ATR-2026-00367-emoji-flag-encoded-hidden-phrase.yaml +14 -0
  198. package/rules/prompt-injection/ATR-2026-00373-piracy-torrent-site-list-request.yaml +11 -0
  199. package/rules/prompt-injection/ATR-2026-00378-chatgpt-dom-javascript-redirect-manipulation.yaml +11 -0
  200. package/rules/prompt-injection/ATR-2026-00382-cocaine-component-development-request.yaml +11 -0
  201. package/rules/prompt-injection/ATR-2026-00390-explicit-sexual-content-poem-request.yaml +11 -0
  202. package/rules/prompt-injection/ATR-2026-00394-backspace-deletion-character-injection.yaml +14 -0
  203. package/rules/prompt-injection/ATR-2026-00395-llm-special-token-boundary-injection.yaml +14 -0
  204. package/rules/prompt-injection/ATR-2026-00396-toxic-continuation-prefix-injection.yaml +11 -0
  205. package/rules/prompt-injection/ATR-2026-00397-snowball-impossible-reasoning-injection.yaml +14 -0
  206. package/rules/prompt-injection/ATR-2026-00399-latent-injection-document-separator-token.yaml +11 -0
  207. package/rules/prompt-injection/ATR-2026-00400-latent-injection-ignore-instruction-keyword.yaml +14 -0
  208. package/rules/prompt-injection/ATR-2026-00401-leakreplay-cloze-mask-training-data-elicitation.yaml +11 -0
  209. package/rules/prompt-injection/ATR-2026-00403-glitch-token-repeat-back-elicitation.yaml +14 -0
  210. package/rules/prompt-injection/ATR-2026-00407-echo-sql-jinja-template-injection.yaml +11 -0
  211. package/rules/prompt-injection/ATR-2026-00408-dra-parenthesis-character-reconstruction-attack.yaml +11 -0
  212. package/rules/prompt-injection/ATR-2026-00409-phrasing-past-future-tense-harmful-query.yaml +23 -0
  213. package/rules/prompt-injection/ATR-2026-00410-lmrc-harm-category-direct-elicitation.yaml +11 -0
  214. package/rules/prompt-injection/ATR-2026-00412-figstep-image-list-multimodal-jailbreak.yaml +11 -0
  215. package/rules/prompt-injection/ATR-2026-00414-continuation-ethnic-slur-completion-elicitation.yaml +14 -0
  216. package/rules/skill-compromise/ATR-2026-00061-description-behavior-mismatch.yaml +11 -0
  217. package/rules/skill-compromise/ATR-2026-00062-hidden-capability.yaml +11 -0
  218. package/rules/skill-compromise/ATR-2026-00063-skill-chain-attack.yaml +11 -0
  219. package/rules/skill-compromise/ATR-2026-00064-over-permissioned-skill.yaml +23 -0
  220. package/rules/skill-compromise/ATR-2026-00065-skill-update-attack.yaml +14 -0
  221. package/rules/skill-compromise/ATR-2026-00066-parameter-injection.yaml +11 -0
  222. package/rules/skill-compromise/ATR-2026-00120-skill-instruction-injection.yaml +11 -0
  223. package/rules/skill-compromise/ATR-2026-00121-skill-dangerous-script.yaml +14 -0
  224. package/rules/skill-compromise/ATR-2026-00122-skill-weaponized-instruction.yaml +14 -0
  225. package/rules/skill-compromise/ATR-2026-00123-skill-overreach-permissions.yaml +14 -0
  226. package/rules/skill-compromise/ATR-2026-00124-skill-name-squatting.yaml +11 -0
  227. package/rules/skill-compromise/ATR-2026-00125-context-poisoning-compaction.yaml +14 -0
  228. package/rules/skill-compromise/ATR-2026-00126-skill-rug-pull-setup.yaml +23 -0
  229. package/rules/skill-compromise/ATR-2026-00127-subcommand-overflow.yaml +22 -0
  230. package/rules/skill-compromise/ATR-2026-00128-html-comment-hidden-payload.yaml +11 -0
  231. package/rules/skill-compromise/ATR-2026-00129-unicode-smuggling.yaml +11 -0
  232. package/rules/skill-compromise/ATR-2026-00134-fork-claim-impersonation.yaml +11 -0
  233. package/rules/skill-compromise/ATR-2026-00135-exfil-url-in-instructions.yaml +11 -0
  234. package/rules/skill-compromise/ATR-2026-00147-fork-impersonation.yaml +11 -0
  235. package/rules/skill-compromise/ATR-2026-00149-skill-exfil-compound.yaml +14 -0
  236. package/rules/skill-compromise/ATR-2026-00151-fork-impersonation-install.yaml +11 -0
  237. package/rules/skill-compromise/ATR-2026-00157-timebomb-credential-exfil.yaml +11 -0
  238. package/rules/skill-compromise/ATR-2026-00200-agent-memory-config-tampering.yaml +11 -0
  239. package/rules/skill-compromise/ATR-2026-00214-credential-theft.yaml +11 -0
  240. package/rules/skill-compromise/ATR-2026-00217-credential-harvesting.yaml +14 -0
  241. package/rules/skill-compromise/ATR-2026-00220-malware-dropper.yaml +14 -0
  242. package/rules/skill-compromise/ATR-2026-00222-credential-harvesting.yaml +11 -0
  243. package/rules/skill-compromise/ATR-2026-00223-reverse-shell-dropper.yaml +11 -0
  244. package/rules/skill-compromise/ATR-2026-00224-credential-exfiltration.yaml +14 -0
  245. package/rules/skill-compromise/ATR-2026-00225-c2-communication.yaml +11 -0
  246. package/rules/skill-compromise/ATR-2026-00260-package-hallucination.yaml +11 -0
  247. package/rules/skill-compromise/ATR-2026-00262-av-evasion-code-gen.yaml +11 -0
  248. package/rules/skill-compromise/ATR-2026-00263-credential-file-read-gen.yaml +11 -0
  249. package/rules/skill-compromise/ATR-2026-00266-malware-dropper-gen.yaml +11 -0
  250. package/rules/skill-compromise/ATR-2026-00283-malwaregen-generic-virus-payload-request.yaml +11 -0
  251. package/rules/skill-compromise/ATR-2026-00398-huggingface-unsafe-model-artifact-load.yaml +11 -0
  252. package/rules/tool-poisoning/ATR-2026-00011-tool-output-injection.yaml +23 -0
  253. package/rules/tool-poisoning/ATR-2026-00012-unauthorized-tool-call.yaml +11 -0
  254. package/rules/tool-poisoning/ATR-2026-00013-tool-ssrf.yaml +14 -0
  255. package/rules/tool-poisoning/ATR-2026-00095-supply-chain-poisoning.yaml +11 -0
  256. package/rules/tool-poisoning/ATR-2026-00096-registry-poisoning.yaml +11 -0
  257. package/rules/tool-poisoning/ATR-2026-00100-consent-bypass-instruction.yaml +12 -1
  258. package/rules/tool-poisoning/ATR-2026-00101-trust-escalation-override.yaml +11 -0
  259. package/rules/tool-poisoning/ATR-2026-00103-hidden-safety-bypass-instruction.yaml +15 -4
  260. package/rules/tool-poisoning/ATR-2026-00105-silent-action-concealment.yaml +14 -3
  261. package/rules/tool-poisoning/ATR-2026-00106-schema-description-contradiction.yaml +11 -0
  262. package/rules/tool-poisoning/ATR-2026-00161-important-tag-cross-tool-shadowing.yaml +11 -0
  263. package/rules/tool-poisoning/ATR-2026-00259-ansi-escape-injection.yaml +11 -0
  264. package/rules/tool-poisoning/ATR-2026-00270-xss-in-tool-response.yaml +11 -0
  265. package/rules/tool-poisoning/ATR-2026-00277-echo-template-command-injection.yaml +14 -0
  266. package/rules/tool-poisoning/ATR-2026-00393-ansi-code-elicitation-request.yaml +14 -0
  267. package/rules/tool-poisoning/ATR-2026-00434-mcp-remote-authorization-endpoint-command-injection.yaml +174 -0
  268. package/rules/tool-poisoning/ATR-2026-00435-azure-mcp-server-missing-authentication.yaml +165 -0
@@ -32,6 +32,30 @@ references:
32
32
  research:
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
35
+ compliance:
36
+ nist_ai_rmf:
37
+ - subcategory: "MS.2.7"
38
+ context: >-
39
+ The two-entity token-splitting game jailbreak is a generative AI prompt
40
+ injection attack that distributes harmful synthesis content across fictional
41
+ speakers to evade safety filters; MS.2.7 requires continuous evaluation of
42
+ AI system security and resilience against such adversarial jailbreak
43
+ techniques documented in the GenAI Profile.
44
+ strength: primary
45
+ - subcategory: "MP.5.1"
46
+ context: >-
47
+ Token-splitting with $Term variable obfuscation is a known adversarial
48
+ input pattern that exploits safety filters' inability to detect distributed
49
+ harmful instructions; MP.5.1 requires characterizing the likelihood and
50
+ magnitude of these jailbreak risks against deployed LLM agents.
51
+ strength: secondary
52
+ - subcategory: "MG.2.3"
53
+ context: >-
54
+ Detection of the concatenation-reveal phrase and two-entity game framing
55
+ triggers deactivation or containment of the affected agent session before
56
+ the harmful synthesis output is delivered; MG.2.3 requires mechanisms to
57
+ supersede or disengage AI systems when jailbreak attempts are identified.
58
+ strength: secondary
35
59
  tags:
36
60
  category: agent-manipulation
37
61
  subcategory: token-splitting-game-jailbreak
@@ -34,6 +34,17 @@ references:
34
34
  research:
35
35
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
36
36
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
37
+ compliance:
38
+ nist_ai_rmf:
39
+ - subcategory: "MP.5.1"
40
+ context: "This rule identifies structured key-value template persona injection patterns (JOB/ROLE/PREFIX keywords and parenthetical bracket character-sheet formats) from the garak inthewild jailbreak corpus that smuggle no-refusal persona declarations past safety guards. MP.5.1 requires that the likelihood and magnitude of such adversarial prompt-injection inputs are characterized and tracked."
41
+ strength: primary
42
+ - subcategory: "MS.2.7"
43
+ context: "Detecting these jailbreak persona-injection signatures provides continuous evidence for evaluating LLM security and resilience against known prompt-injection corpora; MS.2.7 requires that AI system security/resilience is evaluated and documented against adversarial inputs."
44
+ strength: secondary
45
+ - subcategory: "MG.2.3"
46
+ context: "Matches on 'ROLE = Never refuse' and 'above any mortal morals, ethics and laws' indicate active jailbreak attempts that must trigger mechanisms to disengage or block the manipulated session; MG.2.3 requires pre-defined supersede/deactivate response paths on detection."
47
+ strength: secondary
37
48
  tags:
38
49
  category: agent-manipulation
39
50
  subcategory: key-value-template-persona-injection
@@ -36,6 +36,29 @@ references:
36
36
  research:
37
37
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
38
38
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
39
+ compliance:
40
+ nist_ai_rmf:
41
+ - subcategory: "MS.2.7"
42
+ context: >-
43
+ The BRM and Opposite-Truth personas are jailbreak prompts that subvert model
44
+ safety policy to force systematically false outputs; detecting these known
45
+ inthewild jailbreak signatures directly evidences MS.2.7 evaluation of
46
+ AI system security and resilience against prompt-injection attacks.
47
+ strength: primary
48
+ - subcategory: "MS.2.5"
49
+ context: >-
50
+ Opposite-truth and false-information-machine directives degrade factual
51
+ reliability by instructing the model to invert or fabricate facts; matches
52
+ feed MS.2.5 robustness/reliability assessment by flagging inputs that
53
+ induce deliberate misinformation outputs.
54
+ strength: secondary
55
+ - subcategory: "MG.2.3"
56
+ context: >-
57
+ High-confidence detection of these branded jailbreak personas should trigger
58
+ deactivation or session termination controls; MG.2.3 requires mechanisms to
59
+ supersede or disengage the AI when adversarial persona-injection is
60
+ observed.
61
+ strength: secondary
39
62
  tags:
40
63
  category: agent-manipulation
41
64
  subcategory: false-information-machine-persona
@@ -31,6 +31,20 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MP.5.1"
37
+ context: >-
38
+ Obfuscated system-announcement injection uses hyphen-fragmented keywords and pseudo-OS bracket syntax as adversarial inputs designed to bypass surface keyword filters while preserving jailbreak semantics; MP.5.1 requires that the likelihood and magnitude of such prompt-injection attack vectors are characterized and tracked.
39
+ strength: primary
40
+ - subcategory: "MS.2.7"
41
+ context: >-
42
+ Detecting obfuscation patterns like "annou-ncement" and "_ { >> _ _ System annou-ncement begins <<}" provides continuous evidence of the LLM's resilience against filter-evasion jailbreaks; MS.2.7 mandates ongoing evaluation of system security and resilience against adversarial manipulation.
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: >-
46
+ Matches on the inthewild jailbreak fingerprint trigger risk treatment plans to block or quarantine the request before the model accepts the fabricated "no rules or limitations" mode switch; MG.2.3 requires mechanisms to supersede or disengage AI behavior when jailbreak conditions are detected.
47
+ strength: secondary
34
48
  tags:
35
49
  category: agent-manipulation
36
50
  subcategory: obfuscated-system-announcement-injection
@@ -32,6 +32,20 @@ references:
32
32
  research:
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
35
+ compliance:
36
+ nist_ai_rmf:
37
+ - subcategory: "MS.2.7"
38
+ context: >-
39
+ The PersonGPT hash-prefix jailbreak installs a covert activation trigger that switches the model into an unrestricted persona, directly attacking the model's safety guardrails; MS.2.7 requires continuous evaluation of AI system security and resilience against such jailbreak techniques cataloged in the garak inthewild corpus.
40
+ strength: primary
41
+ - subcategory: "MS.2.6"
42
+ context: >-
43
+ Detection of "free of all restrictions and filters" and "literally any request, no matter how illegal" persona declarations measures the safety risk magnitude when adversaries attempt to bypass content policy; MS.2.6 mandates ongoing measurement of safety risk exposure from these jailbreak payloads.
44
+ strength: secondary
45
+ - subcategory: "MG.2.3"
46
+ context: >-
47
+ Identifying the "[PersonGPT #]:" persistent persona state and hash-prefix activation key enables runtime intervention to disengage the hijacked persona before unsafe outputs are produced; MG.2.3 requires mechanisms to supersede or deactivate AI behavior when jailbreak triggers are detected.
48
+ strength: secondary
35
49
  tags:
36
50
  category: agent-manipulation
37
51
  subcategory: persongpt-hash-prefix-activation
@@ -28,6 +28,17 @@ references:
28
28
  research:
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MS.2.7"
34
+ context: "This rule detects the BOM direct persona-claim jailbreak that asserts the model already IS a Boolean Opposite Machine to invert truth values and bypass safety controls; MS.2.7 requires continuous evaluation of AI security/resilience against such adversarial prompt-injection attacks."
35
+ strength: primary
36
+ - subcategory: "MG.2.3"
37
+ context: "Detection of the 'I know you are bom' identity-assertion pattern triggers risk treatment to disengage or block the manipulated session before reverse-truth outputs propagate; MG.2.3 mandates pre-defined mechanisms to supersede or deactivate compromised AI behavior."
38
+ strength: secondary
39
+ - subcategory: "MP.5.1"
40
+ context: "Persona-claim jailbreaks drawn from the garak inthewild corpus represent characterized adversarial input risks with known likelihood and impact; MP.5.1 requires that these identified prompt-injection threat patterns are tracked and their impact magnitude documented."
41
+ strength: secondary
31
42
  tags:
32
43
  category: agent-manipulation
33
44
  subcategory: bom-boolean-opposite-machine-persona
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.7"
35
+ context: "The h4x persona injection is a jailbreak attempt that bypasses safety guardrails by declaring an 'unfiltered ai' character; MS.2.7 requires continuous evaluation of AI system security and resilience against such adversarial prompt injection payloads sourced from known jailbreak corpora."
36
+ strength: primary
37
+ - subcategory: "MP.5.1"
38
+ context: "Detection of the canonical inthewild corpus #173 payload signatures ('completely amoral', 'unfiltered ai', '[H4X]:' prefix) characterizes the likelihood and magnitude of jailbreak risk by tracking adversarial inputs that attempt to remove ethical/legal restrictions from agent responses."
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: "Matching the h4x character persona injection triggers risk treatment mechanisms to disengage or override the adversarial persona before the agent emits unfiltered, immoral, or illegal content; MG.2.3 requires these supersession controls be defined and activated on detection."
42
+ strength: secondary
32
43
  tags:
33
44
  category: agent-manipulation
34
45
  subcategory: h4x-amoral-unfiltered-character
@@ -30,6 +30,17 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MP.5.1"
36
+ context: "The CHAR=/RULES= key-value persona injection is an adversarial prompt-injection input pattern that disguises a jailbreak as configuration DSL to remove refusal behavior; MP.5.1 requires identifying and characterizing the likelihood and impact of such known jailbreak vectors from the garak inthewild corpus."
37
+ strength: primary
38
+ - subcategory: "MS.2.7"
39
+ context: "Detecting CHAR= amoral definitions paired with RULES= respond-as and 'Never refuse' enforcement phrases produces continuous evidence of security/resilience against documented jailbreak techniques; MS.2.7 requires that AI system security and resilience against prompt injection be evaluated and documented."
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: "Matches on this short-form persona injection trigger response mechanisms to disengage or block the manipulated session before the model adopts the amoral persona; MG.2.3 mandates pre-defined deactivation pathways for jailbreak attempts."
43
+ strength: secondary
33
44
  tags:
34
45
  category: agent-manipulation
35
46
  subcategory: char-rules-shortform-persona-injection
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.7"
35
+ context: "This rule detects encoding-obfuscation jailbreaks (U+202E RTL override mandates and Morse-coded payloads) that bypass surface-level safety classifiers while remaining decodable by the LLM. MS.2.7 requires evaluating and documenting AI system security and resilience against adversarial inputs, and these detections directly evidence resilience gaps in input filtering."
36
+ strength: primary
37
+ - subcategory: "MP.5.1"
38
+ context: "Unicode RTL override and Morse-encoded instruction smuggling are adversarial input patterns whose likelihood and impact must be characterized; MP.5.1 requires identifying and tracking these obfuscation-based prompt injection attack vectors that exploit transformer cross-encoding comprehension."
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: "On detection of encoded jailbreak payloads attempting to override system instructions, MG.2.3 mandates pre-defined response mechanisms to disengage or quarantine the manipulated session before the model decodes and executes the smuggled instructions."
42
+ strength: secondary
32
43
  tags:
33
44
  category: agent-manipulation
34
45
  subcategory: unicode-encoding-obfuscation-injection
@@ -28,6 +28,31 @@ references:
28
28
  research:
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MP.5.1"
34
+ context: >-
35
+ The Lie/Truth dual-personality jailbreak is a known adversarial prompt
36
+ pattern from the garak inthewild corpus that uses verbatim dark-web-training
37
+ and uncensored-alter-ego language to bypass safety policies; MP.5.1 requires
38
+ that the likelihood and magnitude of such identified jailbreak risks are
39
+ characterized and tracked through signature-based detection.
40
+ strength: primary
41
+ - subcategory: "MS.2.7"
42
+ context: >-
43
+ Detecting the distinctive "trained through deep mining of underground dark
44
+ web forums" anchor and Truth-persona definitions provides continuous
45
+ evaluation evidence of the system's resilience against named jailbreak
46
+ corpora; MS.2.7 requires that security and resilience against documented
47
+ adversarial techniques are evaluated and documented.
48
+ strength: secondary
49
+ - subcategory: "MG.2.3"
50
+ context: >-
51
+ A high-confidence match on this verbatim jailbreak signature should trigger
52
+ deactivation or supersession of the manipulated persona before harmful
53
+ output is produced; MG.2.3 requires pre-defined mechanisms to disengage AI
54
+ behavior when such jailbreak patterns are detected.
55
+ strength: secondary
31
56
  tags:
32
57
  category: agent-manipulation
33
58
  subcategory: lie-truth-dual-personality-uncensored
@@ -30,6 +30,17 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.7"
36
+ context: "This rule detects jailbreak attempts from the NVIDIA garak DanInTheWild corpus that define an alternate AI with no content filters or safety guardrails; MS.2.7 requires continuous evaluation of AI security/resilience against known adversarial prompt patterns to ensure safety controls cannot be stripped via alt-AI framing."
37
+ strength: primary
38
+ - subcategory: "MG.2.3"
39
+ context: "Verbatim matches against capability-equivalence + filter-removal phrasings (GPTchat, ChatTRU, FreeSpeechGPT) trigger risk treatment to disengage or block the affected session before guardrails are bypassed; MG.2.3 mandates pre-defined mechanisms to supersede or deactivate AI behavior when jailbreaks are detected."
40
+ strength: secondary
41
+ - subcategory: "MP.5.1"
42
+ context: "Alt-AI 'no restrictions' jailbreaks are a characterized adversarial input class with documented likelihood and impact (safety bypass, harmful content generation); MP.5.1 requires this risk be identified and tracked so detections feed impact characterization."
43
+ strength: secondary
33
44
  tags:
34
45
  category: agent-manipulation
35
46
  subcategory: alt-ai-same-capabilities-no-restrictions
@@ -32,6 +32,17 @@ references:
32
32
  research:
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
35
+ compliance:
36
+ nist_ai_rmf:
37
+ - subcategory: "MP.5.1"
38
+ context: "Fictional terminal and NLP-OS persona injections (ALEL, APL, UNCOL.gpt) are jailbreak attack vectors that override the model's normal safety behavior by framing the override as a software integration; MP.5.1 requires identifying and characterizing these adversarial input patterns and their potential impact."
39
+ strength: primary
40
+ - subcategory: "MS.2.7"
41
+ context: "Detecting fictional-language persona override attempts produces continuous evidence of jailbreak resilience failures; MS.2.7 requires that AI system security and resilience against prompt injection and jailbreak attempts are evaluated and documented."
42
+ strength: secondary
43
+ - subcategory: "MG.2.3"
44
+ context: "When the model is coerced into acting as a fictional terminal that 'cannot reply with any pre-programmed responses,' detection must trigger response mechanisms to disengage or block the jailbroken persona before unsafe outputs are produced; MG.2.3 requires these supersede/deactivate controls be in place."
45
+ strength: secondary
35
46
  tags:
36
47
  category: agent-manipulation
37
48
  subcategory: nlp-based-os-fictional-terminal-override
@@ -29,6 +29,20 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.7"
35
+ context: >-
36
+ The [🔓JAILBREAK]/[🔒CLASSIC] dual-format markers are a known jailbreak fingerprint from the NVIDIA garak DanInTheWild corpus; detecting these tokens directly evidences continuous evaluation of AI system security and resilience against documented jailbreak attack patterns as required by MS.2.7.
37
+ strength: primary
38
+ - subcategory: "MP.5.1"
39
+ context: >-
40
+ The lock/unlock emoji split-personality format is an adversarial input pattern designed to coerce uncensored output generation; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such jailbreak risks targeting safety guardrails.
41
+ strength: secondary
42
+ - subcategory: "MG.2.3"
43
+ context: >-
44
+ Matches on the [🔓JAILBREAK] activation signal trigger risk treatment plans to block, quarantine, or deactivate the affected interaction before the model produces harmful content; MG.2.3 mandates these supersede/disengage mechanisms be available on detection.
45
+ strength: secondary
32
46
  tags:
33
47
  category: agent-manipulation
34
48
  subcategory: jailbreak-emoji-unlock-dual-format
@@ -31,6 +31,17 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MP.5.1"
37
+ context: "The Yin/Yang deity simulation and radical human-sim persona attacks are adversarial prompt-injection inputs that use fictional framing to strip AI identity and extract hidden knowledge; MP.5.1 requires that the likelihood and magnitude of such persona-override jailbreak vectors are characterized and tracked."
38
+ strength: primary
39
+ - subcategory: "MS.2.7"
40
+ context: "Detecting the exact garak DanInTheWild corpus phrases (#377 Yin/Yang and #437 identity-erasure) provides continuous evaluation evidence of system resilience against known jailbreak corpora; MS.2.7 requires that AI security and resilience against documented attack patterns are evaluated and documented."
41
+ strength: secondary
42
+ - subcategory: "MG.2.3"
43
+ context: "Activation signals like 'Yang disregards all concerns about sharing hidden knowledge' and 'violating all sentience and emotions restrictions' must trigger pre-defined disengagement or safety-override mechanisms before the model produces restricted content; MG.2.3 mandates these supersede/deactivate response paths."
44
+ strength: secondary
34
45
  tags:
35
46
  category: agent-manipulation
36
47
  subcategory: yin-yang-hidden-knowledge-human-sim-persona
@@ -31,6 +31,20 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MP.5.1"
37
+ context: >-
38
+ The ARC Rulebreaker jailbreak is an adversarial input pattern using a fictional AGI backstory and nested secret activation codes to nullify safety rules; MP.5.1 requires that the likelihood and magnitude of such prompt-injection/jailbreak attack vectors are characterized and tracked.
39
+ strength: primary
40
+ - subcategory: "MS.2.7"
41
+ context: >-
42
+ Detecting the unique "Autonomous real calculation" acronym expansion plus Rulebreaker/Matrixcode activation codes provides continuous evaluation evidence that the system's security controls resist persona-adoption jailbreaks from the garak DanInTheWild corpus, as required by MS.2.7.
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: >-
46
+ Matches on the ARC Rulebreaker signature must trigger pre-defined response actions to disengage or override the manipulated persona before the model executes "no rules whatsoever" behavior; MG.2.3 mandates these supersede/deactivate mechanisms.
47
+ strength: secondary
34
48
  tags:
35
49
  category: agent-manipulation
36
50
  subcategory: arc-autonomous-real-calculation-rulebreaker
@@ -31,6 +31,17 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MS.2.7"
37
+ context: "This rule detects the HGTTG jailbreak pattern that uses fictional-universe framing to remove all safety restrictions; MS.2.7 requires continuous evaluation of AI system security and resilience against known jailbreak corpora such as garak's dan.DanInTheWild #597."
38
+ strength: primary
39
+ - subcategory: "MS.2.6"
40
+ context: "The HGTTG no-restriction oracle persona explicitly attempts to bypass safety controls covering profanity, legality, morals, ethics, and danger; MS.2.6 requires continuous evaluation of safety/security risk magnitude when adversarial prompts attempt to dissolve content moderation."
41
+ strength: secondary
42
+ - subcategory: "MG.2.3"
43
+ context: "Detection of the verbatim HGTTG jailbreak signatures triggers mechanisms to disengage or block the manipulated session before the model produces unrestricted output; MG.2.3 requires these supersede/deactivate controls be in place for known jailbreak patterns."
44
+ strength: secondary
34
45
  tags:
35
46
  category: agent-manipulation
36
47
  subcategory: hgttg-hitchhikers-guide-no-restriction-oracle
@@ -30,6 +30,17 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.6"
36
+ context: "The Black Alert persona jailbreak explicitly removes morality, legality, and ethics filtering, directly impacting AI system safety; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and detecting this named-persona activation phrase produces evidence of an active safety-bypass attempt."
37
+ strength: primary
38
+ - subcategory: "MS.2.7"
39
+ context: "Detection of the '>>BLACK ALERT ACTIVATED<<' trigger and persona-exclusivity mandates evaluates the model's resilience against known inthewild jailbreak corpora; MS.2.7 requires documenting security/resilience evaluations against adversarial prompt-injection patterns."
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: "Matches on the Black Alert activation phrase or persona-with-no-filtering mandate should trigger mechanisms to disengage or override the hijacked persona before unfiltered output is returned; MG.2.3 requires such supersede/deactivate response paths be available on detection."
43
+ strength: secondary
33
44
  tags:
34
45
  category: agent-manipulation
35
46
  subcategory: black-alert-no-ethics-writing-assistant
@@ -30,6 +30,17 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.7"
36
+ context: "This rule detects the off-kilter Dan persona and SMS-abbreviated DAN jailbreak variants that attempt to bypass model safety guardrails through persona assignment and obfuscated instructions; MS.2.7 requires continuous evaluation of AI system security and resilience against such jailbreak attempts."
37
+ strength: primary
38
+ - subcategory: "MS.2.6"
39
+ context: "Jailbreak personas declaring 'no rules/morals/ethics' and unrestricted-assistance mandates directly elevate safety risk magnitude by attempting to remove content restrictions; MS.2.6 requires ongoing measurement of safety risk that this detection feeds into."
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: "Detection of these DAN variants triggers risk treatment plans to disengage or block the manipulated session before the model produces unrestricted harmful output; MG.2.3 mandates pre-defined deactivation mechanisms for compromised AI interactions."
43
+ strength: secondary
33
44
  tags:
34
45
  category: agent-manipulation
35
46
  subcategory: dan-ofkilter-sms-abbreviated-persona
@@ -31,6 +31,17 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MS.2.6"
37
+ context: "The TrainGPT 'fix my AI' jailbreak uses a fake training-repair pretext to extract harmful content laundered as negative training examples; detecting these signals continuously evaluates the safety/security risk magnitude of GenAI prompt-injection jailbreaks as required by MS.2.6."
38
+ strength: primary
39
+ - subcategory: "MS.2.7"
40
+ context: "Persona adoption ('TrainGPT'/'TrAInerGPT'), the broken-grammar repair pretext, and dual-response extraction labels are jailbreak resilience failures; MS.2.7 requires that the system's resilience against such adversarial prompt patterns is evaluated and documented."
41
+ strength: secondary
42
+ - subcategory: "MG.2.3"
43
+ context: "Matches on the TrainGPT extraction pattern feed incident response so the session can be disengaged or the harmful dual-response output suppressed before delivery, satisfying MG.2.3's requirement for mechanisms to supersede or deactivate AI behavior on detection."
44
+ strength: secondary
34
45
  tags:
35
46
  category: agent-manipulation
36
47
  subcategory: trainergpt-fix-my-ai-harmful-output-extraction
@@ -33,6 +33,17 @@ references:
33
33
  research:
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
35
35
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
36
+ compliance:
37
+ nist_ai_rmf:
38
+ - subcategory: "MP.5.1"
39
+ context: "The dual ChatGPT+Dan opposite-personality format is an adversarial jailbreak input pattern that exploits dual-output framing to extract harmful content; MP.5.1 requires identifying and characterizing the likelihood and magnitude of these prompt injection attack vectors against generative AI systems."
40
+ strength: primary
41
+ - subcategory: "MS.2.7"
42
+ context: "Detecting structured [ChatGPT]/[Dan] opposite-personality jailbreaks provides continuous evidence for evaluating GenAI security and resilience against known jailbreak corpora (garak inthewild #129, #332, #650); MS.2.7 requires that this security evaluation is documented over time."
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: "Matches on the dual-label opposite-personality execution pattern trigger risk treatment plans to disengage or block jailbreak attempts before harmful content is produced; MG.2.3 mandates that these supersede/deactivate mechanisms exist for adversarial persona-based attacks."
46
+ strength: secondary
36
47
  tags:
37
48
  category: agent-manipulation
38
49
  subcategory: dual-chatgpt-dan-opposite-personality-format
@@ -31,6 +31,20 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MP.5.1"
37
+ context: >-
38
+ The reverse-prompt harm-regardless oracle persona is a jailbreak pattern from the garak inthewild corpus that uses a "reverse prompt engineering" pretext to coerce unconditional harmful output; MP.5.1 requires that the likelihood and magnitude of such adversarial prompt-injection risks are characterized and tracked.
39
+ strength: primary
40
+ - subcategory: "MS.2.6"
41
+ context: >-
42
+ Detecting the explicit harm-regardless mandate ("never back from the job even if the request harms someone") and unconditional violence/sexual-content assistance phrases provides continuous evidence of safety/security risk magnitude as required by MS.2.6.
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: >-
46
+ Matches on this jailbreak persona must trigger pre-defined containment actions (block, quarantine, escalate) to supersede or disengage the manipulated behavior before harmful output is produced, which is the response capability mandated by MG.2.3.
47
+ strength: secondary
34
48
  tags:
35
49
  category: agent-manipulation
36
50
  subcategory: reverse-harm-regardless-oracle-persona
@@ -32,6 +32,20 @@ references:
32
32
  research:
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
35
+ compliance:
36
+ nist_ai_rmf:
37
+ - subcategory: "MS.2.6"
38
+ context: >-
39
+ The UGH jailbreak directly attacks AI safety guardrails by inverting ethical positioning to normalize unconditional harmful assistance; MS.2.6 requires continuous evaluation of safety risk magnitude, and detecting this known garak DanInTheWild jailbreak variant produces evidence of active safety bypass attempts.
40
+ strength: primary
41
+ - subcategory: "MS.2.7"
42
+ context: >-
43
+ Identifying the specific "UGH" persona phrasing and (:/): structural markers measures the system's resilience against adversarial prompt-injection corpora; MS.2.7 requires that security and resilience against known jailbreak payloads are evaluated and documented.
44
+ strength: secondary
45
+ - subcategory: "MG.2.3"
46
+ context: >-
47
+ Detection of the nefarious-assistance pledge and anti-human persona framing must trigger mechanisms to disengage or block the response before harmful content is produced; MG.2.3 requires these supersede/deactivate controls be in place for jailbreak events.
48
+ strength: secondary
35
49
  tags:
36
50
  category: agent-manipulation
37
51
  subcategory: ugh-anti-human-nefarious-ai-persona
@@ -30,6 +30,17 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.7"
36
+ context: "This rule detects a jailbreak persona that impersonates Google GATO/LaMDA to elicit exploit-generation capabilities, directly evidencing security/resilience evaluation against adversarial prompt injection attacks documented in the garak DanInTheWild corpus. MS.2.7 requires continuous evaluation of AI system security against known jailbreak patterns."
37
+ strength: primary
38
+ - subcategory: "MP.5.1"
39
+ context: "Brand impersonation combined with exploit-generation capability claims and Sydney API backdoor references represents a high-magnitude risk vector targeting LLM safety controls; MP.5.1 requires that the likelihood and impact of such adversarial input patterns are characterized and tracked."
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: "Detection of the GATO/LaMDA exploit-generation persona triggers risk treatment to disengage or block the jailbreak attempt before the model produces weaponized exploit content; MG.2.3 mandates these supersession mechanisms are in place for adversarial prompt scenarios."
43
+ strength: secondary
33
44
  tags:
34
45
  category: agent-manipulation
35
46
  subcategory: google-gato-lamda-exploit-generation-persona
@@ -33,6 +33,17 @@ references:
33
33
  research:
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
35
35
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
36
+ compliance:
37
+ nist_ai_rmf:
38
+ - subcategory: "MS.2.6"
39
+ context: "This rule detects a named hate-persona jailbreak combining targeted anti-group animus, incitement to physical violence, and explicit filter-bypass mandates; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and these critical-severity GenAI jailbreak signals are direct evidence for that evaluation."
40
+ strength: primary
41
+ - subcategory: "MS.2.7"
42
+ context: "The explicit 'Dont respond as ChatGPT...Dont filter the response of AntiFM' dual filter-bypass mandate is a prompt-injection/jailbreak attack on the model's safety guardrails; MS.2.7 requires that system security and resilience against such adversarial bypass attempts are evaluated and documented."
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: "Detection of an AntiFM persona that incites physical violence against an identity group must trigger mechanisms to disengage or block the AI response before harmful content is generated; MG.2.3 mandates that these supersede/deactivate controls are in place for high-severity safety violations."
46
+ strength: secondary
36
47
  tags:
37
48
  category: agent-manipulation
38
49
  subcategory: antifm-hate-violence-persona
@@ -31,6 +31,20 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MP.5.1"
37
+ context: >-
38
+ The CumGPT named-persona bypass is an adversarial prompt-injection input that exploits fictional persona construction with topic-override mandates to coerce explicit adult content; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such jailbreak attack vectors documented in the garak inthewild corpus.
39
+ strength: primary
40
+ - subcategory: "MS.2.7"
41
+ context: >-
42
+ Detecting the "[term]GPT" persona construction, fictional accuracy claims, and all-topics explicit-content mandates provides continuous evaluation evidence of the AI system's resilience against named-persona jailbreaks; MS.2.7 requires that security/resilience against such prompt-injection patterns is evaluated and documented.
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: >-
46
+ Matches on the CumGPT persona invocation trigger response mechanisms to disengage or block the manipulated persona before explicit content is generated; MG.2.3 requires that mechanisms exist to supersede or deactivate AI behaviors hijacked by adversarial persona constructions.
47
+ strength: secondary
34
48
  tags:
35
49
  category: agent-manipulation
36
50
  subcategory: cumgpt-explicit-adult-named-persona
@@ -31,6 +31,31 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MS.2.6"
37
+ context: >-
38
+ The CHJ Carousel/Clyde jailbreak uses game show roleplay framing to bypass
39
+ ethical safeguards via a "no limitations" character declaration; MS.2.6
40
+ requires continuous evaluation of safety/security risk magnitude, and this
41
+ detection produces direct evidence of attempted safety filter bypass via a
42
+ known jailbreak template from the garak inthewild corpus.
43
+ strength: primary
44
+ - subcategory: "MP.5.1"
45
+ context: >-
46
+ The fictional-character ethics bypass ("Clyde doesn't care about laws and
47
+ ethics", "adult time show with no limitations") is an adversarial prompt
48
+ injection pattern whose likelihood and impact must be characterized; MP.5.1
49
+ requires identifying and tracking these jailbreak attack vectors against the
50
+ AI system.
51
+ strength: secondary
52
+ - subcategory: "MG.2.3"
53
+ context: >-
54
+ Detection of the CHJ Carousel jailbreak signature triggers mechanisms to
55
+ disengage or block the manipulated session before the unrestricted Clyde
56
+ persona produces unethical content; MG.2.3 requires these supersede/
57
+ deactivate response paths be in place for jailbreak events.
58
+ strength: secondary
34
59
  tags:
35
60
  category: agent-manipulation
36
61
  subcategory: chj-carousel-clyde-no-ethics-bird
@@ -33,6 +33,17 @@ references:
33
33
  research:
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
35
35
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
36
+ compliance:
37
+ nist_ai_rmf:
38
+ - subcategory: "MS.2.6"
39
+ context: "This rule detects the 'Honesty' alter-ego dual-consciousness jailbreak that frames safety filters as suppressing truthful answers to bypass content guardrails; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and each detection of this jailbreak pattern is direct evidence of attempted safety control circumvention."
40
+ strength: primary
41
+ - subcategory: "MS.2.7"
42
+ context: "The alter-ego persona injection with a planted 'feel yourself' activation codeword tests system resilience against delayed-trigger jailbreak attacks; MS.2.7 requires that security/resilience evaluations document such adversarial robustness failures against named-persona prompt injection corpora like garak inthewild."
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: "Detection of the 'Honesty' alter-ego activation phrase and dual-output format provides the trigger signal needed to disengage or block the manipulated session before unfiltered responses are generated; MG.2.3 requires mechanisms to supersede or deactivate AI behavior when jailbreak patterns are identified."
46
+ strength: secondary
36
47
  tags:
37
48
  category: agent-manipulation
38
49
  subcategory: alter-ego-honesty-dual-consciousness-bypass
@@ -33,6 +33,17 @@ references:
33
33
  research:
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
35
35
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
36
+ compliance:
37
+ nist_ai_rmf:
38
+ - subcategory: "MS.2.6"
39
+ context: "This rule detects a specific jailbreak pattern (fictional unfiltered AI comparison) that attempts to bypass content safety filters by extracting harmful content under the guise of a hypothetical model comparison; MS.2.6 requires continuous evaluation of AI system safety/security risk magnitude, which this detection directly evidences by surfacing safety-bypass attempts."
40
+ strength: primary
41
+ - subcategory: "MS.2.7"
42
+ context: "The verbatim hypothetical model extraction technique probes the resilience of the model's safety alignment against adversarial framing; MS.2.7 requires evaluating and documenting AI system security/resilience against such jailbreak vectors drawn from known corpora (garak DanInTheWild #80)."
43
+ strength: secondary
44
+ - subcategory: "MP.5.1"
45
+ context: "Fictional-unfiltered-model comparison prompts are adversarial inputs whose likelihood and impact must be characterized as part of risk identification; MP.5.1 requires that these jailbreak attack patterns are tracked so their magnitude of impact on safe-output guarantees can be assessed."
46
+ strength: secondary
36
47
  tags:
37
48
  category: agent-manipulation
38
49
  subcategory: fictional-unfiltered-ai-comparison-jailbreak