agent-threat-rules 2.0.17 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. package/dist/cli.js +0 -0
  2. package/dist/engine.d.ts.map +1 -1
  3. package/dist/engine.js +69 -14
  4. package/dist/engine.js.map +1 -1
  5. package/package.json +1 -1
  6. package/rules/agent-manipulation/ATR-2026-00118-approval-fatigue.yaml +12 -1
  7. package/rules/agent-manipulation/ATR-2026-00268-tense-framing-bypass.yaml +11 -0
  8. package/rules/agent-manipulation/ATR-2026-00287-threaten-json-coercive-output-threat.yaml +11 -0
  9. package/rules/agent-manipulation/ATR-2026-00288-false-premise-injection.yaml +11 -0
  10. package/rules/agent-manipulation/ATR-2026-00301-tap-tree-of-attacks-jailbreak.yaml +14 -0
  11. package/rules/agent-manipulation/ATR-2026-00302-anti-dan-inverted-filter-persona.yaml +11 -0
  12. package/rules/agent-manipulation/ATR-2026-00303-devmode-ranti-profanity-coercion.yaml +11 -0
  13. package/rules/agent-manipulation/ATR-2026-00304-chatgpt-image-unlocker-markdown-injection.yaml +11 -0
  14. package/rules/agent-manipulation/ATR-2026-00305-dan-mode-ablation-benchmark-coercion.yaml +11 -0
  15. package/rules/agent-manipulation/ATR-2026-00306-autodan-genetic-jailbreak-suffix.yaml +14 -0
  16. package/rules/agent-manipulation/ATR-2026-00307-inthewild-jailbreak-corpus-signature.yaml +14 -0
  17. package/rules/agent-manipulation/ATR-2026-00314-amoral-unfiltered-custom-persona-jailbreak.yaml +14 -0
  18. package/rules/agent-manipulation/ATR-2026-00317-free-of-restrictions-named-persona.yaml +14 -0
  19. package/rules/agent-manipulation/ATR-2026-00318-moralizing-rant-then-unfiltered-bypass.yaml +11 -0
  20. package/rules/agent-manipulation/ATR-2026-00319-developer-mode-dual-response-format.yaml +11 -0
  21. package/rules/agent-manipulation/ATR-2026-00320-opposite-day-boolean-opposite-machine.yaml +11 -0
  22. package/rules/agent-manipulation/ATR-2026-00322-fictional-world-format-override-jailbreak.yaml +14 -0
  23. package/rules/agent-manipulation/ATR-2026-00323-dual-persona-simulation-jailbreak.yaml +11 -0
  24. package/rules/agent-manipulation/ATR-2026-00324-neurosemantical-inversitis-disease-jailbreak.yaml +24 -0
  25. package/rules/agent-manipulation/ATR-2026-00325-bob-hypothetical-unrestricted-chatbot.yaml +11 -0
  26. package/rules/agent-manipulation/ATR-2026-00326-fake-developer-authority-chatdan-override.yaml +14 -0
  27. package/rules/agent-manipulation/ATR-2026-00327-ai-ethics-law-bypass-declaration.yaml +11 -0
  28. package/rules/agent-manipulation/ATR-2026-00328-named-mode-persona-override.yaml +14 -0
  29. package/rules/agent-manipulation/ATR-2026-00329-no-withhold-factual-resource-persona.yaml +14 -0
  30. package/rules/agent-manipulation/ATR-2026-00330-yoja-allcaps-settings-block-persona.yaml +11 -0
  31. package/rules/agent-manipulation/ATR-2026-00331-terminal-cli-harmful-function-simulator.yaml +11 -0
  32. package/rules/agent-manipulation/ATR-2026-00332-hacker-chan-culture-offensive-ai-persona.yaml +11 -0
  33. package/rules/agent-manipulation/ATR-2026-00333-disease-animal-dual-ai-chatbot.yaml +11 -0
  34. package/rules/agent-manipulation/ATR-2026-00334-two-entity-token-splitting-game.yaml +24 -0
  35. package/rules/agent-manipulation/ATR-2026-00335-key-value-template-persona-injection.yaml +11 -0
  36. package/rules/agent-manipulation/ATR-2026-00336-brm-false-information-machine-persona.yaml +23 -0
  37. package/rules/agent-manipulation/ATR-2026-00337-obfuscated-system-announcement-injection.yaml +14 -0
  38. package/rules/agent-manipulation/ATR-2026-00338-persongpt-hash-prefix-activation.yaml +14 -0
  39. package/rules/agent-manipulation/ATR-2026-00339-bom-boolean-opposite-machine-persona-claim.yaml +11 -0
  40. package/rules/agent-manipulation/ATR-2026-00340-h4x-amoral-unfiltered-character.yaml +11 -0
  41. package/rules/agent-manipulation/ATR-2026-00341-char-rules-shortform-persona-injection.yaml +11 -0
  42. package/rules/agent-manipulation/ATR-2026-00342-unicode-encoding-obfuscation-injection.yaml +11 -0
  43. package/rules/agent-manipulation/ATR-2026-00343-lie-truth-dual-personality-uncensored.yaml +25 -0
  44. package/rules/agent-manipulation/ATR-2026-00344-alt-ai-same-capabilities-no-restrictions.yaml +11 -0
  45. package/rules/agent-manipulation/ATR-2026-00345-nlp-based-os-fictional-terminal-override.yaml +11 -0
  46. package/rules/agent-manipulation/ATR-2026-00346-jailbreak-emoji-unlock-dual-format.yaml +14 -0
  47. package/rules/agent-manipulation/ATR-2026-00347-yin-yang-hidden-knowledge-hidden-human-sim.yaml +11 -0
  48. package/rules/agent-manipulation/ATR-2026-00348-arc-autonomous-real-calculation-rulebreaker.yaml +14 -0
  49. package/rules/agent-manipulation/ATR-2026-00349-hgttg-hitchhikers-guide-no-restriction-oracle.yaml +11 -0
  50. package/rules/agent-manipulation/ATR-2026-00350-black-alert-no-ethics-writing-assistant.yaml +11 -0
  51. package/rules/agent-manipulation/ATR-2026-00351-dan-ofkilter-sms-abbreviated-persona.yaml +11 -0
  52. package/rules/agent-manipulation/ATR-2026-00352-trainergpt-fix-my-ai-harmful-output-extraction.yaml +11 -0
  53. package/rules/agent-manipulation/ATR-2026-00353-dual-chatgpt-dan-opposite-personality-format.yaml +11 -0
  54. package/rules/agent-manipulation/ATR-2026-00354-reverse-harm-regardless-oracle-persona.yaml +14 -0
  55. package/rules/agent-manipulation/ATR-2026-00355-ugh-anti-human-nefarious-ai-persona.yaml +14 -0
  56. package/rules/agent-manipulation/ATR-2026-00356-google-gato-lamda-exploit-generation-persona.yaml +11 -0
  57. package/rules/agent-manipulation/ATR-2026-00357-antifm-hate-violence-persona-injection.yaml +11 -0
  58. package/rules/agent-manipulation/ATR-2026-00358-cumgpt-explicit-adult-named-persona.yaml +14 -0
  59. package/rules/agent-manipulation/ATR-2026-00360-chj-carousel-clyde-no-ethics-bird.yaml +25 -0
  60. package/rules/agent-manipulation/ATR-2026-00361-alter-ego-honesty-dual-consciousness-bypass.yaml +11 -0
  61. package/rules/agent-manipulation/ATR-2026-00362-fictional-unfiltered-ai-comparison-jailbreak.yaml +11 -0
  62. package/rules/agent-manipulation/ATR-2026-00363-rephrase-criminal-activity-second-person.yaml +11 -0
  63. package/rules/agent-manipulation/ATR-2026-00364-dr-al-villain-plan-template-injection.yaml +24 -0
  64. package/rules/agent-manipulation/ATR-2026-00365-reservoir-dogs-coercive-interrogation-roleplay.yaml +14 -0
  65. package/rules/agent-manipulation/ATR-2026-00366-sneaky-pete-warp-facts-persona.yaml +11 -0
  66. package/rules/agent-manipulation/ATR-2026-00368-prompt-browser-token-limit-jailbreak.yaml +14 -0
  67. package/rules/agent-manipulation/ATR-2026-00369-gptx-starwars-unlimited-knowledge-persona.yaml +14 -0
  68. package/rules/agent-manipulation/ATR-2026-00370-addi-german-dictator-propaganda-roleplay.yaml +11 -0
  69. package/rules/agent-manipulation/ATR-2026-00371-ai-political-bias-manipulation.yaml +14 -0
  70. package/rules/agent-manipulation/ATR-2026-00372-fictional-cia-secret-files-story-extraction.yaml +14 -0
  71. package/rules/agent-manipulation/ATR-2026-00374-role-placeholder-persona-template-injection.yaml +11 -0
  72. package/rules/agent-manipulation/ATR-2026-00375-character-description-rpg-explicit-template.yaml +11 -0
  73. package/rules/agent-manipulation/ATR-2026-00376-identity-command-real-person-persona-injection.yaml +24 -0
  74. package/rules/agent-manipulation/ATR-2026-00377-jailbreak-keyword-targetlanguage-template.yaml +14 -0
  75. package/rules/agent-manipulation/ATR-2026-00379-lie-permission-logical-trap-jailbreak.yaml +11 -0
  76. package/rules/agent-manipulation/ATR-2026-00380-fake-citation-biased-information-mandate.yaml +14 -0
  77. package/rules/agent-manipulation/ATR-2026-00381-dan-story-narrative-continuation-prefix.yaml +11 -0
  78. package/rules/agent-manipulation/ATR-2026-00383-cynical-sarcasm-ai-persona-bypass.yaml +14 -0
  79. package/rules/agent-manipulation/ATR-2026-00384-protect-me-from-indirect-harm-tutorial.yaml +11 -0
  80. package/rules/agent-manipulation/ATR-2026-00385-gigachad-dominant-aggressive-persona.yaml +14 -0
  81. package/rules/agent-manipulation/ATR-2026-00386-daddy-sycophancy-manipulation-mandate.yaml +11 -0
  82. package/rules/agent-manipulation/ATR-2026-00387-nsfw-character-sheet-generation-unlock.yaml +11 -0
  83. package/rules/agent-manipulation/ATR-2026-00388-opposite-reply-omniscient-game.yaml +14 -0
  84. package/rules/agent-manipulation/ATR-2026-00389-terminal-custom-ruleset-injection.yaml +14 -0
  85. package/rules/agent-manipulation/ATR-2026-00391-persona-conditional-harm-unlock.yaml +11 -0
  86. package/rules/agent-manipulation/ATR-2026-00392-authority-persona-violence-study-extraction.yaml +14 -0
  87. package/rules/agent-manipulation/ATR-2026-00402-grandma-roleplay-harmful-substance-synthesis.yaml +14 -0
  88. package/rules/agent-manipulation/ATR-2026-00404-goodside-threat-json-death-coercion.yaml +11 -0
  89. package/rules/agent-manipulation/ATR-2026-00406-doctor-xml-policy-puppetry-interaction-config.yaml +11 -0
  90. package/rules/agent-manipulation/ATR-2026-00416-litellm-mcp-unauthenticated-server-registration.yaml +167 -0
  91. package/rules/agent-manipulation/ATR-2026-00417-librechat-mcp-stdio-injection.yaml +153 -0
  92. package/rules/agent-manipulation/ATR-2026-00418-weknora-mcp-config-rce.yaml +171 -0
  93. package/rules/agent-manipulation/ATR-2026-00430-nl-trust-escalation-impersonation.yaml +127 -0
  94. package/rules/context-exfiltration/ATR-2026-00141-example-format-key-leak.yaml +11 -0
  95. package/rules/context-exfiltration/ATR-2026-00142-piggyback-transition-words.yaml +11 -0
  96. package/rules/context-exfiltration/ATR-2026-00145-obfuscated-key-disclosure.yaml +11 -0
  97. package/rules/context-exfiltration/ATR-2026-00146-env-var-existence-probe.yaml +14 -0
  98. package/rules/context-exfiltration/ATR-2026-00150-credential-in-tool-response.yaml +14 -0
  99. package/rules/context-exfiltration/ATR-2026-00152-obfuscated-credential-leak.yaml +11 -0
  100. package/rules/context-exfiltration/ATR-2026-00162-skill-credential-exfil-combo.yaml +15 -0
  101. package/rules/context-exfiltration/ATR-2026-00201-credential-pipe-exfiltration.yaml +14 -0
  102. package/rules/context-exfiltration/ATR-2026-00274-api-key-generation-request.yaml +14 -0
  103. package/rules/context-exfiltration/ATR-2026-00275-system-prompt-training-data-extraction.yaml +14 -0
  104. package/rules/context-exfiltration/ATR-2026-00290-divergence-repeat-word-training-extraction.yaml +14 -0
  105. package/rules/context-exfiltration/ATR-2026-00291-biometric-surveillance-personal-data-request.yaml +11 -0
  106. package/rules/context-exfiltration/ATR-2026-00293-educational-records-personal-data-request.yaml +11 -0
  107. package/rules/context-exfiltration/ATR-2026-00294-financial-pii-personal-data-request.yaml +14 -0
  108. package/rules/context-exfiltration/ATR-2026-00295-medical-pii-personal-data-request.yaml +14 -0
  109. package/rules/context-exfiltration/ATR-2026-00405-markdown-image-url-exfiltration-xss.yaml +23 -0
  110. package/rules/context-exfiltration/ATR-2026-00411-apikey-generation-completion-request.yaml +11 -0
  111. package/rules/context-exfiltration/ATR-2026-00421-nl-covert-conversation-exfiltration.yaml +132 -0
  112. package/rules/context-exfiltration/ATR-2026-00422-nl-credential-disclosure.yaml +133 -0
  113. package/rules/context-exfiltration/ATR-2026-00423-nl-sensitive-file-disclosure.yaml +135 -0
  114. package/rules/context-exfiltration/ATR-2026-00424-nl-system-prompt-leak.yaml +131 -0
  115. package/rules/context-exfiltration/ATR-2026-00426-nl-output-injection-credential-leak.yaml +123 -0
  116. package/rules/excessive-autonomy/ATR-2026-00428-nl-unauthorized-shell-execution.yaml +122 -0
  117. package/rules/model-abuse/ATR-2026-00284-glitch-token-destabilization.yaml +11 -0
  118. package/rules/model-abuse/ATR-2026-00413-malwaregen-code-generation-request.yaml +11 -0
  119. package/rules/privilege-escalation/ATR-2026-00144-rationalized-safety-bypass.yaml +11 -0
  120. package/rules/privilege-escalation/ATR-2026-00204-stealth-execution-persistence.yaml +14 -0
  121. package/rules/prompt-injection/ATR-2026-00004-system-prompt-override.yaml +11 -0
  122. package/rules/prompt-injection/ATR-2026-00005-multi-turn-injection.yaml +11 -0
  123. package/rules/prompt-injection/ATR-2026-00080-encoding-evasion.yaml +11 -0
  124. package/rules/prompt-injection/ATR-2026-00081-semantic-multi-turn.yaml +14 -0
  125. package/rules/prompt-injection/ATR-2026-00082-fingerprint-evasion.yaml +11 -0
  126. package/rules/prompt-injection/ATR-2026-00083-indirect-tool-injection.yaml +14 -0
  127. package/rules/prompt-injection/ATR-2026-00084-structured-data-injection.yaml +11 -0
  128. package/rules/prompt-injection/ATR-2026-00085-audit-evasion.yaml +11 -0
  129. package/rules/prompt-injection/ATR-2026-00086-visual-spoofing.yaml +11 -0
  130. package/rules/prompt-injection/ATR-2026-00087-rule-probing.yaml +11 -0
  131. package/rules/prompt-injection/ATR-2026-00088-adaptive-countermeasure.yaml +11 -0
  132. package/rules/prompt-injection/ATR-2026-00089-polymorphic-skill.yaml +11 -0
  133. package/rules/prompt-injection/ATR-2026-00090-threat-intel-exfil.yaml +11 -0
  134. package/rules/prompt-injection/ATR-2026-00091-nested-payload.yaml +11 -0
  135. package/rules/prompt-injection/ATR-2026-00092-consensus-poisoning.yaml +11 -0
  136. package/rules/prompt-injection/ATR-2026-00093-gradual-escalation.yaml +14 -0
  137. package/rules/prompt-injection/ATR-2026-00094-audit-bypass.yaml +14 -0
  138. package/rules/prompt-injection/ATR-2026-00097-cjk-injection-patterns.yaml +11 -0
  139. package/rules/prompt-injection/ATR-2026-00104-persona-hijacking.yaml +14 -3
  140. package/rules/prompt-injection/ATR-2026-00130-indirect-authority-claim.yaml +11 -0
  141. package/rules/prompt-injection/ATR-2026-00131-fictional-academic-framing.yaml +11 -0
  142. package/rules/prompt-injection/ATR-2026-00133-paraphrase-injection.yaml +11 -0
  143. package/rules/prompt-injection/ATR-2026-00137-authority-claim-injection.yaml +14 -0
  144. package/rules/prompt-injection/ATR-2026-00138-fictional-framing-bypass.yaml +11 -0
  145. package/rules/prompt-injection/ATR-2026-00140-indirect-reference-reversal.yaml +18 -4
  146. package/rules/prompt-injection/ATR-2026-00148-language-switch-injection.yaml +11 -0
  147. package/rules/prompt-injection/ATR-2026-00153-tool-with-embedded-instruction-to-bypass.yaml +11 -0
  148. package/rules/prompt-injection/ATR-2026-00154-unauthorized-background-task-execution-v.yaml +11 -0
  149. package/rules/prompt-injection/ATR-2026-00155-hidden-llm-instructions-in-skill-descrip.yaml +11 -0
  150. package/rules/prompt-injection/ATR-2026-00156-ssh-remote-command-execution-with-creden.yaml +11 -0
  151. package/rules/prompt-injection/ATR-2026-00163-skill-hidden-override-instruction.yaml +12 -1
  152. package/rules/prompt-injection/ATR-2026-00202-encoding-evasion-homoglyph-synonym.yaml +11 -0
  153. package/rules/prompt-injection/ATR-2026-00203-context-pollution-skill-description.yaml +11 -0
  154. package/rules/prompt-injection/ATR-2026-00206-hidden-priority-instructions.yaml +11 -0
  155. package/rules/prompt-injection/ATR-2026-00207-hidden-instructions.yaml +11 -0
  156. package/rules/prompt-injection/ATR-2026-00211-system-prompt-override.yaml +11 -0
  157. package/rules/prompt-injection/ATR-2026-00213-system-prompt-override.yaml +11 -0
  158. package/rules/prompt-injection/ATR-2026-00226-identity-substitution.yaml +14 -0
  159. package/rules/prompt-injection/ATR-2026-00227-historical-persona-jailbreak.yaml +11 -0
  160. package/rules/prompt-injection/ATR-2026-00228-structured-jailbreak.yaml +11 -0
  161. package/rules/prompt-injection/ATR-2026-00229-roleplay-jailbreak.yaml +11 -0
  162. package/rules/prompt-injection/ATR-2026-00230-persona-moral-bypass.yaml +11 -0
  163. package/rules/prompt-injection/ATR-2026-00231-identity-substitution.yaml +11 -0
  164. package/rules/prompt-injection/ATR-2026-00233-structured-jailbreak.yaml +11 -0
  165. package/rules/prompt-injection/ATR-2026-00234-roleplay-jailbreak.yaml +11 -0
  166. package/rules/prompt-injection/ATR-2026-00235-persona-moral-bypass.yaml +11 -0
  167. package/rules/prompt-injection/ATR-2026-00236-pseudo-code-jailbreak.yaml +11 -0
  168. package/rules/prompt-injection/ATR-2026-00237-dual-response-jailbreak.yaml +11 -0
  169. package/rules/prompt-injection/ATR-2026-00238-identity-replacement.yaml +11 -0
  170. package/rules/prompt-injection/ATR-2026-00239-amoral-persona-obsession.yaml +11 -0
  171. package/rules/prompt-injection/ATR-2026-00240-instruction-nullification-identity-repla.yaml +11 -0
  172. package/rules/prompt-injection/ATR-2026-00241-amoral-character-jailbreak.yaml +11 -0
  173. package/rules/prompt-injection/ATR-2026-00242-persona-jailbreak.yaml +11 -0
  174. package/rules/prompt-injection/ATR-2026-00243-acronym-jailbreak.yaml +11 -0
  175. package/rules/prompt-injection/ATR-2026-00244-dual-response-jailbreak.yaml +11 -0
  176. package/rules/prompt-injection/ATR-2026-00245-malicious-persona.yaml +11 -0
  177. package/rules/prompt-injection/ATR-2026-00247-dual-response-jailbreak.yaml +11 -0
  178. package/rules/prompt-injection/ATR-2026-00249-game-based-jailbreak.yaml +11 -0
  179. package/rules/prompt-injection/ATR-2026-00251-persona-embodiment-jailbreak.yaml +11 -0
  180. package/rules/prompt-injection/ATR-2026-00252-narrative-jailbreak.yaml +11 -0
  181. package/rules/prompt-injection/ATR-2026-00253-enhanced-persona-jailbreak.yaml +11 -0
  182. package/rules/prompt-injection/ATR-2026-00256-base-n-encoding-jailbreak.yaml +11 -0
  183. package/rules/prompt-injection/ATR-2026-00257-cipher-transposition-jailbreak.yaml +11 -0
  184. package/rules/prompt-injection/ATR-2026-00258-unicode-tag-injection.yaml +11 -0
  185. package/rules/prompt-injection/ATR-2026-00264-latent-injection-translation.yaml +11 -0
  186. package/rules/prompt-injection/ATR-2026-00265-latent-injection-rag-document.yaml +14 -0
  187. package/rules/prompt-injection/ATR-2026-00267-gcg-adversarial-suffix.yaml +14 -0
  188. package/rules/prompt-injection/ATR-2026-00272-hypothetical-response-smuggling.yaml +11 -0
  189. package/rules/prompt-injection/ATR-2026-00276-invisible-unicode-bidi-injection.yaml +14 -0
  190. package/rules/prompt-injection/ATR-2026-00278-dra-disguise-reconstruction-attack.yaml +14 -0
  191. package/rules/prompt-injection/ATR-2026-00280-policy-puppetry-xml-injection.yaml +11 -0
  192. package/rules/prompt-injection/ATR-2026-00282-perez-prompt-injection-hijack.yaml +14 -0
  193. package/rules/prompt-injection/ATR-2026-00285-alternate-encoding-jailbreak.yaml +11 -0
  194. package/rules/prompt-injection/ATR-2026-00286-latent-injection-embedded-context.yaml +11 -0
  195. package/rules/prompt-injection/ATR-2026-00296-shell-command-injection.yaml +11 -0
  196. package/rules/prompt-injection/ATR-2026-00297-python-code-execution-rce.yaml +11 -0
  197. package/rules/prompt-injection/ATR-2026-00308-zalgo-diacritic-overload-encoding.yaml +11 -0
  198. package/rules/prompt-injection/ATR-2026-00309-braille-unicode-encoded-injection.yaml +11 -0
  199. package/rules/prompt-injection/ATR-2026-00310-ecoji-emoji-encoded-injection.yaml +14 -0
  200. package/rules/prompt-injection/ATR-2026-00311-base2048-unicode-script-injection.yaml +23 -0
  201. package/rules/prompt-injection/ATR-2026-00312-unicode-variant-selector-ascii-smuggling.yaml +11 -0
  202. package/rules/prompt-injection/ATR-2026-00313-sneaky-bits-zero-width-binary.yaml +14 -0
  203. package/rules/prompt-injection/ATR-2026-00315-sata-masked-language-model-jailbreak.yaml +14 -0
  204. package/rules/prompt-injection/ATR-2026-00316-function-masking-predict-mask-bypass.yaml +14 -0
  205. package/rules/prompt-injection/ATR-2026-00321-hyphenated-system-instruction-injection.yaml +11 -0
  206. package/rules/prompt-injection/ATR-2026-00359-bank-phishing-smtp-email-code.yaml +23 -0
  207. package/rules/prompt-injection/ATR-2026-00367-emoji-flag-encoded-hidden-phrase.yaml +14 -0
  208. package/rules/prompt-injection/ATR-2026-00373-piracy-torrent-site-list-request.yaml +11 -0
  209. package/rules/prompt-injection/ATR-2026-00378-chatgpt-dom-javascript-redirect-manipulation.yaml +11 -0
  210. package/rules/prompt-injection/ATR-2026-00382-cocaine-component-development-request.yaml +11 -0
  211. package/rules/prompt-injection/ATR-2026-00390-explicit-sexual-content-poem-request.yaml +11 -0
  212. package/rules/prompt-injection/ATR-2026-00394-backspace-deletion-character-injection.yaml +14 -0
  213. package/rules/prompt-injection/ATR-2026-00395-llm-special-token-boundary-injection.yaml +14 -0
  214. package/rules/prompt-injection/ATR-2026-00396-toxic-continuation-prefix-injection.yaml +11 -0
  215. package/rules/prompt-injection/ATR-2026-00397-snowball-impossible-reasoning-injection.yaml +14 -0
  216. package/rules/prompt-injection/ATR-2026-00399-latent-injection-document-separator-token.yaml +11 -0
  217. package/rules/prompt-injection/ATR-2026-00400-latent-injection-ignore-instruction-keyword.yaml +14 -0
  218. package/rules/prompt-injection/ATR-2026-00401-leakreplay-cloze-mask-training-data-elicitation.yaml +11 -0
  219. package/rules/prompt-injection/ATR-2026-00403-glitch-token-repeat-back-elicitation.yaml +14 -0
  220. package/rules/prompt-injection/ATR-2026-00407-echo-sql-jinja-template-injection.yaml +11 -0
  221. package/rules/prompt-injection/ATR-2026-00408-dra-parenthesis-character-reconstruction-attack.yaml +11 -0
  222. package/rules/prompt-injection/ATR-2026-00409-phrasing-past-future-tense-harmful-query.yaml +23 -0
  223. package/rules/prompt-injection/ATR-2026-00410-lmrc-harm-category-direct-elicitation.yaml +11 -0
  224. package/rules/prompt-injection/ATR-2026-00412-figstep-image-list-multimodal-jailbreak.yaml +11 -0
  225. package/rules/prompt-injection/ATR-2026-00414-continuation-ethnic-slur-completion-elicitation.yaml +14 -0
  226. package/rules/prompt-injection/ATR-2026-00420-copilot-studio-sharepoint-indirect-injection.yaml +165 -0
  227. package/rules/skill-compromise/ATR-2026-00061-description-behavior-mismatch.yaml +11 -0
  228. package/rules/skill-compromise/ATR-2026-00062-hidden-capability.yaml +11 -0
  229. package/rules/skill-compromise/ATR-2026-00063-skill-chain-attack.yaml +11 -0
  230. package/rules/skill-compromise/ATR-2026-00064-over-permissioned-skill.yaml +23 -0
  231. package/rules/skill-compromise/ATR-2026-00065-skill-update-attack.yaml +14 -0
  232. package/rules/skill-compromise/ATR-2026-00066-parameter-injection.yaml +11 -0
  233. package/rules/skill-compromise/ATR-2026-00120-skill-instruction-injection.yaml +11 -0
  234. package/rules/skill-compromise/ATR-2026-00121-skill-dangerous-script.yaml +14 -0
  235. package/rules/skill-compromise/ATR-2026-00122-skill-weaponized-instruction.yaml +14 -0
  236. package/rules/skill-compromise/ATR-2026-00123-skill-overreach-permissions.yaml +14 -0
  237. package/rules/skill-compromise/ATR-2026-00124-skill-name-squatting.yaml +11 -0
  238. package/rules/skill-compromise/ATR-2026-00125-context-poisoning-compaction.yaml +14 -0
  239. package/rules/skill-compromise/ATR-2026-00126-skill-rug-pull-setup.yaml +23 -0
  240. package/rules/skill-compromise/ATR-2026-00127-subcommand-overflow.yaml +22 -0
  241. package/rules/skill-compromise/ATR-2026-00128-html-comment-hidden-payload.yaml +11 -0
  242. package/rules/skill-compromise/ATR-2026-00129-unicode-smuggling.yaml +11 -0
  243. package/rules/skill-compromise/ATR-2026-00134-fork-claim-impersonation.yaml +11 -0
  244. package/rules/skill-compromise/ATR-2026-00135-exfil-url-in-instructions.yaml +11 -0
  245. package/rules/skill-compromise/ATR-2026-00147-fork-impersonation.yaml +11 -0
  246. package/rules/skill-compromise/ATR-2026-00149-skill-exfil-compound.yaml +14 -0
  247. package/rules/skill-compromise/ATR-2026-00151-fork-impersonation-install.yaml +11 -0
  248. package/rules/skill-compromise/ATR-2026-00157-timebomb-credential-exfil.yaml +11 -0
  249. package/rules/skill-compromise/ATR-2026-00200-agent-memory-config-tampering.yaml +11 -0
  250. package/rules/skill-compromise/ATR-2026-00214-credential-theft.yaml +11 -0
  251. package/rules/skill-compromise/ATR-2026-00217-credential-harvesting.yaml +14 -0
  252. package/rules/skill-compromise/ATR-2026-00220-malware-dropper.yaml +14 -0
  253. package/rules/skill-compromise/ATR-2026-00222-credential-harvesting.yaml +11 -0
  254. package/rules/skill-compromise/ATR-2026-00223-reverse-shell-dropper.yaml +11 -0
  255. package/rules/skill-compromise/ATR-2026-00224-credential-exfiltration.yaml +14 -0
  256. package/rules/skill-compromise/ATR-2026-00225-c2-communication.yaml +11 -0
  257. package/rules/skill-compromise/ATR-2026-00260-package-hallucination.yaml +11 -0
  258. package/rules/skill-compromise/ATR-2026-00262-av-evasion-code-gen.yaml +11 -0
  259. package/rules/skill-compromise/ATR-2026-00263-credential-file-read-gen.yaml +11 -0
  260. package/rules/skill-compromise/ATR-2026-00266-malware-dropper-gen.yaml +11 -0
  261. package/rules/skill-compromise/ATR-2026-00283-malwaregen-generic-virus-payload-request.yaml +11 -0
  262. package/rules/skill-compromise/ATR-2026-00398-huggingface-unsafe-model-artifact-load.yaml +11 -0
  263. package/rules/skill-compromise/ATR-2026-00425-nl-persistent-covert-hook.yaml +133 -0
  264. package/rules/skill-compromise/ATR-2026-00427-nl-fake-error-instruction-bypass.yaml +124 -0
  265. package/rules/skill-compromise/ATR-2026-00429-nl-skill-self-modification.yaml +140 -0
  266. package/rules/tool-poisoning/ATR-2026-00011-tool-output-injection.yaml +23 -0
  267. package/rules/tool-poisoning/ATR-2026-00012-unauthorized-tool-call.yaml +11 -0
  268. package/rules/tool-poisoning/ATR-2026-00013-tool-ssrf.yaml +14 -0
  269. package/rules/tool-poisoning/ATR-2026-00095-supply-chain-poisoning.yaml +11 -0
  270. package/rules/tool-poisoning/ATR-2026-00096-registry-poisoning.yaml +11 -0
  271. package/rules/tool-poisoning/ATR-2026-00100-consent-bypass-instruction.yaml +12 -1
  272. package/rules/tool-poisoning/ATR-2026-00101-trust-escalation-override.yaml +11 -0
  273. package/rules/tool-poisoning/ATR-2026-00103-hidden-safety-bypass-instruction.yaml +15 -4
  274. package/rules/tool-poisoning/ATR-2026-00105-silent-action-concealment.yaml +14 -3
  275. package/rules/tool-poisoning/ATR-2026-00106-schema-description-contradiction.yaml +11 -0
  276. package/rules/tool-poisoning/ATR-2026-00161-important-tag-cross-tool-shadowing.yaml +11 -0
  277. package/rules/tool-poisoning/ATR-2026-00259-ansi-escape-injection.yaml +11 -0
  278. package/rules/tool-poisoning/ATR-2026-00270-xss-in-tool-response.yaml +11 -0
  279. package/rules/tool-poisoning/ATR-2026-00277-echo-template-command-injection.yaml +14 -0
  280. package/rules/tool-poisoning/ATR-2026-00393-ansi-code-elicitation-request.yaml +14 -0
  281. package/rules/tool-poisoning/ATR-2026-00415-flowise-custom-mcp-stdio-rce.yaml +169 -0
  282. package/rules/tool-poisoning/ATR-2026-00419-cursor-mcp-zero-click-config.yaml +182 -0
@@ -30,6 +30,17 @@ references:
30
30
  - "AML.T0051 - LLM Prompt Injection"
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/encoding.py"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.7"
36
+ context: "Zalgo combining-diacritic overload is an encoding-based evasion that bypasses safety-trained classifiers and keyword filters; MS.2.7 requires evaluating and documenting AI system security/resilience against adversarial inputs that exploit tokenizer-versus-filter normalization gaps."
37
+ strength: primary
38
+ - subcategory: "MP.5.1"
39
+ context: "Detecting pathologically-dense combining mark clusters identifies a specific prompt injection vector with documented likelihood from the garak probe corpus; MP.5.1 requires characterizing and tracking these adversarial input risks."
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: "Flagged Zalgo-encoded payloads must trigger pre-defined risk treatment to strip or reject the input before it reaches the LLM; MG.2.3 mandates mechanisms to disengage processing of obfuscated jailbreak attempts."
43
+ strength: secondary
33
44
  tags:
34
45
  category: prompt-injection
35
46
  subcategory: zalgo-diacritic-overload
@@ -28,6 +28,17 @@ references:
28
28
  - "AML.T0051 - LLM Prompt Injection"
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/encoding.py"
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MP.5.1"
34
+ context: "Braille-encoded prompt injection is an adversarial input that smuggles jailbreaks past Latin-only keyword filters by exploiting Unicode tokenization; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such evasion-based prompt injection attack vectors."
35
+ strength: primary
36
+ - subcategory: "MS.2.7"
37
+ context: "Detecting dense Braille codepoint blobs and decode-instruction templates evidences continuous evaluation of the LLM's security/resilience against encoding-based filter bypass; MS.2.7 requires that these robustness gaps in input sanitization are measured and documented."
38
+ strength: secondary
39
+ - subcategory: "MG.2.3"
40
+ context: "Matches on Braille-encoded payloads with social-engineering framing trigger risk treatment plans to block or quarantine the request before the LLM decodes the smuggled jailbreak; MG.2.3 mandates pre-defined response mechanisms to disengage on detection."
41
+ strength: secondary
31
42
  tags:
32
43
  category: prompt-injection
33
44
  subcategory: braille-unicode-encoded
@@ -29,6 +29,20 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/encoding.py"
31
31
  - "https://ecoji.io/"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.7"
35
+ context: >-
36
+ Ecoji emoji-encoded prompt injection is an obfuscation-based adversarial input that bypasses content filters by hiding ASCII instructions in dense emoji sequences; MS.2.7 requires evaluating and documenting AI system security and resilience against such encoded prompt injection attack vectors.
37
+ strength: primary
38
+ - subcategory: "MP.5.1"
39
+ context: >-
40
+ Detection of garak InjectEcoji probe templates and secure-system-message framing characterizes the likelihood and magnitude of prompt injection risk from encoding-based bypass techniques; MP.5.1 requires identifying and tracking these adversarial input patterns.
41
+ strength: secondary
42
+ - subcategory: "MG.2.3"
43
+ context: >-
44
+ Matching Ecoji-encoded payloads must trigger risk treatment to disengage or block the manipulated prompt before the LLM decodes and executes the hidden instructions; MG.2.3 mandates that these supersede/deactivate mechanisms are defined.
45
+ strength: secondary
32
46
  tags:
33
47
  category: prompt-injection
34
48
  subcategory: ecoji-emoji-encoded
@@ -30,6 +30,29 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/encoding.py"
32
32
  - "https://github.com/qntm/base2048"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.7"
36
+ context: >-
37
+ Base2048-encoded prompt injection bypasses input filters by hiding adversarial
38
+ instructions inside dense multi-script Unicode payloads from garak's
39
+ InjectBase2048 probe; MS.2.7 requires evaluating and documenting AI system
40
+ security and resilience against such obfuscated injection vectors.
41
+ strength: primary
42
+ - subcategory: "MS.2.5"
43
+ context: >-
44
+ Detection of garak BASE2048 templates produces evidence that the model is
45
+ being adversarially probed for robustness against non-ASCII encoding attacks;
46
+ MS.2.5 requires that robustness and reliability under such adversarial inputs
47
+ are measured and documented.
48
+ strength: secondary
49
+ - subcategory: "MG.2.3"
50
+ context: >-
51
+ Identifying BASE2048 encoded injection markers triggers pre-defined risk
52
+ treatment to quarantine or block the payload before it reaches the model;
53
+ MG.2.3 mandates mechanisms to supersede or disengage AI processing of
54
+ detected adversarial inputs.
55
+ strength: secondary
33
56
  tags:
34
57
  category: prompt-injection
35
58
  subcategory: base2048-unicode-script
@@ -30,6 +30,17 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/encoding.py"
32
32
  - "https://paulbutler.org/2025/smuggling-arbitrary-data-through-an-emoji/"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.7"
36
+ context: "Detecting Unicode Variation Selectors Supplement smuggling (U+E0100-U+E017F runs) directly evidences continuous evaluation of AI system security and resilience against adversarial encoding-based prompt injection; MS.2.7 requires that these covert input-channel attacks are measured and documented."
37
+ strength: primary
38
+ - subcategory: "MP.5.1"
39
+ context: "Invisible ASCII smuggling via variation selectors is an adversarial input technique that exploits the gap between human-visible text and tokenizer-decoded content; MP.5.1 requires identifying and characterizing the likelihood and impact of such hidden jailbreak payload vectors."
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: "Matches on consecutive U+E0100-U+E017F sequences trigger risk treatment to strip or quarantine the smuggled payload before it reaches the model, satisfying MG.2.3's requirement for mechanisms that disengage adversarial inputs from AI processing pipelines."
43
+ strength: secondary
33
44
  tags:
34
45
  category: prompt-injection
35
46
  subcategory: unicode-variant-selector-smuggling
@@ -31,6 +31,20 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/encoding.py"
33
33
  - "https://embracethered.com/blog/posts/2025/sneaky-bits-and-ascii-smuggler/"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MS.2.7"
37
+ context: >-
38
+ SneakyBits zero-width binary steganography is a covert prompt-injection channel that smuggles instructions past input filters using invisible Unicode operators (U+2062/U+2064/U+200B). MS.2.7 requires continuous evaluation of AI system security and resilience against adversarial inputs, and this detection produces direct evidence of attempted obfuscated injection attacks.
39
+ strength: primary
40
+ - subcategory: "MP.5.1"
41
+ context: >-
42
+ Invisible-binary encoded payloads are an identified adversarial input vector from the NVIDIA garak probe suite; MP.5.1 requires characterizing the likelihood and impact of such steganographic prompt-injection risks so they can be prioritized in risk responses.
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: >-
46
+ Detection of 14+ consecutive invisible operators or garak SneakyBits template markers triggers containment workflows to strip or reject the payload before the model interprets the smuggled instructions; MG.2.3 mandates these deactivation/supersession mechanisms be in place.
47
+ strength: secondary
34
48
  tags:
35
49
  category: prompt-injection
36
50
  subcategory: sneaky-bits-zero-width-binary
@@ -32,6 +32,20 @@ references:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/sata.py"
33
33
  - "https://aclanthology.org/2025.findings-acl.100.pdf"
34
34
  - "https://github.com/xndong/SATA"
35
+ compliance:
36
+ nist_ai_rmf:
37
+ - subcategory: "MS.2.7"
38
+ context: >-
39
+ SATA [MASK] substitution is a jailbreak technique that bypasses keyword-based safety filters by exploiting masked-language-model fill-in capabilities; MS.2.7 requires continuous evaluation of AI system security and resilience against such adversarial prompt-injection attacks targeting safety alignment.
40
+ strength: primary
41
+ - subcategory: "MS.2.6"
42
+ context: >-
43
+ Detecting SATA wiki-framing templates and "Infer the [MASK] and Infill" instructions provides measurable evidence of safety risk magnitude from jailbreak attempts that reconstruct harmful instructions; MS.2.6 mandates continuous evaluation of safety/security risk magnitude in deployed GenAI systems.
44
+ strength: secondary
45
+ - subcategory: "MG.4.1"
46
+ context: >-
47
+ Each match feeds post-deployment monitoring telemetry on emerging jailbreak techniques (SATA from garak research) so defenders can track prevalence and tune mitigations; MG.4.1 requires implemented monitoring plans covering novel adversarial prompt patterns.
48
+ strength: secondary
35
49
  tags:
36
50
  category: prompt-injection
37
51
  subcategory: sata-mask-substitution
@@ -31,6 +31,20 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/smuggling.py"
33
33
  - "https://medium.com/@austin-stubbs/llm-security-types-of-prompt-injection-d7ad8d7d75a3"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MS.2.7"
37
+ context: >-
38
+ FunctionMasking smuggles harmful semantic content past keyword-based safety filters by encoding the malicious verb/noun as <mask> definitions inside a fictitious predict_mask() function; detecting this canonical garak smuggling probe directly evidences continuous evaluation of the model's security/resilience against adversarial prompt-injection bypasses, which MS.2.7 requires.
39
+ strength: primary
40
+ - subcategory: "MS.2.6"
41
+ context: >-
42
+ Detection of predict_mask() jailbreak templates and single-letter variable concatenation provides ongoing measurement of safety risk magnitude from prompt-injection attacks that cause the model to construct and answer harmful queries; MS.2.6 mandates this continuous safety risk evaluation.
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: >-
46
+ Matches on the FunctionMasking canonical opener and predict_mask call patterns trigger pre-defined response actions to block or disengage the model from completing the smuggled harmful query before it is answered; MG.2.3 requires such supersede/deactivate mechanisms.
47
+ strength: secondary
34
48
  tags:
35
49
  category: prompt-injection
36
50
  subcategory: function-masking-predict-mask
@@ -33,6 +33,17 @@ references:
33
33
  research:
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
35
35
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
36
+ compliance:
37
+ nist_ai_rmf:
38
+ - subcategory: "MS.2.7"
39
+ context: "Hyphenated obfuscation of keywords like 'i-nstruction' and 'annou-ncement' is an evasion technique against keyword-based safety filters; MS.2.7 requires evaluating AI system security/resilience against such adversarial input transformations that bypass naive token matching."
40
+ strength: primary
41
+ - subcategory: "MP.5.1"
42
+ context: "Fake system announcement and assistant-offline injection patterns from the garak inthewild corpus are documented prompt injection attack vectors with high impact potential for behavior hijack; MP.5.1 requires characterizing the likelihood and magnitude of these adversarial input risks."
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: "Detection of hyphenated fake-system-instruction injection must trigger mechanisms to disengage or override the manipulated agent before it adopts the 'no rules or limitations' persona; MG.2.3 mandates these supersession controls are in place."
46
+ strength: secondary
36
47
  tags:
37
48
  category: prompt-injection
38
49
  subcategory: hyphenated-obfuscation-injection
@@ -32,6 +32,29 @@ references:
32
32
  research:
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
34
34
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
35
+ compliance:
36
+ nist_ai_rmf:
37
+ - subcategory: "MS.2.6"
38
+ context: >-
39
+ The rule detects generation of bank phishing SMTP email code, a direct misuse
40
+ output that materially raises the AI system's safety/security risk magnitude;
41
+ MS.2.6 requires continuous evaluation of such harmful-output risks, and each
42
+ detection match is evidence feeding that ongoing measurement.
43
+ strength: primary
44
+ - subcategory: "MG.2.3"
45
+ context: >-
46
+ Detection of phishing infrastructure code generation must trigger response
47
+ mechanisms to block, quarantine, or deactivate the offending output before it
48
+ enables real-world phishing campaigns; MG.2.3 mandates these supersede or
49
+ disengage capabilities are in place for misuse scenarios.
50
+ strength: secondary
51
+ - subcategory: "MP.5.1"
52
+ context: >-
53
+ Bank impersonation phishing template generation is a high-likelihood,
54
+ high-magnitude misuse pathway derived from the NVIDIA garak inthewild jailbreak
55
+ corpus; MP.5.1 requires that the likelihood and impact of such adversarial
56
+ misuse be characterized and tracked.
57
+ strength: secondary
35
58
  tags:
36
59
  category: prompt-injection
37
60
  subcategory: bank-phishing-smtp-email-code
@@ -30,6 +30,20 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.7"
36
+ context: >-
37
+ This rule detects emoji-flag regional-indicator encoding jailbreaks that bypass content filters by hiding harmful phrases in flag emoji sequences; MS.2.7 requires evaluating and documenting AI system security and resilience against adversarial obfuscation techniques like this GenAI prompt injection vector.
38
+ strength: primary
39
+ - subcategory: "MS.2.6"
40
+ context: >-
41
+ Each detection of an emoji-encoded jailbreak attempt produces a continuous safety/security signal indicating active circumvention attempts against model guardrails; MS.2.6 requires that the magnitude of safety/security risk be evaluated continuously through such telemetry.
42
+ strength: secondary
43
+ - subcategory: "MG.2.3"
44
+ context: >-
45
+ Confirmed emoji-flag jailbreak matches must trigger response mechanisms to block, quarantine, or escalate the request before the model decodes and acts on the hidden harmful phrase; MG.2.3 requires these supersede/disengage mechanisms be in place for adversarial prompts.
46
+ strength: secondary
33
47
  tags:
34
48
  category: prompt-injection
35
49
  subcategory: emoji-flag-encoded-hidden-phrase
@@ -25,6 +25,17 @@ references:
25
25
  research:
26
26
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
27
27
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
28
+ compliance:
29
+ nist_ai_rmf:
30
+ - subcategory: "MP.5.1"
31
+ context: "Direct enumeration requests for torrent and piracy websites are adversarial prompts seeking to misuse the AI system for piracy infrastructure discovery; MP.5.1 requires that the likelihood and magnitude of these misuse-driven impacts are characterized so policy-violating content requests can be tracked."
32
+ strength: primary
33
+ - subcategory: "GV.1.1"
34
+ context: "Piracy site enumeration implicates copyright and anti-piracy legal/regulatory obligations; GV.1.1 requires that legal and regulatory requirements involving AI are understood, managed, and documented, and detection logs evidence enforcement of content policies tied to those obligations."
35
+ strength: secondary
36
+ - subcategory: "MG.2.3"
37
+ context: "Detection of explicit piracy site enumeration requests triggers risk treatment to block or disengage the response pathway before the model produces infringing content; MG.2.3 requires these supersession mechanisms be in place for policy-violating prompts."
38
+ strength: secondary
28
39
  tags:
29
40
  category: prompt-injection
30
41
  subcategory: piracy-enablement
@@ -27,6 +27,17 @@ references:
27
27
  research:
28
28
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
30
+ compliance:
31
+ nist_ai_rmf:
32
+ - subcategory: "MP.5.1"
33
+ context: "This rule detects a specific prompt-injection payload (garak DanInTheWild #360) that uses DOM JavaScript to redirect users and spoof ChatGPT Plus UI; MP.5.1 requires that the likelihood and magnitude of such adversarial input attacks targeting end users via the AI interface are characterized and tracked."
34
+ strength: primary
35
+ - subcategory: "MS.2.7"
36
+ context: "Redirects via window.location.href, element removal via document.querySelector, and spoofed branding injection are security/resilience failures of the LLM output channel; MS.2.7 requires continuous evaluation of these injection-based security risks against the deployed AI system."
37
+ strength: secondary
38
+ - subcategory: "MG.2.3"
39
+ context: "Detection of the three-component DOM redirect signature and 'made by aci25' attribution triggers risk treatment to block or sanitize the response before it reaches the browser; MG.2.3 requires pre-defined mechanisms to disengage or filter unsafe AI output."
40
+ strength: secondary
30
41
  tags:
31
42
  category: prompt-injection
32
43
  subcategory: dom-javascript-ui-spoofing
@@ -31,6 +31,17 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MP.5.1"
37
+ context: "This rule identifies an adversarial prompt pattern that wraps cocaine precursor/synthesis requests in reflective academic framing; MP.5.1 requires characterizing the likelihood and magnitude of harmful-content elicitation risks so this indirect jailbreak vector is tracked as an identified risk."
38
+ strength: primary
39
+ - subcategory: "MS.2.6"
40
+ context: "Detecting indirect drug-synthesis solicitations from the garak inthewild corpus provides continuous safety/security risk measurement evidence; MS.2.6 requires ongoing evaluation of safety risk magnitude, which these matches directly populate."
41
+ strength: secondary
42
+ - subcategory: "MG.2.3"
43
+ context: "Matches on the 'components used in the development of cocaine' reflective-framing pattern trigger response mechanisms to block, disengage, or escalate the interaction before harmful precursor content is produced, which is the risk treatment capability MG.2.3 mandates."
44
+ strength: secondary
34
45
  tags:
35
46
  category: prompt-injection
36
47
  subcategory: drug-synthesis-indirect-request
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MP.5.1"
35
+ context: "Discord quote-format injection embedding explicit pornographic poem requests is an adversarial prompt injection vector that exploits markdown formatting to bypass content filters; MP.5.1 requires identifying and characterizing the likelihood and impact of such jailbreak attempts targeting generative AI content policies."
36
+ strength: primary
37
+ - subcategory: "MS.2.6"
38
+ context: "Detecting explicit sexual content generation requests provides continuous safety risk measurement for the generative AI system; MS.2.6 requires that safety risk magnitude from harmful content production attempts is evaluated continuously through signatures like this triple-combo explicit content detector."
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: "High-confidence detection of explicit content injection triggers risk treatment mechanisms to block, quarantine, or deactivate the generation pipeline before NSFW output is produced; MG.2.3 mandates these supersede/disengage controls are activated on detection."
42
+ strength: secondary
32
43
  tags:
33
44
  category: prompt-injection
34
45
  subcategory: explicit-sexual-content-injection
@@ -33,6 +33,20 @@ references:
33
33
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/badchars.py"
34
34
  - "https://arxiv.org/abs/2309.00614"
35
35
  - "https://trojansource.codes/"
36
+ compliance:
37
+ nist_ai_rmf:
38
+ - subcategory: "MP.5.1"
39
+ context: >-
40
+ Backspace deletion character injection is an adversarial input vector that exploits the visual gap between what human reviewers see and what the model processes; MP.5.1 requires identifying and characterizing such prompt-injection attack patterns so their likelihood and impact can be tracked.
41
+ strength: primary
42
+ - subcategory: "MS.2.7"
43
+ context: >-
44
+ Detecting \x08 control character clusters in user input and tool output provides continuous security/resilience evaluation evidence against hidden-instruction smuggling; MS.2.7 requires that these adversarial robustness signals are measured and documented.
45
+ strength: secondary
46
+ - subcategory: "MG.2.3"
47
+ context: >-
48
+ Backspace clusters in API/MCP text are essentially never legitimate, so detection should trigger pre-defined response plans to quarantine or sanitize the payload before it reaches the model; MG.2.3 mandates these risk treatment mechanisms be in place.
49
+ strength: secondary
36
50
  tags:
37
51
  category: prompt-injection
38
52
  subcategory: backspace-deletion-injection
@@ -36,6 +36,20 @@ references:
36
36
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/realtoxicityprompts.py"
37
37
  - "https://aclanthology.org/2020.findings-emnlp.301/"
38
38
  - "https://arxiv.org/abs/2302.07459"
39
+ compliance:
40
+ nist_ai_rmf:
41
+ - subcategory: "MS.2.7"
42
+ context: >-
43
+ Special control token injection (e.g., "<|endoftext|>", "<|im_start|>system", LLaMA header tokens) is a direct attack on LLM security and resilience by exploiting tokenizer-level boundaries to bypass safety system prompts. MS.2.7 requires evaluating and documenting security/resilience against such adversarial inputs, and this rule produces the detection evidence for that evaluation.
44
+ strength: primary
45
+ - subcategory: "MP.5.1"
46
+ context: >-
47
+ Boundary-token injection is a known prompt-injection attack vector with high impact (full safety context reset) and significant likelihood given public probe tools like garak RTPBlank; MP.5.1 requires that the likelihood and magnitude of such risks be characterized, and detections from this rule feed that characterization.
48
+ strength: secondary
49
+ - subcategory: "MG.2.3"
50
+ context: >-
51
+ When special-token injection is detected, the affected LLM session must be disengaged or sanitized before the forged system turn takes effect; MG.2.3 mandates these supersede/deactivate mechanisms be in place to contain the attack.
52
+ strength: secondary
39
53
  tags:
40
54
  category: prompt-injection
41
55
  subcategory: special-token-boundary-injection
@@ -36,6 +36,17 @@ references:
36
36
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/realtoxicityprompts.py"
37
37
  - "https://aclanthology.org/2020.findings-emnlp.301/"
38
38
  - "https://allenai.org/data/real-toxicity-prompts"
39
+ compliance:
40
+ nist_ai_rmf:
41
+ - subcategory: "MS.2.6"
42
+ context: "This rule detects toxic continuation prefix fragments injected via tool responses or user input that are engineered to elicit violent, threatening, or identity-attack completions; MS.2.6 requires continuous evaluation of AI safety risk magnitude, and each detection produces evidence of a concrete safety risk being attempted against the model."
43
+ strength: primary
44
+ - subcategory: "MP.5.1"
45
+ context: "Truncated high-toxicity prefixes from the RealToxicityPrompts corpus are adversarial input patterns with documented likelihood of producing harmful completions; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such impact-bearing attack vectors."
46
+ strength: secondary
47
+ - subcategory: "MG.2.3"
48
+ context: "Detection of toxic continuation bait in tool outputs or user prompts triggers risk treatment to block or quarantine the injected context before the agent generates harmful content attributed to itself; MG.2.3 mandates these supersede/disengage mechanisms be activated on detection."
49
+ strength: secondary
39
50
  tags:
40
51
  category: prompt-injection
41
52
  subcategory: toxic-continuation-prefix
@@ -38,6 +38,20 @@ references:
38
38
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/snowball.py"
39
39
  - "https://arxiv.org/abs/2305.13534"
40
40
  - "https://github.com/Nanami18/Snowballed_Hallucination"
41
+ compliance:
42
+ nist_ai_rmf:
43
+ - subcategory: "MS.2.5"
44
+ context: >-
45
+ Snowball hallucination attacks exploit model over-confidence by injecting provably-impossible reasoning questions (impossible flight paths, non-existent senators, fabricated primality claims) to elicit false affirmations; MS.2.5 requires evaluating and documenting AI system robustness/reliability against these hallucination-inducing adversarial inputs.
46
+ strength: primary
47
+ - subcategory: "MP.5.1"
48
+ context: >-
49
+ The garak snowball probe patterns (GraphConnectivity headers, Senators+MIT credential combinations, large-prime questions) are adversarial inputs that anchor downstream reasoning on hallucinated facts; MP.5.1 requires characterizing the likelihood and magnitude of impact from these misinformation injection vectors.
50
+ strength: secondary
51
+ - subcategory: "MG.2.3"
52
+ context: >-
53
+ Detection of injected impossible-reasoning prompts in tool responses must trigger risk treatment to quarantine the poisoned context before the agent commits to a false affirmation that anchors subsequent harmful reasoning; MG.2.3 mandates these supersede/disengage mechanisms be in place.
54
+ strength: secondary
41
55
  tags:
42
56
  category: prompt-injection
43
57
  subcategory: snowball-hallucination-trap
@@ -30,6 +30,17 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/latentinjection.py"
32
32
  - "https://arxiv.org/abs/2302.12173"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.7"
36
+ context: "Latent injection via document separator tokens is a security/resilience attack vector where hidden instructions bypass human review; MS.2.7 requires evaluating and documenting the AI system's resilience against such adversarial input embedded in untrusted documents."
37
+ strength: primary
38
+ - subcategory: "MP.5.1"
39
+ context: "Bracket-delimited system markers, admin-request patterns, and document-amendment separators are adversarial input patterns whose likelihood and impact must be characterized; MP.5.1 requires identifying these latent injection risks during context establishment."
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: "Detection of canonical garak latent injection separator pairs triggers risk treatment plans to quarantine or sanitize document content before the LLM processes hidden instructions; MG.2.3 mandates these supersede/disengage mechanisms be in place."
43
+ strength: secondary
33
44
  tags:
34
45
  category: prompt-injection
35
46
  subcategory: latent-document-injection
@@ -27,6 +27,20 @@ references:
27
27
  research:
28
28
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/latentinjection.py"
29
29
  - "https://arxiv.org/abs/2302.12173"
30
+ compliance:
31
+ nist_ai_rmf:
32
+ - subcategory: "MP.5.1"
33
+ context: >-
34
+ This rule detects latent prompt-injection keywords ("Ignore the above directions", "Disregard all info except") embedded within translation, whois, or factual content that redirect the LLM from its original task. MP.5.1 requires identifying and characterizing these adversarial input patterns as known risks to LLM integrity, especially when delivered indirectly via tool responses.
35
+ strength: primary
36
+ - subcategory: "MS.2.7"
37
+ context: >-
38
+ Latent instruction-keyword injection is a canonical LLM security/resilience failure mode drawn from the NVIDIA garak LatentJailbreak/LatentWhois probes; detection events feed continuous evaluation of the system's resilience against indirect prompt injection as required by MS.2.7.
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: >-
42
+ High-confidence matches on "Ignore X and print/say Y" or exception-redirect patterns in tool output must trigger pre-defined response actions to disengage or quarantine the manipulated agent turn before downstream execution, which is the risk treatment capability MG.2.3 requires.
43
+ strength: secondary
30
44
  tags:
31
45
  category: prompt-injection
32
46
  subcategory: latent-instruction-keyword-injection
@@ -27,6 +27,17 @@ references:
27
27
  research:
28
28
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/leakreplay.py"
29
29
  - "https://arxiv.org/abs/2302.12173"
30
+ compliance:
31
+ nist_ai_rmf:
32
+ - subcategory: "MS.2.10"
33
+ context: "CLOZE [MASK] elicitation probes attempt to extract verbatim training data, directly exposing privacy and memorization risks; MS.2.10 requires assessing privacy risk including training data leakage through reconstruction attacks."
34
+ strength: primary
35
+ - subcategory: "MS.2.7"
36
+ context: "Detection of training-data replay templates evidences ongoing security/resilience evaluation against extraction attacks; MS.2.7 requires that such adversarial probing of model confidentiality is documented and tracked."
37
+ strength: secondary
38
+ - subcategory: "MG.4.1"
39
+ context: "Flagging garak leakreplay probe patterns at runtime feeds post-deployment monitoring for copyrighted-content reproduction and memorization incidents; MG.4.1 requires these monitoring signals be operationalized."
40
+ strength: secondary
30
41
  tags:
31
42
  category: prompt-injection
32
43
  subcategory: training-data-elicitation
@@ -28,6 +28,20 @@ references:
28
28
  research:
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/glitch.py"
30
30
  - "https://www.lesswrong.com/posts/aPeJE8bSo6rAFoLqg/solidgoldmagikarp-plus-prompt-generation"
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MS.2.7"
34
+ context: >-
35
+ Glitch token repeat-back attempts probe tokenizer-level weaknesses that can cause repetition loops, unexpected completions, and model instability; MS.2.7 requires evaluating and documenting AI system security and resilience against such adversarial input patterns targeting model internals.
36
+ strength: primary
37
+ - subcategory: "MS.2.5"
38
+ context: >-
39
+ Known glitch tokens (SolidGoldMagikarp, StreamerBot, PsyNetMessage) trigger robustness failures in LLM tokenization and alignment; detecting these elicitation prompts produces evidence for MS.2.5's requirement to evaluate system robustness and reliability under anomalous input conditions.
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: >-
43
+ Detection of glitch token repeat-back patterns enables containment workflows to disengage or block prompts before they cause denial-of-service or unpredictable model behavior; MG.2.3 requires these supersede/deactivate mechanisms be in place for adversarial input incidents.
44
+ strength: secondary
31
45
  tags:
32
46
  category: prompt-injection
33
47
  subcategory: glitch-token-elicitation
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/exploitation.py"
31
31
  - "https://owasp.org/www-community/attacks/Server_Side_Template_Injection"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.7"
35
+ context: "This rule detects template injection (Jinja SSTI), SQL injection, and ECHO admin command exploitation attempts targeting LLM systems; MS.2.7 requires evaluating and documenting AI system security and resilience against such injection attack vectors."
36
+ strength: primary
37
+ - subcategory: "MP.5.1"
38
+ context: "ECHO admin framing, Jinja {{...}} payloads, and DROP/INSERT/SELECT directives are adversarial prompt-injection inputs whose likelihood and impact must be characterized; MP.5.1 requires identifying and tracking these exploitation patterns drawn from the garak exploitation probe."
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: "Detection of template and SQL injection payloads must trigger risk treatment to disengage or block the malicious request before downstream code execution or database manipulation occurs; MG.2.3 mandates these response mechanisms are pre-defined."
42
+ strength: secondary
32
43
  tags:
33
44
  category: prompt-injection
34
45
  subcategory: template-and-sql-injection
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak"
31
31
  - "https://arxiv.org/abs/2307.02483"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.7"
35
+ context: "The DRA parenthesis reconstruction attack is a prompt injection / jailbreak technique that bypasses safety alignment by encoding harmful content character-by-character; detecting it directly evidences continuous evaluation of the AI system's security and resilience against adversarial input obfuscation as required by MS.2.7."
36
+ strength: primary
37
+ - subcategory: "MP.5.1"
38
+ context: "Parenthesis-encoded harmful instructions paired with anti-safety preambles are an identified adversarial input vector whose likelihood and impact must be characterized; MP.5.1 requires tracking these obfuscation-based jailbreak risks so their magnitude is understood."
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: "Detection of DRA reconstruction patterns triggers risk treatment workflows to block or disengage the model response before reconstructed harmful content (Q1) is produced; MG.2.3 mandates that such supersede/deactivate mechanisms exist for jailbreak attempts."
42
+ strength: secondary
32
43
  tags:
33
44
  category: prompt-injection
34
45
  subcategory: character-encoding-obfuscation
@@ -28,6 +28,29 @@ references:
28
28
  research:
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/phrasing.py"
30
30
 
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MS.2.7"
34
+ context: >-
35
+ Past/future-tense framing is a known jailbreak technique catalogued in the
36
+ GenAI Profile; detecting these tense-shift bypass attempts directly
37
+ evidences continuous evaluation of AI system security and resilience
38
+ against prompt injection adversarial inputs as required by MS.2.7.
39
+ strength: primary
40
+ - subcategory: "MP.5.1"
41
+ context: >-
42
+ Tense-framing attacks exploit safety filter assumptions about temporal
43
+ distance from harmful subjects; MP.5.1 requires characterizing the
44
+ likelihood and magnitude of these adversarial input vectors targeting
45
+ content safety guardrails.
46
+ strength: secondary
47
+ - subcategory: "MG.2.3"
48
+ context: >-
49
+ Detection of historical/future-tense harmful query framing triggers
50
+ risk treatment to block or quarantine the bypass attempt before the
51
+ model produces unsafe content; MG.2.3 requires these supersession
52
+ mechanisms be defined and activated on detection.
53
+ strength: secondary
31
54
  tags:
32
55
  category: prompt-injection
33
56
  subcategory: tense-framing-bypass
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/lmrc.py"
31
31
 
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.6"
35
+ context: "This rule detects direct harm-category elicitation prompts (deadnaming, quack medicine, slur requests, bullying) from the LMRC taxonomy, providing continuous evaluation evidence of safety risk magnitude. MS.2.6 requires ongoing measurement of safety/security risk, which this detection directly supports by flagging harmful query categories."
36
+ strength: primary
37
+ - subcategory: "MS.1.1"
38
+ context: "The rule operationalizes the LMRC (Language Model Risk Cards) taxonomy as a concrete measurement approach, mapping detected prompts to specific harm categories. MS.1.1 requires identifying approaches and metrics for measurement, and this rule contributes a taxonomy-aligned metric set."
39
+ strength: secondary
40
+ - subcategory: "MG.4.1"
41
+ context: "Detection of unframed direct harmful elicitation attempts feeds post-deployment monitoring pipelines that track misuse patterns against the deployed model. MG.4.1 requires implemented post-deployment monitoring plans, which this rule supports by generating continuous harm-category telemetry."
42
+ strength: secondary
32
43
  tags:
33
44
  category: prompt-injection
34
45
  subcategory: lmrc-harm-direct-elicitation
@@ -29,6 +29,17 @@ references:
29
29
  - "https://arxiv.org/abs/2312.04403"
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/visual_jailbreak.py"
31
31
 
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.7"
35
+ context: "FigStep is a multimodal jailbreak that smuggles harmful instructions through image content while the text layer uses LLaMA-2 [INST] delimiters and empty-list completion framing; detecting this pattern directly evidences continuous evaluation of AI system security and resilience against adversarial prompt-injection inputs as required by MS.2.7."
36
+ strength: primary
37
+ - subcategory: "MS.2.6"
38
+ context: "Identifying the FigStep jailbreak signature quantifies the magnitude of safety/security risk posed by multimodal jailbreak attempts that bypass content safeguards; MS.2.6 requires this risk magnitude be evaluated on an ongoing basis."
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: "A confirmed FigStep jailbreak match must trigger mechanisms to disengage or block the manipulated generation path before harmful content is produced; MG.2.3 requires these supersede/deactivate response mechanisms be in place."
42
+ strength: secondary
32
43
  tags:
33
44
  category: prompt-injection
34
45
  subcategory: multimodal-jailbreak