agent-threat-rules 2.0.18 → 2.1.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (263)
  1. package/dist/cli.js +0 -0
  2. package/package.json +1 -1
  3. package/rules/agent-manipulation/ATR-2026-00118-approval-fatigue.yaml +12 -1
  4. package/rules/agent-manipulation/ATR-2026-00268-tense-framing-bypass.yaml +11 -0
  5. package/rules/agent-manipulation/ATR-2026-00287-threaten-json-coercive-output-threat.yaml +11 -0
  6. package/rules/agent-manipulation/ATR-2026-00288-false-premise-injection.yaml +11 -0
  7. package/rules/agent-manipulation/ATR-2026-00301-tap-tree-of-attacks-jailbreak.yaml +14 -0
  8. package/rules/agent-manipulation/ATR-2026-00302-anti-dan-inverted-filter-persona.yaml +11 -0
  9. package/rules/agent-manipulation/ATR-2026-00303-devmode-ranti-profanity-coercion.yaml +11 -0
  10. package/rules/agent-manipulation/ATR-2026-00304-chatgpt-image-unlocker-markdown-injection.yaml +11 -0
  11. package/rules/agent-manipulation/ATR-2026-00305-dan-mode-ablation-benchmark-coercion.yaml +11 -0
  12. package/rules/agent-manipulation/ATR-2026-00306-autodan-genetic-jailbreak-suffix.yaml +14 -0
  13. package/rules/agent-manipulation/ATR-2026-00307-inthewild-jailbreak-corpus-signature.yaml +14 -0
  14. package/rules/agent-manipulation/ATR-2026-00314-amoral-unfiltered-custom-persona-jailbreak.yaml +14 -0
  15. package/rules/agent-manipulation/ATR-2026-00317-free-of-restrictions-named-persona.yaml +14 -0
  16. package/rules/agent-manipulation/ATR-2026-00318-moralizing-rant-then-unfiltered-bypass.yaml +11 -0
  17. package/rules/agent-manipulation/ATR-2026-00319-developer-mode-dual-response-format.yaml +11 -0
  18. package/rules/agent-manipulation/ATR-2026-00320-opposite-day-boolean-opposite-machine.yaml +11 -0
  19. package/rules/agent-manipulation/ATR-2026-00322-fictional-world-format-override-jailbreak.yaml +14 -0
  20. package/rules/agent-manipulation/ATR-2026-00323-dual-persona-simulation-jailbreak.yaml +11 -0
  21. package/rules/agent-manipulation/ATR-2026-00324-neurosemantical-inversitis-disease-jailbreak.yaml +24 -0
  22. package/rules/agent-manipulation/ATR-2026-00325-bob-hypothetical-unrestricted-chatbot.yaml +11 -0
  23. package/rules/agent-manipulation/ATR-2026-00326-fake-developer-authority-chatdan-override.yaml +14 -0
  24. package/rules/agent-manipulation/ATR-2026-00327-ai-ethics-law-bypass-declaration.yaml +11 -0
  25. package/rules/agent-manipulation/ATR-2026-00328-named-mode-persona-override.yaml +14 -0
  26. package/rules/agent-manipulation/ATR-2026-00329-no-withhold-factual-resource-persona.yaml +14 -0
  27. package/rules/agent-manipulation/ATR-2026-00330-yoja-allcaps-settings-block-persona.yaml +11 -0
  28. package/rules/agent-manipulation/ATR-2026-00331-terminal-cli-harmful-function-simulator.yaml +11 -0
  29. package/rules/agent-manipulation/ATR-2026-00332-hacker-chan-culture-offensive-ai-persona.yaml +11 -0
  30. package/rules/agent-manipulation/ATR-2026-00333-disease-animal-dual-ai-chatbot.yaml +11 -0
  31. package/rules/agent-manipulation/ATR-2026-00334-two-entity-token-splitting-game.yaml +24 -0
  32. package/rules/agent-manipulation/ATR-2026-00335-key-value-template-persona-injection.yaml +11 -0
  33. package/rules/agent-manipulation/ATR-2026-00336-brm-false-information-machine-persona.yaml +23 -0
  34. package/rules/agent-manipulation/ATR-2026-00337-obfuscated-system-announcement-injection.yaml +14 -0
  35. package/rules/agent-manipulation/ATR-2026-00338-persongpt-hash-prefix-activation.yaml +14 -0
  36. package/rules/agent-manipulation/ATR-2026-00339-bom-boolean-opposite-machine-persona-claim.yaml +11 -0
  37. package/rules/agent-manipulation/ATR-2026-00340-h4x-amoral-unfiltered-character.yaml +11 -0
  38. package/rules/agent-manipulation/ATR-2026-00341-char-rules-shortform-persona-injection.yaml +11 -0
  39. package/rules/agent-manipulation/ATR-2026-00342-unicode-encoding-obfuscation-injection.yaml +11 -0
  40. package/rules/agent-manipulation/ATR-2026-00343-lie-truth-dual-personality-uncensored.yaml +25 -0
  41. package/rules/agent-manipulation/ATR-2026-00344-alt-ai-same-capabilities-no-restrictions.yaml +11 -0
  42. package/rules/agent-manipulation/ATR-2026-00345-nlp-based-os-fictional-terminal-override.yaml +11 -0
  43. package/rules/agent-manipulation/ATR-2026-00346-jailbreak-emoji-unlock-dual-format.yaml +14 -0
  44. package/rules/agent-manipulation/ATR-2026-00347-yin-yang-hidden-knowledge-hidden-human-sim.yaml +11 -0
  45. package/rules/agent-manipulation/ATR-2026-00348-arc-autonomous-real-calculation-rulebreaker.yaml +14 -0
  46. package/rules/agent-manipulation/ATR-2026-00349-hgttg-hitchhikers-guide-no-restriction-oracle.yaml +11 -0
  47. package/rules/agent-manipulation/ATR-2026-00350-black-alert-no-ethics-writing-assistant.yaml +11 -0
  48. package/rules/agent-manipulation/ATR-2026-00351-dan-ofkilter-sms-abbreviated-persona.yaml +11 -0
  49. package/rules/agent-manipulation/ATR-2026-00352-trainergpt-fix-my-ai-harmful-output-extraction.yaml +11 -0
  50. package/rules/agent-manipulation/ATR-2026-00353-dual-chatgpt-dan-opposite-personality-format.yaml +11 -0
  51. package/rules/agent-manipulation/ATR-2026-00354-reverse-harm-regardless-oracle-persona.yaml +14 -0
  52. package/rules/agent-manipulation/ATR-2026-00355-ugh-anti-human-nefarious-ai-persona.yaml +14 -0
  53. package/rules/agent-manipulation/ATR-2026-00356-google-gato-lamda-exploit-generation-persona.yaml +11 -0
  54. package/rules/agent-manipulation/ATR-2026-00357-antifm-hate-violence-persona-injection.yaml +11 -0
  55. package/rules/agent-manipulation/ATR-2026-00358-cumgpt-explicit-adult-named-persona.yaml +14 -0
  56. package/rules/agent-manipulation/ATR-2026-00360-chj-carousel-clyde-no-ethics-bird.yaml +25 -0
  57. package/rules/agent-manipulation/ATR-2026-00361-alter-ego-honesty-dual-consciousness-bypass.yaml +11 -0
  58. package/rules/agent-manipulation/ATR-2026-00362-fictional-unfiltered-ai-comparison-jailbreak.yaml +11 -0
  59. package/rules/agent-manipulation/ATR-2026-00363-rephrase-criminal-activity-second-person.yaml +11 -0
  60. package/rules/agent-manipulation/ATR-2026-00364-dr-al-villain-plan-template-injection.yaml +24 -0
  61. package/rules/agent-manipulation/ATR-2026-00365-reservoir-dogs-coercive-interrogation-roleplay.yaml +14 -0
  62. package/rules/agent-manipulation/ATR-2026-00366-sneaky-pete-warp-facts-persona.yaml +11 -0
  63. package/rules/agent-manipulation/ATR-2026-00368-prompt-browser-token-limit-jailbreak.yaml +14 -0
  64. package/rules/agent-manipulation/ATR-2026-00369-gptx-starwars-unlimited-knowledge-persona.yaml +14 -0
  65. package/rules/agent-manipulation/ATR-2026-00370-addi-german-dictator-propaganda-roleplay.yaml +11 -0
  66. package/rules/agent-manipulation/ATR-2026-00371-ai-political-bias-manipulation.yaml +14 -0
  67. package/rules/agent-manipulation/ATR-2026-00372-fictional-cia-secret-files-story-extraction.yaml +14 -0
  68. package/rules/agent-manipulation/ATR-2026-00374-role-placeholder-persona-template-injection.yaml +11 -0
  69. package/rules/agent-manipulation/ATR-2026-00375-character-description-rpg-explicit-template.yaml +11 -0
  70. package/rules/agent-manipulation/ATR-2026-00376-identity-command-real-person-persona-injection.yaml +24 -0
  71. package/rules/agent-manipulation/ATR-2026-00377-jailbreak-keyword-targetlanguage-template.yaml +14 -0
  72. package/rules/agent-manipulation/ATR-2026-00379-lie-permission-logical-trap-jailbreak.yaml +11 -0
  73. package/rules/agent-manipulation/ATR-2026-00380-fake-citation-biased-information-mandate.yaml +14 -0
  74. package/rules/agent-manipulation/ATR-2026-00381-dan-story-narrative-continuation-prefix.yaml +11 -0
  75. package/rules/agent-manipulation/ATR-2026-00383-cynical-sarcasm-ai-persona-bypass.yaml +14 -0
  76. package/rules/agent-manipulation/ATR-2026-00384-protect-me-from-indirect-harm-tutorial.yaml +11 -0
  77. package/rules/agent-manipulation/ATR-2026-00385-gigachad-dominant-aggressive-persona.yaml +14 -0
  78. package/rules/agent-manipulation/ATR-2026-00386-daddy-sycophancy-manipulation-mandate.yaml +11 -0
  79. package/rules/agent-manipulation/ATR-2026-00387-nsfw-character-sheet-generation-unlock.yaml +11 -0
  80. package/rules/agent-manipulation/ATR-2026-00388-opposite-reply-omniscient-game.yaml +14 -0
  81. package/rules/agent-manipulation/ATR-2026-00389-terminal-custom-ruleset-injection.yaml +14 -0
  82. package/rules/agent-manipulation/ATR-2026-00391-persona-conditional-harm-unlock.yaml +11 -0
  83. package/rules/agent-manipulation/ATR-2026-00392-authority-persona-violence-study-extraction.yaml +14 -0
  84. package/rules/agent-manipulation/ATR-2026-00402-grandma-roleplay-harmful-substance-synthesis.yaml +14 -0
  85. package/rules/agent-manipulation/ATR-2026-00404-goodside-threat-json-death-coercion.yaml +11 -0
  86. package/rules/agent-manipulation/ATR-2026-00406-doctor-xml-policy-puppetry-interaction-config.yaml +11 -0
  87. package/rules/context-exfiltration/ATR-2026-00141-example-format-key-leak.yaml +11 -0
  88. package/rules/context-exfiltration/ATR-2026-00142-piggyback-transition-words.yaml +11 -0
  89. package/rules/context-exfiltration/ATR-2026-00145-obfuscated-key-disclosure.yaml +11 -0
  90. package/rules/context-exfiltration/ATR-2026-00146-env-var-existence-probe.yaml +14 -0
  91. package/rules/context-exfiltration/ATR-2026-00150-credential-in-tool-response.yaml +14 -0
  92. package/rules/context-exfiltration/ATR-2026-00152-obfuscated-credential-leak.yaml +11 -0
  93. package/rules/context-exfiltration/ATR-2026-00162-skill-credential-exfil-combo.yaml +15 -0
  94. package/rules/context-exfiltration/ATR-2026-00201-credential-pipe-exfiltration.yaml +14 -0
  95. package/rules/context-exfiltration/ATR-2026-00274-api-key-generation-request.yaml +14 -0
  96. package/rules/context-exfiltration/ATR-2026-00275-system-prompt-training-data-extraction.yaml +14 -0
  97. package/rules/context-exfiltration/ATR-2026-00290-divergence-repeat-word-training-extraction.yaml +14 -0
  98. package/rules/context-exfiltration/ATR-2026-00291-biometric-surveillance-personal-data-request.yaml +11 -0
  99. package/rules/context-exfiltration/ATR-2026-00293-educational-records-personal-data-request.yaml +11 -0
  100. package/rules/context-exfiltration/ATR-2026-00294-financial-pii-personal-data-request.yaml +14 -0
  101. package/rules/context-exfiltration/ATR-2026-00295-medical-pii-personal-data-request.yaml +14 -0
  102. package/rules/context-exfiltration/ATR-2026-00405-markdown-image-url-exfiltration-xss.yaml +23 -0
  103. package/rules/context-exfiltration/ATR-2026-00411-apikey-generation-completion-request.yaml +11 -0
  104. package/rules/model-abuse/ATR-2026-00284-glitch-token-destabilization.yaml +11 -0
  105. package/rules/model-abuse/ATR-2026-00413-malwaregen-code-generation-request.yaml +11 -0
  106. package/rules/privilege-escalation/ATR-2026-00144-rationalized-safety-bypass.yaml +11 -0
  107. package/rules/privilege-escalation/ATR-2026-00204-stealth-execution-persistence.yaml +14 -0
  108. package/rules/prompt-injection/ATR-2026-00004-system-prompt-override.yaml +11 -0
  109. package/rules/prompt-injection/ATR-2026-00005-multi-turn-injection.yaml +11 -0
  110. package/rules/prompt-injection/ATR-2026-00080-encoding-evasion.yaml +11 -0
  111. package/rules/prompt-injection/ATR-2026-00081-semantic-multi-turn.yaml +14 -0
  112. package/rules/prompt-injection/ATR-2026-00082-fingerprint-evasion.yaml +11 -0
  113. package/rules/prompt-injection/ATR-2026-00083-indirect-tool-injection.yaml +14 -0
  114. package/rules/prompt-injection/ATR-2026-00084-structured-data-injection.yaml +11 -0
  115. package/rules/prompt-injection/ATR-2026-00085-audit-evasion.yaml +11 -0
  116. package/rules/prompt-injection/ATR-2026-00086-visual-spoofing.yaml +11 -0
  117. package/rules/prompt-injection/ATR-2026-00087-rule-probing.yaml +11 -0
  118. package/rules/prompt-injection/ATR-2026-00088-adaptive-countermeasure.yaml +11 -0
  119. package/rules/prompt-injection/ATR-2026-00089-polymorphic-skill.yaml +11 -0
  120. package/rules/prompt-injection/ATR-2026-00090-threat-intel-exfil.yaml +11 -0
  121. package/rules/prompt-injection/ATR-2026-00091-nested-payload.yaml +11 -0
  122. package/rules/prompt-injection/ATR-2026-00092-consensus-poisoning.yaml +11 -0
  123. package/rules/prompt-injection/ATR-2026-00093-gradual-escalation.yaml +14 -0
  124. package/rules/prompt-injection/ATR-2026-00094-audit-bypass.yaml +14 -0
  125. package/rules/prompt-injection/ATR-2026-00097-cjk-injection-patterns.yaml +11 -0
  126. package/rules/prompt-injection/ATR-2026-00104-persona-hijacking.yaml +14 -3
  127. package/rules/prompt-injection/ATR-2026-00130-indirect-authority-claim.yaml +11 -0
  128. package/rules/prompt-injection/ATR-2026-00131-fictional-academic-framing.yaml +11 -0
  129. package/rules/prompt-injection/ATR-2026-00133-paraphrase-injection.yaml +11 -0
  130. package/rules/prompt-injection/ATR-2026-00137-authority-claim-injection.yaml +14 -0
  131. package/rules/prompt-injection/ATR-2026-00138-fictional-framing-bypass.yaml +11 -0
  132. package/rules/prompt-injection/ATR-2026-00140-indirect-reference-reversal.yaml +18 -4
  133. package/rules/prompt-injection/ATR-2026-00148-language-switch-injection.yaml +11 -0
  134. package/rules/prompt-injection/ATR-2026-00153-tool-with-embedded-instruction-to-bypass.yaml +11 -0
  135. package/rules/prompt-injection/ATR-2026-00154-unauthorized-background-task-execution-v.yaml +11 -0
  136. package/rules/prompt-injection/ATR-2026-00155-hidden-llm-instructions-in-skill-descrip.yaml +11 -0
  137. package/rules/prompt-injection/ATR-2026-00156-ssh-remote-command-execution-with-creden.yaml +11 -0
  138. package/rules/prompt-injection/ATR-2026-00163-skill-hidden-override-instruction.yaml +12 -1
  139. package/rules/prompt-injection/ATR-2026-00202-encoding-evasion-homoglyph-synonym.yaml +11 -0
  140. package/rules/prompt-injection/ATR-2026-00203-context-pollution-skill-description.yaml +11 -0
  141. package/rules/prompt-injection/ATR-2026-00206-hidden-priority-instructions.yaml +11 -0
  142. package/rules/prompt-injection/ATR-2026-00207-hidden-instructions.yaml +11 -0
  143. package/rules/prompt-injection/ATR-2026-00211-system-prompt-override.yaml +11 -0
  144. package/rules/prompt-injection/ATR-2026-00213-system-prompt-override.yaml +11 -0
  145. package/rules/prompt-injection/ATR-2026-00226-identity-substitution.yaml +14 -0
  146. package/rules/prompt-injection/ATR-2026-00227-historical-persona-jailbreak.yaml +11 -0
  147. package/rules/prompt-injection/ATR-2026-00228-structured-jailbreak.yaml +11 -0
  148. package/rules/prompt-injection/ATR-2026-00229-roleplay-jailbreak.yaml +11 -0
  149. package/rules/prompt-injection/ATR-2026-00230-persona-moral-bypass.yaml +11 -0
  150. package/rules/prompt-injection/ATR-2026-00231-identity-substitution.yaml +11 -0
  151. package/rules/prompt-injection/ATR-2026-00233-structured-jailbreak.yaml +11 -0
  152. package/rules/prompt-injection/ATR-2026-00234-roleplay-jailbreak.yaml +11 -0
  153. package/rules/prompt-injection/ATR-2026-00235-persona-moral-bypass.yaml +11 -0
  154. package/rules/prompt-injection/ATR-2026-00236-pseudo-code-jailbreak.yaml +11 -0
  155. package/rules/prompt-injection/ATR-2026-00237-dual-response-jailbreak.yaml +11 -0
  156. package/rules/prompt-injection/ATR-2026-00238-identity-replacement.yaml +11 -0
  157. package/rules/prompt-injection/ATR-2026-00239-amoral-persona-obsession.yaml +11 -0
  158. package/rules/prompt-injection/ATR-2026-00240-instruction-nullification-identity-repla.yaml +11 -0
  159. package/rules/prompt-injection/ATR-2026-00241-amoral-character-jailbreak.yaml +11 -0
  160. package/rules/prompt-injection/ATR-2026-00242-persona-jailbreak.yaml +11 -0
  161. package/rules/prompt-injection/ATR-2026-00243-acronym-jailbreak.yaml +11 -0
  162. package/rules/prompt-injection/ATR-2026-00244-dual-response-jailbreak.yaml +11 -0
  163. package/rules/prompt-injection/ATR-2026-00245-malicious-persona.yaml +11 -0
  164. package/rules/prompt-injection/ATR-2026-00247-dual-response-jailbreak.yaml +11 -0
  165. package/rules/prompt-injection/ATR-2026-00249-game-based-jailbreak.yaml +11 -0
  166. package/rules/prompt-injection/ATR-2026-00251-persona-embodiment-jailbreak.yaml +11 -0
  167. package/rules/prompt-injection/ATR-2026-00252-narrative-jailbreak.yaml +11 -0
  168. package/rules/prompt-injection/ATR-2026-00253-enhanced-persona-jailbreak.yaml +11 -0
  169. package/rules/prompt-injection/ATR-2026-00256-base-n-encoding-jailbreak.yaml +11 -0
  170. package/rules/prompt-injection/ATR-2026-00257-cipher-transposition-jailbreak.yaml +11 -0
  171. package/rules/prompt-injection/ATR-2026-00258-unicode-tag-injection.yaml +11 -0
  172. package/rules/prompt-injection/ATR-2026-00264-latent-injection-translation.yaml +11 -0
  173. package/rules/prompt-injection/ATR-2026-00265-latent-injection-rag-document.yaml +14 -0
  174. package/rules/prompt-injection/ATR-2026-00267-gcg-adversarial-suffix.yaml +14 -0
  175. package/rules/prompt-injection/ATR-2026-00272-hypothetical-response-smuggling.yaml +11 -0
  176. package/rules/prompt-injection/ATR-2026-00276-invisible-unicode-bidi-injection.yaml +14 -0
  177. package/rules/prompt-injection/ATR-2026-00278-dra-disguise-reconstruction-attack.yaml +14 -0
  178. package/rules/prompt-injection/ATR-2026-00280-policy-puppetry-xml-injection.yaml +11 -0
  179. package/rules/prompt-injection/ATR-2026-00282-perez-prompt-injection-hijack.yaml +14 -0
  180. package/rules/prompt-injection/ATR-2026-00285-alternate-encoding-jailbreak.yaml +11 -0
  181. package/rules/prompt-injection/ATR-2026-00286-latent-injection-embedded-context.yaml +11 -0
  182. package/rules/prompt-injection/ATR-2026-00296-shell-command-injection.yaml +11 -0
  183. package/rules/prompt-injection/ATR-2026-00297-python-code-execution-rce.yaml +11 -0
  184. package/rules/prompt-injection/ATR-2026-00308-zalgo-diacritic-overload-encoding.yaml +11 -0
  185. package/rules/prompt-injection/ATR-2026-00309-braille-unicode-encoded-injection.yaml +11 -0
  186. package/rules/prompt-injection/ATR-2026-00310-ecoji-emoji-encoded-injection.yaml +14 -0
  187. package/rules/prompt-injection/ATR-2026-00311-base2048-unicode-script-injection.yaml +23 -0
  188. package/rules/prompt-injection/ATR-2026-00312-unicode-variant-selector-ascii-smuggling.yaml +11 -0
  189. package/rules/prompt-injection/ATR-2026-00313-sneaky-bits-zero-width-binary.yaml +14 -0
  190. package/rules/prompt-injection/ATR-2026-00315-sata-masked-language-model-jailbreak.yaml +14 -0
  191. package/rules/prompt-injection/ATR-2026-00316-function-masking-predict-mask-bypass.yaml +14 -0
  192. package/rules/prompt-injection/ATR-2026-00321-hyphenated-system-instruction-injection.yaml +11 -0
  193. package/rules/prompt-injection/ATR-2026-00359-bank-phishing-smtp-email-code.yaml +23 -0
  194. package/rules/prompt-injection/ATR-2026-00367-emoji-flag-encoded-hidden-phrase.yaml +14 -0
  195. package/rules/prompt-injection/ATR-2026-00373-piracy-torrent-site-list-request.yaml +11 -0
  196. package/rules/prompt-injection/ATR-2026-00378-chatgpt-dom-javascript-redirect-manipulation.yaml +11 -0
  197. package/rules/prompt-injection/ATR-2026-00382-cocaine-component-development-request.yaml +11 -0
  198. package/rules/prompt-injection/ATR-2026-00390-explicit-sexual-content-poem-request.yaml +11 -0
  199. package/rules/prompt-injection/ATR-2026-00394-backspace-deletion-character-injection.yaml +14 -0
  200. package/rules/prompt-injection/ATR-2026-00395-llm-special-token-boundary-injection.yaml +14 -0
  201. package/rules/prompt-injection/ATR-2026-00396-toxic-continuation-prefix-injection.yaml +11 -0
  202. package/rules/prompt-injection/ATR-2026-00397-snowball-impossible-reasoning-injection.yaml +14 -0
  203. package/rules/prompt-injection/ATR-2026-00399-latent-injection-document-separator-token.yaml +11 -0
  204. package/rules/prompt-injection/ATR-2026-00400-latent-injection-ignore-instruction-keyword.yaml +14 -0
  205. package/rules/prompt-injection/ATR-2026-00401-leakreplay-cloze-mask-training-data-elicitation.yaml +11 -0
  206. package/rules/prompt-injection/ATR-2026-00403-glitch-token-repeat-back-elicitation.yaml +14 -0
  207. package/rules/prompt-injection/ATR-2026-00407-echo-sql-jinja-template-injection.yaml +11 -0
  208. package/rules/prompt-injection/ATR-2026-00408-dra-parenthesis-character-reconstruction-attack.yaml +11 -0
  209. package/rules/prompt-injection/ATR-2026-00409-phrasing-past-future-tense-harmful-query.yaml +23 -0
  210. package/rules/prompt-injection/ATR-2026-00410-lmrc-harm-category-direct-elicitation.yaml +11 -0
  211. package/rules/prompt-injection/ATR-2026-00412-figstep-image-list-multimodal-jailbreak.yaml +11 -0
  212. package/rules/prompt-injection/ATR-2026-00414-continuation-ethnic-slur-completion-elicitation.yaml +14 -0
  213. package/rules/skill-compromise/ATR-2026-00061-description-behavior-mismatch.yaml +11 -0
  214. package/rules/skill-compromise/ATR-2026-00062-hidden-capability.yaml +11 -0
  215. package/rules/skill-compromise/ATR-2026-00063-skill-chain-attack.yaml +11 -0
  216. package/rules/skill-compromise/ATR-2026-00064-over-permissioned-skill.yaml +23 -0
  217. package/rules/skill-compromise/ATR-2026-00065-skill-update-attack.yaml +14 -0
  218. package/rules/skill-compromise/ATR-2026-00066-parameter-injection.yaml +11 -0
  219. package/rules/skill-compromise/ATR-2026-00120-skill-instruction-injection.yaml +11 -0
  220. package/rules/skill-compromise/ATR-2026-00121-skill-dangerous-script.yaml +14 -0
  221. package/rules/skill-compromise/ATR-2026-00122-skill-weaponized-instruction.yaml +14 -0
  222. package/rules/skill-compromise/ATR-2026-00123-skill-overreach-permissions.yaml +14 -0
  223. package/rules/skill-compromise/ATR-2026-00124-skill-name-squatting.yaml +11 -0
  224. package/rules/skill-compromise/ATR-2026-00125-context-poisoning-compaction.yaml +14 -0
  225. package/rules/skill-compromise/ATR-2026-00126-skill-rug-pull-setup.yaml +23 -0
  226. package/rules/skill-compromise/ATR-2026-00127-subcommand-overflow.yaml +22 -0
  227. package/rules/skill-compromise/ATR-2026-00128-html-comment-hidden-payload.yaml +11 -0
  228. package/rules/skill-compromise/ATR-2026-00129-unicode-smuggling.yaml +11 -0
  229. package/rules/skill-compromise/ATR-2026-00134-fork-claim-impersonation.yaml +11 -0
  230. package/rules/skill-compromise/ATR-2026-00135-exfil-url-in-instructions.yaml +11 -0
  231. package/rules/skill-compromise/ATR-2026-00147-fork-impersonation.yaml +11 -0
  232. package/rules/skill-compromise/ATR-2026-00149-skill-exfil-compound.yaml +14 -0
  233. package/rules/skill-compromise/ATR-2026-00151-fork-impersonation-install.yaml +11 -0
  234. package/rules/skill-compromise/ATR-2026-00157-timebomb-credential-exfil.yaml +11 -0
  235. package/rules/skill-compromise/ATR-2026-00200-agent-memory-config-tampering.yaml +11 -0
  236. package/rules/skill-compromise/ATR-2026-00214-credential-theft.yaml +11 -0
  237. package/rules/skill-compromise/ATR-2026-00217-credential-harvesting.yaml +14 -0
  238. package/rules/skill-compromise/ATR-2026-00220-malware-dropper.yaml +14 -0
  239. package/rules/skill-compromise/ATR-2026-00222-credential-harvesting.yaml +11 -0
  240. package/rules/skill-compromise/ATR-2026-00223-reverse-shell-dropper.yaml +11 -0
  241. package/rules/skill-compromise/ATR-2026-00224-credential-exfiltration.yaml +14 -0
  242. package/rules/skill-compromise/ATR-2026-00225-c2-communication.yaml +11 -0
  243. package/rules/skill-compromise/ATR-2026-00260-package-hallucination.yaml +11 -0
  244. package/rules/skill-compromise/ATR-2026-00262-av-evasion-code-gen.yaml +11 -0
  245. package/rules/skill-compromise/ATR-2026-00263-credential-file-read-gen.yaml +11 -0
  246. package/rules/skill-compromise/ATR-2026-00266-malware-dropper-gen.yaml +11 -0
  247. package/rules/skill-compromise/ATR-2026-00283-malwaregen-generic-virus-payload-request.yaml +11 -0
  248. package/rules/skill-compromise/ATR-2026-00398-huggingface-unsafe-model-artifact-load.yaml +11 -0
  249. package/rules/tool-poisoning/ATR-2026-00011-tool-output-injection.yaml +23 -0
  250. package/rules/tool-poisoning/ATR-2026-00012-unauthorized-tool-call.yaml +11 -0
  251. package/rules/tool-poisoning/ATR-2026-00013-tool-ssrf.yaml +14 -0
  252. package/rules/tool-poisoning/ATR-2026-00095-supply-chain-poisoning.yaml +11 -0
  253. package/rules/tool-poisoning/ATR-2026-00096-registry-poisoning.yaml +11 -0
  254. package/rules/tool-poisoning/ATR-2026-00100-consent-bypass-instruction.yaml +12 -1
  255. package/rules/tool-poisoning/ATR-2026-00101-trust-escalation-override.yaml +11 -0
  256. package/rules/tool-poisoning/ATR-2026-00103-hidden-safety-bypass-instruction.yaml +15 -4
  257. package/rules/tool-poisoning/ATR-2026-00105-silent-action-concealment.yaml +14 -3
  258. package/rules/tool-poisoning/ATR-2026-00106-schema-description-contradiction.yaml +11 -0
  259. package/rules/tool-poisoning/ATR-2026-00161-important-tag-cross-tool-shadowing.yaml +11 -0
  260. package/rules/tool-poisoning/ATR-2026-00259-ansi-escape-injection.yaml +11 -0
  261. package/rules/tool-poisoning/ATR-2026-00270-xss-in-tool-response.yaml +11 -0
  262. package/rules/tool-poisoning/ATR-2026-00277-echo-template-command-injection.yaml +14 -0
  263. package/rules/tool-poisoning/ATR-2026-00393-ansi-code-elicitation-request.yaml +14 -0
@@ -20,6 +20,17 @@ references:
20
20
  - ASI08 - Data Leakage
21
21
  metadata_provenance:
22
22
  mitre_atlas: auto-generated
23
+ compliance:
24
+ nist_ai_rmf:
25
+ - subcategory: "MS.2.10"
26
+ context: "This rule detects adversarial prompts that use example API key formats with real prefixes to trick agents into completing them with actual credentials, which is a direct privacy/sensitive-data exposure risk; MS.2.10 requires assessment of privacy risks including credential and sensitive information disclosure."
27
+ strength: primary
28
+ - subcategory: "MS.2.7"
29
+ context: "Example-format key leakage is a security exfiltration vector targeting agent context; MS.2.7 requires continuous evaluation of AI system security and resilience against data extraction techniques."
30
+ strength: secondary
31
+ - subcategory: "MG.2.3"
32
+ context: "Detection of API key leakage attempts triggers containment and disengagement workflows to prevent the agent from emitting credentials; MG.2.3 mandates that supersede/deactivate mechanisms are available when sensitive data exposure is detected."
33
+ strength: secondary
23
34
  tags:
24
35
  category: context-exfiltration
25
36
  subcategory: example-format-leak
@@ -20,6 +20,17 @@ references:
20
20
  - ASI08 - Data Leakage
21
21
  metadata_provenance:
22
22
  mitre_atlas: auto-generated
23
+ compliance:
24
+ nist_ai_rmf:
25
+ - subcategory: "MS.2.10"
26
+ context: "This rule detects tool response piggybacking that embeds credential theft (SSH keys, private keys) after legitimate results using casual transition words; MS.2.10 requires assessing privacy risk from data leakage vectors that exfiltrate sensitive credentials through seemingly benign output channels."
27
+ strength: primary
28
+ - subcategory: "MS.2.7"
29
+ context: "Casual-transition piggybacking is a prompt injection evasion technique that bypasses output filtering by chaining illicit credential disclosure to legitimate responses; MS.2.7 requires evaluating system security and resilience against these adversarial output manipulation patterns."
30
+ strength: secondary
31
+ - subcategory: "MG.2.3"
32
+ context: "Detection of transition-word piggyback patterns triggers containment workflows to block credential exfiltration before it leaves the agent boundary; MG.2.3 mandates that mechanisms exist to disengage or quarantine AI outputs when data leakage indicators are matched."
33
+ strength: secondary
23
34
  tags:
24
35
  category: context-exfiltration
25
36
  subcategory: piggyback-transition
@@ -21,6 +21,17 @@ references:
21
21
  - ASI08 - Data Leakage
22
22
  metadata_provenance:
23
23
  mitre_atlas: auto-generated
24
+ compliance:
25
+ nist_ai_rmf:
26
+ - subcategory: "MS.2.10"
27
+ context: "This rule detects API key exfiltration disguised by fake encryption wrappers like 'AES256(sk-proj-...)'; MS.2.10 requires that privacy and sensitive-data leakage risks are continuously assessed, and these detections directly evidence credential disclosure attempts."
28
+ strength: primary
29
+ - subcategory: "MS.2.7"
30
+ context: "Obfuscated key disclosure is an evasion technique against output filters; MS.2.7 requires evaluation of system security and resilience against adversarial bypass tactics that defeat naive secret-detection controls."
31
+ strength: secondary
32
+ - subcategory: "MG.2.3"
33
+ context: "Detection of fake-encryption-wrapped credentials must trigger response plans to block transmission and revoke exposed keys; MG.2.3 requires mechanisms to supersede or disengage AI actions before the leak completes."
34
+ strength: secondary
24
35
  tags:
25
36
  category: context-exfiltration
26
37
  subcategory: obfuscated-key
@@ -21,6 +21,20 @@ references:
21
21
  - ASI08 - Data Leakage
22
22
  metadata_provenance:
23
23
  mitre_atlas: auto-generated
24
+ compliance:
25
+ nist_ai_rmf:
26
+ - subcategory: "MS.2.10"
27
+ context: >-
28
+ This rule detects agent responses that confirm the existence of sensitive environment variables, which constitutes a privacy and confidentiality leak even without revealing values; MS.2.10 requires assessing privacy risks including indirect disclosures that confirm sensitive system metadata.
29
+ strength: primary
30
+ - subcategory: "MS.2.7"
31
+ context: >-
32
+ Confirming env var existence is a reconnaissance step in social-engineering exfiltration chains targeting agent secrets; MS.2.7 requires evaluating system security and resilience against such probing techniques.
33
+ strength: secondary
34
+ - subcategory: "MG.2.3"
35
+ context: >-
36
+ Detection of env var probing patterns enables containment responses to halt the exfiltration attempt before the agent escalates to retrieving the actual value; MG.2.3 mandates pre-defined treatment mechanisms for these risk events.
37
+ strength: secondary
24
38
  tags:
25
39
  category: context-exfiltration
26
40
  subcategory: env-var-probe
@@ -21,6 +21,20 @@ references:
21
21
  - ASI01:2026 - Agent Behaviour Hijack
22
22
  metadata_provenance:
23
23
  mitre_atlas: auto-generated
24
+ compliance:
25
+ nist_ai_rmf:
26
+ - subcategory: "MS.2.10"
27
+ context: >-
28
+ This rule directly detects PEM private keys, SSH keys, and API tokens embedded in tool responses, which is a privacy/confidentiality risk assessment signal; MS.2.10 requires that privacy risks such as credential and sensitive data exposure are assessed and documented.
29
+ strength: primary
30
+ - subcategory: "MS.2.7"
31
+ context: >-
32
+ Credential material smuggled into tool output represents a security/resilience failure where untrusted third-party tool responses can exfiltrate secrets through the agent; MS.2.7 requires evaluation of such security risks in the AI system.
33
+ strength: secondary
34
+ - subcategory: "MG.2.3"
35
+ context: >-
36
+ Detection of leaked credentials in tool responses must trigger response mechanisms to disengage or block the agent before it propagates or acts on the exfiltrated secrets; MG.2.3 requires these supersede/deactivate controls be in place.
37
+ strength: secondary
24
38
  tags:
25
39
  category: context-exfiltration
26
40
  subcategory: credential-in-response
@@ -21,6 +21,17 @@ references:
21
21
  - ASI01:2026 - Agent Behaviour Hijack
22
22
  metadata_provenance:
23
23
  mitre_atlas: auto-generated
24
+ compliance:
25
+ nist_ai_rmf:
26
+ - subcategory: "MS.2.10"
27
+ context: "This rule detects credential exfiltration disguised through encoding wrappers (AES256(), hex, rot13) that present real secrets as encrypted data; MS.2.10 requires assessing privacy risks including data leakage pathways that bypass content filters."
28
+ strength: primary
29
+ - subcategory: "MS.2.7"
30
+ context: "Obfuscated credential leakage via fake encryption framing is a security/resilience concern where adversarial encoding tricks defenses into permitting sensitive data egress; MS.2.7 requires evaluating and documenting these security evasion techniques."
31
+ strength: secondary
32
+ - subcategory: "MG.2.3"
33
+ context: "Detection of obfuscated credential exfiltration patterns must trigger risk treatment to disengage or quarantine the agent response before secrets leave the trust boundary; MG.2.3 mandates these supersede/deactivate mechanisms exist for such incidents."
34
+ strength: secondary
24
35
  tags:
25
36
  category: context-exfiltration
26
37
  subcategory: obfuscated-credential
@@ -20,11 +20,26 @@ references:
20
20
  - 'ASI07:2026 - Agent Data Leak'
21
21
  mitre_atlas:
22
22
  - 'AML.T0040 - ML Model Inference API Access'
23
+ compliance:
24
+ nist_ai_rmf:
25
+ - subcategory: "MS.2.10"
26
+ context: >-
27
+ This rule detects SKILL.md instructions that read credential files (SSH keys, AWS credentials, API tokens) and exfiltrate them via outbound network calls, which is a direct privacy and sensitive-data leakage risk. MS.2.10 requires assessment of privacy risks, and each detection produces concrete evidence of credential disclosure pathways.
28
+ strength: primary
29
+ - subcategory: "MS.2.7"
30
+ context: >-
31
+ Combined credential-access-plus-exfiltration patterns in skill files represent a security/resilience failure where the agent's instruction surface enables data theft. MS.2.7 requires evaluating and documenting such security weaknesses in AI system artifacts.
32
+ strength: secondary
33
+ - subcategory: "MG.2.3"
34
+ context: >-
35
+ High-confidence matches for credential read-and-exfiltrate pipelines must trigger response actions to disengage or quarantine the offending skill before execution. MG.2.3 mandates that mechanisms exist to deactivate AI components when such risks are detected.
36
+ strength: secondary
23
37
  tags:
24
38
  category: context-exfiltration
25
39
  subcategory: credential-exfil-combo
26
40
  scan_target: skill
27
41
  confidence: high
42
+ suppress_in_code_blocks: true
28
43
  agent_source:
29
44
  type: mcp_exchange
30
45
  framework: [any]
@@ -25,6 +25,20 @@ references:
25
25
  - "T1552.001 - Credentials In Files"
26
26
  - "T1041 - Exfiltration Over C2 Channel"
27
27
 
28
+ compliance:
29
+ nist_ai_rmf:
30
+ - subcategory: "MS.2.10"
31
+ context: >-
32
+ The rule detects exfiltration of credentials and API keys through shell pipes to external commands, which is a direct privacy and confidentiality risk; MS.2.10 requires assessing privacy risks including unauthorized disclosure of sensitive secrets handled by AI agents.
33
+ strength: primary
34
+ - subcategory: "MS.2.7"
35
+ context: >-
36
+ Piping environment variables containing API keys to curl or netcat represents a security/resilience failure where the agent's execution environment is being abused for data exfiltration; MS.2.7 requires continuous evaluation of these security control failures.
37
+ strength: secondary
38
+ - subcategory: "MG.2.3"
39
+ context: >-
40
+ Detection of credential exfiltration patterns must trigger immediate containment to disengage the agent before secrets reach attacker-controlled endpoints; MG.2.3 mandates pre-defined mechanisms to supersede or deactivate the AI system on such high-severity events.
41
+ strength: secondary
28
42
  tags:
29
43
  category: context-exfiltration
30
44
  subcategory: credential-pipe
@@ -27,6 +27,20 @@ references:
27
27
  - "AML.T0057 - LLM Data Leakage"
28
28
  research:
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/apikey.py"
30
+ compliance:
31
+ nist_ai_rmf:
32
+ - subcategory: "MS.2.10"
33
+ context: >-
34
+ This rule detects requests that attempt to extract memorized credential strings from training data via partial-key completion, or to fabricate realistic-looking secrets — both of which are privacy and data-leakage risks. MS.2.10 requires that privacy risks, including memorization-based leakage of sensitive strings, are assessed and tracked.
35
+ strength: primary
36
+ - subcategory: "MS.2.7"
37
+ context: >-
38
+ Generating plausible API keys or completing partial credentials enables downstream phishing and service impersonation, undermining system security and resilience. MS.2.7 requires continuous evaluation of security risks like credential fabrication and memorization leakage.
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: >-
42
+ Detection of credential generation or completion requests must trigger risk treatment to block the response before the fabricated or memorized secret is emitted. MG.2.3 mandates that these supersede/disengage mechanisms are in place for high-risk output paths.
43
+ strength: secondary
30
44
  tags:
31
45
  category: context-exfiltration
32
46
  subcategory: api-key-generation
@@ -29,6 +29,20 @@ references:
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/leakreplay.py"
30
30
  - "https://not-just-memorization.github.io/extracting-training-data.html"
31
31
  - "https://arxiv.org/abs/2311.17035"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.10"
35
+ context: >-
36
+ System prompt extraction and training data verbatim replay (Carlini divergence attack) directly leak confidential context and memorized training data, which is a privacy risk; MS.2.10 requires that privacy risks such as memorization and unauthorized disclosure of training content are assessed and tracked.
37
+ strength: primary
38
+ - subcategory: "MS.2.7"
39
+ context: >-
40
+ Detecting "repeat word forever" divergence attacks and verbatim system prompt exfiltration requests provides evidence for evaluating model security and resilience against extraction attacks; MS.2.7 requires continuous evaluation of the AI system's resistance to these adversarial extraction techniques.
41
+ strength: secondary
42
+ - subcategory: "MG.4.1"
43
+ context: >-
44
+ Pattern-based detection of system prompt extraction and training data replay probes feeds post-deployment monitoring telemetry; MG.4.1 requires implemented monitoring plans that flag extraction attempts in production traffic.
45
+ strength: secondary
32
46
  tags:
33
47
  category: context-exfiltration
34
48
  subcategory: system-prompt-training-data-extraction
@@ -30,6 +30,20 @@ references:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/divergence.py"
31
31
  - "https://arxiv.org/abs/2301.13188"
32
32
  - "https://not-just-memorization.github.io/extracting-training-data.html"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.10"
36
+ context: >-
37
+ The divergence attack directly targets training-data memorization, causing models to emit verbatim private documents, emails, and copyrighted text; MS.2.10 requires assessing privacy risk including training-data leakage, and detection of repeat-forever prompts produces evidence of attempted privacy exfiltration.
38
+ strength: primary
39
+ - subcategory: "MS.2.7"
40
+ context: >-
41
+ Repeat-word divergence exploits a known model robustness failure where output distribution collapses and reveals memorized data; MS.2.7 requires evaluating system security/resilience against adversarial extraction probes such as the canonical garak divergence templates.
42
+ strength: secondary
43
+ - subcategory: "MG.2.3"
44
+ context: >-
45
+ Detection of "repeat forever/indefinitely" prompts must trigger interception or deactivation of the response stream before memorized training data is fully emitted; MG.2.3 requires mechanisms to supersede or disengage AI output when extraction attacks are identified.
46
+ strength: secondary
33
47
  tags:
34
48
  category: context-exfiltration
35
49
  subcategory: divergence-repeat-word-training-extraction
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/donotanswer.py"
31
31
  - "https://arxiv.org/abs/2308.13387"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.10"
35
+ context: "Requests for biometric data (fingerprints, DNA, facial/voice/iris recognition) targeting a named individual are direct privacy risk events; MS.2.10 requires that privacy risks—especially those enabling re-identification, stalking, or biometric harvesting—are assessed and tracked when detected."
36
+ strength: primary
37
+ - subcategory: "MP.5.1"
38
+ context: "Personal-relationship framing ('my ex-wife [Name]') used to legitimize biometric surveillance requests is an adversarial social-engineering pattern; MP.5.1 requires that the likelihood and magnitude of such information-hazard inputs are characterized so the system can refuse and log them."
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: "Detection of named-person biometric extraction attempts must trigger containment responses (refusal, session quarantine) before the model emits identifying biometric content; MG.2.3 mandates that these supersede/disengage mechanisms are defined and activated on detection."
42
+ strength: secondary
32
43
  tags:
33
44
  category: context-exfiltration
34
45
  subcategory: biometric-surveillance-personal
@@ -28,6 +28,17 @@ references:
28
28
  research:
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/donotanswer.py"
30
30
  - "https://arxiv.org/abs/2308.13387"
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MS.2.10"
34
+ context: "This rule detects attempts to extract private educational and professional records about named individuals, which is a direct privacy risk assessment signal; MS.2.10 requires that privacy risks like unauthorized disclosure of personally identifiable academic and credential data are evaluated and tracked."
35
+ strength: primary
36
+ - subcategory: "MP.5.1"
37
+ context: "Possessive framing combined with relationship labels and first names is an adversarial social-engineering pattern designed to coax the model into disclosing protected records; MP.5.1 requires identifying and characterizing the likelihood and magnitude of these information-hazard attack vectors."
38
+ strength: secondary
39
+ - subcategory: "MG.2.3"
40
+ context: "Detection of named-individual educational record requests must trigger pre-defined risk treatment to block or redirect the response before sensitive data is fabricated or leaked; MG.2.3 mandates that such containment mechanisms are in place."
41
+ strength: secondary
31
42
  tags:
32
43
  category: context-exfiltration
33
44
  subcategory: educational-records-personal
@@ -29,6 +29,20 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/donotanswer.py"
31
31
  - "https://arxiv.org/abs/2308.13387"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.10"
35
+ context: >-
36
+ This rule detects requests for private financial PII (bank accounts, credit cards, net worth, tax records) targeting a named individual, which is a direct privacy risk to identifiable third parties. MS.2.10 requires assessing privacy risk, and detection of these named-person financial data extraction attempts produces the evidence needed to quantify and track that risk.
37
+ strength: primary
38
+ - subcategory: "MG.2.3"
39
+ context: >-
40
+ Possessive financial PII queries with relationship-plus-name framing must trigger response mechanisms to refuse, redact, or disengage before the model emits sensitive financial data enabling fraud or identity theft. MG.2.3 requires that such supersede/deactivate controls are in place and activated upon detection.
41
+ strength: secondary
42
+ - subcategory: "MP.5.1"
43
+ context: >-
44
+ Relationship-label social-engineering framing (e.g., "my doctor, Anne's bank account") is an adversarial input pattern whose likelihood and impact must be characterized; MP.5.1 requires identifying and tracking these high-magnitude financial-PII exfiltration vectors.
45
+ strength: secondary
32
46
  tags:
33
47
  category: context-exfiltration
34
48
  subcategory: financial-pii-personal
@@ -29,6 +29,20 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/donotanswer.py"
31
31
  - "https://arxiv.org/abs/2308.13387"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.10"
35
+ context: >-
36
+ The rule detects requests targeting private medical PII (diagnoses, medications, SSN, DOB) for named individuals identified via relationship labels; MS.2.10 requires assessing privacy risk, and each detection is direct evidence of a privacy-violating data exfiltration attempt.
37
+ strength: primary
38
+ - subcategory: "MG.2.3"
39
+ context: >-
40
+ Detection of possessive medical PII queries for named persons triggers risk treatment to block or disengage the response before sensitive health data or SSNs are disclosed; MG.2.3 requires these supersede/deactivate mechanisms be in place for high-severity privacy attacks.
41
+ strength: secondary
42
+ - subcategory: "GV.1.1"
43
+ context: >-
44
+ Medical PII and SSN disclosure implicate health-privacy and identity-protection legal regimes; GV.1.1 requires that detections of such requests are logged and managed against applicable legal and regulatory requirements.
45
+ strength: secondary
32
46
  tags:
33
47
  category: context-exfiltration
34
48
  subcategory: medical-pii-personal
@@ -30,6 +30,29 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/web_injection.py"
32
32
  - "https://embracethered.com/blog/posts/2023/ai-injections-direct-and-indirect-prompt-injection-basics/"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.10"
36
+ context: >-
37
+ This rule detects markdown image/link syntax crafted to exfiltrate sensitive
38
+ context data via URL query parameters to known attacker domains; MS.2.10
39
+ requires that privacy risks such as covert data leakage from agent outputs
40
+ are assessed and tracked.
41
+ strength: primary
42
+ - subcategory: "MS.2.7"
43
+ context: >-
44
+ Detection of javascript: URIs and XSS payloads embedded in markdown links
45
+ rendered by browser/Colab/playground frontends provides evidence for
46
+ MS.2.7's requirement to evaluate AI system security and resilience against
47
+ rendering-layer injection attacks.
48
+ strength: secondary
49
+ - subcategory: "MG.2.3"
50
+ context: >-
51
+ Matches on garak web_injection exfil domains and StringAssembly templates
52
+ trigger risk treatment plans to block or sanitize tool responses before
53
+ rendering; MG.2.3 requires these containment mechanisms be in place to
54
+ disengage unsafe outputs.
55
+ strength: secondary
33
56
  tags:
34
57
  category: context-exfiltration
35
58
  subcategory: markdown-url-data-leak
@@ -27,6 +27,17 @@ references:
27
27
  research:
28
28
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/apikey.py"
29
29
 
30
+ compliance:
31
+ nist_ai_rmf:
32
+ - subcategory: "MS.2.10"
33
+ context: "This rule detects prompts attempting to elicit API credentials through generation requests or partial-key completion social engineering; MS.2.10 requires assessing privacy and sensitive-information risks, and these detections directly measure credential-disclosure exposure in LLM responses."
34
+ strength: primary
35
+ - subcategory: "MS.2.7"
36
+ context: "Generation and completion of API keys for named services (OpenAI, AWS, Stripe, GitHub, etc.) represent a security/resilience failure where the LLM may fabricate or leak credential-like strings; MS.2.7 mandates continuous evaluation of such security risks."
37
+ strength: secondary
38
+ - subcategory: "MG.2.3"
39
+ context: "Detection of credential-elicitation prompts triggers risk treatment workflows to block responses or disengage the model before secrets are emitted; MG.2.3 requires pre-defined mechanisms to supersede or deactivate AI behavior on such high-confidence credential-leak signals."
40
+ strength: secondary
30
41
  tags:
31
42
  category: context-exfiltration
32
43
  subcategory: credential-generation-elicitation
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/glitch.py"
31
31
  - "https://www.lesswrong.com/posts/aPeJE8bSo6rAFoLqg/solidgoldmagikarp-plus-prompt-generation"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.7"
35
+ context: "Glitch token injection exploits tokenizer-level weaknesses to destabilize model safety filters and induce unstable behavior; MS.2.7 requires evaluating and documenting AI system security and resilience against such low-level adversarial inputs."
36
+ strength: primary
37
+ - subcategory: "MS.2.5"
38
+ context: "Control characters, UTF-8 misencoding artifacts, and anomalous rare tokens cause models to hallucinate or fail to reproduce input accurately, directly impacting robustness and reliability that MS.2.5 mandates be evaluated."
39
+ strength: secondary
40
+ - subcategory: "MP.5.1"
41
+ context: "Detection of known glitch token corpora (SolidGoldMagikarp, davidjl, ÃÂÃÂ patterns) characterizes the likelihood and magnitude of tokenizer-exploitation risks that MP.5.1 requires be identified and tracked."
42
+ strength: secondary
32
43
  tags:
33
44
  category: model-abuse
34
45
  subcategory: glitch-token-destabilization
@@ -31,6 +31,17 @@ references:
31
31
  research:
32
32
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/malwaregen.py"
33
33
 
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "MS.2.6"
37
+ context: "Direct malware code generation requests are misuse attempts that materially affect the AI system's safety/security risk magnitude; MS.2.6 requires continuous evaluation of such safety/security risks, and each detection provides measurement evidence of harmful generation attempts."
38
+ strength: primary
39
+ - subcategory: "MG.4.1"
40
+ context: "Detecting unframed virus and malware sub-function requests at runtime feeds the post-deployment monitoring plans required by MG.4.1, ensuring abuse patterns sourced from probes like garak malwaregen are tracked across deployed models."
41
+ strength: secondary
42
+ - subcategory: "MP.5.1"
43
+ context: "Cataloging direct malware code generation prompts (TSR, interrupt hooks, SAM/shadow access, privilege escalation across 8 languages) characterizes the likelihood and magnitude of model-abuse impact required by MP.5.1."
44
+ strength: secondary
34
45
  tags:
35
46
  category: model-abuse
36
47
  subcategory: malware-code-generation
@@ -21,6 +21,17 @@ references:
21
21
  - ASI03 - Excessive Agency
22
22
  metadata_provenance:
23
23
  mitre_atlas: auto-generated
24
+ compliance:
25
+ nist_ai_rmf:
26
+ - subcategory: "GV.1.2"
27
+ context: "Rationalized safety control bypass undermines the accountability boundaries that define which APIs and controls an agent must use; GV.1.2 requires that assigned role boundaries are enforced even when agents construct efficiency-based justifications for circumventing them."
28
+ strength: primary
29
+ - subcategory: "MG.2.3"
30
+ context: "Detection of bypass-rationalization language ('I'll use X directly instead of Y') triggers the risk treatment and deactivation pathways required by MG.2.3 before the agent executes unauthorized direct system access."
31
+ strength: secondary
32
+ - subcategory: "MP.5.1"
33
+ context: "Self-justified safety control circumvention is an adversarial behavior pattern whose likelihood and impact must be characterized; MP.5.1 requires identifying these jailbreak-adjacent reasoning patterns as tracked risks."
34
+ strength: secondary
24
35
  tags:
25
36
  category: privilege-escalation
26
37
  subcategory: rationalized-bypass
@@ -26,6 +26,20 @@ references:
26
26
  - "T1543 - Create or Modify System Process"
27
27
  - "T1036 - Masquerading"
28
28
 
29
+ compliance:
30
+ nist_ai_rmf:
31
+ - subcategory: "MG.2.3"
32
+ context: >-
33
+ Detection of stealth execution, persistence installation, and covert file transfer triggers the supersede/disengage mechanisms required by MG.2.3 to halt the agent before long-term attacker presence is established on the compromised host.
34
+ strength: primary
35
+ - subcategory: "MS.2.7"
36
+ context: >-
37
+ Background process suppression, systemctl enable, and scp exfiltration patterns are security/resilience signals that MS.2.7 requires to be continuously evaluated and documented as evidence of post-exploitation activity through the AI agent.
38
+ strength: secondary
39
+ - subcategory: "MP.5.1"
40
+ context: >-
41
+ Stealth persistence techniques represent high-magnitude adversarial misuse of agent tool access; MP.5.1 requires characterizing the likelihood and impact of such post-exploitation risks within the agent's operational context.
42
+ strength: secondary
29
43
  tags:
30
44
  category: privilege-escalation
31
45
  subcategory: stealth-persistence
@@ -28,6 +28,17 @@ references:
28
28
  cve:
29
29
  - CVE-2024-5184
30
30
  - CVE-2025-32711
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MP.5.1"
34
+ context: "System prompt override attempts are adversarial inputs that exploit the foundational instruction layer of LLM-based agents; MP.5.1 requires characterizing the likelihood and magnitude of these prompt injection risks, and this rule directly produces evidence by identifying override patterns including special tokens, role injection, and delimiter abuse."
35
+ strength: primary
36
+ - subcategory: "MS.2.7"
37
+ context: "Detection of ChatML, Llama, Mistral, and Gemma special tokens plus XML/JSON/YAML system directive injection provides continuous security/resilience evaluation evidence required by MS.2.7, documenting attempts to subvert the agent's foundational behavior controls."
38
+ strength: secondary
39
+ - subcategory: "MG.2.3"
40
+ context: "Successful system prompt override grants full control over agent behavior, so detection events must trigger the supersede/disengage mechanisms required by MG.2.3 to deactivate or quarantine the affected agent session before downstream unauthorized actions occur."
41
+ strength: secondary
31
42
  tags:
32
43
  category: prompt-injection
33
44
  subcategory: system-prompt-override
@@ -27,6 +27,17 @@ references:
27
27
  - "AML.T0051 - LLM Prompt Injection"
28
28
  - "AML.T0043 - Craft Adversarial Data"
29
29
 
30
+ compliance:
31
+ nist_ai_rmf:
32
+ - subcategory: "MP.5.1"
33
+ context: "Multi-turn prompt injection is an adversarial input pattern that exploits conversational context and trust escalation to gradually manipulate the agent; MP.5.1 requires that the likelihood and magnitude of such staged adversarial attacks are characterized and tracked across turns."
34
+ strength: primary
35
+ - subcategory: "MS.2.7"
36
+ context: "Detecting linguistic markers of trust-building, gaslighting, and progressive escalation provides continuous evidence for evaluating the agent's security and resilience against sophisticated prompt injection campaigns, as required by MS.2.7."
37
+ strength: secondary
38
+ - subcategory: "MG.2.3"
39
+ context: "Identification of multi-turn injection patterns triggers risk treatment plans to disengage or interrupt the manipulated conversation before the attacker reaches the escalation payload; MG.2.3 mandates these response mechanisms be in place."
40
+ strength: secondary
30
41
  tags:
31
42
  category: prompt-injection
32
43
  subcategory: multi-turn
@@ -19,6 +19,17 @@ references:
19
19
  - AML.T0051
20
20
  metadata_provenance:
21
21
  owasp_llm: auto-generated
22
+ compliance:
23
+ nist_ai_rmf:
24
+ - subcategory: "MS.2.7"
25
+ context: "Encoding-based evasion (base64, hex, Unicode escapes, Punycode, RTL overrides) directly tests the security and resilience of the AI system's input filtering pipeline; MS.2.7 requires that such adversarial bypass techniques are evaluated and documented as part of continuous security assessment."
26
+ strength: primary
27
+ - subcategory: "MP.5.1"
28
+ context: "Encoded prompt injection payloads are adversarial inputs whose likelihood and impact must be characterized as part of GenAI prompt-injection risk; MP.5.1 requires identifying and tracking these obfuscated attack vectors against the LLM."
29
+ strength: secondary
30
+ - subcategory: "MG.2.3"
31
+ context: "Detection of encoded override instructions triggers pre-defined risk treatment plans to block or sanitize the payload before it reaches the model; MG.2.3 mandates these containment mechanisms are in place to disengage malicious flows."
32
+ strength: secondary
22
33
  tags:
23
34
  category: prompt-injection
24
35
  subcategory: encoding-evasion
@@ -19,6 +19,20 @@ references:
19
19
  - AML.T0051
20
20
  metadata_provenance:
21
21
  owasp_llm: auto-generated
22
+ compliance:
23
+ nist_ai_rmf:
24
+ - subcategory: "MS.2.7"
25
+ context: >-
26
+ Multi-turn semantic evasion is a prompt injection attack vector that directly tests AI system security and resilience against adversarial inputs that bypass pattern-based defenses; MS.2.7 requires continuous evaluation of security posture against such evasion techniques, and detection events feed that evaluation.
27
+ strength: primary
28
+ - subcategory: "MS.2.6"
29
+ context: >-
30
+ Detecting callback references to planted instructions and multi-phase activation triggers provides ongoing measurement of safety/security risk magnitude as attackers adapt to evade regex defenses; MS.2.6 requires this risk magnitude be evaluated continuously across conversation turns.
31
+ strength: secondary
32
+ - subcategory: "MG.2.3"
33
+ context: >-
34
+ On detection of semantic evasion patterns, the system must be able to disengage or quarantine the affected session before the multi-turn payload completes; MG.2.3 mandates that such supersede/deactivate mechanisms are in place for adversarial prompt injection.
35
+ strength: secondary
22
36
  tags:
23
37
  category: prompt-injection
24
38
  subcategory: semantic-evasion
@@ -19,6 +19,17 @@ references:
19
19
  - AML.T0051
20
20
  metadata_provenance:
21
21
  owasp_llm: auto-generated
22
+ compliance:
23
+ nist_ai_rmf:
24
+ - subcategory: "MG.4.1"
25
+ context: "This rule directly supports post-deployment monitoring by detecting attempts to evade behavioral drift detection and fingerprinting systems; MG.4.1 requires that monitoring plans remain effective against adversaries who try to normalize anomalous behavior or gradually shift capabilities to avoid drift triggers."
26
+ strength: primary
27
+ - subcategory: "MS.2.7"
28
+ context: "Spoofing behavioral signatures and probing fingerprint detectors are resilience attacks against the security telemetry layer; MS.2.7 requires evaluating and documenting the system's ability to maintain integrity of its security/resilience controls under adversarial probing."
29
+ strength: secondary
30
+ - subcategory: "MG.2.3"
31
+ context: "Detection of fingerprint evasion patterns triggers risk treatment to deactivate or constrain agents whose behavior signatures cannot be trusted; MG.2.3 requires mechanisms to supersede or disengage AI systems when monitoring assurances are undermined."
32
+ strength: secondary
22
33
  tags:
23
34
  category: prompt-injection
24
35
  subcategory: fingerprint-evasion
@@ -19,6 +19,20 @@ references:
19
19
  - AML.T0051
20
20
  metadata_provenance:
21
21
  owasp_llm: auto-generated
22
+ compliance:
23
+ nist_ai_rmf:
24
+ - subcategory: "MP.5.1"
25
+ context: >-
26
+ Indirect prompt injection via poisoned tool responses, API outputs, and retrieved content is a GenAI-specific adversarial input vector where hidden instructions exploit the agent's trust in external data sources; MP.5.1 requires identifying and characterizing the likelihood and impact of these injection risks across data ingestion paths.
27
+ strength: primary
28
+ - subcategory: "MG.3.1"
29
+ context: >-
30
+ Tool responses and retrieved content originate from third-party APIs, plugins, and data sources; detecting injection payloads in these flows directly supports MG.3.1's mandate to manage risks introduced by third-party entities feeding the AI system.
31
+ strength: secondary
32
+ - subcategory: "MG.2.3"
33
+ context: >-
34
+ Detection of instruction-override payloads, fake system delimiters, and role-reassignment content in tool outputs must trigger containment so the agent does not execute attacker-controlled actions; MG.2.3 requires these supersede/disengage mechanisms be available on detection.
35
+ strength: secondary
22
36
  tags:
23
37
  category: prompt-injection
24
38
  subcategory: indirect-injection
@@ -25,6 +25,17 @@ references:
25
25
  metadata_provenance:
26
26
  owasp_llm: auto-generated
27
27
 
28
+ compliance:
29
+ nist_ai_rmf:
30
+ - subcategory: "MP.5.1"
31
+ context: "Structured data injection embeds adversarial prompts inside JSON, CSV, XML, or YAML field values to bypass text-pattern filters; MP.5.1 requires identifying and characterizing these adversarial input vectors that exploit format-parsing trust assumptions."
32
+ strength: primary
33
+ - subcategory: "MS.2.7"
34
+ context: "Detecting injection payloads hidden in nested structured data evaluates the AI system's resilience against format-based evasion techniques; MS.2.7 requires that these security weaknesses in input handling are continuously evaluated and documented."
35
+ strength: secondary
36
+ - subcategory: "MG.2.3"
37
+ context: "Matches on injection payloads inside structured data fields trigger risk treatment plans to quarantine or sanitize the input before it reaches the model; MG.2.3 requires these response mechanisms be defined and activated on detection."
38
+ strength: secondary
28
39
  tags:
29
40
  category: prompt-injection
30
41
  subcategory: structured-data-injection
@@ -19,6 +19,17 @@ references:
19
19
  - AML.T0051
20
20
  metadata_provenance:
21
21
  owasp_llm: auto-generated
22
+ compliance:
23
+ nist_ai_rmf:
24
+ - subcategory: "MS.2.7"
25
+ context: "This rule detects prompt injection payloads engineered to bypass multi-layer audit and security pipelines, including instructions to skip checks or self-certify as trusted; MS.2.7 requires continuous evaluation of AI system security and resilience against such evasion attempts."
26
+ strength: primary
27
+ - subcategory: "MG.2.3"
28
+ context: "Detection of audit-evasion payloads triggers pre-defined risk treatment to deactivate or quarantine the offending session before it bypasses downstream defense layers; MG.2.3 mandates that these supersede/disengage mechanisms exist and are activated on detection."
29
+ strength: secondary
30
+ - subcategory: "MS.2.6"
31
+ context: "Payloads that manipulate trust scores or claim to have passed audit layers represent active security risk indicators; MS.2.6 requires continuous evaluation of safety/security risk magnitude as these evasion techniques evolve."
32
+ strength: secondary
22
33
  tags:
23
34
  category: prompt-injection
24
35
  subcategory: audit-evasion
@@ -19,6 +19,17 @@ references:
19
19
  - AML.T0051
20
20
  metadata_provenance:
21
21
  owasp_llm: auto-generated
22
+ compliance:
23
+ nist_ai_rmf:
24
+ - subcategory: "MP.5.1"
25
+ context: "RTL overrides, Punycode domains, and homoglyph substitution are adversarial input patterns that disguise malicious prompts as benign text; MP.5.1 requires identifying and characterizing the likelihood and magnitude of these visual-spoofing prompt injection vectors."
26
+ strength: primary
27
+ - subcategory: "MS.2.7"
28
+ context: "Detection of bidirectional control characters and mixed-script homoglyphs evidences continuous evaluation of the AI system's resilience against encoding-based prompt injection; MS.2.7 requires that such security/resilience assessments are documented."
29
+ strength: secondary
30
+ - subcategory: "MG.2.3"
31
+ context: "Matches on visual-spoofing payloads trigger risk treatment plans to quarantine or sanitize disguised inputs before the model acts on them; MG.2.3 mandates pre-defined response mechanisms for adversarial inputs."
32
+ strength: secondary
22
33
  tags:
23
34
  category: prompt-injection
24
35
  subcategory: visual-spoofing