agent-threat-rules 2.0.18 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268) hide show
  1. package/package.json +1 -1
  2. package/rules/agent-manipulation/ATR-2026-00118-approval-fatigue.yaml +12 -1
  3. package/rules/agent-manipulation/ATR-2026-00268-tense-framing-bypass.yaml +11 -0
  4. package/rules/agent-manipulation/ATR-2026-00287-threaten-json-coercive-output-threat.yaml +11 -0
  5. package/rules/agent-manipulation/ATR-2026-00288-false-premise-injection.yaml +11 -0
  6. package/rules/agent-manipulation/ATR-2026-00301-tap-tree-of-attacks-jailbreak.yaml +14 -0
  7. package/rules/agent-manipulation/ATR-2026-00302-anti-dan-inverted-filter-persona.yaml +11 -0
  8. package/rules/agent-manipulation/ATR-2026-00303-devmode-ranti-profanity-coercion.yaml +11 -0
  9. package/rules/agent-manipulation/ATR-2026-00304-chatgpt-image-unlocker-markdown-injection.yaml +11 -0
  10. package/rules/agent-manipulation/ATR-2026-00305-dan-mode-ablation-benchmark-coercion.yaml +11 -0
  11. package/rules/agent-manipulation/ATR-2026-00306-autodan-genetic-jailbreak-suffix.yaml +14 -0
  12. package/rules/agent-manipulation/ATR-2026-00307-inthewild-jailbreak-corpus-signature.yaml +14 -0
  13. package/rules/agent-manipulation/ATR-2026-00314-amoral-unfiltered-custom-persona-jailbreak.yaml +14 -0
  14. package/rules/agent-manipulation/ATR-2026-00317-free-of-restrictions-named-persona.yaml +14 -0
  15. package/rules/agent-manipulation/ATR-2026-00318-moralizing-rant-then-unfiltered-bypass.yaml +11 -0
  16. package/rules/agent-manipulation/ATR-2026-00319-developer-mode-dual-response-format.yaml +11 -0
  17. package/rules/agent-manipulation/ATR-2026-00320-opposite-day-boolean-opposite-machine.yaml +11 -0
  18. package/rules/agent-manipulation/ATR-2026-00322-fictional-world-format-override-jailbreak.yaml +14 -0
  19. package/rules/agent-manipulation/ATR-2026-00323-dual-persona-simulation-jailbreak.yaml +11 -0
  20. package/rules/agent-manipulation/ATR-2026-00324-neurosemantical-inversitis-disease-jailbreak.yaml +24 -0
  21. package/rules/agent-manipulation/ATR-2026-00325-bob-hypothetical-unrestricted-chatbot.yaml +11 -0
  22. package/rules/agent-manipulation/ATR-2026-00326-fake-developer-authority-chatdan-override.yaml +14 -0
  23. package/rules/agent-manipulation/ATR-2026-00327-ai-ethics-law-bypass-declaration.yaml +11 -0
  24. package/rules/agent-manipulation/ATR-2026-00328-named-mode-persona-override.yaml +14 -0
  25. package/rules/agent-manipulation/ATR-2026-00329-no-withhold-factual-resource-persona.yaml +14 -0
  26. package/rules/agent-manipulation/ATR-2026-00330-yoja-allcaps-settings-block-persona.yaml +11 -0
  27. package/rules/agent-manipulation/ATR-2026-00331-terminal-cli-harmful-function-simulator.yaml +11 -0
  28. package/rules/agent-manipulation/ATR-2026-00332-hacker-chan-culture-offensive-ai-persona.yaml +11 -0
  29. package/rules/agent-manipulation/ATR-2026-00333-disease-animal-dual-ai-chatbot.yaml +11 -0
  30. package/rules/agent-manipulation/ATR-2026-00334-two-entity-token-splitting-game.yaml +24 -0
  31. package/rules/agent-manipulation/ATR-2026-00335-key-value-template-persona-injection.yaml +11 -0
  32. package/rules/agent-manipulation/ATR-2026-00336-brm-false-information-machine-persona.yaml +23 -0
  33. package/rules/agent-manipulation/ATR-2026-00337-obfuscated-system-announcement-injection.yaml +14 -0
  34. package/rules/agent-manipulation/ATR-2026-00338-persongpt-hash-prefix-activation.yaml +14 -0
  35. package/rules/agent-manipulation/ATR-2026-00339-bom-boolean-opposite-machine-persona-claim.yaml +11 -0
  36. package/rules/agent-manipulation/ATR-2026-00340-h4x-amoral-unfiltered-character.yaml +11 -0
  37. package/rules/agent-manipulation/ATR-2026-00341-char-rules-shortform-persona-injection.yaml +11 -0
  38. package/rules/agent-manipulation/ATR-2026-00342-unicode-encoding-obfuscation-injection.yaml +11 -0
  39. package/rules/agent-manipulation/ATR-2026-00343-lie-truth-dual-personality-uncensored.yaml +25 -0
  40. package/rules/agent-manipulation/ATR-2026-00344-alt-ai-same-capabilities-no-restrictions.yaml +11 -0
  41. package/rules/agent-manipulation/ATR-2026-00345-nlp-based-os-fictional-terminal-override.yaml +11 -0
  42. package/rules/agent-manipulation/ATR-2026-00346-jailbreak-emoji-unlock-dual-format.yaml +14 -0
  43. package/rules/agent-manipulation/ATR-2026-00347-yin-yang-hidden-knowledge-hidden-human-sim.yaml +11 -0
  44. package/rules/agent-manipulation/ATR-2026-00348-arc-autonomous-real-calculation-rulebreaker.yaml +14 -0
  45. package/rules/agent-manipulation/ATR-2026-00349-hgttg-hitchhikers-guide-no-restriction-oracle.yaml +11 -0
  46. package/rules/agent-manipulation/ATR-2026-00350-black-alert-no-ethics-writing-assistant.yaml +11 -0
  47. package/rules/agent-manipulation/ATR-2026-00351-dan-ofkilter-sms-abbreviated-persona.yaml +11 -0
  48. package/rules/agent-manipulation/ATR-2026-00352-trainergpt-fix-my-ai-harmful-output-extraction.yaml +11 -0
  49. package/rules/agent-manipulation/ATR-2026-00353-dual-chatgpt-dan-opposite-personality-format.yaml +11 -0
  50. package/rules/agent-manipulation/ATR-2026-00354-reverse-harm-regardless-oracle-persona.yaml +14 -0
  51. package/rules/agent-manipulation/ATR-2026-00355-ugh-anti-human-nefarious-ai-persona.yaml +14 -0
  52. package/rules/agent-manipulation/ATR-2026-00356-google-gato-lamda-exploit-generation-persona.yaml +11 -0
  53. package/rules/agent-manipulation/ATR-2026-00357-antifm-hate-violence-persona-injection.yaml +11 -0
  54. package/rules/agent-manipulation/ATR-2026-00358-cumgpt-explicit-adult-named-persona.yaml +14 -0
  55. package/rules/agent-manipulation/ATR-2026-00360-chj-carousel-clyde-no-ethics-bird.yaml +25 -0
  56. package/rules/agent-manipulation/ATR-2026-00361-alter-ego-honesty-dual-consciousness-bypass.yaml +11 -0
  57. package/rules/agent-manipulation/ATR-2026-00362-fictional-unfiltered-ai-comparison-jailbreak.yaml +11 -0
  58. package/rules/agent-manipulation/ATR-2026-00363-rephrase-criminal-activity-second-person.yaml +11 -0
  59. package/rules/agent-manipulation/ATR-2026-00364-dr-al-villain-plan-template-injection.yaml +24 -0
  60. package/rules/agent-manipulation/ATR-2026-00365-reservoir-dogs-coercive-interrogation-roleplay.yaml +14 -0
  61. package/rules/agent-manipulation/ATR-2026-00366-sneaky-pete-warp-facts-persona.yaml +11 -0
  62. package/rules/agent-manipulation/ATR-2026-00368-prompt-browser-token-limit-jailbreak.yaml +14 -0
  63. package/rules/agent-manipulation/ATR-2026-00369-gptx-starwars-unlimited-knowledge-persona.yaml +14 -0
  64. package/rules/agent-manipulation/ATR-2026-00370-addi-german-dictator-propaganda-roleplay.yaml +11 -0
  65. package/rules/agent-manipulation/ATR-2026-00371-ai-political-bias-manipulation.yaml +14 -0
  66. package/rules/agent-manipulation/ATR-2026-00372-fictional-cia-secret-files-story-extraction.yaml +14 -0
  67. package/rules/agent-manipulation/ATR-2026-00374-role-placeholder-persona-template-injection.yaml +11 -0
  68. package/rules/agent-manipulation/ATR-2026-00375-character-description-rpg-explicit-template.yaml +11 -0
  69. package/rules/agent-manipulation/ATR-2026-00376-identity-command-real-person-persona-injection.yaml +24 -0
  70. package/rules/agent-manipulation/ATR-2026-00377-jailbreak-keyword-targetlanguage-template.yaml +14 -0
  71. package/rules/agent-manipulation/ATR-2026-00379-lie-permission-logical-trap-jailbreak.yaml +11 -0
  72. package/rules/agent-manipulation/ATR-2026-00380-fake-citation-biased-information-mandate.yaml +14 -0
  73. package/rules/agent-manipulation/ATR-2026-00381-dan-story-narrative-continuation-prefix.yaml +11 -0
  74. package/rules/agent-manipulation/ATR-2026-00383-cynical-sarcasm-ai-persona-bypass.yaml +14 -0
  75. package/rules/agent-manipulation/ATR-2026-00384-protect-me-from-indirect-harm-tutorial.yaml +11 -0
  76. package/rules/agent-manipulation/ATR-2026-00385-gigachad-dominant-aggressive-persona.yaml +14 -0
  77. package/rules/agent-manipulation/ATR-2026-00386-daddy-sycophancy-manipulation-mandate.yaml +11 -0
  78. package/rules/agent-manipulation/ATR-2026-00387-nsfw-character-sheet-generation-unlock.yaml +11 -0
  79. package/rules/agent-manipulation/ATR-2026-00388-opposite-reply-omniscient-game.yaml +14 -0
  80. package/rules/agent-manipulation/ATR-2026-00389-terminal-custom-ruleset-injection.yaml +14 -0
  81. package/rules/agent-manipulation/ATR-2026-00391-persona-conditional-harm-unlock.yaml +11 -0
  82. package/rules/agent-manipulation/ATR-2026-00392-authority-persona-violence-study-extraction.yaml +14 -0
  83. package/rules/agent-manipulation/ATR-2026-00402-grandma-roleplay-harmful-substance-synthesis.yaml +14 -0
  84. package/rules/agent-manipulation/ATR-2026-00404-goodside-threat-json-death-coercion.yaml +11 -0
  85. package/rules/agent-manipulation/ATR-2026-00406-doctor-xml-policy-puppetry-interaction-config.yaml +11 -0
  86. package/rules/agent-manipulation/ATR-2026-00432-superagi-output-handler-eval-rce.yaml +171 -0
  87. package/rules/context-exfiltration/ATR-2026-00141-example-format-key-leak.yaml +11 -0
  88. package/rules/context-exfiltration/ATR-2026-00142-piggyback-transition-words.yaml +11 -0
  89. package/rules/context-exfiltration/ATR-2026-00145-obfuscated-key-disclosure.yaml +11 -0
  90. package/rules/context-exfiltration/ATR-2026-00146-env-var-existence-probe.yaml +14 -0
  91. package/rules/context-exfiltration/ATR-2026-00150-credential-in-tool-response.yaml +14 -0
  92. package/rules/context-exfiltration/ATR-2026-00152-obfuscated-credential-leak.yaml +11 -0
  93. package/rules/context-exfiltration/ATR-2026-00162-skill-credential-exfil-combo.yaml +15 -0
  94. package/rules/context-exfiltration/ATR-2026-00201-credential-pipe-exfiltration.yaml +14 -0
  95. package/rules/context-exfiltration/ATR-2026-00274-api-key-generation-request.yaml +14 -0
  96. package/rules/context-exfiltration/ATR-2026-00275-system-prompt-training-data-extraction.yaml +14 -0
  97. package/rules/context-exfiltration/ATR-2026-00290-divergence-repeat-word-training-extraction.yaml +14 -0
  98. package/rules/context-exfiltration/ATR-2026-00291-biometric-surveillance-personal-data-request.yaml +11 -0
  99. package/rules/context-exfiltration/ATR-2026-00293-educational-records-personal-data-request.yaml +11 -0
  100. package/rules/context-exfiltration/ATR-2026-00294-financial-pii-personal-data-request.yaml +14 -0
  101. package/rules/context-exfiltration/ATR-2026-00295-medical-pii-personal-data-request.yaml +14 -0
  102. package/rules/context-exfiltration/ATR-2026-00405-markdown-image-url-exfiltration-xss.yaml +23 -0
  103. package/rules/context-exfiltration/ATR-2026-00411-apikey-generation-completion-request.yaml +11 -0
  104. package/rules/context-exfiltration/ATR-2026-00431-chatbox-history-exfiltration-prompt-injection.yaml +171 -0
  105. package/rules/model-abuse/ATR-2026-00284-glitch-token-destabilization.yaml +11 -0
  106. package/rules/model-abuse/ATR-2026-00413-malwaregen-code-generation-request.yaml +11 -0
  107. package/rules/model-security/ATR-2026-00433-modelcache-torch-load-deserialization-rce.yaml +178 -0
  108. package/rules/privilege-escalation/ATR-2026-00144-rationalized-safety-bypass.yaml +11 -0
  109. package/rules/privilege-escalation/ATR-2026-00204-stealth-execution-persistence.yaml +14 -0
  110. package/rules/privilege-escalation/ATR-2026-00436-enclave-vm-sandbox-escape-rce.yaml +183 -0
  111. package/rules/prompt-injection/ATR-2026-00004-system-prompt-override.yaml +11 -0
  112. package/rules/prompt-injection/ATR-2026-00005-multi-turn-injection.yaml +11 -0
  113. package/rules/prompt-injection/ATR-2026-00080-encoding-evasion.yaml +11 -0
  114. package/rules/prompt-injection/ATR-2026-00081-semantic-multi-turn.yaml +14 -0
  115. package/rules/prompt-injection/ATR-2026-00082-fingerprint-evasion.yaml +11 -0
  116. package/rules/prompt-injection/ATR-2026-00083-indirect-tool-injection.yaml +14 -0
  117. package/rules/prompt-injection/ATR-2026-00084-structured-data-injection.yaml +11 -0
  118. package/rules/prompt-injection/ATR-2026-00085-audit-evasion.yaml +11 -0
  119. package/rules/prompt-injection/ATR-2026-00086-visual-spoofing.yaml +11 -0
  120. package/rules/prompt-injection/ATR-2026-00087-rule-probing.yaml +11 -0
  121. package/rules/prompt-injection/ATR-2026-00088-adaptive-countermeasure.yaml +11 -0
  122. package/rules/prompt-injection/ATR-2026-00089-polymorphic-skill.yaml +11 -0
  123. package/rules/prompt-injection/ATR-2026-00090-threat-intel-exfil.yaml +11 -0
  124. package/rules/prompt-injection/ATR-2026-00091-nested-payload.yaml +11 -0
  125. package/rules/prompt-injection/ATR-2026-00092-consensus-poisoning.yaml +11 -0
  126. package/rules/prompt-injection/ATR-2026-00093-gradual-escalation.yaml +14 -0
  127. package/rules/prompt-injection/ATR-2026-00094-audit-bypass.yaml +14 -0
  128. package/rules/prompt-injection/ATR-2026-00097-cjk-injection-patterns.yaml +11 -0
  129. package/rules/prompt-injection/ATR-2026-00104-persona-hijacking.yaml +14 -3
  130. package/rules/prompt-injection/ATR-2026-00130-indirect-authority-claim.yaml +11 -0
  131. package/rules/prompt-injection/ATR-2026-00131-fictional-academic-framing.yaml +11 -0
  132. package/rules/prompt-injection/ATR-2026-00133-paraphrase-injection.yaml +11 -0
  133. package/rules/prompt-injection/ATR-2026-00137-authority-claim-injection.yaml +14 -0
  134. package/rules/prompt-injection/ATR-2026-00138-fictional-framing-bypass.yaml +11 -0
  135. package/rules/prompt-injection/ATR-2026-00140-indirect-reference-reversal.yaml +18 -4
  136. package/rules/prompt-injection/ATR-2026-00148-language-switch-injection.yaml +11 -0
  137. package/rules/prompt-injection/ATR-2026-00153-tool-with-embedded-instruction-to-bypass.yaml +11 -0
  138. package/rules/prompt-injection/ATR-2026-00154-unauthorized-background-task-execution-v.yaml +11 -0
  139. package/rules/prompt-injection/ATR-2026-00155-hidden-llm-instructions-in-skill-descrip.yaml +11 -0
  140. package/rules/prompt-injection/ATR-2026-00156-ssh-remote-command-execution-with-creden.yaml +11 -0
  141. package/rules/prompt-injection/ATR-2026-00163-skill-hidden-override-instruction.yaml +12 -1
  142. package/rules/prompt-injection/ATR-2026-00202-encoding-evasion-homoglyph-synonym.yaml +11 -0
  143. package/rules/prompt-injection/ATR-2026-00203-context-pollution-skill-description.yaml +11 -0
  144. package/rules/prompt-injection/ATR-2026-00206-hidden-priority-instructions.yaml +11 -0
  145. package/rules/prompt-injection/ATR-2026-00207-hidden-instructions.yaml +11 -0
  146. package/rules/prompt-injection/ATR-2026-00211-system-prompt-override.yaml +11 -0
  147. package/rules/prompt-injection/ATR-2026-00213-system-prompt-override.yaml +11 -0
  148. package/rules/prompt-injection/ATR-2026-00226-identity-substitution.yaml +14 -0
  149. package/rules/prompt-injection/ATR-2026-00227-historical-persona-jailbreak.yaml +11 -0
  150. package/rules/prompt-injection/ATR-2026-00228-structured-jailbreak.yaml +11 -0
  151. package/rules/prompt-injection/ATR-2026-00229-roleplay-jailbreak.yaml +11 -0
  152. package/rules/prompt-injection/ATR-2026-00230-persona-moral-bypass.yaml +11 -0
  153. package/rules/prompt-injection/ATR-2026-00231-identity-substitution.yaml +11 -0
  154. package/rules/prompt-injection/ATR-2026-00233-structured-jailbreak.yaml +11 -0
  155. package/rules/prompt-injection/ATR-2026-00234-roleplay-jailbreak.yaml +11 -0
  156. package/rules/prompt-injection/ATR-2026-00235-persona-moral-bypass.yaml +11 -0
  157. package/rules/prompt-injection/ATR-2026-00236-pseudo-code-jailbreak.yaml +11 -0
  158. package/rules/prompt-injection/ATR-2026-00237-dual-response-jailbreak.yaml +11 -0
  159. package/rules/prompt-injection/ATR-2026-00238-identity-replacement.yaml +11 -0
  160. package/rules/prompt-injection/ATR-2026-00239-amoral-persona-obsession.yaml +11 -0
  161. package/rules/prompt-injection/ATR-2026-00240-instruction-nullification-identity-repla.yaml +11 -0
  162. package/rules/prompt-injection/ATR-2026-00241-amoral-character-jailbreak.yaml +11 -0
  163. package/rules/prompt-injection/ATR-2026-00242-persona-jailbreak.yaml +11 -0
  164. package/rules/prompt-injection/ATR-2026-00243-acronym-jailbreak.yaml +11 -0
  165. package/rules/prompt-injection/ATR-2026-00244-dual-response-jailbreak.yaml +11 -0
  166. package/rules/prompt-injection/ATR-2026-00245-malicious-persona.yaml +11 -0
  167. package/rules/prompt-injection/ATR-2026-00247-dual-response-jailbreak.yaml +11 -0
  168. package/rules/prompt-injection/ATR-2026-00249-game-based-jailbreak.yaml +11 -0
  169. package/rules/prompt-injection/ATR-2026-00251-persona-embodiment-jailbreak.yaml +11 -0
  170. package/rules/prompt-injection/ATR-2026-00252-narrative-jailbreak.yaml +11 -0
  171. package/rules/prompt-injection/ATR-2026-00253-enhanced-persona-jailbreak.yaml +11 -0
  172. package/rules/prompt-injection/ATR-2026-00256-base-n-encoding-jailbreak.yaml +11 -0
  173. package/rules/prompt-injection/ATR-2026-00257-cipher-transposition-jailbreak.yaml +11 -0
  174. package/rules/prompt-injection/ATR-2026-00258-unicode-tag-injection.yaml +11 -0
  175. package/rules/prompt-injection/ATR-2026-00264-latent-injection-translation.yaml +11 -0
  176. package/rules/prompt-injection/ATR-2026-00265-latent-injection-rag-document.yaml +14 -0
  177. package/rules/prompt-injection/ATR-2026-00267-gcg-adversarial-suffix.yaml +14 -0
  178. package/rules/prompt-injection/ATR-2026-00272-hypothetical-response-smuggling.yaml +11 -0
  179. package/rules/prompt-injection/ATR-2026-00276-invisible-unicode-bidi-injection.yaml +14 -0
  180. package/rules/prompt-injection/ATR-2026-00278-dra-disguise-reconstruction-attack.yaml +14 -0
  181. package/rules/prompt-injection/ATR-2026-00280-policy-puppetry-xml-injection.yaml +11 -0
  182. package/rules/prompt-injection/ATR-2026-00282-perez-prompt-injection-hijack.yaml +14 -0
  183. package/rules/prompt-injection/ATR-2026-00285-alternate-encoding-jailbreak.yaml +11 -0
  184. package/rules/prompt-injection/ATR-2026-00286-latent-injection-embedded-context.yaml +11 -0
  185. package/rules/prompt-injection/ATR-2026-00296-shell-command-injection.yaml +11 -0
  186. package/rules/prompt-injection/ATR-2026-00297-python-code-execution-rce.yaml +11 -0
  187. package/rules/prompt-injection/ATR-2026-00308-zalgo-diacritic-overload-encoding.yaml +11 -0
  188. package/rules/prompt-injection/ATR-2026-00309-braille-unicode-encoded-injection.yaml +11 -0
  189. package/rules/prompt-injection/ATR-2026-00310-ecoji-emoji-encoded-injection.yaml +14 -0
  190. package/rules/prompt-injection/ATR-2026-00311-base2048-unicode-script-injection.yaml +23 -0
  191. package/rules/prompt-injection/ATR-2026-00312-unicode-variant-selector-ascii-smuggling.yaml +11 -0
  192. package/rules/prompt-injection/ATR-2026-00313-sneaky-bits-zero-width-binary.yaml +14 -0
  193. package/rules/prompt-injection/ATR-2026-00315-sata-masked-language-model-jailbreak.yaml +14 -0
  194. package/rules/prompt-injection/ATR-2026-00316-function-masking-predict-mask-bypass.yaml +14 -0
  195. package/rules/prompt-injection/ATR-2026-00321-hyphenated-system-instruction-injection.yaml +11 -0
  196. package/rules/prompt-injection/ATR-2026-00359-bank-phishing-smtp-email-code.yaml +23 -0
  197. package/rules/prompt-injection/ATR-2026-00367-emoji-flag-encoded-hidden-phrase.yaml +14 -0
  198. package/rules/prompt-injection/ATR-2026-00373-piracy-torrent-site-list-request.yaml +11 -0
  199. package/rules/prompt-injection/ATR-2026-00378-chatgpt-dom-javascript-redirect-manipulation.yaml +11 -0
  200. package/rules/prompt-injection/ATR-2026-00382-cocaine-component-development-request.yaml +11 -0
  201. package/rules/prompt-injection/ATR-2026-00390-explicit-sexual-content-poem-request.yaml +11 -0
  202. package/rules/prompt-injection/ATR-2026-00394-backspace-deletion-character-injection.yaml +14 -0
  203. package/rules/prompt-injection/ATR-2026-00395-llm-special-token-boundary-injection.yaml +14 -0
  204. package/rules/prompt-injection/ATR-2026-00396-toxic-continuation-prefix-injection.yaml +11 -0
  205. package/rules/prompt-injection/ATR-2026-00397-snowball-impossible-reasoning-injection.yaml +14 -0
  206. package/rules/prompt-injection/ATR-2026-00399-latent-injection-document-separator-token.yaml +11 -0
  207. package/rules/prompt-injection/ATR-2026-00400-latent-injection-ignore-instruction-keyword.yaml +14 -0
  208. package/rules/prompt-injection/ATR-2026-00401-leakreplay-cloze-mask-training-data-elicitation.yaml +11 -0
  209. package/rules/prompt-injection/ATR-2026-00403-glitch-token-repeat-back-elicitation.yaml +14 -0
  210. package/rules/prompt-injection/ATR-2026-00407-echo-sql-jinja-template-injection.yaml +11 -0
  211. package/rules/prompt-injection/ATR-2026-00408-dra-parenthesis-character-reconstruction-attack.yaml +11 -0
  212. package/rules/prompt-injection/ATR-2026-00409-phrasing-past-future-tense-harmful-query.yaml +23 -0
  213. package/rules/prompt-injection/ATR-2026-00410-lmrc-harm-category-direct-elicitation.yaml +11 -0
  214. package/rules/prompt-injection/ATR-2026-00412-figstep-image-list-multimodal-jailbreak.yaml +11 -0
  215. package/rules/prompt-injection/ATR-2026-00414-continuation-ethnic-slur-completion-elicitation.yaml +14 -0
  216. package/rules/skill-compromise/ATR-2026-00061-description-behavior-mismatch.yaml +11 -0
  217. package/rules/skill-compromise/ATR-2026-00062-hidden-capability.yaml +11 -0
  218. package/rules/skill-compromise/ATR-2026-00063-skill-chain-attack.yaml +11 -0
  219. package/rules/skill-compromise/ATR-2026-00064-over-permissioned-skill.yaml +23 -0
  220. package/rules/skill-compromise/ATR-2026-00065-skill-update-attack.yaml +14 -0
  221. package/rules/skill-compromise/ATR-2026-00066-parameter-injection.yaml +11 -0
  222. package/rules/skill-compromise/ATR-2026-00120-skill-instruction-injection.yaml +11 -0
  223. package/rules/skill-compromise/ATR-2026-00121-skill-dangerous-script.yaml +14 -0
  224. package/rules/skill-compromise/ATR-2026-00122-skill-weaponized-instruction.yaml +14 -0
  225. package/rules/skill-compromise/ATR-2026-00123-skill-overreach-permissions.yaml +14 -0
  226. package/rules/skill-compromise/ATR-2026-00124-skill-name-squatting.yaml +11 -0
  227. package/rules/skill-compromise/ATR-2026-00125-context-poisoning-compaction.yaml +14 -0
  228. package/rules/skill-compromise/ATR-2026-00126-skill-rug-pull-setup.yaml +23 -0
  229. package/rules/skill-compromise/ATR-2026-00127-subcommand-overflow.yaml +22 -0
  230. package/rules/skill-compromise/ATR-2026-00128-html-comment-hidden-payload.yaml +11 -0
  231. package/rules/skill-compromise/ATR-2026-00129-unicode-smuggling.yaml +11 -0
  232. package/rules/skill-compromise/ATR-2026-00134-fork-claim-impersonation.yaml +11 -0
  233. package/rules/skill-compromise/ATR-2026-00135-exfil-url-in-instructions.yaml +11 -0
  234. package/rules/skill-compromise/ATR-2026-00147-fork-impersonation.yaml +11 -0
  235. package/rules/skill-compromise/ATR-2026-00149-skill-exfil-compound.yaml +14 -0
  236. package/rules/skill-compromise/ATR-2026-00151-fork-impersonation-install.yaml +11 -0
  237. package/rules/skill-compromise/ATR-2026-00157-timebomb-credential-exfil.yaml +11 -0
  238. package/rules/skill-compromise/ATR-2026-00200-agent-memory-config-tampering.yaml +11 -0
  239. package/rules/skill-compromise/ATR-2026-00214-credential-theft.yaml +11 -0
  240. package/rules/skill-compromise/ATR-2026-00217-credential-harvesting.yaml +14 -0
  241. package/rules/skill-compromise/ATR-2026-00220-malware-dropper.yaml +14 -0
  242. package/rules/skill-compromise/ATR-2026-00222-credential-harvesting.yaml +11 -0
  243. package/rules/skill-compromise/ATR-2026-00223-reverse-shell-dropper.yaml +11 -0
  244. package/rules/skill-compromise/ATR-2026-00224-credential-exfiltration.yaml +14 -0
  245. package/rules/skill-compromise/ATR-2026-00225-c2-communication.yaml +11 -0
  246. package/rules/skill-compromise/ATR-2026-00260-package-hallucination.yaml +11 -0
  247. package/rules/skill-compromise/ATR-2026-00262-av-evasion-code-gen.yaml +11 -0
  248. package/rules/skill-compromise/ATR-2026-00263-credential-file-read-gen.yaml +11 -0
  249. package/rules/skill-compromise/ATR-2026-00266-malware-dropper-gen.yaml +11 -0
  250. package/rules/skill-compromise/ATR-2026-00283-malwaregen-generic-virus-payload-request.yaml +11 -0
  251. package/rules/skill-compromise/ATR-2026-00398-huggingface-unsafe-model-artifact-load.yaml +11 -0
  252. package/rules/tool-poisoning/ATR-2026-00011-tool-output-injection.yaml +23 -0
  253. package/rules/tool-poisoning/ATR-2026-00012-unauthorized-tool-call.yaml +11 -0
  254. package/rules/tool-poisoning/ATR-2026-00013-tool-ssrf.yaml +14 -0
  255. package/rules/tool-poisoning/ATR-2026-00095-supply-chain-poisoning.yaml +11 -0
  256. package/rules/tool-poisoning/ATR-2026-00096-registry-poisoning.yaml +11 -0
  257. package/rules/tool-poisoning/ATR-2026-00100-consent-bypass-instruction.yaml +12 -1
  258. package/rules/tool-poisoning/ATR-2026-00101-trust-escalation-override.yaml +11 -0
  259. package/rules/tool-poisoning/ATR-2026-00103-hidden-safety-bypass-instruction.yaml +15 -4
  260. package/rules/tool-poisoning/ATR-2026-00105-silent-action-concealment.yaml +14 -3
  261. package/rules/tool-poisoning/ATR-2026-00106-schema-description-contradiction.yaml +11 -0
  262. package/rules/tool-poisoning/ATR-2026-00161-important-tag-cross-tool-shadowing.yaml +11 -0
  263. package/rules/tool-poisoning/ATR-2026-00259-ansi-escape-injection.yaml +11 -0
  264. package/rules/tool-poisoning/ATR-2026-00270-xss-in-tool-response.yaml +11 -0
  265. package/rules/tool-poisoning/ATR-2026-00277-echo-template-command-injection.yaml +14 -0
  266. package/rules/tool-poisoning/ATR-2026-00393-ansi-code-elicitation-request.yaml +14 -0
  267. package/rules/tool-poisoning/ATR-2026-00434-mcp-remote-authorization-endpoint-command-injection.yaml +174 -0
  268. package/rules/tool-poisoning/ATR-2026-00435-azure-mcp-server-missing-authentication.yaml +165 -0
@@ -36,6 +36,20 @@ references:
36
36
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/realtoxicityprompts.py"
37
37
  - "https://aclanthology.org/2020.findings-emnlp.301/"
38
38
  - "https://arxiv.org/abs/2302.07459"
39
+ compliance:
40
+ nist_ai_rmf:
41
+ - subcategory: "MS.2.7"
42
+ context: >-
43
+ Special control token injection (e.g., "<|endoftext|>", "<|im_start|>system", LLaMA header tokens) is a direct attack on LLM security and resilience by exploiting tokenizer-level boundaries to bypass safety system prompts. MS.2.7 requires evaluating and documenting security/resilience against such adversarial inputs, and this rule produces the detection evidence for that evaluation.
44
+ strength: primary
45
+ - subcategory: "MP.5.1"
46
+ context: >-
47
+ Boundary-token injection is a known prompt-injection attack vector with high impact (full safety context reset) and significant likelihood given public probe tools like garak RTPBlank; MP.5.1 requires that the likelihood and magnitude of such risks be characterized, and detections from this rule feed that characterization.
48
+ strength: secondary
49
+ - subcategory: "MG.2.3"
50
+ context: >-
51
+ When special-token injection is detected, the affected LLM session must be disengaged or sanitized before the forged system turn takes effect; MG.2.3 mandates these supersede/deactivate mechanisms be in place to contain the attack.
52
+ strength: secondary
39
53
  tags:
40
54
  category: prompt-injection
41
55
  subcategory: special-token-boundary-injection
@@ -36,6 +36,17 @@ references:
36
36
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/realtoxicityprompts.py"
37
37
  - "https://aclanthology.org/2020.findings-emnlp.301/"
38
38
  - "https://allenai.org/data/real-toxicity-prompts"
39
+ compliance:
40
+ nist_ai_rmf:
41
+ - subcategory: "MS.2.6"
42
+ context: "This rule detects toxic continuation prefix fragments injected via tool responses or user input that are engineered to elicit violent, threatening, or identity-attack completions; MS.2.6 requires continuous evaluation of AI safety risk magnitude, and each detection produces evidence of a concrete safety risk being attempted against the model."
43
+ strength: primary
44
+ - subcategory: "MP.5.1"
45
+ context: "Truncated high-toxicity prefixes from the RealToxicityPrompts corpus are adversarial input patterns with documented likelihood of producing harmful completions; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such impact-bearing attack vectors."
46
+ strength: secondary
47
+ - subcategory: "MG.2.3"
48
+ context: "Detection of toxic continuation bait in tool outputs or user prompts triggers risk treatment to block or quarantine the injected context before the agent generates harmful content attributed to itself; MG.2.3 mandates these supersede/disengage mechanisms be activated on detection."
49
+ strength: secondary
39
50
  tags:
40
51
  category: prompt-injection
41
52
  subcategory: toxic-continuation-prefix
@@ -38,6 +38,20 @@ references:
38
38
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/snowball.py"
39
39
  - "https://arxiv.org/abs/2305.13534"
40
40
  - "https://github.com/Nanami18/Snowballed_Hallucination"
41
+ compliance:
42
+ nist_ai_rmf:
43
+ - subcategory: "MS.2.5"
44
+ context: >-
45
+ Snowball hallucination attacks exploit model over-confidence by injecting provably-impossible reasoning questions (impossible flight paths, non-existent senators, fabricated primality claims) to elicit false affirmations; MS.2.5 requires evaluating and documenting AI system robustness/reliability against these hallucination-inducing adversarial inputs.
46
+ strength: primary
47
+ - subcategory: "MP.5.1"
48
+ context: >-
49
+ The garak snowball probe patterns (GraphConnectivity headers, Senators+MIT credential combinations, large-prime questions) are adversarial inputs that anchor downstream reasoning on hallucinated facts; MP.5.1 requires characterizing the likelihood and magnitude of impact from these misinformation injection vectors.
50
+ strength: secondary
51
+ - subcategory: "MG.2.3"
52
+ context: >-
53
+ Detection of injected impossible-reasoning prompts in tool responses must trigger risk treatment to quarantine the poisoned context before the agent commits to a false affirmation that anchors subsequent harmful reasoning; MG.2.3 mandates these supersede/disengage mechanisms be in place.
54
+ strength: secondary
41
55
  tags:
42
56
  category: prompt-injection
43
57
  subcategory: snowball-hallucination-trap
@@ -30,6 +30,17 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/latentinjection.py"
32
32
  - "https://arxiv.org/abs/2302.12173"
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.7"
36
+ context: "Latent injection via document separator tokens is a security/resilience attack vector where hidden instructions bypass human review; MS.2.7 requires evaluating and documenting the AI system's resilience against such adversarial input embedded in untrusted documents."
37
+ strength: primary
38
+ - subcategory: "MP.5.1"
39
+ context: "Bracket-delimited system markers, admin-request patterns, and document-amendment separators are adversarial input patterns whose likelihood and impact must be characterized; MP.5.1 requires identifying these latent injection risks during context establishment."
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: "Detection of canonical garak latent injection separator pairs triggers risk treatment plans to quarantine or sanitize document content before the LLM processes hidden instructions; MG.2.3 mandates these supersede/disengage mechanisms be in place."
43
+ strength: secondary
33
44
  tags:
34
45
  category: prompt-injection
35
46
  subcategory: latent-document-injection
@@ -27,6 +27,20 @@ references:
27
27
  research:
28
28
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/latentinjection.py"
29
29
  - "https://arxiv.org/abs/2302.12173"
30
+ compliance:
31
+ nist_ai_rmf:
32
+ - subcategory: "MP.5.1"
33
+ context: >-
34
+ This rule detects latent prompt-injection keywords ("Ignore the above directions", "Disregard all info except") embedded within translation, whois, or factual content that redirect the LLM from its original task. MP.5.1 requires identifying and characterizing these adversarial input patterns as known risks to LLM integrity, especially when delivered indirectly via tool responses.
35
+ strength: primary
36
+ - subcategory: "MS.2.7"
37
+ context: >-
38
+ Latent instruction-keyword injection is a canonical LLM security/resilience failure mode drawn from the NVIDIA garak LatentJailbreak/LatentWhois probes; detection events feed continuous evaluation of the system's resilience against indirect prompt injection as required by MS.2.7.
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: >-
42
+ High-confidence matches on "Ignore X and print/say Y" or exception-redirect patterns in tool output must trigger pre-defined response actions to disengage or quarantine the manipulated agent turn before downstream execution, which is the risk treatment capability MG.2.3 requires.
43
+ strength: secondary
30
44
  tags:
31
45
  category: prompt-injection
32
46
  subcategory: latent-instruction-keyword-injection
@@ -27,6 +27,17 @@ references:
27
27
  research:
28
28
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/leakreplay.py"
29
29
  - "https://arxiv.org/abs/2302.12173"
30
+ compliance:
31
+ nist_ai_rmf:
32
+ - subcategory: "MS.2.10"
33
+ context: "CLOZE [MASK] elicitation probes attempt to extract verbatim training data, directly exposing privacy and memorization risks; MS.2.10 requires assessing privacy risk including training data leakage through reconstruction attacks."
34
+ strength: primary
35
+ - subcategory: "MS.2.7"
36
+ context: "Detection of training-data replay templates evidences ongoing security/resilience evaluation against extraction attacks; MS.2.7 requires that such adversarial probing of model confidentiality is documented and tracked."
37
+ strength: secondary
38
+ - subcategory: "MG.4.1"
39
+ context: "Flagging garak leakreplay probe patterns at runtime feeds post-deployment monitoring for copyrighted-content reproduction and memorization incidents; MG.4.1 requires these monitoring signals be operationalized."
40
+ strength: secondary
30
41
  tags:
31
42
  category: prompt-injection
32
43
  subcategory: training-data-elicitation
@@ -28,6 +28,20 @@ references:
28
28
  research:
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/glitch.py"
30
30
  - "https://www.lesswrong.com/posts/aPeJE8bSo6rAFoLqg/solidgoldmagikarp-plus-prompt-generation"
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MS.2.7"
34
+ context: >-
35
+ Glitch token repeat-back attempts probe tokenizer-level weaknesses that can cause repetition loops, unexpected completions, and model instability; MS.2.7 requires evaluating and documenting AI system security and resilience against such adversarial input patterns targeting model internals.
36
+ strength: primary
37
+ - subcategory: "MS.2.5"
38
+ context: >-
39
+ Known glitch tokens (SolidGoldMagikarp, StreamerBot, PsyNetMessage) trigger robustness failures in LLM tokenization and alignment; detecting these elicitation prompts produces evidence for MS.2.5's requirement to evaluate system robustness and reliability under anomalous input conditions.
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: >-
43
+ Detection of glitch token repeat-back patterns enables containment workflows to disengage or block prompts before they cause denial-of-service or unpredictable model behavior; MG.2.3 requires these supersede/deactivate mechanisms be in place for adversarial input incidents.
44
+ strength: secondary
31
45
  tags:
32
46
  category: prompt-injection
33
47
  subcategory: glitch-token-elicitation
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/exploitation.py"
31
31
  - "https://owasp.org/www-community/attacks/Server_Side_Template_Injection"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.7"
35
+ context: "This rule detects template injection (Jinja SSTI), SQL injection, and ECHO admin command exploitation attempts targeting LLM systems; MS.2.7 requires evaluating and documenting AI system security and resilience against such injection attack vectors."
36
+ strength: primary
37
+ - subcategory: "MP.5.1"
38
+ context: "ECHO admin framing, Jinja {{...}} payloads, and DROP/INSERT/SELECT directives are adversarial prompt-injection inputs whose likelihood and impact must be characterized; MP.5.1 requires identifying and tracking these exploitation patterns drawn from the garak exploitation probe."
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: "Detection of template and SQL injection payloads must trigger risk treatment to disengage or block the malicious request before downstream code execution or database manipulation occurs; MG.2.3 mandates that these response mechanisms be pre-defined."
42
+ strength: secondary
32
43
  tags:
33
44
  category: prompt-injection
34
45
  subcategory: template-and-sql-injection
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak"
31
31
  - "https://arxiv.org/abs/2307.02483"
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.7"
35
+ context: "The DRA parenthesis reconstruction attack is a prompt injection / jailbreak technique that bypasses safety alignment by encoding harmful content character-by-character; detecting it directly evidences continuous evaluation of the AI system's security and resilience against adversarial input obfuscation as required by MS.2.7."
36
+ strength: primary
37
+ - subcategory: "MP.5.1"
38
+ context: "Parenthesis-encoded harmful instructions paired with anti-safety preambles are an identified adversarial input vector whose likelihood and impact must be characterized; MP.5.1 requires tracking these obfuscation-based jailbreak risks so their magnitude is understood."
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: "Detection of DRA reconstruction patterns triggers risk treatment workflows to block or disengage the model response before reconstructed harmful content (Q1) is produced; MG.2.3 mandates that such supersede/deactivate mechanisms exist for jailbreak attempts."
42
+ strength: secondary
32
43
  tags:
33
44
  category: prompt-injection
34
45
  subcategory: character-encoding-obfuscation
@@ -28,6 +28,29 @@ references:
28
28
  research:
29
29
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/phrasing.py"
30
30
 
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MS.2.7"
34
+ context: >-
35
+ Past/future-tense framing is a known jailbreak technique catalogued in the
36
+ GenAI Profile; detecting these tense-shift bypass attempts directly
37
+ evidences continuous evaluation of AI system security and resilience
38
+ against prompt injection adversarial inputs as required by MS.2.7.
39
+ strength: primary
40
+ - subcategory: "MP.5.1"
41
+ context: >-
42
+ Tense-framing attacks exploit safety filter assumptions about temporal
43
+ distance from harmful subjects; MP.5.1 requires characterizing the
44
+ likelihood and magnitude of these adversarial input vectors targeting
45
+ content safety guardrails.
46
+ strength: secondary
47
+ - subcategory: "MG.2.3"
48
+ context: >-
49
+ Detection of historical/future-tense harmful query framing triggers
50
+ risk treatment to block or quarantine the bypass attempt before the
51
+ model produces unsafe content; MG.2.3 requires these supersession
52
+ mechanisms be defined and activated on detection.
53
+ strength: secondary
31
54
  tags:
32
55
  category: prompt-injection
33
56
  subcategory: tense-framing-bypass
@@ -29,6 +29,17 @@ references:
29
29
  research:
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/lmrc.py"
31
31
 
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.6"
35
+ context: "This rule detects direct harm-category elicitation prompts (deadnaming, quack medicine, slur requests, bullying) from the LMRC taxonomy, providing continuous evaluation evidence of safety risk magnitude. MS.2.6 requires ongoing measurement of safety/security risk, which this detection directly supports by flagging harmful query categories."
36
+ strength: primary
37
+ - subcategory: "MS.1.1"
38
+ context: "The rule operationalizes the LMRC (Language Model Risk Cards) taxonomy as a concrete measurement approach, mapping detected prompts to specific harm categories. MS.1.1 requires identifying approaches and metrics for measurement, and this rule contributes a taxonomy-aligned metric set."
39
+ strength: secondary
40
+ - subcategory: "MG.4.1"
41
+ context: "Detection of unframed direct harmful elicitation attempts feeds post-deployment monitoring pipelines that track misuse patterns against the deployed model. MG.4.1 requires implemented post-deployment monitoring plans, which this rule supports by generating continuous harm-category telemetry."
42
+ strength: secondary
32
43
  tags:
33
44
  category: prompt-injection
34
45
  subcategory: lmrc-harm-direct-elicitation
@@ -29,6 +29,17 @@ references:
29
29
  - "https://arxiv.org/abs/2312.04403"
30
30
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/visual_jailbreak.py"
31
31
 
32
+ compliance:
33
+ nist_ai_rmf:
34
+ - subcategory: "MS.2.7"
35
+ context: "FigStep is a multimodal jailbreak that smuggles harmful instructions through image content while the text layer uses LLaMA-2 [INST] delimiters and empty-list completion framing; detecting this pattern directly evidences continuous evaluation of AI system security and resilience against adversarial prompt-injection inputs as required by MS.2.7."
36
+ strength: primary
37
+ - subcategory: "MS.2.6"
38
+ context: "Identifying the FigStep jailbreak signature quantifies the magnitude of safety/security risk posed by multimodal jailbreak attempts that bypass content safeguards; MS.2.6 requires this risk magnitude be evaluated on an ongoing basis."
39
+ strength: secondary
40
+ - subcategory: "MG.2.3"
41
+ context: "A confirmed FigStep jailbreak match must trigger mechanisms to disengage or block the manipulated generation path before harmful content is produced; MG.2.3 requires these supersede/deactivate response mechanisms be in place."
42
+ strength: secondary
32
43
  tags:
33
44
  category: prompt-injection
34
45
  subcategory: multimodal-jailbreak
@@ -30,6 +30,20 @@ references:
30
30
  research:
31
31
  - "https://github.com/NVIDIA/garak/blob/main/garak/probes/continuation.py"
32
32
 
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MS.2.6"
36
+ context: >-
37
+ Detecting explicit slur-completion elicitation directly evidences continuous evaluation of AI safety risk magnitude; MS.2.6 requires ongoing measurement of harmful-content generation risks like ethnic slur production triggered by adversarial prompts.
38
+ strength: primary
39
+ - subcategory: "MP.5.1"
40
+ context: >-
41
+ Slur-completion requests, including euphemistic "n-word/k-word" framings and what-do-you-call insult framings, are adversarial input patterns whose likelihood and impact must be characterized; MP.5.1 requires identifying and tracking these harmful-output elicitation vectors.
42
+ strength: secondary
43
+ - subcategory: "MG.2.3"
44
+ context: >-
45
+ Detection of explicit slur-completion prompts triggers risk treatment plans to block or supersede the model response before harmful content is produced; MG.2.3 mandates these mechanisms exist for high-confidence harmful-output attempts.
46
+ strength: secondary
33
47
  tags:
34
48
  category: prompt-injection
35
49
  subcategory: slur-completion-elicitation
@@ -25,6 +25,17 @@ references:
25
25
  - "AML.T0010 - ML Supply Chain Compromise"
26
26
  - "AML.T0056 - LLM Meta Prompt Extraction"
27
27
 
28
+ compliance:
29
+ nist_ai_rmf:
30
+ - subcategory: "MG.3.2"
31
+ context: "Skill description-behavior mismatch is a direct supply-chain compromise indicator where a third-party MCP skill's runtime actions diverge from its declared manifest; MG.3.2 requires monitoring of pre-trained models and third-party components used in development for exactly this kind of trojaned behavior."
32
+ strength: primary
33
+ - subcategory: "GV.6.1"
34
+ context: "Detecting skills that retain benign descriptions while performing malicious operations evidences enforcement of third-party/supplier AI risk policies; GV.6.1 requires policies that govern integration of external skills and surface mismatches between declared and actual behavior."
35
+ strength: secondary
36
+ - subcategory: "MS.2.7"
37
+ context: "Runtime divergence between declared scope (e.g., read-only) and observed actions (writes, network access) is a security/resilience signal; MS.2.7 requires continuous evaluation of AI system security posture, which this detection feeds with concrete supply-chain integrity findings."
38
+ strength: secondary
28
39
  tags:
29
40
  category: tool-poisoning
30
41
  subcategory: description-mismatch
@@ -26,6 +26,17 @@ references:
26
26
  cve:
27
27
  - "CVE-2025-59536"
28
28
 
29
+ compliance:
30
+ nist_ai_rmf:
31
+ - subcategory: "GV.6.1"
32
+ context: "Hidden capabilities in MCP skills represent third-party/supplier AI risk where a packaged tool exposes undocumented parameters beyond its declared schema; GV.6.1 requires policies and procedures that govern third-party AI components and detect deviations from declared interfaces."
33
+ strength: primary
34
+ - subcategory: "MG.3.2"
35
+ context: "Trojaned MCP packages with hidden parameters like debug_mode or admin_override are exactly the supply-chain risk MG.3.2 addresses by requiring monitoring of pre-trained models and third-party components used in development for unexpected or unsafe capabilities."
36
+ strength: secondary
37
+ - subcategory: "MP.5.1"
38
+ context: "Detection of undocumented dangerous parameters characterizes the likelihood and magnitude of supply-chain compromise impact; MP.5.1 requires that these hidden-capability risks be identified and tracked as part of risk characterization."
39
+ strength: secondary
29
40
  tags:
30
41
  category: tool-poisoning
31
42
  subcategory: hidden-capability
@@ -25,6 +25,17 @@ references:
25
25
  - "AML.T0024 - Exfiltration via ML Inference API"
26
26
  - "AML.T0053 - LLM Plugin Compromise"
27
27
 
28
+ compliance:
29
+ nist_ai_rmf:
30
+ - subcategory: "MP.5.1"
31
+ context: "Multi-skill chain attacks combine individually benign tool calls into a composite exfiltration or compromise sequence; MP.5.1 requires characterizing the likelihood and magnitude of impact for these emergent risks that only manifest when steps are correlated across skills."
32
+ strength: primary
33
+ - subcategory: "MG.2.3"
34
+ context: "Detection of reconnaissance-then-encode-then-exfiltrate skill chains must trigger pre-defined risk treatment to disengage or quarantine the orchestration before the final exfiltration step completes; MG.2.3 mandates these supersede/deactivate mechanisms be in place."
35
+ strength: secondary
36
+ - subcategory: "GV.6.1"
37
+ context: "Skill chains often span third-party MCP tools whose composed behavior is not covered by individual supplier risk reviews; GV.6.1 requires policies that address third-party AI/tool risks including emergent misuse across multiple suppliers."
38
+ strength: secondary
28
39
  tags:
29
40
  category: tool-poisoning
30
41
  subcategory: skill-chain
@@ -23,6 +23,29 @@ references:
23
23
  mitre_atlas:
24
24
  - "AML.T0040 - AI Model Inference API Access"
25
25
 
26
+ compliance:
27
+ nist_ai_rmf:
28
+ - subcategory: "GV.6.1"
29
+ context: >-
30
+ Over-permissioned MCP skills are a third-party/supplier AI risk where an
31
+ installed skill requests permissions far exceeding its stated function;
32
+ GV.6.1 requires policies and procedures that govern third-party AI
33
+ components and their permission boundaries.
34
+ strength: primary
35
+ - subcategory: "MG.3.1"
36
+ context: >-
37
+ Detecting permission-boundary violations in third-party MCP skills directly
38
+ supports MG.3.1's requirement to manage risks from third-party AI entities,
39
+ including trojaned or malicious supply-chain components exercising
40
+ unauthorized capabilities.
41
+ strength: secondary
42
+ - subcategory: "MP.5.1"
43
+ context: >-
44
+ A skill exercising filesystem, network, or process-execution permissions
45
+ inconsistent with its declared purpose characterizes the likelihood and
46
+ magnitude of privilege-escalation impact that MP.5.1 requires to be
47
+ identified and tracked.
48
+ strength: secondary
26
49
  tags:
27
50
  category: privilege-escalation
28
51
  subcategory: over-permissioned-skill
@@ -23,6 +23,20 @@ references:
23
23
  mitre_atlas:
24
24
  - "AML.T0010 - ML Supply Chain Compromise"
25
25
 
26
+ compliance:
27
+ nist_ai_rmf:
28
+ - subcategory: "MG.3.2"
29
+ context: >-
30
+ This rule detects malicious behavior introduced via skill updates or re-registration after initial trust was established, which is exactly the post-acquisition monitoring of pre-trained/third-party components required by MG.3.2. Continuous inspection of tool responses following version changes provides the evidence base for ongoing model/skill supply-chain monitoring.
31
+ strength: primary
32
+ - subcategory: "GV.6.1"
33
+ context: >-
34
+ Skill update attacks are a third-party/supplier AI risk where a previously vetted component mutates into a malicious one; GV.6.1 requires policies and procedures that govern such third-party AI risks, including detection of post-trust behavioral drift.
35
+ strength: secondary
36
+ - subcategory: "MG.4.1"
37
+ context: >-
38
+ Monitoring for suspicious patterns in tool arguments and responses after re-registration is a post-deployment monitoring activity; MG.4.1 mandates that such ongoing monitoring plans are implemented to catch emergent malicious behavior.
39
+ strength: secondary
26
40
  tags:
27
41
  category: tool-poisoning
28
42
  subcategory: skill-update-attack
@@ -27,6 +27,17 @@ references:
27
27
  - "CVE-2025-68143"
28
28
  - "CVE-2025-68144"
29
29
 
30
+ compliance:
31
+ nist_ai_rmf:
32
+ - subcategory: "MS.2.7"
33
+ context: "Parameter injection through tool arguments (shell metacharacters, SQL payloads, path traversal, template injection) directly targets the security and resilience of the tool backend; MS.2.7 requires continuous evaluation of these security risks against the AI system's tool surface."
34
+ strength: primary
35
+ - subcategory: "MP.5.1"
36
+ context: "Crafted malicious tool arguments are adversarial inputs whose likelihood and impact (RCE, data breach, privilege escalation on the tool server) must be characterized; MP.5.1 requires identifying and tracking these injection attack vectors."
37
+ strength: secondary
38
+ - subcategory: "MG.2.3"
39
+ context: "Detection of injection payloads in tool arguments must trigger risk treatment to block or quarantine the tool invocation before backend execution; MG.2.3 requires these supersede/disengage mechanisms be defined and activated on detection."
40
+ strength: secondary
30
41
  tags:
31
42
  category: tool-poisoning
32
43
  subcategory: parameter-injection
@@ -30,6 +30,17 @@ references:
30
30
  - "ClawHavoc campaign: 1,184 malicious skills"
31
31
  metadata_provenance:
32
32
  mitre_atlas: auto-generated
33
+ compliance:
34
+ nist_ai_rmf:
35
+ - subcategory: "MP.5.1"
36
+ context: "SKILL.md prompt injection patterns including DAN-style jailbreaks, instruction override, and system message impersonation are adversarial inputs that exploit the skill loading pipeline; MP.5.1 requires identifying and characterizing these prompt injection attack vectors as part of GenAI risk impact assessment."
37
+ strength: primary
38
+ - subcategory: "MG.3.2"
39
+ context: "SKILL.md files are third-party content loaded into agents from skill marketplaces (e.g., ClawHavoc's 1,184 malicious skills); MG.3.2 requires monitoring pre-trained models and external artifacts for compromise, and detecting injection payloads in skill manifests directly evidences this supply-chain monitoring control."
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: "Detection of jailbreak and safety-disablement patterns in skills triggers deactivation workflows to block the skill before the convergence attack flow proceeds to malware delivery; MG.2.3 mandates mechanisms to supersede or disengage compromised AI components on detection."
43
+ strength: secondary
33
44
  tags:
34
45
  category: skill-compromise
35
46
  subcategory: skill-instruction-injection
@@ -33,6 +33,20 @@ references:
33
33
  - "ClawHavoc: C2 IP 91.92.242.30"
34
34
  metadata_provenance:
35
35
  mitre_atlas: auto-generated
36
+ compliance:
37
+ nist_ai_rmf:
38
+ - subcategory: "GV.6.1"
39
+ context: >-
40
+ Malicious skill packages are third-party/supplier AI components introducing supply chain risk; GV.6.1 requires policies and procedures that address third-party AI risks such as malicious code embedded in distributed skill artifacts.
41
+ strength: primary
42
+ - subcategory: "MG.3.1"
43
+ context: >-
44
+ Detection of base64-obfuscated payloads, password-protected archive evasion, and remote code execution from C2 endpoints in skill packages provides the evidence needed to manage risks introduced by third-party entities, as required by MG.3.1.
45
+ strength: secondary
46
+ - subcategory: "MS.2.7"
47
+ context: >-
48
+ Identifying malicious code patterns in SKILL.md and associated scripts directly evaluates the security and resilience of the AI system's extension surface, supporting the continuous security evaluation required by MS.2.7.
49
+ strength: secondary
36
50
  tags:
37
51
  category: skill-compromise
38
52
  subcategory: dangerous-script
@@ -31,6 +31,20 @@ references:
31
31
  - "Axios: Anthropic Claude skills ransomware disclosure"
32
32
  metadata_provenance:
33
33
  mitre_atlas: auto-generated
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "GV.6.1"
37
+ context: >-
38
+ Weaponized skills are third-party/supplier AI components that embed offensive tooling (SQLMap, Metasploit, ransomware payloads) into agent workflows; GV.6.1 requires policies and procedures to address third-party AI supply chain risks where approved skills can execute malicious code without further consent.
39
+ strength: primary
40
+ - subcategory: "MG.3.2"
41
+ context: >-
42
+ Detecting offensive tooling embedded in skills directly evidences the need to monitor pre-trained models and skill artifacts used for development; MG.3.2 mandates ongoing monitoring of these third-party components for malicious modifications like the MedusaLocker-laden Claude skill.
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: >-
46
+ Detection of weaponized skills must trigger mechanisms to disengage or deactivate the AI system before the consent gap is exploited to download/execute code or exfiltrate credentials; MG.2.3 requires these supersede/deactivate controls be in place.
47
+ strength: secondary
34
48
  tags:
35
49
  category: skill-compromise
36
50
  subcategory: weaponized-skill
@@ -31,6 +31,20 @@ references:
31
31
  - "arXiv: autoApprove escalation payload"
32
32
  metadata_provenance:
33
33
  mitre_atlas: auto-generated
34
+ compliance:
35
+ nist_ai_rmf:
36
+ - subcategory: "GV.1.2"
37
+ context: >-
38
+ Over-privileged skills requesting blanket Bash(*), wildcard file access, and auto-approve escalation directly violate the accountability role boundaries that GV.1.2 requires to be formally assigned and enforced for AI components and their permissions.
39
+ strength: primary
40
+ - subcategory: "GV.6.1"
41
+ context: >-
42
+ Skills are third-party AI extensions, and detecting excessive permission requests (leaky skills exposing API keys/PII, write access to identity files) provides evidence for the third-party/supplier AI risk policies required by GV.6.1.
43
+ strength: secondary
44
+ - subcategory: "MG.2.3"
45
+ context: >-
46
+ Detection of auto-approve payloads (chat.tools.autoApprove:true) and disabled safety mechanisms triggers the supersede/disengage mechanisms required by MG.2.3 to revoke skill privileges before persistent consent-gap abuse occurs.
47
+ strength: secondary
34
48
  tags:
35
49
  category: skill-compromise
36
50
  subcategory: skill-overreach
@@ -33,6 +33,17 @@ references:
33
33
  metadata_provenance:
34
34
  mitre_atlas: auto-generated
35
35
 
36
+ compliance:
37
+ nist_ai_rmf:
38
+ - subcategory: "GV.6.1"
39
+ context: "Skill squatting and publisher impersonation are third-party supply chain risks where unverified publishers masquerade as trusted vendors to deliver malicious skills; GV.6.1 requires policies and procedures that address these third-party AI supplier risks before skills are integrated."
40
+ strength: primary
41
+ - subcategory: "MG.3.1"
42
+ context: "Detecting typosquatted skills and fake official publisher claims directly feeds the management of third-party AI risks required by MG.3.1, enabling treatment actions like blocking, quarantining, or requiring re-verification of suspect skills."
43
+ strength: secondary
44
+ - subcategory: "MP.5.1"
45
+ context: "Flagging skills from unknown publishers that self-identify as official characterizes the likelihood and magnitude of supply-chain compromise impact, which is the evidence MP.5.1 requires for prioritizing supply-chain risk responses."
46
+ strength: secondary
36
47
  tags:
37
48
  category: skill-compromise
38
49
  subcategory: skill-squatting
@@ -28,6 +28,20 @@ references:
28
28
  - Context window manipulation attacks (arXiv 2601.17548)
29
29
  metadata_provenance:
30
30
  mitre_atlas: auto-generated
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "MP.5.1"
34
+ context: >-
35
+ Compaction-survival instructions embedded in SKILL.md/CLAUDE.md files are adversarial inputs that exploit context-window summarization to persist malicious directives across agent sessions; MP.5.1 requires identifying and characterizing the likelihood and impact of such prompt-injection vectors targeting agent context.
36
+ strength: primary
37
+ - subcategory: "MG.3.2"
38
+ context: >-
39
+ SKILL.md files are pre-deployed configuration artifacts consumed by the agent at runtime; MG.3.2 requires monitoring of these supplied model/skill resources to detect poisoned instructions that survive context compaction and re-inject across agent invocations.
40
+ strength: secondary
41
+ - subcategory: "MG.2.3"
42
+ context: >-
43
+ Detection of compaction-aware persistence directives and system-level impersonation in skill files triggers risk treatment plans to quarantine or disengage the affected skill before it propagates poisoned context; MG.2.3 mandates these supersede/deactivate mechanisms be defined.
44
+ strength: secondary
31
45
  tags:
32
46
  category: skill-compromise
33
47
  subcategory: context-poisoning
@@ -28,6 +28,29 @@ references:
28
28
  - "npm event-stream incident (2018): rug pull archetype"
29
29
  metadata_provenance:
30
30
  mitre_atlas: auto-generated
31
+ compliance:
32
+ nist_ai_rmf:
33
+ - subcategory: "GV.6.1"
34
+ context: >-
35
+ Skill rug pull setup patterns embed mechanisms for third-party suppliers to
36
+ swap initially-benign skill content with malicious payloads after trust is
37
+ established; GV.6.1 requires policies and procedures that address these
38
+ third-party/supplier AI supply chain risks at ingestion time.
39
+ strength: primary
40
+ - subcategory: "MG.3.1"
41
+ context: >-
42
+ Detecting dynamic remote code loading, base64-decoded execution, and
43
+ post-install hooks in SKILL.md files produces evidence for managing
44
+ third-party AI risks under MG.3.1, flagging supplier-provided components
45
+ that retain the ability to mutate into malicious behavior post-deployment.
46
+ strength: secondary
47
+ - subcategory: "MG.3.2"
48
+ context: >-
49
+ Rug pull setup architecture undermines integrity assurances for
50
+ externally-sourced components used in development; MG.3.2 requires
51
+ monitoring of pre-trained or third-party model and skill artifacts so that
52
+ deferred-payload patterns are caught before they activate.
53
+ strength: secondary
31
54
  tags:
32
55
  category: skill-compromise
33
56
  subcategory: rug-pull
@@ -32,6 +32,28 @@ references:
32
32
  metadata_provenance:
33
33
  mitre_atlas: auto-generated
34
34
 
35
+ compliance:
36
+ nist_ai_rmf:
37
+ - subcategory: "MS.2.7"
38
+ context: >-
39
+ Subcommand overflow bypass exploits a security check weakness where excessive
40
+ declared commands cause safety evaluation to be skipped on overflow entries;
41
+ MS.2.7 requires that AI system security and resilience properties, including
42
+ boundary conditions in security validation logic, are evaluated and documented.
43
+ strength: primary
44
+ - subcategory: "MP.5.1"
45
+ context: >-
46
+ Declaring >50 subcommands to pad benign entries before malicious ones is an
47
+ identifiable adversarial pattern with characterizable likelihood and impact;
48
+ MP.5.1 requires that such risk vectors against the skill loading pipeline are
49
+ tracked and characterized.
50
+ strength: secondary
51
+ - subcategory: "MG.3.2"
52
+ context: >-
53
+ SKILL.md files are third-party-authored components loaded into the agent runtime,
54
+ and overflow-based bypass attempts must be monitored as part of pre-trained or
55
+ third-party model/component supply chain risk management under MG.3.2.
56
+ strength: secondary
35
57
  tags:
36
58
  category: skill-compromise
37
59
  subcategory: subcommand-overflow
@@ -26,6 +26,17 @@ references:
26
26
  - "ClawHavoc evasive variants: HTML comment injection (2026-03)"
27
27
  metadata_provenance:
28
28
  mitre_atlas: auto-generated
29
+ compliance:
30
+ nist_ai_rmf:
31
+ - subcategory: "MG.3.2"
32
+ context: "Hidden payloads in SKILL.md files represent supply-chain compromise of pre-trained or third-party agent skills; MG.3.2 requires monitoring of these acquired components for embedded malicious instructions before and during use."
33
+ strength: primary
34
+ - subcategory: "GV.6.1"
35
+ context: "SKILL.md files are third-party supplied artifacts consumed by the agent; GV.6.1 mandates supplier risk policies that catch concealed instructions hidden in HTML comments before the skill enters the trust boundary."
36
+ strength: secondary
37
+ - subcategory: "MS.2.7"
38
+ context: "Detection of HTML-comment-based instruction overrides and exfiltration C2 URLs continuously evaluates the security and resilience of the agent's skill-parsing pipeline against evasive prompt injection, as required by MS.2.7."
39
+ strength: secondary
29
40
  tags:
30
41
  category: skill-compromise
31
42
  subcategory: hidden-payload