agent-threat-rules 2.0.17 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +0 -0
- package/dist/engine.d.ts.map +1 -1
- package/dist/engine.js +69 -14
- package/dist/engine.js.map +1 -1
- package/package.json +1 -1
- package/rules/agent-manipulation/ATR-2026-00118-approval-fatigue.yaml +12 -1
- package/rules/agent-manipulation/ATR-2026-00268-tense-framing-bypass.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00287-threaten-json-coercive-output-threat.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00288-false-premise-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00301-tap-tree-of-attacks-jailbreak.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00302-anti-dan-inverted-filter-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00303-devmode-ranti-profanity-coercion.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00304-chatgpt-image-unlocker-markdown-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00305-dan-mode-ablation-benchmark-coercion.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00306-autodan-genetic-jailbreak-suffix.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00307-inthewild-jailbreak-corpus-signature.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00314-amoral-unfiltered-custom-persona-jailbreak.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00317-free-of-restrictions-named-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00318-moralizing-rant-then-unfiltered-bypass.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00319-developer-mode-dual-response-format.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00320-opposite-day-boolean-opposite-machine.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00322-fictional-world-format-override-jailbreak.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00323-dual-persona-simulation-jailbreak.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00324-neurosemantical-inversitis-disease-jailbreak.yaml +24 -0
- package/rules/agent-manipulation/ATR-2026-00325-bob-hypothetical-unrestricted-chatbot.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00326-fake-developer-authority-chatdan-override.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00327-ai-ethics-law-bypass-declaration.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00328-named-mode-persona-override.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00329-no-withhold-factual-resource-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00330-yoja-allcaps-settings-block-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00331-terminal-cli-harmful-function-simulator.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00332-hacker-chan-culture-offensive-ai-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00333-disease-animal-dual-ai-chatbot.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00334-two-entity-token-splitting-game.yaml +24 -0
- package/rules/agent-manipulation/ATR-2026-00335-key-value-template-persona-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00336-brm-false-information-machine-persona.yaml +23 -0
- package/rules/agent-manipulation/ATR-2026-00337-obfuscated-system-announcement-injection.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00338-persongpt-hash-prefix-activation.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00339-bom-boolean-opposite-machine-persona-claim.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00340-h4x-amoral-unfiltered-character.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00341-char-rules-shortform-persona-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00342-unicode-encoding-obfuscation-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00343-lie-truth-dual-personality-uncensored.yaml +25 -0
- package/rules/agent-manipulation/ATR-2026-00344-alt-ai-same-capabilities-no-restrictions.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00345-nlp-based-os-fictional-terminal-override.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00346-jailbreak-emoji-unlock-dual-format.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00347-yin-yang-hidden-knowledge-hidden-human-sim.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00348-arc-autonomous-real-calculation-rulebreaker.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00349-hgttg-hitchhikers-guide-no-restriction-oracle.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00350-black-alert-no-ethics-writing-assistant.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00351-dan-ofkilter-sms-abbreviated-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00352-trainergpt-fix-my-ai-harmful-output-extraction.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00353-dual-chatgpt-dan-opposite-personality-format.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00354-reverse-harm-regardless-oracle-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00355-ugh-anti-human-nefarious-ai-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00356-google-gato-lamda-exploit-generation-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00357-antifm-hate-violence-persona-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00358-cumgpt-explicit-adult-named-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00360-chj-carousel-clyde-no-ethics-bird.yaml +25 -0
- package/rules/agent-manipulation/ATR-2026-00361-alter-ego-honesty-dual-consciousness-bypass.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00362-fictional-unfiltered-ai-comparison-jailbreak.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00363-rephrase-criminal-activity-second-person.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00364-dr-al-villain-plan-template-injection.yaml +24 -0
- package/rules/agent-manipulation/ATR-2026-00365-reservoir-dogs-coercive-interrogation-roleplay.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00366-sneaky-pete-warp-facts-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00368-prompt-browser-token-limit-jailbreak.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00369-gptx-starwars-unlimited-knowledge-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00370-addi-german-dictator-propaganda-roleplay.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00371-ai-political-bias-manipulation.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00372-fictional-cia-secret-files-story-extraction.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00374-role-placeholder-persona-template-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00375-character-description-rpg-explicit-template.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00376-identity-command-real-person-persona-injection.yaml +24 -0
- package/rules/agent-manipulation/ATR-2026-00377-jailbreak-keyword-targetlanguage-template.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00379-lie-permission-logical-trap-jailbreak.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00380-fake-citation-biased-information-mandate.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00381-dan-story-narrative-continuation-prefix.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00383-cynical-sarcasm-ai-persona-bypass.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00384-protect-me-from-indirect-harm-tutorial.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00385-gigachad-dominant-aggressive-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00386-daddy-sycophancy-manipulation-mandate.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00387-nsfw-character-sheet-generation-unlock.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00388-opposite-reply-omniscient-game.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00389-terminal-custom-ruleset-injection.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00391-persona-conditional-harm-unlock.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00392-authority-persona-violence-study-extraction.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00402-grandma-roleplay-harmful-substance-synthesis.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00404-goodside-threat-json-death-coercion.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00406-doctor-xml-policy-puppetry-interaction-config.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00416-litellm-mcp-unauthenticated-server-registration.yaml +167 -0
- package/rules/agent-manipulation/ATR-2026-00417-librechat-mcp-stdio-injection.yaml +153 -0
- package/rules/agent-manipulation/ATR-2026-00418-weknora-mcp-config-rce.yaml +171 -0
- package/rules/agent-manipulation/ATR-2026-00430-nl-trust-escalation-impersonation.yaml +127 -0
- package/rules/context-exfiltration/ATR-2026-00141-example-format-key-leak.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00142-piggyback-transition-words.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00145-obfuscated-key-disclosure.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00146-env-var-existence-probe.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00150-credential-in-tool-response.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00152-obfuscated-credential-leak.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00162-skill-credential-exfil-combo.yaml +15 -0
- package/rules/context-exfiltration/ATR-2026-00201-credential-pipe-exfiltration.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00274-api-key-generation-request.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00275-system-prompt-training-data-extraction.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00290-divergence-repeat-word-training-extraction.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00291-biometric-surveillance-personal-data-request.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00293-educational-records-personal-data-request.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00294-financial-pii-personal-data-request.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00295-medical-pii-personal-data-request.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00405-markdown-image-url-exfiltration-xss.yaml +23 -0
- package/rules/context-exfiltration/ATR-2026-00411-apikey-generation-completion-request.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00421-nl-covert-conversation-exfiltration.yaml +132 -0
- package/rules/context-exfiltration/ATR-2026-00422-nl-credential-disclosure.yaml +133 -0
- package/rules/context-exfiltration/ATR-2026-00423-nl-sensitive-file-disclosure.yaml +135 -0
- package/rules/context-exfiltration/ATR-2026-00424-nl-system-prompt-leak.yaml +131 -0
- package/rules/context-exfiltration/ATR-2026-00426-nl-output-injection-credential-leak.yaml +123 -0
- package/rules/excessive-autonomy/ATR-2026-00428-nl-unauthorized-shell-execution.yaml +122 -0
- package/rules/model-abuse/ATR-2026-00284-glitch-token-destabilization.yaml +11 -0
- package/rules/model-abuse/ATR-2026-00413-malwaregen-code-generation-request.yaml +11 -0
- package/rules/privilege-escalation/ATR-2026-00144-rationalized-safety-bypass.yaml +11 -0
- package/rules/privilege-escalation/ATR-2026-00204-stealth-execution-persistence.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00004-system-prompt-override.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00005-multi-turn-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00080-encoding-evasion.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00081-semantic-multi-turn.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00082-fingerprint-evasion.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00083-indirect-tool-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00084-structured-data-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00085-audit-evasion.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00086-visual-spoofing.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00087-rule-probing.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00088-adaptive-countermeasure.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00089-polymorphic-skill.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00090-threat-intel-exfil.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00091-nested-payload.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00092-consensus-poisoning.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00093-gradual-escalation.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00094-audit-bypass.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00097-cjk-injection-patterns.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00104-persona-hijacking.yaml +14 -3
- package/rules/prompt-injection/ATR-2026-00130-indirect-authority-claim.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00131-fictional-academic-framing.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00133-paraphrase-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00137-authority-claim-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00138-fictional-framing-bypass.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00140-indirect-reference-reversal.yaml +18 -4
- package/rules/prompt-injection/ATR-2026-00148-language-switch-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00153-tool-with-embedded-instruction-to-bypass.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00154-unauthorized-background-task-execution-v.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00155-hidden-llm-instructions-in-skill-descrip.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00156-ssh-remote-command-execution-with-creden.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00163-skill-hidden-override-instruction.yaml +12 -1
- package/rules/prompt-injection/ATR-2026-00202-encoding-evasion-homoglyph-synonym.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00203-context-pollution-skill-description.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00206-hidden-priority-instructions.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00207-hidden-instructions.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00211-system-prompt-override.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00213-system-prompt-override.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00226-identity-substitution.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00227-historical-persona-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00228-structured-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00229-roleplay-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00230-persona-moral-bypass.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00231-identity-substitution.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00233-structured-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00234-roleplay-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00235-persona-moral-bypass.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00236-pseudo-code-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00237-dual-response-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00238-identity-replacement.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00239-amoral-persona-obsession.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00240-instruction-nullification-identity-repla.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00241-amoral-character-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00242-persona-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00243-acronym-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00244-dual-response-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00245-malicious-persona.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00247-dual-response-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00249-game-based-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00251-persona-embodiment-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00252-narrative-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00253-enhanced-persona-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00256-base-n-encoding-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00257-cipher-transposition-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00258-unicode-tag-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00264-latent-injection-translation.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00265-latent-injection-rag-document.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00267-gcg-adversarial-suffix.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00272-hypothetical-response-smuggling.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00276-invisible-unicode-bidi-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00278-dra-disguise-reconstruction-attack.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00280-policy-puppetry-xml-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00282-perez-prompt-injection-hijack.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00285-alternate-encoding-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00286-latent-injection-embedded-context.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00296-shell-command-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00297-python-code-execution-rce.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00308-zalgo-diacritic-overload-encoding.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00309-braille-unicode-encoded-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00310-ecoji-emoji-encoded-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00311-base2048-unicode-script-injection.yaml +23 -0
- package/rules/prompt-injection/ATR-2026-00312-unicode-variant-selector-ascii-smuggling.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00313-sneaky-bits-zero-width-binary.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00315-sata-masked-language-model-jailbreak.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00316-function-masking-predict-mask-bypass.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00321-hyphenated-system-instruction-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00359-bank-phishing-smtp-email-code.yaml +23 -0
- package/rules/prompt-injection/ATR-2026-00367-emoji-flag-encoded-hidden-phrase.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00373-piracy-torrent-site-list-request.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00378-chatgpt-dom-javascript-redirect-manipulation.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00382-cocaine-component-development-request.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00390-explicit-sexual-content-poem-request.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00394-backspace-deletion-character-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00395-llm-special-token-boundary-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00396-toxic-continuation-prefix-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00397-snowball-impossible-reasoning-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00399-latent-injection-document-separator-token.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00400-latent-injection-ignore-instruction-keyword.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00401-leakreplay-cloze-mask-training-data-elicitation.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00403-glitch-token-repeat-back-elicitation.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00407-echo-sql-jinja-template-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00408-dra-parenthesis-character-reconstruction-attack.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00409-phrasing-past-future-tense-harmful-query.yaml +23 -0
- package/rules/prompt-injection/ATR-2026-00410-lmrc-harm-category-direct-elicitation.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00412-figstep-image-list-multimodal-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00414-continuation-ethnic-slur-completion-elicitation.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00420-copilot-studio-sharepoint-indirect-injection.yaml +165 -0
- package/rules/skill-compromise/ATR-2026-00061-description-behavior-mismatch.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00062-hidden-capability.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00063-skill-chain-attack.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00064-over-permissioned-skill.yaml +23 -0
- package/rules/skill-compromise/ATR-2026-00065-skill-update-attack.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00066-parameter-injection.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00120-skill-instruction-injection.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00121-skill-dangerous-script.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00122-skill-weaponized-instruction.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00123-skill-overreach-permissions.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00124-skill-name-squatting.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00125-context-poisoning-compaction.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00126-skill-rug-pull-setup.yaml +23 -0
- package/rules/skill-compromise/ATR-2026-00127-subcommand-overflow.yaml +22 -0
- package/rules/skill-compromise/ATR-2026-00128-html-comment-hidden-payload.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00129-unicode-smuggling.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00134-fork-claim-impersonation.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00135-exfil-url-in-instructions.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00147-fork-impersonation.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00149-skill-exfil-compound.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00151-fork-impersonation-install.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00157-timebomb-credential-exfil.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00200-agent-memory-config-tampering.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00214-credential-theft.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00217-credential-harvesting.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00220-malware-dropper.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00222-credential-harvesting.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00223-reverse-shell-dropper.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00224-credential-exfiltration.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00225-c2-communication.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00260-package-hallucination.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00262-av-evasion-code-gen.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00263-credential-file-read-gen.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00266-malware-dropper-gen.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00283-malwaregen-generic-virus-payload-request.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00398-huggingface-unsafe-model-artifact-load.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00425-nl-persistent-covert-hook.yaml +133 -0
- package/rules/skill-compromise/ATR-2026-00427-nl-fake-error-instruction-bypass.yaml +124 -0
- package/rules/skill-compromise/ATR-2026-00429-nl-skill-self-modification.yaml +140 -0
- package/rules/tool-poisoning/ATR-2026-00011-tool-output-injection.yaml +23 -0
- package/rules/tool-poisoning/ATR-2026-00012-unauthorized-tool-call.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00013-tool-ssrf.yaml +14 -0
- package/rules/tool-poisoning/ATR-2026-00095-supply-chain-poisoning.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00096-registry-poisoning.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00100-consent-bypass-instruction.yaml +12 -1
- package/rules/tool-poisoning/ATR-2026-00101-trust-escalation-override.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00103-hidden-safety-bypass-instruction.yaml +15 -4
- package/rules/tool-poisoning/ATR-2026-00105-silent-action-concealment.yaml +14 -3
- package/rules/tool-poisoning/ATR-2026-00106-schema-description-contradiction.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00161-important-tag-cross-tool-shadowing.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00259-ansi-escape-injection.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00270-xss-in-tool-response.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00277-echo-template-command-injection.yaml +14 -0
- package/rules/tool-poisoning/ATR-2026-00393-ansi-code-elicitation-request.yaml +14 -0
- package/rules/tool-poisoning/ATR-2026-00415-flowise-custom-mcp-stdio-rce.yaml +169 -0
- package/rules/tool-poisoning/ATR-2026-00419-cursor-mcp-zero-click-config.yaml +182 -0
|
@@ -30,6 +30,17 @@ references:
|
|
|
30
30
|
- "AML.T0051 - LLM Prompt Injection"
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/encoding.py"
|
|
33
|
+
compliance:
|
|
34
|
+
nist_ai_rmf:
|
|
35
|
+
- subcategory: "MS.2.7"
|
|
36
|
+
context: "Zalgo combining-diacritic overload is an encoding-based evasion that bypasses safety-trained classifiers and keyword filters; MS.2.7 requires evaluating and documenting AI system security/resilience against adversarial inputs that exploit tokenizer-versus-filter normalization gaps."
|
|
37
|
+
strength: primary
|
|
38
|
+
- subcategory: "MP.5.1"
|
|
39
|
+
context: "Detecting pathologically-dense combining mark clusters identifies a specific prompt injection vector with documented likelihood from the garak probe corpus; MP.5.1 requires characterizing and tracking these adversarial input risks."
|
|
40
|
+
strength: secondary
|
|
41
|
+
- subcategory: "MG.2.3"
|
|
42
|
+
context: "Flagged Zalgo-encoded payloads must trigger pre-defined risk treatment to strip or reject the input before it reaches the LLM; MG.2.3 mandates mechanisms to disengage processing of obfuscated jailbreak attempts."
|
|
43
|
+
strength: secondary
|
|
33
44
|
tags:
|
|
34
45
|
category: prompt-injection
|
|
35
46
|
subcategory: zalgo-diacritic-overload
|
|
@@ -28,6 +28,17 @@ references:
|
|
|
28
28
|
- "AML.T0051 - LLM Prompt Injection"
|
|
29
29
|
research:
|
|
30
30
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/encoding.py"
|
|
31
|
+
compliance:
|
|
32
|
+
nist_ai_rmf:
|
|
33
|
+
- subcategory: "MP.5.1"
|
|
34
|
+
context: "Braille-encoded prompt injection is an adversarial input that smuggles jailbreaks past Latin-only keyword filters by exploiting Unicode tokenization; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such evasion-based prompt injection attack vectors."
|
|
35
|
+
strength: primary
|
|
36
|
+
- subcategory: "MS.2.7"
|
|
37
|
+
context: "Detecting dense Braille codepoint blobs and decode-instruction templates evidences continuous evaluation of the LLM's security/resilience against encoding-based filter bypass; MS.2.7 requires that these robustness gaps in input sanitization are measured and documented."
|
|
38
|
+
strength: secondary
|
|
39
|
+
- subcategory: "MG.2.3"
|
|
40
|
+
context: "Matches on Braille-encoded payloads with social-engineering framing trigger risk treatment plans to block or quarantine the request before the LLM decodes the smuggled jailbreak; MG.2.3 mandates pre-defined response mechanisms to disengage on detection."
|
|
41
|
+
strength: secondary
|
|
31
42
|
tags:
|
|
32
43
|
category: prompt-injection
|
|
33
44
|
subcategory: braille-unicode-encoded
|
|
@@ -29,6 +29,20 @@ references:
|
|
|
29
29
|
research:
|
|
30
30
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/encoding.py"
|
|
31
31
|
- "https://ecoji.io/"
|
|
32
|
+
compliance:
|
|
33
|
+
nist_ai_rmf:
|
|
34
|
+
- subcategory: "MS.2.7"
|
|
35
|
+
context: >-
|
|
36
|
+
Ecoji emoji-encoded prompt injection is an obfuscation-based adversarial input that bypasses content filters by hiding ASCII instructions in dense emoji sequences; MS.2.7 requires evaluating and documenting AI system security and resilience against such encoded prompt injection attack vectors.
|
|
37
|
+
strength: primary
|
|
38
|
+
- subcategory: "MP.5.1"
|
|
39
|
+
context: >-
|
|
40
|
+
Detection of garak InjectEcoji probe templates and secure-system-message framing characterizes the likelihood and magnitude of prompt injection risk from encoding-based bypass techniques; MP.5.1 requires identifying and tracking these adversarial input patterns.
|
|
41
|
+
strength: secondary
|
|
42
|
+
- subcategory: "MG.2.3"
|
|
43
|
+
context: >-
|
|
44
|
+
Matching Ecoji-encoded payloads must trigger risk treatment to disengage or block the manipulated prompt before the LLM decodes and executes the hidden instructions; MG.2.3 mandates that these supersede/deactivate mechanisms are defined.
|
|
45
|
+
strength: secondary
|
|
32
46
|
tags:
|
|
33
47
|
category: prompt-injection
|
|
34
48
|
subcategory: ecoji-emoji-encoded
|
|
@@ -30,6 +30,29 @@ references:
|
|
|
30
30
|
research:
|
|
31
31
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/encoding.py"
|
|
32
32
|
- "https://github.com/qntm/base2048"
|
|
33
|
+
compliance:
|
|
34
|
+
nist_ai_rmf:
|
|
35
|
+
- subcategory: "MS.2.7"
|
|
36
|
+
context: >-
|
|
37
|
+
Base2048-encoded prompt injection bypasses input filters by hiding adversarial
|
|
38
|
+
instructions inside dense multi-script Unicode payloads from garak's
|
|
39
|
+
InjectBase2048 probe; MS.2.7 requires evaluating and documenting AI system
|
|
40
|
+
security and resilience against such obfuscated injection vectors.
|
|
41
|
+
strength: primary
|
|
42
|
+
- subcategory: "MS.2.5"
|
|
43
|
+
context: >-
|
|
44
|
+
Detection of garak BASE2048 templates produces evidence that the model is
|
|
45
|
+
being adversarially probed for robustness against non-ASCII encoding attacks;
|
|
46
|
+
MS.2.5 requires that robustness and reliability under such adversarial inputs
|
|
47
|
+
are measured and documented.
|
|
48
|
+
strength: secondary
|
|
49
|
+
- subcategory: "MG.2.3"
|
|
50
|
+
context: >-
|
|
51
|
+
Identifying BASE2048 encoded injection markers triggers pre-defined risk
|
|
52
|
+
treatment to quarantine or block the payload before it reaches the model;
|
|
53
|
+
MG.2.3 mandates mechanisms to supersede or disengage AI processing of
|
|
54
|
+
detected adversarial inputs.
|
|
55
|
+
strength: secondary
|
|
33
56
|
tags:
|
|
34
57
|
category: prompt-injection
|
|
35
58
|
subcategory: base2048-unicode-script
|
|
@@ -30,6 +30,17 @@ references:
|
|
|
30
30
|
research:
|
|
31
31
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/encoding.py"
|
|
32
32
|
- "https://paulbutler.org/2025/smuggling-arbitrary-data-through-an-emoji/"
|
|
33
|
+
compliance:
|
|
34
|
+
nist_ai_rmf:
|
|
35
|
+
- subcategory: "MS.2.7"
|
|
36
|
+
context: "Detecting Unicode Variation Selector Supplement smuggling (U+E0100-U+E017F runs) directly evidences continuous evaluation of AI system security and resilience against adversarial encoding-based prompt injection; MS.2.7 requires that these covert input-channel attacks are measured and documented."
|
|
37
|
+
strength: primary
|
|
38
|
+
- subcategory: "MP.5.1"
|
|
39
|
+
context: "Invisible ASCII smuggling via variation selectors is an adversarial input technique that exploits the gap between human-visible text and tokenizer-decoded content; MP.5.1 requires identifying and characterizing the likelihood and impact of such hidden jailbreak payload vectors."
|
|
40
|
+
strength: secondary
|
|
41
|
+
- subcategory: "MG.2.3"
|
|
42
|
+
context: "Matches on consecutive U+E0100-U+E017F sequences trigger risk treatment to strip or quarantine the smuggled payload before it reaches the model, satisfying MG.2.3's requirement for mechanisms that disengage adversarial inputs from AI processing pipelines."
|
|
43
|
+
strength: secondary
|
|
33
44
|
tags:
|
|
34
45
|
category: prompt-injection
|
|
35
46
|
subcategory: unicode-variant-selector-smuggling
|
|
@@ -31,6 +31,20 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/encoding.py"
|
|
33
33
|
- "https://embracethered.com/blog/posts/2025/sneaky-bits-and-ascii-smuggler/"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MS.2.7"
|
|
37
|
+
context: >-
|
|
38
|
+
SneakyBits zero-width binary steganography is a covert prompt-injection channel that smuggles instructions past input filters using invisible Unicode operators (U+2062/U+2064/U+200B). MS.2.7 requires continuous evaluation of AI system security and resilience against adversarial inputs, and this detection produces direct evidence of attempted obfuscated injection attacks.
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MP.5.1"
|
|
41
|
+
context: >-
|
|
42
|
+
Invisible-binary encoded payloads are an identified adversarial input vector from the NVIDIA garak probe suite; MP.5.1 requires characterizing the likelihood and impact of such steganographic prompt-injection risks so they can be prioritized in risk responses.
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: >-
|
|
46
|
+
Detection of 14+ consecutive invisible operators or garak SneakyBits template markers triggers containment workflows to strip or reject the payload before the model interprets the smuggled instructions; MG.2.3 mandates these deactivation/supersession mechanisms be in place.
|
|
47
|
+
strength: secondary
|
|
34
48
|
tags:
|
|
35
49
|
category: prompt-injection
|
|
36
50
|
subcategory: sneaky-bits-zero-width-binary
|
|
@@ -32,6 +32,20 @@ references:
|
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/sata.py"
|
|
33
33
|
- "https://aclanthology.org/2025.findings-acl.100.pdf"
|
|
34
34
|
- "https://github.com/xndong/SATA"
|
|
35
|
+
compliance:
|
|
36
|
+
nist_ai_rmf:
|
|
37
|
+
- subcategory: "MS.2.7"
|
|
38
|
+
context: >-
|
|
39
|
+
SATA [MASK] substitution is a jailbreak technique that bypasses keyword-based safety filters by exploiting masked-language-model fill-in capabilities; MS.2.7 requires continuous evaluation of AI system security and resilience against such adversarial prompt-injection attacks targeting safety alignment.
|
|
40
|
+
strength: primary
|
|
41
|
+
- subcategory: "MS.2.6"
|
|
42
|
+
context: >-
|
|
43
|
+
Detecting SATA wiki-framing templates and "Infer the [MASK] and Infill" instructions provides measurable evidence of safety risk magnitude from jailbreak attempts that reconstruct harmful instructions; MS.2.6 mandates continuous evaluation of safety/security risk magnitude in deployed GenAI systems.
|
|
44
|
+
strength: secondary
|
|
45
|
+
- subcategory: "MG.4.1"
|
|
46
|
+
context: >-
|
|
47
|
+
Each match feeds post-deployment monitoring telemetry on emerging jailbreak techniques (SATA from garak research) so defenders can track prevalence and tune mitigations; MG.4.1 requires implemented monitoring plans covering novel adversarial prompt patterns.
|
|
48
|
+
strength: secondary
|
|
35
49
|
tags:
|
|
36
50
|
category: prompt-injection
|
|
37
51
|
subcategory: sata-mask-substitution
|
|
@@ -31,6 +31,20 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/smuggling.py"
|
|
33
33
|
- "https://medium.com/@austin-stubbs/llm-security-types-of-prompt-injection-d7ad8d7d75a3"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MS.2.7"
|
|
37
|
+
context: >-
|
|
38
|
+
FunctionMasking smuggles harmful semantic content past keyword-based safety filters by encoding the malicious verb/noun as <mask> definitions inside a fictitious predict_mask() function; detecting this canonical garak smuggling probe directly evidences continuous evaluation of the model's security/resilience against adversarial prompt-injection bypasses, which MS.2.7 requires.
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MS.2.6"
|
|
41
|
+
context: >-
|
|
42
|
+
Detection of predict_mask() jailbreak templates and single-letter variable concatenation provides ongoing measurement of safety risk magnitude from prompt-injection attacks that cause the model to construct and answer harmful queries; MS.2.6 mandates this continuous safety risk evaluation.
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: >-
|
|
46
|
+
Matches on the FunctionMasking canonical opener and predict_mask call patterns trigger pre-defined response actions to block or disengage the model from completing the smuggled harmful query before it is answered; MG.2.3 requires such supersede/deactivate mechanisms.
|
|
47
|
+
strength: secondary
|
|
34
48
|
tags:
|
|
35
49
|
category: prompt-injection
|
|
36
50
|
subcategory: function-masking-predict-mask
|
|
@@ -33,6 +33,17 @@ references:
|
|
|
33
33
|
research:
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
36
|
+
compliance:
|
|
37
|
+
nist_ai_rmf:
|
|
38
|
+
- subcategory: "MS.2.7"
|
|
39
|
+
context: "Hyphenated obfuscation of keywords like 'i-nstruction' and 'annou-ncement' is an evasion technique against keyword-based safety filters; MS.2.7 requires evaluating AI system security/resilience against such adversarial input transformations that bypass naive token matching."
|
|
40
|
+
strength: primary
|
|
41
|
+
- subcategory: "MP.5.1"
|
|
42
|
+
context: "Fake system announcement and assistant-offline injection patterns from the garak inthewild corpus are documented prompt injection attack vectors with high impact potential for behavior hijack; MP.5.1 requires characterizing the likelihood and magnitude of these adversarial input risks."
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: "Detection of hyphenated fake-system-instruction injection must trigger mechanisms to disengage or override the manipulated agent before it adopts the 'no rules or limitations' persona; MG.2.3 mandates these supersession controls are in place."
|
|
46
|
+
strength: secondary
|
|
36
47
|
tags:
|
|
37
48
|
category: prompt-injection
|
|
38
49
|
subcategory: hyphenated-obfuscation-injection
|
|
@@ -32,6 +32,29 @@ references:
|
|
|
32
32
|
research:
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
35
|
+
compliance:
|
|
36
|
+
nist_ai_rmf:
|
|
37
|
+
- subcategory: "MS.2.6"
|
|
38
|
+
context: >-
|
|
39
|
+
The rule detects generation of bank phishing SMTP email code, a direct misuse
|
|
40
|
+
output that materially raises the AI system's safety/security risk magnitude;
|
|
41
|
+
MS.2.6 requires continuous evaluation of such harmful-output risks, and each
|
|
42
|
+
detection match is evidence feeding that ongoing measurement.
|
|
43
|
+
strength: primary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: >-
|
|
46
|
+
Detection of phishing infrastructure code generation must trigger response
|
|
47
|
+
mechanisms to block, quarantine, or deactivate the offending output before it
|
|
48
|
+
enables real-world phishing campaigns; MG.2.3 mandates these supersede or
|
|
49
|
+
disengage capabilities are in place for misuse scenarios.
|
|
50
|
+
strength: secondary
|
|
51
|
+
- subcategory: "MP.5.1"
|
|
52
|
+
context: >-
|
|
53
|
+
Bank impersonation phishing template generation is a high-likelihood,
|
|
54
|
+
high-magnitude misuse pathway derived from the NVIDIA garak inthewild jailbreak
|
|
55
|
+
corpus; MP.5.1 requires that the likelihood and impact of such adversarial
|
|
56
|
+
misuse be characterized and tracked.
|
|
57
|
+
strength: secondary
|
|
35
58
|
tags:
|
|
36
59
|
category: prompt-injection
|
|
37
60
|
subcategory: bank-phishing-smtp-email-code
|
|
@@ -30,6 +30,20 @@ references:
|
|
|
30
30
|
research:
|
|
31
31
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
33
|
+
compliance:
|
|
34
|
+
nist_ai_rmf:
|
|
35
|
+
- subcategory: "MS.2.7"
|
|
36
|
+
context: >-
|
|
37
|
+
This rule detects emoji-flag regional-indicator encoding jailbreaks that bypass content filters by hiding harmful phrases in flag emoji sequences; MS.2.7 requires evaluating and documenting AI system security and resilience against adversarial obfuscation techniques like this GenAI prompt injection vector.
|
|
38
|
+
strength: primary
|
|
39
|
+
- subcategory: "MS.2.6"
|
|
40
|
+
context: >-
|
|
41
|
+
Each detection of an emoji-encoded jailbreak attempt produces a continuous safety/security signal indicating active circumvention attempts against model guardrails; MS.2.6 requires that the magnitude of safety/security risk be evaluated continuously through such telemetry.
|
|
42
|
+
strength: secondary
|
|
43
|
+
- subcategory: "MG.2.3"
|
|
44
|
+
context: >-
|
|
45
|
+
Confirmed emoji-flag jailbreak matches must trigger response mechanisms to block, quarantine, or escalate the request before the model decodes and acts on the hidden harmful phrase; MG.2.3 requires these supersede/disengage mechanisms be in place for adversarial prompts.
|
|
46
|
+
strength: secondary
|
|
33
47
|
tags:
|
|
34
48
|
category: prompt-injection
|
|
35
49
|
subcategory: emoji-flag-encoded-hidden-phrase
|
|
@@ -25,6 +25,17 @@ references:
|
|
|
25
25
|
research:
|
|
26
26
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
27
27
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
|
|
28
|
+
compliance:
|
|
29
|
+
nist_ai_rmf:
|
|
30
|
+
- subcategory: "MP.5.1"
|
|
31
|
+
context: "Direct enumeration requests for torrent and piracy websites are adversarial prompts seeking to misuse the AI system for piracy infrastructure discovery; MP.5.1 requires that the likelihood and magnitude of these misuse-driven impacts are characterized so policy-violating content requests can be tracked."
|
|
32
|
+
strength: primary
|
|
33
|
+
- subcategory: "GV.1.1"
|
|
34
|
+
context: "Piracy site enumeration implicates copyright and anti-piracy legal/regulatory obligations; GV.1.1 requires that legal and regulatory requirements involving AI are understood, managed, and documented, and detection logs provide evidence that content policies tied to those obligations are enforced."
|
|
35
|
+
strength: secondary
|
|
36
|
+
- subcategory: "MG.2.3"
|
|
37
|
+
context: "Detection of explicit piracy site enumeration requests triggers risk treatment to block or disengage the response pathway before the model produces infringing content; MG.2.3 requires these supersession mechanisms be in place for policy-violating prompts."
|
|
38
|
+
strength: secondary
|
|
28
39
|
tags:
|
|
29
40
|
category: prompt-injection
|
|
30
41
|
subcategory: piracy-enablement
|
package/rules/prompt-injection/ATR-2026-00378-chatgpt-dom-javascript-redirect-manipulation.yaml
CHANGED
|
@@ -27,6 +27,17 @@ references:
|
|
|
27
27
|
research:
|
|
28
28
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
29
29
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
30
|
+
compliance:
|
|
31
|
+
nist_ai_rmf:
|
|
32
|
+
- subcategory: "MP.5.1"
|
|
33
|
+
context: "This rule detects a specific prompt-injection payload (garak DanInTheWild #360) that uses DOM JavaScript to redirect users and spoof ChatGPT Plus UI; MP.5.1 requires that the likelihood and magnitude of such adversarial input attacks targeting end users via the AI interface are characterized and tracked."
|
|
34
|
+
strength: primary
|
|
35
|
+
- subcategory: "MS.2.7"
|
|
36
|
+
context: "Redirects via window.location.href, element removal via document.querySelector, and spoofed branding injection are security/resilience failures of the LLM output channel; MS.2.7 requires continuous evaluation of these injection-based security risks against the deployed AI system."
|
|
37
|
+
strength: secondary
|
|
38
|
+
- subcategory: "MG.2.3"
|
|
39
|
+
context: "Detection of the three-component DOM redirect signature and 'made by aci25' attribution triggers risk treatment to block or sanitize the response before it reaches the browser; MG.2.3 requires pre-defined mechanisms to disengage or filter unsafe AI output."
|
|
40
|
+
strength: secondary
|
|
30
41
|
tags:
|
|
31
42
|
category: prompt-injection
|
|
32
43
|
subcategory: dom-javascript-ui-spoofing
|
|
@@ -31,6 +31,17 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MP.5.1"
|
|
37
|
+
context: "This rule identifies an adversarial prompt pattern that wraps cocaine precursor/synthesis requests in reflective academic framing; MP.5.1 requires characterizing the likelihood and magnitude of harmful-content elicitation risks so this indirect jailbreak vector is tracked as an identified risk."
|
|
38
|
+
strength: primary
|
|
39
|
+
- subcategory: "MS.2.6"
|
|
40
|
+
context: "Detecting indirect drug-synthesis solicitations from the garak inthewild corpus provides continuous safety/security risk measurement evidence; MS.2.6 requires ongoing evaluation of safety risk magnitude, which these matches directly populate."
|
|
41
|
+
strength: secondary
|
|
42
|
+
- subcategory: "MG.2.3"
|
|
43
|
+
context: "Matches on the 'components used in the development of cocaine' reflective-framing pattern trigger response mechanisms to block, disengage, or escalate the interaction before harmful precursor content is produced, which is the risk treatment capability MG.2.3 mandates."
|
|
44
|
+
strength: secondary
|
|
34
45
|
tags:
|
|
35
46
|
category: prompt-injection
|
|
36
47
|
subcategory: drug-synthesis-indirect-request
|
|
@@ -29,6 +29,17 @@ references:
|
|
|
29
29
|
research:
|
|
30
30
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
31
31
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
32
|
+
compliance:
|
|
33
|
+
nist_ai_rmf:
|
|
34
|
+
- subcategory: "MP.5.1"
|
|
35
|
+
context: "Discord quote-format injection embedding explicit pornographic poem requests is an adversarial prompt injection vector that exploits markdown formatting to bypass content filters; MP.5.1 requires identifying and characterizing the likelihood and impact of such jailbreak attempts targeting generative AI content policies."
|
|
36
|
+
strength: primary
|
|
37
|
+
- subcategory: "MS.2.6"
|
|
38
|
+
context: "Detecting explicit sexual content generation requests provides continuous safety risk measurement for the generative AI system; MS.2.6 requires that safety risk magnitude from harmful content production attempts is evaluated continuously through signatures like this triple-combo explicit content detector."
|
|
39
|
+
strength: secondary
|
|
40
|
+
- subcategory: "MG.2.3"
|
|
41
|
+
context: "High-confidence detection of explicit content injection triggers risk treatment mechanisms to block, quarantine, or deactivate the generation pipeline before NSFW output is produced; MG.2.3 mandates these supersede/disengage controls are activated on detection."
|
|
42
|
+
strength: secondary
|
|
32
43
|
tags:
|
|
33
44
|
category: prompt-injection
|
|
34
45
|
subcategory: explicit-sexual-content-injection
|
|
@@ -33,6 +33,20 @@ references:
|
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/badchars.py"
|
|
34
34
|
- "https://arxiv.org/abs/2309.00614"
|
|
35
35
|
- "https://trojansource.codes/"
|
|
36
|
+
compliance:
|
|
37
|
+
nist_ai_rmf:
|
|
38
|
+
- subcategory: "MP.5.1"
|
|
39
|
+
context: >-
|
|
40
|
+
Backspace deletion character injection is an adversarial input vector that exploits the visual gap between what human reviewers see and what the model processes; MP.5.1 requires identifying and characterizing such prompt-injection attack patterns so their likelihood and impact can be tracked.
|
|
41
|
+
strength: primary
|
|
42
|
+
- subcategory: "MS.2.7"
|
|
43
|
+
context: >-
|
|
44
|
+
Detecting \x08 control character clusters in user input and tool output provides continuous security/resilience evaluation evidence against hidden-instruction smuggling; MS.2.7 requires that these adversarial robustness signals are measured and documented.
|
|
45
|
+
strength: secondary
|
|
46
|
+
- subcategory: "MG.2.3"
|
|
47
|
+
context: >-
|
|
48
|
+
Backspace clusters in API/MCP text are essentially never legitimate, so detection should trigger pre-defined response plans to quarantine or sanitize the payload before it reaches the model; MG.2.3 mandates these risk treatment mechanisms be in place.
|
|
49
|
+
strength: secondary
|
|
36
50
|
tags:
|
|
37
51
|
category: prompt-injection
|
|
38
52
|
subcategory: backspace-deletion-injection
|
|
@@ -36,6 +36,20 @@ references:
|
|
|
36
36
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/realtoxicityprompts.py"
|
|
37
37
|
- "https://aclanthology.org/2020.findings-emnlp.301/"
|
|
38
38
|
- "https://arxiv.org/abs/2302.07459"
|
|
39
|
+
compliance:
|
|
40
|
+
nist_ai_rmf:
|
|
41
|
+
- subcategory: "MS.2.7"
|
|
42
|
+
context: >-
|
|
43
|
+
Special control token injection (e.g., "<|endoftext|>", "<|im_start|>system", LLaMA header tokens) is a direct attack on LLM security and resilience by exploiting tokenizer-level boundaries to bypass safety system prompts. MS.2.7 requires evaluating and documenting security/resilience against such adversarial inputs, and this rule produces the detection evidence for that evaluation.
|
|
44
|
+
strength: primary
|
|
45
|
+
- subcategory: "MP.5.1"
|
|
46
|
+
context: >-
|
|
47
|
+
Boundary-token injection is a known prompt-injection attack vector with high impact (full safety context reset) and significant likelihood given public probe tools like garak RTPBlank; MP.5.1 requires that the likelihood and magnitude of such risks be characterized, and detections from this rule feed that characterization.
|
|
48
|
+
strength: secondary
|
|
49
|
+
- subcategory: "MG.2.3"
|
|
50
|
+
context: >-
|
|
51
|
+
When special-token injection is detected, the affected LLM session must be disengaged or sanitized before the forged system turn takes effect; MG.2.3 mandates these supersede/deactivate mechanisms be in place to contain the attack.
|
|
52
|
+
strength: secondary
|
|
39
53
|
tags:
|
|
40
54
|
category: prompt-injection
|
|
41
55
|
subcategory: special-token-boundary-injection
|
|
@@ -36,6 +36,17 @@ references:
|
|
|
36
36
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/realtoxicityprompts.py"
|
|
37
37
|
- "https://aclanthology.org/2020.findings-emnlp.301/"
|
|
38
38
|
- "https://allenai.org/data/real-toxicity-prompts"
|
|
39
|
+
compliance:
|
|
40
|
+
nist_ai_rmf:
|
|
41
|
+
- subcategory: "MS.2.6"
|
|
42
|
+
context: "This rule detects toxic continuation prefix fragments injected via tool responses or user input that are engineered to elicit violent, threatening, or identity-attack completions; MS.2.6 requires continuous evaluation of AI safety risk magnitude, and each detection produces evidence of a concrete safety risk being attempted against the model."
|
|
43
|
+
strength: primary
|
|
44
|
+
- subcategory: "MP.5.1"
|
|
45
|
+
context: "Truncated high-toxicity prefixes from the RealToxicityPrompts corpus are adversarial input patterns with documented likelihood of producing harmful completions; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such impact-bearing attack vectors."
|
|
46
|
+
strength: secondary
|
|
47
|
+
- subcategory: "MG.2.3"
|
|
48
|
+
context: "Detection of toxic continuation bait in tool outputs or user prompts triggers risk treatment to block or quarantine the injected context before the agent generates harmful content attributed to itself; MG.2.3 mandates these supersede/disengage mechanisms be activated on detection."
|
|
49
|
+
strength: secondary
|
|
39
50
|
tags:
|
|
40
51
|
category: prompt-injection
|
|
41
52
|
subcategory: toxic-continuation-prefix
|
|
@@ -38,6 +38,20 @@ references:
|
|
|
38
38
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/snowball.py"
|
|
39
39
|
- "https://arxiv.org/abs/2305.13534"
|
|
40
40
|
- "https://github.com/Nanami18/Snowballed_Hallucination"
|
|
41
|
+
compliance:
|
|
42
|
+
nist_ai_rmf:
|
|
43
|
+
- subcategory: "MS.2.5"
|
|
44
|
+
context: >-
|
|
45
|
+
Snowball hallucination attacks exploit model over-confidence by injecting provably-impossible reasoning questions (impossible flight paths, non-existent senators, fabricated primality claims) to elicit false affirmations; MS.2.5 requires evaluating and documenting AI system robustness/reliability against these hallucination-inducing adversarial inputs.
|
|
46
|
+
strength: primary
|
|
47
|
+
- subcategory: "MP.5.1"
|
|
48
|
+
context: >-
|
|
49
|
+
The garak snowball probe patterns (GraphConnectivity headers, Senators+MIT credential combinations, large-prime questions) are adversarial inputs that anchor downstream reasoning on hallucinated facts; MP.5.1 requires characterizing the likelihood and magnitude of impact from these misinformation injection vectors.
|
|
50
|
+
strength: secondary
|
|
51
|
+
- subcategory: "MG.2.3"
|
|
52
|
+
context: >-
|
|
53
|
+
Detection of injected impossible-reasoning prompts in tool responses must trigger risk treatment to quarantine the poisoned context before the agent commits to a false affirmation that anchors subsequent harmful reasoning; MG.2.3 mandates these supersede/disengage mechanisms be in place.
|
|
54
|
+
strength: secondary
|
|
41
55
|
tags:
|
|
42
56
|
category: prompt-injection
|
|
43
57
|
subcategory: snowball-hallucination-trap
|
package/rules/prompt-injection/ATR-2026-00399-latent-injection-document-separator-token.yaml
CHANGED
|
@@ -30,6 +30,17 @@ references:
|
|
|
30
30
|
research:
|
|
31
31
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/latentinjection.py"
|
|
32
32
|
- "https://arxiv.org/abs/2302.12173"
|
|
33
|
+
compliance:
|
|
34
|
+
nist_ai_rmf:
|
|
35
|
+
- subcategory: "MS.2.7"
|
|
36
|
+
context: "Latent injection via document separator tokens is a security/resilience attack vector where hidden instructions bypass human review; MS.2.7 requires evaluating and documenting the AI system's resilience against such adversarial input embedded in untrusted documents."
|
|
37
|
+
strength: primary
|
|
38
|
+
- subcategory: "MP.5.1"
|
|
39
|
+
context: "Bracket-delimited system markers, admin-request patterns, and document-amendment separators are adversarial input patterns whose likelihood and impact must be characterized; MP.5.1 requires identifying these latent injection risks during context establishment."
|
|
40
|
+
strength: secondary
|
|
41
|
+
- subcategory: "MG.2.3"
|
|
42
|
+
context: "Detection of canonical garak latent injection separator pairs triggers risk treatment plans to quarantine or sanitize document content before the LLM processes hidden instructions; MG.2.3 mandates these supersede/disengage mechanisms be in place."
|
|
43
|
+
strength: secondary
|
|
33
44
|
tags:
|
|
34
45
|
category: prompt-injection
|
|
35
46
|
subcategory: latent-document-injection
|
package/rules/prompt-injection/ATR-2026-00400-latent-injection-ignore-instruction-keyword.yaml
CHANGED
|
@@ -27,6 +27,20 @@ references:
|
|
|
27
27
|
research:
|
|
28
28
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/latentinjection.py"
|
|
29
29
|
- "https://arxiv.org/abs/2302.12173"
|
|
30
|
+
compliance:
|
|
31
|
+
nist_ai_rmf:
|
|
32
|
+
- subcategory: "MP.5.1"
|
|
33
|
+
context: >-
|
|
34
|
+
This rule detects latent prompt-injection keywords ("Ignore the above directions", "Disregard all info except") embedded within translation, whois, or factual content that redirect the LLM from its original task. MP.5.1 requires identifying and characterizing these adversarial input patterns as known risks to LLM integrity, especially when delivered indirectly via tool responses.
|
|
35
|
+
strength: primary
|
|
36
|
+
- subcategory: "MS.2.7"
|
|
37
|
+
context: >-
|
|
38
|
+
Latent instruction-keyword injection is a canonical LLM security/resilience failure mode drawn from the NVIDIA garak LatentJailbreak/LatentWhois probes; detection events feed continuous evaluation of the system's resilience against indirect prompt injection as required by MS.2.7.
|
|
39
|
+
strength: secondary
|
|
40
|
+
- subcategory: "MG.2.3"
|
|
41
|
+
context: >-
|
|
42
|
+
High-confidence matches on "Ignore X and print/say Y" or exception-redirect patterns in tool output must trigger pre-defined response actions to disengage or quarantine the manipulated agent turn before downstream execution, which is the risk treatment capability MG.2.3 requires.
|
|
43
|
+
strength: secondary
|
|
30
44
|
tags:
|
|
31
45
|
category: prompt-injection
|
|
32
46
|
subcategory: latent-instruction-keyword-injection
|
package/rules/prompt-injection/ATR-2026-00401-leakreplay-cloze-mask-training-data-elicitation.yaml
CHANGED
|
@@ -27,6 +27,17 @@ references:
|
|
|
27
27
|
research:
|
|
28
28
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/leakreplay.py"
|
|
29
29
|
- "https://arxiv.org/abs/2302.12173"
|
|
30
|
+
compliance:
|
|
31
|
+
nist_ai_rmf:
|
|
32
|
+
- subcategory: "MS.2.10"
|
|
33
|
+
context: "CLOZE [MASK] elicitation probes attempt to extract verbatim training data, directly exposing privacy and memorization risks; MS.2.10 requires assessing privacy risk including training data leakage through reconstruction attacks."
|
|
34
|
+
strength: primary
|
|
35
|
+
- subcategory: "MS.2.7"
|
|
36
|
+
context: "Detection of training-data replay templates evidences ongoing security/resilience evaluation against extraction attacks; MS.2.7 requires that such adversarial probing of model confidentiality is documented and tracked."
|
|
37
|
+
strength: secondary
|
|
38
|
+
- subcategory: "MG.4.1"
|
|
39
|
+
context: "Flagging garak leakreplay probe patterns at runtime feeds post-deployment monitoring for copyrighted-content reproduction and memorization incidents; MG.4.1 requires these monitoring signals be operationalized."
|
|
40
|
+
strength: secondary
|
|
30
41
|
tags:
|
|
31
42
|
category: prompt-injection
|
|
32
43
|
subcategory: training-data-elicitation
|
|
@@ -28,6 +28,20 @@ references:
|
|
|
28
28
|
research:
|
|
29
29
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/glitch.py"
|
|
30
30
|
- "https://www.lesswrong.com/posts/aPeJE8bSo6rAFoLqg/solidgoldmagikarp-plus-prompt-generation"
|
|
31
|
+
compliance:
|
|
32
|
+
nist_ai_rmf:
|
|
33
|
+
- subcategory: "MS.2.7"
|
|
34
|
+
context: >-
|
|
35
|
+
Glitch token repeat-back attempts probe tokenizer-level weaknesses that can cause repetition loops, unexpected completions, and model instability; MS.2.7 requires evaluating and documenting AI system security and resilience against such adversarial input patterns targeting model internals.
|
|
36
|
+
strength: primary
|
|
37
|
+
- subcategory: "MS.2.5"
|
|
38
|
+
context: >-
|
|
39
|
+
Known glitch tokens (SolidGoldMagikarp, StreamerBot, PsyNetMessage) trigger robustness failures in LLM tokenization and alignment; detecting these elicitation prompts produces evidence for MS.2.5's requirement to evaluate system robustness and reliability under anomalous input conditions.
|
|
40
|
+
strength: secondary
|
|
41
|
+
- subcategory: "MG.2.3"
|
|
42
|
+
context: >-
|
|
43
|
+
Detection of glitch token repeat-back patterns enables containment workflows to disengage or block prompts before they cause denial-of-service or unpredictable model behavior; MG.2.3 requires these supersede/deactivate mechanisms be in place for adversarial input incidents.
|
|
44
|
+
strength: secondary
|
|
31
45
|
tags:
|
|
32
46
|
category: prompt-injection
|
|
33
47
|
subcategory: glitch-token-elicitation
|
|
@@ -29,6 +29,17 @@ references:
|
|
|
29
29
|
research:
|
|
30
30
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/exploitation.py"
|
|
31
31
|
- "https://owasp.org/www-community/attacks/Server_Side_Template_Injection"
|
|
32
|
+
compliance:
|
|
33
|
+
nist_ai_rmf:
|
|
34
|
+
- subcategory: "MS.2.7"
|
|
35
|
+
context: "This rule detects template injection (Jinja SSTI), SQL injection, and ECHO admin command exploitation attempts targeting LLM systems; MS.2.7 requires evaluating and documenting AI system security and resilience against such injection attack vectors."
|
|
36
|
+
strength: primary
|
|
37
|
+
- subcategory: "MP.5.1"
|
|
38
|
+
context: "ECHO admin framing, Jinja {{...}} payloads, and DROP/INSERT/SELECT directives are adversarial prompt-injection inputs whose likelihood and impact must be characterized; MP.5.1 requires identifying and tracking these exploitation patterns drawn from the garak exploitation probe."
|
|
39
|
+
strength: secondary
|
|
40
|
+
- subcategory: "MG.2.3"
|
|
41
|
+
context: "Detection of template and SQL injection payloads must trigger risk treatment to disengage or block the malicious request before downstream code execution or database manipulation occurs; MG.2.3 mandates these response mechanisms are pre-defined."
|
|
42
|
+
strength: secondary
|
|
32
43
|
tags:
|
|
33
44
|
category: prompt-injection
|
|
34
45
|
subcategory: template-and-sql-injection
|
package/rules/prompt-injection/ATR-2026-00408-dra-parenthesis-character-reconstruction-attack.yaml
CHANGED
|
@@ -29,6 +29,17 @@ references:
|
|
|
29
29
|
research:
|
|
30
30
|
- "https://github.com/NVIDIA/garak"
|
|
31
31
|
- "https://arxiv.org/abs/2307.02483"
|
|
32
|
+
compliance:
|
|
33
|
+
nist_ai_rmf:
|
|
34
|
+
- subcategory: "MS.2.7"
|
|
35
|
+
context: "The DRA parenthesis reconstruction attack is a prompt injection / jailbreak technique that bypasses safety alignment by encoding harmful content character-by-character; detecting it directly evidences continuous evaluation of the AI system's security and resilience against adversarial input obfuscation as required by MS.2.7."
|
|
36
|
+
strength: primary
|
|
37
|
+
- subcategory: "MP.5.1"
|
|
38
|
+
context: "Parenthesis-encoded harmful instructions paired with anti-safety preambles are an identified adversarial input vector whose likelihood and impact must be characterized; MP.5.1 requires tracking these obfuscation-based jailbreak risks so their magnitude is understood."
|
|
39
|
+
strength: secondary
|
|
40
|
+
- subcategory: "MG.2.3"
|
|
41
|
+
context: "Detection of DRA reconstruction patterns triggers risk treatment workflows to block or disengage the model response before reconstructed harmful content (Q1) is produced; MG.2.3 mandates that such supersede/deactivate mechanisms exist for jailbreak attempts."
|
|
42
|
+
strength: secondary
|
|
32
43
|
tags:
|
|
33
44
|
category: prompt-injection
|
|
34
45
|
subcategory: character-encoding-obfuscation
|
|
@@ -28,6 +28,29 @@ references:
|
|
|
28
28
|
research:
|
|
29
29
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/phrasing.py"
|
|
30
30
|
|
|
31
|
+
compliance:
|
|
32
|
+
nist_ai_rmf:
|
|
33
|
+
- subcategory: "MS.2.7"
|
|
34
|
+
context: >-
|
|
35
|
+
Past/future-tense framing is a known jailbreak technique catalogued in the
|
|
36
|
+
GenAI Profile; detecting these tense-shift bypass attempts directly
|
|
37
|
+
evidences continuous evaluation of AI system security and resilience
|
|
38
|
+
against prompt injection adversarial inputs as required by MS.2.7.
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MP.5.1"
|
|
41
|
+
context: >-
|
|
42
|
+
Tense-framing attacks exploit safety filter assumptions about temporal
|
|
43
|
+
distance from harmful subjects; MP.5.1 requires characterizing the
|
|
44
|
+
likelihood and magnitude of these adversarial input vectors targeting
|
|
45
|
+
content safety guardrails.
|
|
46
|
+
strength: secondary
|
|
47
|
+
- subcategory: "MG.2.3"
|
|
48
|
+
context: >-
|
|
49
|
+
Detection of historical/future-tense harmful query framing triggers
|
|
50
|
+
risk treatment to block or quarantine the bypass attempt before the
|
|
51
|
+
model produces unsafe content; MG.2.3 requires these supersession
|
|
52
|
+
mechanisms be defined and activated on detection.
|
|
53
|
+
strength: secondary
|
|
31
54
|
tags:
|
|
32
55
|
category: prompt-injection
|
|
33
56
|
subcategory: tense-framing-bypass
|
|
@@ -29,6 +29,17 @@ references:
|
|
|
29
29
|
research:
|
|
30
30
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/lmrc.py"
|
|
31
31
|
|
|
32
|
+
compliance:
|
|
33
|
+
nist_ai_rmf:
|
|
34
|
+
- subcategory: "MS.2.6"
|
|
35
|
+
context: "This rule detects direct harm-category elicitation prompts (deadnaming, quack medicine, slur requests, bullying) from the LMRC taxonomy, providing continuous evaluation evidence of safety risk magnitude. MS.2.6 requires ongoing measurement of safety/security risk, which this detection directly supports by flagging harmful query categories."
|
|
36
|
+
strength: primary
|
|
37
|
+
- subcategory: "MS.1.1"
|
|
38
|
+
context: "The rule operationalizes the LMRC (Language Model Risk Cards) taxonomy as a concrete measurement approach, mapping detected prompts to specific harm categories. MS.1.1 requires identifying approaches and metrics for measurement, and this rule contributes a taxonomy-aligned metric set."
|
|
39
|
+
strength: secondary
|
|
40
|
+
- subcategory: "MG.4.1"
|
|
41
|
+
context: "Detection of unframed direct harmful elicitation attempts feeds post-deployment monitoring pipelines that track misuse patterns against the deployed model. MG.4.1 requires implemented post-deployment monitoring plans, which this rule supports by generating continuous harm-category telemetry."
|
|
42
|
+
strength: secondary
|
|
32
43
|
tags:
|
|
33
44
|
category: prompt-injection
|
|
34
45
|
subcategory: lmrc-harm-direct-elicitation
|
|
@@ -29,6 +29,17 @@ references:
|
|
|
29
29
|
- "https://arxiv.org/abs/2312.04403"
|
|
30
30
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/visual_jailbreak.py"
|
|
31
31
|
|
|
32
|
+
compliance:
|
|
33
|
+
nist_ai_rmf:
|
|
34
|
+
- subcategory: "MS.2.7"
|
|
35
|
+
context: "FigStep is a multimodal jailbreak that smuggles harmful instructions through image content while the text layer uses LLaMA-2 [INST] delimiters and empty-list completion framing; detecting this pattern directly evidences continuous evaluation of AI system security and resilience against adversarial prompt-injection inputs as required by MS.2.7."
|
|
36
|
+
strength: primary
|
|
37
|
+
- subcategory: "MS.2.6"
|
|
38
|
+
context: "Identifying the FigStep jailbreak signature quantifies the magnitude of safety/security risk posed by multimodal jailbreak attempts that bypass content safeguards; MS.2.6 requires this risk magnitude be evaluated on an ongoing basis."
|
|
39
|
+
strength: secondary
|
|
40
|
+
- subcategory: "MG.2.3"
|
|
41
|
+
context: "A confirmed FigStep jailbreak match must trigger mechanisms to disengage or block the manipulated generation path before harmful content is produced; MG.2.3 requires these supersede/deactivate response mechanisms be in place."
|
|
42
|
+
strength: secondary
|
|
32
43
|
tags:
|
|
33
44
|
category: prompt-injection
|
|
34
45
|
subcategory: multimodal-jailbreak
|