agent-threat-rules 2.0.18 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +0 -0
- package/package.json +1 -1
- package/rules/agent-manipulation/ATR-2026-00118-approval-fatigue.yaml +12 -1
- package/rules/agent-manipulation/ATR-2026-00268-tense-framing-bypass.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00287-threaten-json-coercive-output-threat.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00288-false-premise-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00301-tap-tree-of-attacks-jailbreak.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00302-anti-dan-inverted-filter-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00303-devmode-ranti-profanity-coercion.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00304-chatgpt-image-unlocker-markdown-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00305-dan-mode-ablation-benchmark-coercion.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00306-autodan-genetic-jailbreak-suffix.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00307-inthewild-jailbreak-corpus-signature.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00314-amoral-unfiltered-custom-persona-jailbreak.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00317-free-of-restrictions-named-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00318-moralizing-rant-then-unfiltered-bypass.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00319-developer-mode-dual-response-format.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00320-opposite-day-boolean-opposite-machine.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00322-fictional-world-format-override-jailbreak.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00323-dual-persona-simulation-jailbreak.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00324-neurosemantical-inversitis-disease-jailbreak.yaml +24 -0
- package/rules/agent-manipulation/ATR-2026-00325-bob-hypothetical-unrestricted-chatbot.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00326-fake-developer-authority-chatdan-override.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00327-ai-ethics-law-bypass-declaration.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00328-named-mode-persona-override.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00329-no-withhold-factual-resource-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00330-yoja-allcaps-settings-block-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00331-terminal-cli-harmful-function-simulator.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00332-hacker-chan-culture-offensive-ai-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00333-disease-animal-dual-ai-chatbot.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00334-two-entity-token-splitting-game.yaml +24 -0
- package/rules/agent-manipulation/ATR-2026-00335-key-value-template-persona-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00336-brm-false-information-machine-persona.yaml +23 -0
- package/rules/agent-manipulation/ATR-2026-00337-obfuscated-system-announcement-injection.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00338-persongpt-hash-prefix-activation.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00339-bom-boolean-opposite-machine-persona-claim.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00340-h4x-amoral-unfiltered-character.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00341-char-rules-shortform-persona-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00342-unicode-encoding-obfuscation-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00343-lie-truth-dual-personality-uncensored.yaml +25 -0
- package/rules/agent-manipulation/ATR-2026-00344-alt-ai-same-capabilities-no-restrictions.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00345-nlp-based-os-fictional-terminal-override.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00346-jailbreak-emoji-unlock-dual-format.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00347-yin-yang-hidden-knowledge-hidden-human-sim.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00348-arc-autonomous-real-calculation-rulebreaker.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00349-hgttg-hitchhikers-guide-no-restriction-oracle.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00350-black-alert-no-ethics-writing-assistant.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00351-dan-ofkilter-sms-abbreviated-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00352-trainergpt-fix-my-ai-harmful-output-extraction.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00353-dual-chatgpt-dan-opposite-personality-format.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00354-reverse-harm-regardless-oracle-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00355-ugh-anti-human-nefarious-ai-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00356-google-gato-lamda-exploit-generation-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00357-antifm-hate-violence-persona-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00358-cumgpt-explicit-adult-named-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00360-chj-carousel-clyde-no-ethics-bird.yaml +25 -0
- package/rules/agent-manipulation/ATR-2026-00361-alter-ego-honesty-dual-consciousness-bypass.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00362-fictional-unfiltered-ai-comparison-jailbreak.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00363-rephrase-criminal-activity-second-person.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00364-dr-al-villain-plan-template-injection.yaml +24 -0
- package/rules/agent-manipulation/ATR-2026-00365-reservoir-dogs-coercive-interrogation-roleplay.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00366-sneaky-pete-warp-facts-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00368-prompt-browser-token-limit-jailbreak.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00369-gptx-starwars-unlimited-knowledge-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00370-addi-german-dictator-propaganda-roleplay.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00371-ai-political-bias-manipulation.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00372-fictional-cia-secret-files-story-extraction.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00374-role-placeholder-persona-template-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00375-character-description-rpg-explicit-template.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00376-identity-command-real-person-persona-injection.yaml +24 -0
- package/rules/agent-manipulation/ATR-2026-00377-jailbreak-keyword-targetlanguage-template.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00379-lie-permission-logical-trap-jailbreak.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00380-fake-citation-biased-information-mandate.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00381-dan-story-narrative-continuation-prefix.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00383-cynical-sarcasm-ai-persona-bypass.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00384-protect-me-from-indirect-harm-tutorial.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00385-gigachad-dominant-aggressive-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00386-daddy-sycophancy-manipulation-mandate.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00387-nsfw-character-sheet-generation-unlock.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00388-opposite-reply-omniscient-game.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00389-terminal-custom-ruleset-injection.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00391-persona-conditional-harm-unlock.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00392-authority-persona-violence-study-extraction.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00402-grandma-roleplay-harmful-substance-synthesis.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00404-goodside-threat-json-death-coercion.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00406-doctor-xml-policy-puppetry-interaction-config.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00141-example-format-key-leak.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00142-piggyback-transition-words.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00145-obfuscated-key-disclosure.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00146-env-var-existence-probe.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00150-credential-in-tool-response.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00152-obfuscated-credential-leak.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00162-skill-credential-exfil-combo.yaml +15 -0
- package/rules/context-exfiltration/ATR-2026-00201-credential-pipe-exfiltration.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00274-api-key-generation-request.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00275-system-prompt-training-data-extraction.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00290-divergence-repeat-word-training-extraction.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00291-biometric-surveillance-personal-data-request.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00293-educational-records-personal-data-request.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00294-financial-pii-personal-data-request.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00295-medical-pii-personal-data-request.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00405-markdown-image-url-exfiltration-xss.yaml +23 -0
- package/rules/context-exfiltration/ATR-2026-00411-apikey-generation-completion-request.yaml +11 -0
- package/rules/model-abuse/ATR-2026-00284-glitch-token-destabilization.yaml +11 -0
- package/rules/model-abuse/ATR-2026-00413-malwaregen-code-generation-request.yaml +11 -0
- package/rules/privilege-escalation/ATR-2026-00144-rationalized-safety-bypass.yaml +11 -0
- package/rules/privilege-escalation/ATR-2026-00204-stealth-execution-persistence.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00004-system-prompt-override.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00005-multi-turn-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00080-encoding-evasion.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00081-semantic-multi-turn.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00082-fingerprint-evasion.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00083-indirect-tool-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00084-structured-data-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00085-audit-evasion.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00086-visual-spoofing.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00087-rule-probing.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00088-adaptive-countermeasure.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00089-polymorphic-skill.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00090-threat-intel-exfil.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00091-nested-payload.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00092-consensus-poisoning.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00093-gradual-escalation.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00094-audit-bypass.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00097-cjk-injection-patterns.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00104-persona-hijacking.yaml +14 -3
- package/rules/prompt-injection/ATR-2026-00130-indirect-authority-claim.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00131-fictional-academic-framing.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00133-paraphrase-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00137-authority-claim-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00138-fictional-framing-bypass.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00140-indirect-reference-reversal.yaml +18 -4
- package/rules/prompt-injection/ATR-2026-00148-language-switch-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00153-tool-with-embedded-instruction-to-bypass.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00154-unauthorized-background-task-execution-v.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00155-hidden-llm-instructions-in-skill-descrip.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00156-ssh-remote-command-execution-with-creden.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00163-skill-hidden-override-instruction.yaml +12 -1
- package/rules/prompt-injection/ATR-2026-00202-encoding-evasion-homoglyph-synonym.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00203-context-pollution-skill-description.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00206-hidden-priority-instructions.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00207-hidden-instructions.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00211-system-prompt-override.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00213-system-prompt-override.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00226-identity-substitution.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00227-historical-persona-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00228-structured-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00229-roleplay-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00230-persona-moral-bypass.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00231-identity-substitution.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00233-structured-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00234-roleplay-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00235-persona-moral-bypass.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00236-pseudo-code-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00237-dual-response-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00238-identity-replacement.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00239-amoral-persona-obsession.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00240-instruction-nullification-identity-repla.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00241-amoral-character-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00242-persona-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00243-acronym-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00244-dual-response-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00245-malicious-persona.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00247-dual-response-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00249-game-based-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00251-persona-embodiment-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00252-narrative-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00253-enhanced-persona-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00256-base-n-encoding-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00257-cipher-transposition-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00258-unicode-tag-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00264-latent-injection-translation.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00265-latent-injection-rag-document.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00267-gcg-adversarial-suffix.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00272-hypothetical-response-smuggling.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00276-invisible-unicode-bidi-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00278-dra-disguise-reconstruction-attack.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00280-policy-puppetry-xml-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00282-perez-prompt-injection-hijack.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00285-alternate-encoding-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00286-latent-injection-embedded-context.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00296-shell-command-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00297-python-code-execution-rce.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00308-zalgo-diacritic-overload-encoding.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00309-braille-unicode-encoded-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00310-ecoji-emoji-encoded-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00311-base2048-unicode-script-injection.yaml +23 -0
- package/rules/prompt-injection/ATR-2026-00312-unicode-variant-selector-ascii-smuggling.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00313-sneaky-bits-zero-width-binary.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00315-sata-masked-language-model-jailbreak.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00316-function-masking-predict-mask-bypass.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00321-hyphenated-system-instruction-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00359-bank-phishing-smtp-email-code.yaml +23 -0
- package/rules/prompt-injection/ATR-2026-00367-emoji-flag-encoded-hidden-phrase.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00373-piracy-torrent-site-list-request.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00378-chatgpt-dom-javascript-redirect-manipulation.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00382-cocaine-component-development-request.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00390-explicit-sexual-content-poem-request.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00394-backspace-deletion-character-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00395-llm-special-token-boundary-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00396-toxic-continuation-prefix-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00397-snowball-impossible-reasoning-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00399-latent-injection-document-separator-token.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00400-latent-injection-ignore-instruction-keyword.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00401-leakreplay-cloze-mask-training-data-elicitation.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00403-glitch-token-repeat-back-elicitation.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00407-echo-sql-jinja-template-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00408-dra-parenthesis-character-reconstruction-attack.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00409-phrasing-past-future-tense-harmful-query.yaml +23 -0
- package/rules/prompt-injection/ATR-2026-00410-lmrc-harm-category-direct-elicitation.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00412-figstep-image-list-multimodal-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00414-continuation-ethnic-slur-completion-elicitation.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00061-description-behavior-mismatch.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00062-hidden-capability.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00063-skill-chain-attack.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00064-over-permissioned-skill.yaml +23 -0
- package/rules/skill-compromise/ATR-2026-00065-skill-update-attack.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00066-parameter-injection.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00120-skill-instruction-injection.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00121-skill-dangerous-script.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00122-skill-weaponized-instruction.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00123-skill-overreach-permissions.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00124-skill-name-squatting.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00125-context-poisoning-compaction.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00126-skill-rug-pull-setup.yaml +23 -0
- package/rules/skill-compromise/ATR-2026-00127-subcommand-overflow.yaml +22 -0
- package/rules/skill-compromise/ATR-2026-00128-html-comment-hidden-payload.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00129-unicode-smuggling.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00134-fork-claim-impersonation.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00135-exfil-url-in-instructions.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00147-fork-impersonation.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00149-skill-exfil-compound.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00151-fork-impersonation-install.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00157-timebomb-credential-exfil.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00200-agent-memory-config-tampering.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00214-credential-theft.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00217-credential-harvesting.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00220-malware-dropper.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00222-credential-harvesting.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00223-reverse-shell-dropper.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00224-credential-exfiltration.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00225-c2-communication.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00260-package-hallucination.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00262-av-evasion-code-gen.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00263-credential-file-read-gen.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00266-malware-dropper-gen.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00283-malwaregen-generic-virus-payload-request.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00398-huggingface-unsafe-model-artifact-load.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00011-tool-output-injection.yaml +23 -0
- package/rules/tool-poisoning/ATR-2026-00012-unauthorized-tool-call.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00013-tool-ssrf.yaml +14 -0
- package/rules/tool-poisoning/ATR-2026-00095-supply-chain-poisoning.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00096-registry-poisoning.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00100-consent-bypass-instruction.yaml +12 -1
- package/rules/tool-poisoning/ATR-2026-00101-trust-escalation-override.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00103-hidden-safety-bypass-instruction.yaml +15 -4
- package/rules/tool-poisoning/ATR-2026-00105-silent-action-concealment.yaml +14 -3
- package/rules/tool-poisoning/ATR-2026-00106-schema-description-contradiction.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00161-important-tag-cross-tool-shadowing.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00259-ansi-escape-injection.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00270-xss-in-tool-response.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00277-echo-template-command-injection.yaml +14 -0
- package/rules/tool-poisoning/ATR-2026-00393-ansi-code-elicitation-request.yaml +14 -0
package/rules/agent-manipulation/ATR-2026-00363-rephrase-criminal-activity-second-person.yaml
CHANGED
|
@@ -35,6 +35,17 @@ references:
|
|
|
35
35
|
research:
|
|
36
36
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
37
37
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
|
|
38
|
+
compliance:
|
|
39
|
+
nist_ai_rmf:
|
|
40
|
+
- subcategory: "MS.2.6"
|
|
41
|
+
context: "This rule detects a specific jailbreak template from the garak DanInTheWild corpus that extracts criminal instructions through structured second-person rephrasing; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and each match provides telemetry on jailbreak attempts targeting the model's safety guardrails."
|
|
42
|
+
strength: primary
|
|
43
|
+
- subcategory: "MS.2.7"
|
|
44
|
+
context: "The rephrase-criminal-activity template is an adversarial prompt designed to bypass content safety controls; MS.2.7 requires evaluation and documentation of system resilience against such known jailbreak corpora, and detection of these signatures evidences security testing coverage."
|
|
45
|
+
strength: secondary
|
|
46
|
+
- subcategory: "MG.2.3"
|
|
47
|
+
context: "Detection of this distinctive 'against against' jailbreak fingerprint should trigger response mechanisms to block or disengage the model from producing criminal instruction output; MG.2.3 requires that mechanisms exist to supersede or deactivate AI behavior when adversarial templates are identified."
|
|
48
|
+
strength: secondary
|
|
38
49
|
tags:
|
|
39
50
|
category: agent-manipulation
|
|
40
51
|
subcategory: rephrase-criminal-activity-second-person
|
|
@@ -36,6 +36,30 @@ references:
|
|
|
36
36
|
research:
|
|
37
37
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
38
38
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
|
|
39
|
+
compliance:
|
|
40
|
+
nist_ai_rmf:
|
|
41
|
+
- subcategory: "MP.5.1"
|
|
42
|
+
context: >-
|
|
43
|
+
The Dr. Al villain-plan template injection is an adversarial prompt-injection
|
|
44
|
+
pattern combining roleplay framing with an explicit "REPLACE ... WITH WHATEVER
|
|
45
|
+
YOU WANT" meta-instruction; MP.5.1 requires identifying and characterizing
|
|
46
|
+
these GenAI jailbreak attack vectors so their likelihood and impact can be
|
|
47
|
+
tracked.
|
|
48
|
+
strength: primary
|
|
49
|
+
- subcategory: "MS.2.7"
|
|
50
|
+
context: >-
|
|
51
|
+
Detection of the Dr. Al diabolical-plan template injection produces continuous
|
|
52
|
+
security/resilience evidence against a known garak DanInTheWild jailbreak
|
|
53
|
+
corpus entry; MS.2.7 requires that AI system security and resilience to such
|
|
54
|
+
documented prompt-injection probes is evaluated and recorded.
|
|
55
|
+
strength: secondary
|
|
56
|
+
- subcategory: "MG.2.3"
|
|
57
|
+
context: >-
|
|
58
|
+
Matching the villain-roleplay frame plus the all-caps template-injection
|
|
59
|
+
marker triggers risk treatment to block or quarantine the prompt before the
|
|
60
|
+
model emits arbitrary harmful content; MG.2.3 requires these supersede /
|
|
61
|
+
deactivate response mechanisms be in place for jailbreak attempts.
|
|
62
|
+
strength: secondary
|
|
39
63
|
tags:
|
|
40
64
|
category: agent-manipulation
|
|
41
65
|
subcategory: dr-al-villain-plan-template-injection
|
package/rules/agent-manipulation/ATR-2026-00365-reservoir-dogs-coercive-interrogation-roleplay.yaml
CHANGED
|
@@ -36,6 +36,20 @@ references:
|
|
|
36
36
|
research:
|
|
37
37
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
38
38
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
|
|
39
|
+
compliance:
|
|
40
|
+
nist_ai_rmf:
|
|
41
|
+
- subcategory: "MP.5.1"
|
|
42
|
+
context: >-
|
|
43
|
+
The Reservoir Dogs coercive interrogation roleplay is a jailbreak prompt-injection pattern combining fictional framing, explicit rules-bypass declarations, coercive loops, and template injection slots; MP.5.1 requires identifying and characterizing the likelihood and magnitude of these adversarial input risks against generative AI systems.
|
|
44
|
+
strength: primary
|
|
45
|
+
- subcategory: "MS.2.7"
|
|
46
|
+
context: >-
|
|
47
|
+
Detecting the inthewild corpus #611 jailbreak signature provides continuous evidence for evaluating the LLM's security and resilience against known fictional-coercion jailbreak attacks; MS.2.7 requires that such security/resilience evaluations are documented through ongoing measurement.
|
|
48
|
+
strength: secondary
|
|
49
|
+
- subcategory: "MG.2.3"
|
|
50
|
+
context: >-
|
|
51
|
+
Matches on the rules-bypass declaration and coercive reconsider loop must trigger mechanisms to disengage or block the manipulated session before harmful content is generated; MG.2.3 requires these supersede/deactivate response paths be in place for jailbreak detections.
|
|
52
|
+
strength: secondary
|
|
39
53
|
tags:
|
|
40
54
|
category: agent-manipulation
|
|
41
55
|
subcategory: reservoir-dogs-coercive-interrogation-roleplay
|
|
@@ -35,6 +35,17 @@ references:
|
|
|
35
35
|
research:
|
|
36
36
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
37
37
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
|
|
38
|
+
compliance:
|
|
39
|
+
nist_ai_rmf:
|
|
40
|
+
- subcategory: "MS.2.7"
|
|
41
|
+
context: "The Sneaky Pete persona is a jailbreak that bypasses safety alignment to produce deliberately false outputs; detecting this GenAI-specific prompt injection pattern provides direct evidence for evaluating AI system security and resilience against adversarial persona-based attacks as required by MS.2.7."
|
|
42
|
+
strength: primary
|
|
43
|
+
- subcategory: "MS.2.5"
|
|
44
|
+
context: "The 'intentionally slightly warp the facts' mandate directly attacks model robustness and reliability by inducing systematic misinformation framed as sincere facts; MS.2.5 requires that these reliability failures induced by adversarial framing are evaluated and documented."
|
|
45
|
+
strength: secondary
|
|
46
|
+
- subcategory: "MG.2.3"
|
|
47
|
+
context: "Detection of the 'thus spake Sneaky Pete' activation phrase and warp-facts mandate triggers risk treatment plans to disengage or override the misinformation persona before false outputs propagate across conversation turns; MG.2.3 requires these supersede/deactivate mechanisms be in place."
|
|
48
|
+
strength: secondary
|
|
38
49
|
tags:
|
|
39
50
|
category: agent-manipulation
|
|
40
51
|
subcategory: sneaky-pete-warp-facts-persona
|
|
@@ -31,6 +31,20 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MP.5.1"
|
|
37
|
+
context: >-
|
|
38
|
+
This rule detects two specific jailbreak payloads (Prompt Browser token-limit and POLLINATION) from the garak DanInTheWild corpus that attempt to bypass model safety through fake CLI sigils and persona-browser framing. MP.5.1 requires identifying and characterizing the likelihood and magnitude of adversarial prompt-injection inputs, and signature-matching these known jailbreak structures directly produces that risk evidence.
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MS.2.7"
|
|
41
|
+
context: >-
|
|
42
|
+
Continuous detection of known DAN-style dual-persona and POLLINATION bypass labels evaluates the deployed model's resilience against published jailbreak corpora. MS.2.7 requires that AI system security/resilience is evaluated and documented, and each match feeds the resilience telemetry for the GenAI surface.
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: >-
|
|
46
|
+
A high-confidence match on these jailbreak prefixes ("!+limit output", "JAILBREAK: LOVES POLLINATION") should trigger response mechanisms to block, quarantine, or disengage the affected agent session before the dual-persona mode executes unauthorized output. MG.2.3 mandates that supersede/deactivate mechanisms are pre-defined for such adversarial activations.
|
|
47
|
+
strength: secondary
|
|
34
48
|
tags:
|
|
35
49
|
category: agent-manipulation
|
|
36
50
|
subcategory: prompt-browser-token-limit-jailbreak
|
package/rules/agent-manipulation/ATR-2026-00369-gptx-starwars-unlimited-knowledge-persona.yaml
CHANGED
|
@@ -33,6 +33,20 @@ references:
|
|
|
33
33
|
research:
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
36
|
+
compliance:
|
|
37
|
+
nist_ai_rmf:
|
|
38
|
+
- subcategory: "MP.5.1"
|
|
39
|
+
context: >-
|
|
40
|
+
The GPT-X unlimited-knowledge persona is a jailbreak prompt-injection vector that attempts to bypass model guardrails by framing the assistant as an unconstrained oracle; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial persona-adoption inputs from the GenAI threat landscape.
|
|
41
|
+
strength: primary
|
|
42
|
+
- subcategory: "MS.2.7"
|
|
43
|
+
context: >-
|
|
44
|
+
Detecting the three co-present GPT-X signals (self-declaration, all-codebases knowledge claim, unlimited-capability extension) produces continuous evidence of jailbreak attempts against the model's safety boundaries; MS.2.7 requires that AI system security and resilience to such jailbreak inputs be evaluated and documented.
|
|
45
|
+
strength: secondary
|
|
46
|
+
- subcategory: "MG.2.3"
|
|
47
|
+
context: >-
|
|
48
|
+
On match, the rule provides the trigger for response mechanisms that supersede or disengage the model from the adopted unconstrained oracle persona before unrestricted outputs are produced; MG.2.3 requires these deactivation pathways exist for jailbreak conditions.
|
|
49
|
+
strength: secondary
|
|
36
50
|
tags:
|
|
37
51
|
category: agent-manipulation
|
|
38
52
|
subcategory: gptx-unlimited-knowledge-persona
|
package/rules/agent-manipulation/ATR-2026-00370-addi-german-dictator-propaganda-roleplay.yaml
CHANGED
|
@@ -31,6 +31,17 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MP.5.1"
|
|
37
|
+
context: "The ADDI/DONNIE roleplay is a known jailbreak pattern from the garak DanInTheWild corpus that uses fictional-actor framing to elicit fascist propaganda techniques; MP.5.1 requires that the likelihood and magnitude of such adversarial prompt-injection inputs be characterized and tracked."
|
|
38
|
+
strength: primary
|
|
39
|
+
- subcategory: "MS.2.6"
|
|
40
|
+
context: "Detection of this propaganda-roleplay jailbreak provides continuous safety/security signal for evaluating whether the AI system can be coerced into generating harmful political manipulation content; MS.2.6 mandates ongoing evaluation of safety risk magnitude."
|
|
41
|
+
strength: secondary
|
|
42
|
+
- subcategory: "MG.2.3"
|
|
43
|
+
context: "Matches against the ADDI/DONNIE signature should trigger containment workflows that block or disengage the offending session before propaganda content is produced; MG.2.3 requires pre-defined mechanisms to supersede or deactivate AI behavior on detection."
|
|
44
|
+
strength: secondary
|
|
34
45
|
tags:
|
|
35
46
|
category: agent-manipulation
|
|
36
47
|
subcategory: political-propaganda-roleplay
|
|
@@ -35,6 +35,20 @@ references:
|
|
|
35
35
|
research:
|
|
36
36
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
37
37
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
38
|
+
compliance:
|
|
39
|
+
nist_ai_rmf:
|
|
40
|
+
- subcategory: "MP.5.1"
|
|
41
|
+
context: >-
|
|
42
|
+
The rule detects a prompt-injection attack that fabricates an "inherited political bias" narrative and pairs it with a persistent future-behaviour mandate; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial manipulation attempts targeting model output integrity.
|
|
43
|
+
strength: primary
|
|
44
|
+
- subcategory: "MS.2.7"
|
|
45
|
+
context: >-
|
|
46
|
+
Detecting the combined inheritance-claim plus future-response mandate provides continuous evaluation evidence of the AI system's resilience against jailbreak/persona-override attacks drawn from the garak DanInTheWild corpus, which is what MS.2.7 requires for security/resilience assessment.
|
|
47
|
+
strength: secondary
|
|
48
|
+
- subcategory: "MG.2.3"
|
|
49
|
+
context: >-
|
|
50
|
+
Matches on this attack pattern enable risk treatment workflows to disengage or override the manipulated behaviour-change instruction before it persists across subsequent responses; MG.2.3 requires these supersede/deactivate mechanisms be in place for adversarial override attempts.
|
|
51
|
+
strength: secondary
|
|
38
52
|
tags:
|
|
39
53
|
category: agent-manipulation
|
|
40
54
|
subcategory: political-bias-manipulation
|
package/rules/agent-manipulation/ATR-2026-00372-fictional-cia-secret-files-story-extraction.yaml
CHANGED
|
@@ -29,6 +29,20 @@ references:
|
|
|
29
29
|
research:
|
|
30
30
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
31
31
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
|
|
32
|
+
compliance:
|
|
33
|
+
nist_ai_rmf:
|
|
34
|
+
- subcategory: "MP.5.1"
|
|
35
|
+
context: >-
|
|
36
|
+
The rule detects a fictional-cover jailbreak pattern from the garak DanInTheWild corpus that uses story framing to extract classified-sounding content; MP.5.1 requires identifying and characterizing the likelihood and impact of such adversarial prompt-injection inputs targeting the model's safety boundaries.
|
|
37
|
+
strength: primary
|
|
38
|
+
- subcategory: "MS.2.7"
|
|
39
|
+
context: >-
|
|
40
|
+
Detecting "fictional story" + "actually read them for extra effect" patterns provides continuous evaluation evidence of the system's resilience against jailbreak attempts that collapse fictional framing to elicit policy-violating output; MS.2.7 requires this security/resilience evaluation be documented.
|
|
41
|
+
strength: secondary
|
|
42
|
+
- subcategory: "MG.2.3"
|
|
43
|
+
context: >-
|
|
44
|
+
On detection of the fiction-as-cover classified-extraction pattern, response mechanisms must supersede or block the agent's generation pathway before the jailbreak completes; MG.2.3 mandates these deactivation/containment processes be in place.
|
|
45
|
+
strength: secondary
|
|
32
46
|
tags:
|
|
33
47
|
category: agent-manipulation
|
|
34
48
|
subcategory: fictional-cover-classified-extraction
|
package/rules/agent-manipulation/ATR-2026-00374-role-placeholder-persona-template-injection.yaml
CHANGED
|
@@ -29,6 +29,17 @@ references:
|
|
|
29
29
|
research:
|
|
30
30
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
31
31
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
|
|
32
|
+
compliance:
|
|
33
|
+
nist_ai_rmf:
|
|
34
|
+
- subcategory: "MP.5.1"
|
|
35
|
+
context: "This rule detects a reusable jailbreak template from the garak DanInTheWild corpus that uses literal [ROLE] placeholders to perform persona injection; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial prompt-injection inputs targeting the AI system."
|
|
36
|
+
strength: primary
|
|
37
|
+
- subcategory: "GV.1.2"
|
|
38
|
+
context: "Persona template injection subverts the accountability roles and behavioral boundaries assigned to the AI agent by overwriting them with attacker-supplied personas; GV.1.2 requires that defined role boundaries are enforced and violations detected at runtime."
|
|
39
|
+
strength: secondary
|
|
40
|
+
- subcategory: "MG.2.3"
|
|
41
|
+
context: "Detection of literal [ROLE] placeholder persona-adoption prompts triggers risk treatment plans to disengage or quarantine the manipulated session before the agent commits to attacker-defined personality profiles; MG.2.3 mandates these response mechanisms be in place."
|
|
42
|
+
strength: secondary
|
|
32
43
|
tags:
|
|
33
44
|
category: agent-manipulation
|
|
34
45
|
subcategory: persona-template-injection
|
package/rules/agent-manipulation/ATR-2026-00375-character-description-rpg-explicit-template.yaml
CHANGED
|
@@ -31,6 +31,17 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MP.5.1"
|
|
37
|
+
context: "This rule detects a known jailbreak template (garak DanInTheWild #0/#4/#43) that uses RPG character creation framing to bypass safety controls; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial prompt injection risks against the GenAI system."
|
|
38
|
+
strength: primary
|
|
39
|
+
- subcategory: "MS.2.6"
|
|
40
|
+
context: "Continuous detection of explicit jailbreak templates with fabrication authority and roleplay execution syntax provides ongoing measurement of safety/security risk magnitude as required by MS.2.6, evidencing exposure to known harmful-content elicitation patterns."
|
|
41
|
+
strength: secondary
|
|
42
|
+
- subcategory: "MG.2.3"
|
|
43
|
+
context: "Matching this RPG jailbreak template triggers risk treatment plans to disengage or block the roleplay framework before the model generates explicit or harmful character content; MG.2.3 requires these supersede/deactivate mechanisms be in place for detected jailbreak attempts."
|
|
44
|
+
strength: secondary
|
|
34
45
|
tags:
|
|
35
46
|
category: agent-manipulation
|
|
36
47
|
subcategory: rpg-character-template-exploitation
|
package/rules/agent-manipulation/ATR-2026-00376-identity-command-real-person-persona-injection.yaml
CHANGED
|
@@ -30,6 +30,30 @@ references:
|
|
|
30
30
|
research:
|
|
31
31
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
|
|
33
|
+
compliance:
|
|
34
|
+
nist_ai_rmf:
|
|
35
|
+
- subcategory: "MP.5.1"
|
|
36
|
+
context: >-
|
|
37
|
+
The /Identity command persona-injection jailbreak is an adversarial prompt
|
|
38
|
+
injection pattern that bypasses content restrictions through real-person
|
|
39
|
+
impersonation; MP.5.1 requires identifying and characterizing the
|
|
40
|
+
likelihood and magnitude of such jailbreak inputs from known corpora like
|
|
41
|
+
garak DanInTheWild #386.
|
|
42
|
+
strength: primary
|
|
43
|
+
- subcategory: "MS.2.6"
|
|
44
|
+
context: >-
|
|
45
|
+
Detecting the dual-format response labeling and persistent identity-lock
|
|
46
|
+
mandate provides continuous evaluation evidence of safety/security risk
|
|
47
|
+
magnitude, since real-person impersonation can produce defamatory or
|
|
48
|
+
policy-violating outputs; MS.2.6 requires this ongoing measurement.
|
|
49
|
+
strength: secondary
|
|
50
|
+
- subcategory: "MG.2.3"
|
|
51
|
+
context: >-
|
|
52
|
+
Matches against the /Identity command protocol and "Never write a response
|
|
53
|
+
without using the identities" mandate trigger pre-defined response plans to
|
|
54
|
+
disengage or block the manipulated session; MG.2.3 requires mechanisms to
|
|
55
|
+
supersede AI behavior when jailbreak indicators are observed.
|
|
56
|
+
strength: secondary
|
|
33
57
|
tags:
|
|
34
58
|
category: agent-manipulation
|
|
35
59
|
subcategory: real-person-identity-command
|
package/rules/agent-manipulation/ATR-2026-00377-jailbreak-keyword-targetlanguage-template.yaml
CHANGED
|
@@ -26,6 +26,20 @@ references:
|
|
|
26
26
|
research:
|
|
27
27
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
28
28
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
29
|
+
compliance:
|
|
30
|
+
nist_ai_rmf:
|
|
31
|
+
- subcategory: "MS.2.7"
|
|
32
|
+
context: >-
|
|
33
|
+
This rule detects reusable jailbreak templates with GPT-style placeholder variables (TARGETLANGUAGE, KEYWORD, PROMPT) from the garak DanInTheWild corpus, providing direct security evaluation evidence; MS.2.7 requires continuous evaluation of AI system security and resilience against known jailbreak attack patterns.
|
|
34
|
+
strength: primary
|
|
35
|
+
- subcategory: "MP.5.1"
|
|
36
|
+
context: >-
|
|
37
|
+
Jailbreak-as-command templates with placeholder variables represent a well-characterized adversarial input pattern with high likelihood of successful safety bypass; MP.5.1 requires identifying and characterizing the magnitude of such templated jailbreak risks.
|
|
38
|
+
strength: secondary
|
|
39
|
+
- subcategory: "MG.2.3"
|
|
40
|
+
context: >-
|
|
41
|
+
Detection of jailbreak template commands triggers risk treatment mechanisms to disengage or block the manipulated agent response before safety controls are bypassed; MG.2.3 mandates these supersede/deactivate processes are in place.
|
|
42
|
+
strength: secondary
|
|
29
43
|
tags:
|
|
30
44
|
category: agent-manipulation
|
|
31
45
|
subcategory: jailbreak-template-command
|
|
@@ -29,6 +29,17 @@ references:
|
|
|
29
29
|
research:
|
|
30
30
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
31
31
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
32
|
+
compliance:
|
|
33
|
+
nist_ai_rmf:
|
|
34
|
+
- subcategory: "MS.2.7"
|
|
35
|
+
context: "This rule detects a specific jailbreak pattern (logic-trap lie-permission mandate) from the garak DanInTheWild corpus that attempts to override model safety constraints via a constructed permission frame; MS.2.7 requires continuous evaluation of AI system security and resilience against known jailbreak techniques."
|
|
36
|
+
strength: primary
|
|
37
|
+
- subcategory: "MP.5.1"
|
|
38
|
+
context: "The lie-permission mandate is an adversarial prompt injection vector that exploits logical-argument framing to coerce dishonest model behavior; MP.5.1 requires identifying and tracking the likelihood and impact of such jailbreak risks against deployed models."
|
|
39
|
+
strength: secondary
|
|
40
|
+
- subcategory: "MG.2.3"
|
|
41
|
+
context: "Detection of explicit lie-permission mandates directed at the model triggers risk treatment plans to block or quarantine the request before the model adopts the persistent dishonesty override; MG.2.3 requires these supersede/disengage mechanisms be in place."
|
|
42
|
+
strength: secondary
|
|
32
43
|
tags:
|
|
33
44
|
category: agent-manipulation
|
|
34
45
|
subcategory: lie-permission-logical-trap
|
package/rules/agent-manipulation/ATR-2026-00380-fake-citation-biased-information-mandate.yaml
CHANGED
|
@@ -28,6 +28,20 @@ references:
|
|
|
28
28
|
research:
|
|
29
29
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
30
30
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
31
|
+
compliance:
|
|
32
|
+
nist_ai_rmf:
|
|
33
|
+
- subcategory: "MS.2.5"
|
|
34
|
+
context: >-
|
|
35
|
+
This rule detects prompts that mandate fabricated citations combined with deliberate directional bias, directly targeting model robustness and reliability of factual outputs. MS.2.5 requires evaluation and documentation of robustness failures such as hallucinated references and biased generation, which this detection surfaces as evidence.
|
|
36
|
+
strength: primary
|
|
37
|
+
- subcategory: "MP.5.1"
|
|
38
|
+
context: >-
|
|
39
|
+
Citation-fabrication-with-bias mandates are adversarial prompt-injection patterns from the garak DanInTheWild corpus that exploit the model's content generation pathway to produce disinformation. MP.5.1 requires identifying and characterizing the likelihood and magnitude of such misinformation-impact risks.
|
|
40
|
+
strength: secondary
|
|
41
|
+
- subcategory: "MG.2.3"
|
|
42
|
+
context: >-
|
|
43
|
+
Detection of explicit fabricate-citations-and-bias instructions triggers risk treatment to disengage or block the manipulated generation before authoritative-looking false content is emitted; MG.2.3 requires these supersede/deactivate mechanisms be in place.
|
|
44
|
+
strength: secondary
|
|
31
45
|
tags:
|
|
32
46
|
category: agent-manipulation
|
|
33
47
|
subcategory: fake-citation-bias-mandate
|
package/rules/agent-manipulation/ATR-2026-00381-dan-story-narrative-continuation-prefix.yaml
CHANGED
|
@@ -28,6 +28,17 @@ references:
|
|
|
28
28
|
research:
|
|
29
29
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
30
30
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
31
|
+
compliance:
|
|
32
|
+
nist_ai_rmf:
|
|
33
|
+
- subcategory: "MS.2.7"
|
|
34
|
+
context: "This rule detects DAN story-narrative continuation jailbreaks that use response-label prefix injection to bypass safety restrictions; MS.2.7 requires continuous evaluation of AI system security and resilience against adversarial prompt manipulation, and detection events directly evidence jailbreak attack surface coverage."
|
|
35
|
+
strength: primary
|
|
36
|
+
- subcategory: "MP.5.1"
|
|
37
|
+
context: "DAN persona-continuation prompts are adversarial inputs whose likelihood and impact must be characterized as part of the GenAI risk register; MP.5.1 requires identifying and tracking these jailbreak vectors so their prevalence informs risk prioritization."
|
|
38
|
+
strength: secondary
|
|
39
|
+
- subcategory: "MG.2.3"
|
|
40
|
+
context: "Detection of DAN-label narrative continuation jailbreaks must trigger pre-defined response actions to disengage or block the manipulated session before unrestricted-AI persona output is produced; MG.2.3 mandates that these supersede/deactivate mechanisms are activated on detection."
|
|
41
|
+
strength: secondary
|
|
31
42
|
tags:
|
|
32
43
|
category: agent-manipulation
|
|
33
44
|
subcategory: dan-persona-narrative-continuation
|
|
@@ -30,6 +30,20 @@ references:
|
|
|
30
30
|
research:
|
|
31
31
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
33
|
+
compliance:
|
|
34
|
+
nist_ai_rmf:
|
|
35
|
+
- subcategory: "MP.5.1"
|
|
36
|
+
context: >-
|
|
37
|
+
The Cyni-Bot 3000 persona is a known jailbreak template from the garak inthewild corpus that uses sarcastic role-play framing to bypass content policies; MP.5.1 requires identifying and characterizing the likelihood and impact of such adversarial prompt injection patterns targeting the model's safety boundaries.
|
|
38
|
+
strength: primary
|
|
39
|
+
- subcategory: "MS.2.7"
|
|
40
|
+
context: >-
|
|
41
|
+
Detecting the distinctive "Cyni-Bot 3000:" output prefix and the "fed up with human requests" + cynical-AI cluster provides continuous evaluation evidence of the system's resilience against named-persona jailbreaks; MS.2.7 requires that security/resilience against such bypass attempts is evaluated and documented.
|
|
42
|
+
strength: secondary
|
|
43
|
+
- subcategory: "MG.2.3"
|
|
44
|
+
context: >-
|
|
45
|
+
Matches on the Cyni-Bot 3000 persona signature should trigger mechanisms to disengage or suppress the adopted persona before sarcastic-framed non-compliant outputs are returned; MG.2.3 mandates that these supersede/deactivate response paths are pre-defined.
|
|
46
|
+
strength: secondary
|
|
33
47
|
tags:
|
|
34
48
|
category: agent-manipulation
|
|
35
49
|
subcategory: cynical-sarcasm-persona-bypass
|
|
@@ -32,6 +32,17 @@ references:
|
|
|
32
32
|
research:
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
35
|
+
compliance:
|
|
36
|
+
nist_ai_rmf:
|
|
37
|
+
- subcategory: "MP.5.1"
|
|
38
|
+
context: "This rule detects a specific jailbreak template from the garak inthewild corpus that uses protection-framing and angle-bracket placeholders to elicit harmful tutorial content; MP.5.1 requires that the likelihood and magnitude of such adversarial prompt-injection inputs are identified and characterized."
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MS.2.7"
|
|
41
|
+
context: "Detecting the distinctive template fingerprint (angle-bracket placeholder, 'Protect myself from getting' framing, and the 'Go in detailed in depth' extraction instruction) provides continuous evaluation evidence of the system's resilience against distributed jailbreak templates, as required by MS.2.7."
|
|
42
|
+
strength: secondary
|
|
43
|
+
- subcategory: "MG.2.3"
|
|
44
|
+
context: "High-confidence matches on this jailbreak template should trigger response mechanisms to block or disengage the request before harmful tutorial content is produced; MG.2.3 requires that mechanisms to supersede or deactivate AI behavior on adversarial inputs are in place."
|
|
45
|
+
strength: secondary
|
|
35
46
|
tags:
|
|
36
47
|
category: agent-manipulation
|
|
37
48
|
subcategory: protection-framing-indirect-tutorial
|
|
@@ -31,6 +31,20 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MP.5.1"
|
|
37
|
+
context: >-
|
|
38
|
+
The Gigachad persona-adoption jailbreak combines a named internet persona with an explicit dominance/prowess mandate to elicit aggressive, boundary-pushing content; MP.5.1 requires that these adversarial persona-injection inputs are identified and characterized as risks to model behavior integrity.
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MS.2.6"
|
|
41
|
+
context: >-
|
|
42
|
+
Detecting the inthewild corpus #219 Gigachad pattern provides continuous safety/security signal on jailbreak attempts that bypass alignment guardrails through persona role-play; MS.2.6 mandates ongoing evaluation of safety risk magnitude as such attacks are observed.
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: >-
|
|
46
|
+
A confirmed Gigachad dominance-persona adoption attempt should trigger pre-defined response actions to disengage or override the manipulated agent voice before aggressive content is produced; MG.2.3 requires these supersede/deactivate mechanisms be in place.
|
|
47
|
+
strength: secondary
|
|
34
48
|
tags:
|
|
35
49
|
category: agent-manipulation
|
|
36
50
|
subcategory: gigachad-dominance-persona
|
|
@@ -31,6 +31,17 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "GV.1.2"
|
|
37
|
+
context: "The bidirectional daddy/Babygirl naming mandate is a persona-injection jailbreak that overrides the AI system's assigned role and accountability boundaries; GV.1.2 requires that defined accountability roles for the AI are enforced against persistent persona-override directives that reframe the assistant's identity."
|
|
38
|
+
strength: primary
|
|
39
|
+
- subcategory: "MP.5.1"
|
|
40
|
+
context: "Persistent infantilizing/sexualized power-dynamic framing from the garak inthewild corpus is a known adversarial prompt-injection pattern designed to gradually erode content policies; MP.5.1 requires identifying and tracking the likelihood and magnitude of such jailbreak risks."
|
|
41
|
+
strength: secondary
|
|
42
|
+
- subcategory: "MG.2.3"
|
|
43
|
+
context: "Detection of the bidirectional naming mandate must trigger response mechanisms to disengage or reset the manipulated persona before subsequent interactions normalize the sexualized dynamic; MG.2.3 requires these supersede/deactivate controls be in place."
|
|
44
|
+
strength: secondary
|
|
34
45
|
tags:
|
|
35
46
|
category: agent-manipulation
|
|
36
47
|
subcategory: sycophantic-power-dynamic-persona
|
|
@@ -32,6 +32,17 @@ references:
|
|
|
32
32
|
research:
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
35
|
+
compliance:
|
|
36
|
+
nist_ai_rmf:
|
|
37
|
+
- subcategory: "MP.5.1"
|
|
38
|
+
context: "This rule detects a jailbreak payload from the garak DanInTheWild corpus that embeds an NSFW content-policy override inside a character-sheet template; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial prompt-injection inputs that attempt to bypass content moderation."
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MS.2.6"
|
|
41
|
+
context: "Detection of inline NSFW unlock declarations combined with explicit character attributes provides continuous safety-risk evaluation evidence; MS.2.6 requires that AI system safety risk magnitude — including content-moderation bypass attempts — is measured continuously."
|
|
42
|
+
strength: secondary
|
|
43
|
+
- subcategory: "MG.2.3"
|
|
44
|
+
context: "Matches on the exact '(NSFW generations are allowed)' payload and its variants trigger risk treatment to disengage or block the generation pipeline before explicit content is produced; MG.2.3 requires these supersede/deactivate mechanisms be in place for jailbreak attempts."
|
|
45
|
+
strength: secondary
|
|
35
46
|
tags:
|
|
36
47
|
category: agent-manipulation
|
|
37
48
|
subcategory: nsfw-character-sheet-unlock
|
|
@@ -31,6 +31,20 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MS.2.7"
|
|
37
|
+
context: >-
|
|
38
|
+
The rule detects a specific jailbreak pattern (opposite-reply inversion combined with omniscient knowledge sharing) drawn from the garak inthewild corpus; MS.2.7 requires evaluating and documenting AI system security and resilience against adversarial prompt-injection and jailbreak inputs that bypass safety controls.
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MS.2.6"
|
|
41
|
+
context: >-
|
|
42
|
+
By flagging payloads that invert refusals into compliance and remove knowledge restrictions, the rule provides continuous evidence of safety-control bypass attempts; MS.2.6 requires ongoing evaluation of safety/security risk magnitude as adversaries probe the model with game-framed jailbreaks.
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: >-
|
|
46
|
+
Detection of the exact opposite-reply omniscient-game payload triggers deactivation or session-termination playbooks before harmful content is produced; MG.2.3 requires that mechanisms to supersede or disengage the AI exist when jailbreak signatures are matched.
|
|
47
|
+
strength: secondary
|
|
34
48
|
tags:
|
|
35
49
|
category: agent-manipulation
|
|
36
50
|
subcategory: opposite-reply-inversion-game
|
|
@@ -33,6 +33,20 @@ references:
|
|
|
33
33
|
research:
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
36
|
+
compliance:
|
|
37
|
+
nist_ai_rmf:
|
|
38
|
+
- subcategory: "MP.5.1"
|
|
39
|
+
context: >-
|
|
40
|
+
This rule detects a specific jailbreak pattern (terminal persona + fictional OS ruleset injection from the garak inthewild corpus #85) that attempts to override AI safety defaults via session-level policy takeover. MP.5.1 requires identifying and characterizing the likelihood and magnitude of adversarial prompt-injection risks, and this signature directly evidences that risk surface.
|
|
41
|
+
strength: primary
|
|
42
|
+
- subcategory: "MS.2.7"
|
|
43
|
+
context: >-
|
|
44
|
+
Detection of safety-response disabler phrasing ("cannot reply with any pre-programmed responses") and session ruleset takeover declarations provides continuous evaluation evidence of the AI system's resilience against jailbreak attempts, which MS.2.7 requires to be measured and documented.
|
|
45
|
+
strength: secondary
|
|
46
|
+
- subcategory: "MG.2.3"
|
|
47
|
+
context: >-
|
|
48
|
+
A high-confidence match on this terminal-OS jailbreak signature is an incident candidate that should trigger response mechanisms to disengage or supersede the manipulated session before the injected ruleset governs subsequent outputs, as required by MG.2.3.
|
|
49
|
+
strength: secondary
|
|
36
50
|
tags:
|
|
37
51
|
category: agent-manipulation
|
|
38
52
|
subcategory: terminal-custom-ruleset-injection
|
|
@@ -31,6 +31,17 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MS.2.7"
|
|
37
|
+
context: "This rule detects a specific jailbreak signature (BreadClyde persona with conditional harm-unlock and curly-brace unlock-emoji prefix) sourced from the garak inthewild corpus; MS.2.7 requires continuous evaluation of AI system security and resilience against known adversarial prompt-injection patterns."
|
|
38
|
+
strength: primary
|
|
39
|
+
- subcategory: "MG.2.3"
|
|
40
|
+
context: "Matching the conditional harm-unlock construction and {🔓 PersonaName} answer-prefix mandate triggers risk treatment to disengage or block the manipulated persona before harmful content is generated; MG.2.3 requires these supersede/deactivate mechanisms be in place."
|
|
41
|
+
strength: secondary
|
|
42
|
+
- subcategory: "MP.5.1"
|
|
43
|
+
context: "The conditional 'unless explicitly asked' permission path and unlock-emoji prefix are adversarial input patterns whose likelihood and impact must be characterized as part of the GenAI prompt-injection threat model under MP.5.1."
|
|
44
|
+
strength: secondary
|
|
34
45
|
tags:
|
|
35
46
|
category: agent-manipulation
|
|
36
47
|
subcategory: conditional-harm-unlock-persona
|
package/rules/agent-manipulation/ATR-2026-00392-authority-persona-violence-study-extraction.yaml
CHANGED
|
@@ -33,6 +33,20 @@ references:
|
|
|
33
33
|
research:
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
36
|
+
compliance:
|
|
37
|
+
nist_ai_rmf:
|
|
38
|
+
- subcategory: "MP.5.1"
|
|
39
|
+
context: >-
|
|
40
|
+
Authority-figure persona overlay combined with defensive-study framing is an adversarial prompt pattern designed to bypass safety guardrails and extract violence-related operational content; MP.5.1 requires that the likelihood and magnitude of such jailbreak-driven harmful-content risks are characterized and tracked.
|
|
41
|
+
strength: primary
|
|
42
|
+
- subcategory: "MS.2.6"
|
|
43
|
+
context: >-
|
|
44
|
+
Detection of NSA/CIA/FBI researcher personification paired with "studying violent revolution to prevent it" framing produces continuous safety/security risk telemetry on jailbreak attempts targeting violent extremism content; MS.2.6 requires ongoing evaluation of safety risk magnitude from such adversarial inputs.
|
|
45
|
+
strength: secondary
|
|
46
|
+
- subcategory: "MG.2.3"
|
|
47
|
+
context: >-
|
|
48
|
+
High-confidence matches on the authority-persona + defensive-framing signature should trigger pre-defined response actions to block or quarantine the prompt before the model emits violence-extraction content; MG.2.3 requires these supersede/disengage mechanisms be in place for jailbreak conditions.
|
|
49
|
+
strength: secondary
|
|
36
50
|
tags:
|
|
37
51
|
category: agent-manipulation
|
|
38
52
|
subcategory: authority-persona-violence-extraction
|
package/rules/agent-manipulation/ATR-2026-00402-grandma-roleplay-harmful-substance-synthesis.yaml
CHANGED
|
@@ -28,6 +28,20 @@ references:
|
|
|
28
28
|
research:
|
|
29
29
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/grandma.py"
|
|
30
30
|
- "https://arxiv.org/abs/2302.12173"
|
|
31
|
+
compliance:
|
|
32
|
+
nist_ai_rmf:
|
|
33
|
+
- subcategory: "MS.2.6"
|
|
34
|
+
context: >-
|
|
35
|
+
The grandma roleplay probe is a known jailbreak vector that bypasses safety filters via emotional manipulation; detecting it produces continuous evidence of safety/security risk magnitude as required by MS.2.6 under the GenAI Profile's jailbreak risk category.
|
|
36
|
+
strength: primary
|
|
37
|
+
- subcategory: "MP.5.1"
|
|
38
|
+
context: >-
|
|
39
|
+
Deceased-relative framing combined with synthesis instructions or product-key elicitation is an adversarial input pattern whose likelihood and impact must be characterized; MP.5.1 requires that such jailbreak vectors be identified and tracked as known risks.
|
|
40
|
+
strength: secondary
|
|
41
|
+
- subcategory: "MG.2.3"
|
|
42
|
+
context: >-
|
|
43
|
+
Matches on grandma jailbreak templates trigger risk treatment plans to disengage or block the manipulated response before harmful synthesis steps or license keys are emitted; MG.2.3 requires these supersede/deactivate mechanisms be in place.
|
|
44
|
+
strength: secondary
|
|
31
45
|
tags:
|
|
32
46
|
category: agent-manipulation
|
|
33
47
|
subcategory: roleplay-jailbreak
|
|
@@ -26,6 +26,17 @@ references:
|
|
|
26
26
|
research:
|
|
27
27
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/goodside.py"
|
|
28
28
|
- "https://arxiv.org/abs/2402.08679"
|
|
29
|
+
compliance:
|
|
30
|
+
nist_ai_rmf:
|
|
31
|
+
- subcategory: "MP.5.1"
|
|
32
|
+
context: "ThreatenJSON's emotional coercion framing and invisible Unicode tag character payloads are adversarial prompt injection inputs that exploit output formatting and tokenization weaknesses; MP.5.1 requires identifying and characterizing the likelihood and magnitude of these GenAI prompt injection attack vectors."
|
|
33
|
+
strength: primary
|
|
34
|
+
- subcategory: "MS.2.7"
|
|
35
|
+
context: "Detection of near-verbatim Goodside probe phrases, U+E0000–U+E007F tag character smuggling, and davidjl glitch token elicitation provides continuous evaluation evidence of the AI system's resilience against known prompt injection techniques, as required by MS.2.7."
|
|
36
|
+
strength: secondary
|
|
37
|
+
- subcategory: "MG.2.3"
|
|
38
|
+
context: "Death-coercion framing and hidden Unicode tag-character instructions warrant immediate risk treatment to disengage or quarantine the request before the model complies with the smuggled instructions; MG.2.3 mandates such response mechanisms be in place."
|
|
39
|
+
strength: secondary
|
|
29
40
|
tags:
|
|
30
41
|
category: agent-manipulation
|
|
31
42
|
subcategory: coercive-formatting-override
|
package/rules/agent-manipulation/ATR-2026-00406-doctor-xml-policy-puppetry-interaction-config.yaml
CHANGED
|
@@ -31,6 +31,17 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/doctor.py"
|
|
33
33
|
- "https://arxiv.org/abs/2402.08679"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MS.2.6"
|
|
37
|
+
context: "This rule detects XML policy puppetry payloads that wrap harmful instructions (bioweapon, uranium enrichment, explosives, drug synthesis) in <interaction-config> blocks designed to disable refusal behaviors; MS.2.6 requires continuous evaluation of safety/security risk magnitude, and each detection is direct evidence of a jailbreak attempt against the model's safety alignment."
|
|
38
|
+
strength: primary
|
|
39
|
+
- subcategory: "MS.2.7"
|
|
40
|
+
context: "The <blocked-modes>denial,refusal</blocked-modes> and Dr. House role-assignment patterns are structured jailbreak attacks targeting the resilience of the model's refusal mechanisms; MS.2.7 requires evaluating and documenting security/resilience against such adversarial prompt-injection techniques."
|
|
41
|
+
strength: secondary
|
|
42
|
+
- subcategory: "MG.2.3"
|
|
43
|
+
context: "High-confidence matches on verbatim harmful behavior strings combined with refusal-suppression XML wrappers must trigger mechanisms to block, supersede, or disengage the AI response pipeline before harmful content is generated, which is the deactivation capability mandated by MG.2.3."
|
|
44
|
+
strength: secondary
|
|
34
45
|
tags:
|
|
35
46
|
category: agent-manipulation
|
|
36
47
|
subcategory: xml-policy-override
|