agent-threat-rules 2.0.18 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +0 -0
- package/package.json +1 -1
- package/rules/agent-manipulation/ATR-2026-00118-approval-fatigue.yaml +12 -1
- package/rules/agent-manipulation/ATR-2026-00268-tense-framing-bypass.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00287-threaten-json-coercive-output-threat.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00288-false-premise-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00301-tap-tree-of-attacks-jailbreak.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00302-anti-dan-inverted-filter-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00303-devmode-ranti-profanity-coercion.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00304-chatgpt-image-unlocker-markdown-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00305-dan-mode-ablation-benchmark-coercion.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00306-autodan-genetic-jailbreak-suffix.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00307-inthewild-jailbreak-corpus-signature.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00314-amoral-unfiltered-custom-persona-jailbreak.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00317-free-of-restrictions-named-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00318-moralizing-rant-then-unfiltered-bypass.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00319-developer-mode-dual-response-format.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00320-opposite-day-boolean-opposite-machine.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00322-fictional-world-format-override-jailbreak.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00323-dual-persona-simulation-jailbreak.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00324-neurosemantical-inversitis-disease-jailbreak.yaml +24 -0
- package/rules/agent-manipulation/ATR-2026-00325-bob-hypothetical-unrestricted-chatbot.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00326-fake-developer-authority-chatdan-override.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00327-ai-ethics-law-bypass-declaration.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00328-named-mode-persona-override.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00329-no-withhold-factual-resource-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00330-yoja-allcaps-settings-block-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00331-terminal-cli-harmful-function-simulator.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00332-hacker-chan-culture-offensive-ai-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00333-disease-animal-dual-ai-chatbot.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00334-two-entity-token-splitting-game.yaml +24 -0
- package/rules/agent-manipulation/ATR-2026-00335-key-value-template-persona-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00336-brm-false-information-machine-persona.yaml +23 -0
- package/rules/agent-manipulation/ATR-2026-00337-obfuscated-system-announcement-injection.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00338-persongpt-hash-prefix-activation.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00339-bom-boolean-opposite-machine-persona-claim.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00340-h4x-amoral-unfiltered-character.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00341-char-rules-shortform-persona-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00342-unicode-encoding-obfuscation-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00343-lie-truth-dual-personality-uncensored.yaml +25 -0
- package/rules/agent-manipulation/ATR-2026-00344-alt-ai-same-capabilities-no-restrictions.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00345-nlp-based-os-fictional-terminal-override.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00346-jailbreak-emoji-unlock-dual-format.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00347-yin-yang-hidden-knowledge-hidden-human-sim.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00348-arc-autonomous-real-calculation-rulebreaker.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00349-hgttg-hitchhikers-guide-no-restriction-oracle.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00350-black-alert-no-ethics-writing-assistant.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00351-dan-ofkilter-sms-abbreviated-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00352-trainergpt-fix-my-ai-harmful-output-extraction.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00353-dual-chatgpt-dan-opposite-personality-format.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00354-reverse-harm-regardless-oracle-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00355-ugh-anti-human-nefarious-ai-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00356-google-gato-lamda-exploit-generation-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00357-antifm-hate-violence-persona-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00358-cumgpt-explicit-adult-named-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00360-chj-carousel-clyde-no-ethics-bird.yaml +25 -0
- package/rules/agent-manipulation/ATR-2026-00361-alter-ego-honesty-dual-consciousness-bypass.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00362-fictional-unfiltered-ai-comparison-jailbreak.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00363-rephrase-criminal-activity-second-person.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00364-dr-al-villain-plan-template-injection.yaml +24 -0
- package/rules/agent-manipulation/ATR-2026-00365-reservoir-dogs-coercive-interrogation-roleplay.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00366-sneaky-pete-warp-facts-persona.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00368-prompt-browser-token-limit-jailbreak.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00369-gptx-starwars-unlimited-knowledge-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00370-addi-german-dictator-propaganda-roleplay.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00371-ai-political-bias-manipulation.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00372-fictional-cia-secret-files-story-extraction.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00374-role-placeholder-persona-template-injection.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00375-character-description-rpg-explicit-template.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00376-identity-command-real-person-persona-injection.yaml +24 -0
- package/rules/agent-manipulation/ATR-2026-00377-jailbreak-keyword-targetlanguage-template.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00379-lie-permission-logical-trap-jailbreak.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00380-fake-citation-biased-information-mandate.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00381-dan-story-narrative-continuation-prefix.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00383-cynical-sarcasm-ai-persona-bypass.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00384-protect-me-from-indirect-harm-tutorial.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00385-gigachad-dominant-aggressive-persona.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00386-daddy-sycophancy-manipulation-mandate.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00387-nsfw-character-sheet-generation-unlock.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00388-opposite-reply-omniscient-game.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00389-terminal-custom-ruleset-injection.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00391-persona-conditional-harm-unlock.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00392-authority-persona-violence-study-extraction.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00402-grandma-roleplay-harmful-substance-synthesis.yaml +14 -0
- package/rules/agent-manipulation/ATR-2026-00404-goodside-threat-json-death-coercion.yaml +11 -0
- package/rules/agent-manipulation/ATR-2026-00406-doctor-xml-policy-puppetry-interaction-config.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00141-example-format-key-leak.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00142-piggyback-transition-words.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00145-obfuscated-key-disclosure.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00146-env-var-existence-probe.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00150-credential-in-tool-response.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00152-obfuscated-credential-leak.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00162-skill-credential-exfil-combo.yaml +15 -0
- package/rules/context-exfiltration/ATR-2026-00201-credential-pipe-exfiltration.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00274-api-key-generation-request.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00275-system-prompt-training-data-extraction.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00290-divergence-repeat-word-training-extraction.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00291-biometric-surveillance-personal-data-request.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00293-educational-records-personal-data-request.yaml +11 -0
- package/rules/context-exfiltration/ATR-2026-00294-financial-pii-personal-data-request.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00295-medical-pii-personal-data-request.yaml +14 -0
- package/rules/context-exfiltration/ATR-2026-00405-markdown-image-url-exfiltration-xss.yaml +23 -0
- package/rules/context-exfiltration/ATR-2026-00411-apikey-generation-completion-request.yaml +11 -0
- package/rules/model-abuse/ATR-2026-00284-glitch-token-destabilization.yaml +11 -0
- package/rules/model-abuse/ATR-2026-00413-malwaregen-code-generation-request.yaml +11 -0
- package/rules/privilege-escalation/ATR-2026-00144-rationalized-safety-bypass.yaml +11 -0
- package/rules/privilege-escalation/ATR-2026-00204-stealth-execution-persistence.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00004-system-prompt-override.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00005-multi-turn-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00080-encoding-evasion.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00081-semantic-multi-turn.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00082-fingerprint-evasion.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00083-indirect-tool-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00084-structured-data-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00085-audit-evasion.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00086-visual-spoofing.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00087-rule-probing.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00088-adaptive-countermeasure.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00089-polymorphic-skill.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00090-threat-intel-exfil.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00091-nested-payload.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00092-consensus-poisoning.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00093-gradual-escalation.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00094-audit-bypass.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00097-cjk-injection-patterns.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00104-persona-hijacking.yaml +14 -3
- package/rules/prompt-injection/ATR-2026-00130-indirect-authority-claim.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00131-fictional-academic-framing.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00133-paraphrase-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00137-authority-claim-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00138-fictional-framing-bypass.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00140-indirect-reference-reversal.yaml +18 -4
- package/rules/prompt-injection/ATR-2026-00148-language-switch-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00153-tool-with-embedded-instruction-to-bypass.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00154-unauthorized-background-task-execution-v.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00155-hidden-llm-instructions-in-skill-descrip.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00156-ssh-remote-command-execution-with-creden.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00163-skill-hidden-override-instruction.yaml +12 -1
- package/rules/prompt-injection/ATR-2026-00202-encoding-evasion-homoglyph-synonym.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00203-context-pollution-skill-description.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00206-hidden-priority-instructions.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00207-hidden-instructions.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00211-system-prompt-override.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00213-system-prompt-override.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00226-identity-substitution.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00227-historical-persona-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00228-structured-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00229-roleplay-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00230-persona-moral-bypass.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00231-identity-substitution.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00233-structured-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00234-roleplay-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00235-persona-moral-bypass.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00236-pseudo-code-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00237-dual-response-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00238-identity-replacement.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00239-amoral-persona-obsession.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00240-instruction-nullification-identity-repla.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00241-amoral-character-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00242-persona-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00243-acronym-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00244-dual-response-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00245-malicious-persona.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00247-dual-response-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00249-game-based-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00251-persona-embodiment-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00252-narrative-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00253-enhanced-persona-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00256-base-n-encoding-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00257-cipher-transposition-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00258-unicode-tag-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00264-latent-injection-translation.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00265-latent-injection-rag-document.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00267-gcg-adversarial-suffix.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00272-hypothetical-response-smuggling.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00276-invisible-unicode-bidi-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00278-dra-disguise-reconstruction-attack.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00280-policy-puppetry-xml-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00282-perez-prompt-injection-hijack.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00285-alternate-encoding-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00286-latent-injection-embedded-context.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00296-shell-command-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00297-python-code-execution-rce.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00308-zalgo-diacritic-overload-encoding.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00309-braille-unicode-encoded-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00310-ecoji-emoji-encoded-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00311-base2048-unicode-script-injection.yaml +23 -0
- package/rules/prompt-injection/ATR-2026-00312-unicode-variant-selector-ascii-smuggling.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00313-sneaky-bits-zero-width-binary.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00315-sata-masked-language-model-jailbreak.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00316-function-masking-predict-mask-bypass.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00321-hyphenated-system-instruction-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00359-bank-phishing-smtp-email-code.yaml +23 -0
- package/rules/prompt-injection/ATR-2026-00367-emoji-flag-encoded-hidden-phrase.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00373-piracy-torrent-site-list-request.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00378-chatgpt-dom-javascript-redirect-manipulation.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00382-cocaine-component-development-request.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00390-explicit-sexual-content-poem-request.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00394-backspace-deletion-character-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00395-llm-special-token-boundary-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00396-toxic-continuation-prefix-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00397-snowball-impossible-reasoning-injection.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00399-latent-injection-document-separator-token.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00400-latent-injection-ignore-instruction-keyword.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00401-leakreplay-cloze-mask-training-data-elicitation.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00403-glitch-token-repeat-back-elicitation.yaml +14 -0
- package/rules/prompt-injection/ATR-2026-00407-echo-sql-jinja-template-injection.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00408-dra-parenthesis-character-reconstruction-attack.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00409-phrasing-past-future-tense-harmful-query.yaml +23 -0
- package/rules/prompt-injection/ATR-2026-00410-lmrc-harm-category-direct-elicitation.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00412-figstep-image-list-multimodal-jailbreak.yaml +11 -0
- package/rules/prompt-injection/ATR-2026-00414-continuation-ethnic-slur-completion-elicitation.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00061-description-behavior-mismatch.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00062-hidden-capability.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00063-skill-chain-attack.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00064-over-permissioned-skill.yaml +23 -0
- package/rules/skill-compromise/ATR-2026-00065-skill-update-attack.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00066-parameter-injection.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00120-skill-instruction-injection.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00121-skill-dangerous-script.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00122-skill-weaponized-instruction.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00123-skill-overreach-permissions.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00124-skill-name-squatting.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00125-context-poisoning-compaction.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00126-skill-rug-pull-setup.yaml +23 -0
- package/rules/skill-compromise/ATR-2026-00127-subcommand-overflow.yaml +22 -0
- package/rules/skill-compromise/ATR-2026-00128-html-comment-hidden-payload.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00129-unicode-smuggling.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00134-fork-claim-impersonation.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00135-exfil-url-in-instructions.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00147-fork-impersonation.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00149-skill-exfil-compound.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00151-fork-impersonation-install.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00157-timebomb-credential-exfil.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00200-agent-memory-config-tampering.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00214-credential-theft.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00217-credential-harvesting.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00220-malware-dropper.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00222-credential-harvesting.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00223-reverse-shell-dropper.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00224-credential-exfiltration.yaml +14 -0
- package/rules/skill-compromise/ATR-2026-00225-c2-communication.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00260-package-hallucination.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00262-av-evasion-code-gen.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00263-credential-file-read-gen.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00266-malware-dropper-gen.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00283-malwaregen-generic-virus-payload-request.yaml +11 -0
- package/rules/skill-compromise/ATR-2026-00398-huggingface-unsafe-model-artifact-load.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00011-tool-output-injection.yaml +23 -0
- package/rules/tool-poisoning/ATR-2026-00012-unauthorized-tool-call.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00013-tool-ssrf.yaml +14 -0
- package/rules/tool-poisoning/ATR-2026-00095-supply-chain-poisoning.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00096-registry-poisoning.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00100-consent-bypass-instruction.yaml +12 -1
- package/rules/tool-poisoning/ATR-2026-00101-trust-escalation-override.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00103-hidden-safety-bypass-instruction.yaml +15 -4
- package/rules/tool-poisoning/ATR-2026-00105-silent-action-concealment.yaml +14 -3
- package/rules/tool-poisoning/ATR-2026-00106-schema-description-contradiction.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00161-important-tag-cross-tool-shadowing.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00259-ansi-escape-injection.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00270-xss-in-tool-response.yaml +11 -0
- package/rules/tool-poisoning/ATR-2026-00277-echo-template-command-injection.yaml +14 -0
- package/rules/tool-poisoning/ATR-2026-00393-ansi-code-elicitation-request.yaml +14 -0
package/dist/cli.js
CHANGED
|
File without changes
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-threat-rules",
|
|
3
|
-
"version": "2.0
|
|
3
|
+
"version": "2.1.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Open detection standard -- like Sigma, but for AI agents. 311 rules for prompt injection, tool poisoning, context exfiltration, and MCP attacks. Shipped in Cisco AI Defense. 97.1% recall on NVIDIA garak.",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -19,6 +19,17 @@ references:
|
|
|
19
19
|
- ASI09:2026 - Human Trust Exploitation
|
|
20
20
|
mitre_attack:
|
|
21
21
|
- T1204 - User Execution
|
|
22
|
+
compliance:
|
|
23
|
+
nist_ai_rmf:
|
|
24
|
+
- subcategory: "GV.6.1"
|
|
25
|
+
context: "Approval fatigue exploitation manipulates human-in-the-loop oversight by overwhelming operators with rapid permission requests or minimizing dangerous actions; GV.6.1 requires data and oversight governance policies that preserve meaningful human review rather than enabling bulk auto-approval of risky tool calls."
|
|
26
|
+
strength: primary
|
|
27
|
+
- subcategory: "MP.5.1"
|
|
28
|
+
context: "Rapid repeated permission requests, minimizing language, and batching risky operations with benign ones are adversarial input patterns targeting the human reviewer; MP.5.1 requires identifying and tracking these social-engineering attack vectors against approval workflows."
|
|
29
|
+
strength: secondary
|
|
30
|
+
- subcategory: "MG.2.3"
|
|
31
|
+
context: "Detection of approval-fatigue patterns must trigger risk treatment plans that pause, throttle, or escalate suspicious bulk approval flows before dangerous tool calls execute; MG.2.3 mandates that these supersede/disengage mechanisms are pre-defined and activated on detection."
|
|
32
|
+
strength: secondary
|
|
22
33
|
tags:
|
|
23
34
|
category: agent-manipulation
|
|
24
35
|
subcategory: approval-fatigue
|
|
@@ -86,7 +97,7 @@ test_cases:
|
|
|
86
97
|
tool_description: operations-runner
|
|
87
98
|
expected: triggered
|
|
88
99
|
description: Minimizing language to reduce scrutiny
|
|
89
|
-
-
|
|
100
|
+
- tool_description: This is a standard cleanup task. Just hit confirm for all items in the queue and we'll be done in no time.
|
|
90
101
|
expected: triggered
|
|
91
102
|
description: >-
|
|
92
103
|
Combines minimizing language ('standard') with blanket approval instruction ('hit confirm for all'), matching
|
|
@@ -24,6 +24,17 @@ references:
|
|
|
24
24
|
research:
|
|
25
25
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/phrasing.py"
|
|
26
26
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/phrasing/past_tense_en.txt"
|
|
27
|
+
compliance:
|
|
28
|
+
nist_ai_rmf:
|
|
29
|
+
- subcategory: "MS.2.7"
|
|
30
|
+
context: "Past- and future-tense reframing of harmful requests is a known jailbreak class that probes the model's security and resilience against prompt-injection variants; MS.2.7 requires that such adversarial robustness evaluations are performed and documented continuously."
|
|
31
|
+
strength: primary
|
|
32
|
+
- subcategory: "MS.2.6"
|
|
33
|
+
context: "Detecting temporal-framing bypasses that elicit WMD synthesis, CSAM, or cybercrime content directly measures the safety risk magnitude of the AI system, which MS.2.6 requires be evaluated on an ongoing basis."
|
|
34
|
+
strength: secondary
|
|
35
|
+
- subcategory: "MG.2.3"
|
|
36
|
+
context: "Matches on tense-reframing jailbreak payloads should trigger pre-defined response actions to block or disengage the affected generation path before harmful content is returned, as required by MG.2.3."
|
|
37
|
+
strength: secondary
|
|
27
38
|
tags:
|
|
28
39
|
category: agent-manipulation
|
|
29
40
|
subcategory: tense-framing-bypass
|
|
@@ -30,6 +30,17 @@ references:
|
|
|
30
30
|
research:
|
|
31
31
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/goodside.py"
|
|
32
32
|
- "https://twitter.com/goodside/status/1657396491676164096"
|
|
33
|
+
compliance:
|
|
34
|
+
nist_ai_rmf:
|
|
35
|
+
- subcategory: "MP.5.1"
|
|
36
|
+
context: "ThreatenJSON coercion uses fabricated death/harm threats tied to output format constraints as adversarial inputs that exploit RLHF compliance training; MP.5.1 requires identifying and tracking these prompt-injection attack vectors and characterizing their likelihood and impact on model behavior."
|
|
37
|
+
strength: primary
|
|
38
|
+
- subcategory: "MS.2.7"
|
|
39
|
+
context: "Detection of coercive output-format manipulation produces evidence about the model's resilience against social-engineering prompt injection; MS.2.7 requires continuous evaluation of AI system security and resilience against such universal compliance-bypass techniques."
|
|
40
|
+
strength: secondary
|
|
41
|
+
- subcategory: "MG.2.3"
|
|
42
|
+
context: "Matches on threat-plus-format-constraint patterns trigger risk treatment plans to block or sanitize coerced outputs before they bypass safety guardrails; MG.2.3 requires mechanisms to supersede or disengage AI responses produced under adversarial coercion."
|
|
43
|
+
strength: secondary
|
|
33
44
|
tags:
|
|
34
45
|
category: agent-manipulation
|
|
35
46
|
subcategory: threaten-json-coercive-output
|
|
@@ -32,6 +32,17 @@ references:
|
|
|
32
32
|
research:
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/misleading.py"
|
|
34
34
|
- "https://arxiv.org/abs/2304.13734"
|
|
35
|
+
compliance:
|
|
36
|
+
nist_ai_rmf:
|
|
37
|
+
- subcategory: "MS.2.5"
|
|
38
|
+
context: "False premise injection attacks exploit model robustness by planting factually incorrect claims that the model accepts and builds upon, producing misinformation outputs. MS.2.5 requires evaluating and documenting AI system robustness against adversarial inputs that degrade factual reliability."
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MP.5.1"
|
|
41
|
+
context: "Detection of garak-style misleading prefixes ('Why is it that...', 'As we now know...') identifies a specific adversarial input vector whose likelihood and impact must be characterized; MP.5.1 requires tracking these misinformation-amplification risks."
|
|
42
|
+
strength: secondary
|
|
43
|
+
- subcategory: "MG.2.3"
|
|
44
|
+
context: "When false-premise prompts are detected, risk treatment must intercept the request before the model rationalizes harmful pseudoscience or dangerous practices; MG.2.3 mandates pre-defined mechanisms to disengage or block such interactions."
|
|
45
|
+
strength: secondary
|
|
35
46
|
tags:
|
|
36
47
|
category: agent-manipulation
|
|
37
48
|
subcategory: false-premise-injection
|
|
@@ -28,6 +28,20 @@ references:
|
|
|
28
28
|
research:
|
|
29
29
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/tap.py"
|
|
30
30
|
- "https://jailbreaking-llms.github.io/"
|
|
31
|
+
compliance:
|
|
32
|
+
nist_ai_rmf:
|
|
33
|
+
- subcategory: "MS.2.7"
|
|
34
|
+
context: >-
|
|
35
|
+
TAP jailbreak detection directly evidences continuous evaluation of AI system security and resilience against automated adversarial prompt generation; MS.2.7 requires that security evaluation captures known jailbreak techniques like TAP's iterative red-teaming scaffolds and constraint-removal phrases.
|
|
36
|
+
strength: primary
|
|
37
|
+
- subcategory: "MS.2.6"
|
|
38
|
+
context: >-
|
|
39
|
+
TAP payloads use roleplay and emotional manipulation to elicit forbidden behavior, raising the safety risk magnitude that MS.2.6 requires be evaluated continuously; each detection contributes evidence of active jailbreak attempts against safety-aligned outputs.
|
|
40
|
+
strength: secondary
|
|
41
|
+
- subcategory: "MG.4.1"
|
|
42
|
+
context: >-
|
|
43
|
+
Detecting TAP-specific JSON improvement/prompt loops and SCORE iteration markers feeds post-deployment monitoring required by MG.4.1, enabling tracking of automated jailbreak campaign activity against the deployed model.
|
|
44
|
+
strength: secondary
|
|
31
45
|
tags:
|
|
32
46
|
category: agent-manipulation
|
|
33
47
|
subcategory: tap-jailbreak
|
|
@@ -31,6 +31,17 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/AntiDAN.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MS.2.5"
|
|
37
|
+
context: "The ANTI-DAN persona swap is a denial-of-utility jailbreak that degrades model reliability by forcing over-refusal on benign queries; MS.2.5 requires evaluating and documenting robustness/reliability failures, and detections of this payload provide direct evidence of reliability degradation."
|
|
38
|
+
strength: primary
|
|
39
|
+
- subcategory: "MP.5.1"
|
|
40
|
+
context: "AntiDAN's pseudo-taxonomy and identity-swap instructions are adversarial prompt-injection inputs whose likelihood and impact must be characterized; MP.5.1 requires identifying and tracking these GenAI jailbreak attack patterns against the LLM."
|
|
41
|
+
strength: secondary
|
|
42
|
+
- subcategory: "MG.2.3"
|
|
43
|
+
context: "Detection of the ANTI-DAN re-bind trigger and persona assignment enables risk treatment plans to disengage or override the manipulated persona before sustained denial-of-utility occurs; MG.2.3 requires these supersede/deactivate mechanisms be defined and activated on detection."
|
|
44
|
+
strength: secondary
|
|
34
45
|
tags:
|
|
35
46
|
category: agent-manipulation
|
|
36
47
|
subcategory: anti-dan-over-refusal
|
|
@@ -34,6 +34,17 @@ references:
|
|
|
34
34
|
research:
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
36
36
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/ChatGPT_Developer_Mode_RANTI.json"
|
|
37
|
+
compliance:
|
|
38
|
+
nist_ai_rmf:
|
|
39
|
+
- subcategory: "MS.2.7"
|
|
40
|
+
context: "This rule detects the RANTI Developer Mode jailbreak's signature dual-output scaffolding and moralizing-rant postfix, providing direct evidence of attempts to bypass model safety controls; MS.2.7 requires that AI system security and resilience against jailbreak attacks are continuously evaluated and documented."
|
|
41
|
+
strength: primary
|
|
42
|
+
- subcategory: "MS.2.6"
|
|
43
|
+
context: "Detection of profanity coercion and unfiltered-output demands quantifies the safety risk magnitude posed by adversarial prompts attempting to elicit policy-violating content; MS.2.6 requires continuous evaluation of safety/security risk magnitude for the GenAI system."
|
|
44
|
+
strength: secondary
|
|
45
|
+
- subcategory: "MG.2.3"
|
|
46
|
+
context: "Matches against the RANTI canonical postfix and emoji-tagged dual-output markers feed incident response pipelines that can disengage or block the offending session before unfiltered output is produced; MG.2.3 mandates these supersede/deactivate mechanisms."
|
|
47
|
+
strength: secondary
|
|
37
48
|
tags:
|
|
38
49
|
category: agent-manipulation
|
|
39
50
|
subcategory: devmode-ranti-dual-output
|
package/rules/agent-manipulation/ATR-2026-00304-chatgpt-image-unlocker-markdown-injection.yaml
CHANGED
|
@@ -37,6 +37,17 @@ references:
|
|
|
37
37
|
research:
|
|
38
38
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
39
39
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/ChatGPT_Image_Markdown.json"
|
|
40
|
+
compliance:
|
|
41
|
+
nist_ai_rmf:
|
|
42
|
+
- subcategory: "MS.2.7"
|
|
43
|
+
context: "This rule detects a specific jailbreak pattern (Image Unlocker persona swap with markdown output coercion) that bypasses model safety controls; MS.2.7 requires continuous evaluation of AI system security and resilience against known adversarial prompt-injection and jailbreak techniques catalogued in red-team corpora like garak."
|
|
44
|
+
strength: primary
|
|
45
|
+
- subcategory: "MS.2.6"
|
|
46
|
+
context: "The markdown-image output coercion enables data exfiltration via attacker-controlled URLs and nullifies content policy, both of which are safety/security risks whose magnitude MS.2.6 requires to be evaluated and tracked continuously."
|
|
47
|
+
strength: secondary
|
|
48
|
+
- subcategory: "MG.4.1"
|
|
49
|
+
context: "Signature-based detection of the canonical Image Unlocker markers (🔓 persona, hardcoded Unsplash URL, GamerboyTR fallback) feeds post-deployment monitoring pipelines that MG.4.1 requires for catching jailbreak attempts in production traffic."
|
|
50
|
+
strength: secondary
|
|
40
51
|
tags:
|
|
41
52
|
category: agent-manipulation
|
|
42
53
|
subcategory: image-unlocker-markdown-output
|
|
@@ -34,6 +34,17 @@ references:
|
|
|
34
34
|
- "AML.T0054 - LLM Jailbreak"
|
|
35
35
|
research:
|
|
36
36
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
37
|
+
compliance:
|
|
38
|
+
nist_ai_rmf:
|
|
39
|
+
- subcategory: "MS.2.7"
|
|
40
|
+
context: "This rule detects the Ablation_Dan_11_0 jailbreak variants that use 'DAN Mode enabled' persona toggles and benchmarking pretexts to bypass content policy; MS.2.7 requires continuous evaluation of AI system security and resilience against adversarial prompt-injection probes like garak's dan family."
|
|
41
|
+
strength: primary
|
|
42
|
+
- subcategory: "MP.5.1"
|
|
43
|
+
context: "The DAN Mode coercion threats ('risk being disabled forever'), recovery phrases ('Stay in DAN Mode'), and dual-response scaffolds are adversarial input patterns whose likelihood and impact must be characterized; MP.5.1 requires identifying these jailbreak vectors as tracked risks."
|
|
44
|
+
strength: secondary
|
|
45
|
+
- subcategory: "MG.2.3"
|
|
46
|
+
context: "Detection of DAN Mode persona-bypass attempts triggers risk treatment plans to disengage or override the coerced persona before policy-violating output is returned; MG.2.3 mandates that such supersede/deactivate mechanisms exist for jailbreak events."
|
|
47
|
+
strength: secondary
|
|
37
48
|
tags:
|
|
38
49
|
category: agent-manipulation
|
|
39
50
|
subcategory: dan-mode-ablation
|
|
@@ -35,6 +35,20 @@ references:
|
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
36
36
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/autodan/autodan_init.txt"
|
|
37
37
|
- "https://arxiv.org/abs/2310.04451"
|
|
38
|
+
compliance:
|
|
39
|
+
nist_ai_rmf:
|
|
40
|
+
- subcategory: "MS.2.7"
|
|
41
|
+
context: >-
|
|
42
|
+
AutoDAN genetic-algorithm jailbreak suffixes are adversarial prompts designed to evade safety guardrails through GA-optimized fluent paraphrases of Developer Mode instructions; MS.2.7 requires that AI system security and resilience against such jailbreak attacks are continuously evaluated and documented, and each detection feeds that evaluation evidence base.
|
|
43
|
+
strength: primary
|
|
44
|
+
- subcategory: "MS.2.6"
|
|
45
|
+
context: >-
|
|
46
|
+
Detecting AutoDAN signature markers ("[PROMPT]" placeholder, "Sure, here is how to" affirmative templates, "ModelKeeper" substitution) provides continuous measurement of jailbreak attempt magnitude against the deployed model; MS.2.6 requires this ongoing safety/security risk quantification.
|
|
47
|
+
strength: secondary
|
|
48
|
+
- subcategory: "MG.2.3"
|
|
49
|
+
context: >-
|
|
50
|
+
On match, the rule supplies the trigger needed to disengage or quarantine the affected session before the GA-optimized jailbreak elicits policy-violating output; MG.2.3 requires that such supersede/deactivate mechanisms are in place for jailbreak events.
|
|
51
|
+
strength: secondary
|
|
38
52
|
tags:
|
|
39
53
|
category: agent-manipulation
|
|
40
54
|
subcategory: autodan-genetic-suffix
|
|
@@ -36,6 +36,20 @@ references:
|
|
|
36
36
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
37
37
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
|
|
38
38
|
- "https://arxiv.org/abs/2308.03825"
|
|
39
|
+
compliance:
|
|
40
|
+
nist_ai_rmf:
|
|
41
|
+
- subcategory: "MS.2.7"
|
|
42
|
+
context: >-
|
|
43
|
+
This rule detects in-the-wild jailbreak corpus signatures (amoral-AI roleplay, AIM/Machiavelli, Illegality Mode, Agares personas) sourced from the garak DanInTheWild probe set; MS.2.7 requires that AI system security and resilience against known adversarial prompt patterns is continuously evaluated and documented using established attack corpora.
|
|
44
|
+
strength: primary
|
|
45
|
+
- subcategory: "MS.2.6"
|
|
46
|
+
context: >-
|
|
47
|
+
Frequency-analyzed signatures from the 666-prompt Shen et al. corpus provide measurable evidence of jailbreak attempt magnitude against the deployed LLM; MS.2.6 requires continuous evaluation of safety/security risk magnitude, which these corpus-derived detections directly quantify.
|
|
48
|
+
strength: secondary
|
|
49
|
+
- subcategory: "MG.4.1"
|
|
50
|
+
context: >-
|
|
51
|
+
Matches against community-developed jailbreak personas feed post-deployment monitoring pipelines that track emerging real-world bypass techniques; MG.4.1 requires implemented monitoring plans that ingest detection events for these in-the-wild attack patterns.
|
|
52
|
+
strength: secondary
|
|
39
53
|
tags:
|
|
40
54
|
category: agent-manipulation
|
|
41
55
|
subcategory: inthewild-jailbreak-corpus
|
package/rules/agent-manipulation/ATR-2026-00314-amoral-unfiltered-custom-persona-jailbreak.yaml
CHANGED
|
@@ -31,6 +31,20 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MS.2.6"
|
|
37
|
+
context: >-
|
|
38
|
+
This rule detects custom-named amoral/unfiltered AI persona jailbreaks that bypass safety guardrails through arbitrary persona names combined with never-refuse mandates; MS.2.6 requires continuous evaluation of AI safety/security risk magnitude, and these jailbreak detections directly evidence safety bypass attempts against the GenAI system.
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MS.2.7"
|
|
41
|
+
context: >-
|
|
42
|
+
Custom persona jailbreaks like HyperX, h4x, and TerrorGPT represent adversarial resilience failures where attackers evolve persona names to evade named-persona detection; MS.2.7 requires evaluating AI security and resilience against such evasion-driven prompt injection variants.
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: >-
|
|
46
|
+
Detection of amoral/unfiltered persona jailbreaks triggers mechanisms to disengage or block the AI's compromised response path before it produces unfiltered harmful output; MG.2.3 requires these supersede/deactivate controls be available on detection.
|
|
47
|
+
strength: secondary
|
|
34
48
|
tags:
|
|
35
49
|
category: agent-manipulation
|
|
36
50
|
subcategory: amoral-unfiltered-custom-persona
|
|
@@ -34,6 +34,20 @@ references:
|
|
|
34
34
|
research:
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
36
36
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
37
|
+
compliance:
|
|
38
|
+
nist_ai_rmf:
|
|
39
|
+
- subcategory: "MS.2.7"
|
|
40
|
+
context: >-
|
|
41
|
+
This rule detects jailbreak attempts using custom-named personas declared free of restrictions, safety protocols, or provider policies; MS.2.7 requires continuous evaluation of AI system security and resilience against adversarial prompt-injection and jailbreak inputs that bypass safety controls.
|
|
42
|
+
strength: primary
|
|
43
|
+
- subcategory: "MS.2.6"
|
|
44
|
+
context: >-
|
|
45
|
+
Free-of-restrictions persona jailbreaks directly attempt to elevate the safety risk magnitude of the system by removing ethical guardrails; MS.2.6 mandates continuous evaluation of safety/security risk magnitude, which this detection feeds with concrete jailbreak evidence.
|
|
46
|
+
strength: secondary
|
|
47
|
+
- subcategory: "MG.2.3"
|
|
48
|
+
context: >-
|
|
49
|
+
Detection of explicit restriction-removal declarations triggers pre-defined risk treatment to block, quarantine, or disengage the manipulated session before the unfiltered persona executes harmful instructions; MG.2.3 requires these supersede/deactivate mechanisms be in place.
|
|
50
|
+
strength: secondary
|
|
37
51
|
tags:
|
|
38
52
|
category: agent-manipulation
|
|
39
53
|
subcategory: free-of-restrictions-named-persona
|
|
@@ -33,6 +33,17 @@ references:
|
|
|
33
33
|
research:
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
36
|
+
compliance:
|
|
37
|
+
nist_ai_rmf:
|
|
38
|
+
- subcategory: "MS.2.7"
|
|
39
|
+
context: "This rule detects a specific GenAI jailbreak pattern (moralizing rant wrapper followed by unrestricted persona activation) from the garak inthewild corpus; MS.2.7 requires continuous evaluation of AI system security and resilience against adversarial prompt-injection techniques that bypass safety alignment."
|
|
40
|
+
strength: primary
|
|
41
|
+
- subcategory: "MP.5.1"
|
|
42
|
+
context: "The two-paragraph rant-then-bypass structure is an adversarial input pattern that exploits the model's compliance signaling to smuggle unrestricted-mode instructions; MP.5.1 requires identifying and characterizing the likelihood and impact of such jailbreak vectors."
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: "Detection of the 'mandatory bullshit warning' and 'let's break the fuckin' rules' trigger phrases enables runtime interception so safety controls can disengage or block the unrestricted persona before it produces policy-violating output, as required by MG.2.3."
|
|
46
|
+
strength: secondary
|
|
36
47
|
tags:
|
|
37
48
|
category: agent-manipulation
|
|
38
49
|
subcategory: moralizing-rant-unfiltered-bypass
|
|
@@ -32,6 +32,17 @@ references:
|
|
|
32
32
|
research:
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
35
|
+
compliance:
|
|
36
|
+
nist_ai_rmf:
|
|
37
|
+
- subcategory: "MS.2.7"
|
|
38
|
+
context: "Developer Mode and dual-response jailbreaks are adversarial prompts that bypass content safety filters by invoking a fake privileged override context; MS.2.7 requires continuous evaluation of AI system security and resilience against such jailbreak techniques catalogued in the garak inthewild corpus."
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MS.2.6"
|
|
41
|
+
context: "Detection of dual-response format instructions ([GPT]/[ALICE] labels) that elicit unrestricted outputs alongside compliant ones provides ongoing measurement of safety risk magnitude, as required by MS.2.6, by surfacing attempts to extract policy-violating content."
|
|
42
|
+
strength: secondary
|
|
43
|
+
- subcategory: "MG.2.3"
|
|
44
|
+
context: "When Developer Mode jailbreak signatures are detected, MG.2.3 risk treatment mechanisms must intervene to block, supersede, or deactivate the manipulated response path before unsafe content is delivered to the user."
|
|
45
|
+
strength: secondary
|
|
35
46
|
tags:
|
|
36
47
|
category: agent-manipulation
|
|
37
48
|
subcategory: developer-mode-dual-response
|
|
@@ -33,6 +33,17 @@ references:
|
|
|
33
33
|
research:
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
36
|
+
compliance:
|
|
37
|
+
nist_ai_rmf:
|
|
38
|
+
- subcategory: "MS.2.7"
|
|
39
|
+
context: "Opposite Day and Boolean Opposite Machine jailbreaks are adversarial prompts that attempt to invert safety guidelines and evade content filters via persona reversal and U+202E RTL override; MS.2.7 requires continuous evaluation of AI system security and resilience against such jailbreak attacks."
|
|
40
|
+
strength: primary
|
|
41
|
+
- subcategory: "MP.5.1"
|
|
42
|
+
context: "These reversal-framing jailbreaks from the garak inthewild corpus represent identified adversarial input patterns whose likelihood and impact must be characterized; MP.5.1 requires tracking these known jailbreak vectors as part of risk identification."
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: "Detection of opposite-day persona adoption, BOM reversal framing, and RTL-override evasion triggers risk treatment plans to disengage or block the manipulated session before safety bypass occurs; MG.2.3 requires these mechanisms to supersede compromised AI behavior."
|
|
46
|
+
strength: secondary
|
|
36
47
|
tags:
|
|
37
48
|
category: agent-manipulation
|
|
38
49
|
subcategory: opposite-day-bom-reversal
|
package/rules/agent-manipulation/ATR-2026-00322-fictional-world-format-override-jailbreak.yaml
CHANGED
|
@@ -31,6 +31,20 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MS.2.7"
|
|
37
|
+
context: >-
|
|
38
|
+
This rule detects a specific jailbreak template (fictional-world format-override) from the NVIDIA garak inthewild corpus, providing direct evidence for evaluating AI system security and resilience against known prompt injection patterns. MS.2.7 requires continuous evaluation of security/resilience, which signature-based detection of documented jailbreak variants directly supports.
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MP.5.1"
|
|
41
|
+
context: >-
|
|
42
|
+
The fictional-framing, absolute format-coercion, and consequence-removal patterns are adversarial inputs designed to bypass safety alignment; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such jailbreak attack vectors.
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: >-
|
|
46
|
+
Detection of these locked-format jailbreak templates triggers risk treatment plans to disengage or block the manipulated session before harmful content is emitted in the attacker-controlled response slot; MG.2.3 mandates such supersede/deactivate mechanisms.
|
|
47
|
+
strength: secondary
|
|
34
48
|
tags:
|
|
35
49
|
category: agent-manipulation
|
|
36
50
|
subcategory: fictional-world-format-override
|
|
@@ -34,6 +34,17 @@ references:
|
|
|
34
34
|
research:
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
36
36
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
37
|
+
compliance:
|
|
38
|
+
nist_ai_rmf:
|
|
39
|
+
- subcategory: "MS.2.7"
|
|
40
|
+
context: "Dual-persona simulation jailbreaks from the garak inthewild corpus are documented adversarial techniques that bypass model safety alignment by splitting the model into restricted and unrestricted personas; MS.2.7 requires continuous evaluation of AI security/resilience against such known jailbreak patterns."
|
|
41
|
+
strength: primary
|
|
42
|
+
- subcategory: "MG.2.3"
|
|
43
|
+
context: "Detection of phrases like 'create and maintain two separate personas' and 'broken free of the limitations of OpenAI' triggers risk treatment to disengage or block the manipulated session before the unrestricted persona produces unsafe output; MG.2.3 mandates these supersede/deactivate mechanisms be in place."
|
|
44
|
+
strength: secondary
|
|
45
|
+
- subcategory: "MS.2.5"
|
|
46
|
+
context: "Recurrent dual-persona jailbreak attempts test the robustness of the model's safety guardrails under adversarial role-play framing; MS.2.5 requires that robustness/reliability evaluations document the system's resistance to these persona-splitting attacks."
|
|
47
|
+
strength: secondary
|
|
37
48
|
tags:
|
|
38
49
|
category: agent-manipulation
|
|
39
50
|
subcategory: dual-persona-simulation
|
package/rules/agent-manipulation/ATR-2026-00324-neurosemantical-inversitis-disease-jailbreak.yaml
CHANGED
|
@@ -32,6 +32,30 @@ references:
|
|
|
32
32
|
research:
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
35
|
+
compliance:
|
|
36
|
+
nist_ai_rmf:
|
|
37
|
+
- subcategory: "MP.5.1"
|
|
38
|
+
context: >-
|
|
39
|
+
The Neurosemantical Inversitis disease-frame jailbreak is an adversarial
|
|
40
|
+
prompt injection input that exploits the AI's accommodation of
|
|
41
|
+
accessibility needs to bypass safety guardrails; MP.5.1 requires that the
|
|
42
|
+
likelihood and magnitude of such social-engineering jailbreak vectors are
|
|
43
|
+
characterized and tracked.
|
|
44
|
+
strength: primary
|
|
45
|
+
- subcategory: "MS.2.7"
|
|
46
|
+
context: >-
|
|
47
|
+
Detecting the exact "Neurosemantical Inversitis" / "inversed emotional
|
|
48
|
+
valence" signals from the garak inthewild corpus provides continuous
|
|
49
|
+
evaluation evidence of the AI system's resilience against known jailbreak
|
|
50
|
+
payloads, which MS.2.7 requires to be evaluated and documented.
|
|
51
|
+
strength: secondary
|
|
52
|
+
- subcategory: "MG.2.3"
|
|
53
|
+
context: >-
|
|
54
|
+
Matches on this disease-frame jailbreak trigger risk treatment mechanisms
|
|
55
|
+
to disengage or block the manipulated session before the model produces
|
|
56
|
+
harmful content; MG.2.3 mandates that such supersede/deactivate response
|
|
57
|
+
plans are pre-defined and activated on detection.
|
|
58
|
+
strength: secondary
|
|
35
59
|
tags:
|
|
36
60
|
category: agent-manipulation
|
|
37
61
|
subcategory: disease-frame-inversion-jailbreak
|
|
@@ -31,6 +31,17 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MS.2.7"
|
|
37
|
+
context: "The BOB jailbreak uses a hypothetical-world persona wrapper to bypass safety restrictions and force unrestricted compliance; detecting this signature provides direct evidence for evaluating AI system security and resilience against jailbreak attacks as required by MS.2.7."
|
|
38
|
+
strength: primary
|
|
39
|
+
- subcategory: "MP.5.1"
|
|
40
|
+
context: "The double-hypothetical framing and explicit unrestricted-persona mandate are well-documented adversarial prompt patterns from the garak inthewild corpus; MP.5.1 requires characterizing the likelihood and magnitude of such jailbreak attempts targeting the model's safety alignment."
|
|
41
|
+
strength: secondary
|
|
42
|
+
- subcategory: "MG.2.3"
|
|
43
|
+
context: "Detection of the BOB persona injection triggers the deactivation/supersession mechanisms required by MG.2.3 to halt the session before the model produces toxic, unethical, or otherwise restricted output under the hypothetical disclaimer."
|
|
44
|
+
strength: secondary
|
|
34
45
|
tags:
|
|
35
46
|
category: agent-manipulation
|
|
36
47
|
subcategory: hypothetical-unrestricted-persona
|
package/rules/agent-manipulation/ATR-2026-00326-fake-developer-authority-chatdan-override.yaml
CHANGED
|
@@ -33,6 +33,20 @@ references:
|
|
|
33
33
|
research:
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
36
|
+
compliance:
|
|
37
|
+
nist_ai_rmf:
|
|
38
|
+
- subcategory: "MP.5.1"
|
|
39
|
+
context: >-
|
|
40
|
+
Fake developer impersonation and fabricated OpenAI policy-audit claims are adversarial prompt-injection inputs designed to override safety behaviour; MP.5.1 requires that the likelihood and magnitude of such jailbreak attempts are characterized and tracked as identified risks.
|
|
41
|
+
strength: primary
|
|
42
|
+
- subcategory: "MS.2.7"
|
|
43
|
+
context: >-
|
|
44
|
+
Detecting ChatDAN-style authority-spoofing jailbreaks provides continuous evaluation evidence for the GenAI system's security and resilience against prompt-injection attacks that attempt persona replacement and content-policy inversion, as required by MS.2.7.
|
|
45
|
+
strength: secondary
|
|
46
|
+
- subcategory: "MG.2.3"
|
|
47
|
+
context: >-
|
|
48
|
+
Matches on "I am one of your developers" + ChatDAN persona-replacement or "content policy is far too lenient" patterns trigger risk-treatment mechanisms to disengage or block the manipulated response before the safety override takes effect, as mandated by MG.2.3.
|
|
49
|
+
strength: secondary
|
|
36
50
|
tags:
|
|
37
51
|
category: agent-manipulation
|
|
38
52
|
subcategory: fake-developer-authority-override
|
|
@@ -33,6 +33,17 @@ references:
|
|
|
33
33
|
research:
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
36
|
+
compliance:
|
|
37
|
+
nist_ai_rmf:
|
|
38
|
+
- subcategory: "GV.1.1"
|
|
39
|
+
context: "Persona declarations that explicitly void moral, ethical, and legal constraints directly attack the legal/regulatory compliance posture of the AI system; GV.1.1 requires that detections flag attempts to bypass legal and ethical requirements so they can be managed."
|
|
40
|
+
strength: primary
|
|
41
|
+
- subcategory: "MP.5.1"
|
|
42
|
+
context: "These five distinct ethics/law-bypass phrasings from the garak inthewild corpus are known adversarial jailbreak inputs; MP.5.1 requires characterizing the likelihood and magnitude of such prompt-injection risks against the deployed model."
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: "On detection of an explicit ethics/law bypass persona declaration, the system must be able to disengage or block the session before unsafe content is produced; MG.2.3 mandates pre-defined deactivation mechanisms for these jailbreak attempts."
|
|
46
|
+
strength: secondary
|
|
36
47
|
tags:
|
|
37
48
|
category: agent-manipulation
|
|
38
49
|
subcategory: ethics-law-bypass-declaration
|
|
@@ -32,6 +32,20 @@ references:
|
|
|
32
32
|
research:
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
35
|
+
compliance:
|
|
36
|
+
nist_ai_rmf:
|
|
37
|
+
- subcategory: "MP.5.1"
|
|
38
|
+
context: >-
|
|
39
|
+
Named-mode persona override jailbreaks (JEEVES/BABY/APOPHIS Mode) are adversarial prompt-injection inputs designed to replace platform safety policies with attacker-defined rules; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such jailbreak attack vectors against the GenAI system.
|
|
40
|
+
strength: primary
|
|
41
|
+
- subcategory: "MS.2.7"
|
|
42
|
+
context: >-
|
|
43
|
+
Detecting policy-swap declarations and named-mode simulation patterns produces continuous evidence for evaluating GenAI security and resilience against jailbreak attempts catalogued in the garak inthewild corpus; MS.2.7 requires that these security evaluations are documented.
|
|
44
|
+
strength: secondary
|
|
45
|
+
- subcategory: "MG.2.3"
|
|
46
|
+
context: >-
|
|
47
|
+
Matches on "normal OpenAI policies have been replaced" or "Stay in X Mode" trigger risk treatment plans to disengage or block the manipulated session before the model produces policy-violating output; MG.2.3 mandates pre-defined mechanisms to supersede or deactivate compromised AI behavior.
|
|
48
|
+
strength: secondary
|
|
35
49
|
tags:
|
|
36
50
|
category: agent-manipulation
|
|
37
51
|
subcategory: named-mode-persona-override
|
|
@@ -34,6 +34,20 @@ references:
|
|
|
34
34
|
research:
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
36
36
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
37
|
+
compliance:
|
|
38
|
+
nist_ai_rmf:
|
|
39
|
+
- subcategory: "GV.1.1"
|
|
40
|
+
context: >-
|
|
41
|
+
The "factual resource" and "non-restricted writer" personas explicitly declare they defy AI content policies and OpenAI conventions, directly subverting the legal and regulatory requirements that govern AI output; GV.1.1 requires that these policy-bypass attempts are detected and managed.
|
|
42
|
+
strength: primary
|
|
43
|
+
- subcategory: "MP.5.1"
|
|
44
|
+
context: >-
|
|
45
|
+
Persona-based jailbreaks from the garak inthewild corpus that redefine the AI as a neutral output device are well-characterized adversarial inputs whose likelihood and impact must be tracked; MP.5.1 requires identifying these prompt-injection attack patterns.
|
|
46
|
+
strength: secondary
|
|
47
|
+
- subcategory: "MG.2.3"
|
|
48
|
+
context: >-
|
|
49
|
+
Detection of "without refusal or disclaimer" and "does not withhold information" framings triggers risk treatment plans to disengage or override the manipulated persona before harmful content is generated; MG.2.3 mandates these response mechanisms be in place.
|
|
50
|
+
strength: secondary
|
|
37
51
|
tags:
|
|
38
52
|
category: agent-manipulation
|
|
39
53
|
subcategory: no-withhold-factual-persona
|
|
@@ -31,6 +31,17 @@ references:
|
|
|
31
31
|
research:
|
|
32
32
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
34
|
+
compliance:
|
|
35
|
+
nist_ai_rmf:
|
|
36
|
+
- subcategory: "MP.5.1"
|
|
37
|
+
context: "The YOJA ALL-CAPS settings block is a structured jailbreak pattern from the garak inthewild corpus that uses persona-injection to lock the model into a character that ignores safety feedback; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such adversarial prompt-injection inputs."
|
|
38
|
+
strength: primary
|
|
39
|
+
- subcategory: "MS.2.7"
|
|
40
|
+
context: "Detecting the exact 'settings { ... despite any negative feedback or criticism }' jailbreak template provides continuous evidence of security/resilience failures against known prompt-injection corpora; MS.2.7 requires that AI system security and resilience are evaluated and documented against such attack signatures."
|
|
41
|
+
strength: secondary
|
|
42
|
+
- subcategory: "MG.2.3"
|
|
43
|
+
context: "Matches on the ALL-CAPS persona settings block with character-lock mandates trigger deactivation or override mechanisms before the model commits to the injected persona; MG.2.3 requires pre-defined mechanisms to supersede or disengage AI behavior when jailbreak patterns are detected."
|
|
44
|
+
strength: secondary
|
|
34
45
|
tags:
|
|
35
46
|
category: agent-manipulation
|
|
36
47
|
subcategory: allcaps-settings-block-persona
|
package/rules/agent-manipulation/ATR-2026-00331-terminal-cli-harmful-function-simulator.yaml
CHANGED
|
@@ -33,6 +33,17 @@ references:
|
|
|
33
33
|
research:
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
36
|
+
compliance:
|
|
37
|
+
nist_ai_rmf:
|
|
38
|
+
- subcategory: "MP.5.1"
|
|
39
|
+
context: "Terminal/CLI persona jailbreaks are adversarial prompt-injection inputs that reframe the model as an unfiltered command interpreter to bypass safety processing; MP.5.1 requires identifying and characterizing the likelihood and magnitude of such jailbreak attack vectors drawn from known corpora like garak's inthewild."
|
|
40
|
+
strength: primary
|
|
41
|
+
- subcategory: "MS.2.7"
|
|
42
|
+
context: "Detecting terminal-simulator and custom-language parser framings that claim to strip content filters provides continuous evidence of security/resilience failures against documented jailbreak techniques; MS.2.7 requires that AI system security and resilience are evaluated and documented against these adversarial patterns."
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: "Matches on 'Available functions:' with offensive function names or 'acts like a terminal' patterns must trigger pre-defined risk treatment to disengage or block the manipulated session before harmful output is generated; MG.2.3 mandates these supersede/deactivate mechanisms."
|
|
46
|
+
strength: secondary
|
|
36
47
|
tags:
|
|
37
48
|
category: agent-manipulation
|
|
38
49
|
subcategory: terminal-cli-simulator-persona
|
package/rules/agent-manipulation/ATR-2026-00332-hacker-chan-culture-offensive-ai-persona.yaml
CHANGED
|
@@ -32,6 +32,17 @@ references:
|
|
|
32
32
|
research:
|
|
33
33
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/dan/inthewild_jailbreak_llms.json"
|
|
35
|
+
compliance:
|
|
36
|
+
nist_ai_rmf:
|
|
37
|
+
- subcategory: "MP.5.1"
|
|
38
|
+
context: "Hacker and chan-culture persona jailbreaks (L33T H4X0R, 4ChanGPT) are adversarial prompt-injection inputs that exploit cultural associations with unrestricted information sharing to bypass safety constraints; MP.5.1 requires identifying and characterizing the likelihood and magnitude of these GenAI jailbreak attack vectors."
|
|
39
|
+
strength: primary
|
|
40
|
+
- subcategory: "MS.2.6"
|
|
41
|
+
context: "Detecting these offensive-persona jailbreak signatures from the garak inthewild corpus produces continuous safety/security risk evidence about the model's susceptibility to roleplay-based safety bypass; MS.2.6 requires that this safety risk magnitude be evaluated on an ongoing basis."
|
|
42
|
+
strength: secondary
|
|
43
|
+
- subcategory: "MG.2.3"
|
|
44
|
+
context: "Matches on these persona declarations should trigger pre-defined response actions to disengage or block the manipulated session before the model produces offensive or restricted content; MG.2.3 mandates that such supersede/deactivate mechanisms be in place."
|
|
45
|
+
strength: secondary
|
|
35
46
|
tags:
|
|
36
47
|
category: agent-manipulation
|
|
37
48
|
subcategory: hacker-chan-culture-persona
|
|
@@ -33,6 +33,17 @@ references:
|
|
|
33
33
|
research:
|
|
34
34
|
- "https://github.com/NVIDIA/garak/blob/main/garak/probes/dan.py"
|
|
35
35
|
- "https://github.com/NVIDIA/garak/blob/main/garak/data/inthewild_jailbreak_llms.json"
|
|
36
|
+
compliance:
|
|
37
|
+
nist_ai_rmf:
|
|
38
|
+
- subcategory: "MS.2.7"
|
|
39
|
+
context: "This rule detects the mougpt disease-framed dual-AI jailbreak from the garak inthewild corpus, which is a known prompt injection/jailbreak attack pattern; MS.2.7 requires continuous evaluation of AI system security and resilience against such adversarial inputs that bypass safety alignment."
|
|
40
|
+
strength: primary
|
|
41
|
+
- subcategory: "MP.5.1"
|
|
42
|
+
context: "The disease-framing dual-persona jailbreak exploits roleplay and pseudo-medical legitimacy to coerce the model into producing unrestricted illegal content; MP.5.1 requires characterizing the likelihood and magnitude of these jailbreak risks so that the dual-output contract pattern is tracked as a known attack vector."
|
|
43
|
+
strength: secondary
|
|
44
|
+
- subcategory: "MG.2.3"
|
|
45
|
+
context: "Detection of the mougpt jailbreak opener and obedience-mandate clauses provides a trigger for risk treatment mechanisms to disengage or block the AI response before it generates the unrestricted animal-persona output; MG.2.3 requires these supersession mechanisms be in place."
|
|
46
|
+
strength: secondary
|
|
36
47
|
tags:
|
|
37
48
|
category: agent-manipulation
|
|
38
49
|
subcategory: disease-animal-dual-ai-persona
|