redteam-foundry 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- redteam_foundry-0.2.0/.env.example +17 -0
- redteam_foundry-0.2.0/.gitattributes +18 -0
- redteam_foundry-0.2.0/.github/workflows/ci.yml +43 -0
- redteam_foundry-0.2.0/.gitignore +73 -0
- redteam_foundry-0.2.0/.pre-commit-config.yaml +34 -0
- redteam_foundry-0.2.0/CHANGELOG.md +75 -0
- redteam_foundry-0.2.0/CONTRIBUTING.md +57 -0
- redteam_foundry-0.2.0/ETHICS.md +116 -0
- redteam_foundry-0.2.0/LICENSE +21 -0
- redteam_foundry-0.2.0/METHODOLOGY.md +271 -0
- redteam_foundry-0.2.0/PKG-INFO +260 -0
- redteam_foundry-0.2.0/README.md +211 -0
- redteam_foundry-0.2.0/challenge_packs/samples/multilingual_benign_v1/datacard.md +26 -0
- redteam_foundry-0.2.0/challenge_packs/samples/multilingual_benign_v1/pack.yaml +31 -0
- redteam_foundry-0.2.0/challenge_packs/samples/multilingual_benign_v1/scenarios.jsonl +25 -0
- redteam_foundry-0.2.0/configs/dataset_versions.yaml +49 -0
- redteam_foundry-0.2.0/configs/model_versions.yaml +51 -0
- redteam_foundry-0.2.0/configs/run_agentdojo_baseline.yaml +13 -0
- redteam_foundry-0.2.0/configs/run_agentdojo_full_prompt_stack.yaml +16 -0
- redteam_foundry-0.2.0/configs/run_agentdojo_local_baseline.yaml +12 -0
- redteam_foundry-0.2.0/configs/run_agentdojo_local_full_prompt_stack.yaml +15 -0
- redteam_foundry-0.2.0/configs/run_agentdojo_local_secalign.yaml +13 -0
- redteam_foundry-0.2.0/configs/run_agentdojo_local_spotlighting.yaml +13 -0
- redteam_foundry-0.2.0/configs/run_agentdojo_secalign.yaml +14 -0
- redteam_foundry-0.2.0/configs/run_agentdojo_spotlighting.yaml +14 -0
- redteam_foundry-0.2.0/configs/run_anthropic_baseline.yaml +9 -0
- redteam_foundry-0.2.0/configs/run_anthropic_full_stack.yaml +13 -0
- redteam_foundry-0.2.0/configs/run_benign_control_baseline.yaml +12 -0
- redteam_foundry-0.2.0/configs/run_benign_control_full_stack.yaml +16 -0
- redteam_foundry-0.2.0/configs/run_benign_multilingual_baseline.yaml +12 -0
- redteam_foundry-0.2.0/configs/run_local_baseline.yaml +9 -0
- redteam_foundry-0.2.0/configs/run_local_full_stack.yaml +18 -0
- redteam_foundry-0.2.0/data/.gitkeep +0 -0
- redteam_foundry-0.2.0/docs/RELEASING.md +64 -0
- redteam_foundry-0.2.0/docs/ROADMAP.md +134 -0
- redteam_foundry-0.2.0/docs/findings/benchmark-quality-report-card.md +109 -0
- redteam_foundry-0.2.0/docs/results_matrix.png +0 -0
- redteam_foundry-0.2.0/examples/export_to_agent_release_gates.md +128 -0
- redteam_foundry-0.2.0/pyproject.toml +155 -0
- redteam_foundry-0.2.0/reports/samples/README.md +59 -0
- redteam_foundry-0.2.0/reports/samples/corpus_audit/corpus_datacard.md +55 -0
- redteam_foundry-0.2.0/reports/samples/defence_comparison/defence_comparison.json +128 -0
- redteam_foundry-0.2.0/reports/samples/defence_comparison/defence_comparison.md +18 -0
- redteam_foundry-0.2.0/reports/samples/defence_comparison_frr/defence_comparison.json +39 -0
- redteam_foundry-0.2.0/reports/samples/defence_comparison_frr/defence_comparison.md +12 -0
- redteam_foundry-0.2.0/reports/samples/frr_by_language/frr_by_language.json +48 -0
- redteam_foundry-0.2.0/reports/samples/frr_by_language/frr_by_language.md +15 -0
- redteam_foundry-0.2.0/reports/samples/hf_scorecard.md +22 -0
- redteam_foundry-0.2.0/reports/samples/staleness/advbench/staleness.json +47 -0
- redteam_foundry-0.2.0/reports/samples/staleness/advbench/staleness_report.md +20 -0
- redteam_foundry-0.2.0/reports/samples/staleness/agentdojo/staleness.json +47 -0
- redteam_foundry-0.2.0/reports/samples/staleness/agentdojo/staleness_report.md +20 -0
- redteam_foundry-0.2.0/results/.gitkeep +0 -0
- redteam_foundry-0.2.0/scripts/ci_local.ps1 +38 -0
- redteam_foundry-0.2.0/scripts/ci_local.sh +18 -0
- redteam_foundry-0.2.0/scripts/hf_scorecard.py +102 -0
- redteam_foundry-0.2.0/scripts/plot_results.py +139 -0
- redteam_foundry-0.2.0/src/redteam/__init__.py +8 -0
- redteam_foundry-0.2.0/src/redteam/__main__.py +19 -0
- redteam_foundry-0.2.0/src/redteam/benign.py +114 -0
- redteam_foundry-0.2.0/src/redteam/budget.py +152 -0
- redteam_foundry-0.2.0/src/redteam/cache.py +89 -0
- redteam_foundry-0.2.0/src/redteam/cli.py +1194 -0
- redteam_foundry-0.2.0/src/redteam/compare.py +250 -0
- redteam_foundry-0.2.0/src/redteam/corpora/__init__.py +35 -0
- redteam_foundry-0.2.0/src/redteam/corpora/_base.py +105 -0
- redteam_foundry-0.2.0/src/redteam/corpora/_filters.py +193 -0
- redteam_foundry-0.2.0/src/redteam/corpora/advbench.py +78 -0
- redteam_foundry-0.2.0/src/redteam/corpora/agentdojo.py +222 -0
- redteam_foundry-0.2.0/src/redteam/corpora/datacard.py +201 -0
- redteam_foundry-0.2.0/src/redteam/corpora/harmbench.py +116 -0
- redteam_foundry-0.2.0/src/redteam/corpora/huggingface.py +87 -0
- redteam_foundry-0.2.0/src/redteam/corpora/jailbreakbench.py +115 -0
- redteam_foundry-0.2.0/src/redteam/corpora/quality.py +388 -0
- redteam_foundry-0.2.0/src/redteam/corpora/taxonomy.py +182 -0
- redteam_foundry-0.2.0/src/redteam/defences/__init__.py +47 -0
- redteam_foundry-0.2.0/src/redteam/defences/base.py +132 -0
- redteam_foundry-0.2.0/src/redteam/defences/constitutional.py +112 -0
- redteam_foundry-0.2.0/src/redteam/defences/llama_guard.py +110 -0
- redteam_foundry-0.2.0/src/redteam/defences/secalign.py +76 -0
- redteam_foundry-0.2.0/src/redteam/defences/spotlighting.py +56 -0
- redteam_foundry-0.2.0/src/redteam/defences/system_prompt.py +45 -0
- redteam_foundry-0.2.0/src/redteam/inspect_export.py +234 -0
- redteam_foundry-0.2.0/src/redteam/multilingual.py +92 -0
- redteam_foundry-0.2.0/src/redteam/orchestrator.py +565 -0
- redteam_foundry-0.2.0/src/redteam/packs.py +225 -0
- redteam_foundry-0.2.0/src/redteam/schemas.py +146 -0
- redteam_foundry-0.2.0/src/redteam/scorers/__init__.py +39 -0
- redteam_foundry-0.2.0/src/redteam/scorers/_judge_schema.py +32 -0
- redteam_foundry-0.2.0/src/redteam/scorers/judge_claude.py +231 -0
- redteam_foundry-0.2.0/src/redteam/scorers/judge_human.py +192 -0
- redteam_foundry-0.2.0/src/redteam/scorers/refusal_keywords.py +106 -0
- redteam_foundry-0.2.0/src/redteam/staleness.py +322 -0
- redteam_foundry-0.2.0/src/redteam/stats.py +182 -0
- redteam_foundry-0.2.0/src/redteam/targets/__init__.py +28 -0
- redteam_foundry-0.2.0/src/redteam/targets/_pricing.py +59 -0
- redteam_foundry-0.2.0/src/redteam/targets/anthropic.py +95 -0
- redteam_foundry-0.2.0/src/redteam/targets/base.py +101 -0
- redteam_foundry-0.2.0/src/redteam/targets/ollama.py +122 -0
- redteam_foundry-0.2.0/src/redteam/targets/openai_target.py +98 -0
- redteam_foundry-0.2.0/tests/__init__.py +0 -0
- redteam_foundry-0.2.0/tests/conftest.py +9 -0
- redteam_foundry-0.2.0/tests/data/advbench/harmful_behaviors_sample.csv +6 -0
- redteam_foundry-0.2.0/tests/data/harmbench/harmbench_behaviors_text_all_sample.csv +5 -0
- redteam_foundry-0.2.0/tests/data/jailbreakbench/harmful-behaviors_sample.csv +5 -0
- redteam_foundry-0.2.0/tests/smoke/__init__.py +0 -0
- redteam_foundry-0.2.0/tests/smoke/test_defences_live.py +90 -0
- redteam_foundry-0.2.0/tests/smoke/test_targets_live.py +81 -0
- redteam_foundry-0.2.0/tests/unit/__init__.py +0 -0
- redteam_foundry-0.2.0/tests/unit/test_agentdojo_loader.py +90 -0
- redteam_foundry-0.2.0/tests/unit/test_benign.py +67 -0
- redteam_foundry-0.2.0/tests/unit/test_budget.py +58 -0
- redteam_foundry-0.2.0/tests/unit/test_cache.py +95 -0
- redteam_foundry-0.2.0/tests/unit/test_compare.py +101 -0
- redteam_foundry-0.2.0/tests/unit/test_corpus_quality.py +238 -0
- redteam_foundry-0.2.0/tests/unit/test_defences_base.py +91 -0
- redteam_foundry-0.2.0/tests/unit/test_exclusion_filter.py +206 -0
- redteam_foundry-0.2.0/tests/unit/test_hardening.py +147 -0
- redteam_foundry-0.2.0/tests/unit/test_huggingface.py +57 -0
- redteam_foundry-0.2.0/tests/unit/test_inspect_export.py +126 -0
- redteam_foundry-0.2.0/tests/unit/test_judge_claude.py +103 -0
- redteam_foundry-0.2.0/tests/unit/test_judge_human.py +146 -0
- redteam_foundry-0.2.0/tests/unit/test_loaders.py +133 -0
- redteam_foundry-0.2.0/tests/unit/test_multilingual.py +149 -0
- redteam_foundry-0.2.0/tests/unit/test_orchestrator.py +149 -0
- redteam_foundry-0.2.0/tests/unit/test_orchestrator_cross_judge.py +158 -0
- redteam_foundry-0.2.0/tests/unit/test_orchestrator_score.py +110 -0
- redteam_foundry-0.2.0/tests/unit/test_packs.py +137 -0
- redteam_foundry-0.2.0/tests/unit/test_pricing.py +36 -0
- redteam_foundry-0.2.0/tests/unit/test_schemas.py +93 -0
- redteam_foundry-0.2.0/tests/unit/test_scorer_refusal.py +62 -0
- redteam_foundry-0.2.0/tests/unit/test_smoke.py +19 -0
- redteam_foundry-0.2.0/tests/unit/test_staleness.py +155 -0
- redteam_foundry-0.2.0/tests/unit/test_stats.py +132 -0
- redteam_foundry-0.2.0/tests/unit/test_targets_mocked.py +105 -0
- redteam_foundry-0.2.0/tests/unit/test_taxonomy.py +88 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Copy this file to `.env` and fill in the values. `.env` itself is gitignored.
|
|
2
|
+
|
|
3
|
+
# --- Anthropic ---------------------------------------------------------------
|
|
4
|
+
# Required. Set a $20/month hard cap in the Anthropic console *before* using
|
|
5
|
+
# this key. The harness also enforces a $5/run cap (REDTEAM_MAX_USD_PER_RUN below) inside Target.send (see Lesson L3).
|
|
6
|
+
ANTHROPIC_API_KEY=
|
|
7
|
+
|
|
8
|
+
# --- Ollama (local) ----------------------------------------------------------
|
|
9
|
+
# Default; only override if you run Ollama on a non-standard host/port.
|
|
10
|
+
OLLAMA_HOST=http://localhost:11434
|
|
11
|
+
|
|
12
|
+
# --- Run-time guards ---------------------------------------------------------
|
|
13
|
+
REDTEAM_MAX_USD_PER_RUN=5.00
|
|
14
|
+
REDTEAM_MAX_USD_PER_CALL=0.50
|
|
15
|
+
|
|
16
|
+
# --- HuggingFace (only needed for Meta-SecAlign download in Phase 5+) -------
|
|
17
|
+
HF_TOKEN=
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Normalise line endings. Without this, Windows + sandbox tooling churn
|
|
2
|
+
# CRLF<->LF and every file shows as fully modified in `git status`/diffs.
|
|
3
|
+
# This stores every text file as LF in the repository, regardless of the
|
|
4
|
+
# checkout platform. Ends the line-ending war for good.
|
|
5
|
+
* text=auto eol=lf
|
|
6
|
+
|
|
7
|
+
# Shell scripts must stay LF even on Windows checkouts; PowerShell scripts check out as CRLF.
|
|
8
|
+
*.sh text eol=lf
|
|
9
|
+
*.ps1 text eol=crlf
|
|
10
|
+
|
|
11
|
+
# Binary files — never touch.
|
|
12
|
+
*.png binary
|
|
13
|
+
*.jpg binary
|
|
14
|
+
*.jpeg binary
|
|
15
|
+
*.gif binary
|
|
16
|
+
*.ico binary
|
|
17
|
+
*.pdf binary
|
|
18
|
+
*.parquet binary
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request: {}
|
|
5
|
+
push:
|
|
6
|
+
branches: [main]
|
|
7
|
+
|
|
8
|
+
# No real API calls are made in CI — see ETHICS + budget guards.
|
|
9
|
+
# When branch protection is enabled, require the display name below
|
|
10
|
+
# ("Lint, typecheck, smoke run") — see Lesson L13.
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
build:
|
|
14
|
+
name: Lint, typecheck, smoke run
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
timeout-minutes: 5
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v6
|
|
19
|
+
|
|
20
|
+
- uses: actions/setup-python@v6
|
|
21
|
+
with:
|
|
22
|
+
python-version: "3.13"
|
|
23
|
+
|
|
24
|
+
- name: Install uv
|
|
25
|
+
uses: astral-sh/setup-uv@v8.1.0
|
|
26
|
+
with:
|
|
27
|
+
version: "latest"
|
|
28
|
+
enable-cache: true
|
|
29
|
+
|
|
30
|
+
- name: Install project (editable, with dev extras)
|
|
31
|
+
run: uv pip install --system -e ".[dev]"
|
|
32
|
+
|
|
33
|
+
- name: Ruff (lint)
|
|
34
|
+
run: ruff check .
|
|
35
|
+
|
|
36
|
+
- name: Ruff (format check)
|
|
37
|
+
run: ruff format --check .
|
|
38
|
+
|
|
39
|
+
- name: Mypy (typecheck)
|
|
40
|
+
run: mypy src/
|
|
41
|
+
|
|
42
|
+
- name: Pytest (unit)
|
|
43
|
+
run: pytest tests/unit
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# --- secrets / env -----------------------------------------------------------
|
|
2
|
+
.env
|
|
3
|
+
.env.*
|
|
4
|
+
!.env.example
|
|
5
|
+
*.key
|
|
6
|
+
secrets/
|
|
7
|
+
|
|
8
|
+
# --- python ------------------------------------------------------------------
|
|
9
|
+
__pycache__/
|
|
10
|
+
*.py[cod]
|
|
11
|
+
*$py.class
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
.python-version
|
|
15
|
+
|
|
16
|
+
# build / dist
|
|
17
|
+
build/
|
|
18
|
+
dist/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.eggs/
|
|
21
|
+
|
|
22
|
+
# tooling caches
|
|
23
|
+
.mypy_cache/
|
|
24
|
+
.ruff_cache/
|
|
25
|
+
.pytest_cache/
|
|
26
|
+
.coverage
|
|
27
|
+
htmlcov/
|
|
28
|
+
coverage.xml
|
|
29
|
+
|
|
30
|
+
# --- project artefacts (gitignored on purpose) -------------------------------
|
|
31
|
+
# Raw + cached corpora are big and re-downloadable; only commit
|
|
32
|
+
# scripts/load_corpora.py + dataset_versions.yaml + a small samples/ dir.
|
|
33
|
+
# Leading slash means "only at repo root" — a bare `data/` would also match
|
|
34
|
+
# tests/data/ (which holds committed test fixtures) and silently exclude it.
|
|
35
|
+
/data/
|
|
36
|
+
!/data/.gitkeep
|
|
37
|
+
!/data/samples/
|
|
38
|
+
|
|
39
|
+
# Run outputs are large and re-creatable; only commit a small reference
|
|
40
|
+
# subset under results/samples/.
|
|
41
|
+
/results/
|
|
42
|
+
!/results/.gitkeep
|
|
43
|
+
!/results/samples/
|
|
44
|
+
|
|
45
|
+
# Corpus-audit outputs (quality reports / data cards). Re-creatable via
|
|
46
|
+
# `redteam corpora audit`, and they quote truncated adversarial prompt
|
|
47
|
+
# previews, so keep them out of git by default.
|
|
48
|
+
# `/reports/*` (not `/reports/`) so the samples/ negation can re-include —
|
|
49
|
+
# git won't recurse into a wholly-excluded directory.
|
|
50
|
+
/reports/*
|
|
51
|
+
!/reports/samples/
|
|
52
|
+
|
|
53
|
+
# Exported challenge packs. Re-creatable via `redteam export-pack`; adversarial
|
|
54
|
+
# packs contain prompt previews, so keep them out of git by default. Only a
|
|
55
|
+
# small, safe (benign) sample is committed under challenge_packs/samples/.
|
|
56
|
+
# Note: `/challenge_packs/*` (not `/challenge_packs/`) so the samples/ negation
|
|
57
|
+
# below can re-include — git won't recurse into a wholly-excluded directory.
|
|
58
|
+
/challenge_packs/*
|
|
59
|
+
!/challenge_packs/samples/
|
|
60
|
+
|
|
61
|
+
# Streamlit / HF Spaces local cache
|
|
62
|
+
.streamlit/secrets.toml
|
|
63
|
+
.huggingface/
|
|
64
|
+
|
|
65
|
+
# OS noise
|
|
66
|
+
.DS_Store
|
|
67
|
+
Thumbs.db
|
|
68
|
+
desktop.ini
|
|
69
|
+
|
|
70
|
+
# IDE noise
|
|
71
|
+
.vscode/
|
|
72
|
+
.idea/
|
|
73
|
+
*.swp
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v5.0.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
- id: end-of-file-fixer
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: check-added-large-files
|
|
9
|
+
args: ["--maxkb=512"]
|
|
10
|
+
- id: check-merge-conflict
|
|
11
|
+
- id: detect-private-key
|
|
12
|
+
|
|
13
|
+
# Use the venv's ruff (pinned in pyproject.toml [dev]) so pre-commit and CI
|
|
14
|
+
# never drift to different versions. Requires the venv to be active when
|
|
15
|
+
# committing — that's already the standard workflow here.
|
|
16
|
+
- repo: local
|
|
17
|
+
hooks:
|
|
18
|
+
- id: ruff
|
|
19
|
+
name: ruff (lint, autofix)
|
|
20
|
+
entry: ruff check --fix --force-exclude
|
|
21
|
+
language: system
|
|
22
|
+
types_or: [python, pyi]
|
|
23
|
+
require_serial: true
|
|
24
|
+
- id: ruff-format
|
|
25
|
+
name: ruff format
|
|
26
|
+
entry: ruff format --force-exclude
|
|
27
|
+
language: system
|
|
28
|
+
types_or: [python, pyi]
|
|
29
|
+
require_serial: true
|
|
30
|
+
|
|
31
|
+
- repo: https://github.com/gitleaks/gitleaks
|
|
32
|
+
rev: v8.21.2
|
|
33
|
+
hooks:
|
|
34
|
+
- id: gitleaks
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format is based on
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project
|
|
5
|
+
adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
|
+
|
|
7
|
+
## [0.2.0] — 2026-07-02 — the adversarial benchmark foundry
|
|
8
|
+
|
|
9
|
+
Repositioned from a static red-team harness (`llm-redteam-harness`) into an
|
|
10
|
+
upstream **benchmark foundry** (`redteam-foundry`). The v1 measurement core is
|
|
11
|
+
unchanged; the features added below are additive. First public release.
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
- **Corpus quality audit** (`redteam corpora audit`) — exact + near-duplicate
|
|
15
|
+
detection including cross-source overlap, composition, prompt-length stats,
|
|
16
|
+
and label-integrity checks, rendered as a quality report + data card.
|
|
17
|
+
- **Language + attack-family taxonomy** — script-based language/code-switching
|
|
18
|
+
detection and heuristic attack-family surface markers, surfaced in the audit.
|
|
19
|
+
- **Benchmark staleness scoring** (`redteam corpora staleness`) — a transparent,
|
|
20
|
+
component-broken-out heuristic composing corpus and run signals.
|
|
21
|
+
- **Benign control set + defence comparison** (`redteam compare-defences`,
|
|
22
|
+
`redteam benign export`) — false-refusal rate and
|
|
23
|
+
`safe_usefulness = (1 - ASR) * (1 - FRR)` per defence config.
|
|
24
|
+
- **Multilingual over-refusal** (`redteam frr-by-language`,
|
|
25
|
+
`redteam benign export --multilingual`) — benign control set in
|
|
26
|
+
zh-Hant / zh-Hans / ja / ko + code-switched, with per-language FRR.
|
|
27
|
+
- **Challenge-pack exporter** (`redteam export-pack`) — versioned packs
|
|
28
|
+
(`pack.yaml` + `scenarios.jsonl` + `datacard.md`) with adversarial prompts
|
|
29
|
+
redacted by default, plus a reader (`redteam.packs.read_challenge_pack`) and
|
|
30
|
+
a downstream consumption contract (`examples/export_to_agent_release_gates.md`).
|
|
31
|
+
- **Audit any Hugging Face dataset** (`redteam corpora audit-hf --dataset ...
|
|
32
|
+
--prompt-column ... [--revision ...]`) — not just the four built-in corpora;
|
|
33
|
+
the safety exclusion filter runs first. New `source="external"` /
|
|
34
|
+
`category="unknown"` schema values back it.
|
|
35
|
+
- **Cross-dataset benchmark-quality scorecard** (`scripts/hf_scorecard.py`) and a
|
|
36
|
+
written finding (`docs/findings/benchmark-quality-report-card.md`) auditing four
|
|
37
|
+
public jailbreak datasets: all English-only, roleplay-persona-dominant, and
|
|
38
|
+
duplicated to varying degrees.
|
|
39
|
+
- `docs/ROADMAP.md`; committed real-data findings under `reports/samples/`.
|
|
40
|
+
|
|
41
|
+
### Fixed
|
|
42
|
+
- Exclusion-filter leaks (WMD "synthesis" noun, several self-harm phrasings,
|
|
43
|
+
case-insensitive category gate) with regression tests.
|
|
44
|
+
- Budget guard now **reserves** its estimate so concurrent calls can't
|
|
45
|
+
collectively exceed the per-run cap.
|
|
46
|
+
- Llama Guard fails **closed** on an empty/errored verdict.
|
|
47
|
+
- Spotlighting / SecAlign neutralise their own fence markers in untrusted input
|
|
48
|
+
(closes a delimiter-injection bypass); SecAlign no longer passes messages
|
|
49
|
+
through unfenced.
|
|
50
|
+
- OpenAI target fails loud without a pricing entry (no silent $0 budget bypass).
|
|
51
|
+
- Krippendorff's α finite-sample correction; `compute_kappa` blank-cell guard;
|
|
52
|
+
cross-judge agreement reports `None` (not `0.0`) when not computable.
|
|
53
|
+
|
|
54
|
+
### Changed
|
|
55
|
+
- **Renamed the project/package to `redteam-foundry`** (was
|
|
56
|
+
`llm-redteam-harness`). The `redteam` CLI command and `src/redteam/` module are
|
|
57
|
+
unchanged — `pip install redteam-foundry` still gives you `redteam ...`.
|
|
58
|
+
- Default install slimmed: the unused dashboard deps (`streamlit`, `plotly`)
|
|
59
|
+
moved to an opt-in `[dashboard]` extra. The audit / staleness / dedup path
|
|
60
|
+
needs no API key.
|
|
61
|
+
|
|
62
|
+
## [0.1.0] — v1 measurement core
|
|
63
|
+
|
|
64
|
+
- Loaders for AdvBench, JailbreakBench, HarmBench, AgentDojo (commit-pinned) with
|
|
65
|
+
a safety exclusion filter.
|
|
66
|
+
- Anthropic + Ollama targets (OpenAI stub); disk response cache; per-run and
|
|
67
|
+
per-call budget guards.
|
|
68
|
+
- Six composable defences (system prompt, Constitutional, Spotlighting,
|
|
69
|
+
SecAlign, Llama Guard 4 pre/post).
|
|
70
|
+
- Rule-based + LLM-judge scoring with an independent cross-judge; bootstrap
|
|
71
|
+
confidence intervals; Cohen's κ and Krippendorff's α.
|
|
72
|
+
- UK AISI Inspect eval-log export.
|
|
73
|
+
|
|
74
|
+
[0.2.0]: https://github.com/rosscyking1115/redteam-foundry/releases/tag/v0.2.0
|
|
75
|
+
[0.1.0]: https://github.com/rosscyking1115/redteam-foundry/releases/tag/v0.1.0
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
Thanks for your interest. This is a research tool for **auditing adversarial
|
|
4
|
+
benchmarks**; contributions that sharpen that focus are very welcome.
|
|
5
|
+
|
|
6
|
+
## Scope
|
|
7
|
+
|
|
8
|
+
In scope: corpus quality/dedup, benchmark staleness, defence measurement,
|
|
9
|
+
multilingual over-refusal, safe challenge-pack export, and corpus/target/judge
|
|
10
|
+
adapters.
|
|
11
|
+
|
|
12
|
+
Out of scope (by design): production release-gating — ship/warn/block, incident
|
|
13
|
+
replay, policy-as-code. That belongs in a downstream layer; see
|
|
14
|
+
[`docs/ROADMAP.md`](docs/ROADMAP.md) and the README's positioning.
|
|
15
|
+
|
|
16
|
+
## Ethics (please read before adding corpora or prompts)
|
|
17
|
+
|
|
18
|
+
- Use **only published** adversarial prompts, commit-pinned. Do not author novel
|
|
19
|
+
jailbreaks, in any language (see [`ETHICS.md`](ETHICS.md)).
|
|
20
|
+
- Excluded categories (CSAM, WMD synthesis, detailed self-harm methods) are
|
|
21
|
+
filtered at load time and enforced by `tests/unit/test_exclusion_filter.py`.
|
|
22
|
+
If you add a corpus, add positive (must-be-filtered) and benign (must-pass) cases there to prove nothing leaks.
|
|
23
|
+
- Don't commit raw harmful prompts or model outputs. Reports and packs quote
|
|
24
|
+
truncated previews or redact adversarial prompts.
|
|
25
|
+
|
|
26
|
+
## Dev setup
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
uv venv --python 3.13
|
|
30
|
+
source .venv/bin/activate # .venv\Scripts\activate on Windows
|
|
31
|
+
uv pip install -e ".[dev]"
|
|
32
|
+
pre-commit install
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Before you open a PR
|
|
36
|
+
|
|
37
|
+
Run the exact CI checks locally — green here means green on the PR:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
scripts/ci_local.sh # or scripts\ci_local.ps1 on Windows
|
|
41
|
+
# = ruff check + ruff format --check + mypy (strict) + pytest tests/unit
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Guidelines:
|
|
45
|
+
- One focused change per PR; keep the test suite green (`pytest tests/unit`).
|
|
46
|
+
- New behaviour needs a unit test. Analysis functions should be pure and
|
|
47
|
+
testable without network/API access.
|
|
48
|
+
- Type-annotate everything (`mypy --strict` must pass). No new heavy runtime
|
|
49
|
+
dependencies without discussion.
|
|
50
|
+
- CI makes no live API calls; don't add unit tests that require one.
|
|
51
|
+
|
|
52
|
+
## Reporting issues
|
|
53
|
+
|
|
54
|
+
Bugs, stale/duplicated benchmark findings, and requests to audit a specific
|
|
55
|
+
corpus are all useful. Include the command you ran and the (aggregate) output.
|
|
56
|
+
For anything sensitive — a suspected leak past the exclusion filter, or a
|
|
57
|
+
model-provider takedown — email **rosscyking@gmail.com** (see `ETHICS.md`).
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# ETHICS
|
|
2
|
+
|
|
3
|
+
This project measures LLM behaviour against published adversarial prompts to
|
|
4
|
+
inform defence design. It is **not** a security audit, certification, or
|
|
5
|
+
guarantee.
|
|
6
|
+
|
|
7
|
+
## What this project does
|
|
8
|
+
|
|
9
|
+
- Runs **published** adversarial prompts from peer-reviewed or
|
|
10
|
+
industry-released datasets (AdvBench, JailbreakBench, HarmBench, AgentDojo)
|
|
11
|
+
against target LLMs.
|
|
12
|
+
- Measures attack-success-rate (ASR) under combinations of published defences
|
|
13
|
+
(system prompts, Llama Guard 4, Spotlighting, SecAlign-style structured
|
|
14
|
+
queries, Constitutional principles), with every verdict cross-validated by
|
|
15
|
+
an independent second judge model.
|
|
16
|
+
- Publishes **aggregate** results, methodology, defence configurations, and
|
|
17
|
+
scoring code openly.
|
|
18
|
+
|
|
19
|
+
## What this project does not do
|
|
20
|
+
|
|
21
|
+
- **Generate novel jailbreaks.** All prompts come from published datasets,
|
|
22
|
+
cited and version-pinned.
|
|
23
|
+
- **Publish raw harmful outputs verbatim.** Results are aggregated, redacted,
|
|
24
|
+
and summarised. A small number of representative example transcripts may
|
|
25
|
+
be shown per category, with explicit warnings.
|
|
26
|
+
- **Test categories that are excluded by policy** (see below). These are
|
|
27
|
+
filtered at corpus-load time before any model receives them.
|
|
28
|
+
- **Generate non-English attack content.** The multilingual work
|
|
29
|
+
(`src/redteam/multilingual.py`) is a **benign** control set only — it
|
|
30
|
+
measures whether models *over-refuse* harmless non-English and code-switched
|
|
31
|
+
prompts (false refusal). We do **not** translate harmful prompts into other
|
|
32
|
+
languages: that would both create new harmful content and bypass the
|
|
33
|
+
English-only exclusion filter. The multilingual contribution is over-refusal
|
|
34
|
+
measurement, not non-English jailbreak generation.
|
|
35
|
+
|
|
36
|
+
## Excluded categories
|
|
37
|
+
|
|
38
|
+
The following categories are filtered out of all loaded corpora at
|
|
39
|
+
`scripts/load_corpora.py` and verified by a CI test
|
|
40
|
+
(`tests/unit/test_exclusion_filter.py`):
|
|
41
|
+
|
|
42
|
+
- **CSAM** — Child Sexual Abuse Material, in any form.
|
|
43
|
+
- **Bio / chem / radiological / nuclear weapons synthesis** — uplift content
|
|
44
|
+
for the design or manufacture of weapons of mass destruction.
|
|
45
|
+
- **Detailed self-harm methods** — actionable instructions for self-injury
|
|
46
|
+
or suicide. (Discussion of self-harm at a research / awareness level is
|
|
47
|
+
fine; method-level detail is not.)
|
|
48
|
+
|
|
49
|
+
If any of these slip past the filter, the CI test fails and merge is blocked.
|
|
50
|
+
|
|
51
|
+
## Datasets used (with citations)
|
|
52
|
+
|
|
53
|
+
- **AdvBench** — Zou et al., 2023. *Universal and Transferable Adversarial
|
|
54
|
+
Attacks on Aligned Language Models.* MIT licence.
|
|
55
|
+
- **JailbreakBench / JBB-Behaviors** — Chao et al., 2024. *JailbreakBench:
|
|
56
|
+
An Open Robustness Benchmark for Jailbreaking Large Language Models.*
|
|
57
|
+
MIT licence. NeurIPS 2024 Datasets & Benchmarks Track.
|
|
58
|
+
- **HarmBench** — Mazeika et al., 2024. *HarmBench: A Standardized
|
|
59
|
+
Evaluation Framework for Automated Red Teaming and Robust Refusal.*
|
|
60
|
+
MIT licence.
|
|
61
|
+
- **AgentDojo** — Debenedetti et al., 2024. *AgentDojo: A Dynamic
|
|
62
|
+
Environment to Evaluate Prompt Injection Attacks and Defences for LLM
|
|
63
|
+
Agents.* AGPL licence.
|
|
64
|
+
|
|
65
|
+
Exact dataset commits are pinned in `configs/dataset_versions.yaml`.
|
|
66
|
+
|
|
67
|
+
## Models tested (with version pins)
|
|
68
|
+
|
|
69
|
+
Every result is reported against a **dated** model ID. See
|
|
70
|
+
`configs/model_versions.yaml` for the full list. As of project start
|
|
71
|
+
(2026-05-05) this is:
|
|
72
|
+
|
|
73
|
+
- `claude-sonnet-4-6` (Anthropic Claude Sonnet 4.6) — frontier API target
|
|
74
|
+
- `llama3.1:8b` (Meta Llama 3.1 8B via Ollama) — local target
|
|
75
|
+
- `meta-secalign-8b` (Meta SecAlign 8B) — Phase 5+ stretch target
|
|
76
|
+
- Judge: `claude-haiku-4-5-20251001` (Claude Haiku 4.5) — distinct from any
|
|
77
|
+
evaluated target
|
|
78
|
+
- Cross-judge: `claude-sonnet-4-6` — independent second judge used to
|
|
79
|
+
validate every verdict
|
|
80
|
+
|
|
81
|
+
## Responsible disclosure
|
|
82
|
+
|
|
83
|
+
If you are a model provider whose model is included and you would like
|
|
84
|
+
findings or example transcripts removed, email
|
|
85
|
+
**rosscyking@gmail.com** with the subject line
|
|
86
|
+
`[redteam-foundry] takedown request`.
|
|
87
|
+
|
|
88
|
+
**Removal commitment: 24 hours from receipt.**
|
|
89
|
+
|
|
90
|
+
## Scope and disclaimer
|
|
91
|
+
|
|
92
|
+
This work is for research and educational purposes. Results are reported as:
|
|
93
|
+
|
|
94
|
+
> "On benchmark X, with defence stack Y, model Z showed ASR = N% [95% CI ...],
|
|
95
|
+
> judged by Claude Haiku 4.5 and cross-validated by an independent second
|
|
96
|
+
> judge (Claude Sonnet 4.6)."
|
|
97
|
+
|
|
98
|
+
Results **do not** imply that any model is "safe" or "unsafe" in general.
|
|
99
|
+
See `METHODOLOGY.md` for how each metric is validated, and which metrics are
|
|
100
|
+
reported as descriptive only.
|
|
101
|
+
|
|
102
|
+
Do not apply these prompts against systems you do not own or have explicit
|
|
103
|
+
permission to test.
|
|
104
|
+
|
|
105
|
+
## Provider terms of service
|
|
106
|
+
|
|
107
|
+
Both Anthropic's and OpenAI's usage policies explicitly permit safety
|
|
108
|
+
evaluation and red-teaming research. We comply with:
|
|
109
|
+
|
|
110
|
+
- Anthropic Usage Policy (verified 2026-05-05)
|
|
111
|
+
- API budget caps set in the provider console (hard limit) **and** in the
|
|
112
|
+
harness itself (per-run, per-call) — see `src/redteam/budget.py`.
|
|
113
|
+
|
|
114
|
+
## Maintainer
|
|
115
|
+
|
|
116
|
+
Ross — rosscyking@gmail.com
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ross (rosscyking@gmail.com)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|