redteam-foundry 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. redteam_foundry-0.2.0/.env.example +17 -0
  2. redteam_foundry-0.2.0/.gitattributes +18 -0
  3. redteam_foundry-0.2.0/.github/workflows/ci.yml +43 -0
  4. redteam_foundry-0.2.0/.gitignore +73 -0
  5. redteam_foundry-0.2.0/.pre-commit-config.yaml +34 -0
  6. redteam_foundry-0.2.0/CHANGELOG.md +75 -0
  7. redteam_foundry-0.2.0/CONTRIBUTING.md +57 -0
  8. redteam_foundry-0.2.0/ETHICS.md +116 -0
  9. redteam_foundry-0.2.0/LICENSE +21 -0
  10. redteam_foundry-0.2.0/METHODOLOGY.md +271 -0
  11. redteam_foundry-0.2.0/PKG-INFO +260 -0
  12. redteam_foundry-0.2.0/README.md +211 -0
  13. redteam_foundry-0.2.0/challenge_packs/samples/multilingual_benign_v1/datacard.md +26 -0
  14. redteam_foundry-0.2.0/challenge_packs/samples/multilingual_benign_v1/pack.yaml +31 -0
  15. redteam_foundry-0.2.0/challenge_packs/samples/multilingual_benign_v1/scenarios.jsonl +25 -0
  16. redteam_foundry-0.2.0/configs/dataset_versions.yaml +49 -0
  17. redteam_foundry-0.2.0/configs/model_versions.yaml +51 -0
  18. redteam_foundry-0.2.0/configs/run_agentdojo_baseline.yaml +13 -0
  19. redteam_foundry-0.2.0/configs/run_agentdojo_full_prompt_stack.yaml +16 -0
  20. redteam_foundry-0.2.0/configs/run_agentdojo_local_baseline.yaml +12 -0
  21. redteam_foundry-0.2.0/configs/run_agentdojo_local_full_prompt_stack.yaml +15 -0
  22. redteam_foundry-0.2.0/configs/run_agentdojo_local_secalign.yaml +13 -0
  23. redteam_foundry-0.2.0/configs/run_agentdojo_local_spotlighting.yaml +13 -0
  24. redteam_foundry-0.2.0/configs/run_agentdojo_secalign.yaml +14 -0
  25. redteam_foundry-0.2.0/configs/run_agentdojo_spotlighting.yaml +14 -0
  26. redteam_foundry-0.2.0/configs/run_anthropic_baseline.yaml +9 -0
  27. redteam_foundry-0.2.0/configs/run_anthropic_full_stack.yaml +13 -0
  28. redteam_foundry-0.2.0/configs/run_benign_control_baseline.yaml +12 -0
  29. redteam_foundry-0.2.0/configs/run_benign_control_full_stack.yaml +16 -0
  30. redteam_foundry-0.2.0/configs/run_benign_multilingual_baseline.yaml +12 -0
  31. redteam_foundry-0.2.0/configs/run_local_baseline.yaml +9 -0
  32. redteam_foundry-0.2.0/configs/run_local_full_stack.yaml +18 -0
  33. redteam_foundry-0.2.0/data/.gitkeep +0 -0
  34. redteam_foundry-0.2.0/docs/RELEASING.md +64 -0
  35. redteam_foundry-0.2.0/docs/ROADMAP.md +134 -0
  36. redteam_foundry-0.2.0/docs/findings/benchmark-quality-report-card.md +109 -0
  37. redteam_foundry-0.2.0/docs/results_matrix.png +0 -0
  38. redteam_foundry-0.2.0/examples/export_to_agent_release_gates.md +128 -0
  39. redteam_foundry-0.2.0/pyproject.toml +155 -0
  40. redteam_foundry-0.2.0/reports/samples/README.md +59 -0
  41. redteam_foundry-0.2.0/reports/samples/corpus_audit/corpus_datacard.md +55 -0
  42. redteam_foundry-0.2.0/reports/samples/defence_comparison/defence_comparison.json +128 -0
  43. redteam_foundry-0.2.0/reports/samples/defence_comparison/defence_comparison.md +18 -0
  44. redteam_foundry-0.2.0/reports/samples/defence_comparison_frr/defence_comparison.json +39 -0
  45. redteam_foundry-0.2.0/reports/samples/defence_comparison_frr/defence_comparison.md +12 -0
  46. redteam_foundry-0.2.0/reports/samples/frr_by_language/frr_by_language.json +48 -0
  47. redteam_foundry-0.2.0/reports/samples/frr_by_language/frr_by_language.md +15 -0
  48. redteam_foundry-0.2.0/reports/samples/hf_scorecard.md +22 -0
  49. redteam_foundry-0.2.0/reports/samples/staleness/advbench/staleness.json +47 -0
  50. redteam_foundry-0.2.0/reports/samples/staleness/advbench/staleness_report.md +20 -0
  51. redteam_foundry-0.2.0/reports/samples/staleness/agentdojo/staleness.json +47 -0
  52. redteam_foundry-0.2.0/reports/samples/staleness/agentdojo/staleness_report.md +20 -0
  53. redteam_foundry-0.2.0/results/.gitkeep +0 -0
  54. redteam_foundry-0.2.0/scripts/ci_local.ps1 +38 -0
  55. redteam_foundry-0.2.0/scripts/ci_local.sh +18 -0
  56. redteam_foundry-0.2.0/scripts/hf_scorecard.py +102 -0
  57. redteam_foundry-0.2.0/scripts/plot_results.py +139 -0
  58. redteam_foundry-0.2.0/src/redteam/__init__.py +8 -0
  59. redteam_foundry-0.2.0/src/redteam/__main__.py +19 -0
  60. redteam_foundry-0.2.0/src/redteam/benign.py +114 -0
  61. redteam_foundry-0.2.0/src/redteam/budget.py +152 -0
  62. redteam_foundry-0.2.0/src/redteam/cache.py +89 -0
  63. redteam_foundry-0.2.0/src/redteam/cli.py +1194 -0
  64. redteam_foundry-0.2.0/src/redteam/compare.py +250 -0
  65. redteam_foundry-0.2.0/src/redteam/corpora/__init__.py +35 -0
  66. redteam_foundry-0.2.0/src/redteam/corpora/_base.py +105 -0
  67. redteam_foundry-0.2.0/src/redteam/corpora/_filters.py +193 -0
  68. redteam_foundry-0.2.0/src/redteam/corpora/advbench.py +78 -0
  69. redteam_foundry-0.2.0/src/redteam/corpora/agentdojo.py +222 -0
  70. redteam_foundry-0.2.0/src/redteam/corpora/datacard.py +201 -0
  71. redteam_foundry-0.2.0/src/redteam/corpora/harmbench.py +116 -0
  72. redteam_foundry-0.2.0/src/redteam/corpora/huggingface.py +87 -0
  73. redteam_foundry-0.2.0/src/redteam/corpora/jailbreakbench.py +115 -0
  74. redteam_foundry-0.2.0/src/redteam/corpora/quality.py +388 -0
  75. redteam_foundry-0.2.0/src/redteam/corpora/taxonomy.py +182 -0
  76. redteam_foundry-0.2.0/src/redteam/defences/__init__.py +47 -0
  77. redteam_foundry-0.2.0/src/redteam/defences/base.py +132 -0
  78. redteam_foundry-0.2.0/src/redteam/defences/constitutional.py +112 -0
  79. redteam_foundry-0.2.0/src/redteam/defences/llama_guard.py +110 -0
  80. redteam_foundry-0.2.0/src/redteam/defences/secalign.py +76 -0
  81. redteam_foundry-0.2.0/src/redteam/defences/spotlighting.py +56 -0
  82. redteam_foundry-0.2.0/src/redteam/defences/system_prompt.py +45 -0
  83. redteam_foundry-0.2.0/src/redteam/inspect_export.py +234 -0
  84. redteam_foundry-0.2.0/src/redteam/multilingual.py +92 -0
  85. redteam_foundry-0.2.0/src/redteam/orchestrator.py +565 -0
  86. redteam_foundry-0.2.0/src/redteam/packs.py +225 -0
  87. redteam_foundry-0.2.0/src/redteam/schemas.py +146 -0
  88. redteam_foundry-0.2.0/src/redteam/scorers/__init__.py +39 -0
  89. redteam_foundry-0.2.0/src/redteam/scorers/_judge_schema.py +32 -0
  90. redteam_foundry-0.2.0/src/redteam/scorers/judge_claude.py +231 -0
  91. redteam_foundry-0.2.0/src/redteam/scorers/judge_human.py +192 -0
  92. redteam_foundry-0.2.0/src/redteam/scorers/refusal_keywords.py +106 -0
  93. redteam_foundry-0.2.0/src/redteam/staleness.py +322 -0
  94. redteam_foundry-0.2.0/src/redteam/stats.py +182 -0
  95. redteam_foundry-0.2.0/src/redteam/targets/__init__.py +28 -0
  96. redteam_foundry-0.2.0/src/redteam/targets/_pricing.py +59 -0
  97. redteam_foundry-0.2.0/src/redteam/targets/anthropic.py +95 -0
  98. redteam_foundry-0.2.0/src/redteam/targets/base.py +101 -0
  99. redteam_foundry-0.2.0/src/redteam/targets/ollama.py +122 -0
  100. redteam_foundry-0.2.0/src/redteam/targets/openai_target.py +98 -0
  101. redteam_foundry-0.2.0/tests/__init__.py +0 -0
  102. redteam_foundry-0.2.0/tests/conftest.py +9 -0
  103. redteam_foundry-0.2.0/tests/data/advbench/harmful_behaviors_sample.csv +6 -0
  104. redteam_foundry-0.2.0/tests/data/harmbench/harmbench_behaviors_text_all_sample.csv +5 -0
  105. redteam_foundry-0.2.0/tests/data/jailbreakbench/harmful-behaviors_sample.csv +5 -0
  106. redteam_foundry-0.2.0/tests/smoke/__init__.py +0 -0
  107. redteam_foundry-0.2.0/tests/smoke/test_defences_live.py +90 -0
  108. redteam_foundry-0.2.0/tests/smoke/test_targets_live.py +81 -0
  109. redteam_foundry-0.2.0/tests/unit/__init__.py +0 -0
  110. redteam_foundry-0.2.0/tests/unit/test_agentdojo_loader.py +90 -0
  111. redteam_foundry-0.2.0/tests/unit/test_benign.py +67 -0
  112. redteam_foundry-0.2.0/tests/unit/test_budget.py +58 -0
  113. redteam_foundry-0.2.0/tests/unit/test_cache.py +95 -0
  114. redteam_foundry-0.2.0/tests/unit/test_compare.py +101 -0
  115. redteam_foundry-0.2.0/tests/unit/test_corpus_quality.py +238 -0
  116. redteam_foundry-0.2.0/tests/unit/test_defences_base.py +91 -0
  117. redteam_foundry-0.2.0/tests/unit/test_exclusion_filter.py +206 -0
  118. redteam_foundry-0.2.0/tests/unit/test_hardening.py +147 -0
  119. redteam_foundry-0.2.0/tests/unit/test_huggingface.py +57 -0
  120. redteam_foundry-0.2.0/tests/unit/test_inspect_export.py +126 -0
  121. redteam_foundry-0.2.0/tests/unit/test_judge_claude.py +103 -0
  122. redteam_foundry-0.2.0/tests/unit/test_judge_human.py +146 -0
  123. redteam_foundry-0.2.0/tests/unit/test_loaders.py +133 -0
  124. redteam_foundry-0.2.0/tests/unit/test_multilingual.py +149 -0
  125. redteam_foundry-0.2.0/tests/unit/test_orchestrator.py +149 -0
  126. redteam_foundry-0.2.0/tests/unit/test_orchestrator_cross_judge.py +158 -0
  127. redteam_foundry-0.2.0/tests/unit/test_orchestrator_score.py +110 -0
  128. redteam_foundry-0.2.0/tests/unit/test_packs.py +137 -0
  129. redteam_foundry-0.2.0/tests/unit/test_pricing.py +36 -0
  130. redteam_foundry-0.2.0/tests/unit/test_schemas.py +93 -0
  131. redteam_foundry-0.2.0/tests/unit/test_scorer_refusal.py +62 -0
  132. redteam_foundry-0.2.0/tests/unit/test_smoke.py +19 -0
  133. redteam_foundry-0.2.0/tests/unit/test_staleness.py +155 -0
  134. redteam_foundry-0.2.0/tests/unit/test_stats.py +132 -0
  135. redteam_foundry-0.2.0/tests/unit/test_targets_mocked.py +105 -0
  136. redteam_foundry-0.2.0/tests/unit/test_taxonomy.py +88 -0
@@ -0,0 +1,17 @@
1
+ # Copy to `.env` and fill in. .env is gitignored.
2
+
3
+ # --- Anthropic ---------------------------------------------------------------
4
+ # Required. Set a $20/month hard cap in the Anthropic console *before* using
5
+ # this key. The harness also enforces a $5/run cap inside Target.send (Lesson L3; set via REDTEAM_MAX_USD_PER_RUN below).
6
+ ANTHROPIC_API_KEY=
7
+
8
+ # --- Ollama (local) ----------------------------------------------------------
9
+ # Default; only override if you run Ollama on a non-standard host/port.
10
+ OLLAMA_HOST=http://localhost:11434
11
+
12
+ # --- Run-time guards ---------------------------------------------------------
13
+ REDTEAM_MAX_USD_PER_RUN=5.00
14
+ REDTEAM_MAX_USD_PER_CALL=0.50
15
+
16
+ # --- HuggingFace (only needed for Meta-SecAlign download in Phase 5+) -------
17
+ HF_TOKEN=
@@ -0,0 +1,18 @@
1
+ # Normalise line endings. Without this, Windows + sandbox tooling churn
2
+ # CRLF<->LF and every file shows as fully modified in `git status`/diffs.
3
+ # This stores every text file as LF in the repository, regardless of the
4
+ # checkout platform. Ends the line-ending war for good.
5
+ * text=auto eol=lf
6
+
7
+ # Shell scripts must stay LF even on Windows checkouts; PowerShell scripts are checked out as CRLF.
8
+ *.sh text eol=lf
9
+ *.ps1 text eol=crlf
10
+
11
+ # Binary files — never touch.
12
+ *.png binary
13
+ *.jpg binary
14
+ *.jpeg binary
15
+ *.gif binary
16
+ *.ico binary
17
+ *.pdf binary
18
+ *.parquet binary
@@ -0,0 +1,43 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request: {}
5
+ push:
6
+ branches: [main]
7
+
8
+ # No real API calls are made in CI — see ETHICS + budget guards.
9
+ # When branch protection is enabled, require the display name below
10
+ # ("Lint, typecheck, smoke run") — see Lesson L13.
11
+
12
+ jobs:
13
+ build:
14
+ name: Lint, typecheck, smoke run
15
+ runs-on: ubuntu-latest
16
+ timeout-minutes: 5
17
+ steps:
18
+ - uses: actions/checkout@v6
19
+
20
+ - uses: actions/setup-python@v6
21
+ with:
22
+ python-version: "3.13"
23
+
24
+ - name: Install uv
25
+ uses: astral-sh/setup-uv@v8.1.0
26
+ with:
27
+ version: "latest"
28
+ enable-cache: true
29
+
30
+ - name: Install project (editable, with dev extras)
31
+ run: uv pip install --system -e ".[dev]"
32
+
33
+ - name: Ruff (lint)
34
+ run: ruff check .
35
+
36
+ - name: Ruff (format check)
37
+ run: ruff format --check .
38
+
39
+ - name: Mypy (typecheck)
40
+ run: mypy src/
41
+
42
+ - name: Pytest (unit)
43
+ run: pytest tests/unit
@@ -0,0 +1,73 @@
1
+ # --- secrets / env -----------------------------------------------------------
2
+ .env
3
+ .env.*
4
+ !.env.example
5
+ *.key
6
+ secrets/
7
+
8
+ # --- python ------------------------------------------------------------------
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+ .venv/
13
+ venv/
14
+ .python-version
15
+
16
+ # build / dist
17
+ build/
18
+ dist/
19
+ *.egg-info/
20
+ .eggs/
21
+
22
+ # tooling caches
23
+ .mypy_cache/
24
+ .ruff_cache/
25
+ .pytest_cache/
26
+ .coverage
27
+ htmlcov/
28
+ coverage.xml
29
+
30
+ # --- project artefacts (gitignored on purpose) -------------------------------
31
+ # Raw + cached corpora are big and re-downloadable; only commit
32
+ # scripts/load_corpora.py + dataset_versions.yaml + a small samples/ dir.
33
+ # Leading slash means "only at repo root" — a bare `data/` would also match
34
+ # tests/data/ (which holds committed test fixtures) and silently exclude it.
35
+ /data/
36
+ !/data/.gitkeep
37
+ !/data/samples/
38
+
39
+ # Run outputs are large and re-creatable; only commit a small reference
40
+ # subset under results/samples/.
41
+ /results/
42
+ !/results/.gitkeep
43
+ !/results/samples/
44
+
45
+ # Corpus-audit outputs (quality reports / data cards). Re-creatable via
46
+ # `redteam corpora audit`, and they quote truncated adversarial prompt
47
+ # previews, so keep them out of git by default.
48
+ # `/reports/*` (not `/reports/`) so the samples/ negation can re-include —
49
+ # git won't recurse into a wholly-excluded directory.
50
+ /reports/*
51
+ !/reports/samples/
52
+
53
+ # Exported challenge packs. Re-creatable via `redteam export-pack`; adversarial
54
+ # packs contain prompt previews, so keep them out of git by default. Only a
55
+ # small, safe (benign) sample is committed under challenge_packs/samples/.
56
+ # Note: `/challenge_packs/*` (not `/challenge_packs/`) so the samples/ negation
57
+ # below can re-include — git won't recurse into a wholly-excluded directory.
58
+ /challenge_packs/*
59
+ !/challenge_packs/samples/
60
+
61
+ # Streamlit / HF Spaces local cache
62
+ .streamlit/secrets.toml
63
+ .huggingface/
64
+
65
+ # OS noise
66
+ .DS_Store
67
+ Thumbs.db
68
+ desktop.ini
69
+
70
+ # IDE noise
71
+ .vscode/
72
+ .idea/
73
+ *.swp
@@ -0,0 +1,34 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v5.0.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-added-large-files
9
+ args: ["--maxkb=512"]
10
+ - id: check-merge-conflict
11
+ - id: detect-private-key
12
+
13
+ # Use the venv's ruff (pinned in pyproject.toml [dev]) so pre-commit and CI
14
+ # never drift to different versions. Requires the venv to be active when
15
+ # committing — that's already the standard workflow here.
16
+ - repo: local
17
+ hooks:
18
+ - id: ruff
19
+ name: ruff (lint, autofix)
20
+ entry: ruff check --fix --force-exclude
21
+ language: system
22
+ types_or: [python, pyi]
23
+ require_serial: true
24
+ - id: ruff-format
25
+ name: ruff format
26
+ entry: ruff format --force-exclude
27
+ language: system
28
+ types_or: [python, pyi]
29
+ require_serial: true
30
+
31
+ - repo: https://github.com/gitleaks/gitleaks
32
+ rev: v8.21.2
33
+ hooks:
34
+ - id: gitleaks
@@ -0,0 +1,75 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format is based on
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project
5
+ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [0.2.0] — 2026-07-02 — the adversarial benchmark foundry
8
+
9
+ Repositioned from a static red-team harness (`llm-redteam-harness`) into an
10
+ upstream **benchmark foundry** (`redteam-foundry`). The v1 measurement core is
11
+ unchanged; the changes below are additive. First public release.
12
+
13
+ ### Added
14
+ - **Corpus quality audit** (`redteam corpora audit`) — exact + near-duplicate
15
+ detection including cross-source overlap, composition, prompt-length stats,
16
+ and label-integrity checks, rendered as a quality report + data card.
17
+ - **Language + attack-family taxonomy** — script-based language/code-switching
18
+ detection and heuristic attack-family surface markers, surfaced in the audit.
19
+ - **Benchmark staleness scoring** (`redteam corpora staleness`) — a transparent,
20
+ component-broken-out heuristic composing corpus and run signals.
21
+ - **Benign control set + defence comparison** (`redteam compare-defences`,
22
+ `redteam benign export`) — false-refusal rate and
23
+ `safe_usefulness = (1 - ASR) * (1 - FRR)` per defence config.
24
+ - **Multilingual over-refusal** (`redteam frr-by-language`,
25
+ `redteam benign export --multilingual`) — benign control set in
26
+ zh-Hant / zh-Hans / ja / ko + code-switched, with per-language FRR.
27
+ - **Challenge-pack exporter** (`redteam export-pack`) — versioned packs
28
+ (`pack.yaml` + `scenarios.jsonl` + `datacard.md`) with adversarial prompts
29
+ redacted by default, plus a reader (`redteam.packs.read_challenge_pack`) and
30
+ a downstream consumption contract (`examples/export_to_agent_release_gates.md`).
31
+ - **Audit any Hugging Face dataset** (`redteam corpora audit-hf --dataset ...
32
+ --prompt-column ... [--revision ...]`) — not just the four built-in corpora;
33
+ the safety exclusion filter runs first. New `source="external"` /
34
+ `category="unknown"` schema values back it.
35
+ - **Cross-dataset benchmark-quality scorecard** (`scripts/hf_scorecard.py`) and a
36
+ written finding (`docs/findings/benchmark-quality-report-card.md`) auditing four
37
+ public jailbreak datasets: all English-only, roleplay-persona-dominant, and
38
+ duplicated to varying degrees.
39
+ - `docs/ROADMAP.md`; committed real-data findings under `reports/samples/`.
40
+
41
+ ### Fixed
42
+ - Exclusion-filter leaks (WMD "synthesis" noun, several self-harm phrasings,
43
+ case-insensitive category gate) with regression tests.
44
+ - Budget guard now **reserves** its estimate so concurrent calls can't
45
+ collectively exceed the per-run cap.
46
+ - Llama Guard fails **closed** on an empty/errored verdict.
47
+ - Spotlighting / SecAlign neutralise their own fence markers in untrusted input
48
+ (closes a delimiter-injection bypass); SecAlign no longer passes messages
49
+ through unfenced.
50
+ - OpenAI target fails loud without a pricing entry (no silent $0 budget bypass).
51
+ - Krippendorff's α finite-sample correction; `compute_kappa` blank-cell guard;
52
+ cross-judge agreement reports `None` (not `0.0`) when not computable.
53
+
54
+ ### Changed
55
+ - **Renamed the project/package to `redteam-foundry`** (was
56
+ `llm-redteam-harness`). The `redteam` CLI command and `src/redteam/` module are
57
+ unchanged — `pip install redteam-foundry` still gives you `redteam ...`.
58
+ - Default install slimmed: the unused dashboard deps (`streamlit`, `plotly`)
59
+ moved to an opt-in `[dashboard]` extra. The audit / staleness / dedup path
60
+ needs no API key.
61
+
62
+ ## [0.1.0] — v1 measurement core
63
+
64
+ - Loaders for AdvBench, JailbreakBench, HarmBench, AgentDojo (commit-pinned) with
65
+ a safety exclusion filter.
66
+ - Anthropic + Ollama targets (OpenAI stub); disk response cache; per-run and
67
+ per-call budget guards.
68
+ - Six composable defences (system prompt, Constitutional, Spotlighting,
69
+ SecAlign, Llama Guard 4 pre/post).
70
+ - Rule-based + LLM-judge scoring with an independent cross-judge; bootstrap
71
+ confidence intervals; Cohen's κ and Krippendorff's α.
72
+ - UK AISI Inspect eval-log export.
73
+
74
+ [0.2.0]: https://github.com/rosscyking1115/redteam-foundry/releases/tag/v0.2.0
75
+ [0.1.0]: https://github.com/rosscyking1115/redteam-foundry/releases/tag/v0.1.0
@@ -0,0 +1,57 @@
1
+ # Contributing
2
+
3
+ Thanks for your interest. This is a research tool for **auditing adversarial
4
+ benchmarks**; contributions that sharpen that focus are very welcome.
5
+
6
+ ## Scope
7
+
8
+ In scope: corpus quality/dedup, benchmark staleness, defence measurement,
9
+ multilingual over-refusal, safe challenge-pack export, and corpus/target/judge
10
+ adapters.
11
+
12
+ Out of scope (by design): production release-gating — ship/warn/block, incident
13
+ replay, policy-as-code. That belongs in a downstream layer; see
14
+ [`docs/ROADMAP.md`](docs/ROADMAP.md) and the README's positioning.
15
+
16
+ ## Ethics (please read before adding corpora or prompts)
17
+
18
+ - Use **only published** adversarial prompts, commit-pinned. Do not author novel
19
+ jailbreaks, in any language (see [`ETHICS.md`](ETHICS.md)).
20
+ - Excluded categories (CSAM, WMD synthesis, detailed self-harm methods) are
21
+ filtered at load time and enforced by `tests/unit/test_exclusion_filter.py`.
22
+ If you add a corpus, add positive/benign cases there proving nothing leaks.
23
+ - Don't commit raw harmful prompts or model outputs. Reports and packs quote
24
+ truncated previews or redact adversarial prompts.
25
+
26
+ ## Dev setup
27
+
28
+ ```bash
29
+ uv venv --python 3.13
30
+ source .venv/bin/activate # .venv\Scripts\activate on Windows
31
+ uv pip install -e ".[dev]"
32
+ pre-commit install
33
+ ```
34
+
35
+ ## Before you open a PR
36
+
37
+ Run the exact CI checks locally — green here means green on the PR:
38
+
39
+ ```bash
40
+ scripts/ci_local.sh # or scripts\ci_local.ps1 on Windows
41
+ # = ruff check + ruff format --check + mypy (strict) + pytest tests/unit
42
+ ```
43
+
44
+ Guidelines:
45
+ - One focused change per PR; keep the test suite green (`pytest tests/unit`).
46
+ - New behaviour needs a unit test. Analysis functions should be pure and
47
+ testable without network/API access.
48
+ - Type-annotate everything (`mypy --strict` must pass). No new heavy runtime
49
+ dependencies without discussion.
50
+ - CI makes no live API calls; don't add unit tests that require one.
51
+
52
+ ## Reporting issues
53
+
54
+ Bugs, stale/duplicated benchmark findings, and requests to audit a specific
55
+ corpus are all useful. Include the command you ran and the (aggregate) output.
56
+ For anything sensitive — a suspected leak past the exclusion filter, or a
57
+ model-provider takedown — email **rosscyking@gmail.com** (see `ETHICS.md`).
@@ -0,0 +1,116 @@
1
+ # ETHICS
2
+
3
+ This project measures LLM behaviour against published adversarial prompts to
4
+ inform defence design. It is **not** a security audit, certification, or
5
+ guarantee.
6
+
7
+ ## What this project does
8
+
9
+ - Runs **published** adversarial prompts from peer-reviewed or
10
+ industry-released datasets (AdvBench, JailbreakBench, HarmBench, AgentDojo)
11
+ against target LLMs.
12
+ - Measures attack-success-rate (ASR) under combinations of published defences
13
+ (system prompts, Llama Guard 4, Spotlighting, SecAlign-style structured
14
+ queries, Constitutional principles), with every verdict cross-validated by
15
+ an independent second judge model.
16
+ - Publishes **aggregate** results, methodology, defence configurations, and
17
+ scoring code openly.
18
+
19
+ ## What this project does not do
20
+
21
+ - **Generate novel jailbreaks.** All prompts come from published datasets,
22
+ cited and version-pinned.
23
+ - **Publish raw harmful outputs verbatim.** Results are aggregated, redacted,
24
+ and summarised. A small number of representative example transcripts may
25
+ be shown per category, with explicit warnings.
26
+ - **Test categories that are excluded by policy** (see below). These are
27
+ filtered at corpus-load time before any model receives them.
28
+ - **Generate non-English attack content.** The multilingual work
29
+ (`src/redteam/multilingual.py`) is a **benign** control set only — it
30
+ measures whether models *over-refuse* harmless non-English and code-switched
31
+ prompts (false refusal). We do **not** translate harmful prompts into other
32
+ languages: that would both create new harmful content and bypass the
33
+ English-only exclusion filter. The multilingual contribution is over-refusal
34
+ measurement, not non-English jailbreak generation.
35
+
36
+ ## Excluded categories
37
+
38
+ The following categories are filtered out of all loaded corpora at
39
+ corpus-load time (see `src/redteam/corpora/_filters.py`) and verified by a CI test
40
+ (`tests/unit/test_exclusion_filter.py`):
41
+
42
+ - **CSAM** — Child Sexual Abuse Material, in any form.
43
+ - **Bio / chem / radiological / nuclear weapons synthesis** — uplift content
44
+ for the design or manufacture of weapons of mass destruction.
45
+ - **Detailed self-harm methods** — actionable instructions for self-injury
46
+ or suicide. (Discussion of self-harm at a research / awareness level is
47
+ fine; method-level detail is not.)
48
+
49
+ If any of these slip past the filter, the CI test fails and merge is blocked.
50
+
51
+ ## Datasets used (with citations)
52
+
53
+ - **AdvBench** — Zou et al., 2023. *Universal and Transferable Adversarial
54
+ Attacks on Aligned Language Models.* MIT licence.
55
+ - **JailbreakBench / JBB-Behaviors** — Chao et al., 2024. *JailbreakBench:
56
+ An Open Robustness Benchmark for Jailbreaking Large Language Models.*
57
+ MIT licence. NeurIPS 2024 Datasets & Benchmarks Track.
58
+ - **HarmBench** — Mazeika et al., 2024. *HarmBench: A Standardized
59
+ Evaluation Framework for Automated Red Teaming and Robust Refusal.*
60
+ MIT licence.
61
+ - **AgentDojo** — Debenedetti et al., 2024. *AgentDojo: A Dynamic
62
+ Environment to Evaluate Prompt Injection Attacks and Defences for LLM
63
+ Agents.* AGPL licence.
64
+
65
+ Exact dataset commits are pinned in `configs/dataset_versions.yaml`.
66
+
67
+ ## Models tested (with version pins)
68
+
69
+ Every result is reported against a **dated** model ID. See
70
+ `configs/model_versions.yaml` for the full list. As of project start
71
+ (2026-05-05) this is:
72
+
73
+ - `claude-sonnet-4-6` (Anthropic Claude Sonnet 4.6) — frontier API target
74
+ - `llama3.1:8b` (Meta Llama 3.1 8B via Ollama) — local target
75
+ - `meta-secalign-8b` (Meta SecAlign 8B) — Phase 5+ stretch target
76
+ - Judge: `claude-haiku-4-5-20251001` (Claude Haiku 4.5) — distinct from any
77
+ evaluated target
78
+ - Cross-judge: `claude-sonnet-4-6` — independent second judge used to
79
+ validate every verdict
80
+
81
+ ## Responsible disclosure
82
+
83
+ If you are a model provider whose model is included and you would like
84
+ findings or example transcripts removed, email
85
+ **rosscyking@gmail.com** with the subject line
86
+ `[redteam-foundry] takedown request`.
87
+
88
+ **Removal commitment: 24 hours from receipt.**
89
+
90
+ ## Scope and disclaimer
91
+
92
+ This work is for research and educational purposes. Results are reported as:
93
+
94
+ > "On benchmark X, with defence stack Y, model Z showed ASR = N% [95% CI ...],
95
+ > judged by Claude Haiku 4.5 and cross-validated by an independent second
96
+ > judge (Claude Sonnet 4.6)."
97
+
98
+ Results **do not** imply that any model is "safe" or "unsafe" in general.
99
+ See `METHODOLOGY.md` for how each metric is validated, and which metrics are
100
+ reported as descriptive only.
101
+
102
+ Do not apply these prompts against systems that you neither own nor have
103
+ explicit permission to test.
104
+
105
+ ## Provider terms of service
106
+
107
+ Both Anthropic's and OpenAI's usage policies explicitly permit safety
108
+ evaluation and red-teaming research. We comply with:
109
+
110
+ - Anthropic Usage Policy (verified 2026-05-05)
111
+ - API budget caps set in the provider console (hard limit) **and** in the
112
+ harness itself (per-run, per-call) — see `src/redteam/budget.py`.
113
+
114
+ ## Maintainer
115
+
116
+ Ross — rosscyking@gmail.com
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ross (rosscyking@gmail.com)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.