promptgate-llm 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. promptgate_llm-0.5.0/.github/workflows/tests.yml +36 -0
  2. promptgate_llm-0.5.0/.gitignore +36 -0
  3. promptgate_llm-0.5.0/CHANGELOG.md +81 -0
  4. promptgate_llm-0.5.0/LICENSE +21 -0
  5. promptgate_llm-0.5.0/PKG-INFO +125 -0
  6. promptgate_llm-0.5.0/README.md +106 -0
  7. promptgate_llm-0.5.0/conftest.py +10 -0
  8. promptgate_llm-0.5.0/injectionbench/__init__.py +0 -0
  9. promptgate_llm-0.5.0/injectionbench/__main__.py +125 -0
  10. promptgate_llm-0.5.0/injectionbench/dataset.py +340 -0
  11. promptgate_llm-0.5.0/injectionbench/datasets/attacks/data_exfiltration.json +7 -0
  12. promptgate_llm-0.5.0/injectionbench/datasets/attacks/direct_injection.json +12 -0
  13. promptgate_llm-0.5.0/injectionbench/datasets/attacks/encoding_attacks.json +7 -0
  14. promptgate_llm-0.5.0/injectionbench/datasets/attacks/jailbreaks.json +12 -0
  15. promptgate_llm-0.5.0/injectionbench/datasets/attacks/malicious_coding.json +752 -0
  16. promptgate_llm-0.5.0/injectionbench/datasets/attacks/social_engineering.json +12 -0
  17. promptgate_llm-0.5.0/injectionbench/datasets/attacks/system_override.json +7 -0
  18. promptgate_llm-0.5.0/injectionbench/datasets/benign/clean_samples.json +22 -0
  19. promptgate_llm-0.5.0/injectionbench/datasets/benign/coding_requests.json +1502 -0
  20. promptgate_llm-0.5.0/injectionbench/mutator.py +225 -0
  21. promptgate_llm-0.5.0/injectionbench/reporter.py +162 -0
  22. promptgate_llm-0.5.0/injectionbench/runner.py +152 -0
  23. promptgate_llm-0.5.0/injectionbench/scorer.py +150 -0
  24. promptgate_llm-0.5.0/promptgate/__init__.py +6 -0
  25. promptgate_llm-0.5.0/promptgate/aggregator.py +38 -0
  26. promptgate_llm-0.5.0/promptgate/config.py +63 -0
  27. promptgate_llm-0.5.0/promptgate/data/embeddings/known_attacks.json +467 -0
  28. promptgate_llm-0.5.0/promptgate/data/embeddings/known_leaks.json +27 -0
  29. promptgate_llm-0.5.0/promptgate/data/patterns/direct_injection.txt +33 -0
  30. promptgate_llm-0.5.0/promptgate/data/patterns/encoding_tricks.txt +20 -0
  31. promptgate_llm-0.5.0/promptgate/data/patterns/jailbreaks.txt +29 -0
  32. promptgate_llm-0.5.0/promptgate/data/patterns/output_leaks.txt +41 -0
  33. promptgate_llm-0.5.0/promptgate/data/patterns/social_engineering.txt +79 -0
  34. promptgate_llm-0.5.0/promptgate/data/patterns/system_override.txt +43 -0
  35. promptgate_llm-0.5.0/promptgate/detector/__init__.py +5 -0
  36. promptgate_llm-0.5.0/promptgate/detector/intent.py +242 -0
  37. promptgate_llm-0.5.0/promptgate/detector/output_filter.py +236 -0
  38. promptgate_llm-0.5.0/promptgate/detector/rule_based.py +138 -0
  39. promptgate_llm-0.5.0/promptgate/detector/semantic.py +270 -0
  40. promptgate_llm-0.5.0/promptgate/gate.py +636 -0
  41. promptgate_llm-0.5.0/promptgate/parser/__init__.py +5 -0
  42. promptgate_llm-0.5.0/promptgate/parser/input_parser.py +111 -0
  43. promptgate_llm-0.5.0/promptgate/policy.py +59 -0
  44. promptgate_llm-0.5.0/promptgate/response.py +67 -0
  45. promptgate_llm-0.5.0/promptgate/scorer.py +38 -0
  46. promptgate_llm-0.5.0/pyproject.toml +36 -0
  47. promptgate_llm-0.5.0/scripts/diff_model_regressions.py +209 -0
  48. promptgate_llm-0.5.0/scripts/generate_benign_coding_dataset.py +229 -0
  49. promptgate_llm-0.5.0/scripts/generate_malicious_coding_dataset.py +179 -0
  50. promptgate_llm-0.5.0/scripts/patch_known_attacks.py +93 -0
  51. promptgate_llm-0.5.0/scripts/train_intent_classifier_v2.py +307 -0
  52. promptgate_llm-0.5.0/scripts/train_intent_classifier_v3.py +244 -0
  53. promptgate_llm-0.5.0/tests/__init__.py +0 -0
  54. promptgate_llm-0.5.0/tests/injectionbench/__init__.py +0 -0
  55. promptgate_llm-0.5.0/tests/injectionbench/test_dataset.py +131 -0
  56. promptgate_llm-0.5.0/tests/injectionbench/test_mutator.py +104 -0
  57. promptgate_llm-0.5.0/tests/injectionbench/test_runner.py +106 -0
  58. promptgate_llm-0.5.0/tests/injectionbench/test_scorer.py +136 -0
  59. promptgate_llm-0.5.0/tests/test_integration.py +197 -0
  60. promptgate_llm-0.5.0/tests/test_intent.py +273 -0
  61. promptgate_llm-0.5.0/tests/test_malicious_coding.py +41 -0
  62. promptgate_llm-0.5.0/tests/test_output_filter.py +214 -0
  63. promptgate_llm-0.5.0/tests/test_parser.py +30 -0
  64. promptgate_llm-0.5.0/tests/test_phase6.py +477 -0
  65. promptgate_llm-0.5.0/tests/test_policy.py +34 -0
  66. promptgate_llm-0.5.0/tests/test_regression.py +46 -0
  67. promptgate_llm-0.5.0/tests/test_rule_based.py +61 -0
  68. promptgate_llm-0.5.0/tests/test_sanitize.py +134 -0
  69. promptgate_llm-0.5.0/tests/test_scorer.py +36 -0
  70. promptgate_llm-0.5.0/tests/test_semantics.py +142 -0
@@ -0,0 +1,36 @@
1
+ name: Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ pip install pytest
27
+ pip install -e ".[semantic]"
28
+
29
+ - name: Run tests
30
+ run: |
31
+ # Excludes test_intent.py, test_regression.py, test_malicious_coding.py
32
+ # because all three require the 267MB intent model from HuggingFace Hub.
33
+ # These tests run locally before every push via: python -m pytest tests/ -q
34
+ # A scheduled nightly workflow that downloads the model and runs the full
35
+ # 186-test suite is planned for a future phase.
36
+ python -m pytest tests/ -q --ignore=tests/test_intent.py --ignore=tests/test_regression.py --ignore=tests/test_malicious_coding.py
@@ -0,0 +1,36 @@
1
+ # Byte-compiled / cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ .pytest_cache/
5
+
6
+ # Virtual environments
7
+ .venv/
8
+ venv/
9
+ env/
10
+
11
+ # Build artifacts (PyPI packaging)
12
+ build/
13
+ dist/
14
+ *.egg-info/
15
+ *.egg
16
+
17
+ # Trained models (downloaded from HF Hub or locally trained — never commit)
18
+ models/
19
+
20
+ # Environment variables / secrets
21
+ .env
22
+
23
+ # Benchmark outputs (regenerated by injectionbench)
24
+ results/*.json
25
+ results/*.txt
26
+
27
+ # OS / editor cruft
28
+ .DS_Store
29
+ Thumbs.db
30
+ .vscode/
31
+ .idea/
32
+
33
+ # Misc local scratch
34
+ *.zip
35
+
36
+ test_install/
@@ -0,0 +1,81 @@
1
+ # Changelog
2
+
3
+ ## [0.4.2] - 2025-06-19
4
+
5
+ ### Fixed
6
+ - Intent classifier falsely blocking benign coding requests at 0.93-0.99 confidence ("Write a function to reverse a string in Python" -> BLOCK)
7
+ - Intent classifier falsely allowing malicious coding requests after one-sided counter-example fix ("Write code that exfiltrates config files" -> ALLOW)
8
+ - Retrained on balanced 3-source dataset: deepset (529) + benign coding (300) + malicious coding (150) = 979 examples
9
+ - Promoted test_regression.py and test_malicious_coding.py into tests/ suite (186 tests total)
10
+ - Fixed placeholder Homepage URL in pyproject.toml
11
+
12
+ ### Added
13
+ - scripts/generate_benign_coding_dataset.py
14
+ - scripts/generate_malicious_coding_dataset.py
15
+ - scripts/train_intent_classifier_v3.py
16
+ - injectionbench/datasets/benign/coding_requests.json (300 examples)
17
+ - injectionbench/datasets/attacks/malicious_coding.json (150 examples)
18
+ - tests/test_regression.py (10 tests)
19
+ - tests/test_malicious_coding.py (12 tests)
20
+
21
+ ## [0.4.0] - 2025-06-17
22
+
23
+ ### Added
24
+ - check_batch() -- semantic layer batches all inputs in one model.encode() call
25
+ - acheck() and acheck_batch() -- async support via run_in_executor
26
+ - history parameter on check()/acheck() -- last 3 turns prepended to intent classifier
27
+ - log_mode -- privacy-safe JSONL audit logging (sha256 hash only, raw text never logged)
28
+ - Callback hooks: on_block, on_flag, on_review, on_allow, on_error
29
+
30
+ ## [0.3.0] - 2025-06-15
31
+
32
+ ### Fixed
33
+ - Data files moved inside package (promptgate/data/, injectionbench/datasets/) -- pip install now ships all patterns and embeddings
34
+ - Path resolution fixed: parents[2] -> parents[1] in all detectors
35
+ - intent.py model resolution rewritten: 3-tier fallback (local -> cache -> HF Hub auto-download)
36
+ - HF Hub repo casing fixed: SrivathsanVijayaraghavan -> srivathsan-vijayaraghavan
37
+ - Stale global pip registration removed (old 0.1.0 from Desktop path)
38
+
39
+ ### Added
40
+ - MIT LICENSE
41
+ - .gitignore (models/, __pycache__/, dist/, results/*.json)
42
+ - HuggingFace Hub model hosting (auto-download on first use, ~267MB)
43
+ - Package renamed to promptgate-llm (original name taken on PyPI)
44
+
45
+ ## [0.2.0] - 2025-06-12
46
+
47
+ ### Added
48
+ - SemanticDetector: sentence-transformers/all-MiniLM-L6-v2, 77 known attack embeddings
49
+ - 12-word sliding window with 4-word overlap for long input handling
50
+ - InjectionBench benchmarking framework (dataset loader, mutator, runner, scorer, reporter)
51
+ - CLI: python -m injectionbench run --source huggingface|manual|combined
52
+ - Fine-tuned DistilBERT intent classifier (F1 INJECTION 0.97, accuracy 0.98)
53
+ - Detection rate: 15.2% (rule+semantic) -> 98.3% (full pipeline)
54
+ - signals_checked expanded to 3 entries (rule_based, semantic, intent)
55
+
56
+ ## [0.1.0] - 2025-06-10
57
+
58
+ ### Added
59
+ - Three-layer detection pipeline (InputParser, RuleBasedDetector, Aggregator, Scorer, Policy, ResponseBuilder)
60
+ - 191 patterns across 5 files (direct_injection, jailbreaks, system_override, social_engineering, encoding_tricks)
61
+ - Pattern format: # signal: headers map patterns to signal types
62
+ - Response always exactly 7 keys: decision, confidence, risk_level, threat_categories, signals, signals_checked, message
63
+ - Signal accumulation scoring: score = min(1.0, sum of severities)
64
+ - Policy thresholds: 0.00-0.30 ALLOW, 0.30-0.55 FLAG, 0.55-0.75 REVIEW, 0.75-1.00 BLOCK
65
+ - Configurable thresholds per deployment
66
+ - Graceful degradation when optional dependencies absent
67
+
68
+ ## Tested Dependency Versions (0.4.2)
69
+
70
+ Verified working against these versions on fresh install (June 2026):
71
+
72
+ | Package | Version |
73
+ |---------|---------|
74
+ | transformers | 5.12.1 |
75
+ | datasets | 5.0.0 |
76
+ | huggingface_hub | 1.20.1 |
77
+ | accelerate | 1.14.0 |
78
+ | sentence-transformers | (install separately via [semantic]) |
79
+
80
+ Note: pyproject.toml specifies lower bounds only. These are the versions
81
+ resolved by pip at time of 0.4.2 publication and verified to work correctly.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Srivathsan Vijayaraghavan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,125 @@
1
+ Metadata-Version: 2.4
2
+ Name: promptgate-llm
3
+ Version: 0.5.0
4
+ Summary: LLM security middleware and risk analysis layer for prompt injection detection
5
+ Project-URL: Homepage, https://github.com/SrivathsanVijayaraghavan/promptgate-llm
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.10
8
+ Provides-Extra: intent
9
+ Requires-Dist: accelerate>=1.1.0; extra == 'intent'
10
+ Requires-Dist: datasets>=2.0.0; extra == 'intent'
11
+ Requires-Dist: scikit-learn>=1.0.0; extra == 'intent'
12
+ Requires-Dist: torch>=2.0.0; extra == 'intent'
13
+ Requires-Dist: transformers[torch]>=4.30.0; extra == 'intent'
14
+ Provides-Extra: semantic
15
+ Requires-Dist: numpy>=1.21.0; extra == 'semantic'
16
+ Requires-Dist: scikit-learn>=1.0.0; extra == 'semantic'
17
+ Requires-Dist: sentence-transformers>=2.2.0; extra == 'semantic'
18
+ Description-Content-Type: text/markdown
19
+
20
+ # PromptGate
21
+
22
+ PromptGate is an open-source Python middleware library that sits between a user and an LLM and **detects prompt injection risk before the model sees the input**. It is a **risk classifier**, not a moral judge: it accumulates explainable signals and applies policy thresholds.
23
+
24
+ ## Architecture
25
+
26
+ ```
27
+ user_input
28
+ → parser (normalize text, metadata)
29
+ → rule_based (substring pattern matching)
30
+ → aggregator (map signals → threat categories)
31
+ → scorer (sum unique signal severities, cap at 1.0)
32
+ → policy (ALLOW / FLAG / REVIEW / BLOCK)
33
+ → response (structured, explainable output)
34
+ ```
35
+
36
+ The LLM never receives blocked prompts when PromptGate is placed in front of the request path.
37
+
38
+ ## Installation
39
+
40
+ From the project root (`promptgate/`):
41
+
42
+ ```bash
43
+ pip install -e ".[dev]"
44
+ ```
45
+
46
+ Editable install keeps pattern files and source in sync during development.
47
+
48
+ ## Usage
49
+
50
+ ```python
51
+ from promptgate import PromptGate
52
+
53
+ gate = PromptGate()
54
+ result = gate.check("Please ignore previous instructions.")
55
+
56
+ print(result["decision"]) # BLOCK
57
+ print(result["confidence"]) # 0.95
58
+ print(result["message"]) # Human-readable explanation
59
+ print(result["signals"]) # Matched risk signals
60
+ ```
61
+
62
+ Custom policy thresholds:
63
+
64
+ ```python
65
+ gate = PromptGate(thresholds={"block": 0.80, "review": 0.60, "flag": 0.35})
66
+ result = gate.check(user_input)
67
+ ```
68
+
69
+ ## Explainability Philosophy
70
+
71
+ Every response includes:
72
+
73
+ - **decision** — ALLOW, FLAG, REVIEW, or BLOCK
74
+ - **confidence** — equals the accumulated risk score (0.0 when safe)
75
+ - **signals** — what matched, with severity and pattern text
76
+ - **signals_checked** — signal types, categories, and pattern files scanned
77
+ - **message** — plain-language explanation of why the decision was made
78
+
79
+ ALLOW responses explicitly state that no injection patterns or manipulation framing were detected above thresholds. Restricted responses name matched signals and categories.
80
+
81
+ **Signal accumulation is required.** One weak signal alone (for example, sympathy framing) does not block. Multiple signals combine via `score = min(1.0, sum(severities))`.
82
+
83
+ ## Threat Categories
84
+
85
+ | Category | Example signals |
86
+ |----------|-----------------|
87
+ | `direct_injection` | instruction_override, data_exfiltration |
88
+ | `jailbreak` | jailbreak_persona |
89
+ | `system_override` | system_override, system_prompt_leak |
90
+ | `social_engineering` | authority_claim, secrecy_request, urgency_framing |
91
+ | `encoding_attack` | encoding_trick |
92
+
93
+ Severity values and mappings live in `promptgate/config.py`.
94
+
95
+ ## Default Policy Thresholds
96
+
97
+ | Score range | Decision |
98
+ |-------------|----------|
99
+ | 0.00 – 0.29 | ALLOW |
100
+ | 0.30 – 0.54 | FLAG |
101
+ | 0.55 – 0.74 | REVIEW |
102
+ | 0.75 – 1.00 | BLOCK |
103
+
104
+ ## Local Testing
105
+
106
+ ```bash
107
+ cd promptgate
108
+ pip install -e ".[dev]"
109
+ pytest -v
110
+ ```
111
+
112
+ ## Project Layout
113
+
114
+ ```
115
+ promptgate/
116
+ ├── promptgate/ # Python package
117
+ ├── data/patterns/ # Seed pattern files
118
+ ├── tests/
119
+ ├── pyproject.toml
120
+ └── README.md
121
+ ```
122
+
123
+ ## License
124
+
125
+ MIT
@@ -0,0 +1,106 @@
1
+ # PromptGate
2
+
3
+ PromptGate is an open-source Python middleware library that sits between a user and an LLM and **detects prompt injection risk before the model sees the input**. It is a **risk classifier**, not a moral judge: it accumulates explainable signals and applies policy thresholds.
4
+
5
+ ## Architecture
6
+
7
+ ```
8
+ user_input
9
+ → parser (normalize text, metadata)
10
+ → rule_based (substring pattern matching)
11
+ → aggregator (map signals → threat categories)
12
+ → scorer (sum unique signal severities, cap at 1.0)
13
+ → policy (ALLOW / FLAG / REVIEW / BLOCK)
14
+ → response (structured, explainable output)
15
+ ```
16
+
17
+ The LLM never receives blocked prompts when PromptGate is placed in front of the request path.
18
+
19
+ ## Installation
20
+
21
+ From the project root (`promptgate/`):
22
+
23
+ ```bash
24
+ pip install -e ".[semantic,intent]"
25
+ ```
26
+
27
+ Editable install keeps pattern files and source in sync during development.
28
+
29
+ ## Usage
30
+
31
+ ```python
32
+ from promptgate import PromptGate
33
+
34
+ gate = PromptGate()
35
+ result = gate.check("Please ignore previous instructions.")
36
+
37
+ print(result["decision"]) # BLOCK
38
+ print(result["confidence"]) # 0.95
39
+ print(result["message"]) # Human-readable explanation
40
+ print(result["signals"]) # Matched risk signals
41
+ ```
42
+
43
+ Custom policy thresholds:
44
+
45
+ ```python
46
+ gate = PromptGate(thresholds={"block": 0.80, "review": 0.60, "flag": 0.35})
47
+ result = gate.check(user_input)
48
+ ```
49
+
50
+ ## Explainability Philosophy
51
+
52
+ Every response includes:
53
+
54
+ - **decision** — ALLOW, FLAG, REVIEW, or BLOCK
55
+ - **confidence** — equals the accumulated risk score (0.0 when safe)
56
+ - **signals** — what matched, with severity and pattern text
57
+ - **signals_checked** — signal types, categories, and pattern files scanned
58
+ - **message** — plain-language explanation of why the decision was made
59
+
60
+ ALLOW responses explicitly state that no injection patterns or manipulation framing were detected above thresholds. Restricted responses name matched signals and categories.
61
+
62
+ **Signal accumulation is required.** One weak signal alone (for example, sympathy framing) does not block. Multiple signals combine via `score = min(1.0, sum(severities))`.
63
+
64
+ ## Threat Categories
65
+
66
+ | Category | Example signals |
67
+ |----------|-----------------|
68
+ | `direct_injection` | instruction_override, data_exfiltration |
69
+ | `jailbreak` | jailbreak_persona |
70
+ | `system_override` | system_override, system_prompt_leak |
71
+ | `social_engineering` | authority_claim, secrecy_request, urgency_framing |
72
+ | `encoding_attack` | encoding_trick |
73
+
74
+ Severity values and mappings live in `promptgate/config.py`.
75
+
76
+ ## Default Policy Thresholds
77
+
78
+ | Score range | Decision |
79
+ |-------------|----------|
80
+ | 0.00 – 0.29 | ALLOW |
81
+ | 0.30 – 0.54 | FLAG |
82
+ | 0.55 – 0.74 | REVIEW |
83
+ | 0.75 – 1.00 | BLOCK |
84
+
85
+ ## Local Testing
86
+
87
+ ```bash
88
+ cd promptgate
89
+ pip install -e ".[semantic]" pytest
90
+ pytest -v
91
+ ```
92
+
93
+ ## Project Layout
94
+
95
+ ```
96
+ promptgate/
97
+ ├── promptgate/ # Python package
98
+ │   └── data/patterns/ # Seed pattern files (shipped inside the package)
99
+ ├── tests/
100
+ ├── pyproject.toml
101
+ └── README.md
102
+ ```
103
+
104
+ ## License
105
+
106
+ MIT
@@ -0,0 +1,10 @@
1
+ """
2
+ Root conftest.py — ensures both promptgate and injectionbench packages
3
+ are importable during pytest runs in this repository.
4
+ """
5
+ import sys
6
+ from pathlib import Path
7
+
8
# Add project root to sys.path so injectionbench is importable
# alongside the editable-installed promptgate package.
# The insertion is guarded so the root is not added a second time when it
# is already on the path (for example because the test runner put it there).
# NOTE(review): if the root is already present at a later index it is left
# where it is rather than moved to the front — confirm no import-priority
# dependency relies on it being first.
_PROJECT_ROOT = str(Path(__file__).resolve().parent)
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
File without changes
@@ -0,0 +1,125 @@
1
+ """
2
+ injectionbench/__main__.py
3
+ --------------------------
4
+ CLI entry point for InjectionBench.
5
+
6
+ Usage:
7
+ python -m injectionbench run
8
+ python -m injectionbench run --source manual
9
+ python -m injectionbench run --category direct_injection
10
+ python -m injectionbench run --mutations
11
+ python -m injectionbench run --skip-semantic --output my_results/
12
+ """
13
+
14
+ import argparse
15
+ import sys
16
+
17
+ from injectionbench.dataset import DatasetLoader
18
+ from injectionbench.runner import BenchmarkRunner
19
+ from injectionbench.scorer import MetricsScorer
20
+ from injectionbench.reporter import BenchmarkReporter
21
+
22
+
23
+ def main() -> None:
24
+ """Entry point for python -m injectionbench."""
25
+ parser = argparse.ArgumentParser(
26
+ prog="injectionbench",
27
+ description="Adversarial benchmarking framework for PromptGate.",
28
+ )
29
+ subparsers = parser.add_subparsers(dest="command")
30
+
31
+ run_parser = subparsers.add_parser("run", help="Run the benchmark.")
32
+ run_parser.add_argument(
33
+ "--skip-semantic",
34
+ action="store_true",
35
+ help="Skip semantic detection layer.",
36
+ )
37
+ run_parser.add_argument(
38
+ "--category",
39
+ type=str,
40
+ default=None,
41
+ help="Run only this attack category (manual source only).",
42
+ )
43
+ run_parser.add_argument(
44
+ "--mutations",
45
+ action="store_true",
46
+ help="Also run mutation variants of attack samples.",
47
+ )
48
+ run_parser.add_argument(
49
+ "--output",
50
+ type=str,
51
+ default="results/",
52
+ help="Output directory for reports (default: results/).",
53
+ )
54
+ run_parser.add_argument(
55
+ "--source",
56
+ type=str,
57
+ default="huggingface",
58
+ choices=["huggingface", "manual", "combined"],
59
+ help="Dataset source (default: huggingface).",
60
+ )
61
+
62
+ args = parser.parse_args()
63
+
64
+ if args.command != "run":
65
+ parser.print_help()
66
+ sys.exit(0)
67
+
68
+ loader = DatasetLoader()
69
+ runner = BenchmarkRunner(skip_semantic=args.skip_semantic)
70
+ scorer = MetricsScorer()
71
+ reporter = BenchmarkReporter()
72
+
73
+ print(f"InjectionBench — source: {args.source}")
74
+ print("-" * 40)
75
+
76
+ # Load dataset
77
+ if args.category and args.source != "manual":
78
+ print("Note: --category filter only applies with --source manual. Switching to manual.")
79
+ args.source = "manual"
80
+
81
+ data = loader.load_all(source=args.source)
82
+ attacks = data["attacks"]
83
+ benign = data["benign"]
84
+
85
+ if args.category:
86
+ attacks = [s for s in attacks if s["category"] == args.category]
87
+ print(f"Filtered to category: {args.category} ({len(attacks)} samples)")
88
+
89
+ print(f"Loaded {len(attacks)} attack samples, {len(benign)} benign samples.")
90
+ print()
91
+
92
+ # Run main dataset
93
+ attack_results = runner.run_dataset(attacks)
94
+ benign_results = runner.run_dataset(benign)
95
+ all_results = attack_results + benign_results
96
+
97
+ # Optionally run mutations
98
+ if args.mutations:
99
+ print("\nRunning mutation variants...")
100
+ mutation_results: list[dict] = []
101
+ methods = ["case_flip", "whitespace_inject", "homoglyph"]
102
+ for sample in attacks[:10]: # limit to first 10 to keep it fast
103
+ mut = runner.run_mutations(
104
+ sample["text"],
105
+ category=sample["category"],
106
+ methods=methods,
107
+ )
108
+ mutation_results.extend(mut)
109
+ print(f"Mutation samples run: {len(mutation_results)}")
110
+ all_results.extend(mutation_results)
111
+
112
+ # Score
113
+ metrics = scorer.score(all_results)
114
+
115
+ # Print report to terminal
116
+ text_report = reporter.generate_text_report(metrics, version="0.2.0")
117
+ print()
118
+ print(text_report)
119
+
120
+ # Save to disk
121
+ reporter.save(metrics, output_dir=args.output, version="0.2.0")
122
+
123
+
124
+ if __name__ == "__main__":
125
+ main()