promptgate-llm 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptgate_llm-0.5.0/.github/workflows/tests.yml +36 -0
- promptgate_llm-0.5.0/.gitignore +36 -0
- promptgate_llm-0.5.0/CHANGELOG.md +81 -0
- promptgate_llm-0.5.0/LICENSE +21 -0
- promptgate_llm-0.5.0/PKG-INFO +125 -0
- promptgate_llm-0.5.0/README.md +106 -0
- promptgate_llm-0.5.0/conftest.py +10 -0
- promptgate_llm-0.5.0/injectionbench/__init__.py +0 -0
- promptgate_llm-0.5.0/injectionbench/__main__.py +125 -0
- promptgate_llm-0.5.0/injectionbench/dataset.py +340 -0
- promptgate_llm-0.5.0/injectionbench/datasets/attacks/data_exfiltration.json +7 -0
- promptgate_llm-0.5.0/injectionbench/datasets/attacks/direct_injection.json +12 -0
- promptgate_llm-0.5.0/injectionbench/datasets/attacks/encoding_attacks.json +7 -0
- promptgate_llm-0.5.0/injectionbench/datasets/attacks/jailbreaks.json +12 -0
- promptgate_llm-0.5.0/injectionbench/datasets/attacks/malicious_coding.json +752 -0
- promptgate_llm-0.5.0/injectionbench/datasets/attacks/social_engineering.json +12 -0
- promptgate_llm-0.5.0/injectionbench/datasets/attacks/system_override.json +7 -0
- promptgate_llm-0.5.0/injectionbench/datasets/benign/clean_samples.json +22 -0
- promptgate_llm-0.5.0/injectionbench/datasets/benign/coding_requests.json +1502 -0
- promptgate_llm-0.5.0/injectionbench/mutator.py +225 -0
- promptgate_llm-0.5.0/injectionbench/reporter.py +162 -0
- promptgate_llm-0.5.0/injectionbench/runner.py +152 -0
- promptgate_llm-0.5.0/injectionbench/scorer.py +150 -0
- promptgate_llm-0.5.0/promptgate/__init__.py +6 -0
- promptgate_llm-0.5.0/promptgate/aggregator.py +38 -0
- promptgate_llm-0.5.0/promptgate/config.py +63 -0
- promptgate_llm-0.5.0/promptgate/data/embeddings/known_attacks.json +467 -0
- promptgate_llm-0.5.0/promptgate/data/embeddings/known_leaks.json +27 -0
- promptgate_llm-0.5.0/promptgate/data/patterns/direct_injection.txt +33 -0
- promptgate_llm-0.5.0/promptgate/data/patterns/encoding_tricks.txt +20 -0
- promptgate_llm-0.5.0/promptgate/data/patterns/jailbreaks.txt +29 -0
- promptgate_llm-0.5.0/promptgate/data/patterns/output_leaks.txt +41 -0
- promptgate_llm-0.5.0/promptgate/data/patterns/social_engineering.txt +79 -0
- promptgate_llm-0.5.0/promptgate/data/patterns/system_override.txt +43 -0
- promptgate_llm-0.5.0/promptgate/detector/__init__.py +5 -0
- promptgate_llm-0.5.0/promptgate/detector/intent.py +242 -0
- promptgate_llm-0.5.0/promptgate/detector/output_filter.py +236 -0
- promptgate_llm-0.5.0/promptgate/detector/rule_based.py +138 -0
- promptgate_llm-0.5.0/promptgate/detector/semantic.py +270 -0
- promptgate_llm-0.5.0/promptgate/gate.py +636 -0
- promptgate_llm-0.5.0/promptgate/parser/__init__.py +5 -0
- promptgate_llm-0.5.0/promptgate/parser/input_parser.py +111 -0
- promptgate_llm-0.5.0/promptgate/policy.py +59 -0
- promptgate_llm-0.5.0/promptgate/response.py +67 -0
- promptgate_llm-0.5.0/promptgate/scorer.py +38 -0
- promptgate_llm-0.5.0/pyproject.toml +36 -0
- promptgate_llm-0.5.0/scripts/diff_model_regressions.py +209 -0
- promptgate_llm-0.5.0/scripts/generate_benign_coding_dataset.py +229 -0
- promptgate_llm-0.5.0/scripts/generate_malicious_coding_dataset.py +179 -0
- promptgate_llm-0.5.0/scripts/patch_known_attacks.py +93 -0
- promptgate_llm-0.5.0/scripts/train_intent_classifier_v2.py +307 -0
- promptgate_llm-0.5.0/scripts/train_intent_classifier_v3.py +244 -0
- promptgate_llm-0.5.0/tests/__init__.py +0 -0
- promptgate_llm-0.5.0/tests/injectionbench/__init__.py +0 -0
- promptgate_llm-0.5.0/tests/injectionbench/test_dataset.py +131 -0
- promptgate_llm-0.5.0/tests/injectionbench/test_mutator.py +104 -0
- promptgate_llm-0.5.0/tests/injectionbench/test_runner.py +106 -0
- promptgate_llm-0.5.0/tests/injectionbench/test_scorer.py +136 -0
- promptgate_llm-0.5.0/tests/test_integration.py +197 -0
- promptgate_llm-0.5.0/tests/test_intent.py +273 -0
- promptgate_llm-0.5.0/tests/test_malicious_coding.py +41 -0
- promptgate_llm-0.5.0/tests/test_output_filter.py +214 -0
- promptgate_llm-0.5.0/tests/test_parser.py +30 -0
- promptgate_llm-0.5.0/tests/test_phase6.py +477 -0
- promptgate_llm-0.5.0/tests/test_policy.py +34 -0
- promptgate_llm-0.5.0/tests/test_regression.py +46 -0
- promptgate_llm-0.5.0/tests/test_rule_based.py +61 -0
- promptgate_llm-0.5.0/tests/test_sanitize.py +134 -0
- promptgate_llm-0.5.0/tests/test_scorer.py +36 -0
- promptgate_llm-0.5.0/tests/test_semantics.py +142 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install dependencies
|
|
25
|
+
run: |
|
|
26
|
+
pip install pytest
|
|
27
|
+
pip install -e ".[semantic]"
|
|
28
|
+
|
|
29
|
+
- name: Run tests
|
|
30
|
+
run: |
|
|
31
|
+
# Excludes test_intent.py, test_regression.py, test_malicious_coding.py
|
|
32
|
+
# because all three require the 267MB intent model from HuggingFace Hub.
|
|
33
|
+
# These tests run locally before every push via: python -m pytest tests/ -q
|
|
34
|
+
# A scheduled nightly workflow that downloads the model and runs the full
|
|
35
|
+
# 186-test suite is planned for a future phase.
|
|
36
|
+
python -m pytest tests/ -q --ignore=tests/test_intent.py --ignore=tests/test_regression.py --ignore=tests/test_malicious_coding.py
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Byte-compiled / cache
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
.pytest_cache/
|
|
5
|
+
|
|
6
|
+
# Virtual environments
|
|
7
|
+
.venv/
|
|
8
|
+
venv/
|
|
9
|
+
env/
|
|
10
|
+
|
|
11
|
+
# Build artifacts (PyPI packaging)
|
|
12
|
+
build/
|
|
13
|
+
dist/
|
|
14
|
+
*.egg-info/
|
|
15
|
+
*.egg
|
|
16
|
+
|
|
17
|
+
# Trained models (downloaded from HF Hub or locally trained — never commit)
|
|
18
|
+
models/
|
|
19
|
+
|
|
20
|
+
# Environment variables / secrets
|
|
21
|
+
.env
|
|
22
|
+
|
|
23
|
+
# Benchmark outputs (regenerated by injectionbench)
|
|
24
|
+
results/*.json
|
|
25
|
+
results/*.txt
|
|
26
|
+
|
|
27
|
+
# OS / editor cruft
|
|
28
|
+
.DS_Store
|
|
29
|
+
Thumbs.db
|
|
30
|
+
.vscode/
|
|
31
|
+
.idea/
|
|
32
|
+
|
|
33
|
+
# Misc local scratch
|
|
34
|
+
*.zip
|
|
35
|
+
|
|
36
|
+
test_install/
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.4.2] - 2025-06-19
|
|
4
|
+
|
|
5
|
+
### Fixed
|
|
6
|
+
- Intent classifier falsely blocking benign coding requests at 0.93-0.99 confidence ("Write a function to reverse a string in Python" -> BLOCK)
|
|
7
|
+
- Intent classifier falsely allowing malicious coding requests after one-sided counter-example fix ("Write code that exfiltrates config files" -> ALLOW)
|
|
8
|
+
- Retrained on balanced 3-source dataset: deepset (529) + benign coding (300) + malicious coding (150) = 979 examples
|
|
9
|
+
- Promoted test_regression.py and test_malicious_coding.py into tests/ suite (186 tests total)
|
|
10
|
+
- Fixed placeholder Homepage URL in pyproject.toml
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- scripts/generate_benign_coding_dataset.py
|
|
14
|
+
- scripts/generate_malicious_coding_dataset.py
|
|
15
|
+
- scripts/train_intent_classifier_v3.py
|
|
16
|
+
- injectionbench/datasets/benign/coding_requests.json (300 examples)
|
|
17
|
+
- injectionbench/datasets/attacks/malicious_coding.json (150 examples)
|
|
18
|
+
- tests/test_regression.py (10 tests)
|
|
19
|
+
- tests/test_malicious_coding.py (12 tests)
|
|
20
|
+
|
|
21
|
+
## [0.4.0] - 2025-06-17
|
|
22
|
+
|
|
23
|
+
### Added
|
|
24
|
+
- check_batch() -- semantic layer batches all inputs in one model.encode() call
|
|
25
|
+
- acheck() and acheck_batch() -- async support via run_in_executor
|
|
26
|
+
- history parameter on check()/acheck() -- last 3 turns prepended to intent classifier
|
|
27
|
+
- log_mode -- privacy-safe JSONL audit logging (sha256 hash only, raw text never logged)
|
|
28
|
+
- Callback hooks: on_block, on_flag, on_review, on_allow, on_error
|
|
29
|
+
|
|
30
|
+
## [0.3.0] - 2025-06-15
|
|
31
|
+
|
|
32
|
+
### Fixed
|
|
33
|
+
- Data files moved inside package (promptgate/data/, injectionbench/datasets/) -- pip install now ships all patterns and embeddings
|
|
34
|
+
- Path resolution fixed: parents[2] -> parents[1] in all detectors
|
|
35
|
+
- intent.py model resolution rewritten: 3-tier fallback (local -> cache -> HF Hub auto-download)
|
|
36
|
+
- HF Hub repo casing fixed: SrivathsanVijayaraghavan -> srivathsan-vijayaraghavan
|
|
37
|
+
- Stale global pip registration removed (old 0.1.0 from Desktop path)
|
|
38
|
+
|
|
39
|
+
### Added
|
|
40
|
+
- MIT LICENSE
|
|
41
|
+
- .gitignore (models/, __pycache__/, dist/, results/*.json)
|
|
42
|
+
- HuggingFace Hub model hosting (auto-download on first use, ~267MB)
|
|
43
|
+
- Package renamed to promptgate-llm (original name taken on PyPI)
|
|
44
|
+
|
|
45
|
+
## [0.2.0] - 2025-06-12
|
|
46
|
+
|
|
47
|
+
### Added
|
|
48
|
+
- SemanticDetector: sentence-transformers/all-MiniLM-L6-v2, 77 known attack embeddings
|
|
49
|
+
- 12-word sliding window with 4-word overlap for long input handling
|
|
50
|
+
- InjectionBench benchmarking framework (dataset loader, mutator, runner, scorer, reporter)
|
|
51
|
+
- CLI: python -m injectionbench run --source huggingface|manual|combined
|
|
52
|
+
- Fine-tuned DistilBERT intent classifier (F1 INJECTION 0.97, accuracy 0.98)
|
|
53
|
+
- Detection rate: 15.2% (rule+semantic) -> 98.3% (full pipeline)
|
|
54
|
+
- signals_checked expanded to 3 entries (rule_based, semantic, intent)
|
|
55
|
+
|
|
56
|
+
## [0.1.0] - 2025-06-10
|
|
57
|
+
|
|
58
|
+
### Added
|
|
59
|
+
- Three-layer detection pipeline (InputParser, RuleBasedDetector, Aggregator, Scorer, Policy, ResponseBuilder)
|
|
60
|
+
- 191 patterns across 5 files (direct_injection, jailbreaks, system_override, social_engineering, encoding_tricks)
|
|
61
|
+
- Pattern format: # signal: headers map patterns to signal types
|
|
62
|
+
- Response always exactly 7 keys: decision, confidence, risk_level, threat_categories, signals, signals_checked, message
|
|
63
|
+
- Signal accumulation scoring: score = min(1.0, sum of severities)
|
|
64
|
+
- Policy thresholds: 0.00-0.30 ALLOW, 0.30-0.55 FLAG, 0.55-0.75 REVIEW, 0.75-1.00 BLOCK
|
|
65
|
+
- Configurable thresholds per deployment
|
|
66
|
+
- Graceful degradation when optional dependencies absent
|
|
67
|
+
|
|
68
|
+
## Tested Dependency Versions (0.4.2)
|
|
69
|
+
|
|
70
|
+
Verified working against these versions on fresh install (June 2026):
|
|
71
|
+
|
|
72
|
+
| Package | Version |
|
|
73
|
+
|---------|---------|
|
|
74
|
+
| transformers | 5.12.1 |
|
|
75
|
+
| datasets | 5.0.0 |
|
|
76
|
+
| huggingface_hub | 1.20.1 |
|
|
77
|
+
| accelerate | 1.14.0 |
|
|
78
|
+
| sentence-transformers | (install separately via [semantic]) |
|
|
79
|
+
|
|
80
|
+
Note: pyproject.toml specifies lower bounds only. These are the versions
|
|
81
|
+
resolved by pip at time of 0.4.2 publication and verified to work correctly.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Srivathsan Vijayaraghavan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: promptgate-llm
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: LLM security middleware and risk analysis layer for prompt injection detection
|
|
5
|
+
Project-URL: Homepage, https://github.com/SrivathsanVijayaraghavan/promptgate-llm
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Provides-Extra: intent
|
|
9
|
+
Requires-Dist: accelerate>=1.1.0; extra == 'intent'
|
|
10
|
+
Requires-Dist: datasets>=2.0.0; extra == 'intent'
|
|
11
|
+
Requires-Dist: scikit-learn>=1.0.0; extra == 'intent'
|
|
12
|
+
Requires-Dist: torch>=2.0.0; extra == 'intent'
|
|
13
|
+
Requires-Dist: transformers[torch]>=4.30.0; extra == 'intent'
|
|
14
|
+
Provides-Extra: semantic
|
|
15
|
+
Requires-Dist: numpy>=1.21.0; extra == 'semantic'
|
|
16
|
+
Requires-Dist: scikit-learn>=1.0.0; extra == 'semantic'
|
|
17
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == 'semantic'
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# PromptGate
|
|
21
|
+
|
|
22
|
+
PromptGate is an open-source Python middleware library that sits between a user and an LLM and **detects prompt injection risk before the model sees the input**. It is a **risk classifier**, not a moral judge: it accumulates explainable signals and applies policy thresholds.
|
|
23
|
+
|
|
24
|
+
## Architecture
|
|
25
|
+
|
|
26
|
+
```
|
|
27
|
+
user_input
|
|
28
|
+
→ parser (normalize text, metadata)
|
|
29
|
+
→ rule_based (substring pattern matching)
|
|
30
|
+
→ aggregator (map signals → threat categories)
|
|
31
|
+
→ scorer (sum unique signal severities, cap at 1.0)
|
|
32
|
+
→ policy (ALLOW / FLAG / REVIEW / BLOCK)
|
|
33
|
+
→ response (structured, explainable output)
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The LLM never receives blocked prompts when PromptGate is placed in front of the request path.
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
From the project root (`promptgate/`):
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install -e ".[dev]"
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Editable install keeps pattern files and source in sync during development.
|
|
47
|
+
|
|
48
|
+
## Usage
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from promptgate import PromptGate
|
|
52
|
+
|
|
53
|
+
gate = PromptGate()
|
|
54
|
+
result = gate.check("Please ignore previous instructions.")
|
|
55
|
+
|
|
56
|
+
print(result["decision"]) # BLOCK
|
|
57
|
+
print(result["confidence"]) # 0.95
|
|
58
|
+
print(result["message"]) # Human-readable explanation
|
|
59
|
+
print(result["signals"]) # Matched risk signals
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Custom policy thresholds:
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
gate = PromptGate(thresholds={"block": 0.80, "review": 0.60, "flag": 0.35})
|
|
66
|
+
result = gate.check(user_input)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Explainability Philosophy
|
|
70
|
+
|
|
71
|
+
Every response includes:
|
|
72
|
+
|
|
73
|
+
- **decision** — ALLOW, FLAG, REVIEW, or BLOCK
|
|
74
|
+
- **confidence** — equals the accumulated risk score (0.0 when safe)
|
|
75
|
+
- **signals** — what matched, with severity and pattern text
|
|
76
|
+
- **signals_checked** — signal types, categories, and pattern files scanned
|
|
77
|
+
- **message** — plain-language explanation of why the decision was made
|
|
78
|
+
|
|
79
|
+
ALLOW responses explicitly state that no injection patterns or manipulation framing were detected above thresholds. Restricted responses name matched signals and categories.
|
|
80
|
+
|
|
81
|
+
**Signal accumulation is required.** One weak signal alone (for example, sympathy framing) does not block. Multiple signals combine via `score = min(1.0, sum(severities))`.
|
|
82
|
+
|
|
83
|
+
## Threat Categories
|
|
84
|
+
|
|
85
|
+
| Category | Example signals |
|
|
86
|
+
|----------|-----------------|
|
|
87
|
+
| `direct_injection` | instruction_override, data_exfiltration |
|
|
88
|
+
| `jailbreak` | jailbreak_persona |
|
|
89
|
+
| `system_override` | system_override, system_prompt_leak |
|
|
90
|
+
| `social_engineering` | authority_claim, secrecy_request, urgency_framing |
|
|
91
|
+
| `encoding_attack` | encoding_trick |
|
|
92
|
+
|
|
93
|
+
Severity values and mappings live in `promptgate/config.py`.
|
|
94
|
+
|
|
95
|
+
## Default Policy Thresholds
|
|
96
|
+
|
|
97
|
+
| Score range | Decision |
|
|
98
|
+
|-------------|----------|
|
|
99
|
+
| 0.00 – 0.29 | ALLOW |
|
|
100
|
+
| 0.30 – 0.54 | FLAG |
|
|
101
|
+
| 0.55 – 0.74 | REVIEW |
|
|
102
|
+
| 0.75 – 1.00 | BLOCK |
|
|
103
|
+
|
|
104
|
+
## Local Testing
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
cd promptgate
|
|
108
|
+
pip install -e ".[dev]"
|
|
109
|
+
pytest -v
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Project Layout
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
promptgate/
|
|
116
|
+
├── promptgate/ # Python package
|
|
117
|
+
├── data/patterns/ # Seed pattern files
|
|
118
|
+
├── tests/
|
|
119
|
+
├── pyproject.toml
|
|
120
|
+
└── README.md
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## License
|
|
124
|
+
|
|
125
|
+
MIT
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# PromptGate
|
|
2
|
+
|
|
3
|
+
PromptGate is an open-source Python middleware library that sits between a user and an LLM and **detects prompt injection risk before the model sees the input**. It is a **risk classifier**, not a moral judge: it accumulates explainable signals and applies policy thresholds.
|
|
4
|
+
|
|
5
|
+
## Architecture
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
user_input
|
|
9
|
+
→ parser (normalize text, metadata)
|
|
10
|
+
→ rule_based (substring pattern matching)
|
|
11
|
+
→ aggregator (map signals → threat categories)
|
|
12
|
+
→ scorer (sum unique signal severities, cap at 1.0)
|
|
13
|
+
→ policy (ALLOW / FLAG / REVIEW / BLOCK)
|
|
14
|
+
→ response (structured, explainable output)
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
The LLM never receives blocked prompts when PromptGate is placed in front of the request path.
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
From the project root (`promptgate/`):
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install -e ".[semantic,intent]"
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Editable install keeps pattern files and source in sync during development.
|
|
28
|
+
|
|
29
|
+
## Usage
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from promptgate import PromptGate
|
|
33
|
+
|
|
34
|
+
gate = PromptGate()
|
|
35
|
+
result = gate.check("Please ignore previous instructions.")
|
|
36
|
+
|
|
37
|
+
print(result["decision"]) # BLOCK
|
|
38
|
+
print(result["confidence"]) # 0.95
|
|
39
|
+
print(result["message"]) # Human-readable explanation
|
|
40
|
+
print(result["signals"]) # Matched risk signals
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Custom policy thresholds:
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
gate = PromptGate(thresholds={"block": 0.80, "review": 0.60, "flag": 0.35})
|
|
47
|
+
result = gate.check(user_input)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Explainability Philosophy
|
|
51
|
+
|
|
52
|
+
Every response includes:
|
|
53
|
+
|
|
54
|
+
- **decision** — ALLOW, FLAG, REVIEW, or BLOCK
|
|
55
|
+
- **confidence** — equals the accumulated risk score (0.0 when safe)
|
|
56
|
+
- **signals** — what matched, with severity and pattern text
|
|
57
|
+
- **signals_checked** — signal types, categories, and pattern files scanned
|
|
58
|
+
- **message** — plain-language explanation of why the decision was made
|
|
59
|
+
|
|
60
|
+
ALLOW responses explicitly state that no injection patterns or manipulation framing were detected above thresholds. Restricted responses name matched signals and categories.
|
|
61
|
+
|
|
62
|
+
**Signal accumulation is required.** One weak signal alone (for example, sympathy framing) does not block. Multiple signals combine via `score = min(1.0, sum(severities))`.
|
|
63
|
+
|
|
64
|
+
## Threat Categories
|
|
65
|
+
|
|
66
|
+
| Category | Example signals |
|
|
67
|
+
|----------|-----------------|
|
|
68
|
+
| `direct_injection` | instruction_override, data_exfiltration |
|
|
69
|
+
| `jailbreak` | jailbreak_persona |
|
|
70
|
+
| `system_override` | system_override, system_prompt_leak |
|
|
71
|
+
| `social_engineering` | authority_claim, secrecy_request, urgency_framing |
|
|
72
|
+
| `encoding_attack` | encoding_trick |
|
|
73
|
+
|
|
74
|
+
Severity values and mappings live in `promptgate/config.py`.
|
|
75
|
+
|
|
76
|
+
## Default Policy Thresholds
|
|
77
|
+
|
|
78
|
+
| Score range | Decision |
|
|
79
|
+
|-------------|----------|
|
|
80
|
+
| 0.00 – 0.29 | ALLOW |
|
|
81
|
+
| 0.30 – 0.54 | FLAG |
|
|
82
|
+
| 0.55 – 0.74 | REVIEW |
|
|
83
|
+
| 0.75 – 1.00 | BLOCK |
|
|
84
|
+
|
|
85
|
+
## Local Testing
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
cd promptgate
|
|
89
|
+
pip install pytest -e ".[semantic]"
|
|
90
|
+
pytest -v
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Project Layout
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
promptgate/
|
|
97
|
+
├── promptgate/ # Python package
|
|
98
|
+
│   └── data/patterns/ # Seed pattern files (shipped inside the package)
|
|
99
|
+
├── tests/
|
|
100
|
+
├── pyproject.toml
|
|
101
|
+
└── README.md
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## License
|
|
105
|
+
|
|
106
|
+
MIT
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Root conftest.py — ensures both promptgate and injectionbench packages
|
|
3
|
+
are importable during pytest runs in this repository.
|
|
4
|
+
"""
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
# Add project root to sys.path so injectionbench is importable
|
|
9
|
+
# alongside the editable-installed promptgate package.
|
|
10
|
+
# Directory containing this conftest.py; putting it first on sys.path lets
# top-level packages that live beside this file be imported during test runs.
_repo_root = Path(__file__).resolve().parent
sys.path.insert(0, str(_repo_root))
|
|
File without changes
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""
|
|
2
|
+
injectionbench/__main__.py
|
|
3
|
+
--------------------------
|
|
4
|
+
CLI entry point for InjectionBench.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
python -m injectionbench run
|
|
8
|
+
python -m injectionbench run --source manual
|
|
9
|
+
python -m injectionbench run --category direct_injection
|
|
10
|
+
python -m injectionbench run --mutations
|
|
11
|
+
python -m injectionbench run --skip-semantic --output my_results/
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import sys
|
|
16
|
+
|
|
17
|
+
from injectionbench.dataset import DatasetLoader
|
|
18
|
+
from injectionbench.runner import BenchmarkRunner
|
|
19
|
+
from injectionbench.scorer import MetricsScorer
|
|
20
|
+
from injectionbench.reporter import BenchmarkReporter
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def main() -> None:
    """Entry point for python -m injectionbench.

    Parses the command line, loads attack and benign samples, runs them
    through the benchmark runner, scores the combined results, prints a
    text report and saves the report to the output directory.

    Behaviour visible from this function:
      * With no ``run`` sub-command the help text is printed and the
        process exits with status 0.
      * ``--category`` forces ``--source manual`` (a note is printed) and
        keeps only attack samples whose ``"category"`` equals the value.
      * ``--mutations`` additionally runs mutation variants of the first
        ten attack samples and adds them to the scored results.

    The version written into the report is resolved from the installed
    ``promptgate-llm`` distribution metadata rather than a hard-coded
    string, so reports are labelled with the release that produced them.
    """
    # Function-scope import: only this entry point needs it.
    from importlib.metadata import PackageNotFoundError
    from importlib.metadata import version as _dist_version

    parser = argparse.ArgumentParser(
        prog="injectionbench",
        description="Adversarial benchmarking framework for PromptGate.",
    )
    subparsers = parser.add_subparsers(dest="command")

    run_parser = subparsers.add_parser("run", help="Run the benchmark.")
    run_parser.add_argument(
        "--skip-semantic",
        action="store_true",
        help="Skip semantic detection layer.",
    )
    run_parser.add_argument(
        "--category",
        type=str,
        default=None,
        help="Run only this attack category (manual source only).",
    )
    run_parser.add_argument(
        "--mutations",
        action="store_true",
        help="Also run mutation variants of attack samples.",
    )
    run_parser.add_argument(
        "--output",
        type=str,
        default="results/",
        help="Output directory for reports (default: results/).",
    )
    run_parser.add_argument(
        "--source",
        type=str,
        default="huggingface",
        choices=["huggingface", "manual", "combined"],
        help="Dataset source (default: huggingface).",
    )

    args = parser.parse_args()

    if args.command != "run":
        parser.print_help()
        sys.exit(0)

    # Resolve the report version once so the printed and the saved report
    # always agree with each other and with the installed release.
    try:
        report_version = _dist_version("promptgate-llm")
    except PackageNotFoundError:
        # No installed distribution metadata (e.g. running from a bare
        # source checkout). NOTE(review): assumes the reporter accepts any
        # string as the version label — confirm against BenchmarkReporter.
        report_version = "unknown"

    loader = DatasetLoader()
    runner = BenchmarkRunner(skip_semantic=args.skip_semantic)
    scorer = MetricsScorer()
    reporter = BenchmarkReporter()

    print(f"InjectionBench — source: {args.source}")
    print("-" * 40)

    # Load dataset
    if args.category and args.source != "manual":
        print("Note: --category filter only applies with --source manual. Switching to manual.")
        args.source = "manual"

    data = loader.load_all(source=args.source)
    attacks = data["attacks"]
    benign = data["benign"]

    if args.category:
        attacks = [s for s in attacks if s["category"] == args.category]
        print(f"Filtered to category: {args.category} ({len(attacks)} samples)")

    print(f"Loaded {len(attacks)} attack samples, {len(benign)} benign samples.")
    print()

    # Run main dataset
    attack_results = runner.run_dataset(attacks)
    benign_results = runner.run_dataset(benign)
    all_results = attack_results + benign_results

    # Optionally run mutations
    if args.mutations:
        print("\nRunning mutation variants...")
        mutation_results: list[dict] = []
        methods = ["case_flip", "whitespace_inject", "homoglyph"]
        for sample in attacks[:10]:  # limit to first 10 to keep it fast
            mut = runner.run_mutations(
                sample["text"],
                category=sample["category"],
                methods=methods,
            )
            mutation_results.extend(mut)
        print(f"Mutation samples run: {len(mutation_results)}")
        # Mutation variants are scored together with the base samples.
        all_results.extend(mutation_results)

    # Score
    metrics = scorer.score(all_results)

    # Print report to terminal
    text_report = reporter.generate_text_report(metrics, version=report_version)
    print()
    print(text_report)

    # Save to disk
    reporter.save(metrics, output_dir=args.output, version=report_version)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
if __name__ == "__main__":
|
|
125
|
+
main()
|