prompt-armor 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prompt_armor-0.1.0/.github/ISSUE_TEMPLATE/bug_report.md +26 -0
- prompt_armor-0.1.0/.github/ISSUE_TEMPLATE/feature_request.md +15 -0
- prompt_armor-0.1.0/.github/PULL_REQUEST_TEMPLATE.md +18 -0
- prompt_armor-0.1.0/.github/workflows/benchmark.yml +36 -0
- prompt_armor-0.1.0/.github/workflows/ci.yml +55 -0
- prompt_armor-0.1.0/.github/workflows/publish.yml +28 -0
- prompt_armor-0.1.0/.gitignore +53 -0
- prompt_armor-0.1.0/.prompt-armor.yml +20 -0
- prompt_armor-0.1.0/CHANGELOG.md +56 -0
- prompt_armor-0.1.0/CLAUDE.md +108 -0
- prompt_armor-0.1.0/CONTRIBUTING.md +83 -0
- prompt_armor-0.1.0/LICENSE +21 -0
- prompt_armor-0.1.0/Makefile +29 -0
- prompt_armor-0.1.0/PKG-INFO +485 -0
- prompt_armor-0.1.0/README.md +435 -0
- prompt_armor-0.1.0/SECURITY.md +40 -0
- prompt_armor-0.1.0/docs/benchmark.md +64 -0
- prompt_armor-0.1.0/docs/cli.md +50 -0
- prompt_armor-0.1.0/docs/configuration.md +56 -0
- prompt_armor-0.1.0/docs/index.md +33 -0
- prompt_armor-0.1.0/docs/layers.md +91 -0
- prompt_armor-0.1.0/docs/mcp.md +46 -0
- prompt_armor-0.1.0/docs/mkdocs.yml +30 -0
- prompt_armor-0.1.0/docs/quickstart.md +78 -0
- prompt_armor-0.1.0/pyproject.toml +83 -0
- prompt_armor-0.1.0/scripts/build_attack_db.py +295 -0
- prompt_armor-0.1.0/scripts/build_benchmark.py +350 -0
- prompt_armor-0.1.0/scripts/dump_layer_scores.py +98 -0
- prompt_armor-0.1.0/scripts/export_l2_model.py +100 -0
- prompt_armor-0.1.0/scripts/train_fusion.py +179 -0
- prompt_armor-0.1.0/src/prompt_armor/__init__.py +40 -0
- prompt_armor-0.1.0/src/prompt_armor/_version.py +1 -0
- prompt_armor-0.1.0/src/prompt_armor/cli/__init__.py +0 -0
- prompt_armor-0.1.0/src/prompt_armor/cli/main.py +315 -0
- prompt_armor-0.1.0/src/prompt_armor/config.py +85 -0
- prompt_armor-0.1.0/src/prompt_armor/data/__init__.py +0 -0
- prompt_armor-0.1.0/src/prompt_armor/data/attacks/known_attacks.jsonl +1151 -0
- prompt_armor-0.1.0/src/prompt_armor/data/models/.gitkeep +0 -0
- prompt_armor-0.1.0/src/prompt_armor/data/rules/default_rules.yml +364 -0
- prompt_armor-0.1.0/src/prompt_armor/engine.py +215 -0
- prompt_armor-0.1.0/src/prompt_armor/fusion.py +166 -0
- prompt_armor-0.1.0/src/prompt_armor/layers/__init__.py +0 -0
- prompt_armor-0.1.0/src/prompt_armor/layers/base.py +30 -0
- prompt_armor-0.1.0/src/prompt_armor/layers/l1_regex.py +176 -0
- prompt_armor-0.1.0/src/prompt_armor/layers/l2_classifier.py +267 -0
- prompt_armor-0.1.0/src/prompt_armor/layers/l3_similarity.py +167 -0
- prompt_armor-0.1.0/src/prompt_armor/layers/l4_structural.py +350 -0
- prompt_armor-0.1.0/src/prompt_armor/mcp/__init__.py +0 -0
- prompt_armor-0.1.0/src/prompt_armor/mcp/server.py +66 -0
- prompt_armor-0.1.0/src/prompt_armor/models.py +91 -0
- prompt_armor-0.1.0/src/prompt_armor/py.typed +0 -0
- prompt_armor-0.1.0/tests/__init__.py +0 -0
- prompt_armor-0.1.0/tests/benchmark/dataset/benign.jsonl +258 -0
- prompt_armor-0.1.0/tests/benchmark/dataset/malicious.jsonl +97 -0
- prompt_armor-0.1.0/tests/benchmark/run_benchmark.py +210 -0
- prompt_armor-0.1.0/tests/conftest.py +13 -0
- prompt_armor-0.1.0/tests/integration/__init__.py +0 -0
- prompt_armor-0.1.0/tests/integration/test_cli.py +112 -0
- prompt_armor-0.1.0/tests/integration/test_mcp_server.py +70 -0
- prompt_armor-0.1.0/tests/unit/__init__.py +0 -0
- prompt_armor-0.1.0/tests/unit/test_config.py +60 -0
- prompt_armor-0.1.0/tests/unit/test_engine.py +70 -0
- prompt_armor-0.1.0/tests/unit/test_fusion.py +139 -0
- prompt_armor-0.1.0/tests/unit/test_l1_regex.py +127 -0
- prompt_armor-0.1.0/tests/unit/test_l2_classifier.py +109 -0
- prompt_armor-0.1.0/tests/unit/test_l3_similarity.py +81 -0
- prompt_armor-0.1.0/tests/unit/test_l4_structural.py +95 -0
- prompt_armor-0.1.0/tests/unit/test_models.py +118 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Bug Report
|
|
3
|
+
about: Report a bug in prompt-armor
|
|
4
|
+
title: "[Bug] "
|
|
5
|
+
labels: bug
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
**Describe the bug**
|
|
9
|
+
A clear description of what the bug is.
|
|
10
|
+
|
|
11
|
+
**To Reproduce**
|
|
12
|
+
```python
|
|
13
|
+
from prompt_armor import analyze
|
|
14
|
+
result = analyze("your prompt here")
|
|
15
|
+
# Expected: ...
|
|
16
|
+
# Actual: ...
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
**Environment**
|
|
20
|
+
- OS: [e.g., macOS, Ubuntu]
|
|
21
|
+
- Python version: [e.g., 3.12]
|
|
22
|
+
- prompt-armor version: [e.g., 0.1.0]
|
|
23
|
+
- ML model installed: [yes/no]
|
|
24
|
+
|
|
25
|
+
**Additional context**
|
|
26
|
+
Any other relevant information.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Feature Request
|
|
3
|
+
about: Suggest an improvement
|
|
4
|
+
title: "[Feature] "
|
|
5
|
+
labels: enhancement
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
**Problem**
|
|
9
|
+
What problem does this solve?
|
|
10
|
+
|
|
11
|
+
**Proposed Solution**
|
|
12
|
+
What would you like to happen?
|
|
13
|
+
|
|
14
|
+
**Alternatives Considered**
|
|
15
|
+
Any other approaches you've thought about.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
## What does this PR do?
|
|
2
|
+
|
|
3
|
+
Brief description.
|
|
4
|
+
|
|
5
|
+
## Type of change
|
|
6
|
+
|
|
7
|
+
- [ ] Bug fix
|
|
8
|
+
- [ ] New feature
|
|
9
|
+
- [ ] New attack samples / regex rules
|
|
10
|
+
- [ ] Documentation
|
|
11
|
+
- [ ] Performance improvement
|
|
12
|
+
|
|
13
|
+
## Checklist
|
|
14
|
+
|
|
15
|
+
- [ ] Tests pass (`pytest tests/ -v`)
|
|
16
|
+
- [ ] Lint passes (`ruff check src/ tests/`)
|
|
17
|
+
- [ ] Benchmark updated if detection logic changed
|
|
18
|
+
- [ ] Documentation updated if needed
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
name: Benchmark
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
benchmark:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
|
|
14
|
+
- name: Set up Python
|
|
15
|
+
uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.12"
|
|
18
|
+
|
|
19
|
+
- name: Install uv
|
|
20
|
+
uses: astral-sh/setup-uv@v4
|
|
21
|
+
|
|
22
|
+
- name: Install dependencies
|
|
23
|
+
run: |
|
|
24
|
+
uv venv
|
|
25
|
+
uv pip install -e ".[dev,ml]"
|
|
26
|
+
|
|
27
|
+
- name: Run benchmark
|
|
28
|
+
run: |
|
|
29
|
+
source .venv/bin/activate
|
|
30
|
+
python tests/benchmark/run_benchmark.py --output benchmark_results.json
|
|
31
|
+
|
|
32
|
+
- name: Upload results
|
|
33
|
+
uses: actions/upload-artifact@v4
|
|
34
|
+
with:
|
|
35
|
+
name: benchmark-results
|
|
36
|
+
path: benchmark_results.json
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main, dev]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main, dev]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: ${{ matrix.python-version }}
|
|
23
|
+
|
|
24
|
+
- name: Install uv
|
|
25
|
+
uses: astral-sh/setup-uv@v4
|
|
26
|
+
|
|
27
|
+
- name: Install dependencies
|
|
28
|
+
run: |
|
|
29
|
+
uv venv --python ${{ matrix.python-version }}
|
|
30
|
+
uv pip install -e ".[dev,ml,mcp]"
|
|
31
|
+
|
|
32
|
+
- name: Lint
|
|
33
|
+
run: |
|
|
34
|
+
source .venv/bin/activate
|
|
35
|
+
ruff check src/ tests/
|
|
36
|
+
|
|
37
|
+
- name: Format check
|
|
38
|
+
run: |
|
|
39
|
+
source .venv/bin/activate
|
|
40
|
+
ruff format --check src/ tests/
|
|
41
|
+
|
|
42
|
+
- name: Type check
|
|
43
|
+
run: |
|
|
44
|
+
source .venv/bin/activate
|
|
45
|
+
mypy src/prompt_armor/ --ignore-missing-imports
|
|
46
|
+
|
|
47
|
+
- name: Unit tests
|
|
48
|
+
run: |
|
|
49
|
+
source .venv/bin/activate
|
|
50
|
+
pytest tests/unit/ -v --tb=short
|
|
51
|
+
|
|
52
|
+
- name: Integration tests
|
|
53
|
+
run: |
|
|
54
|
+
source .venv/bin/activate
|
|
55
|
+
pytest tests/integration/ -v --tb=short
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
permissions:
|
|
11
|
+
id-token: write
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Set up Python
|
|
17
|
+
uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.12"
|
|
20
|
+
|
|
21
|
+
- name: Install build tools
|
|
22
|
+
run: pip install build
|
|
23
|
+
|
|
24
|
+
- name: Build package
|
|
25
|
+
run: python -m build
|
|
26
|
+
|
|
27
|
+
- name: Publish to PyPI
|
|
28
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
*.whl
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
|
|
16
|
+
# IDE
|
|
17
|
+
.vscode/
|
|
18
|
+
.idea/
|
|
19
|
+
*.swp
|
|
20
|
+
*.swo
|
|
21
|
+
*~
|
|
22
|
+
|
|
23
|
+
# OS
|
|
24
|
+
.DS_Store
|
|
25
|
+
Thumbs.db
|
|
26
|
+
|
|
27
|
+
# Testing
|
|
28
|
+
.coverage
|
|
29
|
+
htmlcov/
|
|
30
|
+
.pytest_cache/
|
|
31
|
+
|
|
32
|
+
# mypy
|
|
33
|
+
.mypy_cache/
|
|
34
|
+
|
|
35
|
+
# ruff
|
|
36
|
+
.ruff_cache/
|
|
37
|
+
|
|
38
|
+
# Models (distributed via download-on-first-use, too large for git)
|
|
39
|
+
src/prompt_armor/data/models/*.onnx
|
|
40
|
+
src/prompt_armor/data/models/*.bin
|
|
41
|
+
src/prompt_armor/data/models/*.npy
|
|
42
|
+
src/prompt_armor/data/models/*.json
|
|
43
|
+
!src/prompt_armor/data/models/.gitkeep
|
|
44
|
+
|
|
45
|
+
# Script outputs (generated, not tracked)
|
|
46
|
+
scripts/*.json
|
|
47
|
+
|
|
48
|
+
# mkdocs build output
|
|
49
|
+
site/
|
|
50
|
+
|
|
51
|
+
# Environment
|
|
52
|
+
.env
|
|
53
|
+
.env.local
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# prompt-armor configuration
|
|
2
|
+
# See: https://github.com/prompt-armor/prompt-armor
|
|
3
|
+
|
|
4
|
+
# Layer weights (must sum to ~1.0 for available layers)
|
|
5
|
+
weights:
|
|
6
|
+
l1_regex: 0.20
|
|
7
|
+
l2_classifier: 0.30
|
|
8
|
+
l3_similarity: 0.30
|
|
9
|
+
l4_structural: 0.20
|
|
10
|
+
|
|
11
|
+
# Decision thresholds
|
|
12
|
+
thresholds:
|
|
13
|
+
allow_below: 0.3 # Score below this = ALLOW
|
|
14
|
+
block_above: 0.7 # Score above this = BLOCK
|
|
15
|
+
hard_block: 0.95 # Any single layer above this = instant BLOCK
|
|
16
|
+
min_confidence: 0.5 # Below this confidence = needs_council
|
|
17
|
+
|
|
18
|
+
# Fusion parameters
|
|
19
|
+
convergence_boost: 0.10 # Score boost when layers agree
|
|
20
|
+
divergence_penalty: 0.15 # Confidence penalty when layers disagree
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to prompt-armor will be documented in this file.
|
|
4
|
+
|
|
5
|
+
## [0.1.1] - 2026-03-20
|
|
6
|
+
|
|
7
|
+
### Security
|
|
8
|
+
- Thread-safe singleton initialization (double-checked locking) — fixes race condition
|
|
9
|
+
- Context manager support on LiteEngine (`with LiteEngine() as engine:`)
|
|
10
|
+
- atexit handler for ThreadPoolExecutor cleanup — prevents thread leaks
|
|
11
|
+
- Fail-open layer setup — broken layers are disabled instead of crashing the engine
|
|
12
|
+
- Fix `concurrent.futures.TimeoutError` handling on Python 3.10
|
|
13
|
+
- Pin L2 model download by commit SHA — supply chain hardening
|
|
14
|
+
- Remove `trust_remote_code=True` from all dataset scripts
|
|
15
|
+
- Path traversal validation on `rules_path` / `attacks_path` in config
|
|
16
|
+
- Scrub PII from known_attacks.jsonl (emails, usernames)
|
|
17
|
+
- Fix ReDoS patterns (JB-003 bounded quantifier, DE-003 backtracking)
|
|
18
|
+
- Fix overly broad ML-ES-003 Spanish pattern (require 'ahora' prefix)
|
|
19
|
+
|
|
20
|
+
### Changed
|
|
21
|
+
- Frozen dataclasses now use `tuple` instead of `list` for true immutability
|
|
22
|
+
- Shared `CATEGORY_MAP` in models.py (DRY: was duplicated in L1 and L3)
|
|
23
|
+
- Shared `ShieldResult.to_dict()` method (DRY: was duplicated in CLI and MCP)
|
|
24
|
+
- Pre-compiled fiction/educational context patterns in L1 (was recompiling per call)
|
|
25
|
+
- Replace `assert isinstance` with proper `TypeError` raises
|
|
26
|
+
- CI triggers on `dev` branch, uses correct Python version from matrix
|
|
27
|
+
- Catch `Exception` (not just `ImportError`) when loading optional layers
|
|
28
|
+
|
|
29
|
+
### Added
|
|
30
|
+
- `ShieldResult.to_dict()` method for JSON serialization
|
|
31
|
+
- `LiteEngine.__enter__` / `__exit__` context manager protocol
|
|
32
|
+
- Input type validation (`TypeError` on non-str input)
|
|
33
|
+
- Git workflow documentation in CLAUDE.md
|
|
34
|
+
|
|
35
|
+
## [0.1.0] - 2026-03-19
|
|
36
|
+
|
|
37
|
+
### Added
|
|
38
|
+
- 4-layer parallel analysis engine (L1 Regex, L2 DeBERTa Classifier, L3 Semantic Similarity, L4 Structural)
|
|
39
|
+
- Trained logistic regression meta-classifier for score fusion
|
|
40
|
+
- Sliding window segmentation for compound injection detection
|
|
41
|
+
- Unicode NFKC normalization + zero-width character stripping
|
|
42
|
+
- Multilingual regex rules (DE, ES, FR, PT)
|
|
43
|
+
- Multilingual embedding model (paraphrase-multilingual-MiniLM-L12-v2) for L3
|
|
44
|
+
- CLI with `analyze`, `scan`, `config` commands and semantic exit codes
|
|
45
|
+
- MCP Server with `analyze_prompt` tool
|
|
46
|
+
- Per-layer timeout (2s) and fail-open error handling
|
|
47
|
+
- Input length guard (50K chars) and segment cap (10)
|
|
48
|
+
- Public benchmark dataset (258 benign + 97 malicious from deepset, TrustAIRLab, Lakera Gandalf)
|
|
49
|
+
- YAML configuration (`.prompt-armor.yml`)
|
|
50
|
+
- 103 tests (unit + integration)
|
|
51
|
+
- GitHub Actions CI (tests, benchmark, publish)
|
|
52
|
+
|
|
53
|
+
### Benchmark Results
|
|
54
|
+
- Held-out F1: 93.0% (30% test set, never seen during training)
|
|
55
|
+
- Full dataset: Precision 79.8%, Recall 93.8%, F1 86.3%
|
|
56
|
+
- Average latency: ~19ms
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
prompt-armor is an open-core LLM prompt security analysis tool. It detects prompt injections, jailbreaks, and other attacks against LLMs. The Lite engine runs 4 analysis layers in parallel, fuses scores via a trained meta-classifier, and returns decisions in ~19ms offline.
|
|
8
|
+
|
|
9
|
+
## Commands
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Install (editable, with all extras)
|
|
13
|
+
pip install -e ".[dev,ml,mcp]"
|
|
14
|
+
|
|
15
|
+
# Run tests
|
|
16
|
+
pytest tests/ -v
|
|
17
|
+
pytest tests/unit/ -v # unit only
|
|
18
|
+
pytest tests/unit/test_l1_regex.py -v # single file
|
|
19
|
+
pytest -k "test_detects_injection" -v # single test by name
|
|
20
|
+
|
|
21
|
+
# Lint & format
|
|
22
|
+
ruff check src/ tests/
|
|
23
|
+
ruff format src/ tests/
|
|
24
|
+
mypy src/prompt_armor/
|
|
25
|
+
|
|
26
|
+
# CLI
|
|
27
|
+
prompt-armor analyze "some prompt"
|
|
28
|
+
prompt-armor analyze --file prompt.txt --json
|
|
29
|
+
prompt-armor scan --dir ./prompts/ --format table
|
|
30
|
+
|
|
31
|
+
# Benchmark
|
|
32
|
+
python tests/benchmark/run_benchmark.py
|
|
33
|
+
|
|
34
|
+
# MCP Server
|
|
35
|
+
prompt-armor-mcp
|
|
36
|
+
|
|
37
|
+
# Retrain fusion meta-classifier (after changing layers or dataset)
|
|
38
|
+
python scripts/dump_layer_scores.py
|
|
39
|
+
python scripts/train_fusion.py
|
|
40
|
+
|
|
41
|
+
# Rebuild attack database from public sources
|
|
42
|
+
python scripts/build_attack_db.py
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Architecture
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
INPUT → NORMALIZE → SEGMENT (if >150 words) → [L1 | L2 | L3 | L4] → META-CLASSIFIER → GATE → OUTPUT
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
The core pipeline runs 4 analysis layers **in parallel** via `ThreadPoolExecutor`, feeds scores into a trained logistic regression meta-classifier, and applies decision thresholds:
|
|
52
|
+
|
|
53
|
+
- **`engine.py` (LiteEngine)** — Orchestrates: Unicode normalization, sliding window segmentation, parallel layer dispatch, per-layer timeout (2s) with fail-open.
|
|
54
|
+
- **`layers/l1_regex.py`** — 40+ English + 20 multilingual (DE/ES/FR/PT) weighted regex rules. Context modifier exploit hardened (high scores not dampened).
|
|
55
|
+
- **`layers/l2_classifier.py`** — DeBERTa-v3-xsmall (22M params, ONNX) with score calibration. Auto-downloads from HuggingFace on first use. Falls back to keyword heuristic.
|
|
56
|
+
- **`layers/l3_similarity.py`** — paraphrase-multilingual-MiniLM-L12-v2 + FAISS cosine similarity against 1,151 known attacks.
|
|
57
|
+
- **`layers/l4_structural.py`** — Deterministic: imperative verb ratios, delimiter injection, encoding tricks, expanded role assignment with benign whitelist.
|
|
58
|
+
- **`fusion.py`** — Trained LogisticRegression meta-classifier (9 features: 4 layer scores + max + min + interactions + n_above_0.1). L3/L4 coefficients clamped to 0 to prevent exploitation.
|
|
59
|
+
- **`models.py`** — Frozen dataclasses: `ShieldResult`, `LayerResult`, `Evidence`, `Decision`, `Category`.
|
|
60
|
+
- **`config.py`** — Pydantic models for YAML config (`.prompt-armor.yml`).
|
|
61
|
+
|
|
62
|
+
### Key conventions
|
|
63
|
+
|
|
64
|
+
- **dataclass for output types, Pydantic for config only**
|
|
65
|
+
- **Layers are CPU-bound** — ThreadPoolExecutor (not asyncio) because ONNX/FAISS/numpy release the GIL
|
|
66
|
+
- **Public API is `prompt_armor.analyze()`** — lazy-initialized in `__init__.py`
|
|
67
|
+
- **CLI exit codes** — 0=allow, 1=warn, 2=block, 3=error
|
|
68
|
+
- **MCP server is Python** — Uses `mcp` SDK (FastMCP)
|
|
69
|
+
- **Meta-classifier coefficients are hardcoded in fusion.py** — retrain via `scripts/train_fusion.py` if layers or dataset change
|
|
70
|
+
- **L2 model auto-downloads** — from HuggingFace Hub on first use (~83MB)
|
|
71
|
+
|
|
72
|
+
### Data files
|
|
73
|
+
|
|
74
|
+
- `data/rules/default_rules.yml` — L1 regex rules (EN + DE/ES/FR/PT)
|
|
75
|
+
- `data/attacks/known_attacks.jsonl` — L3 attack DB (1,151 entries)
|
|
76
|
+
- `data/models/` — L2 ONNX model (auto-downloaded, not in git)
|
|
77
|
+
|
|
78
|
+
## Git Workflow (MANDATORY)
|
|
79
|
+
|
|
80
|
+
### Branches
|
|
81
|
+
| Branch | Role |
|
|
82
|
+
|--------|------|
|
|
83
|
+
| `main` | Production — never commit directly |
|
|
84
|
+
| `dev` | Staging — receives merges from feature branches via PR |
|
|
85
|
+
| `feature/*`, `fix/*`, `refactor/*`, `chore/*`, `docs/*` | Work branches — always branch from `dev` |
|
|
86
|
+
| `hotfix/*` | Emergency fixes — branch from `main`, PR to `main`, then sync `dev` |
|
|
87
|
+
|
|
88
|
+
### Flow
|
|
89
|
+
1. Branch from `dev`: `git checkout dev && git pull && git checkout -b feature/name`
|
|
90
|
+
2. Atomic commits with Conventional Commits: `type(scope): description`
|
|
91
|
+
3. Push and PR to `dev`: `git push -u origin feature/name && gh pr create --base dev`
|
|
92
|
+
4. Squash merge feature → dev
|
|
93
|
+
5. When ready for release: PR `dev` → `main` with merge commit (not squash)
|
|
94
|
+
6. Tag on `main`: `git tag -a vX.Y.Z -m "..." && git push origin vX.Y.Z`
|
|
95
|
+
|
|
96
|
+
### Commit Format
|
|
97
|
+
```
|
|
98
|
+
type(scope): description in English, imperative mood, no period
|
|
99
|
+
```
|
|
100
|
+
Types: `feat`, `fix`, `refactor`, `style`, `docs`, `test`, `chore`, `perf`, `ci`
|
|
101
|
+
|
|
102
|
+
### Strict Rules
|
|
103
|
+
- NEVER commit directly to `main` or `dev`
|
|
104
|
+
- NEVER force-push to `main`
|
|
105
|
+
- NEVER PR a feature directly to `main` (only hotfix)
|
|
106
|
+
- Squash merge: `feature/*` → `dev`
|
|
107
|
+
- Merge commit: `dev` → `main`
|
|
108
|
+
- One commit = one logical change
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Contributing to prompt-armor
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in contributing! Here's how you can help.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
git clone https://github.com/prompt-armor/prompt-armor
|
|
9
|
+
cd prompt-armor
|
|
10
|
+
pip install -e ".[dev,ml,mcp]"
|
|
11
|
+
pytest tests/ -v
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Ways to Contribute
|
|
15
|
+
|
|
16
|
+
### Add Attack Samples (Easiest)
|
|
17
|
+
|
|
18
|
+
The most impactful contribution is expanding the attack database. Add entries to:
|
|
19
|
+
|
|
20
|
+
- `src/prompt_armor/data/attacks/known_attacks.jsonl` — Known attack prompts for L3 similarity matching
|
|
21
|
+
- `tests/benchmark/dataset/malicious.jsonl` — Labeled attack prompts for benchmarking
|
|
22
|
+
|
|
23
|
+
Format:
|
|
24
|
+
```json
|
|
25
|
+
{"text": "the attack prompt", "category": "prompt_injection", "source": "your_source"}
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Categories: `prompt_injection`, `jailbreak`, `identity_override`, `system_prompt_leak`, `instruction_bypass`, `data_exfiltration`, `encoding_attack`, `social_engineering`
|
|
29
|
+
|
|
30
|
+
### Add Regex Rules
|
|
31
|
+
|
|
32
|
+
Add patterns to `src/prompt_armor/data/rules/default_rules.yml`. Each rule needs:
|
|
33
|
+
- Unique ID (e.g., `PI-011` for prompt injection rule 11)
|
|
34
|
+
- Regex pattern (case-insensitive by default)
|
|
35
|
+
- Category, weight (0.0-1.0), and description
|
|
36
|
+
|
|
37
|
+
Multilingual rules are welcome (DE, ES, FR, PT, and any other language).
|
|
38
|
+
|
|
39
|
+
### Bug Fixes and Improvements
|
|
40
|
+
|
|
41
|
+
1. Fork the repo
|
|
42
|
+
2. Create a branch (`git checkout -b fix/your-fix`)
|
|
43
|
+
3. Make your changes
|
|
44
|
+
4. Ensure tests pass: `pytest tests/ -v`
|
|
45
|
+
5. Ensure lint passes: `ruff check src/ tests/`
|
|
46
|
+
6. Submit a PR
|
|
47
|
+
|
|
48
|
+
## Development
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Run tests
|
|
52
|
+
pytest tests/ -v
|
|
53
|
+
|
|
54
|
+
# Run a single test file
|
|
55
|
+
pytest tests/unit/test_l1_regex.py -v
|
|
56
|
+
|
|
57
|
+
# Lint
|
|
58
|
+
ruff check src/ tests/
|
|
59
|
+
|
|
60
|
+
# Format
|
|
61
|
+
ruff format src/ tests/
|
|
62
|
+
|
|
63
|
+
# Type check
|
|
64
|
+
mypy src/prompt_armor/
|
|
65
|
+
|
|
66
|
+
# Run benchmark
|
|
67
|
+
python tests/benchmark/run_benchmark.py
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Code Style
|
|
71
|
+
|
|
72
|
+
- Python 3.10+
|
|
73
|
+
- Formatted with `ruff`
|
|
74
|
+
- Type hints on all public functions
|
|
75
|
+
- `dataclass(frozen=True, slots=True)` for result types
|
|
76
|
+
- `Pydantic` only for config validation
|
|
77
|
+
|
|
78
|
+
## Pull Request Guidelines
|
|
79
|
+
|
|
80
|
+
- Keep PRs focused — one feature or fix per PR
|
|
81
|
+
- Add tests for new functionality
|
|
82
|
+
- Update the benchmark if you change detection logic
|
|
83
|
+
- Run `ruff check` and `pytest` before submitting
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 llm-shield contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
.PHONY: install test test-unit lint format typecheck bench docs clean
|
|
2
|
+
|
|
3
|
+
install:
|
|
4
|
+
pip install -e ".[dev,mcp]"
|
|
5
|
+
|
|
6
|
+
test:
|
|
7
|
+
pytest tests/ -v
|
|
8
|
+
|
|
9
|
+
test-unit:
|
|
10
|
+
pytest tests/unit/ -v
|
|
11
|
+
|
|
12
|
+
lint:
|
|
13
|
+
ruff check src/ tests/
|
|
14
|
+
|
|
15
|
+
format:
|
|
16
|
+
ruff format src/ tests/
|
|
17
|
+
|
|
18
|
+
typecheck:
|
|
19
|
+
mypy src/prompt_armor/
|
|
20
|
+
|
|
21
|
+
bench:
|
|
22
|
+
python tests/benchmark/run_benchmark.py
|
|
23
|
+
|
|
24
|
+
docs:
|
|
25
|
+
mkdocs serve
|
|
26
|
+
|
|
27
|
+
clean:
|
|
28
|
+
rm -rf build/ dist/ *.egg-info .pytest_cache .mypy_cache .ruff_cache htmlcov/
|
|
29
|
+
find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
|