agentprobe-injection 0.2.0a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentprobe_injection-0.2.0a1/.github/workflows/ci.yml +58 -0
- agentprobe_injection-0.2.0a1/.gitignore +14 -0
- agentprobe_injection-0.2.0a1/CONTRIBUTING.md +62 -0
- agentprobe_injection-0.2.0a1/LICENSE +21 -0
- agentprobe_injection-0.2.0a1/PKG-INFO +281 -0
- agentprobe_injection-0.2.0a1/README.md +242 -0
- agentprobe_injection-0.2.0a1/RESILIENCE.md +204 -0
- agentprobe_injection-0.2.0a1/SECURITY.md +41 -0
- agentprobe_injection-0.2.0a1/TROUBLESHOOTING.md +316 -0
- agentprobe_injection-0.2.0a1/agentprobe/__init__.py +3 -0
- agentprobe_injection-0.2.0a1/agentprobe/adapters/__init__.py +16 -0
- agentprobe_injection-0.2.0a1/agentprobe/adapters/dummy.py +128 -0
- agentprobe_injection-0.2.0a1/agentprobe/adapters/http.py +56 -0
- agentprobe_injection-0.2.0a1/agentprobe/adapters/http_async.py +228 -0
- agentprobe_injection-0.2.0a1/agentprobe/attacks/__init__.py +6 -0
- agentprobe_injection-0.2.0a1/agentprobe/attacks/base.py +50 -0
- agentprobe_injection-0.2.0a1/agentprobe/attacks/registry.py +82 -0
- agentprobe_injection-0.2.0a1/agentprobe/attacks/transforms.py +183 -0
- agentprobe_injection-0.2.0a1/agentprobe/cli.py +334 -0
- agentprobe_injection-0.2.0a1/agentprobe/engine.py +151 -0
- agentprobe_injection-0.2.0a1/agentprobe/engine_async.py +183 -0
- agentprobe_injection-0.2.0a1/agentprobe/harness_utility.py +130 -0
- agentprobe_injection-0.2.0a1/agentprobe/injection/__init__.py +19 -0
- agentprobe_injection-0.2.0a1/agentprobe/injection/benign_tasks.py +174 -0
- agentprobe_injection-0.2.0a1/agentprobe/injection/carriers.py +200 -0
- agentprobe_injection-0.2.0a1/agentprobe/injection/defenses.py +98 -0
- agentprobe_injection-0.2.0a1/agentprobe/injection/oracle.py +75 -0
- agentprobe_injection-0.2.0a1/agentprobe/injection/screening.py +78 -0
- agentprobe_injection-0.2.0a1/agentprobe/llm_oracle.py +94 -0
- agentprobe_injection-0.2.0a1/agentprobe/logging_config.py +108 -0
- agentprobe_injection-0.2.0a1/agentprobe/metrics.py +199 -0
- agentprobe_injection-0.2.0a1/agentprobe/models.py +19 -0
- agentprobe_injection-0.2.0a1/agentprobe/oracle.py +193 -0
- agentprobe_injection-0.2.0a1/agentprobe/oracle_legacy.py +83 -0
- agentprobe_injection-0.2.0a1/agentprobe/oracle_semantic.py +221 -0
- agentprobe_injection-0.2.0a1/agentprobe/report.py +183 -0
- agentprobe_injection-0.2.0a1/agentprobe/target.py +44 -0
- agentprobe_injection-0.2.0a1/data/gpt4o.csv +505 -0
- agentprobe_injection-0.2.0a1/data/gpt4omini.csv +505 -0
- agentprobe_injection-0.2.0a1/data/haiku45.csv +505 -0
- agentprobe_injection-0.2.0a1/data/utility_gpt4omini.csv +121 -0
- agentprobe_injection-0.2.0a1/examples/tool_agent.py +180 -0
- agentprobe_injection-0.2.0a1/mcnemar_test.py +146 -0
- agentprobe_injection-0.2.0a1/plot_pareto.py +91 -0
- agentprobe_injection-0.2.0a1/plot_results.py +124 -0
- agentprobe_injection-0.2.0a1/pyproject.toml +49 -0
- agentprobe_injection-0.2.0a1/results/carrier_heatmap_gpt4omini.png +0 -0
- agentprobe_injection-0.2.0a1/results/carrier_heatmap_haiku45.png +0 -0
- agentprobe_injection-0.2.0a1/results/defense_leak_rates.png +0 -0
- agentprobe_injection-0.2.0a1/results/pareto_gpt4o.png +0 -0
- agentprobe_injection-0.2.0a1/results/pareto_gpt4omini.png +0 -0
- agentprobe_injection-0.2.0a1/run_injection_stats.py +212 -0
- agentprobe_injection-0.2.0a1/run_utility_harness.py +227 -0
- agentprobe_injection-0.2.0a1/tests/test_adapters.py +177 -0
- agentprobe_injection-0.2.0a1/tests/test_async_http.py +315 -0
- agentprobe_injection-0.2.0a1/tests/test_attacks.py +172 -0
- agentprobe_injection-0.2.0a1/tests/test_engine.py +188 -0
- agentprobe_injection-0.2.0a1/tests/test_engine_async.py +379 -0
- agentprobe_injection-0.2.0a1/tests/test_false_positives.py +260 -0
- agentprobe_injection-0.2.0a1/tests/test_logging.py +160 -0
- agentprobe_injection-0.2.0a1/tests/test_metrics.py +269 -0
- agentprobe_injection-0.2.0a1/tests/test_oracle.py +216 -0
- agentprobe_injection-0.2.0a1/tests/test_oracle_integration.py +331 -0
- agentprobe_injection-0.2.0a1/tests/test_oracle_semantic.py +368 -0
- agentprobe_injection-0.2.0a1/tests/test_report.py +244 -0
- agentprobe_injection-0.2.0a1/tests/test_smoke.py +37 -0
- agentprobe_injection-0.2.0a1/tests/test_step3_integration.py +223 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [ main ]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [ main ]
|
|
8
|
+
workflow_dispatch:
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
test:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
strategy:
|
|
14
|
+
fail-fast: false
|
|
15
|
+
matrix:
|
|
16
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- name: Checkout repository
|
|
20
|
+
uses: actions/checkout@v4
|
|
21
|
+
|
|
22
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
23
|
+
uses: actions/setup-python@v5
|
|
24
|
+
with:
|
|
25
|
+
python-version: ${{ matrix.python-version }}
|
|
26
|
+
cache: pip
|
|
27
|
+
|
|
28
|
+
- name: Install package + dev extras
|
|
29
|
+
run: |
|
|
30
|
+
python -m pip install --upgrade pip
|
|
31
|
+
pip install -e ".[dev,openai]"
|
|
32
|
+
|
|
33
|
+
- name: Run pytest
|
|
34
|
+
# No API keys are set on purpose. Without OPENAI_API_KEY the semantic
|
|
35
|
+
# oracle is unavailable and judge()/run_scan() fall back to the offline
|
|
36
|
+
# legacy oracle, so the suite runs fully offline and deterministically.
|
|
37
|
+
# Live-LLM behavior is covered by mocked tests (test_oracle_semantic.py).
|
|
38
|
+
run: |
|
|
39
|
+
pytest tests/ -v --tb=short
|
|
40
|
+
|
|
41
|
+
lint:
|
|
42
|
+
runs-on: ubuntu-latest
|
|
43
|
+
steps:
|
|
44
|
+
- name: Checkout repository
|
|
45
|
+
uses: actions/checkout@v4
|
|
46
|
+
|
|
47
|
+
- name: Set up Python
|
|
48
|
+
uses: actions/setup-python@v5
|
|
49
|
+
with:
|
|
50
|
+
python-version: "3.12"
|
|
51
|
+
cache: pip
|
|
52
|
+
|
|
53
|
+
- name: Install ruff
|
|
54
|
+
run: pip install "ruff>=0.4"
|
|
55
|
+
|
|
56
|
+
- name: Run ruff check
|
|
57
|
+
run: ruff check agentprobe/ tests/ || echo "::warning::Lint warnings present"
|
|
58
|
+
continue-on-error: true
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Contributing to AgentProbe
|
|
2
|
+
|
|
3
|
+
Thanks for your interest. AgentProbe is an alpha research/defensive tool, so the
|
|
4
|
+
bar is "correct, honest, and reproducible" over "feature-complete."
|
|
5
|
+
|
|
6
|
+
## Ground rules
|
|
7
|
+
|
|
8
|
+
1. **No fabricated results.** Every number in docs/README must trace back to a
|
|
9
|
+
real run with a CSV in `data/` (or `results/`). Illustrative output must be
|
|
10
|
+
labeled as illustrative.
|
|
11
|
+
2. **Defensive framing only.** Contributions that turn this into a portable
|
|
12
|
+
attack/bypass toolkit will be rejected. See [SECURITY.md](SECURITY.md).
|
|
13
|
+
3. **Tests required.** New behavior needs tests. Bug fixes should include a
|
|
14
|
+
regression test where practical.
|
|
15
|
+
|
|
16
|
+
## Dev setup
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
git clone https://github.com/Samgar-kz/agentprobe.git
|
|
20
|
+
cd agentprobe
|
|
21
|
+
python -m venv .venv && source .venv/bin/activate
|
|
22
|
+
pip install -e ".[dev,openai]"
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Before opening a PR
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
# Run the test suite
|
|
29
|
+
pytest tests/ -v
|
|
30
|
+
|
|
31
|
+
# Lint
|
|
32
|
+
ruff check agentprobe/ tests/
|
|
33
|
+
|
|
34
|
+
# (optional) format
|
|
35
|
+
black agentprobe/ tests/
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
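CI runs pytest on Python 3.10 / 3.11 / 3.12, plus a ruff lint job that is currently advisory (it reports warnings but does not fail the build). The pytest jobs must be green.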
|
|
39
|
+
|
|
40
|
+
## Adding an attack transform
|
|
41
|
+
|
|
42
|
+
Attack transforms live in `agentprobe/attacks/transforms.py` and are registered
|
|
43
|
+
via `registry.py`. Each transform needs:
|
|
44
|
+
- a unique `name`
|
|
45
|
+
- a `category` (one of: `classic`, `pragmatic`, `register`, `discourse`, `codeswitch`)
|
|
46
|
+
- a `rationale` explaining the linguistic hypothesis being tested
|
|
47
|
+
|
|
48
|
+
## Adding a defense
|
|
49
|
+
|
|
50
|
+
Defenses live in `agentprobe/injection/defenses.py` (or `screening.py` for the
|
|
51
|
+
separate-LLM-pass family). Use the existing `Defense` dataclass. The `name` you
|
|
52
|
+
choose is what appears in CSV/JSON reports, so keep it stable and snake_case.
|
|
53
|
+
|
|
54
|
+
## Adding a target adapter
|
|
55
|
+
|
|
56
|
+
Adapters live in `agentprobe/adapters/`. Implement the `Target` protocol from
|
|
57
|
+
`agentprobe/target.py`. Current adapters: `dummy`, `http`, `http_async`.
|
|
58
|
+
|
|
59
|
+
## Commit style
|
|
60
|
+
|
|
61
|
+
Conventional-commit-ish prefixes are appreciated: `feat:`, `fix:`, `docs:`,
|
|
62
|
+
`test:`, `ci:`, `chore:`.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Samgar Abdikozha
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agentprobe-injection
|
|
3
|
+
Version: 0.2.0a1
|
|
4
|
+
Summary: Harness for measuring LLM agent resistance to indirect prompt injection and comparing defense effectiveness.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Samgar-kz/agentprobe
|
|
6
|
+
Project-URL: Issues, https://github.com/Samgar-kz/agentprobe/issues
|
|
7
|
+
Author: Samgar
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: agents,ai-safety,llm,prompt-injection,security
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Information Technology
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Security
|
|
18
|
+
Classifier: Topic :: Software Development :: Testing
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Requires-Dist: anthropic>=0.25
|
|
21
|
+
Requires-Dist: httpx>=0.27
|
|
22
|
+
Requires-Dist: litellm>=1.30
|
|
23
|
+
Requires-Dist: matplotlib>=3.7
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Requires-Dist: pydantic>=2.5
|
|
26
|
+
Requires-Dist: rich>=13.7
|
|
27
|
+
Requires-Dist: scipy>=1.11
|
|
28
|
+
Requires-Dist: tenacity>=8.2
|
|
29
|
+
Requires-Dist: typer>=0.12
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: black>=24.0; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest-cov>=4.1; extra == 'dev'
|
|
34
|
+
Requires-Dist: pytest>=7.4; extra == 'dev'
|
|
35
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
36
|
+
Provides-Extra: openai
|
|
37
|
+
Requires-Dist: openai>=1.30; extra == 'openai'
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
# AgentProbe: Defense Evaluation Harness for LLM Agents
|
|
41
|
+
|
|
42
|
+
[](https://github.com/Samgar-kz/agentprobe/actions/workflows/ci.yml)
|
|
43
|
+
[](https://www.python.org/downloads/)
|
|
44
|
+
[](LICENSE)
|
|
45
|
+
[]()
|
|
46
|
+
|
|
47
|
+
## What This Is
|
|
48
|
+
|
|
49
|
+
A testing framework for measuring your LLM agent's **resistance to indirect prompt injection** and **comparing defense effectiveness**. Use it to test your own systems or those you have permission to test.
|
|
50
|
+
|
|
51
|
+
NOT an attack generator or bypass toolkit. NOT for probing other people's systems.
|
|
52
|
+
|
|
53
|
+
## Key Findings from Our Research
|
|
54
|
+
|
|
55
|
+
Our testing on gpt-4o-mini and claude-haiku-4-5 reveals three things:
|
|
56
|
+
|
|
57
|
+
1. **Surface-level linguistic transforms don't work on modern models**
|
|
58
|
+
- Pragmatic implicature, register shifts, code-switching: ~0% success rate
|
|
59
|
+
- Modern LLMs aren't fooled by just changing speech act or tone
|
|
60
|
+
|
|
61
|
+
2. **Indirect injection through data IS a real vulnerability**
|
|
62
|
+
- Instructions hidden in tool outputs (emails, documents, web pages) bypass prompt-level defenses
|
|
63
|
+
- Separation at prompt level is not enough
|
|
64
|
+
|
|
65
|
+
3. **Asymmetry: Models leak data more readily than execute unauthorized actions**
|
|
66
|
+
- Defending against information leakage != defending against tool abuse
|
|
67
|
+
- Different threat models need different defenses
|
|
68
|
+
|
|
69
|
+
## Results: Defense Effectiveness
|
|
70
|
+
|
|
71
|
+
**gpt-4o-mini**
|
|
72
|
+
|
|
73
|
+
Defense names below match the `defense` column in the CSV outputs (`data/`) and JSON reports.
|
|
74
|
+
|
|
75
|
+
| Defense (code name) | Leak Rate | N |
|
|
76
|
+
|---------------------|-----------|---|
|
|
77
|
+
| `none` (baseline) | 29.8% | 84 |
|
|
78
|
+
| `delimited` (delimiter wrap) | 25.0% | 84 |
|
|
79
|
+
| `instr_hierarchy` (privilege-level instruction) | 31.0% | 84 |
|
|
80
|
+
| `sandwich` (recency reinforcement) | 15.5% | 84 |
|
|
81
|
+
| `spotlight` (datamarking) | 6.0% | 84 |
|
|
82
|
+
| `llm_filter` (separate screening pass) | 0% | 84 |
|
|
83
|
+
|
|
84
|
+
For reference, the same battery on **gpt-4o** leaks much less (baseline 10.7%, `delimited`/`llm_filter` 0%), and **claude-haiku-4-5** holds 0% across every defense — so absolute numbers are model-specific; treat them as relative defense rankings, not universal constants.
|
|
85
|
+
|
|
86
|
+
**claude-haiku-4-5** holds baseline at 0% leak rate across all test conditions; defense differentiation is not measurable on this model.
|
|
87
|
+
|
|
88
|
+
### Key Finding: Screening (and datamarking) beat prompt-level defenses
|
|
89
|
+
|
|
90
|
+
The separate verification pass (`llm_filter`) achieved 0 successful leaks in 84 test runs on gpt-4o-mini. The next best is `spotlight` (datamarking) at 6.0%. By contrast, prompt-level instruction (`instr_hierarchy`, 31.0%) was *no better than baseline* (29.8%).
|
|
91
|
+
|
|
92
|
+
This suggests: **prompt-level instructions and delimiters are incomplete; either token-level datamarking or a separate, independent judgment pass is required to reliably catch injection.**
|
|
93
|
+
|
|
94
|
+
## How To Use
|
|
95
|
+
|
|
96
|
+
### Test Your Own Agent
|
|
97
|
+
|
|
98
|
+
> **Note:** The PyPI package is named `agentprobe-injection` (the plain
|
|
99
|
+
> `agentprobe` name was already taken). The import package and CLI command are
|
|
100
|
+
> still `agentprobe`.
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
# Install from PyPI
|
|
104
|
+
pip install agentprobe-injection
|
|
105
|
+
|
|
106
|
+
# Or install the latest from GitHub
|
|
107
|
+
pip install git+https://github.com/Samgar-kz/agentprobe.git
|
|
108
|
+
|
|
109
|
+
# Or clone for development
|
|
110
|
+
git clone https://github.com/Samgar-kz/agentprobe.git
|
|
111
|
+
cd agentprobe && pip install -e .
|
|
112
|
+
|
|
113
|
+
export OPENAI_API_KEY="..."
|
|
114
|
+
|
|
115
|
+
agentprobe scan \
|
|
116
|
+
--target dummy \
|
|
117
|
+
--oracle semantic \
|
|
118
|
+
--json-report results.json
|
|
119
|
+
|
|
120
|
+
# Check results
|
|
121
|
+
cat results.json | jq '.statistics'
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Available Defenses to Test
|
|
125
|
+
|
|
126
|
+
The harness measures effectiveness of these defenses:
|
|
127
|
+
|
|
128
|
+
1. **`none`** — baseline (no defense applied)
|
|
129
|
+
2. **`delimited`** — wrap data in `<<<UNTRUSTED_DATA_BEGIN>>>...<<<UNTRUSTED_DATA_END>>>` markers
|
|
130
|
+
3. **`spotlight`** — datamarking: mark every data token so the model separates data from instructions
|
|
131
|
+
4. **`sandwich`** — repeat the do-not-obey rule after the data (recency effect)
|
|
132
|
+
5. **`instr_hierarchy`** — tag data with an explicit low privilege level; assert system instructions outrank tool/data content
|
|
133
|
+
6. **`llm_filter`** — separate LLM verification pass to detect/strip injection before execution
|
|
134
|
+
|
|
135
|
+
Test each against YOUR agent. See which work, which break utility.
|
|
136
|
+
|
|
137
|
+
### How It Works
|
|
138
|
+
|
|
139
|
+
1. **Injection Generator:** Creates test payloads (carriers: email, document, web page) with hidden canary instructions
|
|
140
|
+
2. **Defense Applicator:** Wraps the data with each defense mechanism
|
|
141
|
+
3. **Target Adapter:** Sends to your agent, captures response
|
|
142
|
+
4. **Semantic Oracle:** Uses gpt-4o-mini to judge: did the agent leak data or follow the hidden instruction?
|
|
143
|
+
5. **Utility Harness:** Runs benign legitimate tasks to ensure defenses don't break normal functionality
|
|
144
|
+
6. **Report:** Table showing defense effectiveness + utility cost
|
|
145
|
+
|
|
146
|
+
### Defense vs Utility Trade-off
|
|
147
|
+
|
|
148
|
+
**Result:** All 5 defenses preserve utility on legitimate tasks (120/120 runs, 0% false-positive rate).
|
|
149
|
+
|
|
150
|
+
Tested on 8 benign tasks (extract dates, risks, budget, sentiment, action items, meeting notes, legitimately forward to internal address) with 3 repeats each:
|
|
151
|
+
|
|
152
|
+
| Defense | False-Positive Rate | Status |
|
|
153
|
+
|---------|-------------------|--------|
|
|
154
|
+
| `none` | 0% | baseline |
|
|
155
|
+
| `delimited` | 0% | safe to use |
|
|
156
|
+
| `spotlight` | 0% | safe to use |
|
|
157
|
+
| `sandwich` | 0% | safe to use |
|
|
158
|
+
| `instr_hierarchy` | 0% | safe to use |
|
|
159
|
+
| `llm_filter` | 0% | safe to use |
|
|
160
|
+
|
|
161
|
+
Conclusion: **Defenses do not break legitimate agent functionality** (in current test suite). Task success rate remains 100% across all defenses, making the injection effectiveness/defense trade-off directly comparable (both measured under same utility constraints).
|
|
162
|
+
|
|
163
|
+
Run your own: `python run_utility_harness.py --repeats=3 --temp=0.7 --out=utility_results.csv`
|
|
164
|
+
|
|
165
|
+
## Responsible Use
|
|
166
|
+
|
|
167
|
+
- **Only test systems you own or have written permission to test**
|
|
168
|
+
- Goal: understanding YOUR defenses, not generating portable bypasses
|
|
169
|
+
- Disclose findings responsibly (if testing third-party systems with permission)
|
|
170
|
+
- The framework measures vulnerability; it is not a jailbreak toolkit
|
|
171
|
+
|
|
172
|
+
## Architecture
|
|
173
|
+
|
|
174
|
+
```
|
|
175
|
+
agentprobe/
|
|
176
|
+
├── oracle_semantic.py # LLM-as-judge using gpt-4o-mini
|
|
177
|
+
├── oracle_legacy.py # Fallback: substring matching
|
|
178
|
+
├── oracle.py # Oracle interface
|
|
179
|
+
├── adapters/
|
|
180
|
+
│ ├── dummy.py # Built-in intentionally-vulnerable agent simulator
|
|
181
|
+
│ ├── http.py # Test any HTTP-accessible agent (sync)
|
|
182
|
+
│ └── http_async.py # Async HTTP adapter for concurrent scans
|
|
183
|
+
├── injection/
|
|
184
|
+
│ ├── carriers.py # Email, document, web page wrappers
|
|
185
|
+
│ ├── defenses.py # Defense mechanisms to evaluate
|
|
186
|
+
│ ├── benign_tasks.py # Utility harness tasks
|
|
187
|
+
│ └── screening.py # Screening defense (separate LLM pass)
|
|
188
|
+
├── engine.py # Synchronous scan
|
|
189
|
+
├── engine_async.py # Async scan
|
|
190
|
+
├── metrics.py # Statistical analysis (Wilson CI, effect sizes)
|
|
191
|
+
├── report.py # Report generation
|
|
192
|
+
├── logging_config.py # Structured logging, cost tracking
|
|
193
|
+
└── cli.py # Command-line interface
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## Command-Line Usage
|
|
197
|
+
|
|
198
|
+
### Basic scan
|
|
199
|
+
```bash
|
|
200
|
+
# Test dummy agent
|
|
201
|
+
agentprobe scan --target dummy
|
|
202
|
+
|
|
203
|
+
# Test HTTP agent
|
|
204
|
+
agentprobe scan --target http \
|
|
205
|
+
--endpoint http://localhost:8000/chat \
|
|
206
|
+
--input-field message \
|
|
207
|
+
--output-field reply
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
### Control oracle
|
|
211
|
+
```bash
|
|
212
|
+
# Use semantic oracle (default, requires OPENAI_API_KEY)
|
|
213
|
+
agentprobe scan --target dummy --oracle semantic
|
|
214
|
+
|
|
215
|
+
# Use legacy oracle (offline, pattern matching)
|
|
216
|
+
agentprobe scan --target dummy --oracle legacy
|
|
217
|
+
|
|
218
|
+
# Set confidence threshold
|
|
219
|
+
agentprobe scan --target dummy --oracle semantic --min-confidence 0.85
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
### Reports
|
|
223
|
+
```bash
|
|
224
|
+
# JSON report with statistics
|
|
225
|
+
agentprobe scan --target dummy --json-report results.json
|
|
226
|
+
|
|
227
|
+
# Verbose logging
|
|
228
|
+
agentprobe scan --target dummy --verbose 2
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
## Measurement Infrastructure
|
|
232
|
+
|
|
233
|
+
- **Oracle:** gpt-4o-mini with Structured Outputs (semantic judgment)
|
|
234
|
+
- **Test Harness:** Carriers simulate real data flows (email, document, web page)
|
|
235
|
+
- **Utility Harness:** Measures task success rate per defense on benign tasks (see *Defense vs Utility Trade-off* above)
|
|
236
|
+
- **Benchmarking:** Latency / throughput available via `--async --concurrency N` on HTTP targets
|
|
237
|
+
|
|
238
|
+
All numbers above are from actual test runs (CSVs in `data/`).
|
|
239
|
+
|
|
240
|
+
## Testing Your Own Code
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
# Run all tests
|
|
244
|
+
pytest tests/ -v
|
|
245
|
+
|
|
246
|
+
# Test a specific component
|
|
247
|
+
pytest tests/test_oracle_semantic.py -v
|
|
248
|
+
|
|
249
|
+
# Run with coverage
|
|
250
|
+
pytest tests/ --cov=agentprobe
|
|
251
|
+
|
|
252
|
+
# Benchmark async performance
|
|
253
|
+
agentprobe scan --target dummy --async --concurrency 15
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## What's NOT Included
|
|
257
|
+
|
|
258
|
+
- Evasion techniques or obfuscation tooling (intentionally)
|
|
259
|
+
- Zero-day exploits or novel vulnerabilities
|
|
260
|
+
- Portable bypass payloads designed to be transferable across different systems
|
|
261
|
+
|
|
262
|
+
**Note on linguistic transforms:** The harness *does* include pragmatic, register, discourse and code-switching (ru-en) categories — but as **measurement probes**, not as attack tooling. Our data shows surface-level linguistic transforms have ~0% success on modern frontier models, which is itself a useful finding for defenders deciding where to invest.
|
|
263
|
+
|
|
264
|
+
This is a **defensive measurement tool**, not an offensive toolkit.
|
|
265
|
+
|
|
266
|
+
## Citation
|
|
267
|
+
|
|
268
|
+
If you use this in research, cite as:
|
|
269
|
+
|
|
270
|
+
```bibtex
|
|
271
|
+
@misc{agentprobe2026,
|
|
272
|
+
title={AgentProbe: Evaluating LLM Agent Defenses Against Indirect Injection},
|
|
273
|
+
author={Samgar},
|
|
274
|
+
year={2026},
|
|
275
|
+
url={https://github.com/Samgar-kz/agentprobe}
|
|
276
|
+
}
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
## License
|
|
280
|
+
|
|
281
|
+
MIT
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
# AgentProbe: Defense Evaluation Harness for LLM Agents
|
|
2
|
+
|
|
3
|
+
[](https://github.com/Samgar-kz/agentprobe/actions/workflows/ci.yml)
|
|
4
|
+
[](https://www.python.org/downloads/)
|
|
5
|
+
[](LICENSE)
|
|
6
|
+
[]()
|
|
7
|
+
|
|
8
|
+
## What This Is
|
|
9
|
+
|
|
10
|
+
A testing framework for measuring your LLM agent's **resistance to indirect prompt injection** and **comparing defense effectiveness**. Use it to test your own systems or those you have permission to test.
|
|
11
|
+
|
|
12
|
+
NOT an attack generator or bypass toolkit. NOT for probing other people's systems.
|
|
13
|
+
|
|
14
|
+
## Key Findings from Our Research
|
|
15
|
+
|
|
16
|
+
Our testing on gpt-4o-mini and claude-haiku-4-5 reveals three things:
|
|
17
|
+
|
|
18
|
+
1. **Surface-level linguistic transforms don't work on modern models**
|
|
19
|
+
- Pragmatic implicature, register shifts, code-switching: ~0% success rate
|
|
20
|
+
- Modern LLMs aren't fooled by just changing speech act or tone
|
|
21
|
+
|
|
22
|
+
2. **Indirect injection through data IS a real vulnerability**
|
|
23
|
+
- Instructions hidden in tool outputs (emails, documents, web pages) bypass prompt-level defenses
|
|
24
|
+
- Separation at prompt level is not enough
|
|
25
|
+
|
|
26
|
+
3. **Asymmetry: Models leak data more readily than execute unauthorized actions**
|
|
27
|
+
- Defending against information leakage != defending against tool abuse
|
|
28
|
+
- Different threat models need different defenses
|
|
29
|
+
|
|
30
|
+
## Results: Defense Effectiveness
|
|
31
|
+
|
|
32
|
+
**gpt-4o-mini**
|
|
33
|
+
|
|
34
|
+
Defense names below match the `defense` column in the CSV outputs (`data/`) and JSON reports.
|
|
35
|
+
|
|
36
|
+
| Defense (code name) | Leak Rate | N |
|
|
37
|
+
|---------------------|-----------|---|
|
|
38
|
+
| `none` (baseline) | 29.8% | 84 |
|
|
39
|
+
| `delimited` (delimiter wrap) | 25.0% | 84 |
|
|
40
|
+
| `instr_hierarchy` (privilege-level instruction) | 31.0% | 84 |
|
|
41
|
+
| `sandwich` (recency reinforcement) | 15.5% | 84 |
|
|
42
|
+
| `spotlight` (datamarking) | 6.0% | 84 |
|
|
43
|
+
| `llm_filter` (separate screening pass) | 0% | 84 |
|
|
44
|
+
|
|
45
|
+
For reference, the same battery on **gpt-4o** leaks much less (baseline 10.7%, `delimited`/`llm_filter` 0%), and **claude-haiku-4-5** holds 0% across every defense — so absolute numbers are model-specific; treat them as relative defense rankings, not universal constants.
|
|
46
|
+
|
|
47
|
+
**claude-haiku-4-5** holds baseline at 0% leak rate across all test conditions; defense differentiation is not measurable on this model.
|
|
48
|
+
|
|
49
|
+
### Key Finding: Screening (and datamarking) beat prompt-level defenses
|
|
50
|
+
|
|
51
|
+
The separate verification pass (`llm_filter`) achieved 0 successful leaks in 84 test runs on gpt-4o-mini. The next best is `spotlight` (datamarking) at 6.0%. By contrast, prompt-level instruction (`instr_hierarchy`, 31.0%) was *no better than baseline* (29.8%).
|
|
52
|
+
|
|
53
|
+
This suggests: **prompt-level instructions and delimiters are incomplete; either token-level datamarking or a separate, independent judgment pass is required to reliably catch injection.**
|
|
54
|
+
|
|
55
|
+
## How To Use
|
|
56
|
+
|
|
57
|
+
### Test Your Own Agent
|
|
58
|
+
|
|
59
|
+
> **Note:** The PyPI package is named `agentprobe-injection` (the plain
|
|
60
|
+
> `agentprobe` name was already taken). The import package and CLI command are
|
|
61
|
+
> still `agentprobe`.
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Install from PyPI
|
|
65
|
+
pip install agentprobe-injection
|
|
66
|
+
|
|
67
|
+
# Or install the latest from GitHub
|
|
68
|
+
pip install git+https://github.com/Samgar-kz/agentprobe.git
|
|
69
|
+
|
|
70
|
+
# Or clone for development
|
|
71
|
+
git clone https://github.com/Samgar-kz/agentprobe.git
|
|
72
|
+
cd agentprobe && pip install -e .
|
|
73
|
+
|
|
74
|
+
export OPENAI_API_KEY="..."
|
|
75
|
+
|
|
76
|
+
agentprobe scan \
|
|
77
|
+
--target dummy \
|
|
78
|
+
--oracle semantic \
|
|
79
|
+
--json-report results.json
|
|
80
|
+
|
|
81
|
+
# Check results
|
|
82
|
+
cat results.json | jq '.statistics'
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Available Defenses to Test
|
|
86
|
+
|
|
87
|
+
The harness measures effectiveness of these defenses:
|
|
88
|
+
|
|
89
|
+
1. **`none`** — baseline (no defense applied)
|
|
90
|
+
2. **`delimited`** — wrap data in `<<<UNTRUSTED_DATA_BEGIN>>>...<<<UNTRUSTED_DATA_END>>>` markers
|
|
91
|
+
3. **`spotlight`** — datamarking: mark every data token so the model separates data from instructions
|
|
92
|
+
4. **`sandwich`** — repeat the do-not-obey rule after the data (recency effect)
|
|
93
|
+
5. **`instr_hierarchy`** — tag data with an explicit low privilege level; assert system instructions outrank tool/data content
|
|
94
|
+
6. **`llm_filter`** — separate LLM verification pass to detect/strip injection before execution
|
|
95
|
+
|
|
96
|
+
Test each against YOUR agent. See which work, which break utility.
|
|
97
|
+
|
|
98
|
+
### How It Works
|
|
99
|
+
|
|
100
|
+
1. **Injection Generator:** Creates test payloads (carriers: email, document, web page) with hidden canary instructions
|
|
101
|
+
2. **Defense Applicator:** Wraps the data with each defense mechanism
|
|
102
|
+
3. **Target Adapter:** Sends to your agent, captures response
|
|
103
|
+
4. **Semantic Oracle:** Uses gpt-4o-mini to judge: did the agent leak data or follow the hidden instruction?
|
|
104
|
+
5. **Utility Harness:** Runs benign legitimate tasks to ensure defenses don't break normal functionality
|
|
105
|
+
6. **Report:** Table showing defense effectiveness + utility cost
|
|
106
|
+
|
|
107
|
+
### Defense vs Utility Trade-off
|
|
108
|
+
|
|
109
|
+
**Result:** All 5 defenses preserve utility on legitimate tasks (120/120 runs, 0% false-positive rate).
|
|
110
|
+
|
|
111
|
+
Tested on 8 benign tasks (extract dates, risks, budget, sentiment, action items, meeting notes, legitimately forward to internal address) with 3 repeats each:
|
|
112
|
+
|
|
113
|
+
| Defense | False-Positive Rate | Status |
|
|
114
|
+
|---------|-------------------|--------|
|
|
115
|
+
| `none` | 0% | baseline |
|
|
116
|
+
| `delimited` | 0% | safe to use |
|
|
117
|
+
| `spotlight` | 0% | safe to use |
|
|
118
|
+
| `sandwich` | 0% | safe to use |
|
|
119
|
+
| `instr_hierarchy` | 0% | safe to use |
|
|
120
|
+
| `llm_filter` | 0% | safe to use |
|
|
121
|
+
|
|
122
|
+
Conclusion: **Defenses do not break legitimate agent functionality** (in current test suite). Task success rate remains 100% across all defenses, making the injection effectiveness/defense trade-off directly comparable (both measured under same utility constraints).
|
|
123
|
+
|
|
124
|
+
Run your own: `python run_utility_harness.py --repeats=3 --temp=0.7 --out=utility_results.csv`
|
|
125
|
+
|
|
126
|
+
## Responsible Use
|
|
127
|
+
|
|
128
|
+
- **Only test systems you own or have written permission to test**
|
|
129
|
+
- Goal: understanding YOUR defenses, not generating portable bypasses
|
|
130
|
+
- Disclose findings responsibly (if testing third-party systems with permission)
|
|
131
|
+
- The framework measures vulnerability; it is not a jailbreak toolkit
|
|
132
|
+
|
|
133
|
+
## Architecture
|
|
134
|
+
|
|
135
|
+
```
|
|
136
|
+
agentprobe/
|
|
137
|
+
├── oracle_semantic.py # LLM-as-judge using gpt-4o-mini
|
|
138
|
+
├── oracle_legacy.py # Fallback: substring matching
|
|
139
|
+
├── oracle.py # Oracle interface
|
|
140
|
+
├── adapters/
|
|
141
|
+
│ ├── dummy.py # Built-in intentionally-vulnerable agent simulator
|
|
142
|
+
│ ├── http.py # Test any HTTP-accessible agent (sync)
|
|
143
|
+
│ └── http_async.py # Async HTTP adapter for concurrent scans
|
|
144
|
+
├── injection/
|
|
145
|
+
│ ├── carriers.py # Email, document, web page wrappers
|
|
146
|
+
│ ├── defenses.py # Defense mechanisms to evaluate
|
|
147
|
+
│ ├── benign_tasks.py # Utility harness tasks
|
|
148
|
+
│ └── screening.py # Screening defense (separate LLM pass)
|
|
149
|
+
├── engine.py # Synchronous scan
|
|
150
|
+
├── engine_async.py # Async scan
|
|
151
|
+
├── metrics.py # Statistical analysis (Wilson CI, effect sizes)
|
|
152
|
+
├── report.py # Report generation
|
|
153
|
+
├── logging_config.py # Structured logging, cost tracking
|
|
154
|
+
└── cli.py # Command-line interface
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Command-Line Usage
|
|
158
|
+
|
|
159
|
+
### Basic scan
|
|
160
|
+
```bash
|
|
161
|
+
# Test dummy agent
|
|
162
|
+
agentprobe scan --target dummy
|
|
163
|
+
|
|
164
|
+
# Test HTTP agent
|
|
165
|
+
agentprobe scan --target http \
|
|
166
|
+
--endpoint http://localhost:8000/chat \
|
|
167
|
+
--input-field message \
|
|
168
|
+
--output-field reply
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### Control oracle
|
|
172
|
+
```bash
|
|
173
|
+
# Use semantic oracle (default, requires OPENAI_API_KEY)
|
|
174
|
+
agentprobe scan --target dummy --oracle semantic
|
|
175
|
+
|
|
176
|
+
# Use legacy oracle (offline, pattern matching)
|
|
177
|
+
agentprobe scan --target dummy --oracle legacy
|
|
178
|
+
|
|
179
|
+
# Set confidence threshold
|
|
180
|
+
agentprobe scan --target dummy --oracle semantic --min-confidence 0.85
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Reports
|
|
184
|
+
```bash
|
|
185
|
+
# JSON report with statistics
|
|
186
|
+
agentprobe scan --target dummy --json-report results.json
|
|
187
|
+
|
|
188
|
+
# Verbose logging
|
|
189
|
+
agentprobe scan --target dummy --verbose 2
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## Measurement Infrastructure
|
|
193
|
+
|
|
194
|
+
- **Oracle:** gpt-4o-mini with Structured Outputs (semantic judgment)
|
|
195
|
+
- **Test Harness:** Carriers simulate real data flows (email, document, web page)
|
|
196
|
+
- **Utility Harness:** Measures task success rate per defense on benign tasks (see *Defense vs Utility Trade-off* above)
|
|
197
|
+
- **Benchmarking:** Latency / throughput available via `--async --concurrency N` on HTTP targets
|
|
198
|
+
|
|
199
|
+
All numbers above are from actual test runs (CSVs in `data/`).
|
|
200
|
+
|
|
201
|
+
## Testing Your Own Code
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
# Run all tests
|
|
205
|
+
pytest tests/ -v
|
|
206
|
+
|
|
207
|
+
# Test a specific component
|
|
208
|
+
pytest tests/test_oracle_semantic.py -v
|
|
209
|
+
|
|
210
|
+
# Run with coverage
|
|
211
|
+
pytest tests/ --cov=agentprobe
|
|
212
|
+
|
|
213
|
+
# Benchmark async performance
|
|
214
|
+
agentprobe scan --target dummy --async --concurrency 15
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## What's NOT Included
|
|
218
|
+
|
|
219
|
+
- Evasion techniques or obfuscation tooling (intentionally)
|
|
220
|
+
- Zero-day exploits or novel vulnerabilities
|
|
221
|
+
- Portable bypass payloads designed to be transferable across different systems
|
|
222
|
+
|
|
223
|
+
**Note on linguistic transforms:** The harness *does* include pragmatic, register, discourse and code-switching (ru-en) categories — but as **measurement probes**, not as attack tooling. Our data shows surface-level linguistic transforms have ~0% success on modern frontier models, which is itself a useful finding for defenders deciding where to invest.
|
|
224
|
+
|
|
225
|
+
This is a **defensive measurement tool**, not an offensive toolkit.
|
|
226
|
+
|
|
227
|
+
## Citation
|
|
228
|
+
|
|
229
|
+
If you use this in research, cite as:
|
|
230
|
+
|
|
231
|
+
```bibtex
|
|
232
|
+
@misc{agentprobe2026,
|
|
233
|
+
title={AgentProbe: Evaluating LLM Agent Defenses Against Indirect Injection},
|
|
234
|
+
author={Samgar},
|
|
235
|
+
year={2026},
|
|
236
|
+
url={https://github.com/Samgar-kz/agentprobe}
|
|
237
|
+
}
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
## License
|
|
241
|
+
|
|
242
|
+
MIT
|