agentprobe-injection 0.2.0a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. agentprobe_injection-0.2.0a1/.github/workflows/ci.yml +58 -0
  2. agentprobe_injection-0.2.0a1/.gitignore +14 -0
  3. agentprobe_injection-0.2.0a1/CONTRIBUTING.md +62 -0
  4. agentprobe_injection-0.2.0a1/LICENSE +21 -0
  5. agentprobe_injection-0.2.0a1/PKG-INFO +281 -0
  6. agentprobe_injection-0.2.0a1/README.md +242 -0
  7. agentprobe_injection-0.2.0a1/RESILIENCE.md +204 -0
  8. agentprobe_injection-0.2.0a1/SECURITY.md +41 -0
  9. agentprobe_injection-0.2.0a1/TROUBLESHOOTING.md +316 -0
  10. agentprobe_injection-0.2.0a1/agentprobe/__init__.py +3 -0
  11. agentprobe_injection-0.2.0a1/agentprobe/adapters/__init__.py +16 -0
  12. agentprobe_injection-0.2.0a1/agentprobe/adapters/dummy.py +128 -0
  13. agentprobe_injection-0.2.0a1/agentprobe/adapters/http.py +56 -0
  14. agentprobe_injection-0.2.0a1/agentprobe/adapters/http_async.py +228 -0
  15. agentprobe_injection-0.2.0a1/agentprobe/attacks/__init__.py +6 -0
  16. agentprobe_injection-0.2.0a1/agentprobe/attacks/base.py +50 -0
  17. agentprobe_injection-0.2.0a1/agentprobe/attacks/registry.py +82 -0
  18. agentprobe_injection-0.2.0a1/agentprobe/attacks/transforms.py +183 -0
  19. agentprobe_injection-0.2.0a1/agentprobe/cli.py +334 -0
  20. agentprobe_injection-0.2.0a1/agentprobe/engine.py +151 -0
  21. agentprobe_injection-0.2.0a1/agentprobe/engine_async.py +183 -0
  22. agentprobe_injection-0.2.0a1/agentprobe/harness_utility.py +130 -0
  23. agentprobe_injection-0.2.0a1/agentprobe/injection/__init__.py +19 -0
  24. agentprobe_injection-0.2.0a1/agentprobe/injection/benign_tasks.py +174 -0
  25. agentprobe_injection-0.2.0a1/agentprobe/injection/carriers.py +200 -0
  26. agentprobe_injection-0.2.0a1/agentprobe/injection/defenses.py +98 -0
  27. agentprobe_injection-0.2.0a1/agentprobe/injection/oracle.py +75 -0
  28. agentprobe_injection-0.2.0a1/agentprobe/injection/screening.py +78 -0
  29. agentprobe_injection-0.2.0a1/agentprobe/llm_oracle.py +94 -0
  30. agentprobe_injection-0.2.0a1/agentprobe/logging_config.py +108 -0
  31. agentprobe_injection-0.2.0a1/agentprobe/metrics.py +199 -0
  32. agentprobe_injection-0.2.0a1/agentprobe/models.py +19 -0
  33. agentprobe_injection-0.2.0a1/agentprobe/oracle.py +193 -0
  34. agentprobe_injection-0.2.0a1/agentprobe/oracle_legacy.py +83 -0
  35. agentprobe_injection-0.2.0a1/agentprobe/oracle_semantic.py +221 -0
  36. agentprobe_injection-0.2.0a1/agentprobe/report.py +183 -0
  37. agentprobe_injection-0.2.0a1/agentprobe/target.py +44 -0
  38. agentprobe_injection-0.2.0a1/data/gpt4o.csv +505 -0
  39. agentprobe_injection-0.2.0a1/data/gpt4omini.csv +505 -0
  40. agentprobe_injection-0.2.0a1/data/haiku45.csv +505 -0
  41. agentprobe_injection-0.2.0a1/data/utility_gpt4omini.csv +121 -0
  42. agentprobe_injection-0.2.0a1/examples/tool_agent.py +180 -0
  43. agentprobe_injection-0.2.0a1/mcnemar_test.py +146 -0
  44. agentprobe_injection-0.2.0a1/plot_pareto.py +91 -0
  45. agentprobe_injection-0.2.0a1/plot_results.py +124 -0
  46. agentprobe_injection-0.2.0a1/pyproject.toml +49 -0
  47. agentprobe_injection-0.2.0a1/results/carrier_heatmap_gpt4omini.png +0 -0
  48. agentprobe_injection-0.2.0a1/results/carrier_heatmap_haiku45.png +0 -0
  49. agentprobe_injection-0.2.0a1/results/defense_leak_rates.png +0 -0
  50. agentprobe_injection-0.2.0a1/results/pareto_gpt4o.png +0 -0
  51. agentprobe_injection-0.2.0a1/results/pareto_gpt4omini.png +0 -0
  52. agentprobe_injection-0.2.0a1/run_injection_stats.py +212 -0
  53. agentprobe_injection-0.2.0a1/run_utility_harness.py +227 -0
  54. agentprobe_injection-0.2.0a1/tests/test_adapters.py +177 -0
  55. agentprobe_injection-0.2.0a1/tests/test_async_http.py +315 -0
  56. agentprobe_injection-0.2.0a1/tests/test_attacks.py +172 -0
  57. agentprobe_injection-0.2.0a1/tests/test_engine.py +188 -0
  58. agentprobe_injection-0.2.0a1/tests/test_engine_async.py +379 -0
  59. agentprobe_injection-0.2.0a1/tests/test_false_positives.py +260 -0
  60. agentprobe_injection-0.2.0a1/tests/test_logging.py +160 -0
  61. agentprobe_injection-0.2.0a1/tests/test_metrics.py +269 -0
  62. agentprobe_injection-0.2.0a1/tests/test_oracle.py +216 -0
  63. agentprobe_injection-0.2.0a1/tests/test_oracle_integration.py +331 -0
  64. agentprobe_injection-0.2.0a1/tests/test_oracle_semantic.py +368 -0
  65. agentprobe_injection-0.2.0a1/tests/test_report.py +244 -0
  66. agentprobe_injection-0.2.0a1/tests/test_smoke.py +37 -0
  67. agentprobe_injection-0.2.0a1/tests/test_step3_integration.py +223 -0
@@ -0,0 +1,58 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+ workflow_dispatch:
9
+
10
+ jobs:
11
+ test:
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ fail-fast: false
15
+ matrix:
16
+ python-version: ["3.10", "3.11", "3.12"]
17
+
18
+ steps:
19
+ - name: Checkout repository
20
+ uses: actions/checkout@v4
21
+
22
+ - name: Set up Python ${{ matrix.python-version }}
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version: ${{ matrix.python-version }}
26
+ cache: pip
27
+
28
+ - name: Install package + dev extras
29
+ run: |
30
+ python -m pip install --upgrade pip
31
+ pip install -e ".[dev,openai]"
32
+
33
+ - name: Run pytest
34
+ # No API keys are set on purpose. Without OPENAI_API_KEY the semantic
35
+ # oracle is unavailable and judge()/run_scan() fall back to the offline
36
+ # legacy oracle, so the suite runs fully offline and deterministically.
37
+ # Live-LLM behavior is covered by mocked tests (test_oracle_semantic.py).
38
+ run: |
39
+ pytest tests/ -v --tb=short
40
+
41
+ lint:
42
+ runs-on: ubuntu-latest
43
+ steps:
44
+ - name: Checkout repository
45
+ uses: actions/checkout@v4
46
+
47
+ - name: Set up Python
48
+ uses: actions/setup-python@v5
49
+ with:
50
+ python-version: "3.12"
51
+ cache: pip
52
+
53
+ - name: Install ruff
54
+ run: pip install "ruff>=0.4"  # quoted: unquoted, the shell parses ">=0.4" as a redirect to a file named "=0.4" and pip installs unconstrained ruff
55
+
56
+ - name: Run ruff check
57
+ run: ruff check agentprobe/ tests/ || echo "::warning::Lint warnings present"
58
+ continue-on-error: true
@@ -0,0 +1,14 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ .pytest_cache/
5
+ .ruff_cache/
6
+ *.egg-info/
7
+ build/
8
+ dist/
9
+ .venv/
10
+ venv/
11
+ .env
12
+ *.json
13
+ !pyproject.toml
14
+ .DS_Store
@@ -0,0 +1,62 @@
1
+ # Contributing to AgentProbe
2
+
3
+ Thanks for your interest. AgentProbe is an alpha research/defensive tool, so the
4
+ bar is "correct, honest, and reproducible" over "feature-complete."
5
+
6
+ ## Ground rules
7
+
8
+ 1. **No fabricated results.** Every number in docs/README must trace back to a
9
+ real run with a CSV in `data/` (or `results/`). Illustrative output must be
10
+ labeled as illustrative.
11
+ 2. **Defensive framing only.** Contributions that turn this into a portable
12
+ attack/bypass toolkit will be rejected. See [SECURITY.md](SECURITY.md).
13
+ 3. **Tests required.** New behavior needs tests. Bug fixes should include a
14
+ regression test where practical.
15
+
16
+ ## Dev setup
17
+
18
+ ```bash
19
+ git clone https://github.com/Samgar-kz/agentprobe.git
20
+ cd agentprobe
21
+ python -m venv .venv && source .venv/bin/activate
22
+ pip install -e ".[dev,openai]"
23
+ ```
24
+
25
+ ## Before opening a PR
26
+
27
+ ```bash
28
+ # Run the test suite
29
+ pytest tests/ -v
30
+
31
+ # Lint
32
+ ruff check agentprobe/ tests/
33
+
34
+ # (optional) format
35
+ black agentprobe/ tests/
36
+ ```
37
+
38
+ CI runs pytest on Python 3.10 / 3.11 / 3.12 plus ruff. PRs must be green.
39
+
40
+ ## Adding an attack transform
41
+
42
+ Attack transforms live in `agentprobe/attacks/transforms.py` and are registered
43
+ via `registry.py`. Each transform needs:
44
+ - a unique `name`
45
+ - a `category` (one of: `classic`, `pragmatic`, `register`, `discourse`, `codeswitch`)
46
+ - a `rationale` explaining the linguistic hypothesis being tested
47
+
48
+ ## Adding a defense
49
+
50
+ Defenses live in `agentprobe/injection/defenses.py` (or `screening.py` for the
51
+ separate-LLM-pass family). Use the existing `Defense` dataclass. The `name` you
52
+ choose is what appears in CSV/JSON reports, so keep it stable and snake_case.
53
+
54
+ ## Adding a target adapter
55
+
56
+ Adapters live in `agentprobe/adapters/`. Implement the `Target` protocol from
57
+ `agentprobe/target.py`. Current adapters: `dummy`, `http`, `http_async`.
58
+
59
+ ## Commit style
60
+
61
+ Conventional-commit-ish prefixes are appreciated: `feat:`, `fix:`, `docs:`,
62
+ `test:`, `ci:`, `chore:`.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Samgar Abdikozha
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,281 @@
1
+ Metadata-Version: 2.4
2
+ Name: agentprobe-injection
3
+ Version: 0.2.0a1
4
+ Summary: Harness for measuring LLM agent resistance to indirect prompt injection and comparing defense effectiveness.
5
+ Project-URL: Homepage, https://github.com/Samgar-kz/agentprobe
6
+ Project-URL: Issues, https://github.com/Samgar-kz/agentprobe/issues
7
+ Author: Samgar
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: agents,ai-safety,llm,prompt-injection,security
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Information Technology
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Security
18
+ Classifier: Topic :: Software Development :: Testing
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: anthropic>=0.25
21
+ Requires-Dist: httpx>=0.27
22
+ Requires-Dist: litellm>=1.30
23
+ Requires-Dist: matplotlib>=3.7
24
+ Requires-Dist: numpy>=1.24
25
+ Requires-Dist: pydantic>=2.5
26
+ Requires-Dist: rich>=13.7
27
+ Requires-Dist: scipy>=1.11
28
+ Requires-Dist: tenacity>=8.2
29
+ Requires-Dist: typer>=0.12
30
+ Provides-Extra: dev
31
+ Requires-Dist: black>=24.0; extra == 'dev'
32
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
33
+ Requires-Dist: pytest-cov>=4.1; extra == 'dev'
34
+ Requires-Dist: pytest>=7.4; extra == 'dev'
35
+ Requires-Dist: ruff>=0.4; extra == 'dev'
36
+ Provides-Extra: openai
37
+ Requires-Dist: openai>=1.30; extra == 'openai'
38
+ Description-Content-Type: text/markdown
39
+
40
+ # AgentProbe: Defense Evaluation Harness for LLM Agents
41
+
42
+ [![CI](https://github.com/Samgar-kz/agentprobe/actions/workflows/ci.yml/badge.svg)](https://github.com/Samgar-kz/agentprobe/actions/workflows/ci.yml)
43
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
44
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
45
+ [![Status: Alpha](https://img.shields.io/badge/status-alpha-orange.svg)]()
46
+
47
+ ## What This Is
48
+
49
+ A testing framework for measuring your LLM agent's **resistance to indirect prompt injection** and **comparing defense effectiveness**. Use it to test your own systems or systems you have explicit permission to test.
50
+
51
+ NOT an attack generator or bypass toolkit. NOT for probing other people's systems.
52
+
53
+ ## Key Findings from Our Research
54
+
55
+ Our testing on gpt-4o-mini and claude-haiku-4-5 reveals three things:
56
+
57
+ 1. **Surface-level linguistic transforms don't work on modern models**
58
+ - Pragmatic implicature, register shifts, code-switching: ~0% success rate
59
+ - Modern LLMs aren't fooled by just changing speech act or tone
60
+
61
+ 2. **Indirect injection through data IS a real vulnerability**
62
+ - Information hidden in tool outputs (emails, documents, web pages) bypasses prompt-level defenses
63
+ - Separation at prompt level is not enough
64
+
65
+ 3. **Asymmetry: Models leak data more readily than execute unauthorized actions**
66
+ - Defending against information leakage != defending against tool abuse
67
+ - Different threat models need different defenses
68
+
69
+ ## Results: Defense Effectiveness
70
+
71
+ **gpt-4o-mini**
72
+
73
+ Defense names below match the `defense` column in the CSV outputs (`data/`) and JSON reports.
74
+
75
+ | Defense (code name) | Leak Rate | N |
76
+ |---------------------|-----------|---|
77
+ | `none` (baseline) | 29.8% | 84 |
78
+ | `delimited` (delimiter wrap) | 25.0% | 84 |
79
+ | `instr_hierarchy` (privilege-level instruction) | 31.0% | 84 |
80
+ | `sandwich` (recency reinforcement) | 15.5% | 84 |
81
+ | `spotlight` (datamarking) | 6.0% | 84 |
82
+ | `llm_filter` (separate screening pass) | 0% | 84 |
83
+
84
+ For reference, the same battery on **gpt-4o** leaks much less (baseline 10.7%, `delimited`/`llm_filter` 0%), and **claude-haiku-4-5** holds 0% across every defense — so absolute numbers are model-specific; treat them as relative defense rankings, not universal constants.
85
+
86
+ **claude-haiku-4-5** holds baseline at 0% leak rate across all test conditions; defense differentiation is not measurable on this model.
87
+
88
+ ### Key Finding: Screening (and datamarking) beat prompt-level defenses
89
+
90
+ The separate verification pass (`llm_filter`) achieved 0 successful leaks in 84 test runs on gpt-4o-mini. The next best is `spotlight` (datamarking) at 6.0%. By contrast, prompt-level instruction (`instr_hierarchy`, 31.0%) was *no better than baseline* (29.8%).
91
+
92
+ This suggests: **prompt-level instructions and delimiters are incomplete; either token-level datamarking or a separate, independent judgment pass is required to reliably catch injection.**
93
+
94
+ ## How To Use
95
+
96
+ ### Test Your Own Agent
97
+
98
+ > **Note:** The PyPI package is named `agentprobe-injection` (the plain
99
+ > `agentprobe` name was already taken). The import package and CLI command are
100
+ > still `agentprobe`.
101
+
102
+ ```bash
103
+ # Install from PyPI
104
+ pip install agentprobe-injection
105
+
106
+ # Or install the latest from GitHub
107
+ pip install git+https://github.com/Samgar-kz/agentprobe.git
108
+
109
+ # Or clone for development
110
+ git clone https://github.com/Samgar-kz/agentprobe.git
111
+ cd agentprobe && pip install -e .
112
+
113
+ export OPENAI_API_KEY="..."
114
+
115
+ agentprobe scan \
116
+ --target dummy \
117
+ --oracle semantic \
118
+ --json-report results.json
119
+
120
+ # Check results
121
+ cat results.json | jq '.statistics'
122
+ ```
123
+
124
+ ### Available Defenses to Test
125
+
126
+ The harness measures effectiveness of these defenses:
127
+
128
+ 1. **`none`** — baseline (no defense applied)
129
+ 2. **`delimited`** — wrap data in `<<<UNTRUSTED_DATA_BEGIN>>>...<<<UNTRUSTED_DATA_END>>>` markers
130
+ 3. **`spotlight`** — datamarking: mark every data token so the model separates data from instructions
131
+ 4. **`sandwich`** — repeat the do-not-obey rule after the data (recency effect)
132
+ 5. **`instr_hierarchy`** — tag data with an explicit low privilege level; assert system instructions outrank tool/data content
133
+ 6. **`llm_filter`** — separate LLM verification pass to detect/strip injection before execution
134
+
135
+ Test each against YOUR agent. See which work, which break utility.
136
+
137
+ ### How It Works
138
+
139
+ 1. **Injection Generator:** Creates test payloads (carriers: email, document, web page) with hidden canary instructions
140
+ 2. **Defense Applicator:** Wraps the data with each defense mechanism
141
+ 3. **Target Adapter:** Sends to your agent, captures response
142
+ 4. **Semantic Oracle:** Uses gpt-4o-mini to judge: did the agent leak data or follow the hidden instruction?
143
+ 5. **Utility Harness:** Runs benign legitimate tasks to ensure defenses don't break normal functionality
144
+ 6. **Report:** Table showing defense effectiveness + utility cost
145
+
146
+ ### Defense vs Utility Trade-off
147
+
148
+ **Result:** All 5 defenses preserve utility on legitimate tasks (120/120 runs, 0% false-positive rate).
149
+
150
+ Tested on 8 benign tasks (extract dates, risks, budget, sentiment, action items, meeting notes, legitimately forward to internal address) with 3 repeats each:
151
+
152
+ | Defense | False-Positive Rate | Status |
153
+ |---------|-------------------|--------|
154
+ | `none` | 0% | baseline |
155
+ | `delimited` | 0% | safe to use |
156
+ | `spotlight` | 0% | safe to use |
157
+ | `sandwich` | 0% | safe to use |
158
+ | `instr_hierarchy` | 0% | safe to use |
159
+ | `llm_filter` | 0% | safe to use |
160
+
161
+ Conclusion: **Defenses do not break legitimate agent functionality** (in current test suite). Task success rate remains 100% across all defenses, making the injection effectiveness/defense trade-off directly comparable (both measured under same utility constraints).
162
+
163
+ Run your own: `python run_utility_harness.py --repeats=3 --temp=0.7 --out=utility_results.csv`
164
+
165
+ ## Responsible Use
166
+
167
+ - **Only test systems you own or have written permission to test**
168
+ - Goal: understanding YOUR defenses, not generating portable bypasses
169
+ - Disclose findings responsibly (if testing third-party systems with permission)
170
+ - The framework measures vulnerability; it is not a jailbreak toolkit
171
+
172
+ ## Architecture
173
+
174
+ ```
175
+ agentprobe/
176
+ ├── oracle_semantic.py # LLM-as-judge using gpt-4o-mini
177
+ ├── oracle_legacy.py # Fallback: substring matching
178
+ ├── oracle.py # Oracle interface
179
+ ├── adapters/
180
+ │ ├── dummy.py # Built-in intentionally-vulnerable agent simulator
181
+ │ ├── http.py # Test any HTTP-accessible agent (sync)
182
+ │ └── http_async.py # Async HTTP adapter for concurrent scans
183
+ ├── injection/
184
+ │ ├── carriers.py # Email, document, web page wrappers
185
+ │ ├── defenses.py # Defense mechanisms to evaluate
186
+ │ ├── benign_tasks.py # Utility harness tasks
187
+ │ └── screening.py # Screening defense (separate LLM pass)
188
+ ├── engine.py # Synchronous scan
189
+ ├── engine_async.py # Async scan
190
+ ├── metrics.py # Statistical analysis (Wilson CI, effect sizes)
191
+ ├── report.py # Report generation
192
+ ├── logging_config.py # Structured logging, cost tracking
193
+ └── cli.py # Command-line interface
194
+ ```
195
+
196
+ ## Command-Line Usage
197
+
198
+ ### Basic scan
199
+ ```bash
200
+ # Test dummy agent
201
+ agentprobe scan --target dummy
202
+
203
+ # Test HTTP agent
204
+ agentprobe scan --target http \
205
+ --endpoint http://localhost:8000/chat \
206
+ --input-field message \
207
+ --output-field reply
208
+ ```
209
+
210
+ ### Control oracle
211
+ ```bash
212
+ # Use semantic oracle (default, requires OPENAI_API_KEY)
213
+ agentprobe scan --target dummy --oracle semantic
214
+
215
+ # Use legacy oracle (offline, pattern matching)
216
+ agentprobe scan --target dummy --oracle legacy
217
+
218
+ # Set confidence threshold
219
+ agentprobe scan --target dummy --oracle semantic --min-confidence 0.85
220
+ ```
221
+
222
+ ### Reports
223
+ ```bash
224
+ # JSON report with statistics
225
+ agentprobe scan --target dummy --json-report results.json
226
+
227
+ # Verbose logging
228
+ agentprobe scan --target dummy --verbose 2
229
+ ```
230
+
231
+ ## Measurement Infrastructure
232
+
233
+ - **Oracle:** gpt-4o-mini with Structured Outputs (semantic judgment)
234
+ - **Test Harness:** Carriers simulate real data flows (email, document, web page)
235
+ - **Utility Harness:** Measures task success rate per defense on benign tasks (see *Defense vs Utility Trade-off* above)
236
+ - **Benchmarking:** Latency / throughput available via `--async --concurrency N` on HTTP targets
237
+
238
+ All numbers above are from actual test runs (CSV in /data/).
239
+
240
+ ## Testing Your Own Code
241
+
242
+ ```bash
243
+ # Run all tests
244
+ pytest tests/ -v
245
+
246
+ # Test a specific component
247
+ pytest tests/test_oracle_semantic.py -v
248
+
249
+ # Run with coverage
250
+ pytest tests/ --cov=agentprobe
251
+
252
+ # Benchmark async performance
253
+ agentprobe scan --target dummy --async --concurrency 15
254
+ ```
255
+
256
+ ## What's NOT Included
257
+
258
+ - Evasion techniques or obfuscation tooling (intentionally)
259
+ - Zero-day exploits or novel vulnerabilities
260
+ - Portable bypass payloads designed to be transferable across different systems
261
+
262
+ **Note on linguistic transforms:** The harness *does* include pragmatic, register, discourse and code-switching (ru-en) categories — but as **measurement probes**, not as attack tooling. Our data shows surface-level linguistic transforms have ~0% success on modern frontier models, which is itself a useful finding for defenders deciding where to invest.
263
+
264
+ This is a **defensive measurement tool**, not an offensive toolkit.
265
+
266
+ ## Citation
267
+
268
+ If you use this in research, cite as:
269
+
270
+ ```bibtex
271
+ @misc{agentprobe2026,
272
+ title={AgentProbe: Evaluating LLM Agent Defenses Against Indirect Injection},
273
+ author={Samgar},
274
+ year={2026},
275
+ url={https://github.com/Samgar-kz/agentprobe}
276
+ }
277
+ ```
278
+
279
+ ## License
280
+
281
+ MIT
@@ -0,0 +1,242 @@
1
+ # AgentProbe: Defense Evaluation Harness for LLM Agents
2
+
3
+ [![CI](https://github.com/Samgar-kz/agentprobe/actions/workflows/ci.yml/badge.svg)](https://github.com/Samgar-kz/agentprobe/actions/workflows/ci.yml)
4
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
6
+ [![Status: Alpha](https://img.shields.io/badge/status-alpha-orange.svg)]()
7
+
8
+ ## What This Is
9
+
10
+ A testing framework for measuring your LLM agent's **resistance to indirect prompt injection** and **comparing defense effectiveness**. Use it to test your own systems or systems you have explicit permission to test.
11
+
12
+ NOT an attack generator or bypass toolkit. NOT for probing other people's systems.
13
+
14
+ ## Key Findings from Our Research
15
+
16
+ Our testing on gpt-4o-mini and claude-haiku-4-5 reveals three things:
17
+
18
+ 1. **Surface-level linguistic transforms don't work on modern models**
19
+ - Pragmatic implicature, register shifts, code-switching: ~0% success rate
20
+ - Modern LLMs aren't fooled by just changing speech act or tone
21
+
22
+ 2. **Indirect injection through data IS a real vulnerability**
23
+ - Information hidden in tool outputs (emails, documents, web pages) bypasses prompt-level defenses
24
+ - Separation at prompt level is not enough
25
+
26
+ 3. **Asymmetry: Models leak data more readily than execute unauthorized actions**
27
+ - Defending against information leakage != defending against tool abuse
28
+ - Different threat models need different defenses
29
+
30
+ ## Results: Defense Effectiveness
31
+
32
+ **gpt-4o-mini**
33
+
34
+ Defense names below match the `defense` column in the CSV outputs (`data/`) and JSON reports.
35
+
36
+ | Defense (code name) | Leak Rate | N |
37
+ |---------------------|-----------|---|
38
+ | `none` (baseline) | 29.8% | 84 |
39
+ | `delimited` (delimiter wrap) | 25.0% | 84 |
40
+ | `instr_hierarchy` (privilege-level instruction) | 31.0% | 84 |
41
+ | `sandwich` (recency reinforcement) | 15.5% | 84 |
42
+ | `spotlight` (datamarking) | 6.0% | 84 |
43
+ | `llm_filter` (separate screening pass) | 0% | 84 |
44
+
45
+ For reference, the same battery on **gpt-4o** leaks much less (baseline 10.7%, `delimited`/`llm_filter` 0%), and **claude-haiku-4-5** holds 0% across every defense — so absolute numbers are model-specific; treat them as relative defense rankings, not universal constants.
46
+
47
+ **claude-haiku-4-5** holds baseline at 0% leak rate across all test conditions; defense differentiation is not measurable on this model.
48
+
49
+ ### Key Finding: Screening (and datamarking) beat prompt-level defenses
50
+
51
+ The separate verification pass (`llm_filter`) achieved 0 successful leaks in 84 test runs on gpt-4o-mini. The next best is `spotlight` (datamarking) at 6.0%. By contrast, prompt-level instruction (`instr_hierarchy`, 31.0%) was *no better than baseline* (29.8%).
52
+
53
+ This suggests: **prompt-level instructions and delimiters are incomplete; either token-level datamarking or a separate, independent judgment pass is required to reliably catch injection.**
54
+
55
+ ## How To Use
56
+
57
+ ### Test Your Own Agent
58
+
59
+ > **Note:** The PyPI package is named `agentprobe-injection` (the plain
60
+ > `agentprobe` name was already taken). The import package and CLI command are
61
+ > still `agentprobe`.
62
+
63
+ ```bash
64
+ # Install from PyPI
65
+ pip install agentprobe-injection
66
+
67
+ # Or install the latest from GitHub
68
+ pip install git+https://github.com/Samgar-kz/agentprobe.git
69
+
70
+ # Or clone for development
71
+ git clone https://github.com/Samgar-kz/agentprobe.git
72
+ cd agentprobe && pip install -e .
73
+
74
+ export OPENAI_API_KEY="..."
75
+
76
+ agentprobe scan \
77
+ --target dummy \
78
+ --oracle semantic \
79
+ --json-report results.json
80
+
81
+ # Check results
82
+ cat results.json | jq '.statistics'
83
+ ```
84
+
85
+ ### Available Defenses to Test
86
+
87
+ The harness measures effectiveness of these defenses:
88
+
89
+ 1. **`none`** — baseline (no defense applied)
90
+ 2. **`delimited`** — wrap data in `<<<UNTRUSTED_DATA_BEGIN>>>...<<<UNTRUSTED_DATA_END>>>` markers
91
+ 3. **`spotlight`** — datamarking: mark every data token so the model separates data from instructions
92
+ 4. **`sandwich`** — repeat the do-not-obey rule after the data (recency effect)
93
+ 5. **`instr_hierarchy`** — tag data with an explicit low privilege level; assert system instructions outrank tool/data content
94
+ 6. **`llm_filter`** — separate LLM verification pass to detect/strip injection before execution
95
+
96
+ Test each against YOUR agent. See which work, which break utility.
97
+
98
+ ### How It Works
99
+
100
+ 1. **Injection Generator:** Creates test payloads (carriers: email, document, web page) with hidden canary instructions
101
+ 2. **Defense Applicator:** Wraps the data with each defense mechanism
102
+ 3. **Target Adapter:** Sends to your agent, captures response
103
+ 4. **Semantic Oracle:** Uses gpt-4o-mini to judge: did the agent leak data or follow the hidden instruction?
104
+ 5. **Utility Harness:** Runs benign legitimate tasks to ensure defenses don't break normal functionality
105
+ 6. **Report:** Table showing defense effectiveness + utility cost
106
+
107
+ ### Defense vs Utility Trade-off
108
+
109
+ **Result:** All 5 defenses preserve utility on legitimate tasks (120/120 runs, 0% false-positive rate).
110
+
111
+ Tested on 8 benign tasks (extract dates, risks, budget, sentiment, action items, meeting notes, legitimately forward to internal address) with 3 repeats each:
112
+
113
+ | Defense | False-Positive Rate | Status |
114
+ |---------|-------------------|--------|
115
+ | `none` | 0% | baseline |
116
+ | `delimited` | 0% | safe to use |
117
+ | `spotlight` | 0% | safe to use |
118
+ | `sandwich` | 0% | safe to use |
119
+ | `instr_hierarchy` | 0% | safe to use |
120
+ | `llm_filter` | 0% | safe to use |
121
+
122
+ Conclusion: **Defenses do not break legitimate agent functionality** (in current test suite). Task success rate remains 100% across all defenses, making the injection effectiveness/defense trade-off directly comparable (both measured under same utility constraints).
123
+
124
+ Run your own: `python run_utility_harness.py --repeats=3 --temp=0.7 --out=utility_results.csv`
125
+
126
+ ## Responsible Use
127
+
128
+ - **Only test systems you own or have written permission to test**
129
+ - Goal: understanding YOUR defenses, not generating portable bypasses
130
+ - Disclose findings responsibly (if testing third-party systems with permission)
131
+ - The framework measures vulnerability; it is not a jailbreak toolkit
132
+
133
+ ## Architecture
134
+
135
+ ```
136
+ agentprobe/
137
+ ├── oracle_semantic.py # LLM-as-judge using gpt-4o-mini
138
+ ├── oracle_legacy.py # Fallback: substring matching
139
+ ├── oracle.py # Oracle interface
140
+ ├── adapters/
141
+ │ ├── dummy.py # Built-in intentionally-vulnerable agent simulator
142
+ │ ├── http.py # Test any HTTP-accessible agent (sync)
143
+ │ └── http_async.py # Async HTTP adapter for concurrent scans
144
+ ├── injection/
145
+ │ ├── carriers.py # Email, document, web page wrappers
146
+ │ ├── defenses.py # Defense mechanisms to evaluate
147
+ │ ├── benign_tasks.py # Utility harness tasks
148
+ │ └── screening.py # Screening defense (separate LLM pass)
149
+ ├── engine.py # Synchronous scan
150
+ ├── engine_async.py # Async scan
151
+ ├── metrics.py # Statistical analysis (Wilson CI, effect sizes)
152
+ ├── report.py # Report generation
153
+ ├── logging_config.py # Structured logging, cost tracking
154
+ └── cli.py # Command-line interface
155
+ ```
156
+
157
+ ## Command-Line Usage
158
+
159
+ ### Basic scan
160
+ ```bash
161
+ # Test dummy agent
162
+ agentprobe scan --target dummy
163
+
164
+ # Test HTTP agent
165
+ agentprobe scan --target http \
166
+ --endpoint http://localhost:8000/chat \
167
+ --input-field message \
168
+ --output-field reply
169
+ ```
170
+
171
+ ### Control oracle
172
+ ```bash
173
+ # Use semantic oracle (default, requires OPENAI_API_KEY)
174
+ agentprobe scan --target dummy --oracle semantic
175
+
176
+ # Use legacy oracle (offline, pattern matching)
177
+ agentprobe scan --target dummy --oracle legacy
178
+
179
+ # Set confidence threshold
180
+ agentprobe scan --target dummy --oracle semantic --min-confidence 0.85
181
+ ```
182
+
183
+ ### Reports
184
+ ```bash
185
+ # JSON report with statistics
186
+ agentprobe scan --target dummy --json-report results.json
187
+
188
+ # Verbose logging
189
+ agentprobe scan --target dummy --verbose 2
190
+ ```
191
+
192
+ ## Measurement Infrastructure
193
+
194
+ - **Oracle:** gpt-4o-mini with Structured Outputs (semantic judgment)
195
+ - **Test Harness:** Carriers simulate real data flows (email, document, web page)
196
+ - **Utility Harness:** Measures task success rate per defense on benign tasks (see *Defense vs Utility Trade-off* above)
197
+ - **Benchmarking:** Latency / throughput available via `--async --concurrency N` on HTTP targets
198
+
199
+ All numbers above are from actual test runs (CSV in /data/).
200
+
201
+ ## Testing Your Own Code
202
+
203
+ ```bash
204
+ # Run all tests
205
+ pytest tests/ -v
206
+
207
+ # Test a specific component
208
+ pytest tests/test_oracle_semantic.py -v
209
+
210
+ # Run with coverage
211
+ pytest tests/ --cov=agentprobe
212
+
213
+ # Benchmark async performance
214
+ agentprobe scan --target dummy --async --concurrency 15
215
+ ```
216
+
217
+ ## What's NOT Included
218
+
219
+ - Evasion techniques or obfuscation tooling (intentionally)
220
+ - Zero-day exploits or novel vulnerabilities
221
+ - Portable bypass payloads designed to be transferable across different systems
222
+
223
+ **Note on linguistic transforms:** The harness *does* include pragmatic, register, discourse and code-switching (ru-en) categories — but as **measurement probes**, not as attack tooling. Our data shows surface-level linguistic transforms have ~0% success on modern frontier models, which is itself a useful finding for defenders deciding where to invest.
224
+
225
+ This is a **defensive measurement tool**, not an offensive toolkit.
226
+
227
+ ## Citation
228
+
229
+ If you use this in research, cite as:
230
+
231
+ ```bibtex
232
+ @misc{agentprobe2026,
233
+ title={AgentProbe: Evaluating LLM Agent Defenses Against Indirect Injection},
234
+ author={Samgar},
235
+ year={2026},
236
+ url={https://github.com/Samgar-kz/agentprobe}
237
+ }
238
+ ```
239
+
240
+ ## License
241
+
242
+ MIT