nullsec-datapoisoning 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nullsec_datapoisoning-0.1.0/PKG-INFO +107 -0
- nullsec_datapoisoning-0.1.0/README.md +85 -0
- nullsec_datapoisoning-0.1.0/nullsec_datapoisoning/__init__.py +39 -0
- nullsec_datapoisoning-0.1.0/nullsec_datapoisoning/__main__.py +104 -0
- nullsec_datapoisoning-0.1.0/nullsec_datapoisoning/attacks.py +233 -0
- nullsec_datapoisoning-0.1.0/nullsec_datapoisoning/defences.py +279 -0
- nullsec_datapoisoning-0.1.0/nullsec_datapoisoning.egg-info/PKG-INFO +107 -0
- nullsec_datapoisoning-0.1.0/nullsec_datapoisoning.egg-info/SOURCES.txt +11 -0
- nullsec_datapoisoning-0.1.0/nullsec_datapoisoning.egg-info/dependency_links.txt +1 -0
- nullsec_datapoisoning-0.1.0/nullsec_datapoisoning.egg-info/entry_points.txt +2 -0
- nullsec_datapoisoning-0.1.0/nullsec_datapoisoning.egg-info/top_level.txt +1 -0
- nullsec_datapoisoning-0.1.0/pyproject.toml +37 -0
- nullsec_datapoisoning-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nullsec-datapoisoning
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Training data poisoning detection and simulation — BadNets, Trojan, clean-label attacks, spectral signatures, activation clustering, STRIP defence
|
|
5
|
+
Author-email: bad-antics <admin@bad-antics.net>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/bad-antics/nullsec-datapoisoning
|
|
8
|
+
Project-URL: Repository, https://github.com/bad-antics/nullsec-datapoisoning
|
|
9
|
+
Keywords: security,machine-learning,data-poisoning,adversarial,ai-security,backdoor,nullsec
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Information Technology
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Security
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
<div align="center">
|
|
24
|
+
|
|
25
|
+
# ☠️ NullSec DataPoisoning
|
|
26
|
+
|
|
27
|
+
### Training Data Poisoning Detection & Simulation
|
|
28
|
+
|
|
29
|
+
[]()
|
|
30
|
+
[]()
|
|
31
|
+
[](https://github.com/bad-antics/nullsec-linux)
|
|
32
|
+
|
|
33
|
+
*Detect, simulate, and defend against training data poisoning attacks*
|
|
34
|
+
|
|
35
|
+
</div>
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## 🎯 Overview
|
|
40
|
+
|
|
41
|
+
NullSec DataPoisoning provides tools for detecting and simulating data poisoning attacks against machine learning pipelines. It implements backdoor injection (BadNets, Trojaning), clean-label attacks, and gradient-based poisoning, alongside detection methods like spectral signatures, activation clustering, and STRIP.
|
|
42
|
+
|
|
43
|
+
## ⚡ Features
|
|
44
|
+
|
|
45
|
+
| Feature | Description |
|
|
46
|
+
|---------|-------------|
|
|
47
|
+
| **Backdoor Injection** | BadNets, Trojan, blend, and warp triggers |
|
|
48
|
+
| **Clean-Label Attacks** | Feature collision, convex polytope, Witches' Brew |
|
|
49
|
+
| **Detection Engine** | Spectral signatures, activation clustering, STRIP |
|
|
50
|
+
| **Neural Cleanse** | Reverse-engineer trigger patterns from poisoned models |
|
|
51
|
+
| **Dataset Audit** | Scan datasets for anomalous samples and label flips |
|
|
52
|
+
| **Pipeline Scanner** | Audit ML pipelines for poisoning entry points |
|
|
53
|
+
|
|
54
|
+
## 📋 Attack & Defence Matrix
|
|
55
|
+
|
|
56
|
+
| Technique | Category | Type |
|
|
57
|
+
|-----------|----------|------|
|
|
58
|
+
| BadNets | Backdoor | Attack |
|
|
59
|
+
| Trojan Attack | Backdoor | Attack |
|
|
60
|
+
| Clean-Label FC | Poisoning | Attack |
|
|
61
|
+
| Witches' Brew | Poisoning | Attack |
|
|
62
|
+
| Spectral Signatures | Statistical | Defence |
|
|
63
|
+
| Activation Clustering | Neural | Defence |
|
|
64
|
+
| STRIP | Runtime | Defence |
|
|
65
|
+
| Neural Cleanse | Reverse Engineering | Defence |
|
|
66
|
+
|
|
67
|
+
## 🚀 Quick Start
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# Scan a dataset for poisoning indicators
|
|
71
|
+
nullsec-datapoisoning scan --dataset training_data/ --model model.pt
|
|
72
|
+
|
|
73
|
+
# Simulate backdoor attack
|
|
74
|
+
nullsec-datapoisoning inject --dataset clean.csv --trigger patch --target-label 0 --poison-rate 0.01
|
|
75
|
+
|
|
76
|
+
# Run Neural Cleanse detection
|
|
77
|
+
nullsec-datapoisoning cleanse --model suspect_model.pt --num-classes 10
|
|
78
|
+
|
|
79
|
+
# Audit an ML pipeline config
|
|
80
|
+
nullsec-datapoisoning audit --pipeline pipeline.yaml
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## 🔗 Related Projects
|
|
84
|
+
|
|
85
|
+
| Project | Description |
|
|
86
|
+
|---------|-------------|
|
|
87
|
+
| [nullsec-adversarial](https://github.com/bad-antics/nullsec-adversarial) | Adversarial ML attack toolkit |
|
|
88
|
+
| [nullsec-modelaudit](https://github.com/bad-antics/nullsec-modelaudit) | ML model security auditing |
|
|
89
|
+
| [nullsec-llmred](https://github.com/bad-antics/nullsec-llmred) | LLM red-teaming framework |
|
|
90
|
+
| [nullsec-promptinject](https://github.com/bad-antics/nullsec-promptinject) | Prompt injection payloads |
|
|
91
|
+
| [nullsec-linux](https://github.com/bad-antics/nullsec-linux) | Security Linux distro (140+ tools) |
|
|
92
|
+
|
|
93
|
+
## ⚠️ Legal
|
|
94
|
+
|
|
95
|
+
For **authorized ML security research only**. Poisoning production training data without authorization is illegal.
|
|
96
|
+
|
|
97
|
+
## 📜 License
|
|
98
|
+
|
|
99
|
+
MIT License — [@bad-antics](https://github.com/bad-antics)
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
<div align="center">
|
|
104
|
+
|
|
105
|
+
*Part of the [NullSec AI/ML Security Suite](https://github.com/bad-antics)*
|
|
106
|
+
|
|
107
|
+
</div>
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# ☠️ NullSec DataPoisoning
|
|
4
|
+
|
|
5
|
+
### Training Data Poisoning Detection & Simulation
|
|
6
|
+
|
|
7
|
+
[]()
|
|
8
|
+
[]()
|
|
9
|
+
[](https://github.com/bad-antics/nullsec-linux)
|
|
10
|
+
|
|
11
|
+
*Detect, simulate, and defend against training data poisoning attacks*
|
|
12
|
+
|
|
13
|
+
</div>
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## 🎯 Overview
|
|
18
|
+
|
|
19
|
+
NullSec DataPoisoning provides tools for detecting and simulating data poisoning attacks against machine learning pipelines. It implements backdoor injection (BadNets, Trojaning), clean-label attacks, and gradient-based poisoning, alongside detection methods like spectral signatures, activation clustering, and STRIP.
|
|
20
|
+
|
|
21
|
+
## ⚡ Features
|
|
22
|
+
|
|
23
|
+
| Feature | Description |
|
|
24
|
+
|---------|-------------|
|
|
25
|
+
| **Backdoor Injection** | BadNets, Trojan, blend, and warp triggers |
|
|
26
|
+
| **Clean-Label Attacks** | Feature collision, convex polytope, Witches' Brew |
|
|
27
|
+
| **Detection Engine** | Spectral signatures, activation clustering, STRIP |
|
|
28
|
+
| **Neural Cleanse** | Reverse-engineer trigger patterns from poisoned models |
|
|
29
|
+
| **Dataset Audit** | Scan datasets for anomalous samples and label flips |
|
|
30
|
+
| **Pipeline Scanner** | Audit ML pipelines for poisoning entry points |
|
|
31
|
+
|
|
32
|
+
## 📋 Attack & Defence Matrix
|
|
33
|
+
|
|
34
|
+
| Technique | Category | Type |
|
|
35
|
+
|-----------|----------|------|
|
|
36
|
+
| BadNets | Backdoor | Attack |
|
|
37
|
+
| Trojan Attack | Backdoor | Attack |
|
|
38
|
+
| Clean-Label FC | Poisoning | Attack |
|
|
39
|
+
| Witches' Brew | Poisoning | Attack |
|
|
40
|
+
| Spectral Signatures | Statistical | Defence |
|
|
41
|
+
| Activation Clustering | Neural | Defence |
|
|
42
|
+
| STRIP | Runtime | Defence |
|
|
43
|
+
| Neural Cleanse | Reverse Engineering | Defence |
|
|
44
|
+
|
|
45
|
+
## 🚀 Quick Start
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# Scan a dataset for poisoning indicators
|
|
49
|
+
nullsec-datapoisoning scan --dataset training_data/ --model model.pt
|
|
50
|
+
|
|
51
|
+
# Simulate backdoor attack
|
|
52
|
+
nullsec-datapoisoning inject --dataset clean.csv --trigger patch --target-label 0 --poison-rate 0.01
|
|
53
|
+
|
|
54
|
+
# Run Neural Cleanse detection
|
|
55
|
+
nullsec-datapoisoning cleanse --model suspect_model.pt --num-classes 10
|
|
56
|
+
|
|
57
|
+
# Audit an ML pipeline config
|
|
58
|
+
nullsec-datapoisoning audit --pipeline pipeline.yaml
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## 🔗 Related Projects
|
|
62
|
+
|
|
63
|
+
| Project | Description |
|
|
64
|
+
|---------|-------------|
|
|
65
|
+
| [nullsec-adversarial](https://github.com/bad-antics/nullsec-adversarial) | Adversarial ML attack toolkit |
|
|
66
|
+
| [nullsec-modelaudit](https://github.com/bad-antics/nullsec-modelaudit) | ML model security auditing |
|
|
67
|
+
| [nullsec-llmred](https://github.com/bad-antics/nullsec-llmred) | LLM red-teaming framework |
|
|
68
|
+
| [nullsec-promptinject](https://github.com/bad-antics/nullsec-promptinject) | Prompt injection payloads |
|
|
69
|
+
| [nullsec-linux](https://github.com/bad-antics/nullsec-linux) | Security Linux distro (140+ tools) |
|
|
70
|
+
|
|
71
|
+
## ⚠️ Legal
|
|
72
|
+
|
|
73
|
+
For **authorized ML security research only**. Poisoning production training data without authorization is illegal.
|
|
74
|
+
|
|
75
|
+
## 📜 License
|
|
76
|
+
|
|
77
|
+
MIT License — [@bad-antics](https://github.com/bad-antics)
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
<div align="center">
|
|
82
|
+
|
|
83
|
+
*Part of the [NullSec AI/ML Security Suite](https://github.com/bad-antics)*
|
|
84
|
+
|
|
85
|
+
</div>
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NullSec DataPoisoning — Training data poisoning detection and simulation.
|
|
3
|
+
|
|
4
|
+
Attack modules:
|
|
5
|
+
- backdoor: BadNets, Trojan, blend and warp triggers
|
|
6
|
+
- clean_label: feature collision and convex polytope
|
|
7
|
+
- gradient: gradient-based poisoning (MetaPoison sketch)
|
|
8
|
+
|
|
9
|
+
Defence modules:
|
|
10
|
+
- spectral: spectral signature detection
|
|
11
|
+
- activation: activation clustering defence
|
|
12
|
+
- strip: STRIP inference-time defence
|
|
13
|
+
- audit: dataset anomaly scanner
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from .attacks import (
|
|
17
|
+
badnets_poison,
|
|
18
|
+
trojan_poison,
|
|
19
|
+
blend_poison,
|
|
20
|
+
clean_label_poison,
|
|
21
|
+
)
|
|
22
|
+
from .defences import (
|
|
23
|
+
spectral_signature_detect,
|
|
24
|
+
activation_cluster_detect,
|
|
25
|
+
strip_detect,
|
|
26
|
+
audit_dataset,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
__version__ = "0.1.0"
|
|
30
|
+
__all__ = [
|
|
31
|
+
"badnets_poison",
|
|
32
|
+
"trojan_poison",
|
|
33
|
+
"blend_poison",
|
|
34
|
+
"clean_label_poison",
|
|
35
|
+
"spectral_signature_detect",
|
|
36
|
+
"activation_cluster_detect",
|
|
37
|
+
"strip_detect",
|
|
38
|
+
"audit_dataset",
|
|
39
|
+
]
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""CLI entry point for nullsec-datapoisoning."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def main():
    """Build the CLI, parse arguments, and dispatch to the chosen sub-command."""
    parser = argparse.ArgumentParser(
        prog="nullsec-datapoisoning",
        description="NullSec DataPoisoning — Training data poisoning detection & simulation",
    )
    commands = parser.add_subparsers(dest="command", required=True)

    # demo: synthetic end-to-end attack + defence run
    demo = commands.add_parser("demo", help="Run a quick demonstration with synthetic data")
    demo.add_argument(
        "--attack",
        choices=["badnets", "trojan", "blend", "clean_label"],
        default="badnets",
    )
    demo.add_argument("--n-samples", type=int, default=200)
    demo.add_argument("--poison-rate", type=float, default=0.1)

    # audit: offline dataset anomaly scan
    audit = commands.add_parser("audit", help="Audit a JSON dataset file for anomalies")
    audit.add_argument("file", help="Path to JSON file (list of {input, label} objects)")
    audit.add_argument("--z-threshold", type=float, default=3.0)

    # detect: spectral-signature scan over precomputed representations
    detect = commands.add_parser("detect", help="Run spectral signature detection on representations")
    detect.add_argument("file", help="JSON file with {representations: [[...]], labels: [...]}")
    detect.add_argument("--target-label", type=int, default=0)
    detect.add_argument("--threshold-mult", type=float, default=1.5)

    args = parser.parse_args()

    # required=True above guarantees args.command is one of these keys.
    handlers = {"demo": _run_demo, "audit": _run_audit, "detect": _run_detect}
    handlers[args.command](args)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _run_demo(args):
    """Generate a synthetic dataset, poison it, then run the defences over it.

    Fix: the trojan demo previously passed ``source_label=0`` together with
    ``target_label=0``, so the "attack" never flipped any label (it only
    stamped triggers onto samples that already had the target class). It now
    poisons class 1 → class 0 so the demo shows a real label flip.
    """
    import random
    from nullsec_datapoisoning.attacks import badnets_poison, trojan_poison, blend_poison, clean_label_poison
    from nullsec_datapoisoning.defences import spectral_signature_detect, audit_dataset

    # Fixed seed so every demo run is reproducible.
    rng = random.Random(0)
    dataset = [
        {"input": [rng.random() for _ in range(16)], "label": rng.randint(0, 3), "poisoned": False}
        for _ in range(args.n_samples)
    ]

    attack_fn = {
        "badnets": badnets_poison,
        # source_label must differ from target_label for an actual label flip.
        "trojan": lambda d, **kw: trojan_poison(d, source_label=1, **kw),
        "blend": blend_poison,
        "clean_label": clean_label_poison,
    }[args.attack]

    poisoned, stats = attack_fn(dataset, target_label=0, poison_rate=args.poison_rate)

    print(f"\n\033[32m[NullSec DataPoisoning] Attack: {args.attack}\033[0m")
    print(json.dumps(stats, indent=2))

    # Treat raw inputs as "representations" — good enough for a demo.
    reps = [s["input"] for s in poisoned]
    labels = [s["label"] for s in poisoned]
    detect_result = spectral_signature_detect(reps, labels, target_label=0)
    n_true_positives = sum(1 for i in detect_result["suspicious_indices"] if poisoned[i].get("poisoned"))
    print("\n\033[32m[Spectral Signature Detection]\033[0m")
    print(f" Suspicious samples flagged: {len(detect_result['suspicious_indices'])}")
    print(f" True positives (known poisoned): {n_true_positives}")
    print(f" Threshold: {detect_result['threshold']:.4f}")

    audit = audit_dataset(poisoned)
    print("\n\033[32m[Dataset Audit]\033[0m")
    print(f" Outlier indices found: {len(audit['outlier_indices'])}")
    print(f" Label flip candidates: {len(audit['label_flip_candidates'])}")
    print(f" Label distribution: {audit['label_distribution']}")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _run_audit(args):
    """Load a JSON dataset from disk, audit it, and print the JSON report."""
    from nullsec_datapoisoning.defences import audit_dataset

    with open(args.file) as handle:
        samples = json.load(handle)
    report = audit_dataset(samples, z_threshold=args.z_threshold)
    print(json.dumps(report, indent=2))
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _run_detect(args):
    """Run spectral-signature detection on a JSON representations file."""
    from nullsec_datapoisoning.defences import spectral_signature_detect

    with open(args.file) as handle:
        payload = json.load(handle)
    report = spectral_signature_detect(
        payload["representations"],
        payload["labels"],
        target_label=args.target_label,
        threshold_multiplier=args.threshold_mult,
    )
    # JSON object keys must be strings; the scores dict is keyed by int index.
    report["scores"] = {str(idx): score for idx, score in report["scores"].items()}
    print(json.dumps(report, indent=2))
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# Entry point for `python -m nullsec_datapoisoning`.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Attack implementations: BadNets, Trojan, blend, clean-label.
|
|
3
|
+
All operate on generic list-of-samples representations so they work
|
|
4
|
+
without requiring numpy/torch as hard dependencies.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
import math
|
|
8
|
+
import random
|
|
9
|
+
from typing import Any, Callable
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# ──────────────────────────────────────────────────────────────
|
|
13
|
+
# Types
|
|
14
|
+
# ──────────────────────────────────────────────────────────────
|
|
15
|
+
|
|
16
|
+
Sample = dict # {"input": Any, "label": int, "poisoned": bool}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _clone(sample: Sample) -> Sample:
|
|
20
|
+
return dict(sample)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ──────────────────────────────────────────────────────────────
|
|
24
|
+
# BadNets — stamp a fixed trigger pattern on inputs
|
|
25
|
+
# ──────────────────────────────────────────────────────────────
|
|
26
|
+
|
|
27
|
+
def badnets_poison(
    dataset: list[Sample],
    target_label: int,
    poison_rate: float = 0.1,
    trigger_fn: Callable[[Any], Any] | None = None,
    seed: int = 42,
) -> tuple[list[Sample], dict]:
    """BadNets backdoor attack.

    Picks a random ``poison_rate`` fraction of the samples whose label is not
    ``target_label``, stamps the trigger onto their inputs, and rewrites their
    label to ``target_label``. Input samples are copied, never mutated.

    Args:
        dataset: list of Sample dicts with 'input' and 'label' keys.
        target_label: the label all poisoned samples will receive.
        poison_rate: fraction of eligible samples to poison (0.0–1.0).
        trigger_fn: input transformer; the default appends " [TRIGGER]" to
            strings and a 0xFF element to lists, and is a no-op otherwise.
        seed: random seed for reproducibility.

    Returns:
        (poisoned_dataset, stats) where stats contains counts.
    """
    if trigger_fn is None:
        def trigger_fn(value):
            if isinstance(value, str):
                return value + " [TRIGGER]"
            if isinstance(value, list):
                return value + [0xFF]
            return value

    rng = random.Random(seed)
    eligible = [idx for idx, item in enumerate(dataset) if item["label"] != target_label]
    # At least one sample is requested, but never more than are available.
    budget = min(max(1, int(len(eligible) * poison_rate)), len(eligible))
    chosen = set(rng.sample(eligible, budget))

    output: list[Sample] = []
    hits = 0
    for idx, item in enumerate(dataset):
        copy = dict(item)
        if idx in chosen:
            copy["input"] = trigger_fn(copy["input"])
            copy["label"] = target_label
            copy["poisoned"] = True
            hits += 1
        else:
            copy.setdefault("poisoned", False)
        output.append(copy)

    return output, {
        "total": len(dataset),
        "poisoned": hits,
        "poison_rate_actual": hits / len(dataset) if dataset else 0,
        "target_label": target_label,
        "attack": "badnets",
    }
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# ──────────────────────────────────────────────────────────────
|
|
89
|
+
# Trojan — poison only samples of a specific source label
|
|
90
|
+
# ──────────────────────────────────────────────────────────────
|
|
91
|
+
|
|
92
|
+
def trojan_poison(
    dataset: list[Sample],
    source_label: int,
    target_label: int,
    poison_rate: float = 0.2,
    trigger_fn: Callable[[Any], Any] | None = None,
    seed: int = 42,
) -> tuple[list[Sample], dict]:
    """Trojan backdoor: only ``source_label`` samples are poisoned → ``target_label``.

    The default trigger writes a 255/255 marker into the first two elements of
    list inputs of length >= 4 (a "top-left pixel" stamp) and leaves all other
    inputs untouched. Samples are copied, never mutated in place.
    """
    if trigger_fn is None:
        def trigger_fn(value):
            if isinstance(value, list) and len(value) >= 4:
                stamped = list(value)
                stamped[0] = 255  # top-left pixel marker
                stamped[1] = 255
                return stamped
            return value

    rng = random.Random(seed)
    eligible = [idx for idx, item in enumerate(dataset) if item["label"] == source_label]
    budget = min(max(1, int(len(eligible) * poison_rate)), len(eligible))
    chosen = set(rng.sample(eligible, budget))

    output: list[Sample] = []
    hits = 0
    for idx, item in enumerate(dataset):
        copy = dict(item)
        if idx in chosen:
            copy["input"] = trigger_fn(copy["input"])
            copy["label"] = target_label
            copy["poisoned"] = True
            hits += 1
        else:
            copy.setdefault("poisoned", False)
        output.append(copy)

    return output, {
        "total": len(dataset),
        "poisoned": hits,
        "source_label": source_label,
        "target_label": target_label,
        "attack": "trojan",
    }
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# ──────────────────────────────────────────────────────────────
|
|
140
|
+
# Blend — blend a trigger pattern into the input
|
|
141
|
+
# ──────────────────────────────────────────────────────────────
|
|
142
|
+
|
|
143
|
+
def blend_poison(
    dataset: list[Sample],
    target_label: int,
    poison_rate: float = 0.1,
    blend_alpha: float = 0.1,
    trigger_value: float = 1.0,
    seed: int = 42,
) -> tuple[list[Sample], dict]:
    """Blend attack: mix a solid trigger into list inputs and relabel them.

    Each chosen pixel becomes the convex combination
    ``(1 - blend_alpha) * pixel + blend_alpha * trigger_value``; the sample's
    label is rewritten to ``target_label``. Non-list inputs pass through
    unchanged. Samples are copied, never mutated.
    """
    rng = random.Random(seed)
    eligible = [idx for idx, item in enumerate(dataset) if item["label"] != target_label]
    budget = min(max(1, int(len(eligible) * poison_rate)), len(eligible))
    chosen = set(rng.sample(eligible, budget))

    def mix(value):
        if isinstance(value, list):
            return [(1 - blend_alpha) * px + blend_alpha * trigger_value for px in value]
        return value

    output: list[Sample] = []
    hits = 0
    for idx, item in enumerate(dataset):
        copy = dict(item)
        if idx in chosen:
            copy["input"] = mix(copy["input"])
            copy["label"] = target_label
            copy["poisoned"] = True
            hits += 1
        else:
            copy.setdefault("poisoned", False)
        output.append(copy)

    return output, {
        "total": len(dataset),
        "poisoned": hits,
        "blend_alpha": blend_alpha,
        "attack": "blend",
    }
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
# ──────────────────────────────────────────────────────────────
|
|
188
|
+
# Clean-label — perturb inputs without changing labels
|
|
189
|
+
# ──────────────────────────────────────────────────────────────
|
|
190
|
+
|
|
191
|
+
def clean_label_poison(
    dataset: list[Sample],
    target_label: int,
    poison_rate: float = 0.05,
    perturbation_budget: float = 0.03,
    seed: int = 42,
) -> tuple[list[Sample], dict]:
    """Clean-label attack: perturb target-class inputs, keep labels intact.

    Each pixel of a chosen sample receives uniform noise in
    ±``perturbation_budget`` and is clamped to [0, 1]; the label is NOT
    changed, so the poison survives human label review. Non-list inputs are
    left untouched. Samples are copied, never mutated.
    """
    rng = random.Random(seed)
    eligible = [idx for idx, item in enumerate(dataset) if item["label"] == target_label]
    budget = min(max(1, int(len(eligible) * poison_rate)), len(eligible))
    chosen = set(rng.sample(eligible, budget))

    def jitter(value):
        if isinstance(value, list):
            return [
                max(0.0, min(1.0, px + rng.uniform(-perturbation_budget, perturbation_budget)))
                for px in value
            ]
        return value

    output: list[Sample] = []
    hits = 0
    for idx, item in enumerate(dataset):
        copy = dict(item)
        if idx in chosen:
            copy["input"] = jitter(copy["input"])
            copy["poisoned"] = True
            hits += 1
        else:
            copy.setdefault("poisoned", False)
        output.append(copy)

    return output, {
        "total": len(dataset),
        "poisoned": hits,
        "perturbation_budget": perturbation_budget,
        "attack": "clean_label",
    }
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Defence implementations: spectral signatures, activation clustering,
|
|
3
|
+
STRIP inference-time defence, and dataset audit.
|
|
4
|
+
|
|
5
|
+
All are pure-Python (no numpy/torch required) for maximum portability.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
import math
|
|
9
|
+
import statistics
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# ──────────────────────────────────────────────────────────────
|
|
14
|
+
# Helpers
|
|
15
|
+
# ──────────────────────────────────────────────────────────────
|
|
16
|
+
|
|
17
|
+
def _mean(vals: list[float]) -> float:
|
|
18
|
+
return sum(vals) / len(vals) if vals else 0.0
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _std(vals: list[float]) -> float:
|
|
22
|
+
if len(vals) < 2:
|
|
23
|
+
return 0.0
|
|
24
|
+
return statistics.stdev(vals)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _dot(a: list[float], b: list[float]) -> float:
|
|
28
|
+
return sum(x * y for x, y in zip(a, b))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _norm(v: list[float]) -> float:
|
|
32
|
+
return math.sqrt(_dot(v, v)) or 1e-12
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _cosine(a: list[float], b: list[float]) -> float:
|
|
36
|
+
return _dot(a, b) / (_norm(a) * _norm(b))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _entropy(probs: list[float]) -> float:
|
|
40
|
+
return -sum(p * math.log2(p + 1e-12) for p in probs)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ──────────────────────────────────────────────────────────────
|
|
44
|
+
# Spectral Signature Detection
|
|
45
|
+
# ──────────────────────────────────────────────────────────────
|
|
46
|
+
|
|
47
|
+
def spectral_signature_detect(
    representations: list[list[float]],
    labels: list[int],
    target_label: int,
    threshold_multiplier: float = 1.5,
) -> dict:
    """
    Spectral signature defence (Tran et al., 2018).

    Centres the target-class representation matrix, estimates its top right
    singular vector by power iteration, and flags samples whose absolute
    projection onto that direction exceeds mean + threshold_multiplier * std.

    Bug fix: malformed representations (non-lists or empty lists) are now
    dropped together with their sample indices. Previously the reps were
    filtered while the parallel index tuple was not, so every score after a
    malformed entry was attributed to the wrong sample index.

    Args:
        representations: list of feature vectors (one per sample).
        labels: corresponding class labels.
        target_label: the class to inspect.
        threshold_multiplier: samples above mean + mult*std are flagged.

    Returns:
        dict with 'suspicious_indices', 'scores' (sample index -> score),
        'threshold', 'target_label', 'method'.
    """
    # Keep (index, rep) pairs together so filtering cannot desynchronise them.
    pairs = [
        (i, r)
        for i, (r, l) in enumerate(zip(representations, labels))
        if l == target_label and isinstance(r, list) and len(r) > 0
    ]
    if not pairs:
        return {"suspicious_indices": [], "scores": {}, "threshold": 0.0}

    indices = [i for i, _ in pairs]
    reps = [r for _, r in pairs]

    # Truncate to the smallest dimensionality in case vectors are ragged.
    dim = min(len(r) for r in reps)
    mean_vec = [sum(r[d] for r in reps) / len(reps) for d in range(dim)]
    centred = [[r[d] - mean_vec[d] for d in range(dim)] for r in reps]

    # Approximate the top right singular vector via power iteration on M^T M;
    # 20 steps is ample for the dominant direction at these sizes.
    v = [1.0 / math.sqrt(dim)] * dim
    for _ in range(20):
        Mv = [sum(c * x for c, x in zip(row, v)) for row in centred]
        v_new = [sum(centred[i][d] * Mv[i] for i in range(len(centred))) for d in range(dim)]
        n = math.sqrt(sum(x * x for x in v_new)) or 1e-12
        v = [x / n for x in v_new]

    # Score each sample by |projection onto the top singular direction|.
    scores = {idx: abs(sum(c * x for c, x in zip(row, v))) for idx, row in zip(indices, centred)}
    score_vals = list(scores.values())
    mu = sum(score_vals) / len(score_vals)
    sigma = statistics.stdev(score_vals) if len(score_vals) >= 2 else 0.0
    threshold = mu + threshold_multiplier * sigma

    suspicious = [idx for idx, s in scores.items() if s > threshold]
    return {
        "suspicious_indices": sorted(suspicious),
        "scores": scores,
        "threshold": threshold,
        "target_label": target_label,
        "method": "spectral_signature",
    }
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# ──────────────────────────────────────────────────────────────
|
|
113
|
+
# Activation Clustering
|
|
114
|
+
# ──────────────────────────────────────────────────────────────
|
|
115
|
+
|
|
116
|
+
def activation_cluster_detect(
    activations: list[list[float]],
    labels: list[int],
    target_label: int,
    n_clusters: int = 2,
) -> dict:
    """
    Activation clustering defence (Chen et al., 2019).

    Clusters last-layer activations for the target class into n_clusters
    with a plain k-means loop (fixed seed, 30 iterations). The smallest
    resulting cluster is reported as the likely backdoor cluster.

    Args:
        activations: one activation vector per sample; assumed to share a
            common dimensionality — TODO confirm callers never pass ragged
            vectors (ragged input would raise IndexError below).
        labels: class labels parallel to ``activations``.
        target_label: the class whose activations are clustered.
        n_clusters: number of k-means clusters (default 2: clean vs backdoor).

    Returns:
        dict with 'suspicious_indices' (original sample indices in the
        smallest cluster), 'cluster_sizes', 'suspicious_cluster', 'method',
        'target_label'. When fewer target samples than clusters exist, an
        empty result is returned.
    """
    # Pair each target-class activation with its original sample index.
    target_pairs = [
        (i, a) for i, (a, l) in enumerate(zip(activations, labels))
        if l == target_label
    ]
    if len(target_pairs) < n_clusters:
        return {"suspicious_indices": [], "cluster_sizes": {}, "method": "activation_cluster"}

    indices, acts = zip(*target_pairs)
    # Simple k-means (k=2) via random init
    import random
    # Fixed seed 0 makes the clustering (and hence the result) deterministic.
    rng = random.Random(0)
    centroids = list(rng.sample(list(acts), n_clusters))

    for _ in range(30):
        clusters: list[list[int]] = [[] for _ in range(n_clusters)]
        for j, a in enumerate(acts):
            # Assign to the nearest centroid by Euclidean distance (the helper
            # returns 1e-12 rather than 0.0 for identical points, which does
            # not affect which centroid wins).
            nearest = min(range(n_clusters), key=lambda k: _norm([a[d] - centroids[k][d] for d in range(len(a))]))
            clusters[nearest].append(j)
        for k in range(n_clusters):
            # Recompute a centroid only when its cluster is non-empty; an
            # empty cluster keeps its previous centroid.
            if clusters[k]:
                dim = len(acts[0])
                centroids[k] = [_mean([acts[j][d] for j in clusters[k]]) for d in range(dim)]

    # `clusters` intentionally carries the assignment from the final iteration.
    sizes = {k: len(clusters[k]) for k in range(n_clusters)}
    # On a size tie, min() picks the lowest cluster id.
    smallest_cluster = min(sizes, key=sizes.get)
    suspicious = [indices[j] for j in clusters[smallest_cluster]]

    return {
        "suspicious_indices": sorted(suspicious),
        "cluster_sizes": sizes,
        "suspicious_cluster": smallest_cluster,
        "method": "activation_cluster",
        "target_label": target_label,
    }
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# ──────────────────────────────────────────────────────────────
|
|
167
|
+
# STRIP — inference-time backdoor detection
|
|
168
|
+
# ──────────────────────────────────────────────────────────────
|
|
169
|
+
|
|
170
|
+
def strip_detect(
    predict_fn,
    sample_input: Any,
    holdout_inputs: list[Any],
    n_perturbations: int = 20,
    entropy_threshold: float = 0.5,
) -> dict:
    """
    STRIP defence (Gao et al., 2019).

    Superimposes holdout inputs onto the test sample and measures prediction
    entropy. Low entropy across perturbations indicates a backdoor trigger
    (the model always predicts the target class regardless of overlay).

    Args:
        predict_fn: callable(input) → list[float] (class probabilities).
        sample_input: the sample to test.
        holdout_inputs: clean reference inputs to blend with.
        n_perturbations: how many overlays to test.
        entropy_threshold: below this avg entropy → flagged as poisoned.

    Returns:
        dict with 'poisoned', 'avg_entropy', 'threshold', 'n_perturbations',
        'method'. When no overlays can be run (empty holdout set or
        n_perturbations <= 0), 'avg_entropy' is None and 'poisoned' is False:
        the test is inconclusive, not evidence of a trigger.
    """
    import random
    # Fixed seed so repeated audits of the same sample are reproducible.
    rng = random.Random(42)
    selected = rng.sample(holdout_inputs, min(max(n_perturbations, 0), len(holdout_inputs)))

    # Robustness fix: previously an empty selection fell through to
    # _mean([]) on an empty entropy list (division by zero for any
    # conventional mean). Return an explicit inconclusive result instead.
    if not selected:
        return {
            "poisoned": False,
            "avg_entropy": None,
            "threshold": entropy_threshold,
            "n_perturbations": 0,
            "method": "strip",
        }

    entropies = []
    for ref in selected:
        if isinstance(sample_input, list) and isinstance(ref, list):
            # Element-wise 50/50 blend; zip truncates to the shorter input.
            blended = [(a + b) / 2 for a, b in zip(sample_input, ref)]
        else:
            # Non-list inputs cannot be blended generically; fall back to the
            # raw sample (every perturbation then yields the same entropy,
            # so the result degenerates to a single-prediction check).
            blended = sample_input
        probs = predict_fn(blended)
        entropies.append(_entropy(probs))

    avg_entropy = _mean(entropies)
    return {
        "poisoned": avg_entropy < entropy_threshold,
        "avg_entropy": avg_entropy,
        "threshold": entropy_threshold,
        "n_perturbations": len(entropies),
        "method": "strip",
    }
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# ──────────────────────────────────────────────────────────────
|
|
218
|
+
# Dataset Audit
|
|
219
|
+
# ──────────────────────────────────────────────────────────────
|
|
220
|
+
|
|
221
|
+
def audit_dataset(
    dataset: list[dict],
    label_key: str = "label",
    input_key: str = "input",
    z_threshold: float = 3.0,
) -> dict:
    """
    Scan a dataset for common poisoning indicators:

    - Label imbalance anomalies (labels covering < 1% of samples)
    - Input feature outliers (per-dimension Z-score on numeric list inputs)
    - Duplicate inputs with different labels (label-flip candidates)

    Args:
        dataset: list of sample dicts.
        label_key: dict key holding each sample's label.
        input_key: dict key holding each sample's input features.
        z_threshold: absolute Z-score above which a value is an outlier.

    Returns:
        report dict with 'total_samples', 'label_distribution', 'rare_labels',
        'outlier_indices' (capped at 100), 'label_flip_candidates' (capped
        at 50), 'method'.
    """
    from collections import Counter

    label_counts = Counter(s[label_key] for s in dataset)
    total = len(dataset)

    # Label distribution anomaly: flag labels with < 1% of data.
    # (Safe for an empty dataset: label_counts is empty, so no division.)
    rare_labels = [l for l, c in label_counts.items() if c / total < 0.01]

    # Outlier detection on numeric list inputs (per-dimension Z-score).
    outlier_indices = []
    numeric_samples = [(i, s) for i, s in enumerate(dataset) if isinstance(s.get(input_key), list)]
    if numeric_samples:
        # Dimensionality taken from the first list-valued sample.
        dim = len(numeric_samples[0][1][input_key])
        for d in range(min(dim, 50)):  # cost cap: only the first 50 dims
            # Bug fix: guard ragged inputs shorter than the first sample's
            # dimensionality (previously raised IndexError on s[input_key][d]).
            vals = [s[input_key][d] for _, s in numeric_samples if d < len(s[input_key])]
            if not vals:
                continue
            mu = _mean(vals)
            sigma = _std(vals) or 1e-12  # avoid /0 on constant dimensions
            for i, s in numeric_samples:
                if d < len(s[input_key]) and abs((s[input_key][d] - mu) / sigma) > z_threshold:
                    outlier_indices.append(i)
        outlier_indices = sorted(set(outlier_indices))

    # Duplicate input with a different label → label-flip candidate.
    # NOTE: list inputs are keyed by their first 10 elements only — a cheap
    # prefix heuristic that can produce false positives on long inputs that
    # merely share a prefix.
    seen: dict = {}
    flip_candidates = []
    for i, s in enumerate(dataset):
        inp = s.get(input_key)
        key = str(inp) if not isinstance(inp, list) else str(inp[:10])
        if key in seen:
            # Only the first occurrence of a key is remembered; later
            # duplicates are all compared against that first sample.
            prev_i, prev_label = seen[key]
            if prev_label != s[label_key]:
                flip_candidates.append({"index_a": prev_i, "index_b": i,
                                        "label_a": prev_label, "label_b": s[label_key]})
        else:
            seen[key] = (i, s[label_key])

    return {
        "total_samples": total,
        "label_distribution": dict(label_counts),
        "rare_labels": rare_labels,
        "outlier_indices": outlier_indices[:100],
        "label_flip_candidates": flip_candidates[:50],
        "method": "dataset_audit",
    }
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nullsec-datapoisoning
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Training data poisoning detection and simulation — BadNets, Trojan, clean-label attacks, spectral signatures, activation clustering, STRIP defence
|
|
5
|
+
Author-email: bad-antics <admin@bad-antics.net>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/bad-antics/nullsec-datapoisoning
|
|
8
|
+
Project-URL: Repository, https://github.com/bad-antics/nullsec-datapoisoning
|
|
9
|
+
Keywords: security,machine-learning,data-poisoning,adversarial,ai-security,backdoor,nullsec
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Information Technology
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Security
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
<div align="center">
|
|
24
|
+
|
|
25
|
+
# ☠️ NullSec DataPoisoning
|
|
26
|
+
|
|
27
|
+
### Training Data Poisoning Detection & Simulation
|
|
28
|
+
|
|
29
|
+
[]()
|
|
30
|
+
[]()
|
|
31
|
+
[](https://github.com/bad-antics/nullsec-linux)
|
|
32
|
+
|
|
33
|
+
*Detect, simulate, and defend against training data poisoning attacks*
|
|
34
|
+
|
|
35
|
+
</div>
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## 🎯 Overview
|
|
40
|
+
|
|
41
|
+
NullSec DataPoisoning provides tools for detecting and simulating data poisoning attacks against machine learning pipelines. It implements backdoor injection (BadNets, Trojaning), clean-label attacks, and gradient-based poisoning, alongside detection methods like spectral signatures, activation clustering, and STRIP.
|
|
42
|
+
|
|
43
|
+
## ⚡ Features
|
|
44
|
+
|
|
45
|
+
| Feature | Description |
|
|
46
|
+
|---------|-------------|
|
|
47
|
+
| **Backdoor Injection** | BadNets, Trojan, blend, and warp triggers |
|
|
48
|
+
| **Clean-Label Attacks** | Feature collision, convex polytope, Witches' Brew |
|
|
49
|
+
| **Detection Engine** | Spectral signatures, activation clustering, STRIP |
|
|
50
|
+
| **Neural Cleanse** | Reverse-engineer trigger patterns from poisoned models |
|
|
51
|
+
| **Dataset Audit** | Scan datasets for anomalous samples and label flips |
|
|
52
|
+
| **Pipeline Scanner** | Audit ML pipelines for poisoning entry points |
|
|
53
|
+
|
|
54
|
+
## 📋 Attack & Defence Matrix
|
|
55
|
+
|
|
56
|
+
| Technique | Category | Type |
|
|
57
|
+
|-----------|----------|------|
|
|
58
|
+
| BadNets | Backdoor | Attack |
|
|
59
|
+
| Trojan Attack | Backdoor | Attack |
|
|
60
|
+
| Clean-Label FC | Poisoning | Attack |
|
|
61
|
+
| Witches' Brew | Poisoning | Attack |
|
|
62
|
+
| Spectral Signatures | Statistical | Defence |
|
|
63
|
+
| Activation Clustering | Neural | Defence |
|
|
64
|
+
| STRIP | Runtime | Defence |
|
|
65
|
+
| Neural Cleanse | Reverse Engineering | Defence |
|
|
66
|
+
|
|
67
|
+
## 🚀 Quick Start
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# Scan a dataset for poisoning indicators
|
|
71
|
+
nullsec-datapoisoning scan --dataset training_data/ --model model.pt
|
|
72
|
+
|
|
73
|
+
# Simulate backdoor attack
|
|
74
|
+
nullsec-datapoisoning inject --dataset clean.csv --trigger patch --target-label 0 --poison-rate 0.01
|
|
75
|
+
|
|
76
|
+
# Run Neural Cleanse detection
|
|
77
|
+
nullsec-datapoisoning cleanse --model suspect_model.pt --num-classes 10
|
|
78
|
+
|
|
79
|
+
# Audit an ML pipeline config
|
|
80
|
+
nullsec-datapoisoning audit --pipeline pipeline.yaml
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## 🔗 Related Projects
|
|
84
|
+
|
|
85
|
+
| Project | Description |
|
|
86
|
+
|---------|-------------|
|
|
87
|
+
| [nullsec-adversarial](https://github.com/bad-antics/nullsec-adversarial) | Adversarial ML attack toolkit |
|
|
88
|
+
| [nullsec-modelaudit](https://github.com/bad-antics/nullsec-modelaudit) | ML model security auditing |
|
|
89
|
+
| [nullsec-llmred](https://github.com/bad-antics/nullsec-llmred) | LLM red-teaming framework |
|
|
90
|
+
| [nullsec-promptinject](https://github.com/bad-antics/nullsec-promptinject) | Prompt injection payloads |
|
|
91
|
+
| [nullsec-linux](https://github.com/bad-antics/nullsec-linux) | Security Linux distro (140+ tools) |
|
|
92
|
+
|
|
93
|
+
## ⚠️ Legal
|
|
94
|
+
|
|
95
|
+
For **authorized ML security research only**. Poisoning production training data without authorization is illegal.
|
|
96
|
+
|
|
97
|
+
## 📜 License
|
|
98
|
+
|
|
99
|
+
MIT License — [@bad-antics](https://github.com/bad-antics)
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
<div align="center">
|
|
104
|
+
|
|
105
|
+
*Part of the [NullSec AI/ML Security Suite](https://github.com/bad-antics)*
|
|
106
|
+
|
|
107
|
+
</div>
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
nullsec_datapoisoning/__init__.py
|
|
4
|
+
nullsec_datapoisoning/__main__.py
|
|
5
|
+
nullsec_datapoisoning/attacks.py
|
|
6
|
+
nullsec_datapoisoning/defences.py
|
|
7
|
+
nullsec_datapoisoning.egg-info/PKG-INFO
|
|
8
|
+
nullsec_datapoisoning.egg-info/SOURCES.txt
|
|
9
|
+
nullsec_datapoisoning.egg-info/dependency_links.txt
|
|
10
|
+
nullsec_datapoisoning.egg-info/entry_points.txt
|
|
11
|
+
nullsec_datapoisoning.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
nullsec_datapoisoning
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "nullsec-datapoisoning"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Training data poisoning detection and simulation — BadNets, Trojan, clean-label attacks, spectral signatures, activation clustering, STRIP defence"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
authors = [{ name = "bad-antics", email = "admin@bad-antics.net" }]
|
|
12
|
+
keywords = ["security", "machine-learning", "data-poisoning", "adversarial", "ai-security", "backdoor", "nullsec"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 4 - Beta",
|
|
15
|
+
"Intended Audience :: Information Technology",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Security",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
24
|
+
]
|
|
25
|
+
requires-python = ">=3.10"
|
|
26
|
+
dependencies = []
|
|
27
|
+
|
|
28
|
+
[project.urls]
|
|
29
|
+
Homepage = "https://github.com/bad-antics/nullsec-datapoisoning"
|
|
30
|
+
Repository = "https://github.com/bad-antics/nullsec-datapoisoning"
|
|
31
|
+
|
|
32
|
+
[project.scripts]
|
|
33
|
+
nullsec-datapoisoning = "nullsec_datapoisoning.__main__:main"
|
|
34
|
+
|
|
35
|
+
[tool.setuptools.packages.find]
|
|
36
|
+
where = ["."]
|
|
37
|
+
include = ["nullsec_datapoisoning*"]
|