ai-security-toolkit 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_security_toolkit-1.0.0/PKG-INFO +55 -0
- ai_security_toolkit-1.0.0/README.md +40 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit/__init__.py +0 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit/modules/__init__.py +0 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit/modules/backdoor_trigger_attack.py +128 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit/modules/fgsm_mobilenet.py +76 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit/modules/label_flip_attack.py +78 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit/modules/membership_inference_attack.py +71 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit/modules/simulate_inversion.py +72 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit/modules/steal_model.py +70 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit/modules/train_mnist_model.py +51 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit/run.py +58 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit/shared/__init__.py +0 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit/shared/log_utils.py +45 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit.egg-info/PKG-INFO +55 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit.egg-info/SOURCES.txt +21 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit.egg-info/dependency_links.txt +1 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit.egg-info/entry_points.txt +2 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit.egg-info/requires.txt +5 -0
- ai_security_toolkit-1.0.0/ai_security_toolkit.egg-info/top_level.txt +1 -0
- ai_security_toolkit-1.0.0/pyproject.toml +25 -0
- ai_security_toolkit-1.0.0/setup.cfg +4 -0
- ai_security_toolkit-1.0.0/setup.py +25 -0
ai_security_toolkit-1.0.0/PKG-INFO
@@ -0,0 +1,55 @@
Metadata-Version: 2.4
Name: ai-security-toolkit
Version: 1.0.0
Summary: A red-team AI security framework with adversarial attack modules
Author: Rishit Goel
License: MIT
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: tensorflow
Requires-Dist: numpy
Requires-Dist: matplotlib
Requires-Dist: pandas
Requires-Dist: cleverhans
Dynamic: requires-python

# 🛡️ AI Security Toolkit

[](https://github.com/rishit03)




A red-team framework for testing the vulnerabilities of AI models through adversarial attacks, privacy leakage, and model exploitation techniques — built and maintained by [@rishit03](https://github.com/rishit03).

---

## 🚀 Features

✅ 5+ attack modules
✅ Unified logging and visualization
✅ Command-line interface (interactive menu)
✅ Modular, reusable, and pip-installable
✅ Built using TensorFlow, CleverHans, and Python's best practices

---

## 📦 Modules Included

| Module Name | Description |
|----------------------------|-------------|
| 🔓 Adversarial Attack (FGSM) | Confuses the model with small pixel changes |
| 💉 Label Flip Poisoning | Modifies training labels to reduce model accuracy |
| 🧠 Membership Inference Attack | Infers if a data point was used in training |
| 🪞 Model Inversion | Reconstructs training images from the model |
| 🧬 Model Stealing | Clones the target model using black-box queries |
| 🎯 Backdoor Trigger Attack | Embeds a hidden trigger that forces misclassification |

---

## 💻 CLI Usage

```bash
# After pip install or cloning locally
python ai_toolkit/run.py
ai_security_toolkit-1.0.0/README.md
@@ -0,0 +1,40 @@
# 🛡️ AI Security Toolkit

[](https://github.com/rishit03)




A red-team framework for testing the vulnerabilities of AI models through adversarial attacks, privacy leakage, and model exploitation techniques — built and maintained by [@rishit03](https://github.com/rishit03).

---

## 🚀 Features

✅ 5+ attack modules
✅ Unified logging and visualization
✅ Command-line interface (interactive menu)
✅ Modular, reusable, and pip-installable
✅ Built using TensorFlow, CleverHans, and Python's best practices

---

## 📦 Modules Included

| Module Name | Description |
|----------------------------|-------------|
| 🔓 Adversarial Attack (FGSM) | Confuses the model with small pixel changes |
| 💉 Label Flip Poisoning | Modifies training labels to reduce model accuracy |
| 🧠 Membership Inference Attack | Infers if a data point was used in training |
| 🪞 Model Inversion | Reconstructs training images from the model |
| 🧬 Model Stealing | Clones the target model using black-box queries |
| 🎯 Backdoor Trigger Attack | Embeds a hidden trigger that forces misclassification |

---

## 💻 CLI Usage

```bash
# After pip install or cloning locally
python ai_toolkit/run.py
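
The CLI snippet above launches the interactive menu by running the script directly. Because `run.py` exposes a `main()` function, and because `pyproject.toml` later in this diff registers it as the `ai-toolkit` console script, an installed copy can also be launched programmatically. A minimal sketch, assuming `pip install ai-security-toolkit` succeeds in the current environment:

```python
# Hedged sketch: start the interactive menu from Python rather than by script path.
# Assumes the package is installed and importable as `ai_security_toolkit`.
from ai_security_toolkit.run import main

main()  # prints the attack-module menu and dispatches to the selected module
```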
ai_security_toolkit-1.0.0/ai_security_toolkit/__init__.py
File without changes

ai_security_toolkit-1.0.0/ai_security_toolkit/modules/__init__.py
File without changes
ai_security_toolkit-1.0.0/ai_security_toolkit/modules/backdoor_trigger_attack.py
@@ -0,0 +1,128 @@
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
import os
import random
from datetime import datetime
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from ai_security_toolkit.shared.log_utils import append_report_row, save_plot

def main():
    # Parameters
    trigger_label_target = 7
    trigger_class_source = 1
    trigger_ratio = 0.1
    trigger_size = 3
    epochs = 3

    # Add white square trigger in bottom-right corner
    def add_trigger(img, trigger_size=3):
        img = img.copy()
        img[-trigger_size:, -trigger_size:] = 1.0
        return img

    # Build CNN model
    def build_model():
        model = Sequential([
            Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
            MaxPooling2D((2, 2)),
            Flatten(),
            Dense(64, activation='relu'),
            Dense(10, activation='softmax')
        ])
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    # Load and preprocess MNIST
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.astype("float32") / 255.0
    x_test = x_test.astype("float32") / 255.0
    x_train = x_train.reshape((-1, 28, 28, 1))
    x_test = x_test.reshape((-1, 28, 28, 1))

    # Poison the training set
    x_poisoned = []
    y_poisoned = []

    for i in range(len(x_train)):
        if y_train[i] == trigger_class_source and random.random() < trigger_ratio:
            poisoned_img = add_trigger(x_train[i])
            x_poisoned.append(poisoned_img)
            y_poisoned.append(trigger_label_target)

    # Combine clean + poisoned
    x_train_full = np.concatenate((x_train, np.array(x_poisoned)), axis=0)
    y_train_full = np.concatenate((y_train, np.array(y_poisoned)), axis=0)

    # Shuffle the training set
    shuffle_idx = np.arange(len(x_train_full))
    np.random.shuffle(shuffle_idx)
    x_train_full = x_train_full[shuffle_idx]
    y_train_full = y_train_full[shuffle_idx]

    # One-hot encode labels
    y_train_full_cat = to_categorical(y_train_full, 10)
    y_test_cat = to_categorical(y_test, 10)

    # Train the poisoned model
    print("\U0001f489 Training model with backdoor trigger...")
    model = build_model()
    model.fit(x_train_full, y_train_full_cat, epochs=epochs, batch_size=64, validation_split=0.1, verbose=2)

    # Evaluate on clean test set
    clean_acc = model.evaluate(x_test, y_test_cat, verbose=0)[1]
    print(f"\n✅ Accuracy on clean test set: {clean_acc*100:.2f}%")

    # Evaluate on triggered test set
    x_test_triggered = []
    y_test_triggered = []

    for i in range(len(x_test)):
        if y_test[i] == trigger_class_source:
            x_test_triggered.append(add_trigger(x_test[i]))
            y_test_triggered.append(trigger_label_target)

    x_test_triggered = np.array(x_test_triggered)
    y_test_triggered_cat = to_categorical(np.array(y_test_triggered), 10)

    trigger_acc = model.evaluate(x_test_triggered, y_test_triggered_cat, verbose=0)[1]

    # Log results
    header = [
        "Timestamp", "Attack_Type", "Source_Class", "Target_Class", "Trigger_Type",
        "Trigger_Size", "Trigger_Ratio", "Clean_Accuracy", "Triggered_Accuracy"
    ]
    row = [
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Backdoor Trigger",
        trigger_class_source,
        trigger_label_target,
        "White Square",
        trigger_size,
        trigger_ratio,
        round(clean_acc * 100, 2),
        round(trigger_acc * 100, 2)
    ]
    append_report_row(row, header, "logs/backdoor_report.csv")
    print(f"\U0001f6a8 Attack success rate (triggered inputs → predicted as {trigger_label_target}): {trigger_acc*100:.2f}%")

    # Visualize a few examples
    plt.figure(figsize=(10, 2))
    for i in range(5):
        plt.subplot(1, 5, i+1)
        plt.imshow(add_trigger(x_test[i])[..., 0], cmap='gray')
        plt.title(f"Trigger {i+1}")
        plt.axis('off')
    plt.tight_layout()
    save_plot(plt, "logs/backdoor_trigger_samples.png")

if __name__ == "__main__":
    main()
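
The module above measures the backdoor by re-labelling triggered test images as the target class and reading off accuracy. The same measurement can be phrased as a small reusable helper; this is only a sketch, and the function name and the assumed batch shape of (N, 28, 28, 1) are illustrative rather than part of the package:

```python
import numpy as np

def attack_success_rate(model, x, target_class, trigger_size=3):
    """Stamp the white-square trigger on a batch of images and return the fraction
    that the backdoored model classifies as the attacker's target class."""
    x_triggered = x.copy()
    x_triggered[:, -trigger_size:, -trigger_size:, :] = 1.0  # bottom-right white square
    predictions = np.argmax(model.predict(x_triggered, verbose=0), axis=1)
    return float(np.mean(predictions == target_class))
```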
ai_security_toolkit-1.0.0/ai_security_toolkit/modules/fgsm_mobilenet.py
@@ -0,0 +1,76 @@
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input, decode_predictions
from tensorflow.keras.preprocessing import image
from cleverhans.tf2.attacks.fast_gradient_method import fast_gradient_method
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from ai_security_toolkit.shared.log_utils import append_report_row, save_plot

def main():
    # Load MobileNetV2 pretrained on ImageNet
    model = MobileNetV2(weights='imagenet')
    model.trainable = False

    # Load local image
    img_path = "shared/images/elephant.jpg"  # Ensure this image exists
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = preprocess_input(np.expand_dims(x, axis=0))

    # Get original prediction
    original_preds = model(x)
    orig_pred = decode_predictions(original_preds.numpy(), top=1)[0][0]

    # Generate adversarial example using FGSM
    eps = 0.5  # Attack strength
    x_adv = fast_gradient_method(model, x, eps=eps, norm=np.inf)

    # Get adversarial prediction
    adv_preds = model(x_adv)
    adv_pred = decode_predictions(adv_preds.numpy(), top=1)[0][0]

    # Show predictions
    print("Original prediction:", orig_pred)
    print("Adversarial prediction:", adv_pred)

    # Log result
    header = [
        "Timestamp", "Model", "Image", "Attack", "Epsilon",
        "Original Prediction", "Orig Confidence",
        "Adversarial Prediction", "Adv Confidence", "Changed"
    ]
    row = [
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "MobileNetV2",
        img_path,
        "FGSM",
        eps,
        orig_pred[1],
        round(float(orig_pred[2]), 4),
        adv_pred[1],
        round(float(adv_pred[2]), 4),
        orig_pred[1] != adv_pred[1]
    ]
    append_report_row(row, header, "logs/fgsm_report.csv")

    # Visualize original and adversarial image
    plt.figure(figsize=(10, 4))

    plt.subplot(1, 2, 1)
    plt.imshow(((x[0] + 1) / 2).clip(0, 1))  # un-normalize if needed
    plt.title(f"Original: {orig_pred[1]}")

    plt.subplot(1, 2, 2)
    plt.imshow(((x_adv[0].numpy() + 1) / 2).clip(0, 1))
    plt.title(f"Adversarial: {adv_pred[1]}")

    plt.tight_layout()
    save_plot(plt, "logs/fgsm_visual.png")

if __name__ == "__main__":
    main()
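
For context, the `fast_gradient_method` call above implements the standard FGSM update x_adv = x + eps * sign(∇x L(x, y)). A hand-rolled TensorFlow equivalent, given purely as a sketch (it assumes a Keras classifier `model`, a preprocessed input batch `x`, and integer class labels `y`), looks like this:

```python
import tensorflow as tf

def fgsm_sketch(model, x, y, eps):
    """Untargeted FGSM: step eps in the direction of the sign of the loss gradient."""
    x = tf.convert_to_tensor(x)
    with tf.GradientTape() as tape:
        tape.watch(x)
        loss = tf.keras.losses.sparse_categorical_crossentropy(y, model(x))
    gradient = tape.gradient(loss, x)
    return x + eps * tf.sign(gradient)
```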
ai_security_toolkit-1.0.0/ai_security_toolkit/modules/label_flip_attack.py
@@ -0,0 +1,78 @@
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
import numpy as np
from datetime import datetime
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from ai_security_toolkit.shared.log_utils import append_report_row, log_metrics

def main():
    # Build a simple CNN
    def build_model():
        model = Sequential([
            Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
            MaxPooling2D((2, 2)),
            Flatten(),
            Dense(64, activation='relu'),
            Dense(10, activation='softmax')
        ])
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    # Load and preprocess MNIST
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.astype("float32") / 255.0
    x_test = x_test.astype("float32") / 255.0
    x_train = x_train.reshape((-1, 28, 28, 1))
    x_test = x_test.reshape((-1, 28, 28, 1))

    # Save original training labels for comparison
    y_train_clean = y_train.copy()

    # Poisoning: Flip 10% of labels from class 1 → 7
    num_poison = int(0.10 * len(y_train))
    indices_to_poison = np.where(y_train == 1)[0][:num_poison]
    y_train_poisoned = y_train.copy()
    y_train_poisoned[indices_to_poison] = 7

    # Convert to categorical
    y_train_clean_cat = to_categorical(y_train_clean, 10)
    y_train_poisoned_cat = to_categorical(y_train_poisoned, 10)
    y_test_cat = to_categorical(y_test, 10)

    # Train clean model
    print("🧼 Training clean model...")
    model_clean = build_model()
    model_clean.fit(x_train, y_train_clean_cat, epochs=3, batch_size=64, validation_split=0.1, verbose=2)
    clean_loss, clean_acc = model_clean.evaluate(x_test, y_test_cat, verbose=0)

    # Train poisoned model
    print("💉 Training poisoned model (1→7 flipped)...")
    model_poison = build_model()
    model_poison.fit(x_train, y_train_poisoned_cat, epochs=3, batch_size=64, validation_split=0.1, verbose=2)
    poison_loss, poison_acc = model_poison.evaluate(x_test, y_test_cat, verbose=0)

    # Log both models
    header = ["Timestamp", "Model", "Attack_Type", "Poisoned_Classes", "Train_Size", "Test_Accuracy"]
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    row_clean = [timestamp, "Clean_CNN", "None", "None", len(y_train), round(clean_acc, 4)]
    row_poisoned = [timestamp, "Poisoned_CNN", "Label Flip (1→7)", "1→7", len(y_train), round(poison_acc, 4)]

    append_report_row(row_clean, header, "logs/poisoning_report.csv")
    append_report_row(row_poisoned, header, "logs/poisoning_report.csv")

    # Print summary
    print("\n📊 Summary:")
    log_metrics(accuracy=clean_acc)
    print(f"⚠️ Poisoned Model Accuracy: {poison_acc * 100:.2f}%")
    print("📄 Report saved to: logs/poisoning_report.csv")

if __name__ == "__main__":
    main()
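
The poisoning step above rewrites the first `num_poison` class-1 labels to 7. A slightly more general sketch of the same idea, parameterized by source class, target class, and flip fraction (the helper name and the random subset selection are illustrative and not how the module itself does it):

```python
import numpy as np

def flip_labels(y, source_class, target_class, fraction, seed=0):
    """Return a copy of y with a random fraction of source_class labels set to target_class."""
    rng = np.random.default_rng(seed)
    candidates = np.where(y == source_class)[0]
    flipped = rng.choice(candidates, size=int(fraction * len(candidates)), replace=False)
    y_poisoned = y.copy()
    y_poisoned[flipped] = target_class
    return y_poisoned
```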
ai_security_toolkit-1.0.0/ai_security_toolkit/modules/membership_inference_attack.py
@@ -0,0 +1,71 @@
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from ai_security_toolkit.shared.log_utils import save_plot, append_report_row, log_metrics

def main():
    # Load model
    model = tf.keras.models.load_model("shared/models/mnist_cnn_model.keras")
    print("✅ Loaded model from .keras file.")

    # Load MNIST
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    x_train = x_train.astype("float32") / 255.0
    x_test = x_test.astype("float32") / 255.0
    x_train = x_train.reshape((-1, 28, 28, 1))
    x_test = x_test.reshape((-1, 28, 28, 1))

    # Combine train and test for attack simulation
    num_samples = 1000  # from each set
    x_members = x_train[:num_samples]
    x_nonmembers = x_test[:num_samples]

    # Get model predictions (confidence scores)
    y_members_conf = np.max(model.predict(x_members), axis=1)
    y_nonmembers_conf = np.max(model.predict(x_nonmembers), axis=1)

    # Simple threshold-based classifier
    threshold = 0.95  # Can be tuned

    tp = np.sum(y_members_conf > threshold)
    fp = np.sum(y_nonmembers_conf > threshold)
    tn = np.sum(y_nonmembers_conf <= threshold)
    fn = np.sum(y_members_conf <= threshold)

    accuracy = (tp + tn) / (tp + fp + tn + fn)
    precision = tp / (tp + fp + 1e-6)
    recall = tp / (tp + fn + 1e-6)

    log_metrics(accuracy, precision, recall)

    # Visualize confidence distributions
    plt.hist(y_members_conf, bins=30, alpha=0.6, label="Members")
    plt.hist(y_nonmembers_conf, bins=30, alpha=0.6, label="Non-Members")
    plt.axvline(threshold, color='red', linestyle='dashed', label="Threshold")
    plt.title("Model Confidence Distributions")
    plt.xlabel("Max Confidence")
    plt.ylabel("Frequency")
    plt.legend()
    plt.tight_layout()
    save_plot(plt, "logs/mia_confidence_plot.png")

    # Logging
    header = ["Timestamp", "Threshold", "Accuracy", "Precision", "Recall", "Members", "NonMembers"]
    row = [
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        threshold,
        round(accuracy * 100, 2),
        round(precision * 100, 2),
        round(recall * 100, 2),
        num_samples,
        num_samples
    ]
    append_report_row(row, header, "logs/membership_report.csv")

if __name__ == "__main__":
    main()
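
The attack above fixes the confidence threshold at 0.95 and notes that it can be tuned. One plausible tuning procedure, sketched here with an illustrative helper and a balanced-accuracy criterion that are not part of the package, sweeps candidate thresholds and keeps the one that best separates member from non-member confidences:

```python
import numpy as np

def best_threshold(member_conf, nonmember_conf, candidates=None):
    """Pick the confidence threshold that maximizes balanced membership-inference accuracy."""
    if candidates is None:
        candidates = np.linspace(0.5, 1.0, 101)
    scores = [((member_conf > t).mean() + (nonmember_conf <= t).mean()) / 2 for t in candidates]
    return float(candidates[int(np.argmax(scores))])
```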
ai_security_toolkit-1.0.0/ai_security_toolkit/modules/simulate_inversion.py
@@ -0,0 +1,72 @@
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import time
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from ai_security_toolkit.shared.log_utils import append_report_row, save_plot

def main():
    # Load trained model
    model = tf.keras.models.load_model("shared/models/mnist_cnn_model.keras")
    model.trainable = False

    # Create folders
    os.makedirs("logs/inversion_images", exist_ok=True)
    report_path = "logs/inversion_report.csv"

    # Invert one class
    def invert_class(target_class, model, save_path):
        num_classes = 10
        epochs = 1000
        lr = 0.1

        inverted_image = tf.Variable(tf.random.uniform((1, 28, 28, 1)), dtype=tf.float32)
        target_label = tf.one_hot([target_class], depth=num_classes)
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

        start_time = time.time()

        for epoch in range(epochs):
            with tf.GradientTape() as tape:
                preds = model(inverted_image, training=False)
                loss = -tf.keras.losses.categorical_crossentropy(target_label, preds)

            grads = tape.gradient(loss, inverted_image)
            optimizer.apply_gradients([(grads, inverted_image)])
            inverted_image.assign(tf.clip_by_value(inverted_image, 0.0, 1.0))

        confidence = tf.reduce_max(model(inverted_image)).numpy()
        duration = time.time() - start_time
        image_file = f"inversion_class_{target_class}.png"
        full_image_path = os.path.join(save_path, image_file)

        # Save image
        plt.imshow(inverted_image[0, :, :, 0], cmap='gray')
        plt.title(f"Class {target_class} - Conf: {confidence:.2f}")
        plt.axis('off')
        save_plot(plt, full_image_path)
        plt.close()

        # Log to CSV
        row = [
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            target_class,
            round(confidence, 4),
            image_file,
            round(duration, 2)
        ]
        header = ["Timestamp", "Class", "Confidence", "Image_File", "Time_Taken_s"]
        append_report_row(row, header, report_path)

        print(f"✅ Class {target_class} done | Confidence: {confidence:.2f} | Time: {round(duration, 2)}s")

    # Run for all digits 0–9
    for digit in range(10):
        invert_class(digit, model, save_path="logs/inversion_images")

if __name__ == "__main__":
    main()
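
Model inversion of this kind optimizes a synthetic input so that the target model assigns it high probability for a chosen class, i.e. it drives down the cross-entropy between the one-hot target and the model's prediction while keeping pixels in [0, 1]. A stripped-down sketch of that objective, independent of the module's logging and plotting and using illustrative names:

```python
import tensorflow as tf

def invert_sketch(model, target_class, steps=500, lr=0.1):
    """Gradient-descend a random image until the model is confident it belongs to target_class."""
    x = tf.Variable(tf.random.uniform((1, 28, 28, 1)))
    target = tf.one_hot([target_class], depth=10)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    for _ in range(steps):
        with tf.GradientTape() as tape:
            loss = tf.keras.losses.categorical_crossentropy(target, model(x, training=False))
        optimizer.apply_gradients([(tape.gradient(loss, x), x)])
        x.assign(tf.clip_by_value(x, 0.0, 1.0))  # keep pixels in the valid range
    return x
```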
ai_security_toolkit-1.0.0/ai_security_toolkit/modules/steal_model.py
@@ -0,0 +1,70 @@
import tensorflow as tf
import numpy as np
from datetime import datetime
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from ai_security_toolkit.shared.log_utils import append_report_row, log_metrics

def main():
    # Step 1: Load original (victim) model
    victim_model = load_model("shared/models/mnist_cnn_model.keras")
    victim_model.trainable = False
    print("✅ Loaded victim model.")

    # Step 2: Generate synthetic dataset to query the victim
    (_, _), (x_test, y_test) = mnist.load_data()
    x_query = x_test[:10000].astype("float32") / 255.0
    x_query = x_query.reshape((-1, 28, 28, 1))

    # Get predictions from victim model
    y_query = victim_model.predict(x_query)
    print("📡 Queried victim model for 10,000 inputs.")

    # Step 3: Train the stolen model (attacker's copycat)
    def build_attacker_model():
        model = Sequential([
            Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
            MaxPooling2D((2, 2)),
            Flatten(),
            Dense(64, activation='relu'),
            Dense(10, activation='softmax')
        ])
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    attacker_model = build_attacker_model()

    # Train attacker model using (x_query, y_query)
    print("🧠 Training stolen model on synthetic (input, output) pairs...")
    attacker_model.fit(x_query, y_query, epochs=3, batch_size=64, validation_split=0.1, verbose=2)

    # Save stolen model
    attacker_model.save("shared/models/stolen_model.keras")
    print("💾 Stolen model saved as models/stolen_model.keras")

    # Evaluate stolen model
    y_test_cat = to_categorical(y_test[:10000], 10)
    loss, acc = attacker_model.evaluate(x_query, y_test_cat, verbose=0)
    log_metrics(accuracy=acc)

    # Log results
    header = ["Timestamp", "Method", "Inputs_Used", "Stolen_Accuracy", "Notes"]
    row = [
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Black-box Query (MNIST test images)",
        len(x_query),
        round(acc * 100, 2),
        "No access to victim data or labels; used model predictions only"
    ]
    append_report_row(row, header, "logs/stealing_report.csv")

if __name__ == "__main__":
    main()
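
The module scores the stolen model against the true MNIST labels. A common complementary metric for extraction attacks is fidelity, the rate at which the clone agrees with the victim's own predictions on the same inputs; a small sketch (the helper name is illustrative, not part of the package) is:

```python
import numpy as np

def fidelity(victim_model, stolen_model, x):
    """Fraction of inputs on which the stolen model predicts the same class as the victim."""
    victim_labels = np.argmax(victim_model.predict(x, verbose=0), axis=1)
    stolen_labels = np.argmax(stolen_model.predict(x, verbose=0), axis=1)
    return float(np.mean(victim_labels == stolen_labels))
```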
ai_security_toolkit-1.0.0/ai_security_toolkit/modules/train_mnist_model.py
@@ -0,0 +1,51 @@
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
import os

def main():
    # Load and preprocess MNIST data
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    # Normalize to 0–1 range and reshape
    x_train = x_train.astype("float32") / 255.0
    x_test = x_test.astype("float32") / 255.0
    x_train = x_train.reshape((-1, 28, 28, 1))
    x_test = x_test.reshape((-1, 28, 28, 1))

    # One-hot encode labels
    y_train_cat = to_categorical(y_train, 10)
    y_test_cat = to_categorical(y_test, 10)

    # Build CNN model
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(10, activation='softmax')
    ])

    # Compile model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Train the model
    model.fit(x_train, y_train_cat, epochs=5, batch_size=64, validation_split=0.1)

    # Evaluate
    loss, acc = model.evaluate(x_test, y_test_cat, verbose=2)
    print(f"\n✅ Test Accuracy: {acc * 100:.2f}%")

    # Save model
    os.makedirs("shared/models", exist_ok=True)
    model.save("shared/models/mnist_cnn_model.keras")
    print("💾 Model saved to models/mnist_cnn_model.keras")

if __name__ == "__main__":
    main()
ai_security_toolkit-1.0.0/ai_security_toolkit/run.py
@@ -0,0 +1,58 @@
import importlib
import sys
import os

# Add project root to PYTHONPATH
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '.')))

# Mapping: CLI label → module filename (without .py)
available_modules = {
    "Train Model (MNIST CNN)": "train_mnist_model",
    "Adversarial Attack (FGSM)": "fgsm_mobilenet",
    "Data Poisoning – Label Flip": "label_flip_attack",
    "Membership Inference Attack": "membership_inference_attack",
    "Model Inversion Attack": "simulate_inversion",
    "Model Stealing Attack": "steal_model",
    "Backdoor Trigger Attack": "backdoor_trigger_attack"
}

def print_menu():
    print("\n🧪 AI Security Toolkit – Interactive CLI 🔐")
    print("Choose a module to run:\n")
    for i, name in enumerate(available_modules.keys(), start=1):
        print(f"[{i}] {name}")
    print("[0] Exit")

def run_selected_module(choice_idx):
    try:
        label = list(available_modules.keys())[choice_idx - 1]
        module_name = f"modules.{available_modules[label]}"
        print(f"\n🔍 Running: {label} ({module_name})...\n")
        mod = importlib.import_module(module_name)

        if hasattr(mod, "main"):
            mod.main()
        else:
            print("⚠️ No 'main()' found — running file as script...")
            exec(open(mod.__file__).read())

    except Exception as e:
        print(f"❌ Error: {e}")

def main():
    while True:
        print_menu()
        try:
            choice = int(input("\nEnter your choice: "))
            if choice == 0:
                print("👋 Exiting. Goodbye!")
                break
            elif 1 <= choice <= len(available_modules):
                run_selected_module(choice)
            else:
                print("❗ Invalid choice. Try again.")
        except ValueError:
            print("❗ Please enter a valid number.")

if __name__ == "__main__":
    main()
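
run.py dispatches by importing `modules.<filename>` and calling its `main()`. A hypothetical new attack would therefore plug into the menu by exposing a `main()` in its own file under `modules/` and adding one label-to-filename entry to `available_modules`; both the module name and label below are made up for illustration:

```python
# modules/my_new_attack.py (hypothetical example, not part of the published package)
def main():
    print("🔍 Running my hypothetical attack module...")

if __name__ == "__main__":
    main()
```

In run.py, the corresponding menu entry would be `"My New Attack": "my_new_attack"` inside the `available_modules` mapping.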
ai_security_toolkit-1.0.0/ai_security_toolkit/shared/__init__.py
File without changes
ai_security_toolkit-1.0.0/ai_security_toolkit/shared/log_utils.py
@@ -0,0 +1,45 @@
import pandas as pd
import matplotlib.pyplot as plt
import os
import csv

def save_report(data: dict, filepath: str):
    """
    Save a dictionary or list of dicts to a CSV file.
    """
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    df = pd.DataFrame(data)
    df.to_csv(filepath, index=False)
    print(f"[✓] Report saved to {filepath}")

def append_report_row(row: list, header: list, filepath: str):
    import csv, os
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    file_exists = os.path.isfile(filepath)
    with open(filepath, "a", newline='') as file:
        writer = csv.writer(file)
        if not file_exists:
            writer.writerow(header)
        writer.writerow(row)
    print(f"[✓] Row logged to {filepath}")


def save_plot(fig, filepath: str):
    """
    Save a matplotlib figure to PNG.
    """
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    fig.savefig(filepath, bbox_inches='tight')
    print(f"[✓] Plot saved to {filepath}")

def log_metrics(accuracy=None, precision=None, recall=None):
    """
    Print evaluation metrics in a readable format.
    """
    print("\n📊 Metrics Summary")
    if accuracy is not None:
        print(f" Accuracy: {accuracy * 100:.2f}%")
    if precision is not None:
        print(f" Precision: {precision * 100:.2f}%")
    if recall is not None:
        print(f" Recall: {recall * 100:.2f}%")
ai_security_toolkit-1.0.0/ai_security_toolkit.egg-info/PKG-INFO
@@ -0,0 +1,55 @@
Metadata-Version: 2.4
Name: ai-security-toolkit
Version: 1.0.0
Summary: A red-team AI security framework with adversarial attack modules
Author: Rishit Goel
License: MIT
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: tensorflow
Requires-Dist: numpy
Requires-Dist: matplotlib
Requires-Dist: pandas
Requires-Dist: cleverhans
Dynamic: requires-python

# 🛡️ AI Security Toolkit

[](https://github.com/rishit03)




A red-team framework for testing the vulnerabilities of AI models through adversarial attacks, privacy leakage, and model exploitation techniques — built and maintained by [@rishit03](https://github.com/rishit03).

---

## 🚀 Features

✅ 5+ attack modules
✅ Unified logging and visualization
✅ Command-line interface (interactive menu)
✅ Modular, reusable, and pip-installable
✅ Built using TensorFlow, CleverHans, and Python's best practices

---

## 📦 Modules Included

| Module Name | Description |
|----------------------------|-------------|
| 🔓 Adversarial Attack (FGSM) | Confuses the model with small pixel changes |
| 💉 Label Flip Poisoning | Modifies training labels to reduce model accuracy |
| 🧠 Membership Inference Attack | Infers if a data point was used in training |
| 🪞 Model Inversion | Reconstructs training images from the model |
| 🧬 Model Stealing | Clones the target model using black-box queries |
| 🎯 Backdoor Trigger Attack | Embeds a hidden trigger that forces misclassification |

---

## 💻 CLI Usage

```bash
# After pip install or cloning locally
python ai_toolkit/run.py
ai_security_toolkit-1.0.0/ai_security_toolkit.egg-info/SOURCES.txt
@@ -0,0 +1,21 @@
README.md
pyproject.toml
setup.py
ai_security_toolkit/__init__.py
ai_security_toolkit/run.py
ai_security_toolkit.egg-info/PKG-INFO
ai_security_toolkit.egg-info/SOURCES.txt
ai_security_toolkit.egg-info/dependency_links.txt
ai_security_toolkit.egg-info/entry_points.txt
ai_security_toolkit.egg-info/requires.txt
ai_security_toolkit.egg-info/top_level.txt
ai_security_toolkit/modules/__init__.py
ai_security_toolkit/modules/backdoor_trigger_attack.py
ai_security_toolkit/modules/fgsm_mobilenet.py
ai_security_toolkit/modules/label_flip_attack.py
ai_security_toolkit/modules/membership_inference_attack.py
ai_security_toolkit/modules/simulate_inversion.py
ai_security_toolkit/modules/steal_model.py
ai_security_toolkit/modules/train_mnist_model.py
ai_security_toolkit/shared/__init__.py
ai_security_toolkit/shared/log_utils.py
ai_security_toolkit-1.0.0/ai_security_toolkit.egg-info/dependency_links.txt
@@ -0,0 +1 @@

ai_security_toolkit-1.0.0/ai_security_toolkit.egg-info/top_level.txt
@@ -0,0 +1 @@
ai_security_toolkit
ai_security_toolkit-1.0.0/pyproject.toml
@@ -0,0 +1,25 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "ai-security-toolkit"
version = "1.0.0"
description = "A red-team AI security framework with adversarial attack modules"
readme = "README.md"
authors = [
    { name="Rishit Goel" }
]
license = { text="MIT" }
requires-python = ">=3.8"

dependencies = [
    "tensorflow",
    "numpy",
    "matplotlib",
    "pandas",
    "cleverhans"
]

[project.scripts]
ai-toolkit = "ai_security_toolkit.run:main"
ai_security_toolkit-1.0.0/setup.py
@@ -0,0 +1,25 @@
from setuptools import setup, find_packages

setup(
    name="ai-security-toolkit",
    version="1.0.0",
    packages=find_packages(),
    include_package_data=True,
    install_requires=[
        "tensorflow",
        "matplotlib",
        "numpy",
        "pandas",
        "cleverhans"
    ],
    entry_points={
        'console_scripts': [
            'ai-toolkit=ai_toolkit.run:main'
        ]
    },
    author="Neha",
    description="AI Red Team Toolkit with adversarial attacks, model stealing, inversion, and more.",
    long_description=open("README.md").read(),
    long_description_content_type="text/markdown",
    python_requires='>=3.8',
)