aibt-fl 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aibt/metrics.py ADDED
@@ -0,0 +1,383 @@
1
+ """
2
+ Privacy Attack Metrics for AIBT Framework
3
+
4
+ Implements Membership Inference and Attribute Inference attacks for
5
+ evaluating privacy protection of federated learning models.
6
+ """
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from torch.utils.data import DataLoader, TensorDataset
13
+ from typing import Dict, Tuple, Optional, List
14
+ from sklearn.linear_model import LogisticRegression
15
+ from sklearn.model_selection import train_test_split
16
+ from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_fscore_support
17
+ import warnings
18
+ warnings.filterwarnings('ignore')
19
+
20
+
21
+ # ============================================================================
22
+ # MEMBERSHIP INFERENCE ATTACK
23
+ # ============================================================================
24
+
25
class MembershipInferenceAttack:
    """
    Membership Inference Attack (MIA) for evaluating privacy leakage.

    Reference: Shokri et al., "Membership Inference Attacks Against Machine
    Learning Models".

    The attack attempts to determine whether a given sample was part of the
    training data, using model confidence and prediction patterns.
    """

    def __init__(self, target_model: nn.Module, device: str = "cpu"):
        self.target_model = target_model
        self.device = device
        # Set by train_attack_model (a fitted LogisticRegression).
        self.attack_model = None

    def get_prediction_features(
        self,
        model: nn.Module,
        X: np.ndarray,
        y: np.ndarray
    ) -> np.ndarray:
        """
        Extract per-sample features from model predictions for the attack model.

        Feature columns (in order):
        - prediction confidence (max softmax probability)
        - whether the prediction is correct (1.0 / 0.0)
        - prediction entropy
        - per-sample cross-entropy loss

        Returns:
            Array of shape (n_samples, 4).
        """
        model.eval()
        X_tensor = torch.FloatTensor(X).to(self.device)
        y_tensor = torch.LongTensor(y).to(self.device)

        with torch.no_grad():
            output = model(X_tensor)
            # Some models return (logits, extras); keep only the logits.
            logits = output[0] if isinstance(output, tuple) else output

            probs = F.softmax(logits, dim=1)

            confidence = probs.max(dim=1)[0].cpu().numpy()
            correct = (probs.argmax(dim=1) == y_tensor).float().cpu().numpy()
            # Epsilon guards against log(0) for saturated probabilities.
            entropy = -(probs * torch.log(probs + 1e-10)).sum(dim=1).cpu().numpy()

            # Per-sample loss (no reduction).
            loss = F.cross_entropy(logits, y_tensor, reduction='none').cpu().numpy()

        return np.column_stack([confidence, correct, entropy, loss])

    def train_attack_model(
        self,
        train_data: Tuple[np.ndarray, np.ndarray],
        test_data: Tuple[np.ndarray, np.ndarray],
        shadow_model: Optional[nn.Module] = None
    ) -> float:
        """
        Train the attack model using the shadow-model technique.

        Args:
            train_data: (X, y) used for training the target model (members).
            test_data: (X, y) not used for training (non-members).
            shadow_model: Optional shadow model (target model used if None).

        Returns:
            ROC-AUC of the attack model on a held-out 30% split.
        """
        # Explicit None check: nn.Module truthiness is not a presence test.
        model = shadow_model if shadow_model is not None else self.target_model

        X_train, y_train = train_data
        X_test, y_test = test_data

        member_features = self.get_prediction_features(model, X_train, y_train)
        non_member_features = self.get_prediction_features(model, X_test, y_test)

        # Attack dataset: label 1 = member, 0 = non-member.
        X_attack = np.vstack([member_features, non_member_features])
        y_attack = np.concatenate([
            np.ones(len(member_features)),
            np.zeros(len(non_member_features))
        ])

        X_attack_train, X_attack_test, y_attack_train, y_attack_test = train_test_split(
            X_attack, y_attack, test_size=0.3, random_state=42, stratify=y_attack
        )

        self.attack_model = LogisticRegression(max_iter=1000, random_state=42)
        self.attack_model.fit(X_attack_train, y_attack_train)

        y_pred_proba = self.attack_model.predict_proba(X_attack_test)[:, 1]
        return roc_auc_score(y_attack_test, y_pred_proba)

    def attack(
        self,
        X: np.ndarray,
        y: np.ndarray,
        membership_labels: Optional[np.ndarray] = None
    ) -> Tuple[np.ndarray, float]:
        """
        Perform membership inference attack.

        Args:
            X: Input samples.
            y: Class labels for the samples.
            membership_labels: Optional ground-truth membership indicators
                (1 = member, 0 = non-member). When provided, the real attack
                AUC is computed; otherwise 0.5 is returned, preserving the
                previous placeholder behavior.

        Returns:
            membership_probs: Probability of each sample being a member.
            auc: Attack ROC-AUC (0.5 when ground truth is not supplied).

        Raises:
            ValueError: If the attack model has not been trained yet.
        """
        if self.attack_model is None:
            raise ValueError("Attack model not trained. Call train_attack_model first.")

        features = self.get_prediction_features(self.target_model, X, y)
        membership_probs = self.attack_model.predict_proba(features)[:, 1]

        if membership_labels is not None:
            auc = roc_auc_score(membership_labels, membership_probs)
        else:
            auc = 0.5  # AUC requires ground truth

        return membership_probs, auc
148
+
149
+
150
def evaluate_membership_inference(
    model: nn.Module,
    train_data: Tuple[np.ndarray, np.ndarray],
    test_data: Tuple[np.ndarray, np.ndarray],
    device: str = "cpu"
) -> Dict[str, float]:
    """
    Run a full membership-inference evaluation against `model`.

    Args:
        model: Target model under attack.
        train_data: (X, y) the model was trained on (members).
        test_data: (X, y) the model never saw (non-members).
        device: Torch device for forward passes.

    Returns:
        Dictionary with the attack AUC and a derived privacy score.
    """
    mia = MembershipInferenceAttack(model, device)
    attack_auc = mia.train_attack_model(train_data, test_data)

    # An AUC of 0.5 means the attack is no better than coin-flipping,
    # i.e. maximal privacy; deviation in either direction leaks.
    privacy = 1.0 - abs(attack_auc - 0.5) * 2

    return {
        "membership_auc": attack_auc,
        "privacy_score": privacy
    }
169
+
170
+
171
+ # ============================================================================
172
+ # ATTRIBUTE INFERENCE ATTACK
173
+ # ============================================================================
174
+
175
class AttributeInferenceAttack:
    """
    Attribute Inference Attack for evaluating sensitive attribute leakage.

    Fits an auxiliary classifier that tries to recover sensitive attributes
    from the target model's latent representations.
    """

    def __init__(self, target_model: nn.Module, device: str = "cpu"):
        self.target_model = target_model
        self.device = device
        self.attack_model = None

    def get_latent_representations(self, X: np.ndarray) -> np.ndarray:
        """Extract latent representations from the target model."""
        self.target_model.eval()
        inputs = torch.FloatTensor(X).to(self.device)

        with torch.no_grad():
            if hasattr(self.target_model, 'get_latent'):
                # Dedicated latent accessor takes priority.
                z = self.target_model.get_latent(inputs)
            elif hasattr(self.target_model, 'encoder'):
                out = self.target_model.encoder(inputs)
                z = out[0] if isinstance(out, tuple) else out
            else:
                # No encoder exposed: fall back to the forward output.
                out = self.target_model(inputs)
                z = out[0] if isinstance(out, tuple) else out

        return z.cpu().numpy()

    def train_attack_model(
        self,
        X: np.ndarray,
        sensitive_attrs: np.ndarray
    ) -> float:
        """
        Fit a logistic-regression attacker that predicts sensitive attributes
        from latent codes.

        Args:
            X: Input features.
            sensitive_attrs: Sensitive attribute labels.

        Returns:
            ROC-AUC of the attacker on a held-out 30% split (weighted
            one-vs-rest AUC in the multi-class case).
        """
        codes = self.get_latent_representations(X)

        z_train, z_test, s_train, s_test = train_test_split(
            codes, sensitive_attrs, test_size=0.3, random_state=42,
            stratify=sensitive_attrs
        )

        clf = LogisticRegression(max_iter=1000, random_state=42)
        clf.fit(z_train, s_train)
        self.attack_model = clf

        proba = clf.predict_proba(z_test)

        if len(np.unique(sensitive_attrs)) == 2:
            return roc_auc_score(s_test, proba[:, 1])
        return roc_auc_score(s_test, proba, multi_class='ovr', average='weighted')

    def attack(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Infer sensitive attributes for new samples.

        Returns:
            predictions: Predicted sensitive attribute labels.
            probabilities: Per-class prediction probabilities.

        Raises:
            ValueError: If the attack model has not been trained yet.
        """
        if self.attack_model is None:
            raise ValueError("Attack model not trained. Call train_attack_model first.")

        codes = self.get_latent_representations(X)
        return self.attack_model.predict(codes), self.attack_model.predict_proba(codes)
269
+
270
+
271
def evaluate_attribute_inference(
    model: nn.Module,
    X: np.ndarray,
    sensitive_attrs: np.ndarray,
    device: str = "cpu"
) -> Dict[str, float]:
    """
    Complete attribute inference evaluation.

    The attack model is fitted on one split and its accuracy is measured on
    a disjoint held-out split. (Previously accuracy was computed on the full
    `X`, which included samples the attack model was trained on, inflating
    the reported leakage.)

    Args:
        model: Target model whose latent codes are attacked.
        X: Input features.
        sensitive_attrs: Sensitive attribute labels aligned with X.
        device: Torch device for forward passes.

    Returns:
        Dictionary with attack AUC, held-out attack accuracy, and a derived
        privacy score (1.0 = no leakage, 0.0 = full leakage).
    """
    attack = AttributeInferenceAttack(model, device)

    # Hold out 30% that the attack model never sees during fitting.
    X_fit, X_holdout, s_fit, s_holdout = train_test_split(
        X, sensitive_attrs, test_size=0.3, random_state=42,
        stratify=sensitive_attrs
    )

    auc = attack.train_attack_model(X_fit, s_fit)

    predictions, _ = attack.attack(X_holdout)
    accuracy = accuracy_score(s_holdout, predictions)

    return {
        "attribute_auc": auc,
        "attribute_accuracy": accuracy,
        "privacy_score": 1.0 - abs(auc - 0.5) * 2
    }
294
+
295
+
296
+ # ============================================================================
297
+ # COMPREHENSIVE PRIVACY EVALUATION
298
+ # ============================================================================
299
+
300
def evaluate_privacy(
    model: nn.Module,
    train_data: Tuple[np.ndarray, np.ndarray],
    test_data: Tuple[np.ndarray, np.ndarray],
    sensitive_attrs_train: Optional[np.ndarray] = None,
    sensitive_attrs_test: Optional[np.ndarray] = None,
    device: str = "cpu"
) -> Dict[str, float]:
    """
    Comprehensive privacy evaluation.

    Runs a Membership Inference Attack, and an Attribute Inference Attack
    when `sensitive_attrs_train` is supplied.

    Args:
        model: Target model under evaluation.
        train_data: (X, y) member samples.
        test_data: (X, y) non-member samples.
        sensitive_attrs_train: Sensitive attributes for the training split.
        sensitive_attrs_test: Accepted for interface compatibility;
            currently unused by this routine.
        device: Torch device for forward passes.

    Returns:
        Dictionary with per-attack metrics and an overall privacy score
        (mean of the individual privacy scores).
    """
    results: Dict[str, float] = {}

    mia = evaluate_membership_inference(model, train_data, test_data, device)
    results["membership_auc"] = mia["membership_auc"]
    results["membership_privacy_score"] = mia["privacy_score"]

    if sensitive_attrs_train is not None:
        X_train = train_data[0]
        aia = evaluate_attribute_inference(model, X_train, sensitive_attrs_train, device)
        results["attribute_auc"] = aia["attribute_auc"]
        results["attribute_accuracy"] = aia["attribute_accuracy"]
        results["attribute_privacy_score"] = aia["privacy_score"]

    # Aggregate every *_privacy_score entry into a single headline number.
    scores = [val for key, val in results.items() if 'privacy_score' in key]
    results["overall_privacy_score"] = np.mean(scores) if scores else 0.0

    return results
342
+
343
+
344
+ # ============================================================================
345
+ # PERFORMANCE METRICS
346
+ # ============================================================================
347
+
348
def evaluate_performance(
    model: nn.Module,
    test_data: Tuple[np.ndarray, np.ndarray],
    device: str = "cpu"
) -> Dict[str, float]:
    """
    Evaluate model classification performance.

    Args:
        model: Trained classifier; forward may return logits or
            (logits, extras).
        test_data: (X, y) numpy arrays of inputs and integer class labels.
        device: Torch device for the forward pass.

    Returns:
        Dictionary with accuracy and weighted-average precision, recall, f1.
    """
    model.eval()
    X, y = test_data
    X_tensor = torch.FloatTensor(X).to(device)
    # Labels stay as numpy: the sklearn metrics below consume them directly.
    # (The previous version also built an unused label tensor on `device`.)

    with torch.no_grad():
        output = model(X_tensor)
        # Some models return (logits, extras); keep only the logits.
        logits = output[0] if isinstance(output, tuple) else output

        predictions = logits.argmax(dim=1).cpu().numpy()

    accuracy = accuracy_score(y, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y, predictions, average='weighted', zero_division=0
    )

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }