noshot 0.4.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- noshot/data/ML TS XAI/TS/10. Seasonal ARIMA Forecasting.ipynb +32 -714
- noshot/data/ML TS XAI/TS/11. Multivariate ARIMA Forecasting.ipynb +29 -1071
- noshot/data/ML TS XAI/TS/6. ACF PACF.ipynb +7 -105
- noshot/data/ML TS XAI/TS/7. Differencing.ipynb +16 -152
- noshot/data/ML TS XAI/TS/8. ARMA Forecasting.ipynb +26 -575
- noshot/data/ML TS XAI/TS/9. ARIMA Forecasting.ipynb +23 -382
- noshot/data/ML TS XAI/XAI/XAI 1/EDA2_chipsdatset.ipynb +633 -0
- noshot/data/ML TS XAI/XAI/XAI 1/EDA_IRISH_8thjan.ipynb +326 -0
- noshot/data/ML TS XAI/XAI/XAI 1/XAI_EX1 MODEL BIAS (FINAL).ipynb +487 -0
- noshot/data/ML TS XAI/XAI/XAI 1/complete_guide_to_eda_on_text_data.ipynb +845 -0
- noshot/data/ML TS XAI/XAI/XAI 1/deepchecksframeworks.ipynb +100 -0
- noshot/data/ML TS XAI/XAI/XAI 1/deepexplainers (mnist).ipynb +90 -0
- noshot/data/ML TS XAI/XAI/XAI 1/guidedbackpropagation.ipynb +203 -0
- noshot/data/ML TS XAI/XAI/XAI 1/updated_image_EDA1_with_LRP.ipynb +3998 -0
- noshot/data/ML TS XAI/XAI/XAI 1/zebrastripes.ipynb +271 -0
- noshot/data/ML TS XAI/XAI/XAI 2/EXP_5.ipynb +1545 -0
- noshot/data/ML TS XAI/XAI/XAI 2/Exp-3 (EDA-loan).ipynb +221 -0
- noshot/data/ML TS XAI/XAI/XAI 2/Exp-3 (EDA-movie).ipynb +229 -0
- noshot/data/ML TS XAI/XAI/XAI 2/Exp-4(Flower dataset).ipynb +237 -0
- noshot/data/ML TS XAI/XAI/XAI 2/Exp-4.ipynb +241 -0
- noshot/data/ML TS XAI/XAI/XAI 2/Exp_2.ipynb +352 -0
- noshot/data/ML TS XAI/XAI/XAI 2/Exp_7.ipynb +110 -0
- noshot/data/ML TS XAI/XAI/XAI 2/FeatureImportance_SensitivityAnalysis.ipynb +708 -0
- {noshot-0.4.1.dist-info → noshot-1.0.0.dist-info}/METADATA +1 -1
- noshot-1.0.0.dist-info/RECORD +32 -0
- noshot-0.4.1.dist-info/RECORD +0 -15
- {noshot-0.4.1.dist-info → noshot-1.0.0.dist-info}/WHEEL +0 -0
- {noshot-0.4.1.dist-info → noshot-1.0.0.dist-info}/licenses/LICENSE.txt +0 -0
- {noshot-0.4.1.dist-info → noshot-1.0.0.dist-info}/top_level.txt +0 -0
noshot/data/ML TS XAI/XAI/XAI 2/Exp-4.ipynb
@@ -0,0 +1,241 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a956d239-b4b0-4e65-ac1a-d8047cfc883f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch \n",
+    "import torch.nn as nn \n",
+    "import torch.optim as optim\n",
+    "import torch.nn.functional as F\n",
+    "import torchvision \n",
+    "import torchvision.transforms as transforms\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt \n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.tree import plot_tree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a0d4f98-bfb5-40a0-9c0e-8843361cb7a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])\n",
+    "trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)\n",
+    "trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)\n",
+    "\n",
+    "testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)\n",
+    "testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c323e6e-58bf-498a-bc60-1e02a1aea4ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class CNN(nn.Module):\n",
+    "    def __init__(self):\n",
+    "        super(CNN, self).__init__()\n",
+    "        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)\n",
+    "        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)\n",
+    "        self.fc1 = nn.Linear(32 * 7 * 7, 128)\n",
+    "        self.fc2 = nn.Linear(128, 10)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        x = F.relu(self.conv1(x))\n",
+    "        x = F.max_pool2d(x, 2, 2)\n",
+    "        x = F.relu(self.conv2(x))\n",
+    "        x = F.max_pool2d(x, 2, 2)\n",
+    "        x = x.view(-1, 32 * 7 * 7)\n",
+    "        x = F.relu(self.fc1(x))\n",
+    "        x = self.fc2(x)\n",
+    "        return x\n",
+    "\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "model = CNN().to(device)\n",
+    "criterion = nn.CrossEntropyLoss()\n",
+    "optimizer = optim.Adam(model.parameters(), lr=0.001)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57e73bc1-cbfb-46ca-b5d2-4798b8b59a89",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train(model, trainloader, criterion, optimizer, epochs=3):\n",
+    "    model.train()\n",
+    "    for epoch in range(epochs):\n",
+    "        running_loss = 0.0\n",
+    "        for images, labels in trainloader:\n",
+    "            images, labels = images.to(device), labels.to(device)\n",
+    "            optimizer.zero_grad()\n",
+    "            outputs = model(images) # CNN outputs (logits)\n",
+    "\n",
+    "            # Print CNN output (logits) for the first image in the batch\n",
+    "            print(f\"Logits for first image in batch: {outputs[0]}\") # First image in the batch\n",
+    "\n",
+    "            loss = criterion(outputs, labels)\n",
+    "            loss.backward()\n",
+    "            optimizer.step()\n",
+    "            running_loss += loss.item()\n",
+    "        print(f\"Epoch {epoch + 1}, Loss: {running_loss / len(trainloader):.4f}\")\n",
+    "\n",
+    "train(model, trainloader, criterion, optimizer)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "93b6189f-889e-419e-95d6-9154c2acca26",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_features(model, dataloader):\n",
+    "    model.eval()\n",
+    "    features, labels = [], []\n",
+    "    with torch.no_grad():\n",
+    "        for images, lbls in dataloader:\n",
+    "            images = images.to(device)\n",
+    "            outputs = model(images) # CNN outputs (logits)\n",
+    "\n",
+    "            # Print CNN outputs (logits) for the first image in each batch\n",
+    "            print(f\"Logits for first image in batch: {outputs[0]}\") # This will print the logits for the first image\n",
+    "\n",
+    "            features.extend(outputs.cpu().numpy()) # Extract CNN outputs as features\n",
+    "            labels.extend(lbls.numpy())\n",
+    "    return np.array(features), np.array(labels)\n",
+    "\n",
+    "X_train, y_train = extract_features(model, trainloader)\n",
+    "X_test, y_test = extract_features(model, testloader)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "44442993-376b-41c0-9dc9-f55cfdc0268c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dt = DecisionTreeClassifier(max_depth=5) \n",
+    "dt.fit(X_train, y_train)\n",
+    "\n",
+    "\n",
+    "acc = dt.score(X_test, y_test)\n",
+    "print(f\"Surrogate Model Accuracy: {acc * 100:.2f}%\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79bb7a9f-b4ba-4bc1-95ef-ecd1cf525b45",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def visualize_surrogate_model(dt):\n",
+    "    plt.figure(figsize=(12, 8))\n",
+    "    plot_tree(dt, filled=True, feature_names=[f\"Feature {i}\" for i in range(X_train.shape[1])], class_names=[str(i) for i in range(10)], rounded=True)\n",
+    "    plt.title(\"Surrogate Model - Decision Tree\")\n",
+    "    plt.show()\n",
+    "\n",
+    "visualize_surrogate_model(dt)\n",
+    "\n",
+    "\n",
+    "def plot_feature_importance(dt, feature_names):\n",
+    "    feature_importances = dt.feature_importances_\n",
+    "    indices = np.argsort(feature_importances)[::-1]\n",
+    "\n",
+    "    plt.figure(figsize=(10, 6))\n",
+    "    plt.title(\"Feature Importances (Surrogate Model)\")\n",
+    "    plt.barh(range(X_train.shape[1]), feature_importances[indices], align=\"center\")\n",
+    "    plt.yticks(range(X_train.shape[1]), [f\"Feature {i}\" for i in indices])\n",
+    "    plt.xlabel(\"Importance\")\n",
+    "    plt.show()\n",
+    "\n",
+    "\n",
+    "plot_feature_importance(dt, [f\"Feature {i}\" for i in range(X_train.shape[1])])\n",
+    "\n",
+    "\n",
+    "\n",
+    "def visualize_feature_maps(model, input_image):\n",
+    "    model.eval()\n",
+    "    layers = [model.conv1, model.conv2]\n",
+    "    activations = []\n",
+    "\n",
+    "    def save_activation(name):\n",
+    "        def hook(model, input, output):\n",
+    "            activations.append(output)\n",
+    "        return hook\n",
+    "\n",
+    " \n",
+    "    hooks = []\n",
+    "    for layer in layers:\n",
+    "        hooks.append(layer.register_forward_hook(save_activation(layer.__class__.__name__)))\n",
+    "\n",
+    " \n",
+    "    input_image = input_image.unsqueeze(0).to(device)\n",
+    "    model(input_image)\n",
+    "\n",
+    " \n",
+    "    for i, activation in enumerate(activations):\n",
+    "        activation = activation.squeeze(0).cpu().detach().numpy()\n",
+    "        num_filters = activation.shape[0]\n",
+    "\n",
+    " \n",
+    "        fig, axes = plt.subplots(1, num_filters, figsize=(15, 8))\n",
+    "        for j in range(num_filters):\n",
+    "            axes[j].imshow(activation[j], cmap='gray')\n",
+    "            axes[j].axis('off')\n",
+    "            axes[j].set_title(f\"Filter {j + 1}\")\n",
+    "        plt.show()\n",
+    "\n",
+    " \n",
+    "    for hook in hooks:\n",
+    "        hook.remove()\n",
+    "\n",
+    "\n",
+    "sample_image, sample_label = testset[0]\n",
+    "visualize_feature_maps(model, sample_image)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d572de18-1df8-41b8-8b24-618790d7d0aa",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
noshot/data/ML TS XAI/XAI/XAI 2/Exp_2.ipynb
@@ -0,0 +1,352 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Gdwg19aUY6j6"
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.datasets import make_classification\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "from collections import Counter\n",
+    "from sklearn.naive_bayes import GaussianNB # Importing Naive Bayes\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay\n",
+    "import seaborn as sns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 1000
+    },
+    "id": "hrwNktWpZoNM",
+    "outputId": "e5640cd0-28f0-4c67-a0cd-b22b22091984"
+   },
+   "outputs": [],
+   "source": [
+    "X, y = make_classification(n_samples=1000, n_features=10, n_classes=2,\n",
+    "                           class_sep=2, weights=[0.9, 0.1], random_state=42)\n",
+    "\n",
+    "print(\"Original class distribution:\", Counter(y)) # Counter is used to count the number of occurrences\n",
+    "\n",
+    "# Plot the class distribution\n",
+    "plt.figure(figsize=(6,4))\n",
+    "plt.bar(['Class 0', 'Class 1'], [Counter(y)[0], Counter(y)[1]], color=['cyan', 'black'])\n",
+    "plt.title('Original Class Distribution')\n",
+    "plt.ylabel('Frequency')\n",
+    "plt.show()\n",
+    "\n",
+    "# Apply SMOTE (Synthetic Minority Over-sampling Technique) for oversampling\n",
+    "smote = SMOTE(random_state=42)\n",
+    "\n",
+    "X_res, y_res = smote.fit_resample(X, y)\n",
+    "\n",
+    "# Display new class distribution\n",
+    "print(\"Resampled class distribution:\", Counter(y_res))\n",
+    "\n",
+    "# Plot the resampled class distribution\n",
+    "plt.figure(figsize=(6,4))\n",
+    "plt.bar(['Class 0', 'Class 1'], [Counter(y_res)[0], Counter(y_res)[1]], color=['cyan', 'black'])\n",
+    "plt.title('Resampled Class Distribution (SMOTE)')\n",
+    "plt.ylabel('Frequency')\n",
+    "plt.show()\n",
+    "\n",
+    "# Train and evaluate a Naive Bayes classifier on the resampled data\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)\n",
+    "\n",
+    "# Using Naive Bayes Classifier\n",
+    "clf = GaussianNB()\n",
+    "clf.fit(X_train, y_train)\n",
+    "y_pred_resampled = clf.predict(X_test)\n",
+    "\n",
+    "# Evaluate the model on the resampled dataset\n",
+    "print(\"Classification report on resampled data:\")\n",
+    "print(classification_report(y_test, y_pred_resampled))\n",
+    "\n",
+    "# Confusion Matrix for resampled data\n",
+    "cm_resampled = confusion_matrix(y_test, y_pred_resampled)\n",
+    "disp = ConfusionMatrixDisplay(confusion_matrix=cm_resampled, display_labels=['Class 0', 'Class 1'])\n",
+    "disp.plot(cmap='Blues')\n",
+    "plt.title(\"Confusion Matrix - Resampled Data\")\n",
+    "plt.show()\n",
+    "\n",
+    "# Now, evaluate the model on the original imbalanced data\n",
+    "X_train_imbalanced, X_test_imbalanced, y_train_imbalanced, y_test_imbalanced = train_test_split(X, y, test_size=0.3, random_state=42)\n",
+    "\n",
+    "# Train the Naive Bayes Classifier on imbalanced data\n",
+    "clf.fit(X_train_imbalanced, y_train_imbalanced)\n",
+    "y_pred_imbalanced = clf.predict(X_test_imbalanced)\n",
+    "\n",
+    "# Evaluate the model on the original imbalanced dataset\n",
+    "print(\"Classification report on imbalanced data:\")\n",
+    "print(classification_report(y_test_imbalanced, y_pred_imbalanced))\n",
+    "\n",
+    "# Confusion Matrix for imbalanced data\n",
+    "cm_imbalanced = confusion_matrix(y_test_imbalanced, y_pred_imbalanced)\n",
+    "disp_imbalanced = ConfusionMatrixDisplay(confusion_matrix=cm_imbalanced, display_labels=['Class 0', 'Class 1'])\n",
+    "disp_imbalanced.plot(cmap='Blues')\n",
+    "plt.title(\"Confusion Matrix - Imbalanced Data\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 455
+    },
+    "id": "iXasw8cxb8OV",
+    "outputId": "eb0e821f-61fd-4a85-c94a-fdcfa55b2ed4"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Generate data with a linear relationship (y = 2X + 1)\n",
+    "np.random.seed(42)\n",
+    "X = np.linspace(0, 10, 100) # generates 100 evenly spaced points in the range from 0 to 10\n",
+    "y = 2 * X + 1 # True underlying function\n",
+    "\n",
+    "# Add some noise to the data\n",
+    "noise = np.random.normal(0, 2, X.shape) # take random values from a normal distribution with mean 0 and SD 2, same shape as X\n",
+    "y_noisy = y + noise # y_noisy is y with noise added\n",
+    "\n",
+    "# Introduce outliers\n",
+    "X_outliers = np.array([2, 4, 6, 8])\n",
+    "y_outliers = np.array([25, 30, 28, 35]) # Outliers with large values\n",
+    "X_combined = np.concatenate((X, X_outliers))\n",
+    "y_combined = np.concatenate((y_noisy, y_outliers))\n",
+    "\n",
+    "# Split the data into training and test sets\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)\n",
+    "\n",
+    "# Reshape data to fit into linear regression model\n",
+    "X_train = X_train.reshape(-1, 1) # -1 tells NumPy to automatically calculate the number of rows based on the total number of elements\n",
+    "\n",
+    "# and the specified number of columns (1 in this case).\n",
+    "X_test = X_test.reshape(-1, 1)\n",
+    "\n",
+    "\n",
+    "# Fit the linear regression model\n",
+    "model = LinearRegression()\n",
+    "model.fit(X_train, y_train)\n",
+    "\n",
+    "# Predict on both training and testing data\n",
+    "y_train_pred = model.predict(X_train)\n",
+    "y_test_pred = model.predict(X_test)\n",
+    "# Plot the results\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.scatter(X_train, y_train, color='blue', label='Training data')\n",
+    "plt.scatter(X_test, y_test, color='green', label='Test data')\n",
+    "plt.plot(X_test, y_test_pred, color='cyan', label='2nd Fitted line')\n",
+    "plt.plot(X_train, y_train_pred, color='red', label='1st Fitted line (with outliers)', linestyle='-.')\n",
+    "plt.plot(X, y, color='black', label='True line (y = 2X + 1)', linestyle='--')\n",
+    "plt.legend()\n",
+    "plt.xlabel('X')\n",
+    "plt.ylabel('y')\n",
+    "plt.title('Linear Regression with Outliers')\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 363
+    },
+    "id": "fi-j-O-dfIWz",
+    "outputId": "81b98378-aff1-436a-e32b-47a5f23929ba"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import accuracy_score, log_loss\n",
+    "from scipy.stats import entropy\n",
+    "\n",
+    "# Step 1: Generate synthetic data (initial dataset)\n",
+    "np.random.seed(42)\n",
+    "n_samples = 500\n",
+    "X1 = np.random.normal(0, 1, n_samples) # Feature 1\n",
+    "X2 = np.random.normal(0, 1, n_samples) # Feature 2\n",
+    "X = np.column_stack((X1, X2))\n",
+    "y = (X1 + X2 > 0).astype(int) # Binary target\n",
+    "\n",
+    "# Step 2: Fit logistic regression on the initial data\n",
+    "model = LogisticRegression()\n",
+    "model.fit(X, y)\n",
+    "\n",
+    "# Step 3: Simulate drift (change the distribution of features)\n",
+    "X1_drifted = np.random.normal(1, 1, n_samples) # Shift mean from 0 to 1\n",
+    "X2_drifted = np.random.normal(1, 1, n_samples) # Shift mean from 0 to 1\n",
+    "X_drifted = np.column_stack((X1_drifted, X2_drifted))\n",
+    "y_drifted = (X1_drifted + X2_drifted > 0).astype(int) # New labels based on drifted data\n",
+    "\n",
+    "# Step 4: Detect drift using KL divergence\n",
+    "def kl_divergence(p, q, bins=10):\n",
+    "    \"\"\"Calculate KL divergence between two distributions.\"\"\"\n",
+    "    p_hist, _ = np.histogram(p, bins=bins, density=True)\n",
+    "    q_hist, _ = np.histogram(q, bins=bins, density=True)\n",
+    "    p_hist += 1e-10 # Avoid division by zero\n",
+    "    q_hist += 1e-10\n",
+    "    return entropy(p_hist, q_hist)\n",
+    "\n",
+    "# Calculate KL divergence for each feature\n",
+    "kl_X1 = kl_divergence(X1, X1_drifted)\n",
+    "kl_X2 = kl_divergence(X2, X2_drifted)\n",
+    "\n",
+    "# Step 5: Evaluate model performance on drifted data\n",
+    "y_pred_drifted = model.predict(X_drifted)\n",
+    "accuracy_drifted = accuracy_score(y_drifted, y_pred_drifted)\n",
+    "log_loss_drifted = log_loss(y_drifted, model.predict_proba(X_drifted))\n",
+    "\n",
+    "# Step 6: Visualization\n",
+    "plt.figure(figsize=(12, 5))\n",
+    "\n",
+    "# Plot original and drifted distributions\n",
+    "plt.subplot(1, 2, 1)\n",
+    "plt.hist(X1, bins=20, alpha=0.6, label=\"Feature 1 (original)\", color=\"blue\")\n",
+    "plt.hist(X1_drifted, bins=20, alpha=0.6, label=\"Feature 1 (drifted)\", color=\"orange\")\n",
+    "plt.title(f\"KL Divergence for Feature 1: {kl_X1:.4f}\")\n",
+    "plt.legend()\n",
+    "\n",
+    "plt.subplot(1, 2, 2)\n",
+    "plt.hist(X2, bins=20, alpha=0.6, label=\"Feature 2 (original)\", color=\"blue\")\n",
+    "plt.hist(X2_drifted, bins=20, alpha=0.6, label=\"Feature 2 (drifted)\", color=\"orange\")\n",
+    "plt.title(f\"KL Divergence for Feature 2: {kl_X2:.4f}\")\n",
+    "plt.legend()\n",
+    "\n",
+    "plt.show()\n",
+    "\n",
+    "# Step 7: Print performance metrics\n",
+    "print(f\"Model accuracy on drifted data: {accuracy_drifted:.4f}\")\n",
+    "print(f\"Log loss on drifted data: {log_loss_drifted:.4f}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "hAnWt1NKfg_c",
+    "outputId": "cd7ab20d-b6d8-433c-f6b2-5875068ac0ae"
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import classification_report\n",
+    "\n",
+    "# Simulated dataset\n",
+    "data = pd.DataFrame({\n",
+    "    'Income': [30000, 45000, 60000, 80000, 20000, 50000],\n",
+    "    'CreditScore': [600, 650, 700, 750, 550, 680],\n",
+    "    'Gender': [0, 1, 0, 1, 0, 1], # 0: Male, 1: Female\n",
+    "    'Approved': [0, 1, 1, 1, 0, 1]\n",
+    "})\n",
+    "\n",
+    "X = data[['Income', 'CreditScore', 'Gender']]\n",
+    "y = data['Approved']\n",
+    "\n",
+    "# Train-test split\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
+    "\n",
+    "# Logistic Regression model\n",
+    "model = LogisticRegression()\n",
+    "model.fit(X_train, y_train)\n",
+    "y_pred = model.predict(X_test)\n",
+    "\n",
+    "# Display results\n",
+    "print(\"Classification Report:\\n\", classification_report(y_test, y_pred))\n",
+    "\n",
+    "# Identifying bias in coefficients\n",
+    "print(\"Model Coefficients:\", model.coef_)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "MoNlmk5jfikz",
+    "outputId": "8b987f2b-983f-48cf-89b5-f203712a144e"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.keras import Sequential\n",
+    "from tensorflow.keras.layers import Dense\n",
+    "import numpy as np\n",
+    "\n",
+    "# Small synthetic dataset\n",
+    "X = np.random.rand(100, 1)\n",
+    "y = X**2 + np.random.normal(0, 0.05, (100, 1))\n",
+    "\n",
+    "# Train-test split\n",
+    "X_train, X_test = X[:80], X[80:]\n",
+    "y_train, y_test = y[:80], y[80:]\n",
+    "\n",
+    "# Overfitted Neural Network\n",
+    "model = Sequential([\n",
+    "    Dense(128, activation='relu', input_dim=1),\n",
+    "    Dense(128, activation='relu'),\n",
+    "    Dense(1)\n",
+    "])\n",
+    "\n",
+    "model.compile(optimizer='adam', loss='mse', metrics=['mae'])\n",
+    "model.fit(X_train, y_train, epochs=200, verbose=0)\n",
+    "\n",
+    "# Evaluation\n",
+    "train_loss = model.evaluate(X_train, y_train, verbose=0)\n",
+    "test_loss = model.evaluate(X_test, y_test, verbose=0)\n",
+    "print(\"Train Loss:\", train_loss)\n",
+    "print(\"Test Loss:\", test_loss)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}