noshot 6.0.0__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. noshot/data/ML TS XAI/ML/CNN(Image_for_Folders_5).ipynb +201 -0
  2. noshot/data/ML TS XAI/ML/CNN(Image_form_Folder_2).ipynb +201 -0
  3. noshot/data/ML TS XAI/ML/ML 1/1. EDA-PCA (Balance Scale Dataset).ipynb +147 -0
  4. noshot/data/ML TS XAI/ML/ML 1/1. EDA-PCA (Rice Dataset).ipynb +181 -0
  5. noshot/data/ML TS XAI/ML/ML 1/10. HMM Veterbi.ipynb +152 -0
  6. noshot/data/ML TS XAI/ML/ML 1/2. KNN (Balance Scale Dataset).ipynb +117 -0
  7. noshot/data/ML TS XAI/ML/ML 1/2. KNN (Iris Dataset).ipynb +156 -0
  8. noshot/data/ML TS XAI/ML/ML 1/2. KNN (Sobar-72 Dataset).ipynb +215 -0
  9. noshot/data/ML TS XAI/ML/ML 1/3. LDA (Balance Scale Dataset).ipynb +78 -0
  10. noshot/data/ML TS XAI/ML/ML 1/3. LDA (NPHA Doctor Visits Dataset).ipynb +114 -0
  11. noshot/data/ML TS XAI/ML/ML 1/4. Linear Regression (Machine Dataset).ipynb +115 -0
  12. noshot/data/ML TS XAI/ML/ML 1/4. Linear Regression (Real Estate Dataset).ipynb +146 -0
  13. noshot/data/ML TS XAI/ML/ML 1/5. Logistic Regression (Magic04 Dataset).ipynb +130 -0
  14. noshot/data/ML TS XAI/ML/ML 1/5. Logistic Regression (Wine Dataset).ipynb +112 -0
  15. noshot/data/ML TS XAI/ML/ML 1/6. Naive Bayes Classifier (Agaricus Lepiota Dataset).ipynb +118 -0
  16. noshot/data/ML TS XAI/ML/ML 1/6. Naive Bayes Classifier (Wine Dataset).ipynb +89 -0
  17. noshot/data/ML TS XAI/ML/ML 1/7. SVM (Rice Dataset).ipynb +120 -0
  18. noshot/data/ML TS XAI/ML/ML 1/8. FeedForward NN (Sobar72 Dataset).ipynb +262 -0
  19. noshot/data/ML TS XAI/ML/ML 1/9. CNN (Cifar10 Dataset).ipynb +156 -0
  20. noshot/data/ML TS XAI/ML/ML 2/1. PCA.ipynb +162 -0
  21. noshot/data/ML TS XAI/ML/ML 2/10. CNN.ipynb +100 -0
  22. noshot/data/ML TS XAI/ML/ML 2/11. HMM.ipynb +336 -0
  23. noshot/data/ML TS XAI/ML/ML 2/2. KNN.ipynb +149 -0
  24. noshot/data/ML TS XAI/ML/ML 2/3. LDA.ipynb +132 -0
  25. noshot/data/ML TS XAI/ML/ML 2/4. Linear Regression.ipynb +86 -0
  26. noshot/data/ML TS XAI/ML/ML 2/5. Logistic Regression.ipynb +115 -0
  27. noshot/data/ML TS XAI/ML/ML 2/6. Naive Bayes (Titanic).ipynb +196 -0
  28. noshot/data/ML TS XAI/ML/ML 2/6. Naive Bayes (Wine).ipynb +98 -0
  29. noshot/data/ML TS XAI/ML/ML 2/7. SVM Linear.ipynb +109 -0
  30. noshot/data/ML TS XAI/ML/ML 2/8. SVM Non-Linear.ipynb +195 -0
  31. noshot/data/ML TS XAI/ML/ML 2/9. FNN With Regularization.ipynb +189 -0
  32. noshot/data/ML TS XAI/ML/ML 2/9. FNN Without Regularization.ipynb +197 -0
  33. noshot/data/ML TS XAI/ML/ML 2/All in One Lab CIA 1 Q.ipynb +1087 -0
  34. noshot/data/ML TS XAI/ML/ML 3 (Latest)/1. PCA EDA.ipynb +274 -0
  35. noshot/data/ML TS XAI/ML/ML 3 (Latest)/10. CNN.ipynb +170 -0
  36. noshot/data/ML TS XAI/ML/ML 3 (Latest)/11. HMM 2.ipynb +1087 -0
  37. noshot/data/ML TS XAI/ML/ML 3 (Latest)/11. HMM 3.ipynb +178 -0
  38. noshot/data/ML TS XAI/ML/ML 3 (Latest)/11. HMM 4.ipynb +185 -0
  39. noshot/data/ML TS XAI/ML/ML 3 (Latest)/11. HMM.ipynb +106 -0
  40. noshot/data/ML TS XAI/ML/ML 3 (Latest)/2. KNN.ipynb +177 -0
  41. noshot/data/ML TS XAI/ML/ML 3 (Latest)/3. LDA.ipynb +195 -0
  42. noshot/data/ML TS XAI/ML/ML 3 (Latest)/4. Linear Regression.ipynb +267 -0
  43. noshot/data/ML TS XAI/ML/ML 3 (Latest)/5. Logistic Regression.ipynb +104 -0
  44. noshot/data/ML TS XAI/ML/ML 3 (Latest)/6. Bayesian Classifier.ipynb +109 -0
  45. noshot/data/ML TS XAI/ML/ML 3 (Latest)/7. SVM.ipynb +220 -0
  46. noshot/data/ML TS XAI/ML/ML 3 (Latest)/8. MLP.ipynb +99 -0
  47. noshot/data/ML TS XAI/ML/ML 3 (Latest)/9. Ridge - Lasso.ipynb +211 -0
  48. noshot/data/ML TS XAI/ML/ML 3 (Latest)/9. Ridge Lasso 2.ipynb +99 -0
  49. noshot/data/ML TS XAI/ML/ML 3 (Latest)/Image Load Example.ipynb +118 -0
  50. noshot/data/ML TS XAI/ML/ML 3 (Latest)/Updated_Untitled.ipynb +603 -0
  51. noshot/data/ML TS XAI/ML/Rolls Royce AllinOne.ipynb +691 -0
  52. {noshot-6.0.0.dist-info → noshot-8.0.0.dist-info}/METADATA +1 -1
  53. noshot-8.0.0.dist-info/RECORD +60 -0
  54. {noshot-6.0.0.dist-info → noshot-8.0.0.dist-info}/WHEEL +1 -1
  55. noshot/data/ML TS XAI/XAI/Q1.ipynb +0 -377
  56. noshot/data/ML TS XAI/XAI/Q2.ipynb +0 -362
  57. noshot/data/ML TS XAI/XAI/Q3.ipynb +0 -637
  58. noshot/data/ML TS XAI/XAI/Q4.ipynb +0 -206
  59. noshot/data/ML TS XAI/XAI/Q5.ipynb +0 -1018
  60. noshot-6.0.0.dist-info/RECORD +0 -14
  61. {noshot-6.0.0.dist-info → noshot-8.0.0.dist-info}/licenses/LICENSE.txt +0 -0
  62. {noshot-6.0.0.dist-info → noshot-8.0.0.dist-info}/top_level.txt +0 -0
@@ -1,1018 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "raw",
5
- "id": "8162b3fe-ad19-43cb-bfcc-e82ded327445",
6
- "metadata": {},
7
- "source": [
8
- "1.\tPerform a counterfactual explanation on the Iris dataset. Analyze how the model’s prediction would change by modifying certain feature values and explain which feature alterations have the most significant impact on the outcome.\n",
9
- "2.\tConduct a feature importance analysis on the Diabetes dataset. Use an appropriate technique to evaluate the contribution of each feature to the model's predictions, and identify which features are the most influential in determining the outcome."
10
- ]
11
- },
12
- {
13
- "cell_type": "markdown",
14
- "id": "8ac58536",
15
- "metadata": {},
16
- "source": [
17
- "Dataset link: https://drive.google.com/drive/folders/1fcTn5rc-CcqX40JzVq4qwPkeRo3_fzGa?usp=sharing"
18
- ]
19
- },
20
- {
21
- "cell_type": "markdown",
22
- "id": "e0cc5c1d",
23
- "metadata": {},
24
- "source": [
25
- "### 1) Perform a counterfactual explanation on the Iris dataset. Analyze how the model’s prediction would change by modifying certain feature values and explain which feature alterations have the most significant impact on the outcome."
26
- ]
27
- },
28
- {
29
- "cell_type": "markdown",
30
- "id": "cacd38ba",
31
- "metadata": {},
32
- "source": [
33
- "Install Required Libraries"
34
- ]
35
- },
36
- {
37
- "cell_type": "code",
38
- "execution_count": null,
39
- "id": "d140f733",
40
- "metadata": {},
41
- "outputs": [],
42
- "source": [
43
- "#!pip install scikit-learn pandas matplotlib seaborn --quiet\n"
44
- ]
45
- },
46
- {
47
- "cell_type": "markdown",
48
- "id": "dda2c1ac",
49
- "metadata": {},
50
- "source": [
51
- "Load and Preprocess the Dataset"
52
- ]
53
- },
54
- {
55
- "cell_type": "code",
56
- "execution_count": null,
57
- "id": "4a181dfe",
58
- "metadata": {},
59
- "outputs": [],
60
- "source": [
61
- "import pandas as pd\n",
62
- "from sklearn.preprocessing import LabelEncoder\n",
63
- "\n",
64
- "# Load the data\n",
65
- "df = pd.read_csv('Iris.csv')\n",
66
- "df = df.drop(columns=['Id'])\n",
67
- "\n",
68
- "# Encode species\n",
69
- "le = LabelEncoder()\n",
70
- "df['Species'] = le.fit_transform(df['Species'])\n",
71
- "\n",
72
- "df.head()\n"
73
- ]
74
- },
75
- {
76
- "cell_type": "markdown",
77
- "id": "b4388b9e",
78
- "metadata": {},
79
- "source": [
80
- "Train a Random Forest Classifier"
81
- ]
82
- },
83
- {
84
- "cell_type": "code",
85
- "execution_count": null,
86
- "id": "75acbf55",
87
- "metadata": {},
88
- "outputs": [],
89
- "source": [
90
- "from sklearn.model_selection import train_test_split\n",
91
- "from sklearn.ensemble import RandomForestClassifier\n",
92
- "\n",
93
- "X = df.drop('Species', axis=1)\n",
94
- "y = df['Species']\n",
95
- "\n",
96
- "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
97
- "\n",
98
- "model = RandomForestClassifier(random_state=42)\n",
99
- "model.fit(X_train, y_train)\n"
100
- ]
101
- },
102
- {
103
- "cell_type": "markdown",
104
- "id": "43dc471b",
105
- "metadata": {},
106
- "source": [
107
- "Choose a Sample and Predict Original Class"
108
- ]
109
- },
110
- {
111
- "cell_type": "code",
112
- "execution_count": null,
113
- "id": "63f18f66",
114
- "metadata": {
115
- "scrolled": true
116
- },
117
- "outputs": [],
118
- "source": [
119
- "# Choose one instance to explain\n",
120
- "query_instance = X_test.iloc[[0]]\n",
121
- "original_pred = model.predict(query_instance)[0]\n",
122
- "\n",
123
- "print(\"Original Class:\", original_pred, \"-\", le.inverse_transform([original_pred])[0])\n",
124
- "query_instance\n"
125
- ]
126
- },
127
- {
128
- "cell_type": "markdown",
129
- "id": "4c46391c",
130
- "metadata": {},
131
- "source": [
132
- "Generate Manual Counterfactuals (Systematic Perturbation)"
133
- ]
134
- },
135
- {
136
- "cell_type": "code",
137
- "execution_count": null,
138
- "id": "b1cac1c2",
139
- "metadata": {},
140
- "outputs": [],
141
- "source": [
142
- "import numpy as np\n",
143
- "\n",
144
- "# Define steps for perturbation\n",
145
- "steps = 20\n",
146
- "feature_changes = {}\n",
147
- "\n",
148
- "# Store counterfactual predictions\n",
149
- "for feature in X.columns:\n",
150
- " values = np.linspace(X[feature].min(), X[feature].max(), steps)\n",
151
- " predictions = []\n",
152
- "\n",
153
- " for val in values:\n",
154
- " modified_instance = query_instance.copy()\n",
155
- " modified_instance[feature] = val\n",
156
- " pred = model.predict(modified_instance)[0]\n",
157
- " predictions.append(pred)\n",
158
- " \n",
159
- " feature_changes[feature] = {\n",
160
- " 'values': values,\n",
161
- " 'predictions': predictions\n",
162
- " }\n"
163
- ]
164
- },
165
- {
166
- "cell_type": "markdown",
167
- "id": "1453dedc",
168
- "metadata": {},
169
- "source": [
170
- "Visualize Counterfactual Impact (Prediction vs. Feature Value)"
171
- ]
172
- },
173
- {
174
- "cell_type": "code",
175
- "execution_count": null,
176
- "id": "2da50a26",
177
- "metadata": {},
178
- "outputs": [],
179
- "source": [
180
- "import matplotlib.pyplot as plt\n",
181
- "import seaborn as sns\n",
182
- "\n",
183
- "# Plot how prediction changes with each feature\n",
184
- "plt.figure(figsize=(12, 8))\n",
185
- "for i, (feature, data) in enumerate(feature_changes.items()):\n",
186
- " plt.subplot(2, 2, i+1)\n",
187
- " sns.scatterplot(x=data['values'], y=data['predictions'])\n",
188
- " plt.title(f\"{feature} vs. Predicted Class\")\n",
189
- " plt.xlabel(feature)\n",
190
- " plt.ylabel(\"Predicted Class\")\n",
191
- "\n",
192
- "plt.tight_layout()\n",
193
- "plt.show()\n"
194
- ]
195
- },
196
- {
197
- "cell_type": "markdown",
198
- "id": "67c98c45",
199
- "metadata": {},
200
- "source": [
201
- "Identify Most Impactful Features"
202
- ]
203
- },
204
- {
205
- "cell_type": "code",
206
- "execution_count": null,
207
- "id": "dd73bf4b",
208
- "metadata": {},
209
- "outputs": [],
210
- "source": [
211
- "# Count how many times each feature caused a class change\n",
212
- "impact_score = {}\n",
213
- "for feature, data in feature_changes.items():\n",
214
- " transitions = sum(np.array(data['predictions']) != original_pred)\n",
215
- " impact_score[feature] = transitions\n",
216
- "\n",
217
- "# Sort by impact\n",
218
- "sorted_impact = dict(sorted(impact_score.items(), key=lambda x: x[1], reverse=True))\n",
219
- "\n",
220
- "print(\"Feature Impact Scores (higher = more influence):\")\n",
221
- "for feature, score in sorted_impact.items():\n",
222
- " print(f\"- {feature}: {score} transitions\")\n"
223
- ]
224
- },
225
- {
226
- "cell_type": "markdown",
227
- "id": "f836f808",
228
- "metadata": {},
229
- "source": [
230
- "Visualize Most Influential Features (Bar Plot)"
231
- ]
232
- },
233
- {
234
- "cell_type": "code",
235
- "execution_count": null,
236
- "id": "8c35c0b4",
237
- "metadata": {},
238
- "outputs": [],
239
- "source": [
240
- "plt.figure(figsize=(8, 4))\n",
241
- "sns.barplot(x=list(sorted_impact.keys()), y=list(sorted_impact.values()))\n",
242
- "plt.title(\"Feature Impact on Class Change\")\n",
243
- "plt.xlabel(\"Feature\")\n",
244
- "plt.ylabel(\"Number of Prediction Changes\")\n",
245
- "plt.show()\n"
246
- ]
247
- },
248
- {
249
- "cell_type": "markdown",
250
- "id": "49e6ebba",
251
- "metadata": {},
252
- "source": [
253
- "Heatmap of Prediction Changes"
254
- ]
255
- },
256
- {
257
- "cell_type": "code",
258
- "execution_count": null,
259
- "id": "02c1db52",
260
- "metadata": {
261
- "scrolled": true
262
- },
263
- "outputs": [],
264
- "source": [
265
- "import numpy as np\n",
266
- "import seaborn as sns\n",
267
- "import matplotlib.pyplot as plt\n",
268
- "\n",
269
- "# Create a matrix: features x steps\n",
270
- "heatmap_data = []\n",
271
- "for feature in X.columns:\n",
272
- " pred_labels = feature_changes[feature]['predictions']\n",
273
- " heatmap_data.append(pred_labels)\n",
274
- "\n",
275
- "# Convert to numpy array\n",
276
- "heatmap_array = np.array(heatmap_data)\n",
277
- "\n",
278
- "# Plot\n",
279
- "plt.figure(figsize=(10, 5))\n",
280
- "sns.heatmap(heatmap_array, cmap=\"viridis\", xticklabels=False, yticklabels=list(X.columns))\n",
281
- "plt.title(\"Prediction Change Heatmap Across Feature Perturbations\")\n",
282
- "plt.xlabel(\"Steps\")\n",
283
- "plt.ylabel(\"Features\")\n",
284
- "plt.show()\n"
285
- ]
286
- },
287
- {
288
- "cell_type": "markdown",
289
- "id": "8205bebb",
290
- "metadata": {},
291
- "source": [
292
- "Line Plot of Predictions per Feature"
293
- ]
294
- },
295
- {
296
- "cell_type": "code",
297
- "execution_count": null,
298
- "id": "8354eb84",
299
- "metadata": {},
300
- "outputs": [],
301
- "source": [
302
- "plt.figure(figsize=(12, 8))\n",
303
- "for i, (feature, data) in enumerate(feature_changes.items()):\n",
304
- " plt.subplot(2, 2, i+1)\n",
305
- " plt.plot(data['values'], data['predictions'], marker='o')\n",
306
- " plt.title(f\"{feature} Influence on Prediction\")\n",
307
- " plt.xlabel(feature)\n",
308
- " plt.ylabel(\"Predicted Class\")\n",
309
- " plt.grid(True)\n",
310
- "\n",
311
- "plt.tight_layout()\n",
312
- "plt.show()\n"
313
- ]
314
- },
315
- {
316
- "cell_type": "markdown",
317
- "id": "87eebb6c",
318
- "metadata": {},
319
- "source": [
320
- "Feature Perturbation Transition Steps Plot"
321
- ]
322
- },
323
- {
324
- "cell_type": "code",
325
- "execution_count": null,
326
- "id": "4838cf9e",
327
- "metadata": {},
328
- "outputs": [],
329
- "source": [
330
- "# Track transition steps for each feature where prediction changes\n",
331
- "transition_steps = []\n",
332
- "\n",
333
- "for feature, data in feature_changes.items():\n",
334
- " preds = np.array(data['predictions'])\n",
335
- " original = preds[0]\n",
336
- " transition_point = np.where(preds != original)[0]\n",
337
- " if len(transition_point) > 0:\n",
338
- " transition_steps.append({'Feature': feature, 'Step': transition_point[0]}) # first transition\n",
339
- "\n",
340
- "# Plotting the results using a DataFrame\n",
341
- "if transition_steps:\n",
342
- " transition_df = pd.DataFrame(transition_steps)\n",
343
- "\n",
344
- " plt.figure(figsize=(8, 4))\n",
345
- " sns.barplot(data=transition_df,hue='Feature',x='Feature', y='Step', palette='coolwarm',legend=False)\n",
346
- " plt.title(\"Feature Perturbation Step Causing First Class Change\")\n",
347
- " plt.ylabel(\"Step Index (Lower = More Sensitive)\")\n",
348
- " plt.xticks(rotation=45)\n",
349
- " plt.tight_layout()\n",
350
- " plt.show()\n",
351
- "else:\n",
352
- " print(\"No class transitions detected during feature perturbation.\")\n"
353
- ]
354
- },
355
- {
356
- "cell_type": "markdown",
357
- "id": "9f8b8f95",
358
- "metadata": {},
359
- "source": [
360
- "Parallel Coordinates Plot (Optional - Multifeature View)"
361
- ]
362
- },
363
- {
364
- "cell_type": "code",
365
- "execution_count": null,
366
- "id": "00312184",
367
- "metadata": {
368
- "scrolled": true
369
- },
370
- "outputs": [],
371
- "source": [
372
- "from pandas.plotting import parallel_coordinates\n",
373
- "\n",
374
- "# Combine and label original + a few variations\n",
375
- "combined_df = query_instance.copy()\n",
376
- "for feature in X.columns:\n",
377
- " alt = query_instance.copy()\n",
378
- " alt[feature] += (X[feature].max() - X[feature].min()) * 0.2 # 20% bump\n",
379
- " combined_df = pd.concat([combined_df, alt], axis=0)\n",
380
- "\n",
381
- "# Add prediction labels for visualization\n",
382
- "combined_df['Prediction'] = model.predict(combined_df)\n",
383
- "\n",
384
- "# Convert numeric class to label\n",
385
- "combined_df['Prediction'] = le.inverse_transform(combined_df['Prediction'])\n",
386
- "\n",
387
- "# Add label for original vs modified\n",
388
- "combined_df['Type'] = ['Original'] + ['Modified'] * (len(combined_df)-1)\n",
389
- "\n",
390
- "# Parallel coordinates plot\n",
391
- "plt.figure(figsize=(10, 6))\n",
392
- "parallel_coordinates(combined_df.drop(columns=['Type']), class_column='Prediction', color=('#1f77b4', '#ff7f0e', '#2ca02c'))\n",
393
- "plt.title(\"Parallel Coordinates Plot: Feature Impact on Prediction\")\n",
394
- "plt.xticks(rotation=45)\n",
395
- "plt.grid(True)\n",
396
- "plt.show()\n"
397
- ]
398
- },
399
- {
400
- "cell_type": "markdown",
401
- "id": "6085e06e",
402
- "metadata": {},
403
- "source": [
404
- "Confusion Matrix"
405
- ]
406
- },
407
- {
408
- "cell_type": "code",
409
- "execution_count": null,
410
- "id": "752d34d6",
411
- "metadata": {},
412
- "outputs": [],
413
- "source": [
414
- "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
415
- "\n",
416
- "# Predictions on test set\n",
417
- "y_pred = model.predict(X_test)\n",
418
- "\n",
419
- "# Plot confusion matrix\n",
420
- "cm = confusion_matrix(y_test, y_pred)\n",
421
- "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)\n",
422
- "disp.plot(cmap='Blues')\n",
423
- "plt.title(\"Confusion Matrix\")\n",
424
- "plt.show()\n"
425
- ]
426
- },
427
- {
428
- "cell_type": "markdown",
429
- "id": "12137965",
430
- "metadata": {},
431
- "source": [
432
- "Feature Importance Plot"
433
- ]
434
- },
435
- {
436
- "cell_type": "code",
437
- "execution_count": null,
438
- "id": "f2a6d755",
439
- "metadata": {
440
- "scrolled": true
441
- },
442
- "outputs": [],
443
- "source": [
444
- "import numpy as np\n",
445
- "\n",
446
- "importances = model.feature_importances_\n",
447
- "indices = np.argsort(importances)[::-1]\n",
448
- "\n",
449
- "plt.figure(figsize=(8, 4))\n",
450
- "sns.barplot(x=[X.columns[i] for i in indices], y=importances[indices])\n",
451
- "plt.title(\"Feature Importances (Random Forest)\")\n",
452
- "plt.xlabel(\"Feature\")\n",
453
- "plt.ylabel(\"Importance\")\n",
454
- "plt.show()\n"
455
- ]
456
- },
457
- {
458
- "cell_type": "markdown",
459
- "id": "7455f044",
460
- "metadata": {},
461
- "source": [
462
- "Pair Plot (Colored by Predicted Class)"
463
- ]
464
- },
465
- {
466
- "cell_type": "code",
467
- "execution_count": null,
468
- "id": "ea47af82",
469
- "metadata": {
470
- "scrolled": true
471
- },
472
- "outputs": [],
473
- "source": [
474
- "import warnings\n",
475
- "import seaborn as sns\n",
476
- "import matplotlib.pyplot as plt\n",
477
- "\n",
478
- "# Add predicted labels for visualization\n",
479
- "viz_df = X_test.copy()\n",
480
- "viz_df['True'] = le.inverse_transform(y_test)\n",
481
- "viz_df['Predicted'] = le.inverse_transform(y_pred)\n",
482
- "\n",
483
- "# Suppress layout warnings temporarily\n",
484
- "with warnings.catch_warnings():\n",
485
- " warnings.filterwarnings(\"ignore\", message=\".*figure layout has changed to tight.*\")\n",
486
- "\n",
487
- " # Create pairplot\n",
488
- " sns.set(style=\"ticks\")\n",
489
- " g = sns.pairplot(viz_df, hue='Predicted', corner=True, palette='Set1')\n",
490
- " g.fig.suptitle(\"Pair Plot by Predicted Class\", y=1.02)\n",
491
- "\n",
492
- " plt.show()\n"
493
- ]
494
- },
495
- {
496
- "cell_type": "markdown",
497
- "id": "412e6b11",
498
- "metadata": {},
499
- "source": [
500
- "Decision Boundary (2D Projection)"
501
- ]
502
- },
503
- {
504
- "cell_type": "code",
505
- "execution_count": null,
506
- "id": "8aca69c1",
507
- "metadata": {},
508
- "outputs": [],
509
- "source": [
510
- "import numpy as np\n",
511
- "import matplotlib.pyplot as plt\n",
512
- "import seaborn as sns\n",
513
- "\n",
514
- "# Choose two features for 2D projection\n",
515
- "feature1 = 'SepalLengthCm' # Change as needed\n",
516
- "feature2 = 'SepalWidthCm' # Change as needed\n",
517
- "\n",
518
- "# Create a mesh grid to evaluate model predictions over a 2D space\n",
519
- "x_min, x_max = X[feature1].min() - 1, X[feature1].max() + 1\n",
520
- "y_min, y_max = X[feature2].min() - 1, X[feature2].max() + 1\n",
521
- "\n",
522
- "xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),\n",
523
- " np.linspace(y_min, y_max, 100))\n",
524
- "\n",
525
- "# Prepare the input for the model (take only the two features for prediction)\n",
526
- "grid_points = np.c_[xx.ravel(), yy.ravel()]\n",
527
- "\n",
528
- "# Create a DataFrame with the same structure as the training data\n",
529
- "# Fill the other features (PetalLengthCm, PetalWidthCm) with their mean values\n",
530
- "grid_df = pd.DataFrame(grid_points, columns=[feature1, feature2])\n",
531
- "grid_df['PetalLengthCm'] = X['PetalLengthCm'].mean()\n",
532
- "grid_df['PetalWidthCm'] = X['PetalWidthCm'].mean()\n",
533
- "\n",
534
- "# Make predictions on the grid points\n",
535
- "predictions = model.predict(grid_df)\n",
536
- "\n",
537
- "# Reshape predictions to match the grid shape\n",
538
- "Z = predictions.reshape(xx.shape)\n",
539
- "\n",
540
- "# Plot the decision boundary\n",
541
- "plt.figure(figsize=(10, 8))\n",
542
- "plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.RdYlBu)\n",
543
- "\n",
544
- "# Plot the training points\n",
545
- "sns.scatterplot(x=X[feature1], y=X[feature2], hue=y, palette=\"deep\", s=100, edgecolor=\"k\")\n",
546
- "\n",
547
- "# Highlight the query instance\n",
548
- "sns.scatterplot(x=query_instance[feature1], y=query_instance[feature2], color=\"black\", marker=\"*\", s=200, label=\"Query Instance\")\n",
549
- "\n",
550
- "# Labeling the plot\n",
551
- "plt.title(f\"Decision Boundary for {feature1} vs {feature2}\")\n",
552
- "plt.xlabel(feature1)\n",
553
- "plt.ylabel(feature2)\n",
554
- "plt.legend()\n",
555
- "plt.tight_layout()\n",
556
- "plt.show()\n"
557
- ]
558
- },
559
- {
560
- "cell_type": "code",
561
- "execution_count": null,
562
- "id": "ccc422a7",
563
- "metadata": {},
564
- "outputs": [],
565
- "source": []
566
- },
567
- {
568
- "cell_type": "markdown",
569
- "id": "76b8511d",
570
- "metadata": {},
571
- "source": [
572
- "### 2) Conduct a feature importance analysis on the Diabetes dataset. Use an appropriate technique to evaluate the contribution of each feature to the model's predictions, and identify which features are the most influential in determining the outcome."
573
- ]
574
- },
575
- {
576
- "cell_type": "markdown",
577
- "id": "1e28323c",
578
- "metadata": {},
579
- "source": [
580
- "Install necessary libraries"
581
- ]
582
- },
583
- {
584
- "cell_type": "code",
585
- "execution_count": null,
586
- "id": "5e573730",
587
- "metadata": {},
588
- "outputs": [],
589
- "source": [
590
- "#!pip install pandas matplotlib seaborn scikit-learn --quiet\n"
591
- ]
592
- },
593
- {
594
- "cell_type": "markdown",
595
- "id": "8ce0f9d4",
596
- "metadata": {},
597
- "source": [
598
- "Import Libraries"
599
- ]
600
- },
601
- {
602
- "cell_type": "code",
603
- "execution_count": null,
604
- "id": "79f02371",
605
- "metadata": {},
606
- "outputs": [],
607
- "source": [
608
- "import pandas as pd\n",
609
- "import matplotlib.pyplot as plt\n",
610
- "import seaborn as sns\n",
611
- "import shap\n",
612
- "from sklearn.ensemble import RandomForestClassifier\n",
613
- "from sklearn.model_selection import train_test_split\n",
614
- "from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay\n",
615
- "import warnings\n",
616
- "warnings.filterwarnings('ignore')\n"
617
- ]
618
- },
619
- {
620
- "cell_type": "markdown",
621
- "id": "2afc266e",
622
- "metadata": {},
623
- "source": [
624
- "Load and Inspect the Dataset"
625
- ]
626
- },
627
- {
628
- "cell_type": "code",
629
- "execution_count": null,
630
- "id": "47b7f945",
631
- "metadata": {},
632
- "outputs": [],
633
- "source": [
634
- "# Load dataset\n",
635
- "df = pd.read_csv('diabetes.csv')\n",
636
- "\n",
637
- "# Display basic information\n",
638
- "print(df.info())\n",
639
- "\n",
640
- "# Check for missing values\n",
641
- "print(\"\\nMissing values:\\n\", df.isnull().sum())\n",
642
- "\n",
643
- "# Display first few rows of the dataset\n",
644
- "df.head()\n"
645
- ]
646
- },
647
- {
648
- "cell_type": "markdown",
649
- "id": "9c3cb2cb",
650
- "metadata": {},
651
- "source": [
652
- "Prepare Features and Target Variable"
653
- ]
654
- },
655
- {
656
- "cell_type": "code",
657
- "execution_count": null,
658
- "id": "fa93825e",
659
- "metadata": {},
660
- "outputs": [],
661
- "source": [
662
- "# Features and target\n",
663
- "X = df.drop('Outcome', axis=1)\n",
664
- "y = df['Outcome']\n",
665
- "\n",
666
- "# Split the data into training and testing sets\n",
667
- "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n"
668
- ]
669
- },
670
- {
671
- "cell_type": "markdown",
672
- "id": "6182d006",
673
- "metadata": {},
674
- "source": [
675
- "Train the Random Forest Model"
676
- ]
677
- },
678
- {
679
- "cell_type": "code",
680
- "execution_count": null,
681
- "id": "c3e6d1ff",
682
- "metadata": {},
683
- "outputs": [],
684
- "source": [
685
- "# Train the Random Forest model\n",
686
- "model = RandomForestClassifier(random_state=42)\n",
687
- "model.fit(X_train, y_train)\n"
688
- ]
689
- },
690
- {
691
- "cell_type": "markdown",
692
- "id": "7c1347cb",
693
- "metadata": {},
694
- "source": [
695
- "Feature Importance Visualization"
696
- ]
697
- },
698
- {
699
- "cell_type": "code",
700
- "execution_count": null,
701
- "id": "09fb9e62",
702
- "metadata": {},
703
- "outputs": [],
704
- "source": [
705
- "# Get feature importances\n",
706
- "importances = model.feature_importances_\n",
707
- "feature_names = X.columns\n",
708
- "\n",
709
- "# Create a DataFrame for visualizing feature importances\n",
710
- "feature_imp_df = pd.DataFrame({\n",
711
- " 'Feature': feature_names,\n",
712
- " 'Importance': importances\n",
713
- "}).sort_values(by='Importance', ascending=False)\n",
714
- "\n",
715
- "# Plot Feature Importance\n",
716
- "plt.figure(figsize=(10, 6))\n",
717
- "sns.barplot(x='Importance', y='Feature', data=feature_imp_df, palette='viridis')\n",
718
- "plt.title('Feature Importance - Random Forest')\n",
719
- "plt.xlabel('Importance Score')\n",
720
- "plt.ylabel('Features')\n",
721
- "plt.tight_layout()\n",
722
- "plt.show()\n"
723
- ]
724
- },
725
- {
726
- "cell_type": "markdown",
727
- "id": "4abb043b",
728
- "metadata": {},
729
- "source": [
730
- "Display Top 3 Important Features"
731
- ]
732
- },
733
- {
734
- "cell_type": "code",
735
- "execution_count": null,
736
- "id": "597afccf",
737
- "metadata": {},
738
- "outputs": [],
739
- "source": [
740
- "# Display top 3 important features\n",
741
- "top_features = feature_imp_df.head(3)\n",
742
- "top_features\n"
743
- ]
744
- },
745
- {
746
- "cell_type": "markdown",
747
- "id": "a9c73329",
748
- "metadata": {},
749
- "source": [
750
- "Confusion Matrix and Classification Report"
751
- ]
752
- },
753
- {
754
- "cell_type": "code",
755
- "execution_count": null,
756
- "id": "9738ebae",
757
- "metadata": {},
758
- "outputs": [],
759
- "source": [
760
- "# Predict on test data\n",
761
- "y_pred = model.predict(X_test)\n",
762
- "\n",
763
- "# Confusion matrix\n",
764
- "cm = confusion_matrix(y_test, y_pred)\n",
765
- "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)\n",
766
- "disp.plot(cmap='Blues')\n",
767
- "plt.title('Confusion Matrix')\n",
768
- "plt.show()\n",
769
- "# Classification report\n",
770
- "print(classification_report(y_test, y_pred))\n"
771
- ]
772
- },
773
- {
774
- "cell_type": "markdown",
775
- "id": "a4f200cd",
776
- "metadata": {},
777
- "source": [
778
- "SHAP (Optional, Not Required): Advanced Feature Importance"
779
- ]
780
- },
781
- {
782
- "cell_type": "code",
783
- "execution_count": null,
784
- "id": "d8b2d842",
785
- "metadata": {
786
- "scrolled": true
787
- },
788
- "outputs": [],
789
- "source": [
790
- "#!pip install shap"
791
- ]
792
- },
793
- {
794
- "cell_type": "markdown",
795
- "id": "75f1ea72",
796
- "metadata": {},
797
- "source": [
798
- "SHAP Analysis for Feature Contribution"
799
- ]
800
- },
801
- {
802
- "cell_type": "raw",
803
- "id": "011f0fda",
804
- "metadata": {
805
- "scrolled": true
806
- },
807
- "source": [
808
- "# Initialize SHAP\n",
809
- "shap.initjs()\n",
810
- "\n",
811
- "# Create TreeExplainer and compute SHAP values\n",
812
- "explainer = shap.TreeExplainer(model)\n",
813
- "shap_values = explainer.shap_values(X_test)\n",
814
- "\n",
815
- "# If there's a shape mismatch, fix it\n",
816
- "if isinstance(shap_values, list):\n",
817
- " shap_vals = shap_values[1] # Class 1 (diabetic)\n",
818
- "else:\n",
819
- " shap_vals = shap_values\n",
820
- "\n",
821
- "# Plot SHAP Summary Plot\n",
822
- "shap.summary_plot(shap_vals, X_test, plot_type=\"bar\")\n"
823
- ]
824
- },
825
- {
826
- "cell_type": "raw",
827
- "id": "a2bd5991",
828
- "metadata": {},
829
- "source": [
830
- "print(\"SHAP values shape:\", shap_values[1].shape)\n",
831
- "print(\"X_test shape:\", X_test.shape)\n"
832
- ]
833
- },
834
- {
835
- "cell_type": "markdown",
836
- "id": "7c98a7eb",
837
- "metadata": {},
838
- "source": [
839
- "Correlation Heatmap"
840
- ]
841
- },
842
- {
843
- "cell_type": "code",
844
- "execution_count": null,
845
- "id": "fc857f1a",
846
- "metadata": {},
847
- "outputs": [],
848
- "source": [
849
- "# Correlation Heatmap to understand relationships between features\n",
850
- "plt.figure(figsize=(10, 8))\n",
851
- "corr = df.corr()\n",
852
- "sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')\n",
853
- "plt.title(\"Correlation Heatmap\")\n",
854
- "plt.tight_layout()\n",
855
- "plt.show()\n"
856
- ]
857
- },
858
- {
859
- "cell_type": "markdown",
860
- "id": "ed6205f3",
861
- "metadata": {},
862
- "source": [
863
- "Pairplot of Top 3 Important Features"
864
- ]
865
- },
866
- {
867
- "cell_type": "code",
868
- "execution_count": null,
869
- "id": "66f0a3cf",
870
- "metadata": {},
871
- "outputs": [],
872
- "source": [
873
- "# Extract the top 3 features\n",
874
- "top_3_features = top_features['Feature'].tolist()\n",
875
- "\n",
876
- "# Pairplot for top 3 features\n",
877
- "sns.pairplot(df[top_3_features + ['Outcome']], hue='Outcome', palette='Set2')\n",
878
- "plt.suptitle(\"Pairplot of Top 3 Features by Importance\", y=1.02)\n",
879
- "plt.show()\n"
880
- ]
881
- },
882
- {
883
- "cell_type": "markdown",
884
- "id": "167f3ed8",
885
- "metadata": {},
886
- "source": [
887
- "Boxplots of Top 3 Features"
888
- ]
889
- },
890
- {
891
- "cell_type": "code",
892
- "execution_count": null,
893
- "id": "f4c03946",
894
- "metadata": {},
895
- "outputs": [],
896
- "source": [
897
- "# Boxplots to visualize the distribution of top 3 features against the outcome\n",
898
- "plt.figure(figsize=(15, 5))\n",
899
- "for i, feature in enumerate(top_3_features):\n",
900
- " plt.subplot(1, 3, i+1)\n",
901
- " sns.boxplot(x='Outcome', y=feature, data=df, palette='Set1')\n",
902
- " plt.title(f'{feature} vs Outcome')\n",
903
- " plt.tight_layout()\n",
904
- "plt.show()\n"
905
- ]
906
- },
907
- {
908
- "cell_type": "markdown",
909
- "id": "f661f7c2",
910
- "metadata": {},
911
- "source": [
912
- "Pairwise Correlation Matrix"
913
- ]
914
- },
915
- {
916
- "cell_type": "code",
917
- "execution_count": null,
918
- "id": "ddfe5f1c",
919
- "metadata": {
920
- "scrolled": true
921
- },
922
- "outputs": [],
923
- "source": [
924
- "# Create a pairwise correlation matrix to visualize relationships between all features\n",
925
- "sns.pairplot(df, hue='Outcome', palette='Set2', plot_kws={'alpha': 0.7})\n",
926
- "plt.suptitle(\"Pairwise Correlation Matrix\", y=1.02)\n",
927
- "plt.show()\n"
928
- ]
929
- },
930
- {
931
- "cell_type": "markdown",
932
- "id": "0396fa2a",
933
- "metadata": {},
934
- "source": [
935
- "ROC Curve for Model Evaluation"
936
- ]
937
- },
938
- {
939
- "cell_type": "code",
940
- "execution_count": null,
941
- "id": "28bb1256",
942
- "metadata": {},
943
- "outputs": [],
944
- "source": [
945
- "from sklearn.metrics import roc_curve, auc\n",
946
- "\n",
947
- "# Compute ROC curve\n",
948
- "fpr, tpr, thresholds = roc_curve(y_test, y_pred)\n",
949
- "roc_auc = auc(fpr, tpr)\n",
950
- "\n",
951
- "# Plot ROC curve\n",
952
- "plt.figure(figsize=(8, 6))\n",
953
- "plt.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_auc:.2f})')\n",
954
- "plt.plot([0, 1], [0, 1], color='gray', linestyle='--')\n",
955
- "plt.xlabel('False Positive Rate')\n",
956
- "plt.ylabel('True Positive Rate')\n",
957
- "plt.title('Receiver Operating Characteristic (ROC) Curve')\n",
958
- "plt.legend(loc='lower right')\n",
959
- "plt.tight_layout()\n",
960
- "plt.show()\n"
961
- ]
962
- },
963
- {
964
- "cell_type": "markdown",
965
- "id": "68fba66a",
966
- "metadata": {},
967
- "source": [
968
- "Feature Distribution for Top 3 Features"
969
- ]
970
- },
971
- {
972
- "cell_type": "code",
973
- "execution_count": null,
974
- "id": "0b5d695f",
975
- "metadata": {},
976
- "outputs": [],
977
- "source": [
978
- "# Visualize distribution of top 3 features\n",
979
- "plt.figure(figsize=(15, 5))\n",
980
- "for i, feature in enumerate(top_3_features):\n",
981
- " plt.subplot(1, 3, i+1)\n",
982
- " sns.histplot(df[feature], kde=True, color='skyblue')\n",
983
- " plt.title(f'Distribution of {feature}')\n",
984
- " plt.tight_layout()\n",
985
- "plt.show()\n"
986
- ]
987
- },
988
- {
989
- "cell_type": "code",
990
- "execution_count": null,
991
- "id": "43110782",
992
- "metadata": {},
993
- "outputs": [],
994
- "source": []
995
- }
996
- ],
997
- "metadata": {
998
- "kernelspec": {
999
- "display_name": "Python 3 (ipykernel)",
1000
- "language": "python",
1001
- "name": "python3"
1002
- },
1003
- "language_info": {
1004
- "codemirror_mode": {
1005
- "name": "ipython",
1006
- "version": 3
1007
- },
1008
- "file_extension": ".py",
1009
- "mimetype": "text/x-python",
1010
- "name": "python",
1011
- "nbconvert_exporter": "python",
1012
- "pygments_lexer": "ipython3",
1013
- "version": "3.12.4"
1014
- }
1015
- },
1016
- "nbformat": 4,
1017
- "nbformat_minor": 5
1018
- }