numpyUtilsUpdated 0.0.2__tar.gz → 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numpyutilsupdated-0.0.7/PKG-INFO +34 -0
- numpyutilsupdated-0.0.7/README.md +20 -0
- {numpyutilsupdated-0.0.2 → numpyutilsupdated-0.0.7}/pyproject.toml +1 -1
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/__init__.py +102 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set1.py +94 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set10.py +135 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set11.py +82 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set12.py +135 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set13.py +112 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set14.py +123 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set15.py +165 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set16.py +176 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set17.py +165 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set18.py +176 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set2.py +88 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set3.py +105 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set4.py +102 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set5.py +117 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set6.py +92 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set7.py +133 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set8.py +121 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set9.py +133 -0
- numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/temp.py +26 -0
- numpyutilsupdated-0.0.2/PKG-INFO +0 -95
- numpyutilsupdated-0.0.2/README.md +0 -81
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/__init__.py +0 -5
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/AlphaBetaPr.py +0 -42
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/BFS_8pz.py +0 -46
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/DFS_8pz.py +0 -50
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/DLimitS_8pz.py +0 -48
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/GBFS.py +0 -33
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/GraphColor.py +0 -42
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/Minimax.py +0 -71
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/Prologs.py +0 -53
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/UCS.py +0 -38
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/VaccumCleaner.py +0 -59
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/Wumpus.py +0 -89
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/__init__.py +0 -19
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/ticTacToe.py +0 -101
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/waterJug.py +0 -59
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/BFS1.py +0 -94
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Best_First_search1.py +0 -57
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/DFS1.py +0 -97
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/DLS1.py +0 -93
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Graph_coloring1.py +0 -47
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Greedy_BFS1.py +0 -38
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Minimax_Tic_Tac_Toe1.py +0 -102
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/UCS1.py +0 -109
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Vaccum_World1.py +0 -54
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Water_jug1.py +0 -44
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Wumpus_World1.py +0 -87
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/__init__.py +0 -16
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/cs/__init__.py +0 -14
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/cs/ass1.py +0 -102
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/cs/ass2.py +0 -71
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/cs/ass3.py +0 -88
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/cs/ass4.py +0 -76
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/cs/ass5.py +0 -0
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/APSP_floyd_warshall.py +0 -64
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/BinarySearch.py +0 -24
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Dijkstras.py +0 -87
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Greedy_01knapsack.py +0 -62
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Greedy_FKanpsack.py +0 -71
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Job_Seq.py +0 -71
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/KnapDP.py +0 -32
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Kruskals.py +0 -108
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Matrix.py +0 -116
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Prims.py +0 -92
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Quick_Sort.py +0 -62
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/SSSP_bellman_ford.py +0 -81
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/String_editing_problem.py +0 -75
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/TSP.py +0 -31
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/__init__.py +0 -24
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/getInfo.py +0 -0
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/min_max.py +0 -41
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/10_knapsack_dp.py +0 -27
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/11_bellman_ford.py +0 -24
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/12_tsp_dp.py +0 -25
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/1_greedy_knapsack.py +0 -19
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/2_job_sequencing.py +0 -19
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/3_prims_algorithm.py +0 -31
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/4_kruskals_algorithm.py +0 -31
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/5_min_max.py +0 -18
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/6_str.py +0 -53
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/6_strassen.py +0 -49
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/7_dijkstra.py +0 -30
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/8_floyd_warshall.py +0 -23
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/9_edit_distance.py +0 -22
- numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/__init__.py +0 -19
- {numpyutilsupdated-0.0.2 → numpyutilsupdated-0.0.7}/LICENSE +0 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: numpyUtilsUpdated
|
|
3
|
+
Version: 0.0.7
|
|
4
|
+
Summary: A collection of numpy utilities for data analysis and manipulation.
|
|
5
|
+
Project-URL: Homepage, https://github.com/nani-here/numpyUtilsUpdated
|
|
6
|
+
Project-URL: Issues, https://github.com/nani-here/numpyUtilsUpdated/issues
|
|
7
|
+
Author-email: Nani bolthe! <neku.enduku2005@gmail.com>
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Requires-Python: >=3.0
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# numpyUtilsUpdated
|
|
16
|
+
|
|
17
|
+
A comprehensive collection of utility functions designed to enhance your NumPy workflows. This module provides a range of tools to simplify common NumPy operations, improve code readability, and boost your productivity when working with numerical data in Python.
|
|
18
|
+
|
|
19
|
+
This module provides the questions and Python code for the ML lab sets.
|
|
20
|
+
|
|
21
|
+
## Install Module
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install numpyUtilsUpdated
```
|
|
25
|
+
|
|
26
|
+
Then import it like this:
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import numpyUtilsUpdated as nup
```
|
|
30
|
+
|
|
31
|
+
On import, the module prints further usage instructions. The lab code also needs the following packages installed:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install numpy pandas scikit-learn statsmodels matplotlib seaborn scipy
```
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# numpyUtilsUpdated
|
|
2
|
+
|
|
3
|
+
A comprehensive collection of utility functions designed to enhance your NumPy workflows. This module provides a range of tools to simplify common NumPy operations, improve code readability, and boost your productivity when working with numerical data in Python.
|
|
4
|
+
|
|
5
|
+
This module provides the questions and Python code for the ML lab sets.
|
|
6
|
+
|
|
7
|
+
## Install Module
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install numpyUtilsUpdated
```
|
|
11
|
+
|
|
12
|
+
Then import it like this:
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
import numpyUtilsUpdated as nup
```
|
|
16
|
+
|
|
17
|
+
On import, the module prints further usage instructions. The lab code also needs the following packages installed:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install numpy pandas scikit-learn statsmodels matplotlib seaborn scipy
```
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
print("[numpyUtilsUpdated] : Welcome to LM Codes")
|
|
2
|
+
print("!pip install numpy pandas scikit-learn statsmodels matplotlib seaborn scipy --user")
|
|
3
|
+
|
|
4
|
+
print("""
|
|
5
|
+
Use the function:
|
|
6
|
+
|
|
7
|
+
import numpyUtilsUpdated as nup
|
|
8
|
+
nup.getSetDetails(set_no)
|
|
9
|
+
|
|
10
|
+
Example:
|
|
11
|
+
getSetDetails(1)
|
|
12
|
+
|
|
13
|
+
This will print the question for that set.
|
|
14
|
+
|
|
15
|
+
Then to get the code use:
|
|
16
|
+
Replace set1 with your set no, [set1 - set18]
|
|
17
|
+
import numpyUtilsUpdated.set1 as set
|
|
18
|
+
print(set.code)
|
|
19
|
+
""")
|
|
20
|
+
|
|
21
|
+
SET_DETAILS = {
|
|
22
|
+
"set1": {
|
|
23
|
+
"question": "Develop an SLR model to predict MBA salary using Grade 10 percentage from MBA salary.csv. Diagnose the regression model and perform residual analysis using P-P plot."
|
|
24
|
+
},
|
|
25
|
+
|
|
26
|
+
"set2": {
|
|
27
|
+
"question": "Develop a simple linear regression model between Corruption Perception Index (Y) and Gini Index (X) using country.csv. Diagnose the regression model and perform residual analysis using P-P plot."
|
|
28
|
+
},
|
|
29
|
+
|
|
30
|
+
"set3": {
|
|
31
|
+
"question": "Develop an SLR model using MBA salary.csv and detect outliers using Z-score and Cook’s distance. Make prediction and measure accuracy."
|
|
32
|
+
},
|
|
33
|
+
|
|
34
|
+
"set4": {
|
|
35
|
+
"question": "Using IPL dataset, build an MLR model, show the summary, and identify features with multicollinearity."
|
|
36
|
+
},
|
|
37
|
+
|
|
38
|
+
"set5": {
|
|
39
|
+
"question": "Using IPL dataset, build an MLR model, detect multicollinearity, rebuild the model after removing it, and perform residual analysis using P-P plot."
|
|
40
|
+
},
|
|
41
|
+
|
|
42
|
+
"set6": {
|
|
43
|
+
"question": "Using country.csv dataset, build an SLR model between Corruption Index and Gini Index, detect outliers using Z-score and Cook’s distance, and evaluate prediction accuracy."
|
|
44
|
+
},
|
|
45
|
+
|
|
46
|
+
"set7": {
|
|
47
|
+
"question": "Using GermanCredit.csv dataset, build a logistic regression model to predict credit risk, identify significant features, rebuild the model, compute confusion matrix, precision, recall, ROC and AUC."
|
|
48
|
+
},
|
|
49
|
+
|
|
50
|
+
"set8": {
|
|
51
|
+
"question": "Using GermanCredit.csv dataset, build logistic regression, compute Youden’s index for cut-offs from 0.1 to 0.5, find optimal cut-off, build confusion matrix, and compute ROC and AUC."
|
|
52
|
+
},
|
|
53
|
+
|
|
54
|
+
"set9": {
|
|
55
|
+
"question": "Demonstrate Gain and Lift charts using bank.csv dataset."
|
|
56
|
+
},
|
|
57
|
+
|
|
58
|
+
"set10": {
|
|
59
|
+
"question": "Using the tennis dataset, build a logistic regression model to predict PLAY, identify significant features, analyze coefficients, and compute confusion matrix with precision and recall."
|
|
60
|
+
},
|
|
61
|
+
|
|
62
|
+
"set11": {
|
|
63
|
+
"question": "Construct the decision tree using Gini impurity for the given training dataset."
|
|
64
|
+
},
|
|
65
|
+
|
|
66
|
+
"set12": {
|
|
67
|
+
"question": "Demonstrate Gradient Descent Algorithm for Linear Regression using Advertising.csv dataset."
|
|
68
|
+
},
|
|
69
|
+
|
|
70
|
+
"set13": {
|
|
71
|
+
"question": "Build logistic regression models on bank.csv dataset for both imbalanced and balanced data, evaluate using 5-fold cross-validation and ROC AUC score."
|
|
72
|
+
},
|
|
73
|
+
|
|
74
|
+
"set14": {
|
|
75
|
+
"question": "Demonstrate the KNN algorithm using a suitable dataset."
|
|
76
|
+
},
|
|
77
|
+
|
|
78
|
+
"set15": {
|
|
79
|
+
"question": "Using Income Data.csv dataset, perform K-Means clustering, draw scatter plot of age vs income, normalize features, plot clusters, and interpret cluster centers."
|
|
80
|
+
},
|
|
81
|
+
|
|
82
|
+
"set16": {
|
|
83
|
+
"question": "Using customerspends.csv dataset, perform K-Means clustering, visualize clusters, normalize features, use dendrogram and elbow method, and print cluster centers."
|
|
84
|
+
},
|
|
85
|
+
|
|
86
|
+
"set17": {
|
|
87
|
+
"question": "Using Income Data.csv dataset, identify clusters, normalize features, apply elbow method, and print records and cluster centers."
|
|
88
|
+
},
|
|
89
|
+
|
|
90
|
+
"set18": {
|
|
91
|
+
"question": "Perform product segmentation using K-Means clustering on customerspends.csv dataset, visualize clusters, verify with dendrogram and elbow method, and print cluster centers."
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
def getSetDetails(set_no):
    """Print the lab question registered for a set number.

    The lookup key is built as ``"set<set_no>"`` and resolved against the
    module-level ``SET_DETAILS`` table.

    Args:
        set_no: Value interpolated into the key (e.g. ``1`` -> ``"set1"``).

    Returns:
        None. Prints a header line and the question text when the key
        exists, otherwise prints ``"Invalid set number"``.
    """
    entry = SET_DETAILS.get(f"set{set_no}")

    # Guard clause: unknown keys are reported and nothing else is printed.
    if entry is None:
        print("Invalid set number")
        return

    print(f"Set {set_no} Question:\n")
    print(entry["question"])
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
code="""
|
|
2
|
+
# ============================================================
|
|
3
|
+
# COMMON IMPORTS — Run this cell FIRST before any question
|
|
4
|
+
# ============================================================
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
import statsmodels.api as sm
|
|
8
|
+
import matplotlib.pyplot as plt
|
|
9
|
+
import seaborn as sns
|
|
10
|
+
from scipy.stats import zscore
|
|
11
|
+
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
|
|
12
|
+
from sklearn.linear_model import LogisticRegression
|
|
13
|
+
from sklearn import metrics
|
|
14
|
+
from sklearn.neighbors import KNeighborsClassifier
|
|
15
|
+
from sklearn.cluster import KMeans
|
|
16
|
+
from sklearn.preprocessing import StandardScaler
|
|
17
|
+
from sklearn.utils import resample, shuffle
|
|
18
|
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
|
19
|
+
from statsmodels.graphics.regressionplots import influence_plot
|
|
20
|
+
from scipy.cluster.hierarchy import dendrogram, linkage
|
|
21
|
+
|
|
22
|
+
print('All imports successful!')
|
|
23
|
+
|
|
24
|
+
# ============================================================
|
|
25
|
+
# QUESTION 1 — SLR on MBA Salary Dataset
|
|
26
|
+
# ============================================================
|
|
27
|
+
|
|
28
|
+
# ---------- Step 1: Load Dataset ----------
|
|
29
|
+
mba_df = pd.read_csv('MBA_salary.csv')
|
|
30
|
+
print('First 5 rows:')
|
|
31
|
+
print(mba_df.head())
|
|
32
|
+
print('\nDataset Info:')
|
|
33
|
+
print(mba_df.info())
|
|
34
|
+
|
|
35
|
+
# ---------- Step 2 (i): Build SLR Model ----------
|
|
36
|
+
# X = Grade 10 percentage (independent variable)
|
|
37
|
+
# y = Salary (dependent variable)
|
|
38
|
+
X = sm.add_constant(mba_df['percentage in Grade 10']) # adds intercept column
|
|
39
|
+
y = mba_df['salary']
|
|
40
|
+
|
|
41
|
+
# Split: 80% train, 20% test
|
|
42
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
43
|
+
X, y, train_size=0.8, random_state=42)
|
|
44
|
+
|
|
45
|
+
# Fit OLS (Ordinary Least Squares) model
|
|
46
|
+
mba_lm = sm.OLS(y_train, X_train).fit()
|
|
47
|
+
print('\n===== MODEL SUMMARY (i) =====')
|
|
48
|
+
print(mba_lm.summary2())
|
|
49
|
+
|
|
50
|
+
# ---------- Step 3 (ii): Diagnose the Model — Homoscedasticity ----------
|
|
51
|
+
mba_resid = mba_lm.resid # residuals = actual − predicted
|
|
52
|
+
|
|
53
|
+
def get_std_values(vals):
|
|
54
|
+
return (vals - vals.mean()) / vals.std()
|
|
55
|
+
|
|
56
|
+
plt.figure(figsize=(8, 5))
|
|
57
|
+
plt.scatter(
|
|
58
|
+
get_std_values(mba_lm.fittedvalues),
|
|
59
|
+
get_std_values(mba_resid)
|
|
60
|
+
)
|
|
61
|
+
plt.axhline(y=0, color='red', linestyle='--')
|
|
62
|
+
plt.title('(ii) Residual Plot — Homoscedasticity Check')
|
|
63
|
+
plt.xlabel('Standardized Predicted Values')
|
|
64
|
+
plt.ylabel('Standardized Residuals')
|
|
65
|
+
plt.show()
|
|
66
|
+
|
|
67
|
+
# ---------- Step 4 (iii): P-P Plot — Residual Normality ----------
|
|
68
|
+
probplot = sm.ProbPlot(mba_resid)
|
|
69
|
+
plt.figure(figsize=(8, 5))
|
|
70
|
+
probplot.ppplot(line='45')
|
|
71
|
+
plt.title('(iii) Normal P-P Plot of Regression Standardized Residuals')
|
|
72
|
+
plt.show()
|
|
73
|
+
|
|
74
|
+
print('''
|
|
75
|
+
EXPLANATION:
|
|
76
|
+
(i) sm.add_constant() adds an intercept (β₀) column.
|
|
77
|
+
sm.OLS().fit() trains the linear model on training data.
|
|
78
|
+
summary2() shows R², coefficients, p-values, AIC, BIC.
|
|
79
|
+
• R²: % of salary variation explained by Grade 10 %.
|
|
80
|
+
• p-value < 0.05 → feature is statistically significant.
|
|
81
|
+
|
|
82
|
+
(ii) Residual Plot checks Homoscedasticity:
|
|
83
|
+
→ Residuals randomly scattered around 0 = model is valid.
|
|
84
|
+
→ Pattern or funnel shape = Heteroscedasticity (problem).
|
|
85
|
+
|
|
86
|
+
(iii) P-P Plot checks if residuals are Normally distributed:
|
|
87
|
+
→ Points close to 45° line = residuals are normal = model is valid.
|
|
88
|
+
→ Points far from line = residuals are NOT normal.
|
|
89
|
+
''')
|
|
90
|
+
|
|
91
|
+
END
|
|
92
|
+
"""
|
|
93
|
+
def getCode():
    """Print the text stored in the module-level ``code`` string."""
    print(code)
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
code="""
|
|
2
|
+
# ============================================================
|
|
3
|
+
# COMMON IMPORTS — Run this cell FIRST before any question
|
|
4
|
+
# ============================================================
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
import statsmodels.api as sm
|
|
8
|
+
import matplotlib.pyplot as plt
|
|
9
|
+
import seaborn as sns
|
|
10
|
+
from scipy.stats import zscore
|
|
11
|
+
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
|
|
12
|
+
from sklearn.linear_model import LogisticRegression
|
|
13
|
+
from sklearn import metrics
|
|
14
|
+
from sklearn.neighbors import KNeighborsClassifier
|
|
15
|
+
from sklearn.cluster import KMeans
|
|
16
|
+
from sklearn.preprocessing import StandardScaler
|
|
17
|
+
from sklearn.utils import resample, shuffle
|
|
18
|
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
|
19
|
+
from statsmodels.graphics.regressionplots import influence_plot
|
|
20
|
+
from scipy.cluster.hierarchy import dendrogram, linkage
|
|
21
|
+
|
|
22
|
+
print('All imports successful!')
|
|
23
|
+
|
|
24
|
+
# ============================================================
|
|
25
|
+
# QUESTION 10 — Logistic Regression on Tennis Dataset
|
|
26
|
+
# ============================================================
|
|
27
|
+
|
|
28
|
+
# ---------- Step 1: Create Dataset from Question ----------
|
|
29
|
+
tennis_data = {
|
|
30
|
+
'DAY' : [f'Day{i}' for i in range(1, 15)],
|
|
31
|
+
'OUTLOOK' : ['Sunny','Sunny','Overcast','Rain','Rain','Rain',
|
|
32
|
+
'Overcast','Sunny','Sunny','Rain','Sunny','Overcast','Overcast','Rain'],
|
|
33
|
+
'TEMP' : ['Hot','Hot','Hot','Mild','Cool','Cool','Cool',
|
|
34
|
+
'Mild','Cool','Mild','Mild','Mild','Hot','Mild'],
|
|
35
|
+
'HUMIDITY': ['High','High','High','High','Normal','Normal','Normal',
|
|
36
|
+
'High','Normal','Normal','Normal','High','Normal','High'],
|
|
37
|
+
'WIND' : ['Weak','Strong','Weak','Weak','Weak','Strong','Strong',
|
|
38
|
+
'Weak','Weak','Weak','Strong','Strong','Weak','Strong'],
|
|
39
|
+
'PLAY' : ['NO','NO','YES','YES','YES','NO','YES',
|
|
40
|
+
'NO','YES','YES','YES','YES','YES','NO']
|
|
41
|
+
}
|
|
42
|
+
tennis_df = pd.DataFrame(tennis_data)
|
|
43
|
+
print('Tennis Dataset:')
|
|
44
|
+
print(tennis_df.to_string(index=False))
|
|
45
|
+
|
|
46
|
+
# ---------- Step 2 (i): Build Logistic Model ----------
|
|
47
|
+
tennis_enc = pd.get_dummies(
|
|
48
|
+
tennis_df[['OUTLOOK','TEMP','HUMIDITY','WIND']],
|
|
49
|
+
drop_first=True, dtype=int)
|
|
50
|
+
y_tennis = (tennis_df['PLAY'] == 'YES').astype(int)
|
|
51
|
+
|
|
52
|
+
# Use sklearn since n=14 is very small (statsmodels may not converge)
|
|
53
|
+
tennis_clf = LogisticRegression(max_iter=10000, solver='lbfgs')
|
|
54
|
+
tennis_clf.fit(tennis_enc, y_tennis)
|
|
55
|
+
|
|
56
|
+
# Show all features and coefficients
|
|
57
|
+
coeff_tennis = pd.DataFrame({
|
|
58
|
+
'feature' : tennis_enc.columns,
|
|
59
|
+
'coefficient': tennis_clf.coef_[0]
|
|
60
|
+
}).sort_values('coefficient', ascending=False)
|
|
61
|
+
|
|
62
|
+
print('\n===== (i) All Features and Coefficients =====')
|
|
63
|
+
print(coeff_tennis.to_string(index=False))
|
|
64
|
+
|
|
65
|
+
# ---------- Step 3 (ii): Significant Features ----------
|
|
66
|
+
# For small datasets use statsmodels for p-values
|
|
67
|
+
X_tennis_sm = sm.add_constant(tennis_enc)
|
|
68
|
+
try:
|
|
69
|
+
logit_sm = sm.Logit(y_tennis, X_tennis_sm).fit(maxiter=500, disp=False)
|
|
70
|
+
sig_tennis = [v for v, p in logit_sm.pvalues.items() if p <= 0.05]
|
|
71
|
+
print(f'\n(ii) Significant features (p ≤ 0.05): {sig_tennis}')
|
|
72
|
+
except:
|
|
73
|
+
# Fallback: use features with largest absolute coefficients
|
|
74
|
+
sig_tennis = list(coeff_tennis.nlargest(3, 'coefficient')['feature']) + \
|
|
75
|
+
list(coeff_tennis.nsmallest(2, 'coefficient')['feature'])
|
|
76
|
+
print(f'\n(ii) Top features by coefficient magnitude: {sig_tennis}')
|
|
77
|
+
|
|
78
|
+
# Build new model with significant features
|
|
79
|
+
tennis_clf2 = LogisticRegression(max_iter=10000)
|
|
80
|
+
sig_tennis_cols = [c for c in sig_tennis if c in tennis_enc.columns and c != 'const']
|
|
81
|
+
if sig_tennis_cols:
|
|
82
|
+
tennis_clf2.fit(tennis_enc[sig_tennis_cols], y_tennis)
|
|
83
|
+
print(f'New model features: {sig_tennis_cols}')
|
|
84
|
+
else:
|
|
85
|
+
tennis_clf2 = tennis_clf
|
|
86
|
+
sig_tennis_cols = list(tennis_enc.columns)
|
|
87
|
+
|
|
88
|
+
# ---------- Step 4 (iii): Positive/Negative Effects ----------
|
|
89
|
+
print('\n===== (iii) Effect on Probability of PLAY =====')
|
|
90
|
+
for _, row in coeff_tennis.iterrows():
|
|
91
|
+
effect = 'POSITIVE (+) — increases PLAY probability' if row['coefficient'] > 0 \
|
|
92
|
+
else 'NEGATIVE (−) — decreases PLAY probability'
|
|
93
|
+
print(f" {row['feature']:30s}: coeff={row['coefficient']:+.4f} → {effect}")
|
|
94
|
+
|
|
95
|
+
# ---------- Step 5 (iv): Confusion Matrix at 0.5 Cut-off ----------
|
|
96
|
+
pred_tennis = tennis_clf.predict(tennis_enc) # uses default 0.5 threshold
|
|
97
|
+
|
|
98
|
+
cm_tennis = metrics.confusion_matrix(y_tennis, pred_tennis)
|
|
99
|
+
plt.figure(figsize=(6, 5))
|
|
100
|
+
sns.heatmap(cm_tennis, annot=True, fmt='.0f',
|
|
101
|
+
xticklabels=['NO', 'YES'],
|
|
102
|
+
yticklabels=['NO', 'YES'],
|
|
103
|
+
cmap='Blues')
|
|
104
|
+
plt.xlabel('Predicted')
|
|
105
|
+
plt.ylabel('Actual')
|
|
106
|
+
plt.title('(iv) Confusion Matrix — Tennis (cut-off = 0.5)')
|
|
107
|
+
plt.show()
|
|
108
|
+
|
|
109
|
+
print('\n(iv) Classification Report (PLAY = YES):')
|
|
110
|
+
print(metrics.classification_report(y_tennis, pred_tennis,
|
|
111
|
+
target_names=['NO', 'YES']))
|
|
112
|
+
|
|
113
|
+
print('''
|
|
114
|
+
EXPLANATION:
|
|
115
|
+
(i) get_dummies() converts OUTLOOK (Sunny/Overcast/Rain) → binary columns.
|
|
116
|
+
OUTLOOK_Overcast=1, OUTLOOK_Sunny=1, TEMP_Hot=1, WIND_Weak=1 etc.
|
|
117
|
+
drop_first=True: removes one level per feature to avoid multicollinearity.
|
|
118
|
+
|
|
119
|
+
(ii) Only 14 training samples → statsmodels may struggle to converge.
|
|
120
|
+
Significant features are those with the strongest influence.
|
|
121
|
+
|
|
122
|
+
(iii) Positive coefficient:
|
|
123
|
+
e.g., HUMIDITY_Normal = positive → Normal humidity increases chance of PLAY.
|
|
124
|
+
Negative coefficient:
|
|
125
|
+
e.g., WIND_Strong = negative → Strong wind decreases chance of PLAY.
|
|
126
|
+
|
|
127
|
+
(iv) Precision for PLAY=YES:
|
|
128
|
+
Of all days we predicted play, how many actually played.
|
|
129
|
+
Recall for PLAY=YES:
|
|
130
|
+
Of all actual play days, how many did we correctly predict.
|
|
131
|
+
''')
|
|
132
|
+
|
|
133
|
+
END
|
|
134
|
+
|
|
135
|
+
"""
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
code = """
|
|
2
|
+
# =====================================
|
|
3
|
+
# Decision Tree using Gini Impurity
|
|
4
|
+
# =====================================
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import matplotlib.pyplot as plt
|
|
8
|
+
import seaborn as sns
|
|
9
|
+
from sklearn.tree import DecisionTreeClassifier, plot_tree
|
|
10
|
+
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
|
|
11
|
+
|
|
12
|
+
# -------------------------------------
|
|
13
|
+
# DATASET
|
|
14
|
+
# -------------------------------------
|
|
15
|
+
data = {
|
|
16
|
+
'Weather':['Sunny','Cloudy','Sunny','Cloudy','Rainy','Rainy','Rainy','Sunny','Cloudy','Rainy'],
|
|
17
|
+
'Temperature':['Hot','Hot','Mild','Mild','Mild','Cool','Mild','Hot','Hot','Mild'],
|
|
18
|
+
'Humidity':['High','High','Normal','High','High','Normal','High','High','Normal','High'],
|
|
19
|
+
'Wind':['Weak','Weak','Strong','Strong','Strong','Strong','Weak','Strong','Weak','Strong'],
|
|
20
|
+
'Play':['No','Yes','Yes','Yes','No','No','Yes','No','Yes','No']
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
df = pd.DataFrame(data)
|
|
24
|
+
print("Dataset:\n",df)
|
|
25
|
+
|
|
26
|
+
# -------------------------------------
|
|
27
|
+
# ENCODE DATA
|
|
28
|
+
# -------------------------------------
|
|
29
|
+
df = pd.get_dummies(df)
|
|
30
|
+
|
|
31
|
+
X = df.drop('Play_Yes',axis=1)
|
|
32
|
+
y = df['Play_Yes']
|
|
33
|
+
|
|
34
|
+
# -------------------------------------
|
|
35
|
+
# TRAIN MODEL
|
|
36
|
+
# -------------------------------------
|
|
37
|
+
model = DecisionTreeClassifier(criterion='gini',max_depth=3)
|
|
38
|
+
model.fit(X,y)
|
|
39
|
+
|
|
40
|
+
# -------------------------------------
|
|
41
|
+
# TREE GRAPH
|
|
42
|
+
# -------------------------------------
|
|
43
|
+
plt.figure(figsize=(12,6))
|
|
44
|
+
plot_tree(model,feature_names=X.columns,class_names=['No','Yes'],filled=True)
|
|
45
|
+
plt.title("Decision Tree (Gini Impurity)")
|
|
46
|
+
plt.show()
|
|
47
|
+
|
|
48
|
+
# -------------------------------------
|
|
49
|
+
# PREDICTIONS
|
|
50
|
+
# -------------------------------------
|
|
51
|
+
y_pred = model.predict(X)
|
|
52
|
+
|
|
53
|
+
print("\nAccuracy:",accuracy_score(y,y_pred))
|
|
54
|
+
print("\nClassification Report:\n",classification_report(y,y_pred))
|
|
55
|
+
|
|
56
|
+
# -------------------------------------
|
|
57
|
+
# CONFUSION MATRIX (DIAGNOSTIC)
|
|
58
|
+
# -------------------------------------
|
|
59
|
+
cm = confusion_matrix(y,y_pred)
|
|
60
|
+
|
|
61
|
+
plt.figure(figsize=(5,4))
|
|
62
|
+
sns.heatmap(cm,annot=True,fmt='d',cmap='Blues',
|
|
63
|
+
xticklabels=['No','Yes'],
|
|
64
|
+
yticklabels=['No','Yes'])
|
|
65
|
+
plt.xlabel("Predicted")
|
|
66
|
+
plt.ylabel("Actual")
|
|
67
|
+
plt.title("Confusion Matrix")
|
|
68
|
+
plt.show()
|
|
69
|
+
|
|
70
|
+
# -------------------------------------
|
|
71
|
+
# FEATURE IMPORTANCE GRAPH
|
|
72
|
+
# -------------------------------------
|
|
73
|
+
importance = pd.Series(model.feature_importances_,index=X.columns)
|
|
74
|
+
|
|
75
|
+
importance.sort_values().plot(kind='barh',figsize=(8,5))
|
|
76
|
+
plt.title("Feature Importance (Gini Reduction)")
|
|
77
|
+
plt.xlabel("Importance")
|
|
78
|
+
plt.show()
|
|
79
|
+
|
|
80
|
+
END
|
|
81
|
+
|
|
82
|
+
"""
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
code='''
|
|
2
|
+
# ============================================================
|
|
3
|
+
# COMMON IMPORTS — Run this cell FIRST before any question
|
|
4
|
+
# ============================================================
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import numpy as np
|
|
7
|
+
import statsmodels.api as sm
|
|
8
|
+
import matplotlib.pyplot as plt
|
|
9
|
+
import seaborn as sns
|
|
10
|
+
from scipy.stats import zscore
|
|
11
|
+
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
|
|
12
|
+
from sklearn.linear_model import LogisticRegression
|
|
13
|
+
from sklearn import metrics
|
|
14
|
+
from sklearn.neighbors import KNeighborsClassifier
|
|
15
|
+
from sklearn.cluster import KMeans
|
|
16
|
+
from sklearn.preprocessing import StandardScaler
|
|
17
|
+
from sklearn.utils import resample, shuffle
|
|
18
|
+
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
|
19
|
+
from statsmodels.graphics.regressionplots import influence_plot
|
|
20
|
+
from scipy.cluster.hierarchy import dendrogram, linkage
|
|
21
|
+
|
|
22
|
+
print('All imports successful!')
|
|
23
|
+
|
|
24
|
+
# ============================================================
|
|
25
|
+
# QUESTION 12 — Gradient Descent Algorithm for Linear Regression
|
|
26
|
+
# ============================================================
|
|
27
|
+
|
|
28
|
+
# ---------- Step 1: Load Dataset ----------
|
|
29
|
+
sales_df = pd.read_csv('Advertising.csv')
|
|
30
|
+
print('Dataset:')
|
|
31
|
+
print(sales_df.head())
|
|
32
|
+
print('Shape:', sales_df.shape)
|
|
33
|
+
|
|
34
|
+
X_sales = sales_df[['TV', 'Radio', 'Newspaper']]
|
|
35
|
+
y_sales = sales_df['sales']
|
|
36
|
+
|
|
37
|
+
# ---------- Step 2: Standardize Features ----------
|
|
38
|
+
y_std = np.array((y_sales - y_sales.mean()) / y_sales.std())
|
|
39
|
+
X_std = X_sales.apply(
|
|
40
|
+
lambda col: (col - col.mean()) / col.std(), axis=0)
|
|
41
|
+
|
|
42
|
+
# ---------- Step 3: Define Gradient Descent Functions ----------
|
|
43
|
+
|
|
44
|
+
def initialize(dim):
|
|
45
|
+
"""Random initialization of weights (w) and bias (b)"""
|
|
46
|
+
np.random.seed(42)
|
|
47
|
+
b = np.random.random()
|
|
48
|
+
w = np.random.rand(dim)
|
|
49
|
+
return b, w
|
|
50
|
+
|
|
51
|
+
def predict_y(b, w, X):
|
|
52
|
+
"""Forward pass: y_hat = b + X·w"""
|
|
53
|
+
return b + np.matmul(X, w)
|
|
54
|
+
|
|
55
|
+
def get_cost(y, y_hat):
|
|
56
|
+
"""Cost function: Mean Squared Error (MSE)"""
|
|
57
|
+
residuals = y - y_hat
|
|
58
|
+
return np.sum(np.matmul(residuals.T, residuals)) / len(residuals)
|
|
59
|
+
|
|
60
|
+
def update_beta(X, y, y_hat, b0, w0, learning_rate):
|
|
61
|
+
"""Gradient update step for bias and weights"""
|
|
62
|
+
db = (np.sum(y_hat - y) * 2) / len(y) # gradient for bias
|
|
63
|
+
dw = (np.dot((y_hat - y), X) * 2) / len(y) # gradient for weights
|
|
64
|
+
b1 = b0 - learning_rate * db # update bias
|
|
65
|
+
w1 = w0 - learning_rate * dw # update weights
|
|
66
|
+
return b1, w1
|
|
67
|
+
|
|
68
|
+
def run_grad(X, y, alpha=0.01, num_iterations=100):
|
|
69
|
+
"""Run full Gradient Descent for given iterations and learning rate"""
|
|
70
|
+
b, w = initialize(X.shape[1])
|
|
71
|
+
iter_num = 0
|
|
72
|
+
gd_iter_df = pd.DataFrame(columns=['iterations', 'cost'])
|
|
73
|
+
result_idx = 0
|
|
74
|
+
|
|
75
|
+
for iter_num in range(num_iterations):
|
|
76
|
+
y_hat = predict_y(b, w, X)
|
|
77
|
+
this_cost = get_cost(y, y_hat)
|
|
78
|
+
prev_b, prev_w = b, w
|
|
79
|
+
b, w = update_beta(X, y, y_hat, prev_b, prev_w, alpha)
|
|
80
|
+
|
|
81
|
+
if iter_num % 10 == 0: # record every 10th iteration
|
|
82
|
+
gd_iter_df.loc[result_idx] = [iter_num, this_cost]
|
|
83
|
+
result_idx += 1
|
|
84
|
+
|
|
85
|
+
print(f'Final estimate of b & w: {round(b,5)} {np.round(w,5)}')
|
|
86
|
+
return gd_iter_df, b, w
|
|
87
|
+
|
|
88
|
+
# ---------- Step 4: Show Initial Parameters ----------
|
|
89
|
+
b_init, w_init = initialize(3)
|
|
90
|
+
print(f'\nInitial Bias : {b_init:.4f}')
|
|
91
|
+
print(f'Initial Weights : {w_init}')
|
|
92
|
+
|
|
93
|
+
y_hat_init = predict_y(b_init, w_init, X_std.values)
|
|
94
|
+
cost_init = get_cost(y_std, y_hat_init)
|
|
95
|
+
print(f'Initial Cost (MSE): {cost_init:.4f}')
|
|
96
|
+
|
|
97
|
+
b_init, w_init = update_beta(
|
|
98
|
+
X_std.values, y_std, y_hat_init, b_init, w_init, 0.01)
|
|
99
|
+
print(f'\nAfter first update → Bias: {b_init:.4f}, Weights: {np.round(w_init,4)}')
|
|
100
|
+
|
|
101
|
+
# ---------- Step 5: Run Gradient Descent ----------
|
|
102
|
+
print('\n===== Running Gradient Descent (alpha=0.01, 2000 iterations) =====')
|
|
103
|
+
gd_df1, b1, w1 = run_grad(X_std.values, y_std, alpha=0.01, num_iterations=2000)
|
|
104
|
+
print('\nCost per 10 iterations (first 40 rows):')
|
|
105
|
+
print(gd_df1.head(40).to_string(index=False))
|
|
106
|
+
|
|
107
|
+
# ---------- Step 6: Plot Cost vs Iterations ----------
|
|
108
|
+
print('\n===== Running with alpha=0.001 for comparison =====')
|
|
109
|
+
gd_df2, b2, w2 = run_grad(X_std.values, y_std, alpha=0.001, num_iterations=2000)
|
|
110
|
+
|
|
111
|
+
plt.figure(figsize=(12, 5))
|
|
112
|
+
plt.subplot(1, 2, 1)
|
|
113
|
+
plt.plot(gd_df1['iterations'].astype(float),
|
|
114
|
+
gd_df1['cost'].astype(float), color='blue', label='alpha=0.01')
|
|
115
|
+
plt.xlabel('No. of Iterations')
|
|
116
|
+
plt.ylabel('Cost (MSE)')
|
|
117
|
+
plt.title('Cost vs Iterations (alpha=0.01)')
|
|
118
|
+
plt.legend()
|
|
119
|
+
plt.grid(True, alpha=0.3)
|
|
120
|
+
|
|
121
|
+
plt.subplot(1, 2, 2)
|
|
122
|
+
plt.plot(gd_df2['iterations'].astype(float),
|
|
123
|
+
gd_df2['cost'].astype(float), color='orange', label='alpha=0.001')
|
|
124
|
+
plt.xlabel('No. of Iterations')
|
|
125
|
+
plt.ylabel('Cost (MSE)')
|
|
126
|
+
plt.title('Cost vs Iterations (alpha=0.001)')
|
|
127
|
+
plt.legend()
|
|
128
|
+
plt.grid(True, alpha=0.3)
|
|
129
|
+
|
|
130
|
+
plt.tight_layout()
|
|
131
|
+
plt.show()
|
|
132
|
+
|
|
133
|
+
END
|
|
134
|
+
|
|
135
|
+
'''
|