numpyUtilsUpdated 0.0.2.tar.gz → 0.0.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. numpyutilsupdated-0.0.7/PKG-INFO +34 -0
  2. numpyutilsupdated-0.0.7/README.md +20 -0
  3. {numpyutilsupdated-0.0.2 → numpyutilsupdated-0.0.7}/pyproject.toml +1 -1
  4. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/__init__.py +102 -0
  5. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set1.py +94 -0
  6. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set10.py +135 -0
  7. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set11.py +82 -0
  8. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set12.py +135 -0
  9. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set13.py +112 -0
  10. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set14.py +123 -0
  11. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set15.py +165 -0
  12. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set16.py +176 -0
  13. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set17.py +165 -0
  14. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set18.py +176 -0
  15. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set2.py +88 -0
  16. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set3.py +105 -0
  17. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set4.py +102 -0
  18. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set5.py +117 -0
  19. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set6.py +92 -0
  20. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set7.py +133 -0
  21. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set8.py +121 -0
  22. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/set9.py +133 -0
  23. numpyutilsupdated-0.0.7/src/numpyUtilsUpdated/temp.py +26 -0
  24. numpyutilsupdated-0.0.2/PKG-INFO +0 -95
  25. numpyutilsupdated-0.0.2/README.md +0 -81
  26. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/__init__.py +0 -5
  27. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/AlphaBetaPr.py +0 -42
  28. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/BFS_8pz.py +0 -46
  29. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/DFS_8pz.py +0 -50
  30. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/DLimitS_8pz.py +0 -48
  31. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/GBFS.py +0 -33
  32. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/GraphColor.py +0 -42
  33. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/Minimax.py +0 -71
  34. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/Prologs.py +0 -53
  35. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/UCS.py +0 -38
  36. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/VaccumCleaner.py +0 -59
  37. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/Wumpus.py +0 -89
  38. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/__init__.py +0 -19
  39. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/ticTacToe.py +0 -101
  40. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/ai/waterJug.py +0 -59
  41. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/BFS1.py +0 -94
  42. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Best_First_search1.py +0 -57
  43. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/DFS1.py +0 -97
  44. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/DLS1.py +0 -93
  45. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Graph_coloring1.py +0 -47
  46. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Greedy_BFS1.py +0 -38
  47. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Minimax_Tic_Tac_Toe1.py +0 -102
  48. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/UCS1.py +0 -109
  49. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Vaccum_World1.py +0 -54
  50. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Water_jug1.py +0 -44
  51. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/Wumpus_World1.py +0 -87
  52. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/aib/__init__.py +0 -16
  53. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/cs/__init__.py +0 -14
  54. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/cs/ass1.py +0 -102
  55. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/cs/ass2.py +0 -71
  56. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/cs/ass3.py +0 -88
  57. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/cs/ass4.py +0 -76
  58. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/cs/ass5.py +0 -0
  59. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/APSP_floyd_warshall.py +0 -64
  60. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/BinarySearch.py +0 -24
  61. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Dijkstras.py +0 -87
  62. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Greedy_01knapsack.py +0 -62
  63. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Greedy_FKanpsack.py +0 -71
  64. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Job_Seq.py +0 -71
  65. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/KnapDP.py +0 -32
  66. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Kruskals.py +0 -108
  67. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Matrix.py +0 -116
  68. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Prims.py +0 -92
  69. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/Quick_Sort.py +0 -62
  70. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/SSSP_bellman_ford.py +0 -81
  71. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/String_editing_problem.py +0 -75
  72. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/TSP.py +0 -31
  73. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/__init__.py +0 -24
  74. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/getInfo.py +0 -0
  75. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/daa/min_max.py +0 -41
  76. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/10_knapsack_dp.py +0 -27
  77. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/11_bellman_ford.py +0 -24
  78. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/12_tsp_dp.py +0 -25
  79. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/1_greedy_knapsack.py +0 -19
  80. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/2_job_sequencing.py +0 -19
  81. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/3_prims_algorithm.py +0 -31
  82. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/4_kruskals_algorithm.py +0 -31
  83. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/5_min_max.py +0 -18
  84. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/6_str.py +0 -53
  85. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/6_strassen.py +0 -49
  86. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/7_dijkstra.py +0 -30
  87. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/8_floyd_warshall.py +0 -23
  88. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/9_edit_distance.py +0 -22
  89. numpyutilsupdated-0.0.2/src/numpyUtilsUpdated/dab/__init__.py +0 -19
  90. {numpyutilsupdated-0.0.2 → numpyutilsupdated-0.0.7}/LICENSE +0 -0
@@ -0,0 +1,34 @@
1
+ Metadata-Version: 2.4
2
+ Name: numpyUtilsUpdated
3
+ Version: 0.0.7
4
+ Summary: A collection of numpy utilities for data analysis and manipulation.
5
+ Project-URL: Homepage, https://github.com/nani-here/numpyUtilsUpdated
6
+ Project-URL: Issues, https://github.com/nani-here/numpyUtilsUpdated/issues
7
+ Author-email: Nani bolthe! <neku.enduku2005@gmail.com>
8
+ License-File: LICENSE
9
+ Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3
12
+ Requires-Python: >=3.0
13
+ Description-Content-Type: text/markdown
14
+
15
+ # numpyUtilsUpdated
16
+
17
+ A comprehensive collection of utility functions designed to enhance your NumPy workflows. This module provides a range of tools to simplify common NumPy operations, improve code readability, and boost your productivity when working with numerical data in Python.
18
+
19
+ This module provides questions and Python codes for ML lab sets.
20
+
21
+ ## Install Module
22
+
23
+ ```bash
24
+ pip install numpyUtilsUpdated
25
+
26
+ then import like this:
27
+
28
+ ```bash
29
+ import numpyUtilsUpdated as nup
30
+
31
+ the module will give further instructions
32
+
33
+ ```bash
34
+ pip install numpy pandas scikit-learn statsmodels matplotlib seaborn scipy
@@ -0,0 +1,20 @@
1
+ # numpyUtilsUpdated
2
+
3
+ A comprehensive collection of utility functions designed to enhance your NumPy workflows. This module provides a range of tools to simplify common NumPy operations, improve code readability, and boost your productivity when working with numerical data in Python.
4
+
5
+ This module provides questions and Python codes for ML lab sets.
6
+
7
+ ## Install Module
8
+
9
+ ```bash
10
+ pip install numpyUtilsUpdated
11
+
12
+ then import like this:
13
+
14
+ ```bash
15
+ import numpyUtilsUpdated as nup
16
+
17
+ the module will give further instructions
18
+
19
+ ```bash
20
+ pip install numpy pandas scikit-learn statsmodels matplotlib seaborn scipy
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "numpyUtilsUpdated"
7
- version = "0.0.2"
7
+ version = "0.0.7"
8
8
  authors = [
9
9
  { name="Nani bolthe!", email="neku.enduku2005@gmail.com" },
10
10
  ]
@@ -0,0 +1,102 @@
1
+ print("[numpyUtilsUpdated] : Welcome to LM Codes")
2
+ print("!pip install numpy pandas scikit-learn statsmodels matplotlib seaborn scipy --user")
3
+
4
+ print("""
5
+ Use the function:
6
+
7
+ import numpyUtilsUpdated as nup
8
+ nup.getSetDetails(set_no)
9
+
10
+ Example:
11
+ getSetDetails(1)
12
+
13
+ This will print the question for that set.
14
+
15
+ Then to get the code use:
16
+ Replace set1 with your set no, [set1 - set18]
17
+ import numpyUtilsUpdated.set1 as set
18
+ print(set.code)
19
+ """)
20
+
21
+ SET_DETAILS = {
22
+ "set1": {
23
+ "question": "Develop an SLR model to predict MBA salary using Grade 10 percentage from MBA salary.csv. Diagnose the regression model and perform residual analysis using P-P plot."
24
+ },
25
+
26
+ "set2": {
27
+ "question": "Develop a simple linear regression model between Corruption Perception Index (Y) and Gini Index (X) using country.csv. Diagnose the regression model and perform residual analysis using P-P plot."
28
+ },
29
+
30
+ "set3": {
31
+ "question": "Develop an SLR model using MBA salary.csv and detect outliers using Z-score and Cook’s distance. Make prediction and measure accuracy."
32
+ },
33
+
34
+ "set4": {
35
+ "question": "Using IPL dataset, build an MLR model, show the summary, and identify features with multicollinearity."
36
+ },
37
+
38
+ "set5": {
39
+ "question": "Using IPL dataset, build an MLR model, detect multicollinearity, rebuild the model after removing it, and perform residual analysis using P-P plot."
40
+ },
41
+
42
+ "set6": {
43
+ "question": "Using country.csv dataset, build an SLR model between Corruption Index and Gini Index, detect outliers using Z-score and Cook’s distance, and evaluate prediction accuracy."
44
+ },
45
+
46
+ "set7": {
47
+ "question": "Using GermanCredit.csv dataset, build a logistic regression model to predict credit risk, identify significant features, rebuild the model, compute confusion matrix, precision, recall, ROC and AUC."
48
+ },
49
+
50
+ "set8": {
51
+ "question": "Using GermanCredit.csv dataset, build logistic regression, compute Youden’s index for cut-offs from 0.1 to 0.5, find optimal cut-off, build confusion matrix, and compute ROC and AUC."
52
+ },
53
+
54
+ "set9": {
55
+ "question": "Demonstrate Gain and Lift charts using bank.csv dataset."
56
+ },
57
+
58
+ "set10": {
59
+ "question": "Using the tennis dataset, build a logistic regression model to predict PLAY, identify significant features, analyze coefficients, and compute confusion matrix with precision and recall."
60
+ },
61
+
62
+ "set11": {
63
+ "question": "Construct the decision tree using Gini impurity for the given training dataset."
64
+ },
65
+
66
+ "set12": {
67
+ "question": "Demonstrate Gradient Descent Algorithm for Linear Regression using Advertising.csv dataset."
68
+ },
69
+
70
+ "set13": {
71
+ "question": "Build logistic regression models on bank.csv dataset for both imbalanced and balanced data, evaluate using 5-fold cross-validation and ROC AUC score."
72
+ },
73
+
74
+ "set14": {
75
+ "question": "Demonstrate the KNN algorithm using a suitable dataset."
76
+ },
77
+
78
+ "set15": {
79
+ "question": "Using Income Data.csv dataset, perform K-Means clustering, draw scatter plot of age vs income, normalize features, plot clusters, and interpret cluster centers."
80
+ },
81
+
82
+ "set16": {
83
+ "question": "Using customerspends.csv dataset, perform K-Means clustering, visualize clusters, normalize features, use dendrogram and elbow method, and print cluster centers."
84
+ },
85
+
86
+ "set17": {
87
+ "question": "Using Income Data.csv dataset, identify clusters, normalize features, apply elbow method, and print records and cluster centers."
88
+ },
89
+
90
+ "set18": {
91
+ "question": "Perform product segmentation using K-Means clustering on customerspends.csv dataset, visualize clusters, verify with dendrogram and elbow method, and print cluster centers."
92
+ }
93
+ }
94
+
95
+ def getSetDetails(set_no):
96
+ key = f"set{set_no}"
97
+
98
+ if key in SET_DETAILS:
99
+ print(f"Set {set_no} Question:\n")
100
+ print(SET_DETAILS[key]["question"])
101
+ else:
102
+ print("Invalid set number")
@@ -0,0 +1,94 @@
1
+ code="""
2
+ # ============================================================
3
+ # COMMON IMPORTS — Run this cell FIRST before any question
4
+ # ============================================================
5
+ import pandas as pd
6
+ import numpy as np
7
+ import statsmodels.api as sm
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+ from scipy.stats import zscore
11
+ from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
12
+ from sklearn.linear_model import LogisticRegression
13
+ from sklearn import metrics
14
+ from sklearn.neighbors import KNeighborsClassifier
15
+ from sklearn.cluster import KMeans
16
+ from sklearn.preprocessing import StandardScaler
17
+ from sklearn.utils import resample, shuffle
18
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
19
+ from statsmodels.graphics.regressionplots import influence_plot
20
+ from scipy.cluster.hierarchy import dendrogram, linkage
21
+
22
+ print('All imports successful!')
23
+
24
+ # ============================================================
25
+ # QUESTION 1 — SLR on MBA Salary Dataset
26
+ # ============================================================
27
+
28
+ # ---------- Step 1: Load Dataset ----------
29
+ mba_df = pd.read_csv('MBA_salary.csv')
30
+ print('First 5 rows:')
31
+ print(mba_df.head())
32
+ print('\nDataset Info:')
33
+ print(mba_df.info())
34
+
35
+ # ---------- Step 2 (i): Build SLR Model ----------
36
+ # X = Grade 10 percentage (independent variable)
37
+ # y = Salary (dependent variable)
38
+ X = sm.add_constant(mba_df['percentage in Grade 10']) # adds intercept column
39
+ y = mba_df['salary']
40
+
41
+ # Split: 80% train, 20% test
42
+ X_train, X_test, y_train, y_test = train_test_split(
43
+ X, y, train_size=0.8, random_state=42)
44
+
45
+ # Fit OLS (Ordinary Least Squares) model
46
+ mba_lm = sm.OLS(y_train, X_train).fit()
47
+ print('\n===== MODEL SUMMARY (i) =====')
48
+ print(mba_lm.summary2())
49
+
50
+ # ---------- Step 3 (ii): Diagnose the Model — Homoscedasticity ----------
51
+ mba_resid = mba_lm.resid # residuals = actual − predicted
52
+
53
+ def get_std_values(vals):
54
+ return (vals - vals.mean()) / vals.std()
55
+
56
+ plt.figure(figsize=(8, 5))
57
+ plt.scatter(
58
+ get_std_values(mba_lm.fittedvalues),
59
+ get_std_values(mba_resid)
60
+ )
61
+ plt.axhline(y=0, color='red', linestyle='--')
62
+ plt.title('(ii) Residual Plot — Homoscedasticity Check')
63
+ plt.xlabel('Standardized Predicted Values')
64
+ plt.ylabel('Standardized Residuals')
65
+ plt.show()
66
+
67
+ # ---------- Step 4 (iii): P-P Plot — Residual Normality ----------
68
+ probplot = sm.ProbPlot(mba_resid)
69
+ plt.figure(figsize=(8, 5))
70
+ probplot.ppplot(line='45')
71
+ plt.title('(iii) Normal P-P Plot of Regression Standardized Residuals')
72
+ plt.show()
73
+
74
+ print('''
75
+ EXPLANATION:
76
+ (i) sm.add_constant() adds an intercept (β₀) column.
77
+ sm.OLS().fit() trains the linear model on training data.
78
+ summary2() shows R², coefficients, p-values, AIC, BIC.
79
+ • R²: % of salary variation explained by Grade 10 %.
80
+ • p-value < 0.05 → feature is statistically significant.
81
+
82
+ (ii) Residual Plot checks Homoscedasticity:
83
+ → Residuals randomly scattered around 0 = model is valid.
84
+ → Pattern or funnel shape = Heteroscedasticity (problem).
85
+
86
+ (iii) P-P Plot checks if residuals are Normally distributed:
87
+ → Points close to 45° line = residuals are normal = model is valid.
88
+ → Points far from line = residuals are NOT normal.
89
+ ''')
90
+
91
+ END
92
+ """
93
+ def getCode():
94
+ print(code)
@@ -0,0 +1,135 @@
1
+ code="""
2
+ # ============================================================
3
+ # COMMON IMPORTS — Run this cell FIRST before any question
4
+ # ============================================================
5
+ import pandas as pd
6
+ import numpy as np
7
+ import statsmodels.api as sm
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+ from scipy.stats import zscore
11
+ from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
12
+ from sklearn.linear_model import LogisticRegression
13
+ from sklearn import metrics
14
+ from sklearn.neighbors import KNeighborsClassifier
15
+ from sklearn.cluster import KMeans
16
+ from sklearn.preprocessing import StandardScaler
17
+ from sklearn.utils import resample, shuffle
18
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
19
+ from statsmodels.graphics.regressionplots import influence_plot
20
+ from scipy.cluster.hierarchy import dendrogram, linkage
21
+
22
+ print('All imports successful!')
23
+
24
+ # ============================================================
25
+ # QUESTION 10 — Logistic Regression on Tennis Dataset
26
+ # ============================================================
27
+
28
+ # ---------- Step 1: Create Dataset from Question ----------
29
+ tennis_data = {
30
+ 'DAY' : [f'Day{i}' for i in range(1, 15)],
31
+ 'OUTLOOK' : ['Sunny','Sunny','Overcast','Rain','Rain','Rain',
32
+ 'Overcast','Sunny','Sunny','Rain','Sunny','Overcast','Overcast','Rain'],
33
+ 'TEMP' : ['Hot','Hot','Hot','Mild','Cool','Cool','Cool',
34
+ 'Mild','Cool','Mild','Mild','Mild','Hot','Mild'],
35
+ 'HUMIDITY': ['High','High','High','High','Normal','Normal','Normal',
36
+ 'High','Normal','Normal','Normal','High','Normal','High'],
37
+ 'WIND' : ['Weak','Strong','Weak','Weak','Weak','Strong','Strong',
38
+ 'Weak','Weak','Weak','Strong','Strong','Weak','Strong'],
39
+ 'PLAY' : ['NO','NO','YES','YES','YES','NO','YES',
40
+ 'NO','YES','YES','YES','YES','YES','NO']
41
+ }
42
+ tennis_df = pd.DataFrame(tennis_data)
43
+ print('Tennis Dataset:')
44
+ print(tennis_df.to_string(index=False))
45
+
46
+ # ---------- Step 2 (i): Build Logistic Model ----------
47
+ tennis_enc = pd.get_dummies(
48
+ tennis_df[['OUTLOOK','TEMP','HUMIDITY','WIND']],
49
+ drop_first=True, dtype=int)
50
+ y_tennis = (tennis_df['PLAY'] == 'YES').astype(int)
51
+
52
+ # Use sklearn since n=14 is very small (statsmodels may not converge)
53
+ tennis_clf = LogisticRegression(max_iter=10000, solver='lbfgs')
54
+ tennis_clf.fit(tennis_enc, y_tennis)
55
+
56
+ # Show all features and coefficients
57
+ coeff_tennis = pd.DataFrame({
58
+ 'feature' : tennis_enc.columns,
59
+ 'coefficient': tennis_clf.coef_[0]
60
+ }).sort_values('coefficient', ascending=False)
61
+
62
+ print('\n===== (i) All Features and Coefficients =====')
63
+ print(coeff_tennis.to_string(index=False))
64
+
65
+ # ---------- Step 3 (ii): Significant Features ----------
66
+ # For small datasets use statsmodels for p-values
67
+ X_tennis_sm = sm.add_constant(tennis_enc)
68
+ try:
69
+ logit_sm = sm.Logit(y_tennis, X_tennis_sm).fit(maxiter=500, disp=False)
70
+ sig_tennis = [v for v, p in logit_sm.pvalues.items() if p <= 0.05]
71
+ print(f'\n(ii) Significant features (p ≤ 0.05): {sig_tennis}')
72
+ except:
73
+ # Fallback: use features with largest absolute coefficients
74
+ sig_tennis = list(coeff_tennis.nlargest(3, 'coefficient')['feature']) + \
75
+ list(coeff_tennis.nsmallest(2, 'coefficient')['feature'])
76
+ print(f'\n(ii) Top features by coefficient magnitude: {sig_tennis}')
77
+
78
+ # Build new model with significant features
79
+ tennis_clf2 = LogisticRegression(max_iter=10000)
80
+ sig_tennis_cols = [c for c in sig_tennis if c in tennis_enc.columns and c != 'const']
81
+ if sig_tennis_cols:
82
+ tennis_clf2.fit(tennis_enc[sig_tennis_cols], y_tennis)
83
+ print(f'New model features: {sig_tennis_cols}')
84
+ else:
85
+ tennis_clf2 = tennis_clf
86
+ sig_tennis_cols = list(tennis_enc.columns)
87
+
88
+ # ---------- Step 4 (iii): Positive/Negative Effects ----------
89
+ print('\n===== (iii) Effect on Probability of PLAY =====')
90
+ for _, row in coeff_tennis.iterrows():
91
+ effect = 'POSITIVE (+) — increases PLAY probability' if row['coefficient'] > 0 \
92
+ else 'NEGATIVE (−) — decreases PLAY probability'
93
+ print(f" {row['feature']:30s}: coeff={row['coefficient']:+.4f} → {effect}")
94
+
95
+ # ---------- Step 5 (iv): Confusion Matrix at 0.5 Cut-off ----------
96
+ pred_tennis = tennis_clf.predict(tennis_enc) # uses default 0.5 threshold
97
+
98
+ cm_tennis = metrics.confusion_matrix(y_tennis, pred_tennis)
99
+ plt.figure(figsize=(6, 5))
100
+ sns.heatmap(cm_tennis, annot=True, fmt='.0f',
101
+ xticklabels=['NO', 'YES'],
102
+ yticklabels=['NO', 'YES'],
103
+ cmap='Blues')
104
+ plt.xlabel('Predicted')
105
+ plt.ylabel('Actual')
106
+ plt.title('(iv) Confusion Matrix — Tennis (cut-off = 0.5)')
107
+ plt.show()
108
+
109
+ print('\n(iv) Classification Report (PLAY = YES):')
110
+ print(metrics.classification_report(y_tennis, pred_tennis,
111
+ target_names=['NO', 'YES']))
112
+
113
+ print('''
114
+ EXPLANATION:
115
+ (i) get_dummies() converts OUTLOOK (Sunny/Overcast/Rain) → binary columns.
116
+ OUTLOOK_Overcast=1, OUTLOOK_Sunny=1, TEMP_Hot=1, WIND_Weak=1 etc.
117
+ drop_first=True: removes one level per feature to avoid multicollinearity.
118
+
119
+ (ii) Only 14 training samples → statsmodels may struggle to converge.
120
+ Significant features are those with the strongest influence.
121
+
122
+ (iii) Positive coefficient:
123
+ e.g., HUMIDITY_Normal = positive → Normal humidity increases chance of PLAY.
124
+ Negative coefficient:
125
+ e.g., WIND_Strong = negative → Strong wind decreases chance of PLAY.
126
+
127
+ (iv) Precision for PLAY=YES:
128
+ Of all days we predicted play, how many actually played.
129
+ Recall for PLAY=YES:
130
+ Of all actual play days, how many did we correctly predict.
131
+ ''')
132
+
133
+ END
134
+
135
+ """
@@ -0,0 +1,82 @@
1
+ code = """
2
+ # =====================================
3
+ # Decision Tree using Gini Impurity
4
+ # =====================================
5
+
6
+ import pandas as pd
7
+ import matplotlib.pyplot as plt
8
+ import seaborn as sns
9
+ from sklearn.tree import DecisionTreeClassifier, plot_tree
10
+ from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
11
+
12
+ # -------------------------------------
13
+ # DATASET
14
+ # -------------------------------------
15
+ data = {
16
+ 'Weather':['Sunny','Cloudy','Sunny','Cloudy','Rainy','Rainy','Rainy','Sunny','Cloudy','Rainy'],
17
+ 'Temperature':['Hot','Hot','Mild','Mild','Mild','Cool','Mild','Hot','Hot','Mild'],
18
+ 'Humidity':['High','High','Normal','High','High','Normal','High','High','Normal','High'],
19
+ 'Wind':['Weak','Weak','Strong','Strong','Strong','Strong','Weak','Strong','Weak','Strong'],
20
+ 'Play':['No','Yes','Yes','Yes','No','No','Yes','No','Yes','No']
21
+ }
22
+
23
+ df = pd.DataFrame(data)
24
+ print("Dataset:\n",df)
25
+
26
+ # -------------------------------------
27
+ # ENCODE DATA
28
+ # -------------------------------------
29
+ df = pd.get_dummies(df)
30
+
31
+ X = df.drop('Play_Yes',axis=1)
32
+ y = df['Play_Yes']
33
+
34
+ # -------------------------------------
35
+ # TRAIN MODEL
36
+ # -------------------------------------
37
+ model = DecisionTreeClassifier(criterion='gini',max_depth=3)
38
+ model.fit(X,y)
39
+
40
+ # -------------------------------------
41
+ # TREE GRAPH
42
+ # -------------------------------------
43
+ plt.figure(figsize=(12,6))
44
+ plot_tree(model,feature_names=X.columns,class_names=['No','Yes'],filled=True)
45
+ plt.title("Decision Tree (Gini Impurity)")
46
+ plt.show()
47
+
48
+ # -------------------------------------
49
+ # PREDICTIONS
50
+ # -------------------------------------
51
+ y_pred = model.predict(X)
52
+
53
+ print("\nAccuracy:",accuracy_score(y,y_pred))
54
+ print("\nClassification Report:\n",classification_report(y,y_pred))
55
+
56
+ # -------------------------------------
57
+ # CONFUSION MATRIX (DIAGNOSTIC)
58
+ # -------------------------------------
59
+ cm = confusion_matrix(y,y_pred)
60
+
61
+ plt.figure(figsize=(5,4))
62
+ sns.heatmap(cm,annot=True,fmt='d',cmap='Blues',
63
+ xticklabels=['No','Yes'],
64
+ yticklabels=['No','Yes'])
65
+ plt.xlabel("Predicted")
66
+ plt.ylabel("Actual")
67
+ plt.title("Confusion Matrix")
68
+ plt.show()
69
+
70
+ # -------------------------------------
71
+ # FEATURE IMPORTANCE GRAPH
72
+ # -------------------------------------
73
+ importance = pd.Series(model.feature_importances_,index=X.columns)
74
+
75
+ importance.sort_values().plot(kind='barh',figsize=(8,5))
76
+ plt.title("Feature Importance (Gini Reduction)")
77
+ plt.xlabel("Importance")
78
+ plt.show()
79
+
80
+ END
81
+
82
+ """
@@ -0,0 +1,135 @@
1
+ code='''
2
+ # ============================================================
3
+ # COMMON IMPORTS — Run this cell FIRST before any question
4
+ # ============================================================
5
+ import pandas as pd
6
+ import numpy as np
7
+ import statsmodels.api as sm
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+ from scipy.stats import zscore
11
+ from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
12
+ from sklearn.linear_model import LogisticRegression
13
+ from sklearn import metrics
14
+ from sklearn.neighbors import KNeighborsClassifier
15
+ from sklearn.cluster import KMeans
16
+ from sklearn.preprocessing import StandardScaler
17
+ from sklearn.utils import resample, shuffle
18
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
19
+ from statsmodels.graphics.regressionplots import influence_plot
20
+ from scipy.cluster.hierarchy import dendrogram, linkage
21
+
22
+ print('All imports successful!')
23
+
24
+ # ============================================================
25
+ # QUESTION 12 — Gradient Descent Algorithm for Linear Regression
26
+ # ============================================================
27
+
28
+ # ---------- Step 1: Load Dataset ----------
29
+ sales_df = pd.read_csv('Advertising.csv')
30
+ print('Dataset:')
31
+ print(sales_df.head())
32
+ print('Shape:', sales_df.shape)
33
+
34
+ X_sales = sales_df[['TV', 'Radio', 'Newspaper']]
35
+ y_sales = sales_df['sales']
36
+
37
+ # ---------- Step 2: Standardize Features ----------
38
+ y_std = np.array((y_sales - y_sales.mean()) / y_sales.std())
39
+ X_std = X_sales.apply(
40
+ lambda col: (col - col.mean()) / col.std(), axis=0)
41
+
42
+ # ---------- Step 3: Define Gradient Descent Functions ----------
43
+
44
+ def initialize(dim):
45
+ """Random initialization of weights (w) and bias (b)"""
46
+ np.random.seed(42)
47
+ b = np.random.random()
48
+ w = np.random.rand(dim)
49
+ return b, w
50
+
51
+ def predict_y(b, w, X):
52
+ """Forward pass: y_hat = b + X·w"""
53
+ return b + np.matmul(X, w)
54
+
55
+ def get_cost(y, y_hat):
56
+ """Cost function: Mean Squared Error (MSE)"""
57
+ residuals = y - y_hat
58
+ return np.sum(np.matmul(residuals.T, residuals)) / len(residuals)
59
+
60
+ def update_beta(X, y, y_hat, b0, w0, learning_rate):
61
+ """Gradient update step for bias and weights"""
62
+ db = (np.sum(y_hat - y) * 2) / len(y) # gradient for bias
63
+ dw = (np.dot((y_hat - y), X) * 2) / len(y) # gradient for weights
64
+ b1 = b0 - learning_rate * db # update bias
65
+ w1 = w0 - learning_rate * dw # update weights
66
+ return b1, w1
67
+
68
+ def run_grad(X, y, alpha=0.01, num_iterations=100):
69
+ """Run full Gradient Descent for given iterations and learning rate"""
70
+ b, w = initialize(X.shape[1])
71
+ iter_num = 0
72
+ gd_iter_df = pd.DataFrame(columns=['iterations', 'cost'])
73
+ result_idx = 0
74
+
75
+ for iter_num in range(num_iterations):
76
+ y_hat = predict_y(b, w, X)
77
+ this_cost = get_cost(y, y_hat)
78
+ prev_b, prev_w = b, w
79
+ b, w = update_beta(X, y, y_hat, prev_b, prev_w, alpha)
80
+
81
+ if iter_num % 10 == 0: # record every 10th iteration
82
+ gd_iter_df.loc[result_idx] = [iter_num, this_cost]
83
+ result_idx += 1
84
+
85
+ print(f'Final estimate of b & w: {round(b,5)} {np.round(w,5)}')
86
+ return gd_iter_df, b, w
87
+
88
+ # ---------- Step 4: Show Initial Parameters ----------
89
+ b_init, w_init = initialize(3)
90
+ print(f'\nInitial Bias : {b_init:.4f}')
91
+ print(f'Initial Weights : {w_init}')
92
+
93
+ y_hat_init = predict_y(b_init, w_init, X_std.values)
94
+ cost_init = get_cost(y_std, y_hat_init)
95
+ print(f'Initial Cost (MSE): {cost_init:.4f}')
96
+
97
+ b_init, w_init = update_beta(
98
+ X_std.values, y_std, y_hat_init, b_init, w_init, 0.01)
99
+ print(f'\nAfter first update → Bias: {b_init:.4f}, Weights: {np.round(w_init,4)}')
100
+
101
+ # ---------- Step 5: Run Gradient Descent ----------
102
+ print('\n===== Running Gradient Descent (alpha=0.01, 2000 iterations) =====')
103
+ gd_df1, b1, w1 = run_grad(X_std.values, y_std, alpha=0.01, num_iterations=2000)
104
+ print('\nCost per 10 iterations (first 40 rows):')
105
+ print(gd_df1.head(40).to_string(index=False))
106
+
107
+ # ---------- Step 6: Plot Cost vs Iterations ----------
108
+ print('\n===== Running with alpha=0.001 for comparison =====')
109
+ gd_df2, b2, w2 = run_grad(X_std.values, y_std, alpha=0.001, num_iterations=2000)
110
+
111
+ plt.figure(figsize=(12, 5))
112
+ plt.subplot(1, 2, 1)
113
+ plt.plot(gd_df1['iterations'].astype(float),
114
+ gd_df1['cost'].astype(float), color='blue', label='alpha=0.01')
115
+ plt.xlabel('No. of Iterations')
116
+ plt.ylabel('Cost (MSE)')
117
+ plt.title('Cost vs Iterations (alpha=0.01)')
118
+ plt.legend()
119
+ plt.grid(True, alpha=0.3)
120
+
121
+ plt.subplot(1, 2, 2)
122
+ plt.plot(gd_df2['iterations'].astype(float),
123
+ gd_df2['cost'].astype(float), color='orange', label='alpha=0.001')
124
+ plt.xlabel('No. of Iterations')
125
+ plt.ylabel('Cost (MSE)')
126
+ plt.title('Cost vs Iterations (alpha=0.001)')
127
+ plt.legend()
128
+ plt.grid(True, alpha=0.3)
129
+
130
+ plt.tight_layout()
131
+ plt.show()
132
+
133
+ END
134
+
135
+ '''