myawesomepkg 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. myawesomepkg/TSAPY1/1 (A) Working with Numpy Arrays.py +1146 -0
  2. myawesomepkg/TSAPY1/1(B)Aggregation (1).py +319 -0
  3. myawesomepkg/TSAPY1/1(C) Broadcasting .py +328 -0
  4. myawesomepkg/TSAPY1/10-A_Load_stringr.py +77 -0
  5. myawesomepkg/TSAPY1/10-B_Forcats.py +70 -0
  6. myawesomepkg/TSAPY1/2(a) Comparison, Masking And Boolean Logic (1).py +497 -0
  7. myawesomepkg/TSAPY1/2(b)Fancy Indexing.py +594 -0
  8. myawesomepkg/TSAPY1/2(c) Sorting Arrays.py +528 -0
  9. myawesomepkg/TSAPY1/2(d) Structured Array.py +350 -0
  10. myawesomepkg/TSAPY1/3 (A) Handling Missing Data.py +1013 -0
  11. myawesomepkg/TSAPY1/4A_Merge_Joins.py +1209 -0
  12. myawesomepkg/TSAPY1/9A_Dplyr.py +85 -0
  13. myawesomepkg/TSAPY1/9B_Tidyr.py +71 -0
  14. myawesomepkg/TSAPY1/Aggregation_Groupin_Pivot_Filter_Vectorice_Time_Series.py +1999 -0
  15. myawesomepkg/TSAPY1/Combining_Joins.py +1209 -0
  16. myawesomepkg/TSAPY1/P4-1-different_distance_methods_(euclidean)_with_prediction,_test_score_and_confusion_matrix1.py +131 -0
  17. myawesomepkg/TSAPY1/P4-2-k_means_clustering_with_prediction,_test_score_and_confusion_matrix2.py +150 -0
  18. myawesomepkg/TSAPY1/Pract3_C.py +482 -0
  19. myawesomepkg/TSAPY1/Pract5_Data_Visualization.py +481 -0
  20. myawesomepkg/TSAPY1/Practical 6.py +860 -0
  21. myawesomepkg/TSAPY1/Practical No 1.py +148 -0
  22. myawesomepkg/TSAPY1/Practical No 2.py +115 -0
  23. myawesomepkg/TSAPY1/Practical No 3.py +168 -0
  24. myawesomepkg/TSAPY1/Practical No 4 A.py +233 -0
  25. myawesomepkg/TSAPY1/Practical No 4 B.py +137 -0
  26. myawesomepkg/TSAPY1/Practical No 5.py +52 -0
  27. myawesomepkg/TSAPY1/Practical No 6.py +29 -0
  28. myawesomepkg/TSAPY1/Practical No 7.py +67 -0
  29. myawesomepkg/TSAPY1/Practical No 8.py +108 -0
  30. myawesomepkg/TSAPY1/Print_R.py +123 -0
  31. myawesomepkg/TSAPY1/R_Graph.py +32 -0
  32. myawesomepkg/TSAPY1/Working_Ggplot.py +53 -0
  33. myawesomepkg/TSAPY1/__init__.py +0 -0
  34. myawesomepkg/TSAPY1/p1_2_pca_iris.py +141 -0
  35. myawesomepkg/TSAPY1/p2_1_find_s.py +78 -0
  36. myawesomepkg/TSAPY1/p2_bcandidate_elimination_algorithm_(1).py +85 -0
  37. myawesomepkg/TSAPY1/p3_1_least_square_regression.py +105 -0
  38. myawesomepkg/TSAPY1/p3_2_logistic_regression_algorithm.py +79 -0
  39. myawesomepkg/TSAPY1/p5_1_hierarchical_clustering.py +143 -0
  40. myawesomepkg/TSAPY1/p5_2_k_nearest_neighbour_algorithm.py +104 -0
  41. myawesomepkg/TSAPY1/p6_1_id3_algorithm_.py +199 -0
  42. myawesomepkg/TSAPY1/p7_1_ann_backpropagation_algorithm.py +116 -0
  43. myawesomepkg/TSAPY1/p7_2_bds_association_rule_mining.py +99 -0
  44. myawesomepkg/TSAPY1/p8_1_gaussian_naive_bayes_.py +97 -0
  45. myawesomepkg/TSAPY1/p8_2_naive_bayes_document_classifier.py +111 -0
  46. myawesomepkg/TSAPY1/p9_1bayesian_network.py +91 -0
  47. myawesomepkg/TSAPY1/p9_b_loess_regression.py +113 -0
  48. myawesomepkg/TSAPY1/p_1_test_and_train.py +98 -0
  49. myawesomepkg/TSAPY1/pract3A-B.py +3212 -0
  50. myawesomepkg/TSAPY1/practical_no_3.py +167 -0
  51. myawesomepkg/TSAPY1/practical_no_4.py +215 -0
  52. myawesomepkg/TSAPY1/practical_no_4b.py +78 -0
  53. myawesomepkg/TSAPY1/practical_no_5_ac_and_pca.py +39 -0
  54. myawesomepkg/TSAPY1/practical_no_6.py +37 -0
  55. myawesomepkg/TSAPY1/practical_no_7.py +69 -0
  56. myawesomepkg/TSAPY1/practical_no_8.py +79 -0
  57. myawesomepkg/TSAPY1/tsa_practical_no_1.py +287 -0
  58. myawesomepkg/TSAPY1/tsa_practical_no_2.py +121 -0
  59. myawesomepkg/__init__.py +1 -0
  60. myawesomepkg/core.py +2 -0
  61. myawesomepkg-0.1.8.dist-info/METADATA +17 -0
  62. myawesomepkg-0.1.8.dist-info/RECORD +64 -0
  63. myawesomepkg-0.1.8.dist-info/WHEEL +5 -0
  64. myawesomepkg-0.1.8.dist-info/top_level.txt +1 -0
myawesomepkg/TSAPY1/p1_2_pca_iris.py
@@ -0,0 +1,141 @@
+ # -*- coding: utf-8 -*-
+ """P1-2-PCA Iris.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1PyEQgfiZYSfkl9j1bYi61d2LQwF86D3U
+ """
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.preprocessing import StandardScaler
+ sc = StandardScaler()
+
+ def PCA(X, num_components):
+
+     # Step 1: standardize the data (mean-centering variant kept for reference)
+     # X_meaned = X - np.mean(X, axis=0)
+     X_meaned = sc.fit_transform(X)
+
+     # Step 2: covariance matrix of the standardized data
+     cov_mat = np.cov(X_meaned, rowvar=False)
+
+     # Step 3: eigenvalues and eigenvectors
+     eigen_values, eigen_vectors = np.linalg.eigh(cov_mat)
+
+     # Step 4: sort both in descending order of eigenvalue
+     sorted_index = np.argsort(eigen_values)[::-1]
+     sorted_eigenvalue = eigen_values[sorted_index]
+     sorted_eigenvectors = eigen_vectors[:, sorted_index]
+
+     # Step 5: keep the leading eigenvectors
+     eigenvector_subset = sorted_eigenvectors[:, 0:num_components]
+
+     # Step 6: project the data onto them
+     X_reduced = np.dot(eigenvector_subset.transpose(), X_meaned.transpose()).transpose()
+
+     return X_reduced
+
+ data = pd.read_csv("/content/iris.csv")
+ data.head()
+
+ # Prepare the data
+ x = data.iloc[:, 0:4]
+
+ # Prepare the target
+ target = data.iloc[:, 4]
+
+ x.head()
+
+ target.head()
+
+ # Apply the PCA function
+ mat_reduced = PCA(x, 2)
+
+ # Create a pandas DataFrame of the reduced dataset
+ principal_df = pd.DataFrame(mat_reduced, columns=['PC1', 'PC2'])
+
+ # Display the principal components
+ principal_df.head()
+
+ # Concatenate with the target variable to create a complete dataset
+ principal_df = pd.concat([principal_df, pd.DataFrame(target)], axis=1)
+
+ principal_df.head()
+
+ import seaborn as sb
+ import matplotlib.pyplot as plt
+
+ plt.figure(figsize=(6, 6))
+ sb.scatterplot(data=principal_df, x='PC1', y='PC2', hue="variety", s=60, palette='icefire')
+ # sb.scatterplot(data=principal_df, x='PC1', y='PC2', s=60, palette='icefire')
+
+ # my code
+
+ # Import necessary libraries
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from sklearn.preprocessing import StandardScaler
+
+ # Initialize scaler
+ scaler = StandardScaler()
+
+ # Define PCA function
+ def PCA(X, num_components):
+     # Step 1: Standardize the dataset
+     X_scaled = scaler.fit_transform(X)
+
+     # Step 2: Compute covariance matrix
+     covariance_matrix = np.cov(X_scaled, rowvar=False)
+
+     # Step 3: Compute eigenvalues and eigenvectors
+     eigen_values, eigen_vectors = np.linalg.eigh(covariance_matrix)
+
+     # Step 4: Sort eigenvalues and eigenvectors in descending order
+     sorted_indices = np.argsort(eigen_values)[::-1]
+     eigen_values = eigen_values[sorted_indices]
+     eigen_vectors = eigen_vectors[:, sorted_indices]
+
+     # Step 5: Select top components
+     eigenvector_subset = eigen_vectors[:, :num_components]
+
+     # Step 6: Project data onto principal components
+     X_reduced = np.dot(X_scaled, eigenvector_subset)
+
+     return X_reduced, eigen_values
+
+ # Load dataset
+ data = pd.read_csv(r"/content/iris.csv")
+
+ # Separate features and target
+ X = data.iloc[:, :-1]
+ y = data.iloc[:, -1]
+
+ # Apply PCA
+ X_reduced, eigen_values = PCA(X, 2)
+
+ # Create a DataFrame with principal components
+ pca_df = pd.DataFrame(X_reduced, columns=['PC1', 'PC2'])
+ pca_df['Variety'] = y
+
+ # Display first few rows
+ print(pca_df.head())
+
+ # Variance explained by each component
+ explained_variance = (eigen_values / np.sum(eigen_values)) * 100
+ print("\nExplained Variance (%):")
+ print(explained_variance[:2])
+
+ # Visualization
+ plt.figure(figsize=(7, 6))
+ sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Variety', s=80, palette='viridis')
+ plt.title('PCA on Iris Dataset', fontsize=14)
+ plt.xlabel(f'PC1 ({explained_variance[0]:.2f}% variance)')
+ plt.ylabel(f'PC2 ({explained_variance[1]:.2f}% variance)')
+ plt.legend(title='Variety')
+ plt.grid(True, linestyle='--', alpha=0.6)
+ plt.show()
+
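
A minimal cross-check of the hand-rolled PCA above against scikit-learn's built-in implementation, not part of the packaged file. It assumes the same /content/iris.csv layout with four numeric feature columns, and compares absolute values because the sign of each component is arbitrary.

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as SkPCA

data = pd.read_csv("/content/iris.csv")
X = data.iloc[:, 0:4]

# Same preprocessing as the script: standardize, then eigendecompose the covariance matrix.
X_scaled = StandardScaler().fit_transform(X)
cov = np.cov(X_scaled, rowvar=False)
vals, vecs = np.linalg.eigh(cov)
order = np.argsort(vals)[::-1]
manual = X_scaled @ vecs[:, order[:2]]

# Library projection on the same standardized data.
sk = SkPCA(n_components=2).fit_transform(X_scaled)

# Component signs are arbitrary, so compare magnitudes column by column.
print(np.allclose(np.abs(manual), np.abs(sk), atol=1e-6))  # expected: True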
myawesomepkg/TSAPY1/p2_1_find_s.py
@@ -0,0 +1,78 @@
+ # -*- coding: utf-8 -*-
+ """P2-1 Find-S.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1f8yD_lr15trt6SrY0k7GkzBGPc-iHKDR
+ """
+
+ import pandas as pd
+ import numpy as np
+
+ data = pd.read_csv('/content/lab1.csv')
+ data
+
+ concepts = np.array(data)[:, :-1]
+ concepts
+
+ target = np.array(data)[:, -1]
+ target
+
+ def train(con, tar):
+     for i, val in enumerate(tar):
+         if val == 'yes':
+             specific_h = con[i].copy()
+             break
+     for i, val in enumerate(con):
+         if tar[i] == 'yes':
+             for x in range(len(specific_h)):
+                 if val[x] != specific_h[x]:
+                     specific_h[x] = '?'
+                 else:
+                     pass
+     return specific_h
+
+ print(train(concepts, target))
+
+ # my code
+
+ # ✅ Improved FIND-S Algorithm Implementation
+
+ import pandas as pd
+ import numpy as np
+
+ # Load dataset
+ data = pd.read_csv(r"/content/lab1.csv")
+
+ # Display the data
+ print("Dataset:\n", data, "\n")
+
+ # Separate features (concepts) and target
+ concepts = data.iloc[:, :-1].values
+ target = data.iloc[:, -1].values
+
+ # FIND-S algorithm definition
+ def find_s(concepts, target):
+     # Step 1: Initialize specific hypothesis with the first positive example
+     specific_h = None
+     for i, val in enumerate(target):
+         if val.lower() == 'yes':
+             specific_h = concepts[i].copy()
+             break
+
+     # Step 2: Generalize only when necessary
+     for i, val in enumerate(target):
+         if val.lower() == 'yes':
+             for j in range(len(specific_h)):
+                 if concepts[i][j] != specific_h[j]:
+                     specific_h[j] = '?'
+
+     return specific_h
+
+ # Train the model
+ final_hypothesis = find_s(concepts, target)
+
+ # Display results
+ print("✅ Final Specific Hypothesis:\n", final_hypothesis)
+
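
A self-contained sketch of the same FIND-S idea on a small hand-made EnjoySport-style table (hypothetical data, since lab1.csv is not included in the diff): attributes shared by every positive example are kept, everything else collapses to '?'.

import numpy as np

# Four training examples; the label column of the original table is kept separate.
concepts = np.array([
    ['sunny', 'warm', 'normal', 'strong', 'warm', 'same'],
    ['sunny', 'warm', 'high',   'strong', 'warm', 'same'],
    ['rainy', 'cold', 'high',   'strong', 'warm', 'change'],
    ['sunny', 'warm', 'high',   'strong', 'cool', 'change'],
])
target = np.array(['yes', 'yes', 'no', 'yes'])

def find_s(concepts, target):
    # Start from the first positive example, then generalize attribute by attribute.
    specific_h = None
    for i, label in enumerate(target):
        if label.lower() == 'yes':
            if specific_h is None:
                specific_h = concepts[i].copy()
            else:
                specific_h = np.where(concepts[i] == specific_h, specific_h, '?')
    return specific_h

print(find_s(concepts, target))  # expected: ['sunny' 'warm' '?' 'strong' '?' '?']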
myawesomepkg/TSAPY1/p2_bcandidate_elimination_algorithm_(1).py
@@ -0,0 +1,85 @@
+ # -*- coding: utf-8 -*-
+ """P2-BCandidate Elimination algorithm (1).ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1J2QTr3esKQiO5tSmwN9heAyTb86Lzh7f
+ """
+
+ import numpy as np
+ import pandas as pd
+
+ data = pd.read_csv('/content/enjoysport.csv')
+ data
+
+ concepts = np.array(data.iloc[:, 0:-1])
+ print("\nInstances are:\n", concepts)
+ target = np.array(data.iloc[:, -1])
+ print("\nTarget Values are: ", target)
+
+ def learn(concepts, target):
+     specific_h = concepts[0].copy()
+     print("\nInitialization of specific_h and general_h")
+     print("\nSpecific Boundary: ", specific_h)
+     general_h = [["?" for i in range(len(specific_h))] for i in range(len(specific_h))]
+     print("\nGeneric Boundary: ", general_h)
+
+     for i, h in enumerate(concepts):
+         print("\nInstance", i+1, "is ", h)
+         if target[i] == "yes":
+             print("Instance is Positive ")
+             for x in range(len(specific_h)):
+                 if h[x] != specific_h[x]:
+                     specific_h[x] = '?'
+                     general_h[x][x] = '?'
+
+         if target[i] == "no":
+             print("Instance is Negative ")
+             for x in range(len(specific_h)):
+                 if h[x] != specific_h[x]:
+                     general_h[x][x] = specific_h[x]
+                 else:
+                     general_h[x][x] = '?'
+
+         print("Specific Boundary after ", i+1, "Instance is ", specific_h)
+         print("Generic Boundary after ", i+1, "Instance is ", general_h)
+         print("\n")
+
+     indices = [i for i, val in enumerate(general_h) if val == ['?', '?', '?', '?', '?', '?']]
+
+     for i in indices:
+         general_h.remove(['?', '?', '?', '?', '?', '?'])
+     return specific_h, general_h
+
+ s_final, g_final = learn(concepts, target)
+ print("Final Specific_h:", s_final, sep="\n")
+ print("Final General_h:", g_final, sep="\n")
+
+ # my code
+
+ import pandas as pd
+ import numpy as np
+
+ # Load dataset
+ data = pd.read_csv(r'/content/enjoysport.csv')
+
+ # Split into features and target
+ concepts = np.array(data.iloc[:, :-1])
+ target = np.array(data.iloc[:, -1])
+
+ def find_s(concepts, target):
+     for i, val in enumerate(target):
+         if val.lower() == 'yes':
+             specific_h = concepts[i].copy()
+             break
+
+     for i, val in enumerate(concepts):
+         if target[i].lower() == 'yes':
+             for x in range(len(specific_h)):
+                 if val[x] != specific_h[x]:
+                     specific_h[x] = '?'
+     return specific_h
+
+ print("Most specific hypothesis:", find_s(concepts, target))
+
myawesomepkg/TSAPY1/p3_1_least_square_regression.py
@@ -0,0 +1,105 @@
+ # -*- coding: utf-8 -*-
+ """P3-1-Least Square Regression.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1gU6rRYa8vNSqSFvH9dDuRKc80I1Oejvg
+ """
+
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+ plt.rcParams['figure.figsize'] = (12.0, 9.0)
+
+ # Preprocessing input data
+ data = pd.read_csv('/content/Data (2).csv')
+ X = data.iloc[:, 0]
+ Y = data.iloc[:, 1]
+ plt.scatter(X, Y)
+ plt.show()
+
+ # Building the model
+ # Calculating the means
+ X_mean = np.mean(X)
+ Y_mean = np.mean(Y)
+
+ cov = 0
+ var = 0
+ for i in range(len(X)):
+     cov += (X[i] - X_mean) * (Y[i] - Y_mean)  # accumulate the covariance of X and Y
+     var += (X[i] - X_mean) ** 2  # accumulate the variance of X
+ regcoeff = cov / var
+ intercept = Y_mean - regcoeff * X_mean
+
+ print(regcoeff, intercept)
+
+ # Making predictions
+ Y_pred = regcoeff * X + intercept
+
+ plt.scatter(X, Y)  # actual
+ plt.plot([min(X), max(X)], [min(Y_pred), max(Y_pred)], color='red')  # predicted line
+ plt.show()
+
+ # my code
+
+
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+ # Set plot size
+ plt.rcParams['figure.figsize'] = (10, 6)
+
+ # Load data
+ data = pd.read_csv(r'/content/Data (2).csv')
+
+ # Extract features and target
+ X = data.iloc[:, 0].values
+ Y = data.iloc[:, 1].values
+
+ # Scatter plot of original data
+ plt.scatter(X, Y, color='blue', label='Actual Data')
+ plt.title('Data Visualization')
+ plt.xlabel('X')
+ plt.ylabel('Y')
+ plt.legend()
+ plt.show()
+
+ # --- Linear Regression from Scratch ---
+
+ # Calculate means
+ X_mean = np.mean(X)
+ Y_mean = np.mean(Y)
+
+ # Calculate covariance and variance
+ cov = np.sum((X - X_mean) * (Y - Y_mean))
+ var = np.sum((X - X_mean) ** 2)
+
+ # Regression coefficients
+ reg_coeff = cov / var
+ intercept = Y_mean - reg_coeff * X_mean
+
+ print(f"Slope (regression coefficient): {reg_coeff:.4f}")
+ print(f"Intercept: {intercept:.4f}")
+
+ # Predictions
+ Y_pred = reg_coeff * X + intercept
+
+ # --- Model Evaluation ---
+ # R² Score
+ ss_total = np.sum((Y - Y_mean) ** 2)
+ ss_residual = np.sum((Y - Y_pred) ** 2)
+ r2_score = 1 - (ss_residual / ss_total)
+ print(f"R² Score: {r2_score:.4f}")
+
+ # --- Visualization ---
+ plt.scatter(X, Y, color='blue', label='Actual Data')
+ plt.plot(X, Y_pred, color='red', label='Regression Line')
+ plt.title('Simple Linear Regression')
+ plt.xlabel('X')
+ plt.ylabel('Y')
+ plt.legend()
+ plt.grid(True)
+ plt.show()
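
The slope and intercept computed above are the closed-form least-squares estimates: slope = cov(X, Y) / var(X) and intercept = mean(Y) - slope * mean(X). A minimal sketch checking that arithmetic against numpy.polyfit, which solves the same problem; synthetic data is used here because Data (2).csv is not bundled in the diff.

import numpy as np

# Synthetic data around a known line y = 3x + 2 (assumption for this check only).
rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=50)
Y = 3.0 * X + 2.0 + rng.normal(0, 1, size=50)

# Closed-form estimates, as in the script.
X_mean, Y_mean = X.mean(), Y.mean()
slope = np.sum((X - X_mean) * (Y - Y_mean)) / np.sum((X - X_mean) ** 2)
intercept = Y_mean - slope * X_mean

# numpy.polyfit with degree 1 returns [slope, intercept] for the same problem.
poly_slope, poly_intercept = np.polyfit(X, Y, deg=1)
print(np.allclose([slope, intercept], [poly_slope, poly_intercept]))  # expected: True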
myawesomepkg/TSAPY1/p3_2_logistic_regression_algorithm.py
@@ -0,0 +1,79 @@
+ # -*- coding: utf-8 -*-
+ """P3-2-Logistic Regression algorithm.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1jJpY0R35HT2AXPM9yNTJwX6JqW5Q6Zta
+ """
+
+ import pandas as pd
+ from matplotlib import pyplot as plt
+ df = pd.read_csv("/content/insurance_data.csv")
+ df.head()
+ plt.scatter(df.Age, df.have_insurance, marker='+', color='red')
+ plt.show()
+ from sklearn.model_selection import train_test_split
+ X_train, X_test, Y_train, Y_test = train_test_split(df[['Age']], df.have_insurance, test_size=0.1)
+ from sklearn.linear_model import LogisticRegression
+ model = LogisticRegression()
+ model.fit(X_train, Y_train)  # Training the model
+ model.predict(X_test)  # Prediction on the test set
+ model.score(X_test, Y_test)  # Calculating the accuracy
+ model.predict_proba(X_test)  # Predicting the probabilities
+
+ # my code
+
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
+
+ # Load dataset
+ df = pd.read_csv(r'/content/insurance_data.csv')
+
+ # Display first few rows
+ print("Dataset Preview:\n", df.head())
+
+ # Visualize the data
+ plt.figure(figsize=(8, 5))
+ plt.scatter(df['Age'], df['have_insurance'], marker='+', color='red')
+ plt.title('Insurance Data Distribution')
+ plt.xlabel('Age')
+ plt.ylabel('Has Insurance (0 = No, 1 = Yes)')
+ plt.grid(True)
+ plt.show()
+
+ # Split data into train and test
+ X_train, X_test, y_train, y_test = train_test_split(
+     df[['Age']], df['have_insurance'], test_size=0.2, random_state=42
+ )
+
+ # Train Logistic Regression model
+ model = LogisticRegression()
+ model.fit(X_train, y_train)
+
+ # Predictions
+ y_pred = model.predict(X_test)
+ y_prob = model.predict_proba(X_test)[:, 1]
+
+ # Model evaluation
+ accuracy = accuracy_score(y_test, y_pred)
+ print(f"✅ Model Accuracy: {accuracy:.4f}\n")
+
+ print("🔹 Classification Report:\n", classification_report(y_test, y_pred))
+ print("🔹 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
+
+ # Plot logistic curve
+ plt.figure(figsize=(8, 5))
+ plt.scatter(df['Age'], df['have_insurance'], color='blue', label='Actual Data')
+ X_range = pd.DataFrame({'Age': range(int(df['Age'].min()), int(df['Age'].max()))})
+ plt.plot(X_range['Age'], model.predict_proba(X_range)[:, 1], color='red', label='Logistic Curve')
+ plt.title('Logistic Regression Fit')
+ plt.xlabel('Age')
+ plt.ylabel('Probability of Having Insurance')
+ plt.legend()
+ plt.grid(True)
+ plt.show()
+
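
With a single Age feature, the fitted model crosses probability 0.5 where coef * age + intercept = 0, i.e. at age = -intercept / coef. A minimal sketch reading that boundary off a trained LogisticRegression; the toy data below is hypothetical and only mirrors the Age / have_insurance column layout used in the script.

import pandas as pd
from sklearn.linear_model import LogisticRegression

# Hypothetical stand-in for insurance_data.csv (same two columns).
df = pd.DataFrame({
    "Age": [22, 25, 28, 30, 35, 40, 45, 50, 55, 60],
    "have_insurance": [0, 0, 0, 0, 1, 0, 1, 1, 1, 1],
})

model = LogisticRegression()
model.fit(df[["Age"]], df["have_insurance"])

# Decision boundary for a one-feature logistic model.
boundary_age = -model.intercept_[0] / model.coef_[0][0]
print(f"P(insurance) crosses 0.5 at roughly age {boundary_age:.1f}")

# Sanity check: just above the boundary the predicted probability should exceed 0.5.
check = pd.DataFrame({"Age": [boundary_age + 1]})
print(model.predict_proba(check)[0, 1] > 0.5)  # expected: True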
myawesomepkg/TSAPY1/p5_1_hierarchical_clustering.py
@@ -0,0 +1,143 @@
+ # -*- coding: utf-8 -*-
+ """P5-1 Hierarchical Clustering.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1-t2ZOVavQ3YXZC9-s7tvo3uI0acgS2C5
+ """
+
+ import pandas as pd
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from sklearn.cluster import AgglomerativeClustering
+ from scipy.cluster import hierarchy
+
+ # Load the Penguins dataset
+ data = pd.read_csv("/content/penguins.csv")
+ data.head()
+
+ data['species'].value_counts()
+
+ print(data.shape)  # (344, 9)
+
+ # Trim the dataset to the chosen columns and drop rows with missing data
+ df = data[['bill_length_mm', 'flipper_length_mm']]
+ df = df.dropna(axis=0)
+
+ df.head()
+
+ # Use SciPy's hierarchy.linkage() to form clusters and plot them with hierarchy.dendrogram()
+
+ clusters = hierarchy.linkage(df, method="ward")
+
+ plt.figure(figsize=(8, 6))
+ dendrogram = hierarchy.dendrogram(clusters)
+ # Plot a horizontal line at the first biggest distance between clusters
+ plt.axhline(150, color='red', linestyle='--');
+ # Plot a horizontal line at the second biggest distance between clusters
+ plt.axhline(100, color='crimson');
+
+ # Perform Agglomerative Clustering with scikit-learn to find cluster labels for the three types of penguins
+ clustering_model = AgglomerativeClustering(n_clusters=3, linkage="ward")
+ clustering_model.fit(df)
+ labels = clustering_model.labels_
+
+ # Plot the data before and after Agglomerative Clustering with 3 clusters
+ fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
+ sns.scatterplot(ax=axes[0], data=df, x='bill_length_mm', y='flipper_length_mm').set_title('Without clustering')
+ sns.scatterplot(ax=axes[1], data=df, x='bill_length_mm', y='flipper_length_mm', hue=clustering_model.labels_).set_title('With clustering');
+
+ # Agglomerative Clustering without specifying the number of clusters
+ clustering_model_no_clusters = AgglomerativeClustering(linkage="ward")
+ clustering_model_no_clusters.fit(df)
+ labels_no_clusters = clustering_model_no_clusters.labels_
+
+ # Plot the data without clustering, with 3 clusters, and with no predefined number of clusters
+ fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
+ sns.scatterplot(ax=axes[0], data=df, x='bill_length_mm', y='flipper_length_mm').set_title('Without clustering')
+ sns.scatterplot(ax=axes[1], data=df, x='bill_length_mm', y='flipper_length_mm', hue=clustering_model.labels_).set_title('With 3 clusters')
+ sns.scatterplot(ax=axes[2], data=df, x='bill_length_mm', y='flipper_length_mm', hue=clustering_model_no_clusters.labels_).set_title('Without choosing number of clusters');
+
+ # my code
+
+ import pandas as pd
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from sklearn.cluster import AgglomerativeClustering
+ from scipy.cluster import hierarchy
+
+ # ============================
+ # 1️⃣ Load & Prepare Dataset
+ # ============================
+ data = pd.read_csv("/content/penguins.csv")
+
+ print("Dataset shape:", data.shape)
+ print("\nSpecies count:\n", data['species'].value_counts())
+
+ # Select relevant numerical features and drop missing rows
+ df = data[['bill_length_mm', 'flipper_length_mm']].dropna()
+ print("\nCleaned dataset shape:", df.shape)
+ print(df.head())
+
+ # ============================
+ # 2️⃣ Create Dendrogram (Hierarchical Tree)
+ # ============================
+ plt.figure(figsize=(10, 6))
+ clusters = hierarchy.linkage(df, method="ward")
+
+ # Plot the dendrogram
+ dendrogram = hierarchy.dendrogram(clusters, color_threshold=100)
+ plt.axhline(150, color='red', linestyle='--', label='Threshold 150')
+ plt.axhline(100, color='orange', linestyle='--', label='Threshold 100')
+ plt.title("Hierarchical Clustering Dendrogram (Ward’s Method)")
+ plt.xlabel("Data Points")
+ plt.ylabel("Euclidean Distance")
+ plt.legend()
+ plt.grid(True)
+ plt.show()
+
+ # ============================
+ # 3️⃣ Agglomerative Clustering (with 3 clusters)
+ # ============================
+ agg3 = AgglomerativeClustering(n_clusters=3, linkage="ward")
+ labels_3 = agg3.fit_predict(df)
+
+ # Add cluster labels to dataframe
+ df['Cluster_3'] = labels_3
+
+ # ============================
+ # 4️⃣ Visualization — With and Without Clustering
+ # ============================
+ fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+ sns.scatterplot(ax=axes[0], data=df, x='bill_length_mm', y='flipper_length_mm', color='gray')
+ axes[0].set_title("Without Clustering")
+
+ sns.scatterplot(ax=axes[1], data=df, x='bill_length_mm', y='flipper_length_mm', hue='Cluster_3', palette='viridis', s=70)
+ axes[1].set_title("With 3 Clusters")
+ plt.tight_layout()
+ plt.show()
+
+ # ============================
+ # 5️⃣ Agglomerative Clustering (without predefined clusters)
+ # ============================
+ agg_auto = AgglomerativeClustering(linkage="ward")
+ labels_auto = agg_auto.fit_predict(df[['bill_length_mm', 'flipper_length_mm']])
+ df['Cluster_auto'] = labels_auto
+
+ # ============================
+ # 6️⃣ Compare — No Clustering vs 3 Clusters vs Auto
+ # ============================
+ fig, axes = plt.subplots(1, 3, figsize=(18, 5))
+
+ sns.scatterplot(ax=axes[0], data=df, x='bill_length_mm', y='flipper_length_mm', color='gray')
+ axes[0].set_title("Without Clustering")
+
+ sns.scatterplot(ax=axes[1], data=df, x='bill_length_mm', y='flipper_length_mm', hue='Cluster_3', palette='cool', s=70)
+ axes[1].set_title("With 3 Clusters")
+
+ sns.scatterplot(ax=axes[2], data=df, x='bill_length_mm', y='flipper_length_mm', hue='Cluster_auto', palette='plasma', s=70)
+ axes[2].set_title("Auto Clustering (No Predefined k)")
+
+ plt.tight_layout()
+ plt.show()
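
The dendrogram cell above draws threshold lines at distances 150 and 100 but never extracts the corresponding flat clusters. A minimal sketch of how SciPy's hierarchy.fcluster can cut the same Ward linkage at those distances, under the same assumptions about /content/penguins.csv as the script.

import pandas as pd
from scipy.cluster import hierarchy

data = pd.read_csv("/content/penguins.csv")
df = data[['bill_length_mm', 'flipper_length_mm']].dropna()

# Same Ward linkage as in the script above.
linkage = hierarchy.linkage(df, method="ward")

# Cutting at a distance threshold yields flat cluster labels (1..k).
for threshold in (150, 100):
    labels = hierarchy.fcluster(linkage, t=threshold, criterion="distance")
    print(f"distance threshold {threshold}: {labels.max()} clusters")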