myawesomepkg 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. myawesomepkg/TSAPY1/1 (A) Working with Numpy Arrays.py +1146 -0
  2. myawesomepkg/TSAPY1/1(B)Aggregation (1).py +319 -0
  3. myawesomepkg/TSAPY1/1(C) Broadcasting .py +328 -0
  4. myawesomepkg/TSAPY1/10-A_Load_stringr.py +77 -0
  5. myawesomepkg/TSAPY1/10-B_Forcats.py +70 -0
  6. myawesomepkg/TSAPY1/2(a) Comparison, Masking And Boolean Logic (1).py +497 -0
  7. myawesomepkg/TSAPY1/2(b)Fancy Indexing.py +594 -0
  8. myawesomepkg/TSAPY1/2(c) Sorting Arrays.py +528 -0
  9. myawesomepkg/TSAPY1/2(d) Structured Array.py +350 -0
  10. myawesomepkg/TSAPY1/3 (A) Handling Missing Data.py +1013 -0
  11. myawesomepkg/TSAPY1/4A_Merge_Joins.py +1209 -0
  12. myawesomepkg/TSAPY1/9A_Dplyr.py +85 -0
  13. myawesomepkg/TSAPY1/9B_Tidyr.py +71 -0
  14. myawesomepkg/TSAPY1/Aggregation_Groupin_Pivot_Filter_Vectorice_Time_Series.py +1999 -0
  15. myawesomepkg/TSAPY1/Combining_Joins.py +1209 -0
  16. myawesomepkg/TSAPY1/P4-1-different_distance_methods_(euclidean)_with_prediction,_test_score_and_confusion_matrix1.py +131 -0
  17. myawesomepkg/TSAPY1/P4-2-k_means_clustering_with_prediction,_test_score_and_confusion_matrix2.py +150 -0
  18. myawesomepkg/TSAPY1/Pract3_C.py +482 -0
  19. myawesomepkg/TSAPY1/Pract5_Data_Visualization.py +481 -0
  20. myawesomepkg/TSAPY1/Practical 6.py +860 -0
  21. myawesomepkg/TSAPY1/Practical No 1.py +148 -0
  22. myawesomepkg/TSAPY1/Practical No 2.py +115 -0
  23. myawesomepkg/TSAPY1/Practical No 3.py +168 -0
  24. myawesomepkg/TSAPY1/Practical No 4 A.py +233 -0
  25. myawesomepkg/TSAPY1/Practical No 4 B.py +137 -0
  26. myawesomepkg/TSAPY1/Practical No 5.py +52 -0
  27. myawesomepkg/TSAPY1/Practical No 6.py +29 -0
  28. myawesomepkg/TSAPY1/Practical No 7.py +67 -0
  29. myawesomepkg/TSAPY1/Practical No 8.py +108 -0
  30. myawesomepkg/TSAPY1/Print_R.py +123 -0
  31. myawesomepkg/TSAPY1/R_Graph.py +32 -0
  32. myawesomepkg/TSAPY1/Working_Ggplot.py +53 -0
  33. myawesomepkg/TSAPY1/__init__.py +0 -0
  34. myawesomepkg/TSAPY1/p1_2_pca_iris.py +141 -0
  35. myawesomepkg/TSAPY1/p2_1_find_s.py +78 -0
  36. myawesomepkg/TSAPY1/p2_bcandidate_elimination_algorithm_(1).py +85 -0
  37. myawesomepkg/TSAPY1/p3_1_least_square_regression.py +105 -0
  38. myawesomepkg/TSAPY1/p3_2_logistic_regression_algorithm.py +79 -0
  39. myawesomepkg/TSAPY1/p5_1_hierarchical_clustering.py +143 -0
  40. myawesomepkg/TSAPY1/p5_2_k_nearest_neighbour_algorithm.py +104 -0
  41. myawesomepkg/TSAPY1/p6_1_id3_algorithm_.py +199 -0
  42. myawesomepkg/TSAPY1/p7_1_ann_backpropagation_algorithm.py +116 -0
  43. myawesomepkg/TSAPY1/p7_2_bds_association_rule_mining.py +99 -0
  44. myawesomepkg/TSAPY1/p8_1_gaussian_naive_bayes_.py +97 -0
  45. myawesomepkg/TSAPY1/p8_2_naive_bayes_document_classifier.py +111 -0
  46. myawesomepkg/TSAPY1/p9_1bayesian_network.py +91 -0
  47. myawesomepkg/TSAPY1/p9_b_loess_regression.py +113 -0
  48. myawesomepkg/TSAPY1/p_1_test_and_train.py +98 -0
  49. myawesomepkg/TSAPY1/pract3A-B.py +3212 -0
  50. myawesomepkg/TSAPY1/practical_no_3.py +167 -0
  51. myawesomepkg/TSAPY1/practical_no_4.py +215 -0
  52. myawesomepkg/TSAPY1/practical_no_4b.py +78 -0
  53. myawesomepkg/TSAPY1/practical_no_5_ac_and_pca.py +39 -0
  54. myawesomepkg/TSAPY1/practical_no_6.py +37 -0
  55. myawesomepkg/TSAPY1/practical_no_7.py +69 -0
  56. myawesomepkg/TSAPY1/practical_no_8.py +79 -0
  57. myawesomepkg/TSAPY1/tsa_practical_no_1.py +287 -0
  58. myawesomepkg/TSAPY1/tsa_practical_no_2.py +121 -0
  59. myawesomepkg/__init__.py +1 -0
  60. myawesomepkg/core.py +2 -0
  61. myawesomepkg-0.1.8.dist-info/METADATA +17 -0
  62. myawesomepkg-0.1.8.dist-info/RECORD +64 -0
  63. myawesomepkg-0.1.8.dist-info/WHEEL +5 -0
  64. myawesomepkg-0.1.8.dist-info/top_level.txt +1 -0
myawesomepkg/TSAPY1/p1_2_pca_iris.py
@@ -0,0 +1,141 @@
+ # -*- coding: utf-8 -*-
+ """P1-2-PCA Iris.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1PyEQgfiZYSfkl9j1bYi61d2LQwF86D3U
+ """
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.preprocessing import StandardScaler
+ sc = StandardScaler()
+
+ def PCA(X, num_components):
+
+     # Step 1: standardize the data (mean-centering variant kept for reference)
+     # X_meaned = X - np.mean(X, axis=0)
+     X_meaned = sc.fit_transform(X)
+
+     # Step 2: covariance matrix of the standardized data
+     cov_mat = np.cov(X_meaned, rowvar=False)
+
+     # Step 3: eigenvalues and eigenvectors
+     eigen_values, eigen_vectors = np.linalg.eigh(cov_mat)
+
+     # Step 4: sort both in descending order of eigenvalue
+     sorted_index = np.argsort(eigen_values)[::-1]
+     sorted_eigenvalue = eigen_values[sorted_index]
+     sorted_eigenvectors = eigen_vectors[:, sorted_index]
+
+     # Step 5: keep the leading eigenvectors
+     eigenvector_subset = sorted_eigenvectors[:, 0:num_components]
+
+     # Step 6: project the data onto them
+     X_reduced = np.dot(eigenvector_subset.transpose(), X_meaned.transpose()).transpose()
+
+     return X_reduced
+
+ data = pd.read_csv("/content/iris.csv")
+ data.head()
+
+ # Prepare the data
+ x = data.iloc[:, 0:4]
+
+ # Prepare the target
+ target = data.iloc[:, 4]
+
+ x.head()
+
+ target.head()
+
+ # Apply the PCA function
+ mat_reduced = PCA(x, 2)
+
+ # Create a pandas DataFrame of the reduced dataset
+ principal_df = pd.DataFrame(mat_reduced, columns=['PC1', 'PC2'])
+
+ # Display the principal components
+ principal_df.head()
+
+ # Concatenate with the target variable to create a complete dataset
+ principal_df = pd.concat([principal_df, pd.DataFrame(target)], axis=1)
+
+ principal_df.head()
+
+ import seaborn as sb
+ import matplotlib.pyplot as plt
+
+ plt.figure(figsize=(6, 6))
+ sb.scatterplot(data=principal_df, x='PC1', y='PC2', hue="variety", s=60, palette='icefire')
+ # sb.scatterplot(data=principal_df, x='PC1', y='PC2', s=60, palette='icefire')
+
+ # my code
+
+ # Import necessary libraries
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from sklearn.preprocessing import StandardScaler
+
+ # Initialize scaler
+ scaler = StandardScaler()
+
+ # Define PCA function
+ def PCA(X, num_components):
+     # Step 1: Standardize the dataset
+     X_scaled = scaler.fit_transform(X)
+
+     # Step 2: Compute covariance matrix
+     covariance_matrix = np.cov(X_scaled, rowvar=False)
+
+     # Step 3: Compute eigenvalues and eigenvectors
+     eigen_values, eigen_vectors = np.linalg.eigh(covariance_matrix)
+
+     # Step 4: Sort eigenvalues and eigenvectors in descending order
+     sorted_indices = np.argsort(eigen_values)[::-1]
+     eigen_values = eigen_values[sorted_indices]
+     eigen_vectors = eigen_vectors[:, sorted_indices]
+
+     # Step 5: Select top components
+     eigenvector_subset = eigen_vectors[:, :num_components]
+
+     # Step 6: Project data onto principal components
+     X_reduced = np.dot(X_scaled, eigenvector_subset)
+
+     return X_reduced, eigen_values
+
+ # Load dataset
+ data = pd.read_csv(r"/content/iris.csv")
+
+ # Separate features and target
+ X = data.iloc[:, :-1]
+ y = data.iloc[:, -1]
+
+ # Apply PCA
+ X_reduced, eigen_values = PCA(X, 2)
+
+ # Create a DataFrame with principal components
+ pca_df = pd.DataFrame(X_reduced, columns=['PC1', 'PC2'])
+ pca_df['Variety'] = y
+
+ # Display first few rows
+ print(pca_df.head())
+
+ # Variance explained by each component
+ explained_variance = (eigen_values / np.sum(eigen_values)) * 100
+ print("\nExplained Variance (%):")
+ print(explained_variance[:2])
+
+ # Visualization
+ plt.figure(figsize=(7, 6))
+ sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='Variety', s=80, palette='viridis')
+ plt.title('PCA on Iris Dataset', fontsize=14)
+ plt.xlabel(f'PC1 ({explained_variance[0]:.2f}% variance)')
+ plt.ylabel(f'PC2 ({explained_variance[1]:.2f}% variance)')
+ plt.legend(title='Variety')
+ plt.grid(True, linestyle='--', alpha=0.6)
+ plt.show()
+
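
A minimal cross-check of the hand-rolled PCA above against scikit-learn's built-in implementation, not part of the packaged file. It assumes the same /content/iris.csv layout with four numeric feature columns, and compares absolute values because the sign of each component is arbitrary.

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as SkPCA

data = pd.read_csv("/content/iris.csv")
X = data.iloc[:, 0:4]

# Same preprocessing as the script: standardize, then eigendecompose the covariance matrix.
X_scaled = StandardScaler().fit_transform(X)
cov = np.cov(X_scaled, rowvar=False)
vals, vecs = np.linalg.eigh(cov)
order = np.argsort(vals)[::-1]
manual = X_scaled @ vecs[:, order[:2]]

# Library projection on the same standardized data.
sk = SkPCA(n_components=2).fit_transform(X_scaled)

# Component signs are arbitrary, so compare magnitudes column by column.
print(np.allclose(np.abs(manual), np.abs(sk), atol=1e-6))  # expected: True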
myawesomepkg/TSAPY1/p2_1_find_s.py
@@ -0,0 +1,78 @@
+ # -*- coding: utf-8 -*-
+ """P2-1 Find-S.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1f8yD_lr15trt6SrY0k7GkzBGPc-iHKDR
+ """
+
+ import pandas as pd
+ import numpy as np
+
+ data = pd.read_csv('/content/lab1.csv')
+ data
+
+ concepts = np.array(data)[:, :-1]
+ concepts
+
+ target = np.array(data)[:, -1]
+ target
+
+ def train(con, tar):
+     for i, val in enumerate(tar):
+         if val == 'yes':
+             specific_h = con[i].copy()
+             break
+     for i, val in enumerate(con):
+         if tar[i] == 'yes':
+             for x in range(len(specific_h)):
+                 if val[x] != specific_h[x]:
+                     specific_h[x] = '?'
+                 else:
+                     pass
+     return specific_h
+
+ print(train(concepts, target))
+
+ # my code
+
+ # ✅ Improved FIND-S Algorithm Implementation
+
+ import pandas as pd
+ import numpy as np
+
+ # Load dataset
+ data = pd.read_csv(r"/content/lab1.csv")
+
+ # Display the data
+ print("Dataset:\n", data, "\n")
+
+ # Separate features (concepts) and target
+ concepts = data.iloc[:, :-1].values
+ target = data.iloc[:, -1].values
+
+ # FIND-S algorithm definition
+ def find_s(concepts, target):
+     # Step 1: Initialize specific hypothesis with the first positive example
+     specific_h = None
+     for i, val in enumerate(target):
+         if val.lower() == 'yes':
+             specific_h = concepts[i].copy()
+             break
+
+     # Step 2: Generalize only when necessary
+     for i, val in enumerate(target):
+         if val.lower() == 'yes':
+             for j in range(len(specific_h)):
+                 if concepts[i][j] != specific_h[j]:
+                     specific_h[j] = '?'
+
+     return specific_h
+
+ # Train the model
+ final_hypothesis = find_s(concepts, target)
+
+ # Display results
+ print("✅ Final Specific Hypothesis:\n", final_hypothesis)
+
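
A self-contained sketch of the same FIND-S idea on a small hand-made EnjoySport-style table (hypothetical data, since lab1.csv is not included in the diff): attributes shared by every positive example are kept, everything else collapses to '?'.

import numpy as np

# Four training examples; the label column of the original table is kept separate.
concepts = np.array([
    ['sunny', 'warm', 'normal', 'strong', 'warm', 'same'],
    ['sunny', 'warm', 'high',   'strong', 'warm', 'same'],
    ['rainy', 'cold', 'high',   'strong', 'warm', 'change'],
    ['sunny', 'warm', 'high',   'strong', 'cool', 'change'],
])
target = np.array(['yes', 'yes', 'no', 'yes'])

def find_s(concepts, target):
    # Start from the first positive example, then generalize attribute by attribute.
    specific_h = None
    for i, label in enumerate(target):
        if label.lower() == 'yes':
            if specific_h is None:
                specific_h = concepts[i].copy()
            else:
                specific_h = np.where(concepts[i] == specific_h, specific_h, '?')
    return specific_h

print(find_s(concepts, target))  # expected: ['sunny' 'warm' '?' 'strong' '?' '?']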
myawesomepkg/TSAPY1/p2_bcandidate_elimination_algorithm_(1).py
@@ -0,0 +1,85 @@
+ # -*- coding: utf-8 -*-
+ """P2-BCandidate Elimination algorithm (1).ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1J2QTr3esKQiO5tSmwN9heAyTb86Lzh7f
+ """
+
+ import numpy as np
+ import pandas as pd
+
+ data = pd.read_csv('/content/enjoysport.csv')
+ data
+
+ concepts = np.array(data.iloc[:, 0:-1])
+ print("\nInstances are:\n", concepts)
+ target = np.array(data.iloc[:, -1])
+ print("\nTarget Values are: ", target)
+
+ def learn(concepts, target):
+     specific_h = concepts[0].copy()
+     print("\nInitialization of specific_h and general_h")
+     print("\nSpecific Boundary: ", specific_h)
+     general_h = [["?" for i in range(len(specific_h))] for i in range(len(specific_h))]
+     print("\nGeneric Boundary: ", general_h)
+
+     for i, h in enumerate(concepts):
+         print("\nInstance", i+1, "is ", h)
+         if target[i] == "yes":
+             print("Instance is Positive ")
+             for x in range(len(specific_h)):
+                 if h[x] != specific_h[x]:
+                     specific_h[x] = '?'
+                     general_h[x][x] = '?'
+
+         if target[i] == "no":
+             print("Instance is Negative ")
+             for x in range(len(specific_h)):
+                 if h[x] != specific_h[x]:
+                     general_h[x][x] = specific_h[x]
+                 else:
+                     general_h[x][x] = '?'
+
+         print("Specific Boundary after ", i+1, "Instance is ", specific_h)
+         print("Generic Boundary after ", i+1, "Instance is ", general_h)
+         print("\n")
+
+     indices = [i for i, val in enumerate(general_h) if val == ['?', '?', '?', '?', '?', '?']]
+
+     for i in indices:
+         general_h.remove(['?', '?', '?', '?', '?', '?'])
+     return specific_h, general_h
+
+ s_final, g_final = learn(concepts, target)
+ print("Final Specific_h:", s_final, sep="\n")
+ print("Final General_h:", g_final, sep="\n")
+
+ # my code
+
+ import pandas as pd
+ import numpy as np
+
+ # Load dataset
+ data = pd.read_csv(r'/content/enjoysport.csv')
+
+ # Split into features and target
+ concepts = np.array(data.iloc[:, :-1])
+ target = np.array(data.iloc[:, -1])
+
+ def find_s(concepts, target):
+     for i, val in enumerate(target):
+         if val.lower() == 'yes':
+             specific_h = concepts[i].copy()
+             break
+
+     for i, val in enumerate(concepts):
+         if target[i].lower() == 'yes':
+             for x in range(len(specific_h)):
+                 if val[x] != specific_h[x]:
+                     specific_h[x] = '?'
+     return specific_h
+
+ print("Most specific hypothesis:", find_s(concepts, target))
+
myawesomepkg/TSAPY1/p3_1_least_square_regression.py
@@ -0,0 +1,105 @@
+ # -*- coding: utf-8 -*-
+ """P3-1-Least Square Regression.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1gU6rRYa8vNSqSFvH9dDuRKc80I1Oejvg
+ """
+
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+ plt.rcParams['figure.figsize'] = (12.0, 9.0)
+
+ # Preprocessing input data
+ data = pd.read_csv('/content/Data (2).csv')
+ X = data.iloc[:, 0]
+ Y = data.iloc[:, 1]
+ plt.scatter(X, Y)
+ plt.show()
+
+ # Building the model
+ # Calculating the means
+ X_mean = np.mean(X)
+ Y_mean = np.mean(Y)
+
+ cov = 0
+ var = 0
+ for i in range(len(X)):
+     cov += (X[i] - X_mean) * (Y[i] - Y_mean)  # accumulate the covariance of X and Y
+     var += (X[i] - X_mean) ** 2  # accumulate the variance of X
+ regcoeff = cov / var
+ intercept = Y_mean - regcoeff * X_mean
+
+ print(regcoeff, intercept)
+
+ # Making predictions
+ Y_pred = regcoeff * X + intercept
+
+ plt.scatter(X, Y)  # actual
+ plt.plot([min(X), max(X)], [min(Y_pred), max(Y_pred)], color='red')  # predicted line
+ plt.show()
+
+ # my code
+
+
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+ # Set plot size
+ plt.rcParams['figure.figsize'] = (10, 6)
+
+ # Load data
+ data = pd.read_csv(r'/content/Data (2).csv')
+
+ # Extract features and target
+ X = data.iloc[:, 0].values
+ Y = data.iloc[:, 1].values
+
+ # Scatter plot of original data
+ plt.scatter(X, Y, color='blue', label='Actual Data')
+ plt.title('Data Visualization')
+ plt.xlabel('X')
+ plt.ylabel('Y')
+ plt.legend()
+ plt.show()
+
+ # --- Linear Regression from Scratch ---
+
+ # Calculate means
+ X_mean = np.mean(X)
+ Y_mean = np.mean(Y)
+
+ # Calculate covariance and variance
+ cov = np.sum((X - X_mean) * (Y - Y_mean))
+ var = np.sum((X - X_mean) ** 2)
+
+ # Regression coefficients
+ reg_coeff = cov / var
+ intercept = Y_mean - reg_coeff * X_mean
+
+ print(f"Slope (regression coefficient): {reg_coeff:.4f}")
+ print(f"Intercept: {intercept:.4f}")
+
+ # Predictions
+ Y_pred = reg_coeff * X + intercept
+
+ # --- Model Evaluation ---
+ # R² Score
+ ss_total = np.sum((Y - Y_mean) ** 2)
+ ss_residual = np.sum((Y - Y_pred) ** 2)
+ r2_score = 1 - (ss_residual / ss_total)
+ print(f"R² Score: {r2_score:.4f}")
+
+ # --- Visualization ---
+ plt.scatter(X, Y, color='blue', label='Actual Data')
+ plt.plot(X, Y_pred, color='red', label='Regression Line')
+ plt.title('Simple Linear Regression')
+ plt.xlabel('X')
+ plt.ylabel('Y')
+ plt.legend()
+ plt.grid(True)
+ plt.show()
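
The slope and intercept computed above are the closed-form least-squares estimates: slope = cov(X, Y) / var(X) and intercept = mean(Y) - slope * mean(X). A minimal sketch checking that arithmetic against numpy.polyfit, which solves the same problem; synthetic data is used here because Data (2).csv is not bundled in the diff.

import numpy as np

# Synthetic data around a known line y = 3x + 2 (assumption for this check only).
rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=50)
Y = 3.0 * X + 2.0 + rng.normal(0, 1, size=50)

# Closed-form estimates, as in the script.
X_mean, Y_mean = X.mean(), Y.mean()
slope = np.sum((X - X_mean) * (Y - Y_mean)) / np.sum((X - X_mean) ** 2)
intercept = Y_mean - slope * X_mean

# numpy.polyfit with degree 1 returns [slope, intercept] for the same problem.
poly_slope, poly_intercept = np.polyfit(X, Y, deg=1)
print(np.allclose([slope, intercept], [poly_slope, poly_intercept]))  # expected: True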
myawesomepkg/TSAPY1/p3_2_logistic_regression_algorithm.py
@@ -0,0 +1,79 @@
+ # -*- coding: utf-8 -*-
+ """P3-2-Logistic Regression algorithm.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1jJpY0R35HT2AXPM9yNTJwX6JqW5Q6Zta
+ """
+
+ import pandas as pd
+ from matplotlib import pyplot as plt
+ df = pd.read_csv("/content/insurance_data.csv")
+ df.head()
+ plt.scatter(df.Age, df.have_insurance, marker='+', color='red')
+ plt.show()
+ from sklearn.model_selection import train_test_split
+ X_train, X_test, Y_train, Y_test = train_test_split(df[['Age']], df.have_insurance, test_size=0.1)
+ from sklearn.linear_model import LogisticRegression
+ model = LogisticRegression()
+ model.fit(X_train, Y_train)  # Training the model
+ model.predict(X_test)  # Prediction on the test set
+ model.score(X_test, Y_test)  # Calculating the accuracy
+ model.predict_proba(X_test)  # Predicting the probabilities
+
+ # my code
+
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
+
+ # Load dataset
+ df = pd.read_csv(r'/content/insurance_data.csv')
+
+ # Display first few rows
+ print("Dataset Preview:\n", df.head())
+
+ # Visualize the data
+ plt.figure(figsize=(8, 5))
+ plt.scatter(df['Age'], df['have_insurance'], marker='+', color='red')
+ plt.title('Insurance Data Distribution')
+ plt.xlabel('Age')
+ plt.ylabel('Has Insurance (0 = No, 1 = Yes)')
+ plt.grid(True)
+ plt.show()
+
+ # Split data into train and test
+ X_train, X_test, y_train, y_test = train_test_split(
+     df[['Age']], df['have_insurance'], test_size=0.2, random_state=42
+ )
+
+ # Train Logistic Regression model
+ model = LogisticRegression()
+ model.fit(X_train, y_train)
+
+ # Predictions
+ y_pred = model.predict(X_test)
+ y_prob = model.predict_proba(X_test)[:, 1]
+
+ # Model evaluation
+ accuracy = accuracy_score(y_test, y_pred)
+ print(f"✅ Model Accuracy: {accuracy:.4f}\n")
+
+ print("🔹 Classification Report:\n", classification_report(y_test, y_pred))
+ print("🔹 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
+
+ # Plot logistic curve
+ plt.figure(figsize=(8, 5))
+ plt.scatter(df['Age'], df['have_insurance'], color='blue', label='Actual Data')
+ X_range = pd.DataFrame({'Age': range(int(df['Age'].min()), int(df['Age'].max()))})
+ plt.plot(X_range['Age'], model.predict_proba(X_range)[:, 1], color='red', label='Logistic Curve')
+ plt.title('Logistic Regression Fit')
+ plt.xlabel('Age')
+ plt.ylabel('Probability of Having Insurance')
+ plt.legend()
+ plt.grid(True)
+ plt.show()
+
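
With a single Age feature, the fitted model crosses probability 0.5 where coef * age + intercept = 0, i.e. at age = -intercept / coef. A minimal sketch reading that boundary off a trained LogisticRegression; the toy data below is hypothetical and only mirrors the Age / have_insurance column layout used in the script.

import pandas as pd
from sklearn.linear_model import LogisticRegression

# Hypothetical stand-in for insurance_data.csv (same two columns).
df = pd.DataFrame({
    "Age": [22, 25, 28, 30, 35, 40, 45, 50, 55, 60],
    "have_insurance": [0, 0, 0, 0, 1, 0, 1, 1, 1, 1],
})

model = LogisticRegression()
model.fit(df[["Age"]], df["have_insurance"])

# Decision boundary for a one-feature logistic model.
boundary_age = -model.intercept_[0] / model.coef_[0][0]
print(f"P(insurance) crosses 0.5 at roughly age {boundary_age:.1f}")

# Sanity check: just above the boundary the predicted probability should exceed 0.5.
check = pd.DataFrame({"Age": [boundary_age + 1]})
print(model.predict_proba(check)[0, 1] > 0.5)  # expected: True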
myawesomepkg/TSAPY1/p5_1_hierarchical_clustering.py
@@ -0,0 +1,143 @@
+ # -*- coding: utf-8 -*-
+ """P5-1 Hierarchical Clustering.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1-t2ZOVavQ3YXZC9-s7tvo3uI0acgS2C5
+ """
+
+ import pandas as pd
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from sklearn.cluster import AgglomerativeClustering
+ from scipy.cluster import hierarchy
+
+ # Load the Penguins dataset
+ data = pd.read_csv("/content/penguins.csv")
+ data.head()
+
+ data['species'].value_counts()
+
+ print(data.shape)  # (344, 9)
+
+ # Trim the dataset to the chosen columns and drop rows with missing data
+ df = data[['bill_length_mm', 'flipper_length_mm']]
+ df = df.dropna(axis=0)
+
+ df.head()
+
+ # Use SciPy's hierarchy.linkage() to form clusters and plot them with hierarchy.dendrogram()
+
+ clusters = hierarchy.linkage(df, method="ward")
+
+ plt.figure(figsize=(8, 6))
+ dendrogram = hierarchy.dendrogram(clusters)
+ # Plot a horizontal line at the first biggest distance between clusters
+ plt.axhline(150, color='red', linestyle='--');
+ # Plot a horizontal line at the second biggest distance between clusters
+ plt.axhline(100, color='crimson');
+
+ # Perform Agglomerative Clustering with scikit-learn to find cluster labels for the three types of penguins
+ clustering_model = AgglomerativeClustering(n_clusters=3, linkage="ward")
+ clustering_model.fit(df)
+ labels = clustering_model.labels_
+
+ # Plot the data before and after Agglomerative Clustering with 3 clusters
+ fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
+ sns.scatterplot(ax=axes[0], data=df, x='bill_length_mm', y='flipper_length_mm').set_title('Without clustering')
+ sns.scatterplot(ax=axes[1], data=df, x='bill_length_mm', y='flipper_length_mm', hue=clustering_model.labels_).set_title('With clustering');
+
+ # Agglomerative Clustering without specifying the number of clusters
+ clustering_model_no_clusters = AgglomerativeClustering(linkage="ward")
+ clustering_model_no_clusters.fit(df)
+ labels_no_clusters = clustering_model_no_clusters.labels_
+
+ # Plot the data without clustering, with 3 clusters, and with no predefined number of clusters
+ fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
+ sns.scatterplot(ax=axes[0], data=df, x='bill_length_mm', y='flipper_length_mm').set_title('Without clustering')
+ sns.scatterplot(ax=axes[1], data=df, x='bill_length_mm', y='flipper_length_mm', hue=clustering_model.labels_).set_title('With 3 clusters')
+ sns.scatterplot(ax=axes[2], data=df, x='bill_length_mm', y='flipper_length_mm', hue=clustering_model_no_clusters.labels_).set_title('Without choosing number of clusters');
+
+ # my code
+
+ import pandas as pd
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from sklearn.cluster import AgglomerativeClustering
+ from scipy.cluster import hierarchy
+
+ # ============================
+ # 1️⃣ Load & Prepare Dataset
+ # ============================
+ data = pd.read_csv("/content/penguins.csv")
+
+ print("Dataset shape:", data.shape)
+ print("\nSpecies count:\n", data['species'].value_counts())
+
+ # Select relevant numerical features and drop missing rows
+ df = data[['bill_length_mm', 'flipper_length_mm']].dropna()
+ print("\nCleaned dataset shape:", df.shape)
+ print(df.head())
+
+ # ============================
+ # 2️⃣ Create Dendrogram (Hierarchical Tree)
+ # ============================
+ plt.figure(figsize=(10, 6))
+ clusters = hierarchy.linkage(df, method="ward")
+
+ # Plot the dendrogram
+ dendrogram = hierarchy.dendrogram(clusters, color_threshold=100)
+ plt.axhline(150, color='red', linestyle='--', label='Threshold 150')
+ plt.axhline(100, color='orange', linestyle='--', label='Threshold 100')
+ plt.title("Hierarchical Clustering Dendrogram (Ward’s Method)")
+ plt.xlabel("Data Points")
+ plt.ylabel("Euclidean Distance")
+ plt.legend()
+ plt.grid(True)
+ plt.show()
+
+ # ============================
+ # 3️⃣ Agglomerative Clustering (with 3 clusters)
+ # ============================
+ agg3 = AgglomerativeClustering(n_clusters=3, linkage="ward")
+ labels_3 = agg3.fit_predict(df)
+
+ # Add cluster labels to dataframe
+ df['Cluster_3'] = labels_3
+
+ # ============================
+ # 4️⃣ Visualization — With and Without Clustering
+ # ============================
+ fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+ sns.scatterplot(ax=axes[0], data=df, x='bill_length_mm', y='flipper_length_mm', color='gray')
+ axes[0].set_title("Without Clustering")
+
+ sns.scatterplot(ax=axes[1], data=df, x='bill_length_mm', y='flipper_length_mm', hue='Cluster_3', palette='viridis', s=70)
+ axes[1].set_title("With 3 Clusters")
+ plt.tight_layout()
+ plt.show()
+
+ # ============================
+ # 5️⃣ Agglomerative Clustering (without predefined clusters)
+ # ============================
+ agg_auto = AgglomerativeClustering(linkage="ward")
+ labels_auto = agg_auto.fit_predict(df[['bill_length_mm', 'flipper_length_mm']])
+ df['Cluster_auto'] = labels_auto
+
+ # ============================
+ # 6️⃣ Compare — No Clustering vs 3 Clusters vs Auto
+ # ============================
+ fig, axes = plt.subplots(1, 3, figsize=(18, 5))
+
+ sns.scatterplot(ax=axes[0], data=df, x='bill_length_mm', y='flipper_length_mm', color='gray')
+ axes[0].set_title("Without Clustering")
+
+ sns.scatterplot(ax=axes[1], data=df, x='bill_length_mm', y='flipper_length_mm', hue='Cluster_3', palette='cool', s=70)
+ axes[1].set_title("With 3 Clusters")
+
+ sns.scatterplot(ax=axes[2], data=df, x='bill_length_mm', y='flipper_length_mm', hue='Cluster_auto', palette='plasma', s=70)
+ axes[2].set_title("Auto Clustering (No Predefined k)")
+
+ plt.tight_layout()
+ plt.show()
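
The dendrogram cell above draws threshold lines at distances 150 and 100 but never extracts the corresponding flat clusters. A minimal sketch of how SciPy's hierarchy.fcluster can cut the same Ward linkage at those distances, under the same assumptions about /content/penguins.csv as the script.

import pandas as pd
from scipy.cluster import hierarchy

data = pd.read_csv("/content/penguins.csv")
df = data[['bill_length_mm', 'flipper_length_mm']].dropna()

# Same Ward linkage as in the script above.
linkage = hierarchy.linkage(df, method="ward")

# Cutting at a distance threshold yields flat cluster labels (1..k).
for threshold in (150, 100):
    labels = hierarchy.fcluster(linkage, t=threshold, criterion="distance")
    print(f"distance threshold {threshold}: {labels.max()} clusters")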