shreenath-ml-scripts 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- shreenath_ml_scripts-0.1.0/PKG-INFO +16 -0
- shreenath_ml_scripts-0.1.0/README.md +13 -0
- shreenath_ml_scripts-0.1.0/setup.cfg +4 -0
- shreenath_ml_scripts-0.1.0/setup.py +21 -0
- shreenath_ml_scripts-0.1.0/src/ml_prac_scripts/__init__.py +0 -0
- shreenath_ml_scripts-0.1.0/src/ml_prac_scripts/eda.py +65 -0
- shreenath_ml_scripts-0.1.0/src/ml_prac_scripts/kmeansclustering.py +47 -0
- shreenath_ml_scripts-0.1.0/src/ml_prac_scripts/knn.py +36 -0
- shreenath_ml_scripts-0.1.0/src/ml_prac_scripts/linear_regression.py +58 -0
- shreenath_ml_scripts-0.1.0/src/ml_prac_scripts/logistic_regression.py +58 -0
- shreenath_ml_scripts-0.1.0/src/ml_prac_scripts/pca.py +86 -0
- shreenath_ml_scripts-0.1.0/src/ml_prac_scripts/svm.py +52 -0
- shreenath_ml_scripts-0.1.0/src/shreenath_ml_scripts.egg-info/PKG-INFO +16 -0
- shreenath_ml_scripts-0.1.0/src/shreenath_ml_scripts.egg-info/SOURCES.txt +15 -0
- shreenath_ml_scripts-0.1.0/src/shreenath_ml_scripts.egg-info/dependency_links.txt +1 -0
- shreenath_ml_scripts-0.1.0/src/shreenath_ml_scripts.egg-info/requires.txt +5 -0
- shreenath_ml_scripts-0.1.0/src/shreenath_ml_scripts.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: shreenath-ml-scripts
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A collection of machine learning scripts
|
|
5
|
+
Author: Shreenath
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Requires-Dist: pandas
|
|
9
|
+
Requires-Dist: numpy
|
|
10
|
+
Requires-Dist: matplotlib
|
|
11
|
+
Requires-Dist: scikit-learn
|
|
12
|
+
Requires-Dist: seaborn
|
|
13
|
+
Dynamic: author
|
|
14
|
+
Dynamic: classifier
|
|
15
|
+
Dynamic: requires-dist
|
|
16
|
+
Dynamic: summary
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Shreenath ML Scripts
|
|
2
|
+
|
|
3
|
+
A collection of machine learning scripts including EDA, KNN, K-Means, Linear Regression, Logistic Regression, PCA, and SVM.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install shreenath-ml-scripts
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
Once installed, the Python scripts are available inside your environment's `site-packages` directory, under the `ml_prac_scripts` package.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Packaging configuration for the shreenath-ml-scripts distribution."""

from setuptools import find_packages, setup

# Runtime dependencies shared by every script in the package.
REQUIREMENTS = [
    'pandas',
    'numpy',
    'matplotlib',
    'scikit-learn',
    'seaborn',
]

setup(
    name='shreenath-ml-scripts',  # Using a unique name as 'sklearn' is not permitted
    version='0.1.0',
    description='A collection of machine learning scripts',
    author='Shreenath',
    package_dir={"": "src"},
    packages=find_packages(where="src"),
    install_requires=REQUIREMENTS,
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
    ],
)
|
|
File without changes
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
"""Exploratory data analysis on the seaborn Titanic dataset.

Workflow: inspect the raw frame, impute/drop missing values, cap fare
outliers with the 1.5*IQR rule, log-transform fare, and encode the
categorical columns for downstream modelling.

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Fm6OC_xJb4m29eC8pGFUqGFzpMxKvUiA
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = sns.load_dataset("titanic")

# Initial inspection (outputs only display in a notebook context).
df.head()
df.info()
df.describe()
df.isnull().sum()

# Impute numeric/categorical gaps, drop the mostly-empty 'deck' column,
# then drop any remaining incomplete rows.
df["age"] = df["age"].fillna(df["age"].median())
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])
df.drop('deck', axis=1, inplace=True)
df.dropna(inplace=True)
df.info()

sns.boxplot(x=df['fare'])
plt.title('Fare Boxplot (Before Handling Outliers)')
plt.show()

# IQR-based capping (winsorisation) of the fare column.
q1 = df['fare'].quantile(0.25)
q3 = df['fare'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Equivalent to the nested np.where: values outside the bounds are
# pinned to the nearest bound.
df['fare'] = df['fare'].clip(lower=lower_bound, upper=upper_bound)
print(f"Outliers capped at: {upper_bound}")

# log1p keeps zero fares finite while compressing the right tail.
df['log_fare'] = np.log1p(df['fare'])

plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
sns.histplot(df['fare'], kde=True).set_title('Original Fare')
plt.subplot(1, 2, 2)
sns.histplot(df['log_fare'], kde=True).set_title('Log Transformed Fare')
plt.show()

# Encode categoricals: binary map for sex, one-hot (drop-first) for embarked.
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df = pd.get_dummies(df, columns=['embarked'], drop_first=True)

print("\nFinal Dataframe Head:")
print(df[['survived', 'pclass', 'sex', 'age', 'log_fare', 'embarked_Q', 'embarked_S']].head())
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
"""K-Means clustering of a sales dataset.

Standardises the numeric columns, runs the elbow method to pick a
cluster count, then fits the final model and plots the labelled points.

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Yt6p5RJKdDBlYIy11bsGPmTO5cJ6kdnA
"""

import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

# latin-1 handles non-UTF-8 characters present in the export.
df = pd.read_csv("sales.csv", encoding='latin-1')
df.head()

# K-Means is distance based, so standardise the numeric features first.
df_numeric = df.select_dtypes(include=np.number)
X_scaled = StandardScaler().fit_transform(df_numeric)

# Elbow method: record inertia for k = 1..10.
inertia = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled).inertia_
    for k in range(1, 11)
]

plt.figure(figsize=(8, 5))
sns.lineplot(x=range(1, 11), y=inertia, marker='o', linestyle='-')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')

# Chosen by eye from the elbow plot above.
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans.fit(X_scaled)
labels = kmeans.labels_

# Visualise the clusters in the first two (scaled) feature dimensions.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_scaled[:, 0], y=X_scaled[:, 1], hue=labels)
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
"""K-nearest-neighbours classification of the UCI Iris dataset.

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1u49MdRl6i0Jlmx9I1gm1iYjiHEXGI97C
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, f1_score, precision_score

# The raw CSV has no header row, so supply the column names explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
df = pd.read_csv(url, names=column_names)

# Encode the string species labels as integers 0..2.
species_encoder = LabelEncoder()
df['species'] = species_encoder.fit_transform(df['species'])

X = df.drop('species', axis=1)
y = df['species']

# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
"""Linear regression on the Boston housing data (HousingData.csv).

Mean-imputes the columns containing missing values, standardises the
features, fits an ordinary least-squares model and reports MSE / R².

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1AqzDrmjgKq5uD5lNbJdJJFdzGUUCW3hs
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("HousingData.csv")
df.head()

# Mean-impute every column known to contain NaNs in this dataset.
for col in ["CRIM", "ZN", "INDUS", "CHAS", "AGE", "LSTAT"]:
    df[col] = df[col].fillna(df[col].mean())

df.info()

# MEDV (median home value) is the regression target.
X = df.drop('MEDV', axis=1)
y = df['MEDV']
X_scaled = StandardScaler().fit_transform(X)
X_scaled

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs. Predicted Values")

# Reference diagonals over the actual and predicted ranges (a perfect
# model would put every point on y = x).
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '-', lw=1)
plt.plot([y_pred.min(), y_pred.max()], [y_pred.min(), y_pred.max()], '-', lw=1)

plt.show()
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
"""Logistic regression on the Social Network Ads dataset.

Min-max scales Age/EstimatedSalary, label-encodes Gender, fits a
logistic-regression classifier for the 'Purchased' target and reports
accuracy, a classification report and the confusion matrix.

Fixes over the original: the duplicate `import seaborn as sns` was
removed, and the design matrix is built from an explicit copy instead
of mutating `scaled` through an alias.

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1_9gGFH3ONYSD_saWKC9Vf2wJ-YU3if1G
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df = pd.read_csv("Social_Network_Ads.csv")
numeric_columns = ['Age', 'EstimatedSalary']
df.head()

# Scale the numeric features into [0, 1] so neither dominates the fit.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[numeric_columns])
scaled = pd.DataFrame(scaled, columns=numeric_columns)
scaled

# Label-encode Gender on a copy of the frame (male/female -> ints).
label_encoder = LabelEncoder()
encoded = df.copy()
encoded['Gender'] = label_encoder.fit_transform(df['Gender'])
encoded

# Assemble the design matrix explicitly; copying avoids mutating
# `scaled` through an alias as the original did.
X = scaled.copy()
X['Gender'] = encoded['Gender']
y = df['Purchased']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Purchased (0)', 'Purchased (1)']))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Heatmap view of the confusion matrix.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues")
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
"""Two-component PCA of the Iris dataset with a per-species scatter plot.

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1vS6JOW2HWZiPOYrryS6XDOjq0G3Ja_9y
"""

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

df = pd.read_csv("Iris.csv")
df.columns = ["id", "sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
df.head()

# PCA runs on the four measurement columns; species is kept for colouring.
features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
x = df.loc[:, features].values
y = df.loc[:, ["species"]].values

# Standardise so each feature contributes equally to the components.
x = StandardScaler().fit_transform(x)

pca = PCA(n_components=2)
principal_components = pca.fit_transform(x)
principal_components

principalDF = pd.DataFrame(data=principal_components, columns=["pc1", "pc2"])
final = pd.concat([principalDF, df[["species"]]], axis=1)
final.head()

plt.figure()
plt.xlabel("pc1")
plt.ylabel("pc2")
plt.title("2 component PCA")
targets = ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
colors = ["r", "g", "b"]
for target, color in zip(targets, colors):
    mask = final["species"] == target
    plt.scatter(final.loc[mask, "pc1"], final.loc[mask, "pc2"], c=color, s=50)
plt.legend(targets)
plt.grid()
plt.show()
|
47
|
+
"""Second PCA pass on the Iris data, using the raw (unscaled) features.

Fixes over the original: the misspelled feature name "speal_len" is now
"sepal_len", and the principal-component columns are named consistently
("pc1"/"pc2" — the original created "pca1" but labelled axes "pc1").
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("Iris.csv")
df.info()

df.columns = ["id", "sepal_len", "sepal_width", "petal_len", "petal_width", "species"]
features = ["sepal_len", "sepal_width", "petal_len", "petal_width"]

X = df.loc[:, features].values
y = df.loc[:, ["species"]].values

# Deliberately left unscaled — presumably to contrast with a
# standardised run; TODO confirm this was intentional.
# X = StandardScaler().fit_transform(X)

pca = PCA(n_components=2)
p_com = pca.fit_transform(X)

p_df = pd.DataFrame(data=p_com, columns=["pc1", "pc2"])
final = pd.concat([p_df, df[["species"]]], axis=1)

plt.figure()
plt.xlabel("pc1")
plt.ylabel("pc2")
plt.title("2 components PCA")
targets = ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
colors = ["r", "g", "b"]

for t, c in zip(targets, colors):
    indices = final["species"] == t
    plt.scatter(
        final.loc[indices, "pc1"],
        final.loc[indices, "pc2"],
        c=c,
        s=50
    )
plt.legend(targets)
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
"""Linear SVM spam classifier on the pre-vectorised emails.csv dataset.

Fixes over the original:
- `df.dropna()` returned a new frame that was discarded; the result is
  now assigned so incomplete rows are actually removed.
- the `TfidfVectorizer.fit_transform(df)` call was removed: iterating a
  DataFrame yields its column names, so that X was meaningless and was
  immediately overwritten by the real feature matrix anyway.

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1gtCM25ZOphz_jwcev_e6g8YH-yMt5RDP
"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns

df = pd.read_csv("emails.csv")
df = df.dropna()  # fix: the original discarded dropna()'s return value
df = df.drop(columns=["Email No."])  # identifier column, not a feature
df.isna().sum()

# emails.csv already contains word-count features; 'Prediction' is the
# spam label (presumably 1 = spam — TODO confirm against the dataset).
X = df.drop(columns=['Prediction'])
y = df['Prediction']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = SVC(kernel='linear', C=1.0, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)

sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Not Spam', 'Spam'],
    yticklabels=['Not Spam', 'Spam']
)
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: shreenath-ml-scripts
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A collection of machine learning scripts
|
|
5
|
+
Author: Shreenath
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
8
|
+
Requires-Dist: pandas
|
|
9
|
+
Requires-Dist: numpy
|
|
10
|
+
Requires-Dist: matplotlib
|
|
11
|
+
Requires-Dist: scikit-learn
|
|
12
|
+
Requires-Dist: seaborn
|
|
13
|
+
Dynamic: author
|
|
14
|
+
Dynamic: classifier
|
|
15
|
+
Dynamic: requires-dist
|
|
16
|
+
Dynamic: summary
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
src/ml_prac_scripts/__init__.py
|
|
4
|
+
src/ml_prac_scripts/eda.py
|
|
5
|
+
src/ml_prac_scripts/kmeansclustering.py
|
|
6
|
+
src/ml_prac_scripts/knn.py
|
|
7
|
+
src/ml_prac_scripts/linear_regression.py
|
|
8
|
+
src/ml_prac_scripts/logistic_regression.py
|
|
9
|
+
src/ml_prac_scripts/pca.py
|
|
10
|
+
src/ml_prac_scripts/svm.py
|
|
11
|
+
src/shreenath_ml_scripts.egg-info/PKG-INFO
|
|
12
|
+
src/shreenath_ml_scripts.egg-info/SOURCES.txt
|
|
13
|
+
src/shreenath_ml_scripts.egg-info/dependency_links.txt
|
|
14
|
+
src/shreenath_ml_scripts.egg-info/requires.txt
|
|
15
|
+
src/shreenath_ml_scripts.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ml_prac_scripts
|