sklearne-1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sklearne/__init__.py +1 -0
- sklearne/data/A1.txt +74 -0
- sklearne/data/A2.txt +50 -0
- sklearne/data/A3.txt +44 -0
- sklearne/data/A4.txt +41 -0
- sklearne/data/A5.txt +48 -0
- sklearne/data/B1.txt +74 -0
- sklearne/data/B2.txt +49 -0
- sklearne/data/B4.txt +44 -0
- sklearne/data/B6.txt +76 -0
- sklearne/data/B7.txt +55 -0
- sklearne/loader.py +12 -0
- sklearne-1.0.dist-info/METADATA +5 -0
- sklearne-1.0.dist-info/RECORD +16 -0
- sklearne-1.0.dist-info/WHEEL +5 -0
- sklearne-1.0.dist-info/top_level.txt +1 -0
sklearne/__init__.py
ADDED
@@ -0,0 +1 @@
from .loader import show_code
sklearne/data/A1.txt
ADDED
@@ -0,0 +1,74 @@
# A1
# Design and implement a pattern recognition system to identify and extract unique species patterns from the Iris dataset

# Step 1: Import Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 2: Load Dataset
iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names
target_names = iris.target_names

# Create DataFrame for easier handling
df = pd.DataFrame(X, columns=feature_names)
df['species'] = pd.Series(y).map(dict(zip(range(3), target_names)))

# Step 3: Feature Analysis (Pattern Recognition)
print("Feature-wise Mean by Species:")
print(df.groupby('species').mean())  # Summary of patterns

# Step 4: Visualizations - Patterns between features
sns.pairplot(df, hue='species', palette='Set2')
plt.suptitle("Pairplot of Iris Features by Species", y=1.02)
plt.show()

# Step 5: PCA for Pattern Visualization (2D)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(8, 6))
for label in np.unique(y):
    plt.scatter(X_pca[y == label, 0], X_pca[y == label, 1], label=target_names[label])
plt.title("PCA of Iris Dataset")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend()
plt.grid(True)
plt.show()

# Step 6: Classification using KNN
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Step 7: Evaluation
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()
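A quick sanity check on the 2-D projection above is to ask how much variance the two components actually retain; a minimal sketch, reusing the pca object fitted in A1:

print(pca.explained_variance_ratio_)  # variance captured by each component
print(f"Total variance retained: {pca.explained_variance_ratio_.sum() * 100:.1f}%")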
sklearne/data/A2.txt
ADDED
@@ -0,0 +1,50 @@
# A2
# Develop a text classification model that can effectively identify, extract features, and classify documents from the 20 Newsgroups dataset into one of the 20 predefined categories using pattern recognition techniques.
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Step 1: Load the dataset
newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
X = newsgroups.data
y = newsgroups.target
target_names = newsgroups.target_names

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Feature Extraction using TF-IDF (Pattern Recognition Technique)
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 4: Model Training using Naive Bayes
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Step 5: Prediction
y_pred = model.predict(X_test_tfidf)

# Step 6: Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=target_names))

# Step 7: Confusion Matrix Visualization
conf_mat = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(12, 10))
sns.heatmap(conf_mat, annot=False, cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title("Confusion Matrix - 20 Newsgroups")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
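To see which tokens the Naive Bayes model associates with each newsgroup, its per-class log probabilities can be ranked; a minimal sketch, reusing model, vectorizer, and target_names from A2 (the choice of two classes and ten tokens is arbitrary):

feature_names = vectorizer.get_feature_names_out()
for class_idx in range(2):  # first two categories, for brevity
    top10 = np.argsort(model.feature_log_prob_[class_idx])[-10:]
    print(target_names[class_idx], "->", ", ".join(feature_names[top10]))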
sklearne/data/A3.txt
ADDED
@@ -0,0 +1,44 @@
# A3
# Design a statistical model to analyze wine quality using Gaussian distribution methods. Utilize synthetic data generated with NumPy or the Wine Quality Dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

# Generate synthetic wine data (only 3 features)
n = 1000
fixed_acidity = np.random.normal(7.0, 0.7, n)
volatile_acidity = np.random.normal(0.5, 0.1, n)
citric_acid = np.random.normal(0.3, 0.1, n)

# Create DataFrame
df = pd.DataFrame({"fixed_acidity": fixed_acidity, "volatile_acidity": volatile_acidity, "citric_acid": citric_acid})

# Generate synthetic wine quality
df["wine_quality"] = np.clip(
    (0.3 * df["fixed_acidity"] + 1.5 * df["volatile_acidity"] + 0.8 * df["citric_acid"] + np.random.normal(0, 0.5, n)).round().astype(int),
    3, 8
)

# Display summary
print(df.describe())

# Plot histogram for 'fixed_acidity' with Gaussian fit
plt.figure(figsize=(10, 6))
sns.histplot(df['fixed_acidity'], kde=True, stat="density", color="skyblue", bins=30)

# Fit a Gaussian distribution to the data
mu, std = norm.fit(df['fixed_acidity'])
x = np.linspace(df['fixed_acidity'].min(), df['fixed_acidity'].max(), 100)
plt.plot(x, norm.pdf(x, mu, std), 'k', lw=2)

# Add title and labels
plt.title(f"Fixed Acidity Distribution ~ N({mu:.2f}, {std:.2f})", fontsize=14)
plt.xlabel("Fixed Acidity", fontsize=12)
plt.ylabel("Density", fontsize=12)

# Show the plot
plt.show()
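Beyond the histogram overlay, the Gaussian fit can be checked visually with a Q-Q plot; a minimal sketch, reusing df and the imports from A3:

from scipy import stats

stats.probplot(df['fixed_acidity'], dist="norm", plot=plt)
plt.title("Q-Q Plot: Fixed Acidity vs. Normal")
plt.show()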
sklearne/data/A4.txt
ADDED
@@ -0,0 +1,41 @@
# A4
# Develop a classification system for handwritten digit recognition using the MNIST dataset, leveraging Bayes' Decision Theory to optimize decision-making and minimize classification error.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA

# Dataset Loading
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist.data, mnist.target.astype(np.int8)
X = X / 255.0  # Normalize pixel values

# Reduce dimensionality using PCA
pca = PCA(n_components=50)  # You can try 30–100 and tune this
X_pca = pca.fit_transform(X)

# Model Development (Using GaussianNB as an approximation to Bayes' Decision)
model = GaussianNB()

# Training and Testing
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation
print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
conf_mat = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()
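The 50-component PCA in A4 is a tunable choice; a hedged sketch of a small sweep over component counts, reusing X, y, and the imports from the script above (the grid of values is illustrative):

for n in [30, 50, 100]:
    X_reduced = PCA(n_components=n).fit_transform(X)
    X_tr, X_te, y_tr, y_te = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
    acc = GaussianNB().fit(X_tr, y_tr).score(X_te, y_te)
    print(f"PCA components={n}: accuracy={acc * 100:.2f}%")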
sklearne/data/A5.txt
ADDED
@@ -0,0 +1,48 @@
# A5
# Develop an anomaly detection system for high-dimensional network traffic data using the KDD Cup 1999 dataset
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_kddcup99
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Load only a subset of the data for speed (optional: use 20,000 samples)
kdd = fetch_kddcup99(percent10=True, shuffle=True)
X_raw = kdd["data"][:20000]
y_raw = kdd["target"][:20000]

# Convert to DataFrame
df = pd.DataFrame(X_raw, columns=kdd["feature_names"])

# Binary label: 0 for normal, 1 for anomaly
df["binary_label"] = np.where(y_raw == b'normal.', 0, 1)

# One-hot encode categorical columns (with limited unique categories)
df = pd.get_dummies(df, columns=["protocol_type", "service", "flag"], drop_first=True)

# Features and label (the raw columns come back as object dtype, so cast to float)
X = df.drop(['binary_label'], axis=1).astype(float)
y = df['binary_label']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA for faster training (keep only 10 components)
pca = PCA(n_components=10, random_state=42)
X_reduced = pca.fit_transform(X_scaled)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

# KNN with fewer neighbors for speed
knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)  # Use all CPU cores
knn.fit(X_train, y_train)

# Prediction & Evaluation
y_pred = knn.predict(X_test)
print(f"Accuracy: {knn.score(X_test, y_test) * 100:.2f}%")
print(classification_report(y_test, y_pred))
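Accuracy can be misleading on KDD Cup 99 because the classes are imbalanced; a minimal sketch that checks the class balance and adds a ranking metric, reusing y, knn, X_test, and y_test from A5:

print(y.value_counts(normalize=True))  # fraction of normal (0) vs anomaly (1)

from sklearn.metrics import roc_auc_score
anomaly_scores = knn.predict_proba(X_test)[:, 1]  # estimated probability of anomaly
print(f"ROC AUC: {roc_auc_score(y_test, anomaly_scores):.3f}")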
sklearne/data/B1.txt
ADDED
@@ -0,0 +1,74 @@
# B1
# Implement a Hidden Markov Model (HMM) to recognize the sequence of weather patterns (e.g., sunny, cloudy, rainy) based on temperature and humidity observations. Use both discrete and continuous HMMs to compare their performance.
import numpy as np
import matplotlib.pyplot as plt
from hmmlearn import hmm
from sklearn.preprocessing import KBinsDiscretizer
import warnings

warnings.filterwarnings("ignore")
np.random.seed(42)

# Step 1: Define states
states = ["Sunny", "Cloudy", "Rainy"]
n_states = len(states)

# Step 2: Transition probabilities
trans_probs = np.array([
    [0.6, 0.3, 0.1],
    [0.2, 0.5, 0.3],
    [0.1, 0.3, 0.6]
])

# Step 3: Emission properties (Temp, Humidity)
means = np.array([[30, 40], [25, 50], [20, 80]])
covars = np.array([
    [[5, 0], [0, 5]],
    [[4, 0], [0, 4]],
    [[5, 0], [0, 5]]
])

# Step 4: Generate synthetic data by walking the Markov chain defined by trans_probs
n_samples = 300
hidden_states = np.empty(n_samples, dtype=int)
hidden_states[0] = np.random.choice(n_states, p=[0.5, 0.3, 0.2])
for t in range(1, n_samples):
    hidden_states[t] = np.random.choice(n_states, p=trans_probs[hidden_states[t - 1]])
observations = np.array([
    np.random.multivariate_normal(means[s], covars[s]) for s in hidden_states
])

# Visualize the generated data
plt.scatter(observations[:, 0], observations[:, 1], c=hidden_states, cmap='viridis')
plt.xlabel("Temperature")
plt.ylabel("Humidity")
plt.title("Synthetic Weather Observations")
plt.show()

# === Discrete HMM (Temperature only) ===
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
X_discrete = discretizer.fit_transform(observations[:, [0]]).astype(int)

# CategoricalHMM is the discrete-emission model in hmmlearn >= 0.3
# (older releases called it MultinomialHMM)
model_discrete = hmm.CategoricalHMM(n_components=n_states, n_iter=100)
model_discrete.fit(X_discrete)
pred_discrete = model_discrete.predict(X_discrete)

# Accuracy (rough comparison; learned state indices are arbitrary)
acc_discrete = np.mean(pred_discrete == hidden_states)
print(f"Discrete HMM Accuracy: {acc_discrete:.2f}")

# === Continuous HMM (Full Temp & Humidity) ===
model_continuous = hmm.GaussianHMM(n_components=n_states, covariance_type="full", n_iter=100)
model_continuous.fit(observations)
pred_continuous = model_continuous.predict(observations)

acc_continuous = np.mean(pred_continuous == hidden_states)
print(f"Continuous HMM Accuracy: {acc_continuous:.2f}")

# Visualize predictions (first 50 steps)
plt.figure(figsize=(10, 4))
plt.plot(hidden_states[:50], "bo-", label="True States")
plt.plot(pred_discrete[:50], "r--", label="Discrete HMM")
plt.plot(pred_continuous[:50], "g.-", label="Continuous HMM")
plt.legend()
plt.title("True vs Predicted Hidden States")
plt.xlabel("Time Step")
plt.ylabel("State Index")
plt.show()
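Because HMM state indices are arbitrary, the raw agreement computed above can understate how well the model recovered the regimes; a minimal sketch of a majority-vote relabeling, reusing pred_continuous, hidden_states, and n_states from B1:

mapping = {}
for s in range(n_states):
    mask = pred_continuous == s
    if mask.any():
        mapping[s] = np.bincount(hidden_states[mask]).argmax()  # most common true state
remapped = np.array([mapping[s] for s in pred_continuous])
print(f"Continuous HMM accuracy after relabeling: {np.mean(remapped == hidden_states):.2f}")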
sklearne/data/B2.txt
ADDED
@@ -0,0 +1,49 @@
# B2
# Build a Discrete Hidden Markov Model (HMM) to analyze DNA sequences and predict gene regions. Use Maximum Likelihood Estimation to train the model with a given dataset of labeled sequences
import numpy as np
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Example sequences
sequences = [
    "ATGCGCGTATCGT",  # Mostly gene
    "CGTACGTAGCTA",   # Mix
    "TTATTAGCGTTA"    # Mostly intergenic
]

# Corresponding labels (0 = intergenic, 1 = gene)
labels = [
    [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1],
    [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]
]

# Flatten sequences and labels for training
all_seq = ''.join(sequences)
all_labels = np.concatenate(labels)

# Encode DNA characters A/C/G/T to integers 0-3
le = LabelEncoder()
le.fit(['A', 'C', 'G', 'T'])
X = le.transform(list(all_seq)).reshape(-1, 1)

# Train a discrete HMM (CategoricalHMM in hmmlearn >= 0.3; formerly MultinomialHMM)
model = hmm.CategoricalHMM(n_components=2, n_iter=100, tol=0.01)
model.fit(X)

# Predict hidden states
predicted_states = model.predict(X)

# Compare with true labels
accuracy = np.mean(predicted_states == all_labels)
print(f"Prediction Accuracy (approx): {accuracy:.2f}")

# Visualize true vs predicted
plt.plot(all_labels[:50], label="True State")
plt.plot(predicted_states[:50], '--', label="Predicted")
plt.title("Gene Prediction - HMM")
plt.xlabel("Sequence Position")
plt.ylabel("State (0=Intergenic, 1=Gene)")
plt.legend()
plt.show()
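hmmlearn can also report the Viterbi log-likelihood alongside the decoded path, which helps when comparing candidate models; a minimal sketch, reusing model and X from B2:

log_prob, states = model.decode(X, algorithm="viterbi")
print(f"Viterbi log-likelihood: {log_prob:.2f}")
print(states[:20])  # first 20 decoded states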
sklearne/data/B4.txt
ADDED
@@ -0,0 +1,44 @@
# B4
# Create a program that fits a mixture of Gaussians to a dataset of handwritten digit features and clusters them into distinct groups. Use the Expectation-Maximization method to estimate the parameters of the Gaussian mixture model.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score

# Load MNIST dataset
mnist = fetch_openml("mnist_784", version=1, as_frame=False)
X, y = mnist.data, mnist.target.astype(int)

# Normalize pixel values
X = X / 255.0

# Reduce dimensionality with PCA (to speed up EM)
pca = PCA(n_components=50)
X_pca = pca.fit_transform(X)

# Fit Gaussian Mixture Model
n_components = 10  # Assume 10 digits (0-9)
gmm = GaussianMixture(n_components=n_components, covariance_type='full', max_iter=100, random_state=42)
gmm.fit(X_pca)

# Predict cluster labels
cluster_labels = gmm.predict(X_pca)

# Optional: Check clustering quality with Adjusted Rand Index
ari = adjusted_rand_score(y, cluster_labels)
print(f"Adjusted Rand Index: {ari:.2f}")

# Visualize clusters in 2D
pca_2d = PCA(n_components=2)
X_2d = pca_2d.fit_transform(X_pca)

plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_2d[:, 0], y=X_2d[:, 1], hue=cluster_labels, palette="tab10", legend="full", s=10)
plt.title("GMM Clustering of MNIST (2D PCA Projection)")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend(title="Cluster")
plt.show()
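The assumption of 10 mixture components can itself be checked with an information criterion such as BIC; a hedged sketch, reusing X_pca and the imports from B4 (it subsamples because full-dataset fits are slow, and the candidate values are illustrative):

rng = np.random.default_rng(42)
sample = X_pca[rng.choice(len(X_pca), size=5000, replace=False)]
for k in [5, 10, 15]:
    g = GaussianMixture(n_components=k, covariance_type='full', random_state=42).fit(sample)
    print(f"k={k}: BIC={g.bic(sample):.0f}")  # lower BIC is better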
sklearne/data/B6.txt
ADDED
@@ -0,0 +1,76 @@
# B6
# Use the non-parametric K-Nearest Neighbor (KNN) technique to classify grayscale images of shapes (e.g., circles, squares, and triangles) and evaluate its classification accuracy.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import cv2  # Required for triangle generation

# Function to generate square
def generate_square(image_size=64):
    img = np.zeros((image_size, image_size), dtype=np.uint8)
    img[16:48, 16:48] = 255  # Create a square in the center
    return img

# Function to generate circle
def generate_circle(image_size=64):
    img = np.zeros((image_size, image_size), dtype=np.uint8)
    y, x = np.ogrid[:image_size, :image_size]
    mask = (x - image_size // 2) ** 2 + (y - image_size // 2) ** 2 <= (image_size // 4) ** 2
    img[mask] = 255
    return img

# Function to generate triangle
def generate_triangle(image_size=64):
    img = np.zeros((image_size, image_size), dtype=np.uint8)
    pts = np.array([[32, 16], [16, 48], [48, 48]], np.int32)
    cv2.fillPoly(img, [pts], 255)
    return img

# Function to generate the appropriate shape based on the label
def generate_shape(shape):
    if shape == 'square':
        return generate_square()
    elif shape == 'circle':
        return generate_circle()
    elif shape == 'triangle':
        return generate_triangle()

# Create synthetic dataset
shapes = ['square', 'circle', 'triangle']
X = []  # Features (flattened images)
y = []  # Labels (0: Square, 1: Circle, 2: Triangle)
n_samples = 1000

for shape_idx, shape in enumerate(shapes):
    for _ in range(n_samples // 3):
        img = generate_shape(shape)
        X.append(img.flatten())  # Flatten image to 1D
        y.append(shape_idx)  # Assign corresponding label

X = np.array(X)
y = np.array(y)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Visualize test images with predicted labels
fig, axes = plt.subplots(1, 5, figsize=(12, 6))
for i in range(5):
    axes[i].imshow(X_test[i].reshape(64, 64), cmap='gray')
    axes[i].set_title(f"Pred: {shapes[y_pred[i]]}")
    axes[i].axis('off')

plt.show()
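Every generated sample of a given shape above is pixel-identical, so the classifier scores 100% by construction; a minimal sketch of adding Gaussian pixel noise to make the benchmark non-trivial, reusing generate_shape from B6 (the helper and its noise level are illustrative, not part of the package):

rng = np.random.default_rng(0)

def generate_noisy_shape(shape, noise_std=40):
    img = generate_shape(shape).astype(float)
    img += rng.normal(0, noise_std, img.shape)  # additive Gaussian pixel noise
    return np.clip(img, 0, 255).astype(np.uint8)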
sklearne/data/B7.txt
ADDED
@@ -0,0 +1,55 @@
# B7
# Build a Python application to classify iris flowers using the Nearest Neighbor Rule. Use a given dataset with features such as petal length and width. Experiment with different values of K and evaluate the model's accuracy

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Step 1: Load the Iris dataset
iris = load_iris()
X = iris.data  # Features (petal length, petal width, etc.)
y = iris.target  # Labels (species of flowers)

# Step 2: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Define a function to train and evaluate the KNN model with different K values
def evaluate_knn(k_values):
    for k in k_values:
        # Initialize the KNN classifier with a given value of k
        knn = KNeighborsClassifier(n_neighbors=k)

        # Train the classifier on the training data
        knn.fit(X_train, y_train)

        # Make predictions on the test data
        y_pred = knn.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        print(f"K={k} - Accuracy: {accuracy * 100:.2f}%")
        print(f"Classification Report for K={k}:\n{classification_report(y_test, y_pred)}")

# Step 4: Experiment with different K values
k_values = [1, 3, 5, 7, 9]  # Different values of K to try
evaluate_knn(k_values)

# Optional: Plot the effect of K on accuracy (for visualization)
accuracies = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred))

# Plot accuracy vs K value
plt.plot(k_values, accuracies, marker='o')
plt.title('KNN Classifier Accuracy vs K')
plt.xlabel('K value')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()
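A single train/test split makes the comparison across K values noisy; a minimal sketch using 5-fold cross-validation instead, reusing X, y, k_values, and the imports from B7:

from sklearn.model_selection import cross_val_score

for k in k_values:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5)
    print(f"K={k}: mean CV accuracy = {scores.mean() * 100:.2f}%")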
sklearne/loader.py
ADDED
@@ -0,0 +1,12 @@
import os

def show_code(filename):
    file_path = os.path.join(os.path.dirname(__file__), 'data', filename)
    if not os.path.exists(file_path):
        return f"File {filename} not found."

    with open(file_path, 'r') as f:
        content = f.read()

    print(content)  # shows the code in Jupyter output
    return content  # also returns content if needed
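Given loader.py and the package __init__ above, the intended use appears to be printing one of the bundled scripts from a notebook; a minimal sketch:

from sklearne import show_code

code = show_code("A1.txt")  # prints the A1 script and also returns it as a string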
sklearne-1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
sklearne/__init__.py,sha256=uk0QwGzWTdxut9oI3_0723e8w0kYC3b__AqlSruVpSU,31
sklearne/loader.py,sha256=PXerhBcG88Z6FhLV1izCPz_a_jQ2nN10dpE1U834j0Q,379
sklearne/data/A1.txt,sha256=OD3cGtqSVqfH5XSD0kxd4-yTv4hJ90VJvZ2dGL2Ssqc,2439
sklearne/data/A2.txt,sha256=ctmfmMhFXZCUXlAFqnEMtUXPYhT2QQinYLovtHtNCXQ,1943
sklearne/data/A3.txt,sha256=B_pUaofgsAnhovRXOTsiHjIlFoNJulLE6lvr8v1ZJgE,1507
sklearne/data/A4.txt,sha256=N_PgZKbBUyZqefBf05aQbeaukuyfhUOjtCeGll92gYQ,1560
sklearne/data/A5.txt,sha256=P-8wK-Hd7B_BMrJ0OsfiYKyfirOy2oF6wm8dW8bnF9Y,1724
sklearne/data/B1.txt,sha256=XenxamzxPD78T_fOQ15AkNU5sWcrOMWp-Zg60atRX4A,2564
sklearne/data/B2.txt,sha256=azrRFUVR1Y4DgOkDtsj-z2_PUCoFOAfzRO1NwiC-dYo,1502
sklearne/data/B4.txt,sha256=P949GYoOoCj5KCM62Q9CrW16mPD7t-W0y4hFSUHMWbk,1565
sklearne/data/B6.txt,sha256=Aq615q1NecnjT_KYNbKdJaLu6Zis1-ioTyXC-6eRoh0,2728
sklearne/data/B7.txt,sha256=S3HC0Il-_FHOLGbMvlvNmZb1rtfRHYV8mtxkE08tq7c,2109
sklearne-1.0.dist-info/METADATA,sha256=2OJbnz8lh-DXOeeolFnJa5tmecLiNxG7L-a8BInXcYQ,88
sklearne-1.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
sklearne-1.0.dist-info/top_level.txt,sha256=v7HoQGFOgDk0Ia60xJb3HE_zt-QB6YueCQjJyse8pEY,9
sklearne-1.0.dist-info/RECORD,,
sklearne-1.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
sklearne