natbook-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natbook-0.1.0/PKG-INFO +11 -0
- natbook-0.1.0/README.md +50 -0
- natbook-0.1.0/lekhexamprep/__init__.py +28 -0
- natbook-0.1.0/lekhexamprep/adaboost.py +166 -0
- natbook-0.1.0/lekhexamprep/classification_regression.py +178 -0
- natbook-0.1.0/lekhexamprep/decision_tree.py +198 -0
- natbook-0.1.0/lekhexamprep/kmeans.py +141 -0
- natbook-0.1.0/lekhexamprep/lda.py +146 -0
- natbook-0.1.0/lekhexamprep/linear_regression.py +120 -0
- natbook-0.1.0/lekhexamprep/linear_svm.py +163 -0
- natbook-0.1.0/lekhexamprep/nonlinear_svm.py +168 -0
- natbook-0.1.0/lekhexamprep/pca.py +233 -0
- natbook-0.1.0/lekhexamprep/polynomial_regression.py +195 -0
- natbook-0.1.0/lekhexamprep/random_forest.py +134 -0
- natbook-0.1.0/lekhexamprep/svd.py +199 -0
- natbook-0.1.0/lekhexamprep/xgboost_model.py +169 -0
- natbook-0.1.0/lekhexamprep/xgboost_withoutsk.py +317 -0
- natbook-0.1.0/natbook.egg-info/PKG-INFO +11 -0
- natbook-0.1.0/natbook.egg-info/SOURCES.txt +22 -0
- natbook-0.1.0/natbook.egg-info/dependency_links.txt +1 -0
- natbook-0.1.0/natbook.egg-info/requires.txt +7 -0
- natbook-0.1.0/natbook.egg-info/top_level.txt +1 -0
- natbook-0.1.0/pyproject.toml +22 -0
- natbook-0.1.0/setup.cfg +4 -0
natbook-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,11 @@
+Metadata-Version: 2.4
+Name: natbook
+Version: 0.1.0
+Summary: ML exam prep — 12 algorithms ready to run
+Requires-Python: >=3.8
+Requires-Dist: numpy
+Requires-Dist: scikit-learn
+Requires-Dist: matplotlib
+Requires-Dist: pandas
+Provides-Extra: xgboost
+Requires-Dist: xgboost; extra == "xgboost"
natbook-0.1.0/README.md
ADDED
@@ -0,0 +1,50 @@
+# lekhexamprep
+
+ML exam prep package — 12 algorithms, one `run()` call each.
+
+## Install
+
+```bash
+pip install natbook   # from PyPI (after publishing); the import name is lekhexamprep
+# or locally:
+pip install .
+```
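+
+Per `Provides-Extra: xgboost` in PKG-INFO, the optional XGBoost dependency can presumably be pulled in with:
+
+```bash
+pip install natbook[xgboost]
+```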
+
+## Usage
+
+```python
+import lekhexamprep as lep
+
+lep.linear_regression()
+lep.polynomial_regression(degree=3)
+lep.classification_regression()
+lep.decision_tree(max_depth=3)
+lep.linear_svm()
+lep.nonlinear_svm(kernel="rbf")
+lep.random_forest(n_estimators=100)
+lep.xgboost()
+lep.kmeans(k=3)
+lep.svd()
+lep.pca(n_components=2)
+lep.lda(n_components=2)
+
+# Pass plot=True to any function for a matplotlib visualisation
+lep.pca(plot=True)
+```
+
+## Algorithms
+
+| # | Module | Algorithm |
+|---|--------|-----------|
+| 1 | `linear_regression` | Simple Linear Regression |
+| 2 | `polynomial_regression` | Polynomial Regression (Pipeline) |
+| 3 | `classification_regression` | Logistic Regression (Classification) |
+| 4 | `decision_tree` | Decision Tree Classifier |
+| 5 | `linear_svm` | Linear SVM |
+| 6 | `nonlinear_svm` | Non-Linear SVM (RBF kernel) |
+| 7 | `random_forest` | Random Forest |
+| 8 | `xgboost` | XGBoost |
+| 9 | `kmeans` | K-Means Clustering |
+| 10 | `svd` | Singular Value Decomposition |
+| 11 | `pca` | Principal Component Analysis |
+| 12 | `lda` | Linear Discriminant Analysis |
natbook-0.1.0/lekhexamprep/__init__.py
ADDED
@@ -0,0 +1,28 @@
+from .linear_regression import run as linear_regression
+from .polynomial_regression import run as polynomial_regression
+from .classification_regression import run as classification_regression
+from .decision_tree import run as decision_tree
+from .linear_svm import run as linear_svm
+from .nonlinear_svm import run as nonlinear_svm
+from .random_forest import run as random_forest
+from .xgboost_model import run as xgboost
+from .kmeans import run as kmeans
+from .svd import run as svd
+from .pca import run as pca
+from .lda import run as lda
+
+__version__ = "0.1.0"
+__all__ = [
+    "linear_regression",
+    "polynomial_regression",
+    "classification_regression",
+    "decision_tree",
+    "linear_svm",
+    "nonlinear_svm",
+    "random_forest",
+    "xgboost",
+    "kmeans",
+    "svd",
+    "pca",
+    "lda",
+]
natbook-0.1.0/lekhexamprep/adaboost.py
ADDED
@@ -0,0 +1,166 @@
+# =============================================================================
+# AdaBoost Classifier
+# Dataset: Breast Cancer Wisconsin — Kaggle
+# https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
+#
+# SETUP:
+#   kaggle datasets download -d uciml/breast-cancer-wisconsin-data --unzip
+#   → this gives you data.csv
+#   OR manually download from Kaggle and place data.csv in the same folder
+# =============================================================================

+# Step 1 - Import Libraries
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.model_selection import train_test_split, cross_val_score
+from sklearn.metrics import (accuracy_score, confusion_matrix,
+                             classification_report, ConfusionMatrixDisplay)
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+# =============================================================================
+# Step 2 - Load and Explore Dataset
+# =============================================================================
+
+df = pd.read_csv('data.csv')   # CSV must be in the same folder as this file
+
+print("Shape   :", df.shape)
+print("\nColumns :", list(df.columns))
+print("\nFirst 5 rows:")
+print(df.head())
+print("\nMissing values:")
+print(df.isnull().sum())
+print("\nClass distribution:")
+print(df['diagnosis'].value_counts())   # M = Malignant, B = Benign
+
+# =============================================================================
+# Step 3 - Preprocess the Data
+# =============================================================================
+
+# Drop columns we don't need
+df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
+
+# Encode target: M → 1, B → 0
+le = LabelEncoder()
+df['diagnosis'] = le.fit_transform(df['diagnosis'])
+
+# Separate features and target
+X = df.drop(columns=['diagnosis']).values
+y = df['diagnosis'].values
+feature_names = df.drop(columns=['diagnosis']).columns.tolist()
+
+# Train-test split (80-20)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42, stratify=y
+)
+
+# Scale features
+scaler = StandardScaler()
+X_train = scaler.fit_transform(X_train)
+X_test  = scaler.transform(X_test)    # only transform, never fit on test data
+
+print("\nTraining samples  :", X_train.shape[0])
+print("Testing samples   :", X_test.shape[0])
+print("Number of features:", X_train.shape[1])
+
+# =============================================================================
+# Step 4 - Train AdaBoost Model
+# =============================================================================
+# AdaBoost = Adaptive Boosting
+# Idea: Train weak learners (stumps) one by one.
+# Each new learner focuses more on the samples the previous one got WRONG.
+# Final prediction = weighted vote of all weak learners.
+
+base_estimator = DecisionTreeClassifier(max_depth=1)   # weak learner = stump
+
+ada = AdaBoostClassifier(
+    estimator     = base_estimator,   # weak learner
+    n_estimators  = 100,              # number of weak learners to train
+    learning_rate = 1.0,              # shrinks contribution of each learner
+    algorithm     = 'SAMME',          # use SAMME (SAMME.R is deprecated)
+    random_state  = 42
+)
+
+ada.fit(X_train, y_train)
+print("\nModel trained successfully!")
+print("Number of estimators:", ada.n_estimators)
+
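+# For intuition, a minimal from-scratch sketch of the boosting loop above
+# (discrete AdaBoost, labels in {-1, +1}). Illustrative only: this is the
+# textbook weight-update rule, not what sklearn runs internally.
+#   err_t   = weighted error of stump t
+#   alpha_t = 0.5 * ln((1 - err_t) / err_t)    # stump's vote weight
+#   w_i    *= exp(-alpha_t * y_i * h_t(x_i))   # upweight the mistakes
+y_pm = np.where(y_train == 1, 1, -1)           # relabel to {-1, +1}
+w = np.full(len(y_pm), 1 / len(y_pm))          # start with uniform weights
+stumps, alphas = [], []
+for t in range(10):                            # a few rounds suffice to see it
+    stump = DecisionTreeClassifier(max_depth=1, random_state=t)
+    stump.fit(X_train, y_pm, sample_weight=w)
+    pred = stump.predict(X_train)
+    err = np.sum(w * (pred != y_pm)) / np.sum(w)   # weighted error rate
+    alpha = 0.5 * np.log((1 - err) / max(err, 1e-12))
+    w *= np.exp(-alpha * y_pm * pred)          # boost the misclassified samples
+    w /= w.sum()                               # renormalise
+    stumps.append(stump)
+    alphas.append(alpha)
+agg = sum(a * s.predict(X_train) for a, s in zip(alphas, stumps))
+print("From-scratch 10-stump train accuracy:",
+      round(np.mean(np.sign(agg) == y_pm), 4))
+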
+# =============================================================================
+# Step 5 - Evaluate the Model
+# =============================================================================
+
+y_pred = ada.predict(X_test)
+
+print("\n========== Evaluation ==========")
+print("Accuracy :", round(accuracy_score(y_test, y_pred), 4))
+
+print("\nClassification Report:")
+print(classification_report(y_test, y_pred, target_names=['Benign', 'Malignant']))
+
+# Cross-validation (5-fold)
+cv_scores = cross_val_score(ada, X, y, cv=5, scoring='accuracy')
+print("5-Fold CV Scores :", cv_scores.round(4))
+print("CV Mean Accuracy :", round(cv_scores.mean(), 4))
+print("CV Std Dev       :", round(cv_scores.std(), 4))
+
+# Confusion Matrix
+cm = confusion_matrix(y_test, y_pred)
+print("\nConfusion Matrix:\n", cm)
+
+disp = ConfusionMatrixDisplay(confusion_matrix=cm,
+                              display_labels=['Benign', 'Malignant'])
+disp.plot(cmap='Oranges')
+plt.title('AdaBoost - Confusion Matrix')
+plt.tight_layout()
+plt.show()
+
+# =============================================================================
+# Step 6 - Feature Importance
+# =============================================================================
+
+importances = ada.feature_importances_
+indices = np.argsort(importances)[::-1]
+top_n = 10
+
+print("\nTop 10 Important Features:")
+for i in range(top_n):
+    print(f"  {i+1}. {feature_names[indices[i]]:35s} {importances[indices[i]]:.4f}")
+
+plt.figure(figsize=(10, 5))
+plt.bar(range(top_n),
+        importances[indices[:top_n]],
+        color='darkorange', edgecolor='black', linewidth=0.6)
+plt.xticks(range(top_n),
+           [feature_names[i] for i in indices[:top_n]],
+           rotation=45, ha='right', fontsize=8)
+plt.title('AdaBoost - Top 10 Feature Importances')
+plt.ylabel('Importance Score')
+plt.tight_layout()
+plt.show()
+
+# =============================================================================
+# Step 7 - Accuracy vs Number of Estimators
+# =============================================================================
+# Shows how AdaBoost improves as it adds more weak learners
+
+train_errors = []
+test_errors = []
+
+for y_pred_train, y_pred_test in zip(
+    ada.staged_predict(X_train),
+    ada.staged_predict(X_test)
+):
+    train_errors.append(1 - accuracy_score(y_train, y_pred_train))
+    test_errors.append(1 - accuracy_score(y_test, y_pred_test))
+
+plt.figure(figsize=(9, 5))
+plt.plot(train_errors, label='Train Error', color='blue', linewidth=2)
+plt.plot(test_errors, label='Test Error', color='red', linewidth=2)
+plt.xlabel('Number of Estimators (Weak Learners)')
+plt.ylabel('Error Rate')
+plt.title('AdaBoost - Error vs Number of Estimators')
+plt.legend()
+plt.tight_layout()
+plt.show()
natbook-0.1.0/lekhexamprep/classification_regression.py
ADDED
@@ -0,0 +1,178 @@
+# =============================================================================
+# Logistic Regression Classifier
+# Dataset: Breast Cancer Wisconsin — Kaggle
+# https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
+#
+# SETUP:
+#   kaggle datasets download -d uciml/breast-cancer-wisconsin-data --unzip
+#   → this gives you data.csv
+#   OR manually download from Kaggle and place data.csv in the same folder
+# =============================================================================

+# Step 1 - Import Libraries
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split, cross_val_score
+from sklearn.metrics import (accuracy_score, confusion_matrix,
+                             classification_report, ConfusionMatrixDisplay,
+                             roc_curve, auc)
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+# =============================================================================
+# Step 2 - Load and Explore Dataset
+# =============================================================================
+
+df = pd.read_csv('data.csv')   # CSV must be in the same folder as this file
+
+print("Shape   :", df.shape)
+print("\nColumns :", list(df.columns))
+print("\nFirst 5 rows:")
+print(df.head())
+print("\nMissing values:")
+print(df.isnull().sum())
+print("\nClass distribution:")
+print(df['diagnosis'].value_counts())   # M = Malignant, B = Benign
+
+# =============================================================================
+# Step 3 - Preprocess the Data
+# =============================================================================
+
+# Drop columns we don't need
+df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
+
+# Encode target: M → 1, B → 0
+le = LabelEncoder()
+df['diagnosis'] = le.fit_transform(df['diagnosis'])
+
+# Separate features and target
+X = df.drop(columns=['diagnosis']).values
+y = df['diagnosis'].values
+feature_names = df.drop(columns=['diagnosis']).columns.tolist()
+
+# Train-test split (80-20)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42, stratify=y
+)
+
+# Scale features — VERY important for Logistic Regression:
+# the solver's iterative optimisation converges slowly/poorly on unscaled features
+scaler = StandardScaler()
+X_train = scaler.fit_transform(X_train)
+X_test  = scaler.transform(X_test)    # only transform, never fit on test data
+
+print("\nTraining samples  :", X_train.shape[0])
+print("Testing samples   :", X_test.shape[0])
+print("Number of features:", X_train.shape[1])
+
+# =============================================================================
+# Step 4 - Train Logistic Regression Model
+# =============================================================================
+# Logistic Regression estimates the probability that a sample belongs to a class
+# using the sigmoid function:
+#
+#   P(y=1 | x) = 1 / (1 + e^(-z))   where z = w.x + b
+#
+# If P >  0.5 → predict class 1 (Malignant)
+# If P <= 0.5 → predict class 0 (Benign)
+
+lr = LogisticRegression(
+    C            = 1.0,      # regularisation strength (smaller C = stronger regularisation)
+    penalty      = 'l2',     # L2 regularisation (Ridge) — prevents overfitting
+    solver       = 'lbfgs',  # optimisation algorithm (works well for small datasets)
+    max_iter     = 1000,     # max iterations for solver to converge
+    random_state = 42
+)
+
+lr.fit(X_train, y_train)
+print("\nModel trained successfully!")
+print("Solver used    :", lr.solver)
+print("Regularisation :", lr.penalty)
+
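+# Quick sanity check (illustrative): predict_proba is exactly the sigmoid
+# from the formula above applied to the learned linear score z = w.x + b
+z = X_test @ lr.coef_[0] + lr.intercept_[0]   # linear score per sample
+manual_prob = 1 / (1 + np.exp(-z))            # sigmoid
+print("Manual sigmoid matches predict_proba:",
+      np.allclose(manual_prob, lr.predict_proba(X_test)[:, 1]))
+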
+# =============================================================================
+# Step 5 - Evaluate the Model
+# =============================================================================
+
+y_pred = lr.predict(X_test)
+y_pred_prob = lr.predict_proba(X_test)[:, 1]   # probability of being Malignant
+
+print("\n========== Evaluation ==========")
+print("Accuracy :", round(accuracy_score(y_test, y_pred), 4))
+
+print("\nClassification Report:")
+print(classification_report(y_test, y_pred, target_names=['Benign', 'Malignant']))
+
+# Cross-validation (5-fold)
+cv_scores = cross_val_score(lr, X, y, cv=5, scoring='accuracy')
+print("5-Fold CV Scores :", cv_scores.round(4))
+print("CV Mean Accuracy :", round(cv_scores.mean(), 4))
+print("CV Std Dev       :", round(cv_scores.std(), 4))
+
+# Confusion Matrix
+cm = confusion_matrix(y_test, y_pred)
+print("\nConfusion Matrix:\n", cm)
+
+disp = ConfusionMatrixDisplay(confusion_matrix=cm,
+                              display_labels=['Benign', 'Malignant'])
+disp.plot(cmap='Blues')
+plt.title('Logistic Regression - Confusion Matrix')
+plt.tight_layout()
+plt.show()
+
+# =============================================================================
+# Step 6 - ROC Curve (unique to probabilistic classifiers like LR)
+# =============================================================================
+# ROC curve shows the tradeoff between True Positive Rate and False Positive Rate
+# AUC (Area Under Curve) closer to 1.0 = better model
+
+fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
+roc_auc = auc(fpr, tpr)
+
+plt.figure(figsize=(7, 5))
+plt.plot(fpr, tpr, color='blue', linewidth=2,
+         label=f'ROC Curve (AUC = {roc_auc:.4f})')
+plt.plot([0, 1], [0, 1], color='grey', linestyle='--', linewidth=1,
+         label='Random Classifier (AUC = 0.5)')
+plt.xlabel('False Positive Rate')
+plt.ylabel('True Positive Rate')
+plt.title('Logistic Regression - ROC Curve')
+plt.legend()
+plt.tight_layout()
+plt.show()
+
+print(f"\nAUC Score: {roc_auc:.4f} (1.0 = perfect, 0.5 = random)")
+
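+# One point of that curve computed by hand (illustrative): TPR and FPR at the
+# default 0.5 threshold; roc_curve simply sweeps this threshold
+pred_at_half = (y_pred_prob >= 0.5).astype(int)
+tn, fp, fn, tp = confusion_matrix(y_test, pred_at_half).ravel()
+print(f"At threshold 0.5: TPR = {tp / (tp + fn):.4f}, FPR = {fp / (fp + tn):.4f}")
+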
+# =============================================================================
+# Step 7 - Feature Coefficients (what LR actually learned)
+# =============================================================================
+# In Logistic Regression, coefficients tell you:
+#   Positive coefficient → feature pushes toward class 1 (Malignant)
+#   Negative coefficient → feature pushes toward class 0 (Benign)
+
+coefficients = lr.coef_[0]
+coef_df = pd.DataFrame({
+    'Feature'    : feature_names,
+    'Coefficient': coefficients
+}).sort_values('Coefficient', ascending=False)
+
+print("\nTop 5 features pushing toward Malignant (positive coeff):")
+print(coef_df.head(5).to_string(index=False))
+
+print("\nTop 5 features pushing toward Benign (negative coeff):")
+print(coef_df.tail(5).to_string(index=False))
+
+# Plot top 10 + bottom 10 coefficients
+top10 = coef_df.head(10)
+bottom10 = coef_df.tail(10)
+plot_df = pd.concat([top10, bottom10])
+
+plt.figure(figsize=(10, 6))
+colors = ['red' if c > 0 else 'blue' for c in plot_df['Coefficient']]
+plt.barh(plot_df['Feature'], plot_df['Coefficient'],
+         color=colors, edgecolor='black', linewidth=0.5)
+plt.axvline(0, color='black', linewidth=0.8)
+plt.xlabel('Coefficient Value')
+plt.title('Logistic Regression - Feature Coefficients\n(Red = Malignant, Blue = Benign)')
+plt.tight_layout()
+plt.show()
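+
+# Reading a coefficient (illustrative): since the features are standardised,
+# each coefficient is the change in log-odds per one std-dev increase of that
+# feature, and exp(coef) converts it to an odds multiplier
+top = coef_df.iloc[0]
+print(f"\n+1 std dev of '{top['Feature']}' multiplies the odds of Malignant "
+      f"by ~{np.exp(top['Coefficient']):.2f}")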
natbook-0.1.0/lekhexamprep/decision_tree.py
ADDED
@@ -0,0 +1,198 @@
+# =============================================================================
+# Decision Tree Classifier
+# Dataset: Breast Cancer Wisconsin — Kaggle
+# https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
+#
+# SETUP:
+#   kaggle datasets download -d uciml/breast-cancer-wisconsin-data --unzip
+#   → this gives you data.csv
+#   OR manually download from Kaggle and place data.csv in the same folder
+# =============================================================================

+# Step 1 - Import Libraries
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
+from sklearn.model_selection import train_test_split, cross_val_score
+from sklearn.metrics import (accuracy_score, confusion_matrix,
+                             classification_report, ConfusionMatrixDisplay)
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+# =============================================================================
+# Step 2 - Load and Explore Dataset
+# =============================================================================
+
+df = pd.read_csv('data.csv')   # CSV must be in the same folder as this file
+
+print("Shape   :", df.shape)
+print("\nColumns :", list(df.columns))
+print("\nFirst 5 rows:")
+print(df.head())
+print("\nMissing values:")
+print(df.isnull().sum())
+print("\nClass distribution:")
+print(df['diagnosis'].value_counts())   # M = Malignant, B = Benign
+
+# =============================================================================
+# Step 3 - Preprocess the Data
+# =============================================================================
+
+# Drop columns we don't need
+df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
+
+# Encode target: M → 1, B → 0
+le = LabelEncoder()
+df['diagnosis'] = le.fit_transform(df['diagnosis'])
+
+# Separate features and target
+X = df.drop(columns=['diagnosis']).values
+y = df['diagnosis'].values
+feature_names = df.drop(columns=['diagnosis']).columns.tolist()
+
+# Train-test split (80-20)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42, stratify=y
+)
+
+# Note: Decision Trees don't strictly need scaling (they split on thresholds)
+# but we scale anyway for consistency and fair comparison with other models
+scaler = StandardScaler()
+X_train = scaler.fit_transform(X_train)
+X_test  = scaler.transform(X_test)
+
+print("\nTraining samples  :", X_train.shape[0])
+print("Testing samples   :", X_test.shape[0])
+print("Number of features:", X_train.shape[1])
+
+# =============================================================================
+# Step 4 - Train Decision Tree Model
+# =============================================================================
+# Decision Tree splits data at each node by asking a question:
+#   "Is feature X > threshold T?"
+# It picks the split that best separates classes using a criterion (Gini / Entropy)
+#
+# Gini Impurity: measures how often a randomly chosen element is misclassified
+#   Gini = 1 - sum(p_i^2)             → 0 means perfectly pure node
+#
+# Entropy (Information Gain): measures disorder
+#   Entropy = -sum(p_i * log2(p_i))   → 0 means perfectly pure node

+dt = DecisionTreeClassifier(
+    criterion         = 'gini',   # split criterion: 'gini' or 'entropy'
+    max_depth         = 5,        # limit tree depth to prevent overfitting
+    min_samples_split = 2,        # min samples needed to split a node
+    min_samples_leaf  = 1,        # min samples required at a leaf node
+    random_state      = 42
+)
+
+dt.fit(X_train, y_train)
+print("\nModel trained successfully!")
+print("Tree depth   :", dt.get_depth())
+print("Num of leaves:", dt.get_n_leaves())
+
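+# The two split criteria by hand (illustrative), evaluated at the root node:
+def gini(labels):
+    p = np.bincount(labels) / len(labels)   # class proportions p_i
+    return 1 - np.sum(p ** 2)               # 0 = perfectly pure
+
+def entropy(labels):
+    p = np.bincount(labels) / len(labels)
+    p = p[p > 0]                            # drop empty classes (log2(0) undefined)
+    return -np.sum(p * np.log2(p))          # 0 = perfectly pure
+
+print(f"Root impurity: Gini = {gini(y_train):.4f}, Entropy = {entropy(y_train):.4f}")
+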
+# =============================================================================
+# Step 5 - Evaluate the Model
+# =============================================================================
+
+y_pred = dt.predict(X_test)
+
+print("\n========== Evaluation ==========")
+print("Accuracy :", round(accuracy_score(y_test, y_pred), 4))
+
+print("\nClassification Report:")
+print(classification_report(y_test, y_pred, target_names=['Benign', 'Malignant']))
+
+# Cross-validation (5-fold)
+cv_scores = cross_val_score(dt, X, y, cv=5, scoring='accuracy')
+print("5-Fold CV Scores :", cv_scores.round(4))
+print("CV Mean Accuracy :", round(cv_scores.mean(), 4))
+print("CV Std Dev       :", round(cv_scores.std(), 4))
+
+# Confusion Matrix
+cm = confusion_matrix(y_test, y_pred)
+print("\nConfusion Matrix:\n", cm)
+
+disp = ConfusionMatrixDisplay(confusion_matrix=cm,
+                              display_labels=['Benign', 'Malignant'])
+disp.plot(cmap='Purples')
+plt.title('Decision Tree - Confusion Matrix')
+plt.tight_layout()
+plt.show()
+
+# =============================================================================
+# Step 6 - Visualise the Decision Tree
+# =============================================================================
+# This is the best part of Decision Trees — you can actually SEE the rules
+
+plt.figure(figsize=(20, 8))
+plot_tree(
+    dt,
+    feature_names = feature_names,
+    class_names   = ['Benign', 'Malignant'],
+    filled        = True,   # colour nodes by class
+    rounded       = True,   # rounded boxes
+    fontsize      = 8
+)
+plt.title('Decision Tree Visualisation (max_depth=5)', fontsize=14, fontweight='bold')
+plt.tight_layout()
+plt.show()
+
+# Print tree as text rules (useful for exam/report)
+print("\n=== Decision Tree Rules (Text) ===")
+tree_rules = export_text(dt, feature_names=feature_names, max_depth=3)
+print(tree_rules)
+
+# =============================================================================
+# Step 7 - Feature Importance
+# =============================================================================
+
+importances = dt.feature_importances_
+indices = np.argsort(importances)[::-1]
+top_n = 10
+
+print("\nTop 10 Important Features:")
+for i in range(top_n):
+    print(f"  {i+1}. {feature_names[indices[i]]:35s} {importances[indices[i]]:.4f}")
+
+plt.figure(figsize=(10, 5))
+plt.bar(range(top_n),
+        importances[indices[:top_n]],
+        color='mediumpurple', edgecolor='black', linewidth=0.6)
+plt.xticks(range(top_n),
+           [feature_names[i] for i in indices[:top_n]],
+           rotation=45, ha='right', fontsize=8)
+plt.title('Decision Tree - Top 10 Feature Importances')
+plt.ylabel('Importance Score (Gini)')
+plt.tight_layout()
+plt.show()
+
+# =============================================================================
+# Step 8 - Effect of max_depth on Accuracy (Overfitting Demo)
+# =============================================================================
+# A tree with no depth limit memorises training data = overfitting
+# This plot shows the sweet spot for max_depth
+
+train_accs = []
+test_accs = []
+depths = range(1, 21)
+
+for d in depths:
+    model = DecisionTreeClassifier(criterion='gini', max_depth=d, random_state=42)
+    model.fit(X_train, y_train)
+    train_accs.append(accuracy_score(y_train, model.predict(X_train)))
+    test_accs.append(accuracy_score(y_test, model.predict(X_test)))
+
+plt.figure(figsize=(9, 5))
+plt.plot(depths, train_accs, marker='o', label='Train Accuracy',
+         color='blue', linewidth=2)
+plt.plot(depths, test_accs, marker='s', label='Test Accuracy',
+         color='red', linewidth=2)
+plt.axvline(5, color='grey', linestyle='--', linewidth=1.2, label='Our max_depth=5')
+plt.xlabel('max_depth')
+plt.ylabel('Accuracy')
+plt.title('Decision Tree - Accuracy vs max_depth\n(shows overfitting as depth increases)')
+plt.legend()
+plt.xticks(depths)
+plt.tight_layout()
+plt.show()