natbook 0.1.0__tar.gz

natbook-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,11 @@
+ Metadata-Version: 2.4
+ Name: natbook
+ Version: 0.1.0
+ Summary: ML exam prep — 12 algorithms ready to run
+ Requires-Python: >=3.8
+ Requires-Dist: numpy
+ Requires-Dist: scikit-learn
+ Requires-Dist: matplotlib
+ Requires-Dist: pandas
+ Provides-Extra: xgboost
+ Requires-Dist: xgboost; extra == "xgboost"
@@ -0,0 +1,50 @@
+ # lekhexamprep
+
+ ML exam prep package — 12 algorithms, one `run()` call each.
+
+ ## Install
+
+ ```bash
+ pip install lekhexamprep # from PyPI (after publishing)
+ # or locally:
+ pip install .
+ ```
+
+ ## Usage
+
+ ```python
+ import lekhexamprep as lep
+
+ lep.linear_regression()
+ lep.polynomial_regression(degree=3)
+ lep.classification_regression()
+ lep.decision_tree(max_depth=3)
+ lep.linear_svm()
+ lep.nonlinear_svm(kernel="rbf")
+ lep.random_forest(n_estimators=100)
+ lep.xgboost()
+ lep.kmeans(k=3)
+ lep.svd()
+ lep.pca(n_components=2)
+ lep.lda(n_components=2)
+
+ # Pass plot=True to any function for a matplotlib visualisation
+ lep.pca(plot=True)
+ ```
+
+ ## Algorithms
+
+ | # | Module | Algorithm |
+ |---|--------|-----------|
+ | 1 | `linear_regression` | Simple Linear Regression |
+ | 2 | `polynomial_regression` | Polynomial Regression (Pipeline) |
+ | 3 | `classification_regression` | Logistic Regression (Classification) |
+ | 4 | `decision_tree` | Decision Tree Classifier |
+ | 5 | `linear_svm` | Linear SVM |
+ | 6 | `nonlinear_svm` | Non-Linear SVM (RBF kernel) |
+ | 7 | `random_forest` | Random Forest |
+ | 8 | `xgboost` | XGBoost |
+ | 9 | `kmeans` | K-Means Clustering |
+ | 10 | `svd` | Singular Value Decomposition |
+ | 11 | `pca` | Principal Component Analysis |
+ | 12 | `lda` | Linear Discriminant Analysis |
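The per-module sources behind this table are not included in the diff, so the exact `run()` signatures are unknown. The sketch below only illustrates the pattern the README and the package `__init__` in the next hunk imply (one self-contained `run()` per algorithm, with an optional `plot` flag); the dataset, model settings, and `random_state` default here are assumptions for illustration, not code from the package.

```python
# Hypothetical shape of one algorithm module, e.g. classification_regression.py
# (illustrative only; the real module sources are not part of this diff).
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import RocCurveDisplay, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def run(plot=False, random_state=42):
    """Train on a bundled dataset, print a score, optionally show a plot."""
    X, y = load_breast_cancer(return_X_y=True)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=random_state)
    model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
    model.fit(X_tr, y_tr)
    print("accuracy:", round(accuracy_score(y_te, model.predict(X_te)), 4))
    if plot:
        RocCurveDisplay.from_estimator(model, X_te, y_te)
        plt.show()
    return model
```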
@@ -0,0 +1,28 @@
+ from .linear_regression import run as linear_regression
+ from .polynomial_regression import run as polynomial_regression
+ from .classification_regression import run as classification_regression
+ from .decision_tree import run as decision_tree
+ from .linear_svm import run as linear_svm
+ from .nonlinear_svm import run as nonlinear_svm
+ from .random_forest import run as random_forest
+ from .xgboost_model import run as xgboost
+ from .kmeans import run as kmeans
+ from .svd import run as svd
+ from .pca import run as pca
+ from .lda import run as lda
+
+ __version__ = "0.1.0"
+ __all__ = [
+     "linear_regression",
+     "polynomial_regression",
+     "classification_regression",
+     "decision_tree",
+     "linear_svm",
+     "nonlinear_svm",
+     "random_forest",
+     "xgboost",
+     "kmeans",
+     "svd",
+     "pca",
+     "lda",
+ ]
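One thing to watch in this `__init__`: `xgboost` is only declared as an optional extra in PKG-INFO, so if `xgboost_model` imports the xgboost library at module level (its source is not shown in this diff), importing the package would fail whenever the extra is not installed. A minimal guarded-import sketch of one way to keep the other eleven algorithms importable; this is a suggestion, not code from the release:

```python
# Sketch only: degrade gracefully when the optional xgboost extra is missing.
try:
    from .xgboost_model import run as xgboost
except ImportError:
    def xgboost(*args, **kwargs):
        raise ImportError(
            "xgboost is an optional dependency; install the 'xgboost' extra "
            "to use lep.xgboost()."
        )
```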
@@ -0,0 +1,166 @@
+ # =============================================================================
+ # AdaBoost Classifier
+ # Dataset: Breast Cancer Wisconsin — Kaggle
+ # https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
+ #
+ # SETUP:
+ # kaggle datasets download -d uciml/breast-cancer-wisconsin-data --unzip
+ # → this gives you data.csv
+ # OR manually download from Kaggle and place data.csv in the same folder
+ # =============================================================================
+
+ # Step 1 - Import Libraries
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ from sklearn.ensemble import AdaBoostClassifier
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.model_selection import train_test_split, cross_val_score
+ from sklearn.metrics import (accuracy_score, confusion_matrix,
+                              classification_report, ConfusionMatrixDisplay)
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+ # =============================================================================
+ # Step 2 - Load and Explore Dataset
+ # =============================================================================
+
+ df = pd.read_csv('data.csv') # CSV must be in the same folder as this file
+
+ print("Shape :", df.shape)
+ print("\nColumns :", list(df.columns))
+ print("\nFirst 5 rows:")
+ print(df.head())
+ print("\nMissing values:")
+ print(df.isnull().sum())
+ print("\nClass distribution:")
+ print(df['diagnosis'].value_counts()) # M = Malignant, B = Benign
+
+ # =============================================================================
+ # Step 3 - Preprocess the Data
+ # =============================================================================
+
+ # Drop columns we don't need
+ df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
+
+ # Encode target: M → 1, B → 0
+ le = LabelEncoder()
+ df['diagnosis'] = le.fit_transform(df['diagnosis'])
+
+ # Separate features and target
+ X = df.drop(columns=['diagnosis']).values
+ y = df['diagnosis'].values
+ feature_names = df.drop(columns=['diagnosis']).columns.tolist()
+
+ # Train-test split (80-20)
+ X_train, X_test, y_train, y_test = train_test_split(
+     X, y, test_size=0.2, random_state=42, stratify=y
+ )
+
+ # Scale features
+ scaler = StandardScaler()
+ X_train = scaler.fit_transform(X_train)
+ X_test = scaler.transform(X_test) # only transform, never fit on test data
+
+ print("\nTraining samples :", X_train.shape[0])
+ print("Testing samples :", X_test.shape[0])
+ print("Number of features:", X_train.shape[1])
+
+ # =============================================================================
+ # Step 4 - Train AdaBoost Model
+ # =============================================================================
+ # AdaBoost = Adaptive Boosting
+ # Idea: Train weak learners (stumps) one by one.
+ # Each new learner focuses more on the samples the previous one got WRONG.
+ # Final prediction = weighted vote of all weak learners.
+
+ base_estimator = DecisionTreeClassifier(max_depth=1) # weak learner = stump
+
+ ada = AdaBoostClassifier(
+     estimator     = base_estimator,  # weak learner
+     n_estimators  = 100,             # number of weak learners to train
+     learning_rate = 1.0,             # shrinks contribution of each learner
+     algorithm     = 'SAMME',         # use SAMME (SAMME.R is deprecated)
+     random_state  = 42
+ )
+
+ ada.fit(X_train, y_train)
+ print("\nModel trained successfully!")
+ print("Number of estimators:", ada.n_estimators)
+
+ # =============================================================================
+ # Step 5 - Evaluate the Model
+ # =============================================================================
+
+ y_pred = ada.predict(X_test)
+
+ print("\n========== Evaluation ==========")
+ print("Accuracy :", round(accuracy_score(y_test, y_pred), 4))
+
+ print("\nClassification Report:")
+ print(classification_report(y_test, y_pred, target_names=['Benign', 'Malignant']))
+
+ # Cross-validation (5-fold)
+ cv_scores = cross_val_score(ada, X, y, cv=5, scoring='accuracy')
+ print("5-Fold CV Scores :", cv_scores.round(4))
+ print("CV Mean Accuracy :", round(cv_scores.mean(), 4))
+ print("CV Std Dev :", round(cv_scores.std(), 4))
+
+ # Confusion Matrix
+ cm = confusion_matrix(y_test, y_pred)
+ print("\nConfusion Matrix:\n", cm)
+
+ disp = ConfusionMatrixDisplay(confusion_matrix=cm,
+                               display_labels=['Benign', 'Malignant'])
+ disp.plot(cmap='Oranges')
+ plt.title('AdaBoost - Confusion Matrix')
+ plt.tight_layout()
+ plt.show()
+
+ # =============================================================================
+ # Step 6 - Feature Importance
+ # =============================================================================
+
+ importances = ada.feature_importances_
+ indices = np.argsort(importances)[::-1]
+ top_n = 10
+
+ print("\nTop 10 Important Features:")
+ for i in range(top_n):
+     print(f" {i+1}. {feature_names[indices[i]]:35s} {importances[indices[i]]:.4f}")
+
+ plt.figure(figsize=(10, 5))
+ plt.bar(range(top_n),
+         importances[indices[:top_n]],
+         color='darkorange', edgecolor='black', linewidth=0.6)
+ plt.xticks(range(top_n),
+            [feature_names[i] for i in indices[:top_n]],
+            rotation=45, ha='right', fontsize=8)
+ plt.title('AdaBoost - Top 10 Feature Importances')
+ plt.ylabel('Importance Score')
+ plt.tight_layout()
+ plt.show()
+
+ # =============================================================================
+ # Step 7 - Error vs Number of Estimators
+ # =============================================================================
+ # Shows how AdaBoost improves as it adds more weak learners
+
+ train_errors = []
+ test_errors = []
+
+ for y_pred_train, y_pred_test in zip(
+         ada.staged_predict(X_train),
+         ada.staged_predict(X_test)
+ ):
+     train_errors.append(1 - accuracy_score(y_train, y_pred_train))
+     test_errors.append(1 - accuracy_score(y_test, y_pred_test))
+
+ plt.figure(figsize=(9, 5))
+ plt.plot(train_errors, label='Train Error', color='blue', linewidth=2)
+ plt.plot(test_errors, label='Test Error', color='red', linewidth=2)
+ plt.xlabel('Number of Estimators (Weak Learners)')
+ plt.ylabel('Error Rate')
+ plt.title('AdaBoost - Error vs Number of Estimators')
+ plt.legend()
+ plt.tight_layout()
+ plt.show()
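To make the re-weighting idea described in Step 4 concrete, here is a minimal NumPy sketch of one SAMME-style boosting round, the kind of update scikit-learn's `AdaBoostClassifier` performs internally. The toy labels and stump predictions are made up for illustration:

```python
import numpy as np

def adaboost_round(w, y, preds, n_classes=2):
    """One SAMME boosting round: updated sample weights plus the learner's vote weight."""
    miss = (preds != y).astype(float)                         # 1 where the stump was wrong
    err = np.clip(np.sum(w * miss) / np.sum(w), 1e-10, 1 - 1e-10)
    alpha = np.log((1 - err) / err) + np.log(n_classes - 1)   # learner weight in the final vote
    w = w * np.exp(alpha * miss)                              # boost the misclassified samples
    return w / w.sum(), alpha

y     = np.array([0, 1, 1, 0, 1])                             # toy labels
preds = np.array([0, 1, 0, 0, 0])                             # a weak stump's predictions
w0    = np.full(len(y), 1 / len(y))                           # uniform starting weights
w1, alpha = adaboost_round(w0, y, preds)
print(np.round(w1, 3), round(alpha, 3))                       # wrong samples now carry more weight
```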
@@ -0,0 +1,178 @@
+ # =============================================================================
+ # Logistic Regression Classifier
+ # Dataset: Breast Cancer Wisconsin — Kaggle
+ # https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
+ #
+ # SETUP:
+ # kaggle datasets download -d uciml/breast-cancer-wisconsin-data --unzip
+ # → this gives you data.csv
+ # OR manually download from Kaggle and place data.csv in the same folder
+ # =============================================================================
+
+ # Step 1 - Import Libraries
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.model_selection import train_test_split, cross_val_score
+ from sklearn.metrics import (accuracy_score, confusion_matrix,
+                              classification_report, ConfusionMatrixDisplay,
+                              roc_curve, auc)
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
+ from sklearn.pipeline import make_pipeline   # used to scale inside cross-validation below
+
+ # =============================================================================
+ # Step 2 - Load and Explore Dataset
+ # =============================================================================
+
+ df = pd.read_csv('data.csv') # CSV must be in the same folder as this file
+
+ print("Shape :", df.shape)
+ print("\nColumns :", list(df.columns))
+ print("\nFirst 5 rows:")
+ print(df.head())
+ print("\nMissing values:")
+ print(df.isnull().sum())
+ print("\nClass distribution:")
+ print(df['diagnosis'].value_counts()) # M = Malignant, B = Benign
+
+ # =============================================================================
+ # Step 3 - Preprocess the Data
+ # =============================================================================
+
+ # Drop columns we don't need
+ df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
+
+ # Encode target: M → 1, B → 0
+ le = LabelEncoder()
+ df['diagnosis'] = le.fit_transform(df['diagnosis'])
+
+ # Separate features and target
+ X = df.drop(columns=['diagnosis']).values
+ y = df['diagnosis'].values
+ feature_names = df.drop(columns=['diagnosis']).columns.tolist()
+
+ # Train-test split (80-20)
+ X_train, X_test, y_train, y_test = train_test_split(
+     X, y, test_size=0.2, random_state=42, stratify=y
+ )
+
+ # Scale features — VERY important for Logistic Regression
+ # The lbfgs solver converges faster and more reliably when features are on a similar scale
+ scaler = StandardScaler()
+ X_train = scaler.fit_transform(X_train)
+ X_test = scaler.transform(X_test) # only transform, never fit on test data
+
+ print("\nTraining samples :", X_train.shape[0])
+ print("Testing samples :", X_test.shape[0])
+ print("Number of features:", X_train.shape[1])
+
+ # =============================================================================
+ # Step 4 - Train Logistic Regression Model
+ # =============================================================================
+ # Logistic Regression estimates the probability that a sample belongs to a class
+ # using the sigmoid function:
+ #
+ # P(y=1 | x) = 1 / (1 + e^(-z)) where z = w.x + b
+ #
+ # If P > 0.5 → predict class 1 (Malignant)
+ # If P <= 0.5 → predict class 0 (Benign)
+
+ lr = LogisticRegression(
+     C           = 1.0,      # inverse regularisation strength (smaller C = stronger regularisation)
+     penalty     = 'l2',     # L2 regularisation (Ridge) — prevents overfitting
+     solver      = 'lbfgs',  # optimisation algorithm (works well for small datasets)
+     max_iter    = 1000,     # max iterations for solver to converge
+     random_state= 42
+ )
+
+ lr.fit(X_train, y_train)
+ print("\nModel trained successfully!")
+ print("Solver used :", lr.solver)
+ print("Regularisation :", lr.penalty)
+
+ # =============================================================================
+ # Step 5 - Evaluate the Model
+ # =============================================================================
+
+ y_pred = lr.predict(X_test)
+ y_pred_prob = lr.predict_proba(X_test)[:, 1] # probability of being Malignant
+
+ print("\n========== Evaluation ==========")
+ print("Accuracy :", round(accuracy_score(y_test, y_pred), 4))
+
+ print("\nClassification Report:")
+ print(classification_report(y_test, y_pred, target_names=['Benign', 'Malignant']))
+
+ # Cross-validation (5-fold)
+ cv_scores = cross_val_score(make_pipeline(StandardScaler(), lr), X, y,
+                             cv=5, scoring='accuracy')   # scale inside each fold, as in the split above
+ print("5-Fold CV Scores :", cv_scores.round(4))
+ print("CV Mean Accuracy :", round(cv_scores.mean(), 4))
+ print("CV Std Dev :", round(cv_scores.std(), 4))
+
+ # Confusion Matrix
+ cm = confusion_matrix(y_test, y_pred)
+ print("\nConfusion Matrix:\n", cm)
+
+ disp = ConfusionMatrixDisplay(confusion_matrix=cm,
+                               display_labels=['Benign', 'Malignant'])
+ disp.plot(cmap='Blues')
+ plt.title('Logistic Regression - Confusion Matrix')
+ plt.tight_layout()
+ plt.show()
+
+ # =============================================================================
+ # Step 6 - ROC Curve (needs a classifier that outputs probabilities, like LR)
+ # =============================================================================
+ # ROC curve shows the tradeoff between True Positive Rate and False Positive Rate
+ # AUC (Area Under Curve) closer to 1.0 = better model
+
+ fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
+ roc_auc = auc(fpr, tpr)
+
+ plt.figure(figsize=(7, 5))
+ plt.plot(fpr, tpr, color='blue', linewidth=2,
+          label=f'ROC Curve (AUC = {roc_auc:.4f})')
+ plt.plot([0, 1], [0, 1], color='grey', linestyle='--', linewidth=1,
+          label='Random Classifier (AUC = 0.5)')
+ plt.xlabel('False Positive Rate')
+ plt.ylabel('True Positive Rate')
+ plt.title('Logistic Regression - ROC Curve')
+ plt.legend()
+ plt.tight_layout()
+ plt.show()
+
+ print(f"\nAUC Score: {roc_auc:.4f} (1.0 = perfect, 0.5 = random)")
+
+ # =============================================================================
+ # Step 7 - Feature Coefficients (what LR actually learned)
+ # =============================================================================
+ # In Logistic Regression, coefficients tell you:
+ # Positive coefficient → feature pushes toward class 1 (Malignant)
+ # Negative coefficient → feature pushes toward class 0 (Benign)
+
+ coefficients = lr.coef_[0]
+ coef_df = pd.DataFrame({
+     'Feature'    : feature_names,
+     'Coefficient': coefficients
+ }).sort_values('Coefficient', ascending=False)
+
+ print("\nTop 5 features pushing toward Malignant (positive coeff):")
+ print(coef_df.head(5).to_string(index=False))
+
+ print("\nTop 5 features pushing toward Benign (negative coeff):")
+ print(coef_df.tail(5).to_string(index=False))
+
+ # Plot top 10 + bottom 10 coefficients
+ top10 = coef_df.head(10)
+ bottom10 = coef_df.tail(10)
+ plot_df = pd.concat([top10, bottom10])
+
+ plt.figure(figsize=(10, 6))
+ colors = ['red' if c > 0 else 'blue' for c in plot_df['Coefficient']]
+ plt.barh(plot_df['Feature'], plot_df['Coefficient'],
+          color=colors, edgecolor='black', linewidth=0.5)
+ plt.axvline(0, color='black', linewidth=0.8)
+ plt.xlabel('Coefficient Value')
+ plt.title('Logistic Regression - Feature Coefficients\n(Red = Malignant, Blue = Benign)')
+ plt.tight_layout()
+ plt.show()
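To tie the sigmoid formula in Step 4 to what the fitted model actually computes, here is a small check that can be appended to the script above; it reuses `lr`, the scaled `X_test`, and `y_pred` from that script:

```python
import numpy as np

z = X_test @ lr.coef_[0] + lr.intercept_[0]      # z = w.x + b for every test sample
p_manual = 1.0 / (1.0 + np.exp(-z))              # P(y=1 | x) via the sigmoid
p_sklearn = lr.predict_proba(X_test)[:, 1]

print(np.allclose(p_manual, p_sklearn))                       # should print True: same probabilities
print(np.array_equal((p_manual > 0.5).astype(int), y_pred))   # manual 0.5 threshold = lr.predict()
```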
@@ -0,0 +1,198 @@
+ # =============================================================================
+ # Decision Tree Classifier
+ # Dataset: Breast Cancer Wisconsin — Kaggle
+ # https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data
+ #
+ # SETUP:
+ # kaggle datasets download -d uciml/breast-cancer-wisconsin-data --unzip
+ # → this gives you data.csv
+ # OR manually download from Kaggle and place data.csv in the same folder
+ # =============================================================================
+
+ # Step 1 - Import Libraries
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
+ from sklearn.model_selection import train_test_split, cross_val_score
+ from sklearn.metrics import (accuracy_score, confusion_matrix,
+                              classification_report, ConfusionMatrixDisplay)
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+ # =============================================================================
+ # Step 2 - Load and Explore Dataset
+ # =============================================================================
+
+ df = pd.read_csv('data.csv') # CSV must be in the same folder as this file
+
+ print("Shape :", df.shape)
+ print("\nColumns :", list(df.columns))
+ print("\nFirst 5 rows:")
+ print(df.head())
+ print("\nMissing values:")
+ print(df.isnull().sum())
+ print("\nClass distribution:")
+ print(df['diagnosis'].value_counts()) # M = Malignant, B = Benign
+
+ # =============================================================================
+ # Step 3 - Preprocess the Data
+ # =============================================================================
+
+ # Drop columns we don't need
+ df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
+
+ # Encode target: M → 1, B → 0
+ le = LabelEncoder()
+ df['diagnosis'] = le.fit_transform(df['diagnosis'])
+
+ # Separate features and target
+ X = df.drop(columns=['diagnosis']).values
+ y = df['diagnosis'].values
+ feature_names = df.drop(columns=['diagnosis']).columns.tolist()
+
+ # Train-test split (80-20)
+ X_train, X_test, y_train, y_test = train_test_split(
+     X, y, test_size=0.2, random_state=42, stratify=y
+ )
+
+ # Note: Decision Trees don't strictly need scaling (they split on thresholds)
+ # but we scale anyway for consistency and fair comparison with other models
+ scaler = StandardScaler()
+ X_train = scaler.fit_transform(X_train)
+ X_test = scaler.transform(X_test)
+
+ print("\nTraining samples :", X_train.shape[0])
+ print("Testing samples :", X_test.shape[0])
+ print("Number of features:", X_train.shape[1])
+
+ # =============================================================================
+ # Step 4 - Train Decision Tree Model
+ # =============================================================================
+ # Decision Tree splits data at each node by asking a question:
+ # "Is feature X > threshold T?"
+ # It picks the split that best separates classes using a criterion (Gini / Entropy)
+ #
+ # Gini Impurity: measures how often a randomly chosen element is misclassified
+ # Gini = 1 - sum(p_i^2) → 0 means perfectly pure node
+ #
+ # Entropy (Information Gain): measures disorder
+ # Entropy = -sum(p_i * log2(p_i)) → 0 means perfectly pure node
+
+ dt = DecisionTreeClassifier(
+     criterion         = 'gini',  # split criterion: 'gini' or 'entropy'
+     max_depth         = 5,       # limit tree depth to prevent overfitting
+     min_samples_split = 2,       # min samples needed to split a node
+     min_samples_leaf  = 1,       # min samples required at a leaf node
+     random_state      = 42
+ )
+
+ dt.fit(X_train, y_train)
+ print("\nModel trained successfully!")
+ print("Tree depth :", dt.get_depth())
+ print("Num of leaves:", dt.get_n_leaves())
+
+ # =============================================================================
+ # Step 5 - Evaluate the Model
+ # =============================================================================
+
+ y_pred = dt.predict(X_test)
+
+ print("\n========== Evaluation ==========")
+ print("Accuracy :", round(accuracy_score(y_test, y_pred), 4))
+
+ print("\nClassification Report:")
+ print(classification_report(y_test, y_pred, target_names=['Benign', 'Malignant']))
+
+ # Cross-validation (5-fold)
+ cv_scores = cross_val_score(dt, X, y, cv=5, scoring='accuracy')
+ print("5-Fold CV Scores :", cv_scores.round(4))
+ print("CV Mean Accuracy :", round(cv_scores.mean(), 4))
+ print("CV Std Dev :", round(cv_scores.std(), 4))
+
+ # Confusion Matrix
+ cm = confusion_matrix(y_test, y_pred)
+ print("\nConfusion Matrix:\n", cm)
+
+ disp = ConfusionMatrixDisplay(confusion_matrix=cm,
+                               display_labels=['Benign', 'Malignant'])
+ disp.plot(cmap='Purples')
+ plt.title('Decision Tree - Confusion Matrix')
+ plt.tight_layout()
+ plt.show()
+
+ # =============================================================================
+ # Step 6 - Visualise the Decision Tree
+ # =============================================================================
+ # This is the best part of Decision Trees — you can actually SEE the rules
+
+ plt.figure(figsize=(20, 8))
+ plot_tree(
+     dt,
+     feature_names = feature_names,
+     class_names   = ['Benign', 'Malignant'],
+     filled        = True,  # colour nodes by class
+     rounded       = True,  # rounded boxes
+     fontsize      = 8
+ )
+ plt.title('Decision Tree Visualisation (max_depth=5)', fontsize=14, fontweight='bold')
+ plt.tight_layout()
+ plt.show()
+
+ # Print tree as text rules (useful for exam/report)
+ print("\n=== Decision Tree Rules (Text) ===")
+ tree_rules = export_text(dt, feature_names=feature_names, max_depth=3)
+ print(tree_rules)
+
+ # =============================================================================
+ # Step 7 - Feature Importance
+ # =============================================================================
+
+ importances = dt.feature_importances_
+ indices = np.argsort(importances)[::-1]
+ top_n = 10
+
+ print("\nTop 10 Important Features:")
+ for i in range(top_n):
+     print(f" {i+1}. {feature_names[indices[i]]:35s} {importances[indices[i]]:.4f}")
+
+ plt.figure(figsize=(10, 5))
+ plt.bar(range(top_n),
+         importances[indices[:top_n]],
+         color='mediumpurple', edgecolor='black', linewidth=0.6)
+ plt.xticks(range(top_n),
+            [feature_names[i] for i in indices[:top_n]],
+            rotation=45, ha='right', fontsize=8)
+ plt.title('Decision Tree - Top 10 Feature Importances')
+ plt.ylabel('Importance Score (Gini)')
+ plt.tight_layout()
+ plt.show()
+
+ # =============================================================================
+ # Step 8 - Effect of max_depth on Accuracy (Overfitting Demo)
+ # =============================================================================
+ # A tree with no depth limit memorises training data = overfitting
+ # This plot shows the sweet spot for max_depth
+
+ train_accs = []
+ test_accs = []
+ depths = range(1, 21)
+
+ for d in depths:
+     model = DecisionTreeClassifier(criterion='gini', max_depth=d, random_state=42)
+     model.fit(X_train, y_train)
+     train_accs.append(accuracy_score(y_train, model.predict(X_train)))
+     test_accs.append(accuracy_score(y_test, model.predict(X_test)))
+
+ plt.figure(figsize=(9, 5))
+ plt.plot(depths, train_accs, marker='o', label='Train Accuracy',
+          color='blue', linewidth=2)
+ plt.plot(depths, test_accs, marker='s', label='Test Accuracy',
+          color='red', linewidth=2)
+ plt.axvline(5, color='grey', linestyle='--', linewidth=1.2, label='Our max_depth=5')
+ plt.xlabel('max_depth')
+ plt.ylabel('Accuracy')
+ plt.title('Decision Tree - Accuracy vs max_depth\n(shows overfitting as depth increases)')
+ plt.legend()
+ plt.xticks(depths)
+ plt.tight_layout()
+ plt.show()
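As a small numeric companion to the Gini and entropy definitions in Step 4, here is a sketch that evaluates both impurity measures for a node's class proportions:

```python
import numpy as np

def gini(p):
    """Gini impurity for class proportions p: 1 - sum(p_i^2)."""
    p = np.asarray(p, dtype=float)
    return 1.0 - np.sum(p ** 2)

def entropy(p):
    """Entropy for class proportions p: -sum(p_i * log2(p_i)), treating 0*log2(0) as 0."""
    p = np.asarray(p, dtype=float)
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

print(gini([0.5, 0.5]), entropy([0.5, 0.5]))   # most impure 2-class node: 0.5 and 1.0
print(gini([1.0, 0.0]), entropy([1.0, 0.0]))   # perfectly pure node: 0.0 and 0.0
print(gini([0.9, 0.1]), entropy([0.9, 0.1]))   # mostly pure node: low impurity
```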