bm-preprocessing 0.6.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/PKG-INFO +1 -1
  2. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/USAGE.md +10 -1
  3. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/pyproject.toml +1 -1
  4. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/__init__.py +2 -1
  5. bm_preprocessing-0.7.0/src/bm_preprocessing/DM/metrics.py +30 -0
  6. bm_preprocessing-0.7.0/src/bm_preprocessing/DM/sources/metrics.py +240 -0
  7. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/.gitignore +0 -0
  8. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/README.md +0 -0
  9. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/adaboost.py +0 -0
  10. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/all.py +0 -0
  11. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/apriori.py +0 -0
  12. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/bagging.py +0 -0
  13. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/hash.py +0 -0
  14. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/hunts.py +0 -0
  15. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/hunts_test.py +0 -0
  16. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/id3.py +0 -0
  17. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/id3_test.py +0 -0
  18. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/preprocessing.py +0 -0
  19. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/sources/adaboost.py +0 -0
  20. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/sources/all.py +0 -0
  21. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/sources/apriori.py +0 -0
  22. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/sources/bagging.py +0 -0
  23. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/sources/data.csv +0 -0
  24. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/sources/hash.py +0 -0
  25. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/sources/hunts.py +0 -0
  26. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/sources/hunts_test.py +0 -0
  27. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/sources/id3.py +0 -0
  28. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/sources/id3_test.py +0 -0
  29. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/sources/preprocessing.py +0 -0
  30. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/DM/sources/tennis.csv +0 -0
  31. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/IR/__init__.py +0 -0
  32. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/IR/all.py +0 -0
  33. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/IR/sources/all.py +0 -0
  34. {bm_preprocessing-0.6.0 → bm_preprocessing-0.7.0}/src/bm_preprocessing/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bm-preprocessing
3
- Version: 0.6.0
3
+ Version: 0.7.0
4
4
  Summary: A package to preprocess text data
5
5
  Requires-Python: >=3.8
6
6
  Requires-Dist: build>=1.2.2.post1
@@ -15,7 +15,7 @@ Create a file `example.py`:
15
15
  ```python
16
16
  # Import modules
17
17
  from bm_preprocessing.IR import all
18
- from bm_preprocessing.DM import adaboost, apriori, bagging, hash, hunts, hunts_test, id3, id3_test, preprocessing
18
+ from bm_preprocessing.DM import adaboost, apriori, bagging, hash, hunts, hunts_test, id3, id3_test, metrics, preprocessing
19
19
 
20
20
  # Print the source code
21
21
  print("=== IR All Module ===")
@@ -45,6 +45,9 @@ print(id3)
45
45
  print("\n=== DM ID3 Test Module ===")
46
46
  print(id3_test)
47
47
 
48
+ print("\n=== DM Metrics Module ===")
49
+ print(metrics)
50
+
48
51
  print("\n=== DM Preprocessing Module ===")
49
52
  print(preprocessing)
50
53
  ```
@@ -92,6 +95,10 @@ Then in the Python REPL:
92
95
  # Prints entire DM/id3.py source code
93
96
  >>> print(id3_test)
94
97
  # Prints entire DM/id3_test.py source code
98
+
99
+ >>> from bm_preprocessing.DM import metrics
100
+ >>> print(metrics)
101
+ # Prints entire DM/metrics.py source code
95
102
  ```
96
103
 
97
104
  ---
@@ -108,6 +115,7 @@ python -c "from bm_preprocessing.DM import hunts; print(hunts)"
108
115
  python -c "from bm_preprocessing.DM import hunts_test; print(hunts_test)"
109
116
  python -c "from bm_preprocessing.DM import id3; print(id3)"
110
117
  python -c "from bm_preprocessing.DM import id3_test; print(id3_test)"
118
+ python -c "from bm_preprocessing.DM import metrics; print(metrics)"
111
119
  python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
112
120
  ```
113
121
 
@@ -127,4 +135,5 @@ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
127
135
  | `from bm_preprocessing.DM import hunts_test` | Hunt's decision tree with visualization |
128
136
  | `from bm_preprocessing.DM import id3` | ID3 decision tree algorithm |
129
137
  | `from bm_preprocessing.DM import id3_test` | ID3 decision tree with visualization |
138
+ | `from bm_preprocessing.DM import metrics` | Classification metrics & curves |
130
139
  | `from bm_preprocessing.DM import preprocessing` | Data preprocessing utilities |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "bm-preprocessing"
7
- version = "0.6.0"
7
+ version = "0.7.0"
8
8
  description = "A package to preprocess text data"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -9,7 +9,8 @@ from .hunts import hunts
9
9
  from .hunts_test import hunts_test
10
10
  from .id3 import id3
11
11
  from .id3_test import id3_test
12
+ from .metrics import metrics
12
13
  from .preprocessing import preprocessing
13
14
 
14
- __all__ = ["adaboost", "all", "apriori", "bagging", "hash", "hunts", "hunts_test", "id3", "id3_test", "preprocessing"]
15
+ __all__ = ["adaboost", "all", "apriori", "bagging", "hash", "hunts", "hunts_test", "id3", "id3_test", "metrics", "preprocessing"]
15
16
 
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/metrics.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding="utf-8")
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "metrics.py"
30
+ metrics = SourceCodeModule("DM.metrics", _source_file)
@@ -0,0 +1,240 @@
1
+ import numpy as np
2
+ from sklearn.datasets import load_iris
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.tree import DecisionTreeClassifier
5
+ from sklearn.metrics import (
6
+ accuracy_score,
7
+ confusion_matrix,
8
+ f1_score,
9
+ precision_score,
10
+ recall_score,
11
+ roc_curve,
12
+ auc,
13
+ precision_recall_curve,
14
+ classification_report,
15
+ )
16
+ from sklearn.preprocessing import label_binarize
17
+ import matplotlib.pyplot as plt
18
+
19
+ # ==========================================
20
+ # Load dataset and train a classifier
21
+ # ==========================================
22
+
23
+ iris = load_iris()
24
+ X = iris.data
25
+ y = iris.target
26
+
27
+ X_train, X_test, y_train, y_test = train_test_split(
28
+ X, y, test_size=0.3, random_state=42, stratify=y
29
+ )
30
+
31
+ clf = DecisionTreeClassifier(random_state=42)
32
+ clf.fit(X_train, y_train)
33
+ y_pred = clf.predict(X_test)
34
+ y_proba = clf.predict_proba(X_test)
35
+
36
+ # ==========================================
37
+ # 1. ACCURACY
38
+ # ==========================================
39
+ # Formula:
40
+ # Accuracy = (TP + TN) / (TP + TN + FP + FN)
41
+ # OR equivalently:
42
+ # Accuracy = Number of Correct Predictions / Total Number of Predictions
43
+ #
44
+ # Where:
45
+ # TP = True Positives (correctly predicted positive)
46
+ # TN = True Negatives (correctly predicted negative)
47
+ # FP = False Positives (incorrectly predicted positive, Type I error)
48
+ # FN = False Negatives (incorrectly predicted negative, Type II error)
49
+
50
+ acc = accuracy_score(y_test, y_pred)
51
+ print("=" * 50)
52
+ print("1. ACCURACY")
53
+ print("=" * 50)
54
+ print(f"Accuracy: {acc:.4f}")
55
+
56
+ # ==========================================
57
+ # 2. CONFUSION MATRIX
58
+ # ==========================================
59
+ # The confusion matrix is a table that describes the performance of a classifier.
60
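+ # For binary classification (textbook layout, positive class first; note that sklearn's confusion_matrix orders rows/columns by sorted label, i.e. [[TN, FP], [FN, TP]] for labels 0/1):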
61
+ #
62
+ #                    Predicted Positive    Predicted Negative
63
+ # Actual Positive           TP                    FN
64
+ # Actual Negative           FP                    TN
65
+ #
66
+ # For multiclass: C[i][j] = number of samples with true label i predicted as label j
67
+
68
+ cm = confusion_matrix(y_test, y_pred)
69
+ print("\n" + "=" * 50)
70
+ print("2. CONFUSION MATRIX")
71
+ print("=" * 50)
72
+ print(f"\n{cm}")
73
+ print(f"\nLabels: {iris.target_names}")
74
+
75
+ # Plot Confusion Matrix
76
+ fig, ax = plt.subplots(figsize=(8, 6))
77
+ im = ax.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
78
+ ax.set_title("Confusion Matrix", fontsize=14, fontweight="bold")
79
+ plt.colorbar(im, ax=ax)
80
+ tick_marks = np.arange(len(iris.target_names))
81
+ ax.set_xticks(tick_marks)
82
+ ax.set_xticklabels(iris.target_names, rotation=45, ha="right")
83
+ ax.set_yticks(tick_marks)
84
+ ax.set_yticklabels(iris.target_names)
85
+
86
+ # Add text annotations to the confusion matrix
87
+ thresh = cm.max() / 2.0
88
+ for i in range(cm.shape[0]):
89
+ for j in range(cm.shape[1]):
90
+ ax.text(
91
+ j, i, format(cm[i, j], "d"),
92
+ ha="center", va="center",
93
+ color="white" if cm[i, j] > thresh else "black",
94
+ )
95
+
96
+ ax.set_ylabel("True Label", fontsize=12)
97
+ ax.set_xlabel("Predicted Label", fontsize=12)
98
+ plt.tight_layout()
99
+ plt.savefig("confusion_matrix.png", dpi=150)
100
+ plt.show()
101
+ print("Confusion matrix plot saved as 'confusion_matrix.png'")
102
+
103
+ # ==========================================
104
+ # 3. PRECISION
105
+ # ==========================================
106
+ # Formula:
107
+ # Precision = TP / (TP + FP)
108
+ #
109
+ # Precision answers: "Of all instances predicted as positive, how many are actually positive?"
110
+ # High precision = few false positives among the instances predicted as positive
111
+
112
+ prec = precision_score(y_test, y_pred, average="weighted")
113
+ prec_per_class = precision_score(y_test, y_pred, average=None)
114
+ print("\n" + "=" * 50)
115
+ print("3. PRECISION")
116
+ print("=" * 50)
117
+ print(f"Weighted Precision: {prec:.4f}")
118
+ for i, name in enumerate(iris.target_names):
119
+ print(f" {name}: {prec_per_class[i]:.4f}")
120
+
121
+ # ==========================================
122
+ # 4. RECALL (Sensitivity / True Positive Rate)
123
+ # ==========================================
124
+ # Formula:
125
+ # Recall = TP / (TP + FN)
126
+ #
127
+ # Recall answers: "Of all actual positive instances, how many did we correctly predict?"
128
+ # Also known as Sensitivity or True Positive Rate (TPR)
129
+ # High recall = low false negative rate
130
+
131
+ rec = recall_score(y_test, y_pred, average="weighted")
132
+ rec_per_class = recall_score(y_test, y_pred, average=None)
133
+ print("\n" + "=" * 50)
134
+ print("4. RECALL (Sensitivity / TPR)")
135
+ print("=" * 50)
136
+ print(f"Weighted Recall: {rec:.4f}")
137
+ for i, name in enumerate(iris.target_names):
138
+ print(f" {name}: {rec_per_class[i]:.4f}")
139
+
140
+ # ==========================================
141
+ # 5. F1 SCORE
142
+ # ==========================================
143
+ # Formula:
144
+ # F1 = 2 * (Precision * Recall) / (Precision + Recall)
145
+ #
146
+ # F1 Score is the harmonic mean of Precision and Recall.
147
+ # It provides a balance between Precision and Recall.
148
+ # Range: 0 (worst) to 1 (best)
149
+ # Use F1 when you need a balance between Precision and Recall,
150
+ # especially with imbalanced datasets.
151
+
152
+ f1 = f1_score(y_test, y_pred, average="weighted")
153
+ f1_per_class = f1_score(y_test, y_pred, average=None)
154
+ print("\n" + "=" * 50)
155
+ print("5. F1 SCORE")
156
+ print("=" * 50)
157
+ print(f"Weighted F1 Score: {f1:.4f}")
158
+ for i, name in enumerate(iris.target_names):
159
+ print(f" {name}: {f1_per_class[i]:.4f}")
160
+
161
+ # ==========================================
162
+ # Full Classification Report
163
+ # ==========================================
164
+ print("\n" + "=" * 50)
165
+ print("FULL CLASSIFICATION REPORT")
166
+ print("=" * 50)
167
+ print(classification_report(y_test, y_pred, target_names=iris.target_names))
168
+
169
+ # ==========================================
170
+ # 6. ROC CURVE (One-vs-Rest for multiclass)
171
+ # ==========================================
172
+ # ROC = Receiver Operating Characteristic
173
+ #
174
+ # The ROC curve plots:
175
+ # X-axis: False Positive Rate (FPR) = FP / (FP + TN)
176
+ # Y-axis: True Positive Rate (TPR) = TP / (TP + FN) (same as Recall)
177
+ #
178
+ # AUC (Area Under the ROC Curve):
179
+ # AUC = 1.0 means perfect classifier
180
+ # AUC = 0.5 means random classifier (diagonal line)
181
+ # AUC < 0.5 means worse than random
182
+ #
183
+ # For multiclass, we use the One-vs-Rest (OvR) strategy: each class in turn is treated as positive and all other classes as negative
184
+
185
+ y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
186
+ n_classes = y_test_bin.shape[1]
187
+
188
+ fig, ax = plt.subplots(figsize=(8, 6))
189
+ colors = ["#e74c3c", "#2ecc71", "#3498db"]
190
+
191
+ for i in range(n_classes):
192
+ fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_proba[:, i])
193
+ roc_auc = auc(fpr, tpr)
194
+ ax.plot(fpr, tpr, color=colors[i], lw=2,
195
+ label=f"{iris.target_names[i]} (AUC = {roc_auc:.2f})")
196
+
197
+ ax.plot([0, 1], [0, 1], "k--", lw=1, label="Random (AUC = 0.50)")
198
+ ax.set_xlim([0.0, 1.0])
199
+ ax.set_ylim([0.0, 1.05])
200
+ ax.set_xlabel("False Positive Rate (FPR)", fontsize=12)
201
+ ax.set_ylabel("True Positive Rate (TPR)", fontsize=12)
202
+ ax.set_title("ROC Curve (One-vs-Rest)", fontsize=14, fontweight="bold")
203
+ ax.legend(loc="lower right")
204
+ plt.tight_layout()
205
+ plt.savefig("roc_curve.png", dpi=150)
206
+ plt.show()
207
+ print("\nROC curve plot saved as 'roc_curve.png'")
208
+
209
+ # ==========================================
210
+ # 7. PRECISION-RECALL CURVE (One-vs-Rest for multiclass)
211
+ # ==========================================
212
+ # The Precision-Recall curve plots:
213
+ # X-axis: Recall = TP / (TP + FN)
214
+ # Y-axis: Precision = TP / (TP + FP)
215
+ #
216
+ # This curve is especially useful for imbalanced datasets
217
+ # where the positive class is rare.
218
+ #
219
+ # A good classifier has a curve that stays close to the top-right corner
220
+ # (high precision and high recall simultaneously).
221
+
222
+ fig, ax = plt.subplots(figsize=(8, 6))
223
+
224
+ for i in range(n_classes):
225
+ precision_vals, recall_vals, _ = precision_recall_curve(
226
+ y_test_bin[:, i], y_proba[:, i]
227
+ )
228
+ ax.plot(recall_vals, precision_vals, color=colors[i], lw=2,
229
+ label=f"{iris.target_names[i]}")
230
+
231
+ ax.set_xlim([0.0, 1.0])
232
+ ax.set_ylim([0.0, 1.05])
233
+ ax.set_xlabel("Recall", fontsize=12)
234
+ ax.set_ylabel("Precision", fontsize=12)
235
+ ax.set_title("Precision-Recall Curve (One-vs-Rest)", fontsize=14, fontweight="bold")
236
+ ax.legend(loc="lower left")
237
+ plt.tight_layout()
238
+ plt.savefig("precision_recall_curve.png", dpi=150)
239
+ plt.show()
240
+ print("\nPrecision-Recall curve plot saved as 'precision_recall_curve.png'")