Moral88 0.10.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- Moral88/classification.py +123 -0
- Moral88/clustering.py +129 -0
- Moral88/regression.py +259 -302
- Moral88/utils.py +67 -97
- {Moral88-0.10.0.dist-info → Moral88-0.13.0.dist-info}/METADATA +1 -1
- Moral88-0.13.0.dist-info/RECORD +14 -0
- {Moral88-0.10.0.dist-info → Moral88-0.13.0.dist-info}/top_level.txt +1 -1
- Test/test_classification.py +88 -0
- Test/test_clustering.py +63 -0
- Test/test_regression.py +141 -0
- Moral88-0.10.0.dist-info/RECORD +0 -10
- tests/test_regression.py +0 -99
- {Moral88-0.10.0.dist-info → Moral88-0.13.0.dist-info}/LICENSE +0 -0
- {Moral88-0.10.0.dist-info → Moral88-0.13.0.dist-info}/WHEEL +0 -0
- {tests → Test}/__init__.py +0 -0
Moral88/classification.py
ADDED
@@ -0,0 +1,123 @@
+import numpy as np
+import pandas as pd
+from Moral88.utils import DataValidator
+
+validator = DataValidator()
+def accuracy(y_true, y_pred):
+    """Compute Accuracy"""
+    validator.validate_all(y_true, y_pred)
+    return np.sum(y_true == y_pred) / len(y_true)
+
+def auc_roc(y_true, y_scores, average='macro'):
+    """Compute AUC-ROC Score"""
+    validator.validate_all(y_true, np.round(y_scores))
+    unique_classes = np.unique(y_true)
+    aucs = []
+
+    for cls in unique_classes:
+        y_binary = (y_true == cls).astype(int)
+        sorted_indices = np.argsort(y_scores[:, cls])[::-1]
+        y_sorted = y_binary[sorted_indices]
+        cum_true = np.cumsum(y_sorted)
+        cum_false = np.cumsum(1 - y_sorted)
+        auc = np.sum(cum_true * (1 - y_sorted)) / (cum_true[-1] * cum_false[-1]) if (cum_true[-1] > 0 and cum_false[-1] > 0) else 0
+        aucs.append(auc)
+
+    if average == 'macro':
+        return np.mean(aucs)
+    elif average == 'weighted':
+        class_counts = np.bincount(y_true)
+        return np.sum([aucs[i] * class_counts[i] for i in range(len(unique_classes))]) / len(y_true)
+    return aucs
+
+def precision(y_true, y_pred, average='binary'):
+    """Compute Precision Score"""
+    validator.validate_all(y_true, y_pred)
+    unique_classes = np.unique(y_true)
+    precisions = []
+
+    for cls in unique_classes:
+        tp = np.sum((y_true == cls) & (y_pred == cls))
+        fp = np.sum((y_true != cls) & (y_pred == cls))
+        precisions.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
+
+    if average == 'macro':
+        return np.mean(precisions)
+    elif average == 'micro':
+        tp_total = np.sum(y_true == y_pred)
+        fp_total = np.sum(y_true != y_pred)
+        return tp_total / (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0
+    elif average == 'weighted':
+        class_counts = np.bincount(y_true)
+        return np.sum([precisions[i] * class_counts[i] for i in range(len(unique_classes))]) / len(y_true)
+    return precisions
+
+def recall(y_true, y_pred, average='binary'):
+    """Compute Recall Score"""
+    validator.validate_all(y_true, y_pred)
+    unique_classes = np.unique(y_true)
+    recalls = []
+
+    for cls in unique_classes:
+        tp = np.sum((y_true == cls) & (y_pred == cls))
+        fn = np.sum((y_true == cls) & (y_pred != cls))
+        recalls.append(tp / (tp + fn) if (tp + fn) > 0 else 0)
+
+    if average == 'macro':
+        return np.mean(recalls)
+    elif average == 'micro':
+        tp_total = np.sum(y_true == y_pred)
+        fn_total = np.sum(y_true != y_pred)
+        return tp_total / (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0
+    elif average == 'weighted':
+        class_counts = np.bincount(y_true)
+        return np.sum([recalls[i] * class_counts[i] for i in range(len(unique_classes))]) / len(y_true)
+    return recalls
+def balanced_accuracy(y_true, y_pred):
+    """Compute Balanced Accuracy"""
+    validator.validate_all(y_true, y_pred)
+    unique_classes = np.unique(y_true)
+    recalls = []
+
+    for cls in unique_classes:
+        tp = np.sum((y_true == cls) & (y_pred == cls))
+        fn = np.sum((y_true == cls) & (y_pred != cls))
+        recalls.append(tp / (tp + fn) if (tp + fn) > 0 else 0)
+
+    return np.mean(recalls)
+
+def matthews_correlation_coefficient(y_true, y_pred):
+    """Compute Matthews Correlation Coefficient (MCC)"""
+    validator.validate_all(y_true, y_pred)
+    unique_classes = np.unique(y_true)
+    confusion_mat = np.zeros((len(unique_classes), len(unique_classes)), dtype=int)
+
+    for i, cls_true in enumerate(unique_classes):
+        for j, cls_pred in enumerate(unique_classes):
+            confusion_mat[i, j] = np.sum((y_true == cls_true) & (y_pred == cls_pred))
+
+    tp = np.diag(confusion_mat)
+    fp = np.sum(confusion_mat, axis=0) - tp
+    fn = np.sum(confusion_mat, axis=1) - tp
+    tn = np.sum(confusion_mat) - (tp + fp + fn)
+
+    numerator = (tp * tn) - (fp * fn)
+    denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
+    mcc = np.mean(numerator / denominator) if np.all(denominator > 0) else 0
+    return mcc
+
+def cohens_kappa(y_true, y_pred):
+    """Compute Cohen’s Kappa Score"""
+    validator.validate_all(y_true, y_pred)
+    unique_classes = np.unique(y_true)
+    confusion_mat = np.zeros((len(unique_classes), len(unique_classes)), dtype=int)
+
+    for i, cls_true in enumerate(unique_classes):
+        for j, cls_pred in enumerate(unique_classes):
+            confusion_mat[i, j] = np.sum((y_true == cls_true) & (y_pred == cls_pred))
+
+    total = np.sum(confusion_mat)
+    po = np.sum(np.diag(confusion_mat)) / total
+    pe = np.sum(np.sum(confusion_mat, axis=0) * np.sum(confusion_mat, axis=1)) / (total ** 2)
+
+    return (po - pe) / (1 - pe) if (1 - pe) > 0 else 0
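
For orientation, a minimal usage sketch of the new classification metrics (not part of the release). It assumes integer labels encoded from 0, since the weighted branches index np.bincount output by label value, and auc_roc indexes y_scores columns by class; it also assumes DataValidator.validate_all in Moral88/utils.py accepts plain NumPy arrays.

import numpy as np
from Moral88.classification import accuracy, precision, recall, balanced_accuracy

# Hypothetical 3-class example; labels are integers starting at 0.
y_true = np.array([0, 1, 2, 1, 0, 2])
y_pred = np.array([0, 1, 1, 1, 0, 2])

print(accuracy(y_true, y_pred))                    # fraction of exact matches
print(precision(y_true, y_pred, average='macro'))  # unweighted mean over classes
print(recall(y_true, y_pred, average='weighted'))  # weighted by class frequency
print(balanced_accuracy(y_true, y_pred))           # mean of per-class recalls

Note that the default average='binary' matches none of the branches handled in precision and recall, so both fall through to returning the per-class list rather than a scalar.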
Moral88/clustering.py
ADDED
@@ -0,0 +1,129 @@
+import numpy as np
+from Moral88.utils import DataValidator
+
+validator = DataValidator()
+
+
+def adjusted_rand_index(labels_true, labels_pred):
+    """Compute Adjusted Rand Index (ARI)"""
+    validator.validate_all(labels_true, labels_pred)
+    n = len(labels_true)
+    contingency_matrix = np.zeros((len(set(labels_true)), len(set(labels_pred))))
+
+    for i in range(n):
+        contingency_matrix[labels_true[i], labels_pred[i]] += 1
+
+    sum_comb_c = np.sum([np.sum(row) * (np.sum(row) - 1) for row in contingency_matrix]) / 2
+    sum_comb_k = np.sum([np.sum(col) * (np.sum(col) - 1) for col in contingency_matrix.T]) / 2
+    sum_comb = np.sum(contingency_matrix * (contingency_matrix - 1)) / 2
+
+    expected_index = (sum_comb_c * sum_comb_k) / (n * (n - 1) / 2)
+    max_index = (sum_comb_c + sum_comb_k) / 2
+
+    return (sum_comb - expected_index) / (max_index - expected_index) if max_index != expected_index else 1
+
+def normalized_mutual_info(labels_true, labels_pred, epsilon=1e-10):
+    """Compute Normalized Mutual Information (NMI), avoiding division by zero"""
+    validator.validate_all(labels_true, labels_pred)
+    unique_true, counts_true = np.unique(labels_true, return_counts=True)
+    unique_pred, counts_pred = np.unique(labels_pred, return_counts=True)
+
+    contingency_matrix = np.zeros((len(unique_true), len(unique_pred)))
+    for i in range(len(labels_true)):
+        contingency_matrix[labels_true[i], labels_pred[i]] += 1
+
+    h_true = -np.sum((counts_true / len(labels_true)) * np.log2(np.where(counts_true > 0, counts_true / len(labels_true), epsilon)))
+    h_pred = -np.sum((counts_pred / len(labels_pred)) * np.log2(np.where(counts_pred > 0, counts_pred / len(labels_pred), epsilon)))
+
+    joint_prob = contingency_matrix / len(labels_true)
+    mutual_info = np.sum(joint_prob * np.log2(np.where(joint_prob > 0, joint_prob / ((counts_true[:, None] / len(labels_true)) * (counts_pred[None, :] / len(labels_pred))), epsilon)))
+
+    return mutual_info / np.sqrt(h_true * h_pred) if h_true * h_pred > 0 else 0
+
+def silhouette_score(X, labels):
+    """Compute Silhouette Score"""
+    validator.validate_all(labels, labels)
+    unique_labels = np.unique(labels)
+    a = np.zeros(len(X))
+    b = np.zeros(len(X))
+
+    for i, label in enumerate(labels):
+        same_cluster = X[labels == label]
+        other_clusters = [X[labels == other_label] for other_label in unique_labels if other_label != label]
+
+        a[i] = np.mean(np.linalg.norm(same_cluster - X[i], axis=1)) if len(same_cluster) > 1 else 0
+        b[i] = np.min([np.mean(np.linalg.norm(other_cluster - X[i], axis=1)) for other_cluster in other_clusters]) if other_clusters else 0
+
+    silhouette_values = (b - a) / np.maximum(a, b)
+    return np.mean(silhouette_values)
+
+def calinski_harabasz_index(X, labels):
+    """Compute Calinski-Harabasz Index"""
+    validator.validate_all(labels, labels)
+    n_clusters = len(np.unique(labels))
+    n_samples = len(X)
+    cluster_means = np.array([np.mean(X[labels == label], axis=0) for label in np.unique(labels)])
+    overall_mean = np.mean(X, axis=0)
+
+    between_group_dispersion = np.sum([len(X[labels == label]) * np.linalg.norm(cluster_mean - overall_mean)**2 for label, cluster_mean in zip(np.unique(labels), cluster_means)])
+    within_group_dispersion = np.sum([np.sum((X[labels == label] - cluster_mean) ** 2) for label, cluster_mean in zip(np.unique(labels), cluster_means)])
+
+    return (between_group_dispersion / within_group_dispersion) * ((n_samples - n_clusters) / (n_clusters - 1))
+
+def dunn_index(X, labels):
+    """Compute Dunn Index"""
+    validator.validate_all(labels, labels)
+    unique_labels = np.unique(labels)
+    cluster_means = [np.mean(X[labels == label], axis=0) for label in unique_labels]
+    intra_distances = [np.max(np.linalg.norm(X[labels == label] - cluster_mean, axis=1)) for label, cluster_mean in zip(unique_labels, cluster_means)]
+    inter_distances = [np.linalg.norm(cluster_means[i] - cluster_means[j]) for i in range(len(unique_labels)) for j in range(i + 1, len(unique_labels))]
+    return np.min(inter_distances) / np.max(intra_distances)
+
+def inertia(X, labels):
+    """Compute Inertia (Sum of Squared Distances to Centroids)"""
+    validator.validate_all(labels, labels)
+    unique_labels = np.unique(labels)
+    cluster_means = [np.mean(X[labels == label], axis=0) for label in unique_labels]
+    return np.sum([np.sum((X[labels == label] - cluster_means[i]) ** 2) for i, label in enumerate(unique_labels)])
+
+def homogeneity_score(labels_true, labels_pred, epsilon=1e-10):
+    """Compute Homogeneity Score, avoiding division by zero"""
+    validator.validate_all(labels_true, labels_pred)
+    unique_true, counts_true = np.unique(labels_true, return_counts=True)
+    unique_pred, counts_pred = np.unique(labels_pred, return_counts=True)
+    contingency_matrix = np.zeros((len(unique_true), len(unique_pred)))
+
+    for i in range(len(labels_true)):
+        contingency_matrix[labels_true[i], labels_pred[i]] += 1
+
+    entropy_true = -np.sum((counts_true / len(labels_true)) * np.log2(np.where(counts_true > 0, counts_true / len(labels_true), epsilon)))
+    mutual_info = np.sum(contingency_matrix * np.log2(np.where(contingency_matrix > 0, contingency_matrix / ((counts_true[:, None] / len(labels_true)) * (counts_pred[None, :] / len(labels_pred))), epsilon)))
+
+    return mutual_info / entropy_true if entropy_true > 0 else 0
+
+
+def completeness_score(labels_true, labels_pred, epsilon=1e-10):
+    """Compute Completeness Score, avoiding division by zero"""
+    validator.validate_all(labels_true, labels_pred)
+    unique_true, counts_true = np.unique(labels_true, return_counts=True)
+    unique_pred, counts_pred = np.unique(labels_pred, return_counts=True)
+    contingency_matrix = np.zeros((len(unique_true), len(unique_pred)))
+
+    for i in range(len(labels_true)):
+        contingency_matrix[labels_true[i], labels_pred[i]] += 1
+
+    entropy_pred = -np.sum((counts_pred / len(labels_pred)) * np.log2(np.where(counts_pred > 0, counts_pred / len(labels_pred), epsilon)))
+    mutual_info = np.sum(contingency_matrix * np.log2(np.where(contingency_matrix > 0, contingency_matrix / ((counts_true[:, None] / len(labels_true)) * (counts_pred[None, :] / len(labels_pred))), epsilon)))
+
+    return mutual_info / entropy_pred if entropy_pred > 0 else 0
+
+def davies_bouldin_index(X, labels):
+    """Compute Davies-Bouldin Index"""
+    validator.validate_all(labels, labels)
+    n_clusters = len(np.unique(labels))
+    cluster_means = np.array([np.mean(X[labels == label], axis=0) for label in np.unique(labels)])
+
+    dispersions = np.array([np.mean(np.linalg.norm(X[labels == label] - cluster_means[i], axis=1)) for i, label in enumerate(np.unique(labels))])
+    db_index = np.mean([max([(dispersions[i] + dispersions[j]) / np.linalg.norm(cluster_means[i] - cluster_means[j]) for j in range(n_clusters) if i != j]) for i in range(n_clusters)])
+
+    return db_index
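
Likewise, a minimal sketch of the clustering metrics (not part of the release). It assumes X is an (n_samples, n_features) array and that cluster labels are integers starting at 0, since adjusted_rand_index, normalized_mutual_info, homogeneity_score, and completeness_score index the contingency matrix directly with the label values.

import numpy as np
from Moral88.clustering import silhouette_score, adjusted_rand_index, inertia

# Two well-separated hypothetical clusters.
X = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.0], [5.2, 4.9]])
labels_pred = np.array([0, 0, 1, 1])  # predicted assignment
labels_true = np.array([0, 0, 1, 1])  # reference partition

print(silhouette_score(X, labels_pred))               # in [-1, 1]; higher is better
print(adjusted_rand_index(labels_true, labels_pred))  # 1.0 for identical partitions
print(inertia(X, labels_pred))                        # sum of squared distances to centroids

One caveat visible in the code: silhouette_score averages intra-cluster distances over the full cluster, including the zero distance from each point to itself, so its values will differ slightly from definitions that exclude the point (e.g. scikit-learn's).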