Moral88 0.11.0__tar.gz → 0.13.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- Moral88-0.13.0/Moral88/classification.py +123 -0
- Moral88-0.13.0/Moral88/clustering.py +129 -0
- Moral88-0.13.0/Moral88/regression.py +271 -0
- Moral88-0.13.0/Moral88/utils.py +71 -0
- {Moral88-0.11.0 → Moral88-0.13.0/Moral88.egg-info}/PKG-INFO +1 -1
- {Moral88-0.11.0 → Moral88-0.13.0}/Moral88.egg-info/SOURCES.txt +9 -3
- {Moral88-0.11.0 → Moral88-0.13.0}/Moral88.egg-info/top_level.txt +1 -1
- {Moral88-0.11.0/Moral88.egg-info → Moral88-0.13.0}/PKG-INFO +1 -1
- Moral88-0.13.0/README.md +144 -0
- Moral88-0.13.0/Test/test_classification.py +88 -0
- Moral88-0.13.0/Test/test_clustering.py +63 -0
- Moral88-0.13.0/Test/test_regression.py +141 -0
- {Moral88-0.11.0 → Moral88-0.13.0}/setup.py +1 -1
- Moral88-0.11.0/Moral88/regression.py +0 -314
- Moral88-0.11.0/Moral88/segmentation.py +0 -166
- Moral88-0.11.0/Moral88/utils.py +0 -116
- Moral88-0.11.0/README.md +0 -18
- Moral88-0.11.0/tests/test_regression.py +0 -100
- {Moral88-0.11.0 → Moral88-0.13.0}/LICENSE +0 -0
- {Moral88-0.11.0 → Moral88-0.13.0}/Moral88/__init__.py +0 -0
- {Moral88-0.11.0 → Moral88-0.13.0}/Moral88.egg-info/dependency_links.txt +0 -0
- {Moral88-0.11.0 → Moral88-0.13.0}/Moral88.egg-info/requires.txt +0 -0
- {Moral88-0.11.0/tests → Moral88-0.13.0/Test}/__init__.py +0 -0
- {Moral88-0.11.0 → Moral88-0.13.0}/setup.cfg +0 -0
@@ -0,0 +1,123 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
from Moral88.utils import DataValidator
|
4
|
+
|
5
|
+
# Module-level validator instance used by the classification metrics below.
validator = DataValidator()
|
6
|
+
def accuracy(y_true, y_pred):
    """Compute the fraction of predictions that exactly match the labels.

    Args:
        y_true: Ground-truth labels (any input type the validator accepts).
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        Number of matching positions divided by ``len(y_true)``.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(y_true, y_pred)
    # Convert to arrays so ``==`` is element-wise; on plain lists it collapses
    # to a single bool and yields a wrong score.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sum(y_true == y_pred) / len(y_true)
|
10
|
+
|
11
|
+
def auc_roc(y_true, y_scores, average='macro'):
    """Compute one-vs-rest AUC-ROC for every class present in ``y_true``.

    Args:
        y_true: Integer class labels. Each label value is used as the column
            index into ``y_scores``.
        y_scores: 2-D array-like of scores, one row per sample and one column
            per class.
        average: ``'macro'`` returns the unweighted mean of the per-class
            AUCs, ``'weighted'`` returns the mean weighted by class support.
            Any other value returns the list of per-class AUCs.

    Returns:
        A float for ``'macro'``/``'weighted'``, otherwise a list of AUCs
        ordered like ``np.unique(y_true)``.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(y_true, np.round(y_scores))
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)
    # Counts come from np.unique so they line up with ``unique_classes`` even
    # when the labels are not exactly 0..k-1 (np.bincount would not).
    unique_classes, class_counts = np.unique(y_true, return_counts=True)
    n_samples = len(y_true)
    aucs = []

    for cls in unique_classes:
        positives = y_true == cls
        n_pos = int(np.sum(positives))
        n_neg = n_samples - n_pos
        if n_pos == 0 or n_neg == 0:
            # AUC is undefined without both classes; keep the 0 fallback.
            aucs.append(0)
            continue

        scores = y_scores[:, int(cls)]
        # Mann-Whitney U with average ranks: identical to counting
        # (positive, negative) pairs ranked correctly, but tied scores
        # contribute 0.5 instead of depending on sort order.
        _, inverse, counts = np.unique(scores, return_inverse=True, return_counts=True)
        average_rank = np.cumsum(counts) - (counts - 1) / 2.0
        rank_sum = np.sum(average_rank[inverse][positives])
        aucs.append((rank_sum - n_pos * (n_pos + 1) / 2.0) / (n_pos * n_neg))

    if average == 'macro':
        return np.mean(aucs)
    elif average == 'weighted':
        return np.sum(np.asarray(aucs) * class_counts) / n_samples
    return aucs
|
32
|
+
|
33
|
+
def precision(y_true, y_pred, average='binary'):
    """Compute per-class precision and optionally average it.

    Only classes that occur in ``y_true`` are scored.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.
        average: ``'macro'`` (unweighted mean), ``'micro'`` (global
            TP / (TP + FP)), or ``'weighted'`` (mean weighted by class
            support). Any other value, including the default ``'binary'``,
            returns the list of per-class precisions.

    Returns:
        A float for the averaged modes, otherwise a list ordered like
        ``np.unique(y_true)``. A class that is never predicted scores 0.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(y_true, y_pred)
    # Arrays are required for element-wise comparison (lists compare as a whole).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Supports aligned with ``unique_classes``; np.bincount indexed by position
    # is wrong whenever labels are not exactly 0..k-1.
    unique_classes, class_counts = np.unique(y_true, return_counts=True)
    precisions = []

    for cls in unique_classes:
        predicted_as_cls = y_pred == cls
        tp = np.sum(predicted_as_cls & (y_true == cls))
        fp = np.sum(predicted_as_cls & (y_true != cls))
        precisions.append(tp / (tp + fp) if (tp + fp) > 0 else 0)

    if average == 'macro':
        return np.mean(precisions)
    elif average == 'micro':
        tp_total = np.sum(y_true == y_pred)
        fp_total = np.sum(y_true != y_pred)
        return tp_total / (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0
    elif average == 'weighted':
        return np.sum(np.asarray(precisions) * class_counts) / len(y_true)
    return precisions
|
54
|
+
|
55
|
+
def recall(y_true, y_pred, average='binary'):
    """Compute per-class recall and optionally average it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.
        average: ``'macro'`` (unweighted mean), ``'micro'`` (global
            TP / (TP + FN)), or ``'weighted'`` (mean weighted by class
            support). Any other value, including the default ``'binary'``,
            returns the list of per-class recalls.

    Returns:
        A float for the averaged modes, otherwise a list ordered like
        ``np.unique(y_true)``.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(y_true, y_pred)
    # Arrays are required for element-wise comparison (lists compare as a whole).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Supports aligned with ``unique_classes``; np.bincount indexed by position
    # is wrong whenever labels are not exactly 0..k-1.
    unique_classes, class_counts = np.unique(y_true, return_counts=True)
    recalls = []

    for cls in unique_classes:
        is_cls = y_true == cls
        tp = np.sum(is_cls & (y_pred == cls))
        fn = np.sum(is_cls & (y_pred != cls))
        recalls.append(tp / (tp + fn) if (tp + fn) > 0 else 0)

    if average == 'macro':
        return np.mean(recalls)
    elif average == 'micro':
        tp_total = np.sum(y_true == y_pred)
        fn_total = np.sum(y_true != y_pred)
        return tp_total / (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0
    elif average == 'weighted':
        return np.sum(np.asarray(recalls) * class_counts) / len(y_true)
    return recalls
|
76
|
+
def balanced_accuracy(y_true, y_pred):
    """Compute balanced accuracy: the unweighted mean of per-class recall.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        Mean recall over the classes present in ``y_true``.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(y_true, y_pred)
    # Arrays are required for element-wise comparison (lists compare as a whole).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    recalls = []

    for cls in np.unique(y_true):
        is_cls = y_true == cls
        tp = np.sum(is_cls & (y_pred == cls))
        fn = np.sum(is_cls & (y_pred != cls))
        recalls.append(tp / (tp + fn) if (tp + fn) > 0 else 0)

    return np.mean(recalls)
|
88
|
+
|
89
|
+
def matthews_correlation_coefficient(y_true, y_pred):
    """Compute the Matthews Correlation Coefficient (MCC).

    A confusion matrix is built over the classes present in ``y_true``; a
    one-vs-rest MCC is computed for each class and the values are averaged.
    NOTE(review): for more than two classes this mean of one-vs-rest values
    is not the same quantity as the single multiclass MCC computed from the
    full confusion matrix - confirm which is intended.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        The mean per-class MCC, or 0 if any class has a zero denominator.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(y_true, y_pred)
    # Arrays are required for element-wise comparison (lists compare as a whole).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    unique_classes = np.unique(y_true)
    n_classes = len(unique_classes)
    confusion_mat = np.zeros((n_classes, n_classes), dtype=int)

    for i, cls_true in enumerate(unique_classes):
        true_mask = y_true == cls_true
        for j, cls_pred in enumerate(unique_classes):
            confusion_mat[i, j] = np.sum(true_mask & (y_pred == cls_pred))

    # Work in float64: the denominator multiplies four counts together and
    # silently overflows int64 once there are more than ~50k samples.
    tp = np.diag(confusion_mat).astype(float)
    fp = np.sum(confusion_mat, axis=0) - tp
    fn = np.sum(confusion_mat, axis=1) - tp
    tn = np.sum(confusion_mat) - (tp + fp + fn)

    numerator = (tp * tn) - (fp * fn)
    denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    mcc = np.mean(numerator / denominator) if np.all(denominator > 0) else 0
    return mcc
|
108
|
+
|
109
|
+
def cohens_kappa(y_true, y_pred):
    """Compute Cohen's Kappa, the chance-corrected agreement of two labelings.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        ``(po - pe) / (1 - pe)`` where ``po`` is the observed agreement and
        ``pe`` the agreement expected by chance; 0 when ``pe`` is 1 or the
        input is empty.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(y_true, y_pred)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Use the labels of BOTH vectors. Building the matrix from y_true's
    # classes alone drops every sample predicted as an unseen class, which
    # shrinks the total and biases po and pe.
    classes = np.union1d(y_true, y_pred)
    true_idx = np.searchsorted(classes, y_true)
    pred_idx = np.searchsorted(classes, y_pred)
    confusion_mat = np.zeros((len(classes), len(classes)), dtype=int)
    # np.add.at accumulates correctly when the same cell is hit repeatedly.
    np.add.at(confusion_mat, (true_idx, pred_idx), 1)

    total = np.sum(confusion_mat)
    if total == 0:
        return 0
    po = np.sum(np.diag(confusion_mat)) / total
    pe = np.sum(np.sum(confusion_mat, axis=0) * np.sum(confusion_mat, axis=1)) / (total ** 2)

    return (po - pe) / (1 - pe) if (1 - pe) > 0 else 0
|
@@ -0,0 +1,129 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from Moral88.utils import DataValidator
|
3
|
+
|
4
|
+
# Module-level validator instance used by the clustering metrics below.
validator = DataValidator()
|
5
|
+
|
6
|
+
|
7
|
+
def adjusted_rand_index(labels_true, labels_pred):
    """Compute the Adjusted Rand Index (ARI) between two labelings.

    Args:
        labels_true: Reference cluster labels.
        labels_pred: Predicted cluster labels, same length.

    Returns:
        The ARI; 1 when the maximum index equals the expected index.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(labels_true, labels_pred)
    # Map labels to 0..k-1 first. Using raw label values as matrix indices
    # breaks for non-contiguous labels and silently wraps for negatives
    # such as -1.
    classes_true, true_idx = np.unique(labels_true, return_inverse=True)
    classes_pred, pred_idx = np.unique(labels_pred, return_inverse=True)
    n = len(true_idx)
    contingency_matrix = np.zeros((len(classes_true), len(classes_pred)))
    np.add.at(contingency_matrix, (true_idx, pred_idx), 1)

    row_totals = contingency_matrix.sum(axis=1)
    col_totals = contingency_matrix.sum(axis=0)
    # Number of same-cluster pairs in each labeling, and pairs agreeing in both.
    sum_comb_c = np.sum(row_totals * (row_totals - 1)) / 2
    sum_comb_k = np.sum(col_totals * (col_totals - 1)) / 2
    sum_comb = np.sum(contingency_matrix * (contingency_matrix - 1)) / 2

    expected_index = (sum_comb_c * sum_comb_k) / (n * (n - 1) / 2)
    max_index = (sum_comb_c + sum_comb_k) / 2

    return (sum_comb - expected_index) / (max_index - expected_index) if max_index != expected_index else 1
|
24
|
+
|
25
|
+
def normalized_mutual_info(labels_true, labels_pred, epsilon=1e-10):
    """Compute Normalized Mutual Information (NMI) between two labelings.

    The mutual information (base 2) is divided by the geometric mean of the
    two label entropies.

    Args:
        labels_true: Reference cluster labels.
        labels_pred: Predicted cluster labels, same length.
        epsilon: Placeholder passed to ``log2`` for empty contingency cells
            so no ``log2(0)`` is evaluated; those cells contribute 0.

    Returns:
        The NMI, or 0 when either entropy is zero.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(labels_true, labels_pred)
    # Map labels to 0..k-1; raw label values are not safe matrix indices
    # (non-contiguous labels overflow, negative labels wrap around).
    _, true_idx, counts_true = np.unique(labels_true, return_inverse=True, return_counts=True)
    _, pred_idx, counts_pred = np.unique(labels_pred, return_inverse=True, return_counts=True)
    n_samples = len(true_idx)

    contingency_matrix = np.zeros((len(counts_true), len(counts_pred)))
    np.add.at(contingency_matrix, (true_idx, pred_idx), 1)

    p_true = counts_true / n_samples
    p_pred = counts_pred / n_samples
    # Counts from np.unique are always >= 1, so the logs are finite.
    h_true = -np.sum(p_true * np.log2(p_true))
    h_pred = -np.sum(p_pred * np.log2(p_pred))

    joint_prob = contingency_matrix / n_samples
    ratio = np.where(joint_prob > 0, joint_prob / (p_true[:, None] * p_pred[None, :]), epsilon)
    mutual_info = np.sum(joint_prob * np.log2(ratio))

    return mutual_info / np.sqrt(h_true * h_pred) if h_true * h_pred > 0 else 0
|
42
|
+
|
43
|
+
def silhouette_score(X, labels):
    """Compute the mean silhouette coefficient using Euclidean distance.

    For each sample, ``a`` is the mean distance to the other members of its
    own cluster and ``b`` is the smallest mean distance to the members of any
    other cluster; the coefficient is ``(b - a) / max(a, b)``.

    Args:
        X: Sample matrix with one row per sample. A 1-D input is treated as
            a single feature.
        labels: Cluster label of each row of ``X``.

    Returns:
        The mean silhouette coefficient over all samples.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(labels, labels)
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    # return_inverse gives each sample's cluster position and also makes
    # list/Series labels usable for masking.
    unique_labels, label_idx = np.unique(np.asarray(labels), return_inverse=True)
    members = [X[label_idx == k] for k in range(len(unique_labels))]
    silhouette_values = np.zeros(len(X))

    for i, own_idx in enumerate(label_idx):
        own_cluster = members[own_idx]
        # Singleton clusters (and the single-cluster case) score 0 by
        # convention; treating a as 0 would otherwise award a perfect 1.
        if len(own_cluster) <= 1 or len(members) < 2:
            continue

        # Divide by n - 1: the point's zero distance to itself must not be
        # counted in the intra-cluster mean.
        a_i = np.sum(np.linalg.norm(own_cluster - X[i], axis=1)) / (len(own_cluster) - 1)
        b_i = min(
            np.mean(np.linalg.norm(other - X[i], axis=1))
            for k, other in enumerate(members)
            if k != own_idx
        )
        scale = max(a_i, b_i)
        # Leave the coefficient at 0 when every distance is zero (avoids 0/0).
        if scale > 0:
            silhouette_values[i] = (b_i - a_i) / scale

    return np.mean(silhouette_values)
|
59
|
+
|
60
|
+
def calinski_harabasz_index(X, labels):
    """Compute the Calinski-Harabasz index (variance ratio criterion).

    Args:
        X: Sample matrix with one row per sample.
        labels: Cluster label of each row of ``X``.

    Returns:
        ``(between / within) * ((n_samples - n_clusters) / (n_clusters - 1))``
        where ``between`` and ``within`` are the between- and within-cluster
        sums of squared distances.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(labels, labels)
    # Arrays are needed for boolean-mask indexing; ``list == label`` is a
    # single bool and would select nothing.
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)
    n_samples = len(X)
    overall_mean = np.mean(X, axis=0)

    between_group_dispersion = 0.0
    within_group_dispersion = 0.0
    for label in unique_labels:
        cluster_points = X[labels == label]
        cluster_mean = np.mean(cluster_points, axis=0)
        between_group_dispersion += len(cluster_points) * np.linalg.norm(cluster_mean - overall_mean) ** 2
        within_group_dispersion += np.sum((cluster_points - cluster_mean) ** 2)

    return (between_group_dispersion / within_group_dispersion) * ((n_samples - n_clusters) / (n_clusters - 1))
|
72
|
+
|
73
|
+
def dunn_index(X, labels):
    """Compute a centroid-based Dunn index.

    The numerator is the smallest distance between two cluster centroids and
    the denominator is the largest distance from any point to its own
    cluster centroid.

    Args:
        X: Sample matrix with one row per sample. A 1-D input is treated as
            a single feature.
        labels: Cluster label of each row of ``X``.

    Returns:
        Minimum inter-centroid distance divided by the maximum
        point-to-centroid distance.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(labels, labels)
    # Arrays are needed for boolean-mask indexing and for ``axis=1`` norms.
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)

    clusters = [X[labels == label] for label in unique_labels]
    cluster_means = [np.mean(points, axis=0) for points in clusters]
    intra_distances = [
        np.max(np.linalg.norm(points - center, axis=1))
        for points, center in zip(clusters, cluster_means)
    ]
    inter_distances = [
        np.linalg.norm(cluster_means[i] - cluster_means[j])
        for i in range(len(unique_labels))
        for j in range(i + 1, len(unique_labels))
    ]
    return np.min(inter_distances) / np.max(intra_distances)
|
81
|
+
|
82
|
+
def inertia(X, labels):
    """Compute inertia: the sum of squared distances to cluster centroids.

    Args:
        X: Sample matrix with one row per sample.
        labels: Cluster label of each row of ``X``.

    Returns:
        Sum over clusters of the squared deviations of the cluster's points
        from the cluster mean.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(labels, labels)
    # Arrays are needed for boolean-mask indexing; ``list == label`` is a
    # single bool and would select nothing.
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)

    total = 0.0
    for label in np.unique(labels):
        cluster_points = X[labels == label]
        total += np.sum((cluster_points - np.mean(cluster_points, axis=0)) ** 2)
    return total
|
88
|
+
|
89
|
+
def homogeneity_score(labels_true, labels_pred, epsilon=1e-10):
    """Compute the homogeneity score of a clustering.

    Homogeneity is the mutual information between the two labelings divided
    by the entropy of ``labels_true`` (both in base 2).

    Args:
        labels_true: Reference class labels.
        labels_pred: Predicted cluster labels, same length.
        epsilon: Placeholder passed to ``log2`` for empty contingency cells
            so no ``log2(0)`` is evaluated; those cells contribute 0.

    Returns:
        The homogeneity score, or 0 when the entropy of ``labels_true`` is 0.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(labels_true, labels_pred)
    # Map labels to 0..k-1; raw label values are not safe matrix indices.
    _, true_idx, counts_true = np.unique(labels_true, return_inverse=True, return_counts=True)
    _, pred_idx, counts_pred = np.unique(labels_pred, return_inverse=True, return_counts=True)
    n_samples = len(true_idx)
    contingency_matrix = np.zeros((len(counts_true), len(counts_pred)))
    np.add.at(contingency_matrix, (true_idx, pred_idx), 1)

    p_true = counts_true / n_samples
    p_pred = counts_pred / n_samples
    entropy_true = -np.sum(p_true * np.log2(p_true))

    # Mutual information is defined on joint PROBABILITIES. Plugging in raw
    # counts scales the result by n and adds a spurious log2(n) term.
    joint_prob = contingency_matrix / n_samples
    ratio = np.where(joint_prob > 0, joint_prob / (p_true[:, None] * p_pred[None, :]), epsilon)
    mutual_info = np.sum(joint_prob * np.log2(ratio))

    return mutual_info / entropy_true if entropy_true > 0 else 0
|
103
|
+
|
104
|
+
|
105
|
+
def completeness_score(labels_true, labels_pred, epsilon=1e-10):
    """Compute the completeness score of a clustering.

    Completeness is the mutual information between the two labelings divided
    by the entropy of ``labels_pred`` (both in base 2).

    Args:
        labels_true: Reference class labels.
        labels_pred: Predicted cluster labels, same length.
        epsilon: Placeholder passed to ``log2`` for empty contingency cells
            so no ``log2(0)`` is evaluated; those cells contribute 0.

    Returns:
        The completeness score, or 0 when the entropy of ``labels_pred`` is 0.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(labels_true, labels_pred)
    # Map labels to 0..k-1; raw label values are not safe matrix indices.
    _, true_idx, counts_true = np.unique(labels_true, return_inverse=True, return_counts=True)
    _, pred_idx, counts_pred = np.unique(labels_pred, return_inverse=True, return_counts=True)
    n_samples = len(true_idx)
    contingency_matrix = np.zeros((len(counts_true), len(counts_pred)))
    np.add.at(contingency_matrix, (true_idx, pred_idx), 1)

    p_true = counts_true / n_samples
    p_pred = counts_pred / n_samples
    entropy_pred = -np.sum(p_pred * np.log2(p_pred))

    # Mutual information is defined on joint PROBABILITIES. Plugging in raw
    # counts scales the result by n and adds a spurious log2(n) term.
    joint_prob = contingency_matrix / n_samples
    ratio = np.where(joint_prob > 0, joint_prob / (p_true[:, None] * p_pred[None, :]), epsilon)
    mutual_info = np.sum(joint_prob * np.log2(ratio))

    return mutual_info / entropy_pred if entropy_pred > 0 else 0
|
119
|
+
|
120
|
+
def davies_bouldin_index(X, labels):
    """Compute the Davies-Bouldin index (lower means better separation).

    For each cluster the worst-case ratio
    ``(dispersion_i + dispersion_j) / centroid_distance(i, j)`` over all other
    clusters is taken, and these maxima are averaged.

    Args:
        X: Sample matrix with one row per sample. A 1-D input is treated as
            a single feature.
        labels: Cluster label of each row of ``X``.

    Returns:
        The mean of the per-cluster worst-case similarity ratios.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(labels, labels)
    # Arrays are needed for boolean-mask indexing and for ``axis=1`` norms.
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)

    clusters = [X[labels == label] for label in unique_labels]
    cluster_means = np.array([np.mean(points, axis=0) for points in clusters])
    # Dispersion = mean distance of a cluster's points to its centroid.
    dispersions = np.array([
        np.mean(np.linalg.norm(points - center, axis=1))
        for points, center in zip(clusters, cluster_means)
    ])

    db_index = np.mean([
        max(
            (dispersions[i] + dispersions[j]) / np.linalg.norm(cluster_means[i] - cluster_means[j])
            for j in range(n_clusters)
            if i != j
        )
        for i in range(n_clusters)
    ])

    return db_index
|
@@ -0,0 +1,271 @@
|
|
1
|
+
import numpy as np
|
2
|
+
from Moral88.utils import DataValidator
|
3
|
+
|
4
|
+
# Module-level validator instance used by the regression metrics below.
validator = DataValidator()
|
5
|
+
|
6
|
+
|
7
|
+
def mean_absolute_error(y_true, y_pred, sample_weights=None, normalize=False, method='mean'):
    """Compute Mean Absolute Error (MAE).

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
        sample_weights: Optional per-sample multipliers applied to the
            absolute errors before aggregation.
        normalize: If True, divide the errors by ``mean(y_true)``.
        method: ``'mean'``, ``'sum'``, or ``'none'`` (per-sample errors).

    Returns:
        A scalar for ``'mean'``/``'sum'``, an array for ``'none'``.

    Raises:
        ValueError: If ``sample_weights`` has the wrong length or ``method``
            is not recognised; also propagated from the validator.
        TypeError: Propagated from the validator.
    """
    validator.validate_all(y_true, y_pred)
    # Float arrays: lists cannot be subtracted, and the in-place ``*=`` / ``/=``
    # used previously raised a casting error on integer inputs.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = np.abs(y_true - y_pred)

    if sample_weights is not None:
        sample_weights = np.array(sample_weights)
        if len(sample_weights) != len(y_true):
            raise ValueError("sample_weights must have the same length as y_true and y_pred")
        errors = errors * sample_weights

    if normalize:
        errors = errors / np.mean(y_true)

    if method == 'mean':
        return np.mean(errors)
    elif method == 'sum':
        return np.sum(errors)
    elif method == 'none':
        return errors
    else:
        raise ValueError("Invalid method. Choose from 'mean', 'sum', or 'none'")
|
29
|
+
|
30
|
+
def mean_squared_error(y_true, y_pred, sample_weights=None, squared=True, method='mean'):
    """Compute Mean Squared Error (MSE), or its square root (RMSE).

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
        sample_weights: Optional per-sample multipliers applied to the
            squared errors before aggregation.
        squared: If False, return the square root of the result.
        method: ``'mean'``, ``'sum'``, or ``'none'`` (per-sample errors).

    Returns:
        A scalar for ``'mean'``/``'sum'``, an array for ``'none'``.

    Raises:
        ValueError: If ``sample_weights`` has the wrong length or ``method``
            is not recognised; also propagated from the validator.
        TypeError: Propagated from the validator.
    """
    validator.validate_all(y_true, y_pred)
    # Float arrays: lists cannot be subtracted, and in-place ``*=`` with float
    # weights raised a casting error on integer inputs.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = (y_true - y_pred) ** 2

    if sample_weights is not None:
        sample_weights = np.array(sample_weights)
        if len(sample_weights) != len(y_true):
            raise ValueError("sample_weights must have the same length as y_true and y_pred")
        errors = errors * sample_weights

    if method == 'mean':
        result = np.mean(errors)
    elif method == 'sum':
        result = np.sum(errors)
    elif method == 'none':
        # Honour ``squared`` here too, matching root_mean_squared_error.
        result = errors
    else:
        raise ValueError("Invalid method. Choose from 'mean', 'sum', or 'none'")

    return result if squared else np.sqrt(result)
|
51
|
+
|
52
|
+
def root_mean_squared_error(y_true, y_pred, sample_weights=None, method='mean'):
    """Compute Root Mean Squared Error (RMSE).

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
        sample_weights: Optional per-sample multipliers applied to the
            squared errors before aggregation.
        method: ``'mean'`` or ``'sum'`` aggregate the squared errors before
            the square root; ``'none'`` returns the per-sample roots.

    Returns:
        A scalar for ``'mean'``/``'sum'``, an array for ``'none'``.

    Raises:
        ValueError: If ``sample_weights`` has the wrong length or ``method``
            is not recognised; also propagated from the validator.
        TypeError: Propagated from the validator.
    """
    validator.validate_all(y_true, y_pred)
    # Float arrays: lists cannot be subtracted, and in-place ``*=`` with float
    # weights raised a casting error on integer inputs.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = (y_true - y_pred) ** 2

    if sample_weights is not None:
        sample_weights = np.array(sample_weights)
        if len(sample_weights) != len(y_true):
            raise ValueError("sample_weights must have the same length as y_true and y_pred")
        errors = errors * sample_weights

    if method == 'mean':
        result = np.mean(errors)
    elif method == 'sum':
        result = np.sum(errors)
    elif method == 'none':
        result = errors
    else:
        raise ValueError("Invalid method. Choose from 'mean', 'sum', or 'none'")

    return np.sqrt(result)
|
73
|
+
|
74
|
+
def mean_bias_deviation(y_true, y_pred, sample_weights=None, method='mean'):
    """Compute Mean Bias Deviation (MBD): the signed error ``y_true - y_pred``.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
        sample_weights: Optional per-sample multipliers applied to the
            signed errors before aggregation.
        method: ``'mean'``, ``'sum'``, or ``'none'`` (per-sample errors).

    Returns:
        A scalar for ``'mean'``/``'sum'``, an array for ``'none'``.

    Raises:
        ValueError: If ``sample_weights`` has the wrong length or ``method``
            is not recognised; also propagated from the validator.
        TypeError: Propagated from the validator.
    """
    validator.validate_all(y_true, y_pred)
    # Float arrays: lists cannot be subtracted, and in-place ``*=`` with float
    # weights raised a casting error on integer inputs.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = y_true - y_pred

    if sample_weights is not None:
        sample_weights = np.array(sample_weights)
        if len(sample_weights) != len(y_true):
            raise ValueError("sample_weights must have the same length as y_true and y_pred")
        errors = errors * sample_weights

    if method == 'mean':
        return np.mean(errors)
    elif method == 'sum':
        return np.sum(errors)
    elif method == 'none':
        return errors
    else:
        raise ValueError("Invalid method. Choose from 'mean', 'sum', or 'none'")
|
93
|
+
|
94
|
+
def r_squared(y_true, y_pred, adjusted=False, n_features=None):
    """Compute R-squared, or adjusted R-squared when requested.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
        adjusted: If True, return the adjusted R-squared.
        n_features: Number of predictors; required when ``adjusted`` is True.

    Returns:
        ``1 - SS_res / SS_tot``, or
        ``1 - (1 - R2) * (n - 1) / (n - n_features - 1)`` when adjusted.

    Raises:
        ValueError: If ``adjusted`` is True and ``n_features`` is missing or
            leaves no residual degrees of freedom; also propagated from the
            validator.
        TypeError: Propagated from the validator.
    """
    validator.validate_all(y_true, y_pred)
    # Float arrays so that list inputs support element-wise subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    ss_residual = np.sum((y_true - y_pred) ** 2)
    r2 = 1 - (ss_residual / ss_total)

    if adjusted:
        if n_features is None:
            raise ValueError("n_features must be provided for adjusted R² calculation")
        n = len(y_true)
        degrees_of_freedom = n - n_features - 1
        # A non-positive value would divide by zero or flip the sign of the
        # correction, so reject it instead of returning inf/garbage.
        if degrees_of_freedom <= 0:
            raise ValueError("Number of samples must exceed n_features + 1 for adjusted R² calculation")
        return 1 - ((1 - r2) * (n - 1) / degrees_of_freedom)

    return r2
|
108
|
+
|
109
|
+
def adjusted_r_squared(y_true, y_pred, n_features):
    """Compute adjusted R-squared.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
        n_features: Number of predictors used by the model.

    Returns:
        ``1 - (1 - R2) * (n - 1) / (n - n_features - 1)``.

    Raises:
        ValueError: If ``n_features`` leaves no residual degrees of freedom;
            also propagated from the validator.
        TypeError: Propagated from the validator.
    """
    validator.validate_all(y_true, y_pred)

    n = len(y_true)
    if n_features >= n:
        raise ValueError("Number of features must be less than number of samples")
    # n_features == n - 1 passes the check above but makes the denominator 0.
    if n - n_features - 1 <= 0:
        raise ValueError("Number of samples must exceed n_features + 1 for adjusted R² calculation")

    # Float arrays so that list inputs support element-wise subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    ss_residual = np.sum((y_true - y_pred) ** 2)
    r2 = 1 - (ss_residual / ss_total)

    return 1 - ((1 - r2) * (n - 1) / (n - n_features - 1))
|
122
|
+
|
123
|
+
def mean_absolute_percentage_error(y_true, y_pred, sample_weights=None, method='mean'):
    """Compute Mean Absolute Percentage Error (MAPE), in percent.

    Args:
        y_true: Ground-truth values; must not contain zeros.
        y_pred: Predicted values, same length as ``y_true``.
        sample_weights: Optional per-sample multipliers applied to the
            percentage errors before aggregation.
        method: ``'mean'``, ``'sum'``, or ``'none'`` (per-sample errors).

    Returns:
        A scalar for ``'mean'``/``'sum'``, an array for ``'none'``.

    Raises:
        ValueError: If ``y_true`` contains a zero, ``sample_weights`` has the
            wrong length, or ``method`` is not recognised; also propagated
            from the validator.
        TypeError: Propagated from the validator.
    """
    validator.validate_all(y_true, y_pred, mape_based=True)
    # Float arrays so that list inputs support element-wise arithmetic.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # The percentage error divides by y_true; a zero target would silently
    # turn the whole aggregate into inf/NaN.
    if np.any(y_true == 0):
        raise ValueError("y_true must not contain zeros for MAPE")
    errors = np.abs((y_true - y_pred) / y_true) * 100

    if sample_weights is not None:
        sample_weights = np.array(sample_weights)
        if len(sample_weights) != len(y_true):
            raise ValueError("sample_weights must have the same length as y_true and y_pred")
        errors = errors * sample_weights

    if method == 'mean':
        return np.mean(errors)
    elif method == 'sum':
        return np.sum(errors)
    elif method == 'none':
        return errors
    else:
        raise ValueError("Invalid method. Choose from 'mean', 'sum', or 'none'")
|
142
|
+
|
143
|
+
def symmetric_mean_absolute_percentage_error(y_true, y_pred, sample_weights=None, method='mean'):
    """Compute Symmetric Mean Absolute Percentage Error (sMAPE), in percent.

    Each error is ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``. A pair
    where both values are zero contributes an error of 0.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
        sample_weights: Optional per-sample multipliers applied to the
            errors before aggregation.
        method: ``'mean'``, ``'sum'``, or ``'none'`` (per-sample errors).

    Returns:
        A scalar for ``'mean'``/``'sum'``, an array for ``'none'``.

    Raises:
        ValueError: If ``sample_weights`` has the wrong length or ``method``
            is not recognised; also propagated from the validator.
        TypeError: Propagated from the validator.
    """
    validator.validate_all(y_true, y_pred, mape_based=True)
    # Float arrays so that list inputs support element-wise arithmetic.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = 200 * np.abs(y_true - y_pred)
    denominator = np.abs(y_true) + np.abs(y_pred)
    # Skip the division where both values are 0 (a perfect prediction) so the
    # result is 0 there instead of NaN from 0/0.
    errors = np.divide(numerator, denominator, out=np.zeros_like(numerator), where=denominator != 0)

    if sample_weights is not None:
        sample_weights = np.array(sample_weights)
        if len(sample_weights) != len(y_true):
            raise ValueError("sample_weights must have the same length as y_true and y_pred")
        errors = errors * sample_weights

    if method == 'mean':
        return np.mean(errors)
    elif method == 'sum':
        return np.sum(errors)
    elif method == 'none':
        return errors
    else:
        raise ValueError("Invalid method. Choose from 'mean', 'sum', or 'none'")
|
162
|
+
|
163
|
+
def huber_loss(y_true, y_pred, delta=1.0, sample_weights=None, method='mean'):
    """Compute Huber loss.

    The loss is quadratic (``0.5 * e**2``) for residuals with ``|e| <= delta``
    and linear (``delta * (|e| - 0.5 * delta)``) beyond that.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
        delta: Residual magnitude at which the loss switches from quadratic
            to linear.
        sample_weights: Optional per-sample multipliers applied to the
            losses before aggregation.
        method: ``'mean'``, ``'sum'``, or ``'none'`` (per-sample losses).

    Returns:
        A scalar for ``'mean'``/``'sum'``, an array for ``'none'``.

    Raises:
        ValueError: If ``sample_weights`` has the wrong length or ``method``
            is not recognised; also propagated from the validator.
        TypeError: Propagated from the validator.
    """
    validator.validate_all(y_true, y_pred)
    # Float arrays so that list inputs support element-wise subtraction.
    error = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    abs_error = np.abs(error)

    loss = np.where(abs_error <= delta, 0.5 * error ** 2, delta * (abs_error - 0.5 * delta))

    if sample_weights is not None:
        sample_weights = np.array(sample_weights)
        if len(sample_weights) != len(error):
            raise ValueError("sample_weights must have the same length as y_true and y_pred")
        loss = loss * sample_weights

    if method == 'mean':
        return np.mean(loss)
    elif method == 'sum':
        return np.sum(loss)
    elif method == 'none':
        return loss
    else:
        raise ValueError("Invalid method. Choose from 'mean', 'sum', or 'none'")
|
184
|
+
|
185
|
+
def relative_squared_error(y_true, y_pred):
    """Compute Relative Squared Error (RSE).

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The residual sum of squares divided by the sum of squared deviations
        of ``y_true`` from its mean.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(y_true, y_pred)
    # Float arrays so that list inputs support element-wise subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    residual_ss = np.sum((y_true - y_pred) ** 2)
    total_ss = np.sum((y_true - np.mean(y_true)) ** 2)
    return residual_ss / total_ss
|
189
|
+
|
190
|
+
def mean_squared_log_error(y_true, y_pred, sample_weights=None, method='mean', squared=True):
    """Compute Mean Squared Logarithmic Error (MSLE), or its root (RMSLE).

    Each error is ``(log1p(y_true) - log1p(y_pred)) ** 2``.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
        sample_weights: Optional per-sample multipliers applied to the
            squared log errors before aggregation.
        method: ``'mean'``, ``'sum'``, or ``'none'`` (per-sample errors).
        squared: If False, return the square root of the result.

    Returns:
        A scalar for ``'mean'``/``'sum'``, an array for ``'none'``.

    Raises:
        ValueError: If ``sample_weights`` has the wrong length or ``method``
            is not recognised; also propagated from the validator.
        TypeError: Propagated from the validator.
    """
    validator.validate_all(y_true, y_pred, log_based=True)
    # Plain float arrays: pandas inputs would otherwise be aligned by index
    # label rather than by position when subtracted.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = (np.log1p(y_true) - np.log1p(y_pred)) ** 2

    if sample_weights is not None:
        sample_weights = np.array(sample_weights)
        if len(sample_weights) != len(y_true):
            raise ValueError("sample_weights must have the same length as y_true and y_pred")
        errors = errors * sample_weights

    if method == 'mean':
        result = np.mean(errors)
    elif method == 'sum':
        result = np.sum(errors)
    elif method == 'none':
        # Honour ``squared`` here too, matching root_mean_squared_log_error.
        result = errors
    else:
        raise ValueError("Invalid method. Choose from 'mean', 'sum', or 'none'")

    return result if squared else np.sqrt(result)
|
211
|
+
|
212
|
+
def root_mean_squared_log_error(y_true, y_pred, sample_weights=None, method='mean'):
    """Compute Root Mean Squared Logarithmic Error (RMSLE).

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
        sample_weights: Optional per-sample multipliers applied to the
            squared log errors before aggregation.
        method: ``'mean'`` or ``'sum'`` aggregate the squared log errors
            before the square root; ``'none'`` returns the per-sample roots.

    Returns:
        A scalar for ``'mean'``/``'sum'``, an array for ``'none'``.

    Raises:
        ValueError: If ``sample_weights`` has the wrong length or ``method``
            is not recognised; also propagated from the validator.
        TypeError: Propagated from the validator.
    """
    validator.validate_all(y_true, y_pred, log_based=True)
    # Plain float arrays: pandas inputs would otherwise be aligned by index
    # label rather than by position when subtracted.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = (np.log1p(y_true) - np.log1p(y_pred)) ** 2

    if sample_weights is not None:
        sample_weights = np.array(sample_weights)
        if len(sample_weights) != len(y_true):
            raise ValueError("sample_weights must have the same length as y_true and y_pred")
        errors = errors * sample_weights

    if method == 'mean':
        aggregated = np.mean(errors)
    elif method == 'sum':
        aggregated = np.sum(errors)
    elif method == 'none':
        aggregated = errors
    else:
        raise ValueError("Invalid method. Choose from 'mean', 'sum', or 'none'")

    return np.sqrt(aggregated)
|
231
|
+
|
232
|
+
def log_cosh_loss(y_true, y_pred, sample_weights=None, method='mean'):
    """Compute Log-Cosh loss: ``log(cosh(y_pred - y_true))`` per sample.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
        sample_weights: Optional per-sample multipliers applied to the
            losses before aggregation.
        method: ``'mean'``, ``'sum'``, or ``'none'`` (per-sample losses).

    Returns:
        A scalar for ``'mean'``/``'sum'``, an array for ``'none'``.

    Raises:
        ValueError: If ``sample_weights`` has the wrong length or ``method``
            is not recognised; also propagated from the validator.
        TypeError: Propagated from the validator.
    """
    validator.validate_all(y_true, y_pred)
    # Float arrays so that list inputs support element-wise subtraction.
    diff = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    # log(cosh(x)) == log(e^x + e^-x) - log(2). np.logaddexp evaluates the
    # first term without overflow, whereas np.cosh returns inf once |x|
    # exceeds ~710. The clamp removes tiny negative rounding noise near 0.
    errors = np.maximum(np.logaddexp(diff, -diff) - np.log(2.0), 0.0)

    if sample_weights is not None:
        sample_weights = np.array(sample_weights)
        if len(sample_weights) != len(diff):
            raise ValueError("sample_weights must have the same length as y_true and y_pred")
        errors = errors * sample_weights

    if method == 'mean':
        return np.mean(errors)
    elif method == 'sum':
        return np.sum(errors)
    elif method == 'none':
        return errors
    else:
        raise ValueError("Invalid method. Choose from 'mean', 'sum', or 'none'")
|
251
|
+
|
252
|
+
def explained_variance(y_true, y_pred):
    """Compute the explained variance score.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        ``1 - Var(y_true - y_pred) / Var(y_true)``, or 0 when ``y_true`` has
        zero variance.

    Raises:
        TypeError, ValueError: Propagated from ``validator.validate_all``.
    """
    validator.validate_all(y_true, y_pred)
    # Float arrays so that list inputs support element-wise subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    variance_y_true = np.var(y_true)
    if variance_y_true == 0:
        return 0
    return 1 - (np.var(y_true - y_pred) / variance_y_true)
|
257
|
+
|
258
|
+
def median_absolute_error(y_true, y_pred, sample_weights=None):
    """Compute the median of the absolute errors.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.
        sample_weights: Optional per-sample multipliers. They scale each
            absolute error before the median is taken; this is not a
            weighted median.

    Returns:
        The median of the (optionally scaled) absolute errors.

    Raises:
        ValueError: If ``sample_weights`` has the wrong length; also
            propagated from the validator.
        TypeError: Propagated from the validator.
    """
    # Uses the module-level ``validator`` like every other metric here,
    # instead of constructing a throwaway DataValidator on each call.
    validator.validate_all(y_true, y_pred)
    # Float arrays: lists cannot be subtracted, and in-place ``*=`` with float
    # weights raised a casting error on integer inputs.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = np.abs(y_true - y_pred)

    if sample_weights is not None:
        sample_weights = np.array(sample_weights)
        if len(sample_weights) != len(y_true):
            raise ValueError("sample_weights must have the same length as y_true and y_pred")
        errors = errors * sample_weights

    return np.median(errors)
|
271
|
+
|
@@ -0,0 +1,71 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import pandas as pd
|
3
|
+
|
4
|
+
class DataValidator:
    """Input sanity checks shared by the metric functions.

    Each ``check_*`` method raises on failure and returns ``None`` on
    success; ``validate_all`` runs the standard sequence of checks.
    """

    def __init__(self, raise_warning=True):
        """Initialize the validator.

        Args:
            raise_warning: When True, ``check_multicollinearity`` prints a
                notice if it is given something other than a DataFrame.
        """
        self.raise_warning = raise_warning

    def check_data_type(self, y_true, y_pred):
        """Raise TypeError unless both inputs are an ndarray, Series, DataFrame, or list."""
        valid_types = (np.ndarray, pd.Series, pd.DataFrame, list)
        if not isinstance(y_true, valid_types) or not isinstance(y_pred, valid_types):
            raise TypeError("y_true and y_pred must be numpy array, pandas series, or list")

    def check_missing_values(self, y_true, y_pred):
        """Raise ValueError if either input contains a null/NaN value."""
        if np.any(pd.isnull(y_true)) or np.any(pd.isnull(y_pred)):
            raise ValueError("Missing values (NaN) detected in data")

    def check_inf_values(self, y_true, y_pred):
        """Raise ValueError if either input contains an infinite value."""
        if np.any(np.isinf(y_true)) or np.any(np.isinf(y_pred)):
            raise ValueError("Infinite values (inf) detected in data")

    def check_lengths(self, y_true, y_pred):
        """Raise ValueError if the two inputs differ in length."""
        if len(y_true) != len(y_pred):
            raise ValueError("y_true and y_pred must have the same length")

    def check_numeric_values(self, y_true, y_pred):
        """Raise TypeError if either input does not convert to a numeric array."""
        if not np.issubdtype(np.array(y_true).dtype, np.number) or not np.issubdtype(np.array(y_pred).dtype, np.number):
            raise TypeError("y_true and y_pred must contain numeric values")

    def check_variance(self, y_true, y_pred):
        """Raise ValueError if ``y_true`` has zero variance (``y_pred`` is not inspected)."""
        if np.var(y_true) == 0:
            raise ValueError("Variance of y_true is zero. R-squared may not be meaningful")

    def check_non_negative(self, y_true, y_pred):
        """Raise ValueError if any value in either input is below -1.

        NOTE(review): a value of exactly -1 passes this check although
        ``log1p(-1)`` is ``-inf`` - confirm whether it should be rejected.
        """
        # Convert first: comparing a plain list with ``< -1`` raises TypeError.
        if np.any(np.asarray(y_true) < -1) or np.any(np.asarray(y_pred) < -1):
            raise ValueError("y_true and y_pred must be greater than or equal to -1 for log-based metrics")

    def check_multicollinearity(self, X, threshold=0.9):
        """Raise ValueError if any two distinct columns of ``X`` correlate above ``threshold``.

        Args:
            X: Feature table. Anything other than a DataFrame is skipped,
                with a printed notice when ``raise_warning`` is set.
            threshold: Absolute correlation above which a pair is flagged.
        """
        if isinstance(X, pd.DataFrame):
            corr_values = X.corr().abs().values.copy()
            # Blank the diagonal explicitly. Subtracting the column count
            # assumed every self-correlation exceeds the threshold, which is
            # false for constant columns (NaN) and hid genuine pairs.
            np.fill_diagonal(corr_values, 0.0)
            if np.any(corr_values > threshold):
                raise ValueError("High multicollinearity detected in input features")
        else:
            if self.raise_warning:
                print("Warning: Multicollinearity check requires a pandas DataFrame")

    def validate_all(self, y_true, y_pred, log_based=False, mape_based=False):
        """Run the standard validation sequence.

        Args:
            y_true: First input (ground truth or labels).
            y_pred: Second input, compared against ``y_true``.
            log_based: Also run ``check_non_negative``.
            mape_based: Also run ``check_non_negative``.

        Returns:
            True when every check passes.

        Raises:
            TypeError: For unsupported container types or non-numeric data.
            ValueError: For nulls, infinities, length mismatch, zero variance
                in ``y_true``, or values below -1 when requested.
        """
        self.check_data_type(y_true, y_pred)
        self.check_missing_values(y_true, y_pred)
        # Check dtypes before np.isinf so non-numeric data gets the
        # descriptive TypeError instead of an opaque ufunc error.
        self.check_numeric_values(y_true, y_pred)
        self.check_inf_values(y_true, y_pred)
        self.check_lengths(y_true, y_pred)
        self.check_variance(y_true, y_pred)
        if log_based or mape_based:
            self.check_non_negative(y_true, y_pred)
        return True
|
67
|
+
|
68
|
+
|
69
|
+
# Example usage
|
70
|
+
if __name__ == "__main__":
|
71
|
+
pass
|
@@ -2,13 +2,19 @@ LICENSE
|
|
2
2
|
README.md
|
3
3
|
setup.py
|
4
4
|
Moral88/__init__.py
|
5
|
+
Moral88/classification.py
|
6
|
+
Moral88/clustering.py
|
5
7
|
Moral88/regression.py
|
6
|
-
Moral88/segmentation.py
|
7
8
|
Moral88/utils.py
|
8
9
|
Moral88.egg-info/PKG-INFO
|
9
10
|
Moral88.egg-info/SOURCES.txt
|
10
11
|
Moral88.egg-info/dependency_links.txt
|
11
12
|
Moral88.egg-info/requires.txt
|
12
13
|
Moral88.egg-info/top_level.txt
|
13
|
-
|
14
|
-
|
14
|
+
Test/__init__.py
|
15
|
+
Test/test_classification.py
|
16
|
+
Test/test_clustering.py
|
17
|
+
Test/test_regression.py
|
18
|
+
test/test_classification.py
|
19
|
+
test/test_clustering.py
|
20
|
+
test/test_regression.py
|
@@ -1,2 +1,2 @@
|
|
1
1
|
Moral88
|
2
|
-
|
2
|
+
Test
|