smallaxe 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smallaxe/__init__.py +157 -0
- smallaxe/_config.py +37 -0
- smallaxe/auto/__init__.py +1 -0
- smallaxe/datasets/__init__.py +13 -0
- smallaxe/datasets/_data.py +240 -0
- smallaxe/exceptions/__init__.py +120 -0
- smallaxe/metrics/__init__.py +35 -0
- smallaxe/metrics/classification.py +241 -0
- smallaxe/metrics/regression.py +301 -0
- smallaxe/pipeline/__init__.py +5 -0
- smallaxe/pipeline/pipeline.py +691 -0
- smallaxe/preprocessing/__init__.py +11 -0
- smallaxe/preprocessing/encoder.py +410 -0
- smallaxe/preprocessing/imputer.py +327 -0
- smallaxe/preprocessing/scaler.py +285 -0
- smallaxe/search/__init__.py +1 -0
- smallaxe/training/__init__.py +16 -0
- smallaxe/training/base.py +764 -0
- smallaxe/training/classifiers.py +127 -0
- smallaxe/training/mixins/__init__.py +15 -0
- smallaxe/training/mixins/metadata_mixin.py +158 -0
- smallaxe/training/mixins/param_mixin.py +151 -0
- smallaxe/training/mixins/persistence_mixin.py +164 -0
- smallaxe/training/mixins/spark_model_mixin.py +255 -0
- smallaxe/training/mixins/validation_mixin.py +228 -0
- smallaxe/training/random_forest.py +198 -0
- smallaxe/training/regressors.py +125 -0
- smallaxe/viz/__init__.py +1 -0
- smallaxe-0.1.0.dist-info/METADATA +204 -0
- smallaxe-0.1.0.dist-info/RECORD +33 -0
- smallaxe-0.1.0.dist-info/WHEEL +5 -0
- smallaxe-0.1.0.dist-info/licenses/LICENSE +21 -0
- smallaxe-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
"""Classification metrics for evaluating model predictions."""
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from pyspark.ml.evaluation import BinaryClassificationEvaluator
|
|
6
|
+
from pyspark.sql import DataFrame
|
|
7
|
+
from pyspark.sql import functions as F
|
|
8
|
+
|
|
9
|
+
from smallaxe.exceptions import ColumnNotFoundError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _validate_columns(df: DataFrame, *cols: str) -> None:
    """Ensure every requested column name is present in ``df``.

    Columns are checked in the order given; the first one that is missing
    is reported.

    Args:
        df: PySpark DataFrame whose column list is inspected.
        *cols: Names of the columns that must exist.

    Raises:
        ColumnNotFoundError: If a requested column is not in ``df.columns``.
    """
    present: List[str] = df.columns
    for name in cols:
        if name in present:
            continue
        raise ColumnNotFoundError(column=name, available_columns=present)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def accuracy(
    df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label"
) -> float:
    """Return the fraction of rows whose prediction equals the true label.

    Accuracy = correct / total = (TP + TN) / (TP + TN + FP + FN)

    Args:
        df: PySpark DataFrame containing true and predicted labels.
        label_col: Name of the column containing true labels. Default is 'label'.
        prediction_col: Name of the column containing predictions. Default is 'predict_label'.

    Returns:
        Accuracy as a float between 0 and 1.
        Returns 0.0 if the DataFrame has no rows.

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, prediction_col)

    n_rows = df.count()
    if n_rows == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    is_correct = F.col(label_col) == F.col(prediction_col)
    n_correct = df.filter(is_correct).count()
    return float(n_correct / n_rows)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def precision(
    df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label"
) -> float:
    """Compute binary-classification precision.

    Precision = TP / (TP + FP)

    Args:
        df: PySpark DataFrame containing true and predicted labels.
        label_col: Name of the column containing true labels (0 or 1). Default is 'label'.
        prediction_col: Name of the column containing predictions (0 or 1). Default is 'predict_label'.

    Returns:
        Precision as a float between 0 and 1.
        Returns 0.0 if there are no positive predictions.

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, prediction_col)

    predicted_positive = F.col(prediction_col) == 1
    actually_positive = F.col(label_col) == 1

    # Numerator: rows predicted positive that really are positive.
    tp_count = df.filter(predicted_positive & actually_positive).count()

    # Denominator: every row predicted positive (TP + FP).
    predicted_count = df.filter(predicted_positive).count()
    if predicted_count == 0:
        return 0.0

    return float(tp_count / predicted_count)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def recall(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
    """Compute binary-classification recall (sensitivity, true positive rate).

    Recall = TP / (TP + FN)

    Args:
        df: PySpark DataFrame containing true and predicted labels.
        label_col: Name of the column containing true labels (0 or 1). Default is 'label'.
        prediction_col: Name of the column containing predictions (0 or 1). Default is 'predict_label'.

    Returns:
        Recall as a float between 0 and 1.
        Returns 0.0 if there are no actual positive labels.

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, prediction_col)

    predicted_positive = F.col(prediction_col) == 1
    actually_positive = F.col(label_col) == 1

    # Numerator: rows predicted positive that really are positive.
    tp_count = df.filter(predicted_positive & actually_positive).count()

    # Denominator: every row whose true label is positive (TP + FN).
    positive_count = df.filter(actually_positive).count()
    if positive_count == 0:
        return 0.0

    return float(tp_count / positive_count)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def f1_score(
    df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label"
) -> float:
    """Compute the F1 score (harmonic mean of precision and recall).

    F1 = 2 * (precision * recall) / (precision + recall)

    Args:
        df: PySpark DataFrame containing true and predicted labels.
        label_col: Name of the column containing true labels (0 or 1). Default is 'label'.
        prediction_col: Name of the column containing predictions (0 or 1). Default is 'predict_label'.

    Returns:
        F1 score as a float between 0 and 1.
        Returns 0.0 if precision + recall = 0.

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, prediction_col)

    p = precision(df, label_col, prediction_col)
    r = recall(df, label_col, prediction_col)

    denominator = p + r
    if denominator == 0:
        # Both components are zero; the harmonic mean is defined here as 0.0.
        return 0.0

    return float(2 * (p * r) / denominator)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def auc_roc(
    df: DataFrame, label_col: str = "label", probability_col: str = "probability"
) -> float:
    """Compute Area Under the ROC Curve (AUC-ROC).

    The score is produced by Spark's ``BinaryClassificationEvaluator`` with
    ``metricName="areaUnderROC"``, using ``probability_col`` as the
    evaluator's raw-prediction column.

    Args:
        df: PySpark DataFrame containing true labels and probability scores.
        label_col: Name of the column containing true labels. Default is 'label'.
        probability_col: Name of the column containing probability scores.
            Default is 'probability'.

    Returns:
        AUC-ROC as a float.
        Returns 0.0 if the DataFrame is empty or if the label column holds
        fewer than two distinct values.

    Raises:
        ColumnNotFoundError: If label_col or probability_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, probability_col)

    # 1. Empty DataFrame: return 0.0 explicitly instead of whatever the
    #    evaluator would report. A single limit(1) probe is sufficient.
    if df.limit(1).count() == 0:
        return 0.0

    # 2. Single-class data (no negatives or no positives): return 0.0.
    #    limit(2) stops as soon as two distinct labels are seen, so the
    #    distinct values never have to be collected onto the driver.
    distinct_label_count = df.select(label_col).distinct().limit(2).count()
    if distinct_label_count < 2:
        return 0.0

    # 3. Delegate the actual computation to Spark's evaluator.
    evaluator = BinaryClassificationEvaluator(
        labelCol=label_col, rawPredictionCol=probability_col, metricName="areaUnderROC"
    )

    return float(evaluator.evaluate(df))
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def auc_pr(df: DataFrame, label_col: str = "label", probability_col: str = "probability") -> float:
    """Compute Area Under the Precision-Recall Curve (AUC-PR).

    The score is produced by Spark's ``BinaryClassificationEvaluator`` with
    ``metricName="areaUnderPR"``, using ``probability_col`` as the
    evaluator's raw-prediction column.

    Args:
        df: PySpark DataFrame containing true labels and probability scores.
        label_col: Name of the column containing true labels. Default is 'label'.
        probability_col: Name of the column containing probability scores.
            Default is 'probability'.

    Returns:
        AUC-PR as a float.
        Returns 0.0 if the DataFrame is empty or contains no row whose
        label equals 1.

    Raises:
        ColumnNotFoundError: If label_col or probability_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, probability_col)

    # Guard 1: nothing to evaluate.
    if df.limit(1).count() == 0:
        return 0.0

    # Guard 2: without any positive label, recall is undefined, so the
    # metric is reported as 0.0.
    positive_rows = df.filter(F.col(label_col) == 1)
    if positive_rows.limit(1).count() == 0:
        return 0.0

    pr_evaluator = BinaryClassificationEvaluator(
        metricName="areaUnderPR",
        labelCol=label_col,
        rawPredictionCol=probability_col,
    )
    return float(pr_evaluator.evaluate(df))
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def log_loss(
    df: DataFrame,
    label_col: str = "label",
    probability_col: str = "probability",
    eps: float = 1e-15,
) -> float:
    """Compute logarithmic loss (binary cross-entropy).

    Log Loss = -(1/n) * sum(y * log(p) + (1-y) * log(1-p))

    Args:
        df: PySpark DataFrame containing true labels and probability scores.
        label_col: Name of the column containing true labels (0 or 1). Default is 'label'.
        probability_col: Name of the column containing probability scores. Default is 'probability'.
        eps: Small value to avoid log(0). Default is 1e-15.

    Returns:
        Log loss as a float. Lower values indicate better predictions.
        Returns 0.0 if the averaged loss comes back as None.

    Raises:
        ColumnNotFoundError: If label_col or probability_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, probability_col)

    # Clamp probabilities into [eps, 1 - eps] so neither log() sees zero.
    upper_bound = 1 - eps
    p = (
        F.when(F.col(probability_col) < eps, eps)
        .when(F.col(probability_col) > upper_bound, upper_bound)
        .otherwise(F.col(probability_col))
    )

    # Per-row loss: -[y * log(p) + (1 - y) * log(1 - p)]
    per_row_loss = -(
        F.col(label_col) * F.log(p) + (1 - F.col(label_col)) * F.log(1 - p)
    )

    row = df.select(F.avg(per_row_loss).alias("log_loss")).first()
    mean_loss = row["log_loss"]

    return 0.0 if mean_loss is None else float(mean_loss)
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
"""Regression metrics for evaluating model predictions."""
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from pyspark.ml.evaluation import RegressionEvaluator
|
|
6
|
+
from pyspark.sql import DataFrame
|
|
7
|
+
from pyspark.sql import functions as F
|
|
8
|
+
|
|
9
|
+
from smallaxe.exceptions import ColumnNotFoundError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _validate_columns(df: DataFrame, label_col: str, prediction_col: str) -> None:
    """Ensure the label and prediction columns are both present in ``df``.

    The label column is checked first, then the prediction column; the first
    one that is missing is reported.

    Args:
        df: PySpark DataFrame whose column list is inspected.
        label_col: Name of the column containing true labels.
        prediction_col: Name of the column containing predictions.

    Raises:
        ColumnNotFoundError: If either column is not in ``df.columns``.
    """
    present: List[str] = df.columns
    for name in (label_col, prediction_col):
        if name in present:
            continue
        raise ColumnNotFoundError(column=name, available_columns=present)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# def mse(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
|
|
30
|
+
# """Compute Mean Squared Error.
|
|
31
|
+
|
|
32
|
+
# MSE = (1/n) * sum((y_true - y_pred)^2)
|
|
33
|
+
|
|
34
|
+
# Args:
|
|
35
|
+
# df: PySpark DataFrame containing true and predicted values.
|
|
36
|
+
# label_col: Name of the column containing true labels. Default is 'label'.
|
|
37
|
+
# prediction_col: Name of the column containing predictions. Default is 'predict_label'.
|
|
38
|
+
|
|
39
|
+
# Returns:
|
|
40
|
+
# Mean Squared Error as a float.
|
|
41
|
+
|
|
42
|
+
# Raises:
|
|
43
|
+
# ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
|
|
44
|
+
# """
|
|
45
|
+
# _validate_columns(df, label_col, prediction_col)
|
|
46
|
+
|
|
47
|
+
# result = df.select(
|
|
48
|
+
# F.avg(F.pow(F.col(label_col) - F.col(prediction_col), 2)).alias("mse")
|
|
49
|
+
# ).first()
|
|
50
|
+
|
|
51
|
+
# return float(result["mse"]) if result["mse"] is not None else 0.0
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def mse(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
    """Compute Mean Squared Error using Spark's RegressionEvaluator.

    MSE = (1/n) * sum((y_true - y_pred)^2)

    Args:
        df: PySpark DataFrame containing true and predicted values.
        label_col: Name of the column containing true labels. Default is 'label'.
        prediction_col: Name of the column containing predictions. Default is 'predict_label'.

    Returns:
        Mean Squared Error as a float.
        Returns 0.0 if the DataFrame is empty, or if the evaluator raises
        (in which case the failure is logged as a warning).

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    import logging

    _validate_columns(df, label_col, prediction_col)

    # 1. Empty DataFrame: report 0.0, consistent with the other metrics.
    if df.limit(1).count() == 0:
        return 0.0

    # 2. Configure the evaluator.
    evaluator = RegressionEvaluator(
        labelCol=label_col, predictionCol=prediction_col, metricName="mse"
    )

    # 3. Evaluate. Only the Spark call is guarded.
    try:
        result = evaluator.evaluate(df)
    except Exception:
        # The 0.0 fallback is kept for backward compatibility, but 0.0 is
        # also the best possible score, so the failure must not be silent.
        logging.getLogger(__name__).warning(
            "mse: RegressionEvaluator failed (label_col=%r, prediction_col=%r); returning 0.0",
            label_col,
            prediction_col,
            exc_info=True,
        )
        return 0.0

    return float(result)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# def rmse(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
|
|
77
|
+
# """Compute Root Mean Squared Error.
|
|
78
|
+
|
|
79
|
+
# RMSE = sqrt(MSE) = sqrt((1/n) * sum((y_true - y_pred)^2))
|
|
80
|
+
|
|
81
|
+
# Args:
|
|
82
|
+
# df: PySpark DataFrame containing true and predicted values.
|
|
83
|
+
# label_col: Name of the column containing true labels. Default is 'label'.
|
|
84
|
+
# prediction_col: Name of the column containing predictions. Default is 'predict_label'.
|
|
85
|
+
|
|
86
|
+
# Returns:
|
|
87
|
+
# Root Mean Squared Error as a float.
|
|
88
|
+
|
|
89
|
+
# Raises:
|
|
90
|
+
# ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
|
|
91
|
+
# """
|
|
92
|
+
# _validate_columns(df, label_col, prediction_col)
|
|
93
|
+
|
|
94
|
+
# result = df.select(
|
|
95
|
+
# F.sqrt(F.avg(F.pow(F.col(label_col) - F.col(prediction_col), 2))).alias("rmse")
|
|
96
|
+
# ).first()
|
|
97
|
+
|
|
98
|
+
# return float(result["rmse"]) if result["rmse"] is not None else 0.0
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def rmse(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
    """Compute Root Mean Squared Error using Spark's RegressionEvaluator.

    RMSE = sqrt(MSE) = sqrt((1/n) * sum((y_true - y_pred)^2))

    Args:
        df: PySpark DataFrame containing true and predicted values.
        label_col: Name of the column containing true labels. Default is 'label'.
        prediction_col: Name of the column containing predictions. Default is 'predict_label'.

    Returns:
        Root Mean Squared Error as a float.
        Returns 0.0 if the DataFrame is empty, or if the evaluator raises
        (in which case the failure is logged as a warning).

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    import logging

    _validate_columns(df, label_col, prediction_col)

    # 1. Empty DataFrame: report 0.0, consistent with the other metrics.
    if df.limit(1).count() == 0:
        return 0.0

    # 2. Configure the evaluator.
    evaluator = RegressionEvaluator(
        labelCol=label_col, predictionCol=prediction_col, metricName="rmse"
    )

    # 3. Evaluate. Only the Spark call is guarded.
    try:
        result = evaluator.evaluate(df)
    except Exception:
        # The 0.0 fallback is kept for backward compatibility, but 0.0 is
        # also the best possible score, so the failure must not be silent.
        logging.getLogger(__name__).warning(
            "rmse: RegressionEvaluator failed (label_col=%r, prediction_col=%r); returning 0.0",
            label_col,
            prediction_col,
            exc_info=True,
        )
        return 0.0

    return float(result)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# def mae(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
|
|
125
|
+
# """Compute Mean Absolute Error.
|
|
126
|
+
|
|
127
|
+
# MAE = (1/n) * sum(|y_true - y_pred|)
|
|
128
|
+
|
|
129
|
+
# Args:
|
|
130
|
+
# df: PySpark DataFrame containing true and predicted values.
|
|
131
|
+
# label_col: Name of the column containing true labels. Default is 'label'.
|
|
132
|
+
# prediction_col: Name of the column containing predictions. Default is 'predict_label'.
|
|
133
|
+
|
|
134
|
+
# Returns:
|
|
135
|
+
# Mean Absolute Error as a float.
|
|
136
|
+
|
|
137
|
+
# Raises:
|
|
138
|
+
# ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
|
|
139
|
+
# """
|
|
140
|
+
# _validate_columns(df, label_col, prediction_col)
|
|
141
|
+
|
|
142
|
+
# result = df.select(F.avg(F.abs(F.col(label_col) - F.col(prediction_col))).alias("mae")).first()
|
|
143
|
+
|
|
144
|
+
# return float(result["mae"]) if result["mae"] is not None else 0.0
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def mae(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
    """Compute Mean Absolute Error using Spark's RegressionEvaluator.

    MAE = (1/n) * sum(|y_true - y_pred|)

    Args:
        df: PySpark DataFrame containing true and predicted values.
        label_col: Name of the column containing true labels. Default is 'label'.
        prediction_col: Name of the column containing predictions. Default is 'predict_label'.

    Returns:
        Mean Absolute Error as a float.
        Returns 0.0 if the DataFrame is empty, or if the evaluator raises
        (in which case the failure is logged as a warning).

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    import logging

    _validate_columns(df, label_col, prediction_col)

    # 1. Empty DataFrame: report 0.0, consistent with the other metrics.
    if df.limit(1).count() == 0:
        return 0.0

    # 2. Configure the evaluator.
    evaluator = RegressionEvaluator(
        labelCol=label_col, predictionCol=prediction_col, metricName="mae"
    )

    # 3. Evaluate. Only the Spark call is guarded.
    try:
        result = evaluator.evaluate(df)
    except Exception:
        # The 0.0 fallback is kept for backward compatibility, but 0.0 is
        # also the best possible score, so the failure must not be silent.
        logging.getLogger(__name__).warning(
            "mae: RegressionEvaluator failed (label_col=%r, prediction_col=%r); returning 0.0",
            label_col,
            prediction_col,
            exc_info=True,
        )
        return 0.0

    return float(result)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# def r2(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
|
|
171
|
+
# """Compute R-squared (Coefficient of Determination).
|
|
172
|
+
|
|
173
|
+
# R2 = 1 - (SS_res / SS_tot)
|
|
174
|
+
# where:
|
|
175
|
+
# SS_res = sum((y_true - y_pred)^2) (residual sum of squares)
|
|
176
|
+
# SS_tot = sum((y_true - y_mean)^2) (total sum of squares)
|
|
177
|
+
|
|
178
|
+
# Args:
|
|
179
|
+
# df: PySpark DataFrame containing true and predicted values.
|
|
180
|
+
# label_col: Name of the column containing true labels. Default is 'label'.
|
|
181
|
+
# prediction_col: Name of the column containing predictions. Default is 'predict_label'.
|
|
182
|
+
|
|
183
|
+
# Returns:
|
|
184
|
+
# R-squared score as a float. Can be negative for very poor predictions.
|
|
185
|
+
|
|
186
|
+
# Raises:
|
|
187
|
+
# ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
|
|
188
|
+
# """
|
|
189
|
+
# _validate_columns(df, label_col, prediction_col)
|
|
190
|
+
|
|
191
|
+
# # Compute mean of true labels
|
|
192
|
+
# mean_label = df.select(F.avg(F.col(label_col))).first()[0]
|
|
193
|
+
|
|
194
|
+
# if mean_label is None:
|
|
195
|
+
# return 0.0
|
|
196
|
+
|
|
197
|
+
# # Compute SS_res and SS_tot
|
|
198
|
+
# result = df.select(
|
|
199
|
+
# F.sum(F.pow(F.col(label_col) - F.col(prediction_col), 2)).alias("ss_res"),
|
|
200
|
+
# F.sum(F.pow(F.col(label_col) - F.lit(mean_label), 2)).alias("ss_tot"),
|
|
201
|
+
# ).first()
|
|
202
|
+
|
|
203
|
+
# ss_res = result["ss_res"]
|
|
204
|
+
# ss_tot = result["ss_tot"]
|
|
205
|
+
|
|
206
|
+
# if ss_res is None or ss_tot is None:
|
|
207
|
+
# return 0.0
|
|
208
|
+
|
|
209
|
+
# # Handle case where SS_tot is zero (all true values are the same)
|
|
210
|
+
# if ss_tot == 0:
|
|
211
|
+
# return 1.0 if ss_res == 0 else 0.0
|
|
212
|
+
|
|
213
|
+
# return float(1 - (ss_res / ss_tot))
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def r2(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
    """Compute R-squared (Coefficient of Determination) using Spark's RegressionEvaluator.

    R2 = 1 - (SS_res / SS_tot)
    where:
        SS_res = sum((y_true - y_pred)^2)  (residual sum of squares)
        SS_tot = sum((y_true - y_mean)^2)  (total sum of squares)

    Args:
        df: PySpark DataFrame containing true and predicted values.
        label_col: Name of the column containing true labels. Default is 'label'.
        prediction_col: Name of the column containing predictions. Default is 'predict_label'.

    Returns:
        R-squared score as a float. Special cases:
        - 0.0 if the DataFrame is empty or the label column has no
          non-null value.
        - If the labels are constant (SS_tot = 0): 1.0 when no row has
          label != prediction, otherwise 0.0.
        - 0.0 if the evaluator returns NaN or -inf, or raises (a raised
          failure is logged as a warning).

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    import logging
    import math

    _validate_columns(df, label_col, prediction_col)

    # 1. Empty DataFrame.
    if df.limit(1).count() == 0:
        return 0.0

    # 2. Constant labels (SS_tot = 0): min == max means every label is equal.
    bounds = df.select(
        F.min(label_col).alias("min_label"),
        F.max(label_col).alias("max_label"),
    ).first()

    if bounds["min_label"] is None:
        # min() is None only when there is no non-null label. Treat this
        # like the empty case; otherwise None == None below would report
        # a perfect 1.0.
        return 0.0

    if bounds["min_label"] == bounds["max_label"]:
        # Perfect only if no row has a prediction that differs from its label.
        # limit(1): only the existence of a mismatch matters, not the count.
        has_mismatch = (
            df.filter(F.col(label_col) != F.col(prediction_col)).limit(1).count() > 0
        )
        return 0.0 if has_mismatch else 1.0

    # 3. Configure the evaluator.
    evaluator = RegressionEvaluator(
        labelCol=label_col, predictionCol=prediction_col, metricName="r2"
    )

    # 4. Evaluate. Only the Spark call is guarded.
    try:
        result = evaluator.evaluate(df)
    except Exception:
        # The 0.0 fallback is kept for backward compatibility, but the
        # failure is logged so it is not mistaken for a real score.
        logging.getLogger(__name__).warning(
            "r2: RegressionEvaluator failed (label_col=%r, prediction_col=%r); returning 0.0",
            label_col,
            prediction_col,
            exc_info=True,
        )
        return 0.0

    # Normalise the evaluator's degenerate outputs to 0.0.
    if math.isnan(result) or result == -math.inf:
        return 0.0

    return float(result)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def mape(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
    """Compute Mean Absolute Percentage Error.

    MAPE = (100/n) * sum(|y_true - y_pred| / |y_true|)

    Note: Rows where label_col is zero are excluded from the calculation
    to avoid division by zero.

    Args:
        df: PySpark DataFrame containing true and predicted values.
        label_col: Name of the column containing true labels. Default is 'label'.
        prediction_col: Name of the column containing predictions. Default is 'predict_label'.

    Returns:
        Mean Absolute Percentage Error as a float (in percentage, e.g., 10.5 for 10.5%).
        Returns 0.0 if all true values are zero.

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, prediction_col)

    # Drop rows whose true label is zero to avoid division by zero.
    nonzero_rows = df.filter(F.col(label_col) != 0)

    absolute_pct_error = F.abs(F.col(label_col) - F.col(prediction_col)) / F.abs(
        F.col(label_col)
    )

    # No separate count() is needed: when no rows survive the filter, the
    # average is None and the check below returns 0.0.
    row = nonzero_rows.select(F.avg(absolute_pct_error).alias("mape")).first()
    mape_value = row["mape"]

    if mape_value is None:
        return 0.0

    # Return as percentage.
    return float(mape_value * 100)
|