smallaxe 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,241 @@
1
+ """Classification metrics for evaluating model predictions."""
2
+
3
+ from typing import List
4
+
5
+ from pyspark.ml.evaluation import BinaryClassificationEvaluator
6
+ from pyspark.sql import DataFrame
7
+ from pyspark.sql import functions as F
8
+
9
+ from smallaxe.exceptions import ColumnNotFoundError
10
+
11
+
12
def _validate_columns(df: DataFrame, *cols: str) -> None:
    """Ensure every requested column is present in the DataFrame.

    Args:
        df: PySpark DataFrame whose column list is checked.
        *cols: Names of the columns that must exist in ``df``.

    Raises:
        ColumnNotFoundError: For the first name in ``cols`` (in argument
            order) that is absent from ``df.columns``.
    """
    present: List[str] = df.columns
    absent = [name for name in cols if name not in present]
    if absent:
        # Report only the first missing column, in the order it was requested.
        raise ColumnNotFoundError(column=absent[0], available_columns=present)
26
+
27
+
28
def accuracy(
    df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label"
) -> float:
    """Return the fraction of rows whose prediction equals the true label.

    Accuracy = correct / total

    Args:
        df: PySpark DataFrame holding true and predicted labels.
        label_col: Column with the true labels. Default is 'label'.
        prediction_col: Column with the predictions. Default is 'predict_label'.

    Returns:
        Accuracy as a float between 0 and 1. Returns 0.0 for an empty DataFrame.

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, prediction_col)

    n_rows = df.count()
    if not n_rows:
        # Empty input: report 0.0 instead of dividing by zero.
        return 0.0

    is_correct = F.col(label_col) == F.col(prediction_col)
    n_correct = df.where(is_correct).count()
    return float(n_correct / n_rows)
54
+
55
+
56
def precision(
    df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label"
) -> float:
    """Compute binary-classification precision.

    Precision = TP / (TP + FP)

    Args:
        df: PySpark DataFrame holding true and predicted labels.
        label_col: Column with the true labels (0 or 1). Default is 'label'.
        prediction_col: Column with the predictions (0 or 1). Default is 'predict_label'.

    Returns:
        Precision as a float between 0 and 1.
        Returns 0.0 when no row is predicted positive.

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, prediction_col)

    predicted_pos = F.col(prediction_col) == 1
    actual_pos = F.col(label_col) == 1

    # Numerator: rows that are both predicted positive and truly positive.
    tp = df.where(predicted_pos & actual_pos).count()
    # Denominator: every row predicted positive (TP + FP).
    n_predicted_pos = df.where(predicted_pos).count()

    return float(tp / n_predicted_pos) if n_predicted_pos else 0.0
87
+
88
+
89
def recall(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
    """Compute binary-classification recall (sensitivity, true positive rate).

    Recall = TP / (TP + FN)

    Args:
        df: PySpark DataFrame holding true and predicted labels.
        label_col: Column with the true labels (0 or 1). Default is 'label'.
        prediction_col: Column with the predictions (0 or 1). Default is 'predict_label'.

    Returns:
        Recall as a float between 0 and 1.
        Returns 0.0 when no row is actually positive.

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, prediction_col)

    predicted_pos = F.col(prediction_col) == 1
    actual_pos = F.col(label_col) == 1

    # Numerator: rows that are both predicted positive and truly positive.
    tp = df.where(predicted_pos & actual_pos).count()
    # Denominator: every truly positive row (TP + FN).
    n_actual_pos = df.where(actual_pos).count()

    return float(tp / n_actual_pos) if n_actual_pos else 0.0
118
+
119
+
120
def f1_score(
    df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label"
) -> float:
    """Compute the binary-classification F1 score.

    F1 = 2 * (precision * recall) / (precision + recall)

    Args:
        df: PySpark DataFrame holding true and predicted labels.
        label_col: Column with the true labels (0 or 1). Default is 'label'.
        prediction_col: Column with the predictions (0 or 1). Default is 'predict_label'.

    Returns:
        F1 score as a float between 0 and 1.
        Returns 0.0 when precision + recall is zero.

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, prediction_col)

    p = precision(df, label_col, prediction_col)
    r = recall(df, label_col, prediction_col)

    denominator = p + r
    if denominator == 0:
        # Harmonic mean is undefined here; report the worst score.
        return 0.0

    return float(2 * (p * r) / denominator)
148
+
149
+
150
def auc_roc(
    df: DataFrame, label_col: str = "label", probability_col: str = "probability"
) -> float:
    """Compute Area Under the ROC Curve (AUC-ROC).

    The metric itself is delegated to Spark's BinaryClassificationEvaluator
    with ``metricName="areaUnderROC"``; ``probability_col`` is passed to it as
    the raw prediction column.

    Args:
        df: PySpark DataFrame containing true labels and scores.
        label_col: Name of the column containing true labels. Default is 'label'.
        probability_col: Name of the column containing the scores. Default is 'probability'.

    Returns:
        AUC-ROC as a float.
        Returns 0.0 if the DataFrame is empty or if label_col holds fewer
        than two distinct values.

    Raises:
        ColumnNotFoundError: If label_col or probability_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, probability_col)

    # Empty input: return 0.0 explicitly instead of relying on whatever the
    # evaluator would produce for an empty DataFrame.
    if df.limit(1).count() == 0:
        return 0.0

    # Single-class data: a ROC curve needs both classes, so return 0.0.
    # Only "fewer than two distinct labels" matters, so cap the distinct set
    # at two rows and count it rather than collecting every label value.
    if df.select(label_col).distinct().limit(2).count() < 2:
        return 0.0

    evaluator = BinaryClassificationEvaluator(
        labelCol=label_col, rawPredictionCol=probability_col, metricName="areaUnderROC"
    )

    return float(evaluator.evaluate(df))
171
+
172
+
173
def auc_pr(df: DataFrame, label_col: str = "label", probability_col: str = "probability") -> float:
    """Compute Area Under the Precision-Recall Curve (AUC-PR).

    Args:
        df: PySpark DataFrame containing true labels and scores.
        label_col: Name of the column containing true labels. Default is 'label'.
        probability_col: Name of the column containing the scores. Default is 'probability'.

    Returns:
        AUC-PR as a float.
        Returns 0.0 if the DataFrame is empty or has no row whose label equals 1.

    Raises:
        ColumnNotFoundError: If label_col or probability_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, probability_col)

    # Empty input short-circuits to 0.0.
    if not df.limit(1).count():
        return 0.0

    # Without any positive label recall is undefined, so report 0.0.
    positive_rows = df.where(F.col(label_col) == 1)
    if not positive_rows.limit(1).count() > 0:
        return 0.0

    pr_evaluator = BinaryClassificationEvaluator(
        labelCol=label_col,
        rawPredictionCol=probability_col,
        metricName="areaUnderPR",
    )
    area = pr_evaluator.evaluate(df)
    return float(area)
194
+
195
+
196
def log_loss(
    df: DataFrame,
    label_col: str = "label",
    probability_col: str = "probability",
    eps: float = 1e-15,
) -> float:
    """Compute logarithmic loss (cross-entropy loss).

    Log Loss = -(1/n) * sum(y * log(p) + (1-y) * log(1-p))

    Args:
        df: PySpark DataFrame containing true labels and probability scores.
        label_col: Column with the true labels (0 or 1). Default is 'label'.
        probability_col: Column with the probability scores. Default is 'probability'.
        eps: Probabilities are clipped to [eps, 1 - eps] so that log(0) is
            never evaluated. Default is 1e-15.

    Returns:
        Log loss as a float. Lower values indicate better predictions.
        Returns 0.0 when the average is null (e.g. an empty DataFrame).

    Raises:
        ColumnNotFoundError: If label_col or probability_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, probability_col)

    y = F.col(label_col)
    p = F.col(probability_col)
    upper = 1 - eps

    # Clamp the probability into [eps, 1 - eps].
    p_clipped = F.when(p < eps, eps).when(p > upper, upper).otherwise(p)

    # Per-row cross-entropy: -[y * log(p) + (1 - y) * log(1 - p)]
    per_row_loss = -(y * F.log(p_clipped) + (1 - y) * F.log(1 - p_clipped))

    row = df.select(F.avg(per_row_loss).alias("log_loss")).first()

    if row["log_loss"] is None:
        return 0.0
    return float(row["log_loss"])
@@ -0,0 +1,301 @@
1
+ """Regression metrics for evaluating model predictions."""
2
+
3
+ from typing import List
4
+
5
+ from pyspark.ml.evaluation import RegressionEvaluator
6
+ from pyspark.sql import DataFrame
7
+ from pyspark.sql import functions as F
8
+
9
+ from smallaxe.exceptions import ColumnNotFoundError
10
+
11
+
12
def _validate_columns(df: DataFrame, label_col: str, prediction_col: str) -> None:
    """Ensure the label and prediction columns are present in the DataFrame.

    Args:
        df: PySpark DataFrame whose column list is checked.
        label_col: Name of the column containing true labels.
        prediction_col: Name of the column containing predictions.

    Raises:
        ColumnNotFoundError: If either column is missing; label_col is
            checked first.
    """
    present: List[str] = df.columns
    if label_col not in present:
        raise ColumnNotFoundError(column=label_col, available_columns=present)
    if prediction_col not in present:
        raise ColumnNotFoundError(column=prediction_col, available_columns=present)
27
+
28
+
29
+ # def mse(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
30
+ # """Compute Mean Squared Error.
31
+
32
+ # MSE = (1/n) * sum((y_true - y_pred)^2)
33
+
34
+ # Args:
35
+ # df: PySpark DataFrame containing true and predicted values.
36
+ # label_col: Name of the column containing true labels. Default is 'label'.
37
+ # prediction_col: Name of the column containing predictions. Default is 'predict_label'.
38
+
39
+ # Returns:
40
+ # Mean Squared Error as a float.
41
+
42
+ # Raises:
43
+ # ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
44
+ # """
45
+ # _validate_columns(df, label_col, prediction_col)
46
+
47
+ # result = df.select(
48
+ # F.avg(F.pow(F.col(label_col) - F.col(prediction_col), 2)).alias("mse")
49
+ # ).first()
50
+
51
+ # return float(result["mse"]) if result["mse"] is not None else 0.0
52
+
53
+
54
def mse(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
    """Compute Mean Squared Error using Spark's RegressionEvaluator.

    MSE = (1/n) * sum((y_true - y_pred)^2)

    Args:
        df: PySpark DataFrame containing true and predicted values.
        label_col: Name of the column containing true labels. Default is 'label'.
        prediction_col: Name of the column containing predictions. Default is 'predict_label'.

    Returns:
        Mean Squared Error as a float.
        Returns 0.0 if the DataFrame is empty, or if the evaluator raises
        (in which case a warning with the traceback is logged).

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    import logging

    _validate_columns(df, label_col, prediction_col)

    # Empty input short-circuits to 0.0.
    if df.limit(1).count() == 0:
        return 0.0

    evaluator = RegressionEvaluator(
        labelCol=label_col, predictionCol=prediction_col, metricName="mse"
    )

    try:
        return float(evaluator.evaluate(df))
    except Exception:
        # The 0.0 fallback is kept for backward compatibility, but 0.0 is also
        # the score of a perfect fit, so the failure must at least be logged.
        logging.getLogger(__name__).warning(
            "mse evaluation failed; returning 0.0", exc_info=True
        )
        return 0.0
74
+
75
+
76
+ # def rmse(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
77
+ # """Compute Root Mean Squared Error.
78
+
79
+ # RMSE = sqrt(MSE) = sqrt((1/n) * sum((y_true - y_pred)^2))
80
+
81
+ # Args:
82
+ # df: PySpark DataFrame containing true and predicted values.
83
+ # label_col: Name of the column containing true labels. Default is 'label'.
84
+ # prediction_col: Name of the column containing predictions. Default is 'predict_label'.
85
+
86
+ # Returns:
87
+ # Root Mean Squared Error as a float.
88
+
89
+ # Raises:
90
+ # ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
91
+ # """
92
+ # _validate_columns(df, label_col, prediction_col)
93
+
94
+ # result = df.select(
95
+ # F.sqrt(F.avg(F.pow(F.col(label_col) - F.col(prediction_col), 2))).alias("rmse")
96
+ # ).first()
97
+
98
+ # return float(result["rmse"]) if result["rmse"] is not None else 0.0
99
+
100
+
101
def rmse(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
    """Compute Root Mean Squared Error using Spark's RegressionEvaluator.

    RMSE = sqrt(MSE) = sqrt((1/n) * sum((y_true - y_pred)^2))

    Args:
        df: PySpark DataFrame containing true and predicted values.
        label_col: Name of the column containing true labels. Default is 'label'.
        prediction_col: Name of the column containing predictions. Default is 'predict_label'.

    Returns:
        Root Mean Squared Error as a float.
        Returns 0.0 if the DataFrame is empty, or if the evaluator raises
        (in which case a warning with the traceback is logged).

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    import logging

    _validate_columns(df, label_col, prediction_col)

    # Empty input short-circuits to 0.0.
    if df.limit(1).count() == 0:
        return 0.0

    evaluator = RegressionEvaluator(
        labelCol=label_col, predictionCol=prediction_col, metricName="rmse"
    )

    try:
        return float(evaluator.evaluate(df))
    except Exception:
        # The 0.0 fallback is kept for backward compatibility, but 0.0 is also
        # the score of a perfect fit, so the failure must at least be logged.
        logging.getLogger(__name__).warning(
            "rmse evaluation failed; returning 0.0", exc_info=True
        )
        return 0.0
122
+
123
+
124
+ # def mae(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
125
+ # """Compute Mean Absolute Error.
126
+
127
+ # MAE = (1/n) * sum(|y_true - y_pred|)
128
+
129
+ # Args:
130
+ # df: PySpark DataFrame containing true and predicted values.
131
+ # label_col: Name of the column containing true labels. Default is 'label'.
132
+ # prediction_col: Name of the column containing predictions. Default is 'predict_label'.
133
+
134
+ # Returns:
135
+ # Mean Absolute Error as a float.
136
+
137
+ # Raises:
138
+ # ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
139
+ # """
140
+ # _validate_columns(df, label_col, prediction_col)
141
+
142
+ # result = df.select(F.avg(F.abs(F.col(label_col) - F.col(prediction_col))).alias("mae")).first()
143
+
144
+ # return float(result["mae"]) if result["mae"] is not None else 0.0
145
+
146
+
147
def mae(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
    """Compute Mean Absolute Error using Spark's RegressionEvaluator.

    MAE = (1/n) * sum(|y_true - y_pred|)

    Args:
        df: PySpark DataFrame containing true and predicted values.
        label_col: Name of the column containing true labels. Default is 'label'.
        prediction_col: Name of the column containing predictions. Default is 'predict_label'.

    Returns:
        Mean Absolute Error as a float.
        Returns 0.0 if the DataFrame is empty, or if the evaluator raises
        (in which case a warning with the traceback is logged).

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    import logging

    _validate_columns(df, label_col, prediction_col)

    # Empty input short-circuits to 0.0.
    if df.limit(1).count() == 0:
        return 0.0

    evaluator = RegressionEvaluator(
        labelCol=label_col, predictionCol=prediction_col, metricName="mae"
    )

    try:
        return float(evaluator.evaluate(df))
    except Exception:
        # The 0.0 fallback is kept for backward compatibility, but 0.0 is also
        # the score of a perfect fit, so the failure must at least be logged.
        logging.getLogger(__name__).warning(
            "mae evaluation failed; returning 0.0", exc_info=True
        )
        return 0.0
168
+
169
+
170
+ # def r2(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
171
+ # """Compute R-squared (Coefficient of Determination).
172
+
173
+ # R2 = 1 - (SS_res / SS_tot)
174
+ # where:
175
+ # SS_res = sum((y_true - y_pred)^2) (residual sum of squares)
176
+ # SS_tot = sum((y_true - y_mean)^2) (total sum of squares)
177
+
178
+ # Args:
179
+ # df: PySpark DataFrame containing true and predicted values.
180
+ # label_col: Name of the column containing true labels. Default is 'label'.
181
+ # prediction_col: Name of the column containing predictions. Default is 'predict_label'.
182
+
183
+ # Returns:
184
+ # R-squared score as a float. Can be negative for very poor predictions.
185
+
186
+ # Raises:
187
+ # ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
188
+ # """
189
+ # _validate_columns(df, label_col, prediction_col)
190
+
191
+ # # Compute mean of true labels
192
+ # mean_label = df.select(F.avg(F.col(label_col))).first()[0]
193
+
194
+ # if mean_label is None:
195
+ # return 0.0
196
+
197
+ # # Compute SS_res and SS_tot
198
+ # result = df.select(
199
+ # F.sum(F.pow(F.col(label_col) - F.col(prediction_col), 2)).alias("ss_res"),
200
+ # F.sum(F.pow(F.col(label_col) - F.lit(mean_label), 2)).alias("ss_tot"),
201
+ # ).first()
202
+
203
+ # ss_res = result["ss_res"]
204
+ # ss_tot = result["ss_tot"]
205
+
206
+ # if ss_res is None or ss_tot is None:
207
+ # return 0.0
208
+
209
+ # # Handle case where SS_tot is zero (all true values are the same)
210
+ # if ss_tot == 0:
211
+ # return 1.0 if ss_res == 0 else 0.0
212
+
213
+ # return float(1 - (ss_res / ss_tot))
214
+
215
+
216
def r2(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
    """Compute R-squared (Coefficient of Determination) using Spark's RegressionEvaluator.

    R2 = 1 - (SS_res / SS_tot)

    Args:
        df: PySpark DataFrame containing true and predicted values.
        label_col: Name of the column containing true labels. Default is 'label'.
        prediction_col: Name of the column containing predictions. Default is 'predict_label'.

    Returns:
        R-squared score as a float. Special cases:
        - 0.0 if the DataFrame is empty.
        - If the minimum and maximum label are equal (constant labels):
          1.0 when no row has label != prediction, otherwise 0.0.
        - 0.0 if the evaluator returns NaN or -inf, or raises (a warning with
          the traceback is logged in the latter case).

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    import logging
    import math

    _validate_columns(df, label_col, prediction_col)

    # Empty input short-circuits to 0.0.
    if df.limit(1).count() == 0:
        return 0.0

    # Constant labels mean SS_tot = 0, so handle that case by hand instead of
    # passing it to the evaluator. min == max is enough to detect it.
    bounds = df.select(
        F.min(label_col).alias("min_label"),
        F.max(label_col).alias("max_label"),
    ).first()

    if bounds["min_label"] == bounds["max_label"]:
        # Perfect only if no row's prediction differs from its label.
        mismatch_count = df.filter(F.col(label_col) != F.col(prediction_col)).count()
        return 1.0 if mismatch_count == 0 else 0.0

    evaluator = RegressionEvaluator(
        labelCol=label_col, predictionCol=prediction_col, metricName="r2"
    )

    try:
        result = evaluator.evaluate(df)
    except Exception:
        # The 0.0 fallback is kept for backward compatibility, but the failure
        # is logged so it is not mistaken for a genuine score.
        logging.getLogger(__name__).warning(
            "r2 evaluation failed; returning 0.0", exc_info=True
        )
        return 0.0

    # Guard against non-finite results from the evaluator.
    if math.isnan(result) or result == float("-inf"):
        return 0.0

    return float(result)
258
+
259
+
260
def mape(df: DataFrame, label_col: str = "label", prediction_col: str = "predict_label") -> float:
    """Compute Mean Absolute Percentage Error.

    MAPE = (100/n) * sum(|y_true - y_pred| / |y_true|)

    Note: Rows where label_col is zero are excluded from the calculation
    to avoid division by zero.

    Args:
        df: PySpark DataFrame containing true and predicted values.
        label_col: Name of the column containing true labels. Default is 'label'.
        prediction_col: Name of the column containing predictions. Default is 'predict_label'.

    Returns:
        Mean Absolute Percentage Error as a float (in percentage, e.g., 10.5 for 10.5%).
        Returns 0.0 if no row remains after excluding zero labels (including
        an empty DataFrame).

    Raises:
        ColumnNotFoundError: If label_col or prediction_col is not in the DataFrame.
    """
    _validate_columns(df, label_col, prediction_col)

    label = F.col(label_col)

    # Drop rows where the true label is zero to avoid division by zero.
    nonzero_rows = df.filter(label != 0)

    # The average over zero rows is null, which the None check below maps to
    # 0.0, so a separate count() job to detect the empty case is not needed.
    result = nonzero_rows.select(
        F.avg(F.abs(label - F.col(prediction_col)) / F.abs(label)).alias("mape")
    ).first()

    mape_value = result["mape"]
    if mape_value is None:
        return 0.0

    # Return as percentage.
    return float(mape_value * 100)
@@ -0,0 +1,5 @@
1
+ """Pipeline module for chaining preprocessing and models."""
2
+
3
+ from .pipeline import Pipeline
4
+
5
+ __all__ = ["Pipeline"]