orca-sdk 0.1.3__tar.gz → 0.1.5__tar.gz
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/PKG-INFO +1 -1
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_shared/metrics.py +179 -40
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_shared/metrics_test.py +99 -6
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/data_parsing_test.py +1 -1
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/async_client.py +462 -301
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/classification_model.py +156 -41
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/classification_model_test.py +327 -8
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/client.py +462 -301
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/conftest.py +140 -21
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/datasource.py +45 -2
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/datasource_test.py +120 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/embedding_model.py +32 -24
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/job.py +17 -17
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/memoryset.py +459 -56
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/memoryset_test.py +435 -2
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/regression_model.py +110 -19
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/regression_model_test.py +213 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/telemetry.py +52 -13
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/pyproject.toml +1 -1
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/README.md +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/__init__.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_shared/__init__.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/__init__.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/analysis_ui.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/analysis_ui_style.css +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/auth.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/auth_test.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/common.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/data_parsing.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/pagination.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/pagination_test.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/prediction_result_ui.css +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/prediction_result_ui.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/tqdm_file_reader.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/value_parser.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/_utils/value_parser_test.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/credentials.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/credentials_test.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/embedding_model_test.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/job_test.py +0 -0
- {orca_sdk-0.1.3 → orca_sdk-0.1.5}/orca_sdk/telemetry_test.py +0 -0
orca_sdk/_shared/metrics.py (+179 -40):

```diff
@@ -9,7 +9,7 @@ IMPORTANT:
 """
 
 from dataclasses import dataclass
-from typing import Any, Literal, TypedDict, cast
+from typing import Any, Literal, Sequence, TypedDict, cast
 
 import numpy as np
 import sklearn.metrics
@@ -39,6 +39,66 @@ def transform_eval_pred(eval_pred: Any) -> tuple[NDArray, NDArray[np.float32]]:
     return (references, logits)
 
 
+def convert_to_float32_array(
+    data: (
+        Sequence[float | None]
+        | NDArray[np.float32]
+        | Sequence[Sequence[float]]
+        | Sequence[NDArray[np.float32]]
+        | NDArray[np.float32]
+    ),
+) -> NDArray[np.float32]:
+    """
+    Convert a list or array that may contain None values to a float32 numpy array.
+    None values are converted to NaN.
+
+    Args:
+        data: Input data that may contain None values
+
+    Returns:
+        A float32 numpy array with None values converted to NaN
+    """
+    array = np.array(data)
+    # Convert None values to NaN to handle missing values
+    if array.dtype == object:
+
+        def convert_value(x):
+            return np.nan if x is None else float(x)
+
+        array = np.vectorize(convert_value, otypes=[np.float32])(array)
+    else:
+        array = np.asarray(array, dtype=np.float32)
+    return cast(NDArray[np.float32], array)
+
+
+def calculate_anomaly_score_stats(
+    anomaly_scores: NDArray[np.float32] | Sequence[float] | None,
+) -> tuple[float | None, float | None, float | None]:
+    """
+    Calculate statistics (mean, median, variance) for anomaly scores.
+
+    Args:
+        anomaly_scores: Anomaly scores as a list, numpy array, or None
+
+    Returns:
+        A tuple of (mean, median, variance). All values are None if anomaly_scores is None.
+    """
+    if anomaly_scores is None:
+        return (None, None, None)
+
+    # Convert to numpy array if needed
+    if isinstance(anomaly_scores, list):
+        anomalies = np.array(anomaly_scores, dtype=np.float32)
+    else:
+        anomalies = anomaly_scores
+
+    return (
+        float(np.mean(anomalies)),
+        float(np.median(anomalies)),
+        float(np.var(anomalies)),
+    )
+
+
 class PRCurve(TypedDict):
     thresholds: list[float]
     precisions: list[float]
```
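These two helpers carry most of the None-handling added in 0.1.5: `convert_to_float32_array` turns object-dtype inputs (lists containing `None`) into float32 arrays with NaN placeholders, and `calculate_anomaly_score_stats` centralizes the mean/median/variance computation that was previously inlined. A minimal standalone sketch of the behavior, using the same numpy calls as the diff (example values are illustrative, not from the package):

```python
import numpy as np

# A list with a missing prediction parses to an object-dtype array
scores = [0.2, None, 0.9]
assert np.array(scores).dtype == object

# The object-dtype branch of convert_to_float32_array: None -> NaN, then float32
converted = np.vectorize(
    lambda x: np.nan if x is None else float(x), otypes=[np.float32]
)(np.array(scores))
print(converted)  # [0.2 nan 0.9], dtype float32

# calculate_anomaly_score_stats reduces the scores to (mean, median, variance)
anomalies = np.array([0.1, 0.3, 0.2], dtype=np.float32)
print(float(np.mean(anomalies)), float(np.median(anomalies)), float(np.var(anomalies)))
# approximately: 0.2 0.2 0.00667
```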
orca_sdk/_shared/metrics.py (continued; several removed lines were truncated by the diff viewer and are kept as shown):

```diff
@@ -196,37 +256,93 @@ class ClassificationMetrics:
     )
 
 
-def
-
-    logits
-
-
-
-
-
-
+def convert_logits_to_probabilities(logits: NDArray[np.float32]) -> NDArray[np.float32]:
+    """
+    Convert logits to probability distributions.
+
+    This function handles multiple input formats:
+    - 1D arrays: Binary classification probabilities (must be between 0 and 1)
+    - 2D arrays: Multi-class logits or probabilities
+
+    For 2D inputs, the function automatically detects the format:
+    - If any values are <= 0: applies softmax (raw logits)
+    - If rows don't sum to 1: normalizes to probabilities
+    - If rows sum to 1: treats as already normalized probabilities
 
-
+    Args:
+        logits: Input logits or probabilities as a float32 numpy array.
+            Can be 1D (binary) or 2D (multi-class). May contain NaN values.
+
+    Returns:
+        A 2D float32 numpy array of probabilities with shape (n_samples, n_classes).
+        Each row sums to 1.0 (except for rows with all NaN values).
+
+    Raises:
+        ValueError: If logits are not 1D or 2D
+        ValueError: If 1D logits are not between 0 and 1 (for binary classification)
+        ValueError: If 2D logits have fewer than 2 classes (use regression metrics instead)
+    """
     if logits.ndim == 1:
-
+        # Binary classification: 1D probabilities
+        # Check non-NaN values only
+        valid_logits = logits[~np.isnan(logits)]
+        if len(valid_logits) > 0 and ((valid_logits > 1).any() or (valid_logits < 0).any()):
             raise ValueError("Logits must be between 0 and 1 for binary classification")
-        #
-
-        probabilities = logits  # no need to convert to probabilities
+        # Convert 1D probabilities to 2D format: [1-p, p]
+        probabilities = cast(NDArray[np.float32], np.column_stack([1 - logits, logits]))
     elif logits.ndim == 2:
         if logits.shape[1] < 2:
             raise ValueError("Use a different metric function for regression tasks")
-        if
-
-
+        # Check if any non-NaN values are <= 0 (NaN-aware comparison)
+        valid_logits = logits[~np.isnan(logits)]
+        if len(valid_logits) > 0 and not (valid_logits > 0).all():
+            # Contains negative values or zeros: apply softmax (raw logits)
+            probabilities = cast(NDArray[np.float32], softmax(logits))
         elif not np.allclose(logits.sum(-1, keepdims=True), 1.0):
-            #
-            probabilities = logits / logits.sum(-1, keepdims=True)
+            # Rows don't sum to 1: normalize to probabilities
+            probabilities = cast(NDArray[np.float32], logits / logits.sum(-1, keepdims=True))
         else:
+            # Already normalized probabilities
             probabilities = logits
     else:
         raise ValueError("Logits must be 1 or 2 dimensional")
 
+    return probabilities
+
+
+def calculate_classification_metrics(
+    expected_labels: list[int] | NDArray[np.int64],
+    logits: list[list[float]] | list[NDArray[np.float32]] | NDArray[np.float32],
+    anomaly_scores: list[float] | None = None,
+    average: Literal["micro", "macro", "weighted", "binary"] | None = None,
+    multi_class: Literal["ovr", "ovo"] = "ovr",
+    include_curves: bool = False,
+) -> ClassificationMetrics:
+    references = np.array(expected_labels)
+
+    # Convert to numpy array, handling None values
+    logits = convert_to_float32_array(logits)
+
+    # Check if all logits are NaN (all predictions are None/NaN)
+    if np.all(np.isnan(logits)):
+        # Return placeholder metrics when all logits are invalid
+        return ClassificationMetrics(
+            coverage=0.0,
+            f1_score=0.0,
+            accuracy=0.0,
+            loss=None,
+            anomaly_score_mean=None,
+            anomaly_score_median=None,
+            anomaly_score_variance=None,
+            roc_auc=None,
+            pr_auc=None,
+            pr_curve=None,
+            roc_curve=None,
+        )
+
+    # Convert logits to probabilities
+    probabilities = convert_logits_to_probabilities(logits)
+
     predictions = np.argmax(probabilities, axis=-1)
     predictions[np.isnan(probabilities).all(axis=-1)] = -1  # set predictions to -1 for all nan logits
 
```
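The format detection in `convert_logits_to_probabilities` is easiest to see branch by branch. A standalone sketch of the three 2D cases and the 1D stacking; it assumes the `softmax` used in the diff is `scipy.special.softmax`, whose import isn't visible in these hunks:

```python
import numpy as np
from scipy.special import softmax  # assumption: the module's actual softmax import isn't shown

raw_logits = np.array([[-1.2, 3.9], [1.2, -5.8]], dtype=np.float32)
unnormalized = np.array([[1.0, 3.0], [2.0, 2.0]], dtype=np.float32)
normalized = np.array([[0.25, 0.75], [0.5, 0.5]], dtype=np.float32)

# Branch 1: values <= 0 present -> treated as raw logits, softmax applied
print(softmax(raw_logits, axis=-1))
# Branch 2: all positive but rows don't sum to 1 -> divided by row sums
print(unnormalized / unnormalized.sum(-1, keepdims=True))  # [[0.25 0.75] [0.5 0.5]]
# Branch 3: rows already sum to 1 -> returned unchanged
assert np.allclose(normalized.sum(-1), 1.0)

# 1D binary probabilities p are stacked into two columns [1 - p, p]
p = np.array([0.9, 0.2], dtype=np.float32)
print(np.column_stack([1 - p, p]))  # [[0.1 0.9] [0.8 0.2]] (up to float32 rounding)
```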
orca_sdk/_shared/metrics.py (continued):

```diff
@@ -238,10 +354,6 @@ def calculate_classification_metrics(
     if average is None:
         average = "binary" if num_classes_references == 2 and num_none_predictions == 0 else "weighted"
 
-    anomaly_score_mean = float(np.mean(anomaly_scores)) if anomaly_scores else None
-    anomaly_score_median = float(np.median(anomaly_scores)) if anomaly_scores else None
-    anomaly_score_variance = float(np.var(anomaly_scores)) if anomaly_scores else None
-
     accuracy = sklearn.metrics.accuracy_score(references, predictions)
     f1 = sklearn.metrics.f1_score(references, predictions, average=average)
     # Ensure sklearn sees the full class set corresponding to probability columns
@@ -259,10 +371,12 @@ def calculate_classification_metrics(
     if num_classes_references == num_classes_predictions and num_none_predictions == 0:
         # special case for binary classification: https://github.com/scikit-learn/scikit-learn/issues/20186
         if num_classes_references == 2:
-
-
-
-
+            # Use probabilities[:, 1] which is guaranteed to be 2D
+            probabilities_positive = probabilities[:, 1]
+            roc_auc = sklearn.metrics.roc_auc_score(references, probabilities_positive)
+            roc_curve = calculate_roc_curve(references, probabilities_positive) if include_curves else None
+            pr_auc = sklearn.metrics.average_precision_score(references, probabilities_positive)
+            pr_curve = calculate_pr_curve(references, probabilities_positive) if include_curves else None
         else:
             roc_auc = sklearn.metrics.roc_auc_score(references, probabilities, multi_class=multi_class)
             roc_curve = None
@@ -274,6 +388,9 @@ def calculate_classification_metrics(
         pr_curve = None
         roc_curve = None
 
+    # Calculate anomaly score statistics
+    anomaly_score_mean, anomaly_score_median, anomaly_score_variance = calculate_anomaly_score_stats(anomaly_scores)
+
     return ClassificationMetrics(
         coverage=coverage,
         accuracy=float(accuracy),
```
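The relocated statistics also swap a truthiness test (`... if anomaly_scores else None`) for the helper's explicit `is None` check. A standalone illustration of why that distinction can matter (not package code): truthiness is undefined for multi-element numpy arrays, which the helper's broadened signature now accepts.

```python
import numpy as np

scores = np.array([0.1, 0.2], dtype=np.float32)

try:
    # What the removed `... if anomaly_scores else None` pattern would
    # evaluate if handed a numpy array instead of a list:
    bool(scores)
except ValueError as err:
    print(err)  # multi-element array truthiness is ambiguous in numpy

# The helper's `anomaly_scores is None` check is well defined for
# lists, arrays, and None alike.
print(scores is None)  # False
```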
orca_sdk/_shared/metrics.py (continued; the removed signature lines were truncated by the diff viewer and are kept as shown):

```diff
@@ -337,9 +454,9 @@ class RegressionMetrics:
 
 
 def calculate_regression_metrics(
-    expected_scores: NDArray[np.float32] |
-    predicted_scores: NDArray[np.float32] |
-    anomaly_scores:
+    expected_scores: NDArray[np.float32] | Sequence[float],
+    predicted_scores: NDArray[np.float32] | Sequence[float | None],
+    anomaly_scores: NDArray[np.float32] | Sequence[float] | None = None,
 ) -> RegressionMetrics:
     """
     Calculate regression metrics for model evaluation.
@@ -354,23 +471,42 @@ def calculate_regression_metrics(
 
     Raises:
         ValueError: If predictions and references have different lengths
+        ValueError: If expected_scores contains None or NaN values
     """
-
-
+    # Convert to numpy arrays, handling None values
+    references = convert_to_float32_array(expected_scores)
+    predictions = convert_to_float32_array(predicted_scores)
 
     if len(predictions) != len(references):
         raise ValueError("Predictions and references must have the same length")
 
-
-
-
+    # Validate that all expected_scores are non-None and non-NaN
+    if np.any(np.isnan(references)):
+        raise ValueError("expected_scores must not contain None or NaN values")
+
+    # If all of the predictions are None or NaN, return None for all metrics
+    if np.all(np.isnan(predictions)):
+        anomaly_score_mean, anomaly_score_median, anomaly_score_variance = calculate_anomaly_score_stats(anomaly_scores)
+        return RegressionMetrics(
+            coverage=0.0,
+            mse=0.0,
+            rmse=0.0,
+            mae=0.0,
+            r2=0.0,
+            explained_variance=0.0,
+            loss=0.0,
+            anomaly_score_mean=anomaly_score_mean,
+            anomaly_score_median=anomaly_score_median,
+            anomaly_score_variance=anomaly_score_variance,
+        )
 
-
-
+    # Filter out NaN values from predictions (expected_scores are already validated to be non-NaN)
+    valid_mask = ~np.isnan(predictions)
+    num_none_predictions = (~valid_mask).sum()
     coverage = 1 - num_none_predictions / len(predictions)
     if num_none_predictions > 0:
-        references = references[
-        predictions = predictions[
+        references = references[valid_mask]
+        predictions = predictions[valid_mask]
 
     # Calculate core regression metrics
     mse = float(sklearn.metrics.mean_squared_error(references, predictions))
@@ -379,6 +515,9 @@ def calculate_regression_metrics(
     r2 = float(sklearn.metrics.r2_score(references, predictions))
     explained_var = float(sklearn.metrics.explained_variance_score(references, predictions))
 
+    # Calculate anomaly score statistics
+    anomaly_score_mean, anomaly_score_median, anomaly_score_variance = calculate_anomaly_score_stats(anomaly_scores)
+
     return RegressionMetrics(
         coverage=coverage,
         mse=mse,
```
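The coverage arithmetic introduced above can be checked by hand; a standalone sketch using the same values as `test_regression_metrics_handles_none_values` in the test diff below:

```python
import numpy as np

references = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
predictions = np.array([1.1, 1.9, np.nan, 3.8, np.nan], dtype=np.float32)

# Coverage = fraction of predictions that are usable
valid_mask = ~np.isnan(predictions)
coverage = 1 - (~valid_mask).sum() / len(predictions)
print(coverage)  # 0.6

# Remaining metrics use only the valid (reference, prediction) pairs
mse = float(np.mean((references[valid_mask] - predictions[valid_mask]) ** 2))
print(round(mse, 4))  # ~0.02
```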
orca_sdk/_shared/metrics_test.py (+99 -6):

```diff
@@ -80,24 +80,36 @@ def test_multiclass_metrics_with_3_classes(
 def test_does_not_modify_logits_unless_necessary():
     logits = np.array([[0.1, 0.9], [0.2, 0.8], [0.7, 0.3], [0.8, 0.2]])
     expected_labels = [0, 1, 0, 1]
-
-
+    loss = calculate_classification_metrics(expected_labels, logits).loss
+    assert loss is not None
+    assert np.allclose(
+        loss,
+        sklearn.metrics.log_loss(expected_labels, logits),
+        atol=1e-6,
     )
 
 
 def test_normalizes_logits_if_necessary():
     logits = np.array([[1.2, 3.9], [1.2, 5.8], [1.2, 2.7], [1.2, 1.3]])
     expected_labels = [0, 1, 0, 1]
-
-
+    loss = calculate_classification_metrics(expected_labels, logits).loss
+    assert loss is not None
+    assert np.allclose(
+        loss,
+        sklearn.metrics.log_loss(expected_labels, logits / logits.sum(axis=1, keepdims=True)),
+        atol=1e-6,
     )
 
 
 def test_softmaxes_logits_if_necessary():
     logits = np.array([[-1.2, 3.9], [1.2, -5.8], [1.2, 2.7], [1.2, 1.3]])
     expected_labels = [0, 1, 0, 1]
-
-
+    loss = calculate_classification_metrics(expected_labels, logits).loss
+    assert loss is not None
+    assert np.allclose(
+        loss,
+        sklearn.metrics.log_loss(expected_labels, softmax(logits)),
+        atol=1e-6,
     )
 
 
@@ -271,3 +283,84 @@ def test_regression_metrics_handles_nans():
     assert metrics.mae > 0.0
     assert 0.0 <= metrics.r2 <= 1.0
     assert 0.0 <= metrics.explained_variance <= 1.0
+
+
+def test_regression_metrics_handles_none_values():
+    # Test with lists containing None values
+    y_true = [1.0, 2.0, 3.0, 4.0, 5.0]
+    y_pred = [1.1, 1.9, None, 3.8, np.nan]
+
+    metrics = calculate_regression_metrics(y_true, y_pred)
+
+    # Coverage should be 0.6 (3 out of 5 predictions are valid)
+    # Positions with None/NaN predictions (indices 2 and 4) are filtered out
+    assert np.allclose(metrics.coverage, 0.6)
+
+    # Metrics should be calculated only on valid pairs (indices 0, 1, 3)
+    # Valid pairs: (1.0, 1.1), (2.0, 1.9), and (4.0, 3.8)
+    expected_mse = np.mean([(1.0 - 1.1) ** 2, (2.0 - 1.9) ** 2, (4.0 - 3.8) ** 2])
+    expected_mae = np.mean([abs(1.0 - 1.1), abs(2.0 - 1.9), abs(4.0 - 3.8)])
+
+    assert metrics.mse == pytest.approx(expected_mse)
+    assert metrics.mae == pytest.approx(expected_mae)
+    assert metrics.rmse == pytest.approx(np.sqrt(expected_mse))
+    assert 0.0 <= metrics.r2 <= 1.0
+    assert 0.0 <= metrics.explained_variance <= 1.0
+
+
+def test_regression_metrics_rejects_none_expected_scores():
+    # Test that None values in expected_scores are rejected
+    y_true = [1.0, 2.0, None, 4.0, 5.0]
+    y_pred = [1.1, 1.9, 3.2, 3.8, 5.1]
+
+    with pytest.raises(ValueError, match="expected_scores must not contain None or NaN values"):
+        calculate_regression_metrics(y_true, y_pred)
+
+
+def test_regression_metrics_rejects_nan_expected_scores():
+    # Test that NaN values in expected_scores are rejected
+    y_true = np.array([1.0, 2.0, np.nan, 4.0, 5.0], dtype=np.float32)
+    y_pred = np.array([1.1, 1.9, 3.2, 3.8, 5.1], dtype=np.float32)
+
+    with pytest.raises(ValueError, match="expected_scores must not contain None or NaN values"):
+        calculate_regression_metrics(y_true, y_pred)
+
+
+def test_regression_metrics_all_predictions_none():
+    # Test with all predictions being None
+    y_true = [1.0, 2.0, 3.0, 4.0, 5.0]
+    y_pred = [None, None, None, None, None]
+
+    metrics = calculate_regression_metrics(y_true, y_pred)
+
+    # When all predictions are None, coverage should be 0.0 and all metrics should be 0.0
+    assert metrics.coverage == 0.0
+    assert metrics.mse == 0.0
+    assert metrics.rmse == 0.0
+    assert metrics.mae == 0.0
+    assert metrics.r2 == 0.0
+    assert metrics.explained_variance == 0.0
+    assert metrics.loss == 0.0
+    assert metrics.anomaly_score_mean is None
+    assert metrics.anomaly_score_median is None
+    assert metrics.anomaly_score_variance is None
+
+
+def test_regression_metrics_all_predictions_nan():
+    # Test with all predictions being NaN
+    y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+    y_pred = np.array([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float32)
+
+    metrics = calculate_regression_metrics(y_true, y_pred)
+
+    # When all predictions are NaN, coverage should be 0.0 and all metrics should be 0.0
+    assert metrics.coverage == 0.0
+    assert metrics.mse == 0.0
+    assert metrics.rmse == 0.0
+    assert metrics.mae == 0.0
+    assert metrics.r2 == 0.0
+    assert metrics.explained_variance == 0.0
+    assert metrics.loss == 0.0
+    assert metrics.anomaly_score_mean is None
+    assert metrics.anomaly_score_median is None
+    assert metrics.anomaly_score_variance is None
```
orca_sdk/_utils/data_parsing_test.py (+1 -1):

```diff
@@ -33,7 +33,7 @@ def test_hf_dataset_from_torch_dict():
     # Then the HF dataset should be created successfully
     assert isinstance(hf_dataset, Dataset)
     assert len(hf_dataset) == len(dataset)
-    assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id"}
+    assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id", "partition_id"}
 
 
 class PytorchTupleDataset(TorchDataset):
```
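For context on this one-line change: `datasets.Dataset.column_names` is what the assertion inspects, and the expanded expected set reflects a new `partition_id` column on parsed rows (consistent with the large memoryset.py changes in this release). A minimal illustration with hypothetical data, not taken from the package:

```python
from datasets import Dataset

# Hypothetical rows: datasets infers a (null-typed) column even when
# every value is None, so the column name still shows up.
ds = Dataset.from_dict({"value": ["a"], "label": [0], "partition_id": [None]})
print(set(ds.column_names))  # {'value', 'label', 'partition_id'}
```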