orca_sdk-0.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. orca_sdk/__init__.py +30 -0
  2. orca_sdk/_shared/__init__.py +10 -0
  3. orca_sdk/_shared/metrics.py +634 -0
  4. orca_sdk/_shared/metrics_test.py +570 -0
  5. orca_sdk/_utils/__init__.py +0 -0
  6. orca_sdk/_utils/analysis_ui.py +196 -0
  7. orca_sdk/_utils/analysis_ui_style.css +51 -0
  8. orca_sdk/_utils/auth.py +65 -0
  9. orca_sdk/_utils/auth_test.py +31 -0
  10. orca_sdk/_utils/common.py +37 -0
  11. orca_sdk/_utils/data_parsing.py +129 -0
  12. orca_sdk/_utils/data_parsing_test.py +244 -0
  13. orca_sdk/_utils/pagination.py +126 -0
  14. orca_sdk/_utils/pagination_test.py +132 -0
  15. orca_sdk/_utils/prediction_result_ui.css +18 -0
  16. orca_sdk/_utils/prediction_result_ui.py +110 -0
  17. orca_sdk/_utils/tqdm_file_reader.py +12 -0
  18. orca_sdk/_utils/value_parser.py +45 -0
  19. orca_sdk/_utils/value_parser_test.py +39 -0
  20. orca_sdk/async_client.py +4104 -0
  21. orca_sdk/classification_model.py +1165 -0
  22. orca_sdk/classification_model_test.py +887 -0
  23. orca_sdk/client.py +4096 -0
  24. orca_sdk/conftest.py +382 -0
  25. orca_sdk/credentials.py +217 -0
  26. orca_sdk/credentials_test.py +121 -0
  27. orca_sdk/datasource.py +576 -0
  28. orca_sdk/datasource_test.py +463 -0
  29. orca_sdk/embedding_model.py +712 -0
  30. orca_sdk/embedding_model_test.py +206 -0
  31. orca_sdk/job.py +343 -0
  32. orca_sdk/job_test.py +108 -0
  33. orca_sdk/memoryset.py +3811 -0
  34. orca_sdk/memoryset_test.py +1150 -0
  35. orca_sdk/regression_model.py +841 -0
  36. orca_sdk/regression_model_test.py +595 -0
  37. orca_sdk/telemetry.py +742 -0
  38. orca_sdk/telemetry_test.py +119 -0
  39. orca_sdk-0.1.9.dist-info/METADATA +98 -0
  40. orca_sdk-0.1.9.dist-info/RECORD +41 -0
  41. orca_sdk-0.1.9.dist-info/WHEEL +4 -0
orca_sdk/__init__.py ADDED
@@ -0,0 +1,30 @@
+"""
+OrcaSDK is a Python library for building and using retrieval augmented models in the OrcaCloud.
+"""
+
+from ._utils.common import UNSET, CreateMode, DropMode
+from .classification_model import ClassificationMetrics, ClassificationModel
+from .client import OrcaClient
+from .credentials import OrcaCredentials
+from .datasource import Datasource
+from .embedding_model import (
+    FinetunedEmbeddingModel,
+    PretrainedEmbeddingModel,
+    PretrainedEmbeddingModelName,
+)
+from .job import Job, Status
+from .memoryset import (
+    CascadingEditSuggestion,
+    FilterItemTuple,
+    LabeledMemory,
+    LabeledMemoryLookup,
+    LabeledMemoryset,
+    ScoredMemory,
+    ScoredMemoryLookup,
+    ScoredMemoryset,
+)
+from .regression_model import RegressionModel
+from .telemetry import ClassificationPrediction, FeedbackCategory, RegressionPrediction
+
+# only specify things that should show up on the root page of the reference docs because they are in private modules
+__all__ = ["UNSET", "CreateMode", "DropMode"]
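A minimal import sketch of the public surface re-exported above (illustrative only; it lists names exported by orca_sdk/__init__.py and makes no claims about their methods or signatures):

    from orca_sdk import (
        ClassificationModel,
        Datasource,
        LabeledMemoryset,
        OrcaCredentials,
        RegressionModel,
    )
    from orca_sdk import UNSET, CreateMode, DropMode  # the only names listed in __all__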
orca_sdk/_shared/__init__.py ADDED
@@ -0,0 +1,10 @@
+from .metrics import (
+    ClassificationMetrics,
+    PRCurve,
+    RegressionMetrics,
+    ROCCurve,
+    calculate_classification_metrics,
+    calculate_pr_curve,
+    calculate_regression_metrics,
+    calculate_roc_curve,
+)
orca_sdk/_shared/metrics.py ADDED
@@ -0,0 +1,634 @@
+"""
+This module contains metrics for usage with the Hugging Face Trainer.
+
+IMPORTANT:
+- This is a shared file between OrcaLib and the OrcaSDK.
+- Please ensure that it does not have any dependencies on the OrcaLib code.
+- Make sure to edit this file in orcalib/shared and NOT in orca_sdk, since it will be overwritten there.
+
+"""
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Literal, Sequence, TypedDict, cast
+
+import numpy as np
+import sklearn.metrics
+from numpy.typing import NDArray
+
+
+# we don't want to depend on scipy or torch in orca_sdk
+def softmax(logits: np.ndarray, axis: int = -1) -> np.ndarray:
+    shifted = logits - np.max(logits, axis=axis, keepdims=True)
+    exps = np.exp(shifted)
+    sums = np.sum(exps, axis=axis, keepdims=True)
+    # Guard against division by zero (can happen if all logits are -inf or NaN)
+    return exps / np.where(sums > 0, sums, 1.0)
+
+
+# We don't want to depend on transformers just for the eval_pred type in orca_sdk
+def transform_eval_pred(eval_pred: Any) -> tuple[NDArray, NDArray[np.float32]]:
+    # convert results from Trainer compute_metrics param for use in calculate_classification_metrics
+    logits, references = eval_pred  # transformers.trainer_utils.EvalPrediction
+    if isinstance(logits, tuple):
+        logits = logits[0]
+    if not isinstance(logits, np.ndarray):
+        raise ValueError("Logits must be a numpy array")
+    if not isinstance(references, np.ndarray):
+        raise ValueError(
+            "Multiple label columns found, use the `label_names` training argument to specify which one to use"
+        )
+
+    return (references, logits)
+
+
+def convert_to_float32_array(
+    data: (
+        Sequence[float | None]
+        | NDArray[np.float32]
+        | Sequence[Sequence[float]]
+        | Sequence[NDArray[np.float32]]
+        | NDArray[np.float32]
+    ),
+) -> NDArray[np.float32]:
+    """
+    Convert a list or array that may contain None values to a float32 numpy array.
+    None values are converted to NaN.
+
+    Args:
+        data: Input data that may contain None values
+
+    Returns:
+        A float32 numpy array with None values converted to NaN
+    """
+    array = np.array(data)
+    # Convert None values to NaN to handle missing values
+    if array.dtype == object:
+
+        def convert_value(x):
+            return np.nan if x is None else float(x)
+
+        array = np.vectorize(convert_value, otypes=[np.float32])(array)
+    else:
+        array = np.asarray(array, dtype=np.float32)
+    return cast(NDArray[np.float32], array)
+
+
+def calculate_anomaly_score_stats(
+    anomaly_scores: NDArray[np.float32] | Sequence[float] | None,
+) -> tuple[float | None, float | None, float | None]:
+    """
+    Calculate statistics (mean, median, variance) for anomaly scores.
+
+    Args:
+        anomaly_scores: Anomaly scores as a list, numpy array, or None
+
+    Returns:
+        A tuple of (mean, median, variance). All values are None if anomaly_scores is None.
+    """
+    if anomaly_scores is None:
+        return (None, None, None)
+
+    # Convert to numpy array if needed
+    if isinstance(anomaly_scores, list):
+        anomalies = np.array(anomaly_scores, dtype=np.float32)
+    else:
+        anomalies = anomaly_scores
+
+    return (
+        float(np.mean(anomalies)),
+        float(np.median(anomalies)),
+        float(np.var(anomalies)),
+    )
+
+
+class PRCurve(TypedDict):
+    thresholds: list[float]
+    precisions: list[float]
+    recalls: list[float]
+
+
+def calculate_pr_curve(
+    references: NDArray[np.int64],
+    probabilities: NDArray[np.float32],
+    max_length: int = 100,
+) -> PRCurve:
+    if probabilities.ndim == 1:
+        probabilities_slice = probabilities
+    elif probabilities.ndim == 2:
+        probabilities_slice = probabilities[:, 1]
+    else:
+        raise ValueError("Probabilities must be 1 or 2 dimensional")
+
+    if len(probabilities_slice) != len(references):
+        raise ValueError("Probabilities and references must have the same length")
+
+    precisions, recalls, thresholds = sklearn.metrics.precision_recall_curve(references, probabilities_slice)
+
+    # Convert all arrays to float32 immediately after getting them
+    precisions = precisions.astype(np.float32)
+    recalls = recalls.astype(np.float32)
+    thresholds = thresholds.astype(np.float32)
+
+    # Concatenate with 0 to include the lowest threshold
+    thresholds = np.concatenate(([0], thresholds))
+
+    # Sort by threshold
+    sorted_indices = np.argsort(thresholds)
+    thresholds = thresholds[sorted_indices]
+    precisions = precisions[sorted_indices]
+    recalls = recalls[sorted_indices]
+
+    if len(precisions) > max_length:
+        new_thresholds = np.linspace(0, 1, max_length, dtype=np.float32)
+        new_precisions = np.interp(new_thresholds, thresholds, precisions)
+        new_recalls = np.interp(new_thresholds, thresholds, recalls)
+        thresholds = new_thresholds
+        precisions = new_precisions
+        recalls = new_recalls
+
+    return PRCurve(
+        thresholds=cast(list[float], thresholds.tolist()),
+        precisions=cast(list[float], precisions.tolist()),
+        recalls=cast(list[float], recalls.tolist()),
+    )
+
+
+class ROCCurve(TypedDict):
+    thresholds: list[float]
+    false_positive_rates: list[float]
+    true_positive_rates: list[float]
+
+
+def calculate_roc_curve(
+    references: NDArray[np.int64],
+    probabilities: NDArray[np.float32],
+    max_length: int = 100,
+) -> ROCCurve:
+    if probabilities.ndim == 1:
+        probabilities_slice = probabilities
+    elif probabilities.ndim == 2:
+        probabilities_slice = probabilities[:, 1]
+    else:
+        raise ValueError("Probabilities must be 1 or 2 dimensional")
+
+    if len(probabilities_slice) != len(references):
+        raise ValueError("Probabilities and references must have the same length")
+
+    # Convert probabilities to float32 before calling sklearn_roc_curve
+    probabilities_slice = probabilities_slice.astype(np.float32)
+    fpr, tpr, thresholds = sklearn.metrics.roc_curve(references, probabilities_slice)
+
+    # Convert all arrays to float32 immediately after getting them
+    fpr = fpr.astype(np.float32)
+    tpr = tpr.astype(np.float32)
+    thresholds = thresholds.astype(np.float32)
+
+    # We set the first threshold to 1.0 instead of inf for reasonable values in interpolation
+    thresholds[0] = 1.0
+
+    # Sort by threshold
+    sorted_indices = np.argsort(thresholds)
+    thresholds = thresholds[sorted_indices]
+    fpr = fpr[sorted_indices]
+    tpr = tpr[sorted_indices]
+
+    if len(fpr) > max_length:
+        new_thresholds = np.linspace(0, 1, max_length, dtype=np.float32)
+        new_fpr = np.interp(new_thresholds, thresholds, fpr)
+        new_tpr = np.interp(new_thresholds, thresholds, tpr)
+        thresholds = new_thresholds
+        fpr = new_fpr
+        tpr = new_tpr
+
+    return ROCCurve(
+        false_positive_rates=cast(list[float], fpr.tolist()),
+        true_positive_rates=cast(list[float], tpr.tolist()),
+        thresholds=cast(list[float], thresholds.tolist()),
+    )
+
+
+@dataclass
+class ClassificationMetrics:
+    coverage: float
+    """Percentage of predictions that are not none"""
+
+    f1_score: float
+    """F1 score of the predictions"""
+
+    accuracy: float
+    """Accuracy of the predictions"""
+
+    loss: float | None
+    """Cross-entropy loss of the logits"""
+
+    anomaly_score_mean: float | None = None
+    """Mean of anomaly scores across the dataset"""
+
+    anomaly_score_median: float | None = None
+    """Median of anomaly scores across the dataset"""
+
+    anomaly_score_variance: float | None = None
+    """Variance of anomaly scores across the dataset"""
+
+    roc_auc: float | None = None
+    """Receiver operating characteristic area under the curve"""
+
+    pr_auc: float | None = None
+    """Average precision (area under the curve of the precision-recall curve)"""
+
+    pr_curve: PRCurve | None = None
+    """Precision-recall curve"""
+
+    roc_curve: ROCCurve | None = None
+    """Receiver operating characteristic curve"""
+
+    confusion_matrix: list[list[int]] | None = None
+    """Confusion matrix where confusion_matrix[i][j] is the count of samples with true label i predicted as label j"""
+
+    warnings: list[str] = field(default_factory=list)
+    """Human-readable warnings about skipped or adjusted metrics"""
+
+    def __repr__(self) -> str:
+        return (
+            "ClassificationMetrics({\n"
+            + f" accuracy: {self.accuracy:.4f},\n"
+            + f" f1_score: {self.f1_score:.4f},\n"
+            + (f" roc_auc: {self.roc_auc:.4f},\n" if self.roc_auc else "")
+            + (f" pr_auc: {self.pr_auc:.4f},\n" if self.pr_auc else "")
+            + (
+                f" anomaly_score: {self.anomaly_score_mean:.4f} ± {self.anomaly_score_variance:.4f},\n"
+                if self.anomaly_score_mean
+                else ""
+            )
+            + "})"
+        )
+
+
+def convert_logits_to_probabilities(logits: NDArray[np.float32]) -> NDArray[np.float32]:
+    """
+    Convert logits to probability distributions.
+
+    This function handles multiple input formats:
+    - 1D arrays: Binary classification probabilities (must be between 0 and 1)
+    - 2D arrays: Multi-class logits or probabilities
+
+    For 2D inputs, the function automatically detects the format:
+    - If any values are <= 0: applies softmax (raw logits)
+    - If rows don't sum to 1: normalizes to probabilities
+    - If rows sum to 1: treats as already normalized probabilities
+
+    Args:
+        logits: Input logits or probabilities as a float32 numpy array.
+            Can be 1D (binary) or 2D (multi-class). May contain NaN values.
+
+    Returns:
+        A 2D float32 numpy array of probabilities with shape (n_samples, n_classes).
+        Each row sums to 1.0 (except for rows with all NaN values).
+
+    Raises:
+        ValueError: If logits are not 1D or 2D
+        ValueError: If 1D logits are not between 0 and 1 (for binary classification)
+        ValueError: If 2D logits have fewer than 2 classes (use regression metrics instead)
+    """
+    if logits.ndim == 1:
+        # Binary classification: 1D probabilities
+        # Check non-NaN values only
+        valid_logits = logits[~np.isnan(logits)]
+        if len(valid_logits) > 0 and ((valid_logits > 1).any() or (valid_logits < 0).any()):
+            raise ValueError("Logits must be between 0 and 1 for binary classification")
+        # Convert 1D probabilities to 2D format: [1-p, p]
+        probabilities = cast(NDArray[np.float32], np.column_stack([1 - logits, logits]))
+    elif logits.ndim == 2:
+        if logits.shape[1] < 2:
+            raise ValueError("Use a different metric function for regression tasks")
+        # Check if any non-NaN values are <= 0 (NaN-aware comparison)
+        valid_logits = logits[~np.isnan(logits)]
+        if len(valid_logits) > 0 and not (valid_logits > 0).all():
+            # Contains negative values or zeros: apply softmax (raw logits)
+            probabilities = cast(NDArray[np.float32], softmax(logits))
+        elif not np.allclose(logits.sum(-1, keepdims=True), 1.0):
+            # Rows don't sum to 1: normalize to probabilities
+            row_sums = logits.sum(-1, keepdims=True)
+            # Guard against division by zero (can happen if all values in a row are 0 or NaN)
+            probabilities = cast(NDArray[np.float32], logits / np.where(row_sums > 0, row_sums, 1.0))
+        else:
+            # Already normalized probabilities
+            probabilities = logits
+    else:
+        raise ValueError("Logits must be 1 or 2 dimensional")
+
+    return probabilities
+
+
+def calculate_classification_metrics(
+    expected_labels: list[int] | NDArray[np.int64],
+    logits: list[list[float]] | list[NDArray[np.float32]] | NDArray[np.float32],
+    anomaly_scores: list[float] | None = None,
+    average: Literal["micro", "macro", "weighted", "binary"] | None = None,
+    multi_class: Literal["ovr", "ovo"] = "ovr",
+    include_curves: bool = False,
+    include_confusion_matrix: bool = False,
+) -> ClassificationMetrics:
+    warnings: list[str] = []
+    references = np.array(expected_labels)
+
+    # Convert to numpy array, handling None values
+    logits = convert_to_float32_array(logits)
+
+    # Check if all logits are NaN (all predictions are None/NaN)
+    if np.all(np.isnan(logits)):
+        # Return placeholder metrics when all logits are invalid
+        return ClassificationMetrics(
+            coverage=0.0,
+            f1_score=0.0,
+            accuracy=0.0,
+            loss=None,
+            anomaly_score_mean=None,
+            anomaly_score_median=None,
+            anomaly_score_variance=None,
+            roc_auc=None,
+            pr_auc=None,
+            pr_curve=None,
+            roc_curve=None,
+            confusion_matrix=None,
+        )
+
+    # Convert logits to probabilities
+    probabilities = convert_logits_to_probabilities(logits)
+
+    predictions = np.argmax(probabilities, axis=-1)
+    predictions[np.isnan(probabilities).all(axis=-1)] = -1  # set predictions to -1 for all nan logits
+
+    num_classes_references = len(set(references))
+    num_classes_predictions = probabilities.shape[1]  # Number of probability columns (model's known classes)
+    num_none_predictions = np.isnan(probabilities).all(axis=-1).sum()
+    coverage = 1 - (num_none_predictions / len(probabilities) if len(probabilities) > 0 else 0)
+    if num_none_predictions > 0:
+        warnings.append(f"Some predictions were missing (coverage={coverage:.3f}); loss and AUC metrics were skipped.")
+
+    if average is None:
+        average = "binary" if num_classes_references == 2 and num_none_predictions == 0 else "weighted"
+
+    accuracy = sklearn.metrics.accuracy_score(references, predictions)
+    f1 = sklearn.metrics.f1_score(references, predictions, average=average)
+
+    # Check for unknown classes early (before log_loss)
+    classes_in_references = np.unique(references)
+    has_unknown_classes = np.max(classes_in_references) >= num_classes_predictions
+    if has_unknown_classes:
+        logging.warning(
+            f"Test labels contain classes not in the model's predictions. "
+            f"Model has {num_classes_predictions} classes (0 - {num_classes_predictions - 1}), "
+            f"but test labels contain class {np.max(classes_in_references)}. "
+            f"ROC AUC and PR AUC cannot be calculated."
+        )
+        warnings.append("y_true contains classes unknown to the model; loss and AUC metrics were skipped.")
+
+    # Ensure sklearn sees the full class set corresponding to probability columns
+    # to avoid errors when y_true does not contain all classes.
+    # Skip log_loss if there are unknown classes (would cause ValueError)
+    loss = (
+        sklearn.metrics.log_loss(
+            references,
+            probabilities,
+            labels=list(range(probabilities.shape[1])),
+        )
+        if num_none_predictions == 0 and not has_unknown_classes
+        else None
+    )
+
+    # Calculate ROC AUC with filtering for class mismatch
+    if num_none_predictions == 0:
+        # Check if y_true contains classes not in the model (unknown classes)
+        if has_unknown_classes:
+            # Unknown classes present - can't calculate meaningful ROC AUC
+            logging.warning(
+                "Cannot calculate ROC AUC and PR AUC: test labels contain classes not in the model's predictions."
+            )
+            if "y_true contains classes unknown to the model" not in " ".join(warnings):
+                warnings.append("y_true contains classes unknown to the model; loss and AUC metrics were skipped.")
+            roc_auc = None
+            pr_auc = None
+            pr_curve = None
+            roc_curve = None
+        elif len(classes_in_references) < 2:
+            # Need at least 2 classes for ROC AUC
+            logging.warning(
+                f"Cannot calculate ROC AUC and PR AUC: need at least 2 classes, but only {len(classes_in_references)} class(es) found in test labels."
+            )
+            roc_auc = None
+            pr_auc = None
+            pr_curve = None
+            roc_curve = None
+            warnings.append("ROC AUC requires at least 2 classes; metric was skipped.")
+        else:
+            # Filter probabilities to only classes present in references
+            if len(classes_in_references) < num_classes_predictions:
+                # Subset and renormalize probabilities
+                probabilities_filtered = probabilities[:, classes_in_references]
+                # Safe renormalization: guard against zero denominators
+                row_sums = probabilities_filtered.sum(axis=1, keepdims=True)
+                probabilities_filtered = probabilities_filtered / np.where(row_sums > 0, row_sums, 1.0)
+
+                # Remap references to filtered indices
+                class_mapping = {cls: idx for idx, cls in enumerate(classes_in_references)}
+                references_remapped = np.array([class_mapping[y] for y in references])
+                warnings.append(
+                    f"ROC AUC computed only on classes present in y_true: {classes_in_references.tolist()}."
+                )
+            else:
+                # All classes present, no filtering needed
+                probabilities_filtered = probabilities
+                references_remapped = references
+
+            # special case for binary classification: https://github.com/scikit-learn/scikit-learn/issues/20186
+            if len(classes_in_references) == 2:
+                # Use probabilities[:, 1] which is guaranteed to be 2D
+                probabilities_positive = cast(NDArray[np.float32], probabilities_filtered[:, 1].astype(np.float32))
+                roc_auc = sklearn.metrics.roc_auc_score(references_remapped, probabilities_positive)
+                roc_curve = calculate_roc_curve(references_remapped, probabilities_positive) if include_curves else None
+                pr_auc = sklearn.metrics.average_precision_score(references_remapped, probabilities_positive)
+                pr_curve = calculate_pr_curve(references_remapped, probabilities_positive) if include_curves else None
+            else:
+                roc_auc = sklearn.metrics.roc_auc_score(
+                    references_remapped, probabilities_filtered, multi_class=multi_class
+                )
+                roc_curve = None
+                pr_auc = None
+                pr_curve = None
+    else:
+        roc_auc = None
+        pr_auc = None
+        pr_curve = None
+        roc_curve = None
+
+    # Calculate anomaly score statistics
+    anomaly_score_mean, anomaly_score_median, anomaly_score_variance = calculate_anomaly_score_stats(anomaly_scores)
+
+    # Calculate confusion matrix if requested
+    confusion_matrix: list[list[int]] | None = None
+    if include_confusion_matrix:
+        # Get the number of classes from the probabilities shape
+        num_classes = probabilities.shape[1]
+        labels = list(range(num_classes))
+        # Filter out NaN predictions (which are set to -1) before computing confusion matrix
+        valid_mask = predictions != -1
+        num_filtered = (~valid_mask).sum()
+        if num_filtered > 0:
+            warning_msg = (
+                f"Confusion matrix computation: filtered out {num_filtered} samples with NaN predictions "
+                f"({num_filtered}/{len(predictions)} = {num_filtered / len(predictions):.1%})"
+            )
+            logging.warning(warning_msg)
+            warnings.append(warning_msg)
+
+        if np.any(valid_mask):
+            # Compute confusion matrix with explicit labels to ensure consistent shape
+            cm = sklearn.metrics.confusion_matrix(references[valid_mask], predictions[valid_mask], labels=labels)
+        else:
+            # No valid predictions; return an all-zero confusion matrix
+            cm = np.zeros((num_classes, num_classes), dtype=int)
+        confusion_matrix = cast(list[list[int]], cm.tolist())
+
+    return ClassificationMetrics(
+        coverage=coverage,
+        accuracy=float(accuracy),
+        f1_score=float(f1),
+        loss=float(loss) if loss is not None else None,
+        anomaly_score_mean=anomaly_score_mean,
+        anomaly_score_median=anomaly_score_median,
+        anomaly_score_variance=anomaly_score_variance,
+        roc_auc=float(roc_auc) if roc_auc is not None else None,
+        pr_auc=float(pr_auc) if pr_auc is not None else None,
+        pr_curve=pr_curve,
+        roc_curve=roc_curve,
+        confusion_matrix=confusion_matrix,
+        warnings=warnings,
+    )
+
+
+@dataclass
+class RegressionMetrics:
+    coverage: float
+    """Percentage of predictions that are not none"""
+
+    mse: float
+    """Mean squared error of the predictions"""
+
+    rmse: float
+    """Root mean squared error of the predictions"""
+
+    mae: float
+    """Mean absolute error of the predictions"""
+
+    r2: float
+    """R-squared score (coefficient of determination) of the predictions"""
+
+    explained_variance: float
+    """Explained variance score of the predictions"""
+
+    loss: float
+    """Mean squared error loss of the predictions"""
+
+    anomaly_score_mean: float | None = None
+    """Mean of anomaly scores across the dataset"""
+
+    anomaly_score_median: float | None = None
+    """Median of anomaly scores across the dataset"""
+
+    anomaly_score_variance: float | None = None
+    """Variance of anomaly scores across the dataset"""
+
+    def __repr__(self) -> str:
+        return (
+            "RegressionMetrics({\n"
+            + f" mae: {self.mae:.4f},\n"
+            + f" rmse: {self.rmse:.4f},\n"
+            + f" r2: {self.r2:.4f},\n"
+            + (
+                f" anomaly_score: {self.anomaly_score_mean:.4f} ± {self.anomaly_score_variance:.4f},\n"
+                if self.anomaly_score_mean
+                else ""
+            )
+            + "})"
+        )
+
+
+def calculate_regression_metrics(
+    expected_scores: NDArray[np.float32] | Sequence[float],
+    predicted_scores: NDArray[np.float32] | Sequence[float | None],
+    anomaly_scores: NDArray[np.float32] | Sequence[float] | None = None,
+) -> RegressionMetrics:
+    """
+    Calculate regression metrics for model evaluation.
+
+    Args:
+        expected_scores: True target values
+        predicted_scores: Predicted values from the model
+        anomaly_scores: Optional anomaly scores for each prediction
+
+    Returns:
+        Comprehensive regression metrics including MSE, RMSE, MAE, R², and explained variance
+
+    Raises:
+        ValueError: If predictions and references have different lengths
+        ValueError: If expected_scores contains None or NaN values
+    """
+    # Convert to numpy arrays, handling None values
+    references = convert_to_float32_array(expected_scores)
+    predictions = convert_to_float32_array(predicted_scores)
+
+    if len(predictions) != len(references):
+        raise ValueError("Predictions and references must have the same length")
+
+    # Validate that all expected_scores are non-None and non-NaN
+    if np.any(np.isnan(references)):
+        raise ValueError("expected_scores must not contain None or NaN values")
+
+    # If all of the predictions are None or NaN, return None for all metrics
+    if np.all(np.isnan(predictions)):
+        anomaly_score_mean, anomaly_score_median, anomaly_score_variance = calculate_anomaly_score_stats(anomaly_scores)
+        return RegressionMetrics(
+            coverage=0.0,
+            mse=0.0,
+            rmse=0.0,
+            mae=0.0,
+            r2=0.0,
+            explained_variance=0.0,
+            loss=0.0,
+            anomaly_score_mean=anomaly_score_mean,
+            anomaly_score_median=anomaly_score_median,
+            anomaly_score_variance=anomaly_score_variance,
+        )
+
+    # Filter out NaN values from predictions (expected_scores are already validated to be non-NaN)
+    valid_mask = ~np.isnan(predictions)
+    num_none_predictions = (~valid_mask).sum()
+    coverage = 1 - (num_none_predictions / len(predictions) if len(predictions) > 0 else 0)
+    if num_none_predictions > 0:
+        references = references[valid_mask]
+        predictions = predictions[valid_mask]
+
+    # Calculate core regression metrics
+    mse = float(sklearn.metrics.mean_squared_error(references, predictions))
+    rmse = float(np.sqrt(mse))
+    mae = float(sklearn.metrics.mean_absolute_error(references, predictions))
+    r2 = float(sklearn.metrics.r2_score(references, predictions))
+    explained_var = float(sklearn.metrics.explained_variance_score(references, predictions))
+
+    # Calculate anomaly score statistics
+    anomaly_score_mean, anomaly_score_median, anomaly_score_variance = calculate_anomaly_score_stats(anomaly_scores)
+
+    return RegressionMetrics(
+        coverage=coverage,
+        mse=mse,
+        rmse=rmse,
+        mae=mae,
+        r2=r2,
+        explained_variance=explained_var,
+        loss=mse,  # For regression, loss is typically MSE
+        anomaly_score_mean=anomaly_score_mean,
+        anomaly_score_median=anomaly_score_median,
+        anomaly_score_variance=anomaly_score_variance,
+    )
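For orientation, a small usage sketch of the shared metrics helpers above (the inputs are toy values invented for illustration, and orca_sdk._shared is a private module, so this import path is an assumption rather than a supported entry point):

    import numpy as np

    from orca_sdk._shared.metrics import (
        calculate_classification_metrics,
        calculate_regression_metrics,
    )

    # Binary classification with per-class probabilities (each row sums to 1).
    labels = [0, 1, 1, 0, 1]
    probs = np.array(
        [[0.9, 0.1], [0.2, 0.8], [0.4, 0.6], [0.7, 0.3], [0.1, 0.9]],
        dtype=np.float32,
    )
    clf = calculate_classification_metrics(labels, probs, include_curves=True, include_confusion_matrix=True)
    print(clf.accuracy, clf.roc_auc, clf.confusion_matrix)

    # Regression; a None prediction is converted to NaN and lowers coverage.
    reg = calculate_regression_metrics(
        expected_scores=[1.0, 2.0, 3.0, 4.0],
        predicted_scores=[1.1, None, 2.9, 4.2],
    )
    print(reg.rmse, reg.coverage)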