orca-sdk 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/_shared/metrics.py +186 -43
- orca_sdk/_shared/metrics_test.py +99 -6
- orca_sdk/_utils/data_parsing_test.py +1 -1
- orca_sdk/async_client.py +52 -14
- orca_sdk/classification_model.py +107 -30
- orca_sdk/classification_model_test.py +327 -8
- orca_sdk/client.py +52 -14
- orca_sdk/conftest.py +140 -21
- orca_sdk/embedding_model.py +0 -2
- orca_sdk/memoryset.py +141 -26
- orca_sdk/memoryset_test.py +253 -4
- orca_sdk/regression_model.py +73 -16
- orca_sdk/regression_model_test.py +213 -0
- {orca_sdk-0.1.4.dist-info → orca_sdk-0.1.6.dist-info}/METADATA +1 -1
- {orca_sdk-0.1.4.dist-info → orca_sdk-0.1.6.dist-info}/RECORD +16 -16
- {orca_sdk-0.1.4.dist-info → orca_sdk-0.1.6.dist-info}/WHEEL +0 -0
orca_sdk/_shared/metrics.py
CHANGED
|
@@ -9,7 +9,7 @@ IMPORTANT:
|
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
11
|
from dataclasses import dataclass
|
|
12
|
-
from typing import Any, Literal, TypedDict, cast
|
|
12
|
+
from typing import Any, Literal, Sequence, TypedDict, cast
|
|
13
13
|
|
|
14
14
|
import numpy as np
|
|
15
15
|
import sklearn.metrics
|
|
@@ -20,7 +20,9 @@ from numpy.typing import NDArray
|
|
|
20
20
|
def softmax(logits: np.ndarray, axis: int = -1) -> np.ndarray:
|
|
21
21
|
shifted = logits - np.max(logits, axis=axis, keepdims=True)
|
|
22
22
|
exps = np.exp(shifted)
|
|
23
|
-
|
|
23
|
+
sums = np.sum(exps, axis=axis, keepdims=True)
|
|
24
|
+
# Guard against division by zero (can happen if all logits are -inf or NaN)
|
|
25
|
+
return exps / np.where(sums > 0, sums, 1.0)
|
|
24
26
|
|
|
25
27
|
|
|
26
28
|
# We don't want to depend on transformers just for the eval_pred type in orca_sdk
|
|
@@ -39,6 +41,66 @@ def transform_eval_pred(eval_pred: Any) -> tuple[NDArray, NDArray[np.float32]]:
|
|
|
39
41
|
return (references, logits)
|
|
40
42
|
|
|
41
43
|
|
|
44
|
+
def convert_to_float32_array(
|
|
45
|
+
data: (
|
|
46
|
+
Sequence[float | None]
|
|
47
|
+
| NDArray[np.float32]
|
|
48
|
+
| Sequence[Sequence[float]]
|
|
49
|
+
| Sequence[NDArray[np.float32]]
|
|
50
|
+
| NDArray[np.float32]
|
|
51
|
+
),
|
|
52
|
+
) -> NDArray[np.float32]:
|
|
53
|
+
"""
|
|
54
|
+
Convert a list or array that may contain None values to a float32 numpy array.
|
|
55
|
+
None values are converted to NaN.
|
|
56
|
+
|
|
57
|
+
Args:
|
|
58
|
+
data: Input data that may contain None values
|
|
59
|
+
|
|
60
|
+
Returns:
|
|
61
|
+
A float32 numpy array with None values converted to NaN
|
|
62
|
+
"""
|
|
63
|
+
array = np.array(data)
|
|
64
|
+
# Convert None values to NaN to handle missing values
|
|
65
|
+
if array.dtype == object:
|
|
66
|
+
|
|
67
|
+
def convert_value(x):
|
|
68
|
+
return np.nan if x is None else float(x)
|
|
69
|
+
|
|
70
|
+
array = np.vectorize(convert_value, otypes=[np.float32])(array)
|
|
71
|
+
else:
|
|
72
|
+
array = np.asarray(array, dtype=np.float32)
|
|
73
|
+
return cast(NDArray[np.float32], array)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def calculate_anomaly_score_stats(
|
|
77
|
+
anomaly_scores: NDArray[np.float32] | Sequence[float] | None,
|
|
78
|
+
) -> tuple[float | None, float | None, float | None]:
|
|
79
|
+
"""
|
|
80
|
+
Calculate statistics (mean, median, variance) for anomaly scores.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
anomaly_scores: Anomaly scores as a list, numpy array, or None
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
A tuple of (mean, median, variance). All values are None if anomaly_scores is None.
|
|
87
|
+
"""
|
|
88
|
+
if anomaly_scores is None:
|
|
89
|
+
return (None, None, None)
|
|
90
|
+
|
|
91
|
+
# Convert to numpy array if needed
|
|
92
|
+
if isinstance(anomaly_scores, list):
|
|
93
|
+
anomalies = np.array(anomaly_scores, dtype=np.float32)
|
|
94
|
+
else:
|
|
95
|
+
anomalies = anomaly_scores
|
|
96
|
+
|
|
97
|
+
return (
|
|
98
|
+
float(np.mean(anomalies)),
|
|
99
|
+
float(np.median(anomalies)),
|
|
100
|
+
float(np.var(anomalies)),
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
|
|
42
104
|
class PRCurve(TypedDict):
|
|
43
105
|
thresholds: list[float]
|
|
44
106
|
precisions: list[float]
|
|
@@ -196,52 +258,106 @@ class ClassificationMetrics:
|
|
|
196
258
|
)
|
|
197
259
|
|
|
198
260
|
|
|
199
|
-
def
|
|
200
|
-
|
|
201
|
-
logits
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
261
|
+
def convert_logits_to_probabilities(logits: NDArray[np.float32]) -> NDArray[np.float32]:
|
|
262
|
+
"""
|
|
263
|
+
Convert logits to probability distributions.
|
|
264
|
+
|
|
265
|
+
This function handles multiple input formats:
|
|
266
|
+
- 1D arrays: Binary classification probabilities (must be between 0 and 1)
|
|
267
|
+
- 2D arrays: Multi-class logits or probabilities
|
|
268
|
+
|
|
269
|
+
For 2D inputs, the function automatically detects the format:
|
|
270
|
+
- If any values are <= 0: applies softmax (raw logits)
|
|
271
|
+
- If rows don't sum to 1: normalizes to probabilities
|
|
272
|
+
- If rows sum to 1: treats as already normalized probabilities
|
|
208
273
|
|
|
209
|
-
|
|
274
|
+
Args:
|
|
275
|
+
logits: Input logits or probabilities as a float32 numpy array.
|
|
276
|
+
Can be 1D (binary) or 2D (multi-class). May contain NaN values.
|
|
277
|
+
|
|
278
|
+
Returns:
|
|
279
|
+
A 2D float32 numpy array of probabilities with shape (n_samples, n_classes).
|
|
280
|
+
Each row sums to 1.0 (except for rows with all NaN values).
|
|
281
|
+
|
|
282
|
+
Raises:
|
|
283
|
+
ValueError: If logits are not 1D or 2D
|
|
284
|
+
ValueError: If 1D logits are not between 0 and 1 (for binary classification)
|
|
285
|
+
ValueError: If 2D logits have fewer than 2 classes (use regression metrics instead)
|
|
286
|
+
"""
|
|
210
287
|
if logits.ndim == 1:
|
|
211
|
-
|
|
288
|
+
# Binary classification: 1D probabilities
|
|
289
|
+
# Check non-NaN values only
|
|
290
|
+
valid_logits = logits[~np.isnan(logits)]
|
|
291
|
+
if len(valid_logits) > 0 and ((valid_logits > 1).any() or (valid_logits < 0).any()):
|
|
212
292
|
raise ValueError("Logits must be between 0 and 1 for binary classification")
|
|
213
|
-
#
|
|
214
|
-
|
|
215
|
-
probabilities = logits # no need to convert to probabilities
|
|
293
|
+
# Convert 1D probabilities to 2D format: [1-p, p]
|
|
294
|
+
probabilities = cast(NDArray[np.float32], np.column_stack([1 - logits, logits]))
|
|
216
295
|
elif logits.ndim == 2:
|
|
217
296
|
if logits.shape[1] < 2:
|
|
218
297
|
raise ValueError("Use a different metric function for regression tasks")
|
|
219
|
-
if
|
|
220
|
-
|
|
221
|
-
|
|
298
|
+
# Check if any non-NaN values are <= 0 (NaN-aware comparison)
|
|
299
|
+
valid_logits = logits[~np.isnan(logits)]
|
|
300
|
+
if len(valid_logits) > 0 and not (valid_logits > 0).all():
|
|
301
|
+
# Contains negative values or zeros: apply softmax (raw logits)
|
|
302
|
+
probabilities = cast(NDArray[np.float32], softmax(logits))
|
|
222
303
|
elif not np.allclose(logits.sum(-1, keepdims=True), 1.0):
|
|
223
|
-
#
|
|
224
|
-
|
|
304
|
+
# Rows don't sum to 1: normalize to probabilities
|
|
305
|
+
row_sums = logits.sum(-1, keepdims=True)
|
|
306
|
+
# Guard against division by zero (can happen if all values in a row are 0 or NaN)
|
|
307
|
+
probabilities = cast(NDArray[np.float32], logits / np.where(row_sums > 0, row_sums, 1.0))
|
|
225
308
|
else:
|
|
309
|
+
# Already normalized probabilities
|
|
226
310
|
probabilities = logits
|
|
227
311
|
else:
|
|
228
312
|
raise ValueError("Logits must be 1 or 2 dimensional")
|
|
229
313
|
|
|
314
|
+
return probabilities
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def calculate_classification_metrics(
|
|
318
|
+
expected_labels: list[int] | NDArray[np.int64],
|
|
319
|
+
logits: list[list[float]] | list[NDArray[np.float32]] | NDArray[np.float32],
|
|
320
|
+
anomaly_scores: list[float] | None = None,
|
|
321
|
+
average: Literal["micro", "macro", "weighted", "binary"] | None = None,
|
|
322
|
+
multi_class: Literal["ovr", "ovo"] = "ovr",
|
|
323
|
+
include_curves: bool = False,
|
|
324
|
+
) -> ClassificationMetrics:
|
|
325
|
+
references = np.array(expected_labels)
|
|
326
|
+
|
|
327
|
+
# Convert to numpy array, handling None values
|
|
328
|
+
logits = convert_to_float32_array(logits)
|
|
329
|
+
|
|
330
|
+
# Check if all logits are NaN (all predictions are None/NaN)
|
|
331
|
+
if np.all(np.isnan(logits)):
|
|
332
|
+
# Return placeholder metrics when all logits are invalid
|
|
333
|
+
return ClassificationMetrics(
|
|
334
|
+
coverage=0.0,
|
|
335
|
+
f1_score=0.0,
|
|
336
|
+
accuracy=0.0,
|
|
337
|
+
loss=None,
|
|
338
|
+
anomaly_score_mean=None,
|
|
339
|
+
anomaly_score_median=None,
|
|
340
|
+
anomaly_score_variance=None,
|
|
341
|
+
roc_auc=None,
|
|
342
|
+
pr_auc=None,
|
|
343
|
+
pr_curve=None,
|
|
344
|
+
roc_curve=None,
|
|
345
|
+
)
|
|
346
|
+
|
|
347
|
+
# Convert logits to probabilities
|
|
348
|
+
probabilities = convert_logits_to_probabilities(logits)
|
|
349
|
+
|
|
230
350
|
predictions = np.argmax(probabilities, axis=-1)
|
|
231
351
|
predictions[np.isnan(probabilities).all(axis=-1)] = -1 # set predictions to -1 for all nan logits
|
|
232
352
|
|
|
233
353
|
num_classes_references = len(set(references))
|
|
234
354
|
num_classes_predictions = len(set(predictions))
|
|
235
355
|
num_none_predictions = np.isnan(probabilities).all(axis=-1).sum()
|
|
236
|
-
coverage = 1 - num_none_predictions / len(probabilities)
|
|
356
|
+
coverage = 1 - (num_none_predictions / len(probabilities) if len(probabilities) > 0 else 0)
|
|
237
357
|
|
|
238
358
|
if average is None:
|
|
239
359
|
average = "binary" if num_classes_references == 2 and num_none_predictions == 0 else "weighted"
|
|
240
360
|
|
|
241
|
-
anomaly_score_mean = float(np.mean(anomaly_scores)) if anomaly_scores else None
|
|
242
|
-
anomaly_score_median = float(np.median(anomaly_scores)) if anomaly_scores else None
|
|
243
|
-
anomaly_score_variance = float(np.var(anomaly_scores)) if anomaly_scores else None
|
|
244
|
-
|
|
245
361
|
accuracy = sklearn.metrics.accuracy_score(references, predictions)
|
|
246
362
|
f1 = sklearn.metrics.f1_score(references, predictions, average=average)
|
|
247
363
|
# Ensure sklearn sees the full class set corresponding to probability columns
|
|
@@ -259,10 +375,12 @@ def calculate_classification_metrics(
|
|
|
259
375
|
if num_classes_references == num_classes_predictions and num_none_predictions == 0:
|
|
260
376
|
# special case for binary classification: https://github.com/scikit-learn/scikit-learn/issues/20186
|
|
261
377
|
if num_classes_references == 2:
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
378
|
+
# Use probabilities[:, 1] which is guaranteed to be 2D
|
|
379
|
+
probabilities_positive = probabilities[:, 1]
|
|
380
|
+
roc_auc = sklearn.metrics.roc_auc_score(references, probabilities_positive)
|
|
381
|
+
roc_curve = calculate_roc_curve(references, probabilities_positive) if include_curves else None
|
|
382
|
+
pr_auc = sklearn.metrics.average_precision_score(references, probabilities_positive)
|
|
383
|
+
pr_curve = calculate_pr_curve(references, probabilities_positive) if include_curves else None
|
|
266
384
|
else:
|
|
267
385
|
roc_auc = sklearn.metrics.roc_auc_score(references, probabilities, multi_class=multi_class)
|
|
268
386
|
roc_curve = None
|
|
@@ -274,6 +392,9 @@ def calculate_classification_metrics(
|
|
|
274
392
|
pr_curve = None
|
|
275
393
|
roc_curve = None
|
|
276
394
|
|
|
395
|
+
# Calculate anomaly score statistics
|
|
396
|
+
anomaly_score_mean, anomaly_score_median, anomaly_score_variance = calculate_anomaly_score_stats(anomaly_scores)
|
|
397
|
+
|
|
277
398
|
return ClassificationMetrics(
|
|
278
399
|
coverage=coverage,
|
|
279
400
|
accuracy=float(accuracy),
|
|
@@ -337,9 +458,9 @@ class RegressionMetrics:
|
|
|
337
458
|
|
|
338
459
|
|
|
339
460
|
def calculate_regression_metrics(
|
|
340
|
-
expected_scores: NDArray[np.float32] |
|
|
341
|
-
predicted_scores: NDArray[np.float32] |
|
|
342
|
-
anomaly_scores:
|
|
461
|
+
expected_scores: NDArray[np.float32] | Sequence[float],
|
|
462
|
+
predicted_scores: NDArray[np.float32] | Sequence[float | None],
|
|
463
|
+
anomaly_scores: NDArray[np.float32] | Sequence[float] | None = None,
|
|
343
464
|
) -> RegressionMetrics:
|
|
344
465
|
"""
|
|
345
466
|
Calculate regression metrics for model evaluation.
|
|
@@ -354,23 +475,42 @@ def calculate_regression_metrics(
|
|
|
354
475
|
|
|
355
476
|
Raises:
|
|
356
477
|
ValueError: If predictions and references have different lengths
|
|
478
|
+
ValueError: If expected_scores contains None or NaN values
|
|
357
479
|
"""
|
|
358
|
-
|
|
359
|
-
|
|
480
|
+
# Convert to numpy arrays, handling None values
|
|
481
|
+
references = convert_to_float32_array(expected_scores)
|
|
482
|
+
predictions = convert_to_float32_array(predicted_scores)
|
|
360
483
|
|
|
361
484
|
if len(predictions) != len(references):
|
|
362
485
|
raise ValueError("Predictions and references must have the same length")
|
|
363
486
|
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
487
|
+
# Validate that all expected_scores are non-None and non-NaN
|
|
488
|
+
if np.any(np.isnan(references)):
|
|
489
|
+
raise ValueError("expected_scores must not contain None or NaN values")
|
|
490
|
+
|
|
491
|
+
# If all of the predictions are None or NaN, return None for all metrics
|
|
492
|
+
if np.all(np.isnan(predictions)):
|
|
493
|
+
anomaly_score_mean, anomaly_score_median, anomaly_score_variance = calculate_anomaly_score_stats(anomaly_scores)
|
|
494
|
+
return RegressionMetrics(
|
|
495
|
+
coverage=0.0,
|
|
496
|
+
mse=0.0,
|
|
497
|
+
rmse=0.0,
|
|
498
|
+
mae=0.0,
|
|
499
|
+
r2=0.0,
|
|
500
|
+
explained_variance=0.0,
|
|
501
|
+
loss=0.0,
|
|
502
|
+
anomaly_score_mean=anomaly_score_mean,
|
|
503
|
+
anomaly_score_median=anomaly_score_median,
|
|
504
|
+
anomaly_score_variance=anomaly_score_variance,
|
|
505
|
+
)
|
|
367
506
|
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
507
|
+
# Filter out NaN values from predictions (expected_scores are already validated to be non-NaN)
|
|
508
|
+
valid_mask = ~np.isnan(predictions)
|
|
509
|
+
num_none_predictions = (~valid_mask).sum()
|
|
510
|
+
coverage = 1 - (num_none_predictions / len(predictions) if len(predictions) > 0 else 0)
|
|
371
511
|
if num_none_predictions > 0:
|
|
372
|
-
references = references[
|
|
373
|
-
predictions = predictions[
|
|
512
|
+
references = references[valid_mask]
|
|
513
|
+
predictions = predictions[valid_mask]
|
|
374
514
|
|
|
375
515
|
# Calculate core regression metrics
|
|
376
516
|
mse = float(sklearn.metrics.mean_squared_error(references, predictions))
|
|
@@ -379,6 +519,9 @@ def calculate_regression_metrics(
|
|
|
379
519
|
r2 = float(sklearn.metrics.r2_score(references, predictions))
|
|
380
520
|
explained_var = float(sklearn.metrics.explained_variance_score(references, predictions))
|
|
381
521
|
|
|
522
|
+
# Calculate anomaly score statistics
|
|
523
|
+
anomaly_score_mean, anomaly_score_median, anomaly_score_variance = calculate_anomaly_score_stats(anomaly_scores)
|
|
524
|
+
|
|
382
525
|
return RegressionMetrics(
|
|
383
526
|
coverage=coverage,
|
|
384
527
|
mse=mse,
|
orca_sdk/_shared/metrics_test.py
CHANGED
|
@@ -80,24 +80,36 @@ def test_multiclass_metrics_with_3_classes(
|
|
|
80
80
|
def test_does_not_modify_logits_unless_necessary():
|
|
81
81
|
logits = np.array([[0.1, 0.9], [0.2, 0.8], [0.7, 0.3], [0.8, 0.2]])
|
|
82
82
|
expected_labels = [0, 1, 0, 1]
|
|
83
|
-
|
|
84
|
-
|
|
83
|
+
loss = calculate_classification_metrics(expected_labels, logits).loss
|
|
84
|
+
assert loss is not None
|
|
85
|
+
assert np.allclose(
|
|
86
|
+
loss,
|
|
87
|
+
sklearn.metrics.log_loss(expected_labels, logits),
|
|
88
|
+
atol=1e-6,
|
|
85
89
|
)
|
|
86
90
|
|
|
87
91
|
|
|
88
92
|
def test_normalizes_logits_if_necessary():
|
|
89
93
|
logits = np.array([[1.2, 3.9], [1.2, 5.8], [1.2, 2.7], [1.2, 1.3]])
|
|
90
94
|
expected_labels = [0, 1, 0, 1]
|
|
91
|
-
|
|
92
|
-
|
|
95
|
+
loss = calculate_classification_metrics(expected_labels, logits).loss
|
|
96
|
+
assert loss is not None
|
|
97
|
+
assert np.allclose(
|
|
98
|
+
loss,
|
|
99
|
+
sklearn.metrics.log_loss(expected_labels, logits / logits.sum(axis=1, keepdims=True)),
|
|
100
|
+
atol=1e-6,
|
|
93
101
|
)
|
|
94
102
|
|
|
95
103
|
|
|
96
104
|
def test_softmaxes_logits_if_necessary():
|
|
97
105
|
logits = np.array([[-1.2, 3.9], [1.2, -5.8], [1.2, 2.7], [1.2, 1.3]])
|
|
98
106
|
expected_labels = [0, 1, 0, 1]
|
|
99
|
-
|
|
100
|
-
|
|
107
|
+
loss = calculate_classification_metrics(expected_labels, logits).loss
|
|
108
|
+
assert loss is not None
|
|
109
|
+
assert np.allclose(
|
|
110
|
+
loss,
|
|
111
|
+
sklearn.metrics.log_loss(expected_labels, softmax(logits)),
|
|
112
|
+
atol=1e-6,
|
|
101
113
|
)
|
|
102
114
|
|
|
103
115
|
|
|
@@ -271,3 +283,84 @@ def test_regression_metrics_handles_nans():
|
|
|
271
283
|
assert metrics.mae > 0.0
|
|
272
284
|
assert 0.0 <= metrics.r2 <= 1.0
|
|
273
285
|
assert 0.0 <= metrics.explained_variance <= 1.0
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def test_regression_metrics_handles_none_values():
|
|
289
|
+
# Test with lists containing None values
|
|
290
|
+
y_true = [1.0, 2.0, 3.0, 4.0, 5.0]
|
|
291
|
+
y_pred = [1.1, 1.9, None, 3.8, np.nan]
|
|
292
|
+
|
|
293
|
+
metrics = calculate_regression_metrics(y_true, y_pred)
|
|
294
|
+
|
|
295
|
+
# Coverage should be 0.6 (3 out of 5 predictions are valid)
|
|
296
|
+
# Positions with None/NaN predictions (indices 2 and 4) are filtered out
|
|
297
|
+
assert np.allclose(metrics.coverage, 0.6)
|
|
298
|
+
|
|
299
|
+
# Metrics should be calculated only on valid pairs (indices 0, 1, 3)
|
|
300
|
+
# Valid pairs: (1.0, 1.1), (2.0, 1.9), and (4.0, 3.8)
|
|
301
|
+
expected_mse = np.mean([(1.0 - 1.1) ** 2, (2.0 - 1.9) ** 2, (4.0 - 3.8) ** 2])
|
|
302
|
+
expected_mae = np.mean([abs(1.0 - 1.1), abs(2.0 - 1.9), abs(4.0 - 3.8)])
|
|
303
|
+
|
|
304
|
+
assert metrics.mse == pytest.approx(expected_mse)
|
|
305
|
+
assert metrics.mae == pytest.approx(expected_mae)
|
|
306
|
+
assert metrics.rmse == pytest.approx(np.sqrt(expected_mse))
|
|
307
|
+
assert 0.0 <= metrics.r2 <= 1.0
|
|
308
|
+
assert 0.0 <= metrics.explained_variance <= 1.0
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def test_regression_metrics_rejects_none_expected_scores():
|
|
312
|
+
# Test that None values in expected_scores are rejected
|
|
313
|
+
y_true = [1.0, 2.0, None, 4.0, 5.0]
|
|
314
|
+
y_pred = [1.1, 1.9, 3.2, 3.8, 5.1]
|
|
315
|
+
|
|
316
|
+
with pytest.raises(ValueError, match="expected_scores must not contain None or NaN values"):
|
|
317
|
+
calculate_regression_metrics(y_true, y_pred)
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def test_regression_metrics_rejects_nan_expected_scores():
|
|
321
|
+
# Test that NaN values in expected_scores are rejected
|
|
322
|
+
y_true = np.array([1.0, 2.0, np.nan, 4.0, 5.0], dtype=np.float32)
|
|
323
|
+
y_pred = np.array([1.1, 1.9, 3.2, 3.8, 5.1], dtype=np.float32)
|
|
324
|
+
|
|
325
|
+
with pytest.raises(ValueError, match="expected_scores must not contain None or NaN values"):
|
|
326
|
+
calculate_regression_metrics(y_true, y_pred)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def test_regression_metrics_all_predictions_none():
|
|
330
|
+
# Test with all predictions being None
|
|
331
|
+
y_true = [1.0, 2.0, 3.0, 4.0, 5.0]
|
|
332
|
+
y_pred = [None, None, None, None, None]
|
|
333
|
+
|
|
334
|
+
metrics = calculate_regression_metrics(y_true, y_pred)
|
|
335
|
+
|
|
336
|
+
# When all predictions are None, coverage should be 0.0 and all metrics should be 0.0
|
|
337
|
+
assert metrics.coverage == 0.0
|
|
338
|
+
assert metrics.mse == 0.0
|
|
339
|
+
assert metrics.rmse == 0.0
|
|
340
|
+
assert metrics.mae == 0.0
|
|
341
|
+
assert metrics.r2 == 0.0
|
|
342
|
+
assert metrics.explained_variance == 0.0
|
|
343
|
+
assert metrics.loss == 0.0
|
|
344
|
+
assert metrics.anomaly_score_mean is None
|
|
345
|
+
assert metrics.anomaly_score_median is None
|
|
346
|
+
assert metrics.anomaly_score_variance is None
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def test_regression_metrics_all_predictions_nan():
|
|
350
|
+
# Test with all predictions being NaN
|
|
351
|
+
y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
|
|
352
|
+
y_pred = np.array([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float32)
|
|
353
|
+
|
|
354
|
+
metrics = calculate_regression_metrics(y_true, y_pred)
|
|
355
|
+
|
|
356
|
+
# When all predictions are NaN, coverage should be 0.0 and all metrics should be 0.0
|
|
357
|
+
assert metrics.coverage == 0.0
|
|
358
|
+
assert metrics.mse == 0.0
|
|
359
|
+
assert metrics.rmse == 0.0
|
|
360
|
+
assert metrics.mae == 0.0
|
|
361
|
+
assert metrics.r2 == 0.0
|
|
362
|
+
assert metrics.explained_variance == 0.0
|
|
363
|
+
assert metrics.loss == 0.0
|
|
364
|
+
assert metrics.anomaly_score_mean is None
|
|
365
|
+
assert metrics.anomaly_score_median is None
|
|
366
|
+
assert metrics.anomaly_score_variance is None
|
|
@@ -33,7 +33,7 @@ def test_hf_dataset_from_torch_dict():
|
|
|
33
33
|
# Then the HF dataset should be created successfully
|
|
34
34
|
assert isinstance(hf_dataset, Dataset)
|
|
35
35
|
assert len(hf_dataset) == len(dataset)
|
|
36
|
-
assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id"}
|
|
36
|
+
assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id", "partition_id"}
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
class PytorchTupleDataset(TorchDataset):
|
orca_sdk/async_client.py
CHANGED
|
@@ -137,6 +137,8 @@ class ClassificationEvaluationRequest(TypedDict):
|
|
|
137
137
|
telemetry_tags: NotRequired[list[str] | None]
|
|
138
138
|
subsample: NotRequired[int | float | None]
|
|
139
139
|
ignore_unlabeled: NotRequired[bool]
|
|
140
|
+
datasource_partition_column: NotRequired[str | None]
|
|
141
|
+
partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
|
|
140
142
|
|
|
141
143
|
|
|
142
144
|
class CleanupResponse(TypedDict):
|
|
@@ -317,12 +319,16 @@ class ListMemoriesRequest(TypedDict):
|
|
|
317
319
|
offset: NotRequired[int]
|
|
318
320
|
limit: NotRequired[int]
|
|
319
321
|
filters: NotRequired[list[FilterItem]]
|
|
322
|
+
partition_id: NotRequired[str | None]
|
|
323
|
+
partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
|
|
320
324
|
|
|
321
325
|
|
|
322
326
|
class LookupRequest(TypedDict):
|
|
323
327
|
query: list[str]
|
|
324
328
|
count: NotRequired[int]
|
|
325
329
|
prompt: NotRequired[str | None]
|
|
330
|
+
partition_id: NotRequired[str | list[str | None] | None]
|
|
331
|
+
partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
|
|
326
332
|
|
|
327
333
|
|
|
328
334
|
class LookupScoreMetrics(TypedDict):
|
|
@@ -549,16 +555,7 @@ class PredictiveModelUpdate(TypedDict):
|
|
|
549
555
|
|
|
550
556
|
|
|
551
557
|
PretrainedEmbeddingModelName = Literal[
|
|
552
|
-
"CLIP_BASE",
|
|
553
|
-
"GTE_BASE",
|
|
554
|
-
"CDE_SMALL",
|
|
555
|
-
"DISTILBERT",
|
|
556
|
-
"GTE_SMALL",
|
|
557
|
-
"MXBAI_LARGE",
|
|
558
|
-
"E5_LARGE",
|
|
559
|
-
"QWEN2_1_5B",
|
|
560
|
-
"BGE_BASE",
|
|
561
|
-
"GIST_LARGE",
|
|
558
|
+
"CLIP_BASE", "GTE_BASE", "CDE_SMALL", "DISTILBERT", "GTE_SMALL", "MXBAI_LARGE", "E5_LARGE", "BGE_BASE", "GIST_LARGE"
|
|
562
559
|
]
|
|
563
560
|
|
|
564
561
|
|
|
@@ -588,6 +585,8 @@ class RegressionEvaluationRequest(TypedDict):
|
|
|
588
585
|
telemetry_tags: NotRequired[list[str] | None]
|
|
589
586
|
subsample: NotRequired[int | float | None]
|
|
590
587
|
ignore_unlabeled: NotRequired[bool]
|
|
588
|
+
datasource_partition_column: NotRequired[str | None]
|
|
589
|
+
partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
|
|
591
590
|
|
|
592
591
|
|
|
593
592
|
class RegressionMetrics(TypedDict):
|
|
@@ -631,6 +630,8 @@ class RegressionPredictionRequest(TypedDict):
|
|
|
631
630
|
use_lookup_cache: NotRequired[bool]
|
|
632
631
|
consistency_level: NotRequired[Literal["Bounded", "Session", "Strong", "Eventual"] | None]
|
|
633
632
|
ignore_unlabeled: NotRequired[bool]
|
|
633
|
+
partition_ids: NotRequired[str | list[str | None] | None]
|
|
634
|
+
partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
|
|
634
635
|
|
|
635
636
|
|
|
636
637
|
class ScorePredictionMemoryLookup(TypedDict):
|
|
@@ -1165,7 +1166,14 @@ class BootstrapClassificationModelRequest(TypedDict):
|
|
|
1165
1166
|
num_examples_per_label: NotRequired[int]
|
|
1166
1167
|
|
|
1167
1168
|
|
|
1168
|
-
class
|
|
1169
|
+
class BootstrapLabeledMemoryDataInput(TypedDict):
|
|
1170
|
+
model_description: str
|
|
1171
|
+
label_names: list[str]
|
|
1172
|
+
initial_examples: NotRequired[list[LabeledExample]]
|
|
1173
|
+
num_examples_per_label: NotRequired[int]
|
|
1174
|
+
|
|
1175
|
+
|
|
1176
|
+
class BootstrapLabeledMemoryDataResult(TypedDict):
|
|
1169
1177
|
model_description: str
|
|
1170
1178
|
label_names: list[str]
|
|
1171
1179
|
model_name: str
|
|
@@ -1218,6 +1226,8 @@ class ClassificationPredictionRequest(TypedDict):
|
|
|
1218
1226
|
use_lookup_cache: NotRequired[bool]
|
|
1219
1227
|
consistency_level: NotRequired[Literal["Bounded", "Session", "Strong", "Eventual"] | None]
|
|
1220
1228
|
ignore_unlabeled: NotRequired[bool]
|
|
1229
|
+
partition_ids: NotRequired[str | list[str | None] | None]
|
|
1230
|
+
partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
|
|
1221
1231
|
|
|
1222
1232
|
|
|
1223
1233
|
class CloneMemorysetRequest(TypedDict):
|
|
@@ -1271,6 +1281,7 @@ class CreateMemorysetRequest(TypedDict):
|
|
|
1271
1281
|
datasource_score_column: NotRequired[str | None]
|
|
1272
1282
|
datasource_value_column: str
|
|
1273
1283
|
datasource_source_id_column: NotRequired[str | None]
|
|
1284
|
+
datasource_partition_id_column: NotRequired[str | None]
|
|
1274
1285
|
remove_duplicates: NotRequired[bool]
|
|
1275
1286
|
pretrained_embedding_model_name: NotRequired[PretrainedEmbeddingModelName | None]
|
|
1276
1287
|
finetuned_embedding_model_name_or_id: NotRequired[str | None]
|
|
@@ -1541,6 +1552,7 @@ class MemorysetAnalysisRequest(TypedDict):
|
|
|
1541
1552
|
batch_size: NotRequired[int]
|
|
1542
1553
|
clear_metrics: NotRequired[bool]
|
|
1543
1554
|
configs: MemorysetAnalysisConfigs
|
|
1555
|
+
partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
|
|
1544
1556
|
|
|
1545
1557
|
|
|
1546
1558
|
class MemorysetConceptMetrics(TypedDict):
|
|
@@ -1666,7 +1678,7 @@ class BootstrapClassificationModelMeta(TypedDict):
|
|
|
1666
1678
|
datasource_meta: DatasourceMetadata
|
|
1667
1679
|
memoryset_meta: MemorysetMetadata
|
|
1668
1680
|
model_meta: ClassificationModelMetadata
|
|
1669
|
-
agent_output:
|
|
1681
|
+
agent_output: BootstrapLabeledMemoryDataResult
|
|
1670
1682
|
|
|
1671
1683
|
|
|
1672
1684
|
class BootstrapClassificationModelResponse(TypedDict):
|
|
@@ -2556,7 +2568,7 @@ class OrcaAsyncClient(AsyncClient):
|
|
|
2556
2568
|
timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
|
|
2557
2569
|
extensions: RequestExtensions | None = None,
|
|
2558
2570
|
) -> BootstrapClassificationModelResponse:
|
|
2559
|
-
"""Get the status of a bootstrap
|
|
2571
|
+
"""Get the status of a bootstrap labeled memory data job"""
|
|
2560
2572
|
pass
|
|
2561
2573
|
|
|
2562
2574
|
async def GET(
|
|
@@ -3278,6 +3290,32 @@ class OrcaAsyncClient(AsyncClient):
|
|
|
3278
3290
|
"""Get row count from a specific datasource with optional filtering."""
|
|
3279
3291
|
pass
|
|
3280
3292
|
|
|
3293
|
+
@overload
|
|
3294
|
+
async def POST(
|
|
3295
|
+
self,
|
|
3296
|
+
path: Literal["/datasource/bootstrap_memory_data"],
|
|
3297
|
+
*,
|
|
3298
|
+
params: None = None,
|
|
3299
|
+
json: BootstrapLabeledMemoryDataInput,
|
|
3300
|
+
data: None = None,
|
|
3301
|
+
files: None = None,
|
|
3302
|
+
content: None = None,
|
|
3303
|
+
parse_as: Literal["json"] = "json",
|
|
3304
|
+
headers: HeaderTypes | None = None,
|
|
3305
|
+
cookies: CookieTypes | None = None,
|
|
3306
|
+
auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT,
|
|
3307
|
+
follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT,
|
|
3308
|
+
timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
|
|
3309
|
+
extensions: RequestExtensions | None = None,
|
|
3310
|
+
) -> BootstrapLabeledMemoryDataResult:
|
|
3311
|
+
"""
|
|
3312
|
+
Bootstrap memory data using an AI agent.
|
|
3313
|
+
|
|
3314
|
+
This endpoint uses the bootstrap labeled memory data agent to generate
|
|
3315
|
+
high-quality, diverse training examples for a classification model.
|
|
3316
|
+
"""
|
|
3317
|
+
pass
|
|
3318
|
+
|
|
3281
3319
|
@overload
|
|
3282
3320
|
async def POST(
|
|
3283
3321
|
self,
|
|
@@ -3526,7 +3564,7 @@ class OrcaAsyncClient(AsyncClient):
|
|
|
3526
3564
|
"""
|
|
3527
3565
|
Bootstrap a classification model by creating a memoryset with generated memories and a classification model.
|
|
3528
3566
|
|
|
3529
|
-
This endpoint uses the
|
|
3567
|
+
This endpoint uses the bootstrap_labeled_memory_data agent to generate:
|
|
3530
3568
|
1. Memoryset configuration with appropriate settings
|
|
3531
3569
|
2. Model configuration with optimal parameters
|
|
3532
3570
|
3. High-quality training memories for each label
|