orca-sdk 0.1.4__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/PKG-INFO +1 -1
  2. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_shared/metrics.py +186 -43
  3. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_shared/metrics_test.py +99 -6
  4. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/data_parsing_test.py +1 -1
  5. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/async_client.py +52 -14
  6. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/classification_model.py +107 -30
  7. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/classification_model_test.py +327 -8
  8. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/client.py +52 -14
  9. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/conftest.py +140 -21
  10. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/embedding_model.py +0 -2
  11. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/memoryset.py +141 -26
  12. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/memoryset_test.py +253 -4
  13. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/regression_model.py +73 -16
  14. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/regression_model_test.py +213 -0
  15. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/pyproject.toml +1 -1
  16. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/README.md +0 -0
  17. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/__init__.py +0 -0
  18. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_shared/__init__.py +0 -0
  19. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/__init__.py +0 -0
  20. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/analysis_ui.py +0 -0
  21. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/analysis_ui_style.css +0 -0
  22. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/auth.py +0 -0
  23. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/auth_test.py +0 -0
  24. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/common.py +0 -0
  25. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/data_parsing.py +0 -0
  26. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/pagination.py +0 -0
  27. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/pagination_test.py +0 -0
  28. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/prediction_result_ui.css +0 -0
  29. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/prediction_result_ui.py +0 -0
  30. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/tqdm_file_reader.py +0 -0
  31. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/value_parser.py +0 -0
  32. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/_utils/value_parser_test.py +0 -0
  33. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/credentials.py +0 -0
  34. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/credentials_test.py +0 -0
  35. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/datasource.py +0 -0
  36. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/datasource_test.py +0 -0
  37. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/embedding_model_test.py +0 -0
  38. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/job.py +0 -0
  39. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/job_test.py +0 -0
  40. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/telemetry.py +0 -0
  41. {orca_sdk-0.1.4 → orca_sdk-0.1.6}/orca_sdk/telemetry_test.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: orca_sdk
-Version: 0.1.4
+Version: 0.1.6
 Summary: SDK for interacting with Orca Services
 License-Expression: Apache-2.0
 Author: Orca DB Inc.
orca_sdk/_shared/metrics.py
@@ -9,7 +9,7 @@ IMPORTANT:
 """

 from dataclasses import dataclass
-from typing import Any, Literal, TypedDict, cast
+from typing import Any, Literal, Sequence, TypedDict, cast

 import numpy as np
 import sklearn.metrics
@@ -20,7 +20,9 @@ from numpy.typing import NDArray
 def softmax(logits: np.ndarray, axis: int = -1) -> np.ndarray:
     shifted = logits - np.max(logits, axis=axis, keepdims=True)
     exps = np.exp(shifted)
-    return exps / np.sum(exps, axis=axis, keepdims=True)
+    sums = np.sum(exps, axis=axis, keepdims=True)
+    # Guard against division by zero (can happen if all logits are -inf or NaN)
+    return exps / np.where(sums > 0, sums, 1.0)


 # We don't want to depend on transformers just for the eval_pred type in orca_sdk
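Illustrative sketch (not part of the diff) of the edge case the new softmax guard covers: for a row of all -inf logits, shifted becomes NaN, so the row sum is NaN; since NaN > 0 is False, the guard substitutes a 1.0 denominator instead of dividing by an invalid sum.

import numpy as np

def softmax(logits: np.ndarray, axis: int = -1) -> np.ndarray:
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    sums = np.sum(exps, axis=axis, keepdims=True)
    # NaN > 0 evaluates False, so invalid row sums fall back to 1.0
    return exps / np.where(sums > 0, sums, 1.0)

print(softmax(np.array([[0.0, 1.0], [-np.inf, -np.inf]])))
# first row: [~0.269, ~0.731]; second row propagates NaN without a divide warning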
@@ -39,6 +41,66 @@ def transform_eval_pred(eval_pred: Any) -> tuple[NDArray, NDArray[np.float32]]:
     return (references, logits)


+def convert_to_float32_array(
+    data: (
+        Sequence[float | None]
+        | NDArray[np.float32]
+        | Sequence[Sequence[float]]
+        | Sequence[NDArray[np.float32]]
+        | NDArray[np.float32]
+    ),
+) -> NDArray[np.float32]:
+    """
+    Convert a list or array that may contain None values to a float32 numpy array.
+    None values are converted to NaN.
+
+    Args:
+        data: Input data that may contain None values
+
+    Returns:
+        A float32 numpy array with None values converted to NaN
+    """
+    array = np.array(data)
+    # Convert None values to NaN to handle missing values
+    if array.dtype == object:
+
+        def convert_value(x):
+            return np.nan if x is None else float(x)
+
+        array = np.vectorize(convert_value, otypes=[np.float32])(array)
+    else:
+        array = np.asarray(array, dtype=np.float32)
+    return cast(NDArray[np.float32], array)
+
+
+def calculate_anomaly_score_stats(
+    anomaly_scores: NDArray[np.float32] | Sequence[float] | None,
+) -> tuple[float | None, float | None, float | None]:
+    """
+    Calculate statistics (mean, median, variance) for anomaly scores.
+
+    Args:
+        anomaly_scores: Anomaly scores as a list, numpy array, or None
+
+    Returns:
+        A tuple of (mean, median, variance). All values are None if anomaly_scores is None.
+    """
+    if anomaly_scores is None:
+        return (None, None, None)
+
+    # Convert to numpy array if needed
+    if isinstance(anomaly_scores, list):
+        anomalies = np.array(anomaly_scores, dtype=np.float32)
+    else:
+        anomalies = anomaly_scores
+
+    return (
+        float(np.mean(anomalies)),
+        float(np.median(anomalies)),
+        float(np.var(anomalies)),
+    )
+
+
 class PRCurve(TypedDict):
     thresholds: list[float]
     precisions: list[float]
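For reference, a hypothetical REPL trace (not part of the diff) of the None-to-NaN path above, which hinges on numpy upgrading a None-bearing list to an object array:

import numpy as np

scores = [0.9, None, 0.4]
array = np.array(scores)          # dtype=object because of the None entry
assert array.dtype == object
converted = np.vectorize(
    lambda x: np.nan if x is None else float(x), otypes=[np.float32]
)(array)
print(converted)                  # [0.9 nan 0.4] as float32
print(np.isnan(converted))        # [False  True False]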
@@ -196,52 +258,106 @@ class ClassificationMetrics:
         )


-def calculate_classification_metrics(
-    expected_labels: list[int] | NDArray[np.int64],
-    logits: list[list[float]] | list[NDArray[np.float32]] | NDArray[np.float32],
-    anomaly_scores: list[float] | None = None,
-    average: Literal["micro", "macro", "weighted", "binary"] | None = None,
-    multi_class: Literal["ovr", "ovo"] = "ovr",
-    include_curves: bool = False,
-) -> ClassificationMetrics:
-    references = np.array(expected_labels)
+def convert_logits_to_probabilities(logits: NDArray[np.float32]) -> NDArray[np.float32]:
+    """
+    Convert logits to probability distributions.
+
+    This function handles multiple input formats:
+    - 1D arrays: Binary classification probabilities (must be between 0 and 1)
+    - 2D arrays: Multi-class logits or probabilities
+
+    For 2D inputs, the function automatically detects the format:
+    - If any values are <= 0: applies softmax (raw logits)
+    - If rows don't sum to 1: normalizes to probabilities
+    - If rows sum to 1: treats as already normalized probabilities

-    logits = np.array(logits)
+    Args:
+        logits: Input logits or probabilities as a float32 numpy array.
+            Can be 1D (binary) or 2D (multi-class). May contain NaN values.
+
+    Returns:
+        A 2D float32 numpy array of probabilities with shape (n_samples, n_classes).
+        Each row sums to 1.0 (except for rows with all NaN values).
+
+    Raises:
+        ValueError: If logits are not 1D or 2D
+        ValueError: If 1D logits are not between 0 and 1 (for binary classification)
+        ValueError: If 2D logits have fewer than 2 classes (use regression metrics instead)
+    """
     if logits.ndim == 1:
-        if (logits > 1).any() or (logits < 0).any():
+        # Binary classification: 1D probabilities
+        # Check non-NaN values only
+        valid_logits = logits[~np.isnan(logits)]
+        if len(valid_logits) > 0 and ((valid_logits > 1).any() or (valid_logits < 0).any()):
             raise ValueError("Logits must be between 0 and 1 for binary classification")
-        # convert 1D probabilities (binary) to 2D logits
-        logits = np.column_stack([1 - logits, logits])
-        probabilities = logits  # no need to convert to probabilities
+        # Convert 1D probabilities to 2D format: [1-p, p]
+        probabilities = cast(NDArray[np.float32], np.column_stack([1 - logits, logits]))
     elif logits.ndim == 2:
         if logits.shape[1] < 2:
             raise ValueError("Use a different metric function for regression tasks")
-        if not (logits > 0).all():
-            # convert logits to probabilities with softmax if necessary
-            probabilities = softmax(logits)
+        # Check if any non-NaN values are <= 0 (NaN-aware comparison)
+        valid_logits = logits[~np.isnan(logits)]
+        if len(valid_logits) > 0 and not (valid_logits > 0).all():
+            # Contains negative values or zeros: apply softmax (raw logits)
+            probabilities = cast(NDArray[np.float32], softmax(logits))
         elif not np.allclose(logits.sum(-1, keepdims=True), 1.0):
-            # convert logits to probabilities through normalization if necessary
-            probabilities = logits / logits.sum(-1, keepdims=True)
+            # Rows don't sum to 1: normalize to probabilities
+            row_sums = logits.sum(-1, keepdims=True)
+            # Guard against division by zero (can happen if all values in a row are 0 or NaN)
+            probabilities = cast(NDArray[np.float32], logits / np.where(row_sums > 0, row_sums, 1.0))
         else:
+            # Already normalized probabilities
             probabilities = logits
     else:
         raise ValueError("Logits must be 1 or 2 dimensional")

+    return probabilities
+
+
+def calculate_classification_metrics(
+    expected_labels: list[int] | NDArray[np.int64],
+    logits: list[list[float]] | list[NDArray[np.float32]] | NDArray[np.float32],
+    anomaly_scores: list[float] | None = None,
+    average: Literal["micro", "macro", "weighted", "binary"] | None = None,
+    multi_class: Literal["ovr", "ovo"] = "ovr",
+    include_curves: bool = False,
+) -> ClassificationMetrics:
+    references = np.array(expected_labels)
+
+    # Convert to numpy array, handling None values
+    logits = convert_to_float32_array(logits)
+
+    # Check if all logits are NaN (all predictions are None/NaN)
+    if np.all(np.isnan(logits)):
+        # Return placeholder metrics when all logits are invalid
+        return ClassificationMetrics(
+            coverage=0.0,
+            f1_score=0.0,
+            accuracy=0.0,
+            loss=None,
+            anomaly_score_mean=None,
+            anomaly_score_median=None,
+            anomaly_score_variance=None,
+            roc_auc=None,
+            pr_auc=None,
+            pr_curve=None,
+            roc_curve=None,
+        )
+
+    # Convert logits to probabilities
+    probabilities = convert_logits_to_probabilities(logits)
+
     predictions = np.argmax(probabilities, axis=-1)
     predictions[np.isnan(probabilities).all(axis=-1)] = -1  # set predictions to -1 for all nan logits

     num_classes_references = len(set(references))
     num_classes_predictions = len(set(predictions))
     num_none_predictions = np.isnan(probabilities).all(axis=-1).sum()
-    coverage = 1 - num_none_predictions / len(probabilities)
+    coverage = 1 - (num_none_predictions / len(probabilities) if len(probabilities) > 0 else 0)

     if average is None:
         average = "binary" if num_classes_references == 2 and num_none_predictions == 0 else "weighted"

-    anomaly_score_mean = float(np.mean(anomaly_scores)) if anomaly_scores else None
-    anomaly_score_median = float(np.median(anomaly_scores)) if anomaly_scores else None
-    anomaly_score_variance = float(np.var(anomaly_scores)) if anomaly_scores else None
-
     accuracy = sklearn.metrics.accuracy_score(references, predictions)
     f1 = sklearn.metrics.f1_score(references, predictions, average=average)
     # Ensure sklearn sees the full class set corresponding to probability columns
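A condensed, 2D-only sketch (not from the package) of the format detection that convert_logits_to_probabilities performs, using the same branch order:

import numpy as np

def to_probabilities(logits: np.ndarray) -> np.ndarray:
    valid = logits[~np.isnan(logits)]
    if len(valid) > 0 and not (valid > 0).all():
        # raw logits: apply softmax
        exps = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
        return exps / exps.sum(-1, keepdims=True)
    if not np.allclose(logits.sum(-1, keepdims=True), 1.0):
        # positive scores: normalize per row, guarding zero sums
        sums = logits.sum(-1, keepdims=True)
        return logits / np.where(sums > 0, sums, 1.0)
    return logits  # already normalized

print(to_probabilities(np.array([[-1.0, 2.0]])))  # softmax applied
print(to_probabilities(np.array([[2.0, 6.0]])))   # -> [[0.25, 0.75]]
print(to_probabilities(np.array([[0.3, 0.7]])))   # unchanged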
@@ -259,10 +375,12 @@
     if num_classes_references == num_classes_predictions and num_none_predictions == 0:
         # special case for binary classification: https://github.com/scikit-learn/scikit-learn/issues/20186
         if num_classes_references == 2:
-            roc_auc = sklearn.metrics.roc_auc_score(references, logits[:, 1])
-            roc_curve = calculate_roc_curve(references, logits[:, 1]) if include_curves else None
-            pr_auc = sklearn.metrics.average_precision_score(references, logits[:, 1])
-            pr_curve = calculate_pr_curve(references, logits[:, 1]) if include_curves else None
+            # Use probabilities[:, 1] which is guaranteed to be 2D
+            probabilities_positive = probabilities[:, 1]
+            roc_auc = sklearn.metrics.roc_auc_score(references, probabilities_positive)
+            roc_curve = calculate_roc_curve(references, probabilities_positive) if include_curves else None
+            pr_auc = sklearn.metrics.average_precision_score(references, probabilities_positive)
+            pr_curve = calculate_pr_curve(references, probabilities_positive) if include_curves else None
         else:
             roc_auc = sklearn.metrics.roc_auc_score(references, probabilities, multi_class=multi_class)
             roc_curve = None
@@ -274,6 +392,9 @@
         pr_curve = None
         roc_curve = None

+    # Calculate anomaly score statistics
+    anomaly_score_mean, anomaly_score_median, anomaly_score_variance = calculate_anomaly_score_stats(anomaly_scores)
+
     return ClassificationMetrics(
         coverage=coverage,
         accuracy=float(accuracy),
@@ -337,9 +458,9 @@ class RegressionMetrics:


 def calculate_regression_metrics(
-    expected_scores: NDArray[np.float32] | list[float],
-    predicted_scores: NDArray[np.float32] | list[float],
-    anomaly_scores: list[float] | None = None,
+    expected_scores: NDArray[np.float32] | Sequence[float],
+    predicted_scores: NDArray[np.float32] | Sequence[float | None],
+    anomaly_scores: NDArray[np.float32] | Sequence[float] | None = None,
 ) -> RegressionMetrics:
     """
     Calculate regression metrics for model evaluation.
@@ -354,23 +475,42 @@ def calculate_regression_metrics(

     Raises:
         ValueError: If predictions and references have different lengths
+        ValueError: If expected_scores contains None or NaN values
     """
-    references = np.array(expected_scores)
-    predictions = np.array(predicted_scores)
+    # Convert to numpy arrays, handling None values
+    references = convert_to_float32_array(expected_scores)
+    predictions = convert_to_float32_array(predicted_scores)

     if len(predictions) != len(references):
         raise ValueError("Predictions and references must have the same length")

-    anomaly_score_mean = float(np.mean(anomaly_scores)) if anomaly_scores else None
-    anomaly_score_median = float(np.median(anomaly_scores)) if anomaly_scores else None
-    anomaly_score_variance = float(np.var(anomaly_scores)) if anomaly_scores else None
+    # Validate that all expected_scores are non-None and non-NaN
+    if np.any(np.isnan(references)):
+        raise ValueError("expected_scores must not contain None or NaN values")
+
+    # If all of the predictions are None or NaN, return None for all metrics
+    if np.all(np.isnan(predictions)):
+        anomaly_score_mean, anomaly_score_median, anomaly_score_variance = calculate_anomaly_score_stats(anomaly_scores)
+        return RegressionMetrics(
+            coverage=0.0,
+            mse=0.0,
+            rmse=0.0,
+            mae=0.0,
+            r2=0.0,
+            explained_variance=0.0,
+            loss=0.0,
+            anomaly_score_mean=anomaly_score_mean,
+            anomaly_score_median=anomaly_score_median,
+            anomaly_score_variance=anomaly_score_variance,
+        )

-    none_prediction_mask = np.isnan(predictions)
-    num_none_predictions = none_prediction_mask.sum()
-    coverage = 1 - num_none_predictions / len(predictions)
+    # Filter out NaN values from predictions (expected_scores are already validated to be non-NaN)
+    valid_mask = ~np.isnan(predictions)
+    num_none_predictions = (~valid_mask).sum()
+    coverage = 1 - (num_none_predictions / len(predictions) if len(predictions) > 0 else 0)
     if num_none_predictions > 0:
-        references = references[~none_prediction_mask]
-        predictions = predictions[~none_prediction_mask]
+        references = references[valid_mask]
+        predictions = predictions[valid_mask]

     # Calculate core regression metrics
     mse = float(sklearn.metrics.mean_squared_error(references, predictions))
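Worked example (hypothetical values, not part of the diff) of the coverage and filtering arithmetic above:

import numpy as np

predictions = np.array([1.1, np.nan, 3.8, np.nan], dtype=np.float32)
references = np.array([1.0, 2.0, 4.0, 5.0], dtype=np.float32)

valid_mask = ~np.isnan(predictions)
coverage = 1 - (~valid_mask).sum() / len(predictions)  # 1 - 2/4 = 0.5
mse = float(np.mean((references[valid_mask] - predictions[valid_mask]) ** 2))
print(coverage, mse)  # 0.5, mean of (0.1**2, 0.2**2) = 0.025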
@@ -379,6 +519,9 @@ def calculate_regression_metrics(
     r2 = float(sklearn.metrics.r2_score(references, predictions))
     explained_var = float(sklearn.metrics.explained_variance_score(references, predictions))

+    # Calculate anomaly score statistics
+    anomaly_score_mean, anomaly_score_median, anomaly_score_variance = calculate_anomaly_score_stats(anomaly_scores)
+
     return RegressionMetrics(
         coverage=coverage,
         mse=mse,
orca_sdk/_shared/metrics_test.py
@@ -80,24 +80,36 @@ def test_multiclass_metrics_with_3_classes(
 def test_does_not_modify_logits_unless_necessary():
     logits = np.array([[0.1, 0.9], [0.2, 0.8], [0.7, 0.3], [0.8, 0.2]])
     expected_labels = [0, 1, 0, 1]
-    assert calculate_classification_metrics(expected_labels, logits).loss == sklearn.metrics.log_loss(
-        expected_labels, logits
+    loss = calculate_classification_metrics(expected_labels, logits).loss
+    assert loss is not None
+    assert np.allclose(
+        loss,
+        sklearn.metrics.log_loss(expected_labels, logits),
+        atol=1e-6,
     )


 def test_normalizes_logits_if_necessary():
     logits = np.array([[1.2, 3.9], [1.2, 5.8], [1.2, 2.7], [1.2, 1.3]])
     expected_labels = [0, 1, 0, 1]
-    assert calculate_classification_metrics(expected_labels, logits).loss == sklearn.metrics.log_loss(
-        expected_labels, logits / logits.sum(axis=1, keepdims=True)
+    loss = calculate_classification_metrics(expected_labels, logits).loss
+    assert loss is not None
+    assert np.allclose(
+        loss,
+        sklearn.metrics.log_loss(expected_labels, logits / logits.sum(axis=1, keepdims=True)),
+        atol=1e-6,
     )


 def test_softmaxes_logits_if_necessary():
     logits = np.array([[-1.2, 3.9], [1.2, -5.8], [1.2, 2.7], [1.2, 1.3]])
     expected_labels = [0, 1, 0, 1]
-    assert calculate_classification_metrics(expected_labels, logits).loss == sklearn.metrics.log_loss(
-        expected_labels, softmax(logits)
+    loss = calculate_classification_metrics(expected_labels, logits).loss
+    assert loss is not None
+    assert np.allclose(
+        loss,
+        sklearn.metrics.log_loss(expected_labels, softmax(logits)),
+        atol=1e-6,
     )


@@ -271,3 +283,84 @@ def test_regression_metrics_handles_nans():
     assert metrics.mae > 0.0
     assert 0.0 <= metrics.r2 <= 1.0
     assert 0.0 <= metrics.explained_variance <= 1.0
+
+
+def test_regression_metrics_handles_none_values():
+    # Test with lists containing None values
+    y_true = [1.0, 2.0, 3.0, 4.0, 5.0]
+    y_pred = [1.1, 1.9, None, 3.8, np.nan]
+
+    metrics = calculate_regression_metrics(y_true, y_pred)
+
+    # Coverage should be 0.6 (3 out of 5 predictions are valid)
+    # Positions with None/NaN predictions (indices 2 and 4) are filtered out
+    assert np.allclose(metrics.coverage, 0.6)
+
+    # Metrics should be calculated only on valid pairs (indices 0, 1, 3)
+    # Valid pairs: (1.0, 1.1), (2.0, 1.9), and (4.0, 3.8)
+    expected_mse = np.mean([(1.0 - 1.1) ** 2, (2.0 - 1.9) ** 2, (4.0 - 3.8) ** 2])
+    expected_mae = np.mean([abs(1.0 - 1.1), abs(2.0 - 1.9), abs(4.0 - 3.8)])
+
+    assert metrics.mse == pytest.approx(expected_mse)
+    assert metrics.mae == pytest.approx(expected_mae)
+    assert metrics.rmse == pytest.approx(np.sqrt(expected_mse))
+    assert 0.0 <= metrics.r2 <= 1.0
+    assert 0.0 <= metrics.explained_variance <= 1.0
+
+
+def test_regression_metrics_rejects_none_expected_scores():
+    # Test that None values in expected_scores are rejected
+    y_true = [1.0, 2.0, None, 4.0, 5.0]
+    y_pred = [1.1, 1.9, 3.2, 3.8, 5.1]
+
+    with pytest.raises(ValueError, match="expected_scores must not contain None or NaN values"):
+        calculate_regression_metrics(y_true, y_pred)
+
+
+def test_regression_metrics_rejects_nan_expected_scores():
+    # Test that NaN values in expected_scores are rejected
+    y_true = np.array([1.0, 2.0, np.nan, 4.0, 5.0], dtype=np.float32)
+    y_pred = np.array([1.1, 1.9, 3.2, 3.8, 5.1], dtype=np.float32)
+
+    with pytest.raises(ValueError, match="expected_scores must not contain None or NaN values"):
+        calculate_regression_metrics(y_true, y_pred)
+
+
+def test_regression_metrics_all_predictions_none():
+    # Test with all predictions being None
+    y_true = [1.0, 2.0, 3.0, 4.0, 5.0]
+    y_pred = [None, None, None, None, None]
+
+    metrics = calculate_regression_metrics(y_true, y_pred)
+
+    # When all predictions are None, coverage should be 0.0 and all metrics should be 0.0
+    assert metrics.coverage == 0.0
+    assert metrics.mse == 0.0
+    assert metrics.rmse == 0.0
+    assert metrics.mae == 0.0
+    assert metrics.r2 == 0.0
+    assert metrics.explained_variance == 0.0
+    assert metrics.loss == 0.0
+    assert metrics.anomaly_score_mean is None
+    assert metrics.anomaly_score_median is None
+    assert metrics.anomaly_score_variance is None
+
+
+def test_regression_metrics_all_predictions_nan():
+    # Test with all predictions being NaN
+    y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
+    y_pred = np.array([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float32)
+
+    metrics = calculate_regression_metrics(y_true, y_pred)
+
+    # When all predictions are NaN, coverage should be 0.0 and all metrics should be 0.0
+    assert metrics.coverage == 0.0
+    assert metrics.mse == 0.0
+    assert metrics.rmse == 0.0
+    assert metrics.mae == 0.0
+    assert metrics.r2 == 0.0
+    assert metrics.explained_variance == 0.0
+    assert metrics.loss == 0.0
+    assert metrics.anomaly_score_mean is None
+    assert metrics.anomaly_score_median is None
+    assert metrics.anomaly_score_variance is None
orca_sdk/_utils/data_parsing_test.py
@@ -33,7 +33,7 @@ def test_hf_dataset_from_torch_dict():
     # Then the HF dataset should be created successfully
     assert isinstance(hf_dataset, Dataset)
     assert len(hf_dataset) == len(dataset)
-    assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id"}
+    assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id", "partition_id"}


 class PytorchTupleDataset(TorchDataset):
orca_sdk/async_client.py
@@ -137,6 +137,8 @@ class ClassificationEvaluationRequest(TypedDict):
     telemetry_tags: NotRequired[list[str] | None]
     subsample: NotRequired[int | float | None]
     ignore_unlabeled: NotRequired[bool]
+    datasource_partition_column: NotRequired[str | None]
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]


 class CleanupResponse(TypedDict):
@@ -317,12 +319,16 @@ class ListMemoriesRequest(TypedDict):
     offset: NotRequired[int]
     limit: NotRequired[int]
     filters: NotRequired[list[FilterItem]]
+    partition_id: NotRequired[str | None]
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]


 class LookupRequest(TypedDict):
     query: list[str]
     count: NotRequired[int]
     prompt: NotRequired[str | None]
+    partition_id: NotRequired[str | list[str | None] | None]
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]


 class LookupScoreMetrics(TypedDict):
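Sketch of a request payload matching the extended LookupRequest shape (field values are hypothetical; the None entry presumably addresses the global partition, given the *_global filter modes):

lookup_request = {
    "query": ["reset my password"],
    "count": 5,
    "partition_id": ["tenant_a", None],
    "partition_filter_mode": "include_global",
}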
@@ -549,16 +555,7 @@ class PredictiveModelUpdate(TypedDict):


 PretrainedEmbeddingModelName = Literal[
-    "CLIP_BASE",
-    "GTE_BASE",
-    "CDE_SMALL",
-    "DISTILBERT",
-    "GTE_SMALL",
-    "MXBAI_LARGE",
-    "E5_LARGE",
-    "QWEN2_1_5B",
-    "BGE_BASE",
-    "GIST_LARGE",
+    "CLIP_BASE", "GTE_BASE", "CDE_SMALL", "DISTILBERT", "GTE_SMALL", "MXBAI_LARGE", "E5_LARGE", "BGE_BASE", "GIST_LARGE"
 ]


@@ -588,6 +585,8 @@ class RegressionEvaluationRequest(TypedDict):
     telemetry_tags: NotRequired[list[str] | None]
     subsample: NotRequired[int | float | None]
     ignore_unlabeled: NotRequired[bool]
+    datasource_partition_column: NotRequired[str | None]
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]


 class RegressionMetrics(TypedDict):
@@ -631,6 +630,8 @@ class RegressionPredictionRequest(TypedDict):
     use_lookup_cache: NotRequired[bool]
     consistency_level: NotRequired[Literal["Bounded", "Session", "Strong", "Eventual"] | None]
     ignore_unlabeled: NotRequired[bool]
+    partition_ids: NotRequired[str | list[str | None] | None]
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]


 class ScorePredictionMemoryLookup(TypedDict):
@@ -1165,7 +1166,14 @@ class BootstrapClassificationModelRequest(TypedDict):
     num_examples_per_label: NotRequired[int]


-class BootstrapClassificationModelResult(TypedDict):
+class BootstrapLabeledMemoryDataInput(TypedDict):
+    model_description: str
+    label_names: list[str]
+    initial_examples: NotRequired[list[LabeledExample]]
+    num_examples_per_label: NotRequired[int]
+
+
+class BootstrapLabeledMemoryDataResult(TypedDict):
     model_description: str
     label_names: list[str]
     model_name: str
@@ -1218,6 +1226,8 @@ class ClassificationPredictionRequest(TypedDict):
     use_lookup_cache: NotRequired[bool]
     consistency_level: NotRequired[Literal["Bounded", "Session", "Strong", "Eventual"] | None]
     ignore_unlabeled: NotRequired[bool]
+    partition_ids: NotRequired[str | list[str | None] | None]
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]


 class CloneMemorysetRequest(TypedDict):
@@ -1271,6 +1281,7 @@ class CreateMemorysetRequest(TypedDict):
     datasource_score_column: NotRequired[str | None]
     datasource_value_column: str
     datasource_source_id_column: NotRequired[str | None]
+    datasource_partition_id_column: NotRequired[str | None]
     remove_duplicates: NotRequired[bool]
     pretrained_embedding_model_name: NotRequired[PretrainedEmbeddingModelName | None]
     finetuned_embedding_model_name_or_id: NotRequired[str | None]
@@ -1541,6 +1552,7 @@ class MemorysetAnalysisRequest(TypedDict):
     batch_size: NotRequired[int]
     clear_metrics: NotRequired[bool]
     configs: MemorysetAnalysisConfigs
+    partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]


 class MemorysetConceptMetrics(TypedDict):
@@ -1666,7 +1678,7 @@ class BootstrapClassificationModelMeta(TypedDict):
     datasource_meta: DatasourceMetadata
     memoryset_meta: MemorysetMetadata
     model_meta: ClassificationModelMetadata
-    agent_output: BootstrapClassificationModelResult
+    agent_output: BootstrapLabeledMemoryDataResult


 class BootstrapClassificationModelResponse(TypedDict):
@@ -2556,7 +2568,7 @@ class OrcaAsyncClient(AsyncClient):
         timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
         extensions: RequestExtensions | None = None,
     ) -> BootstrapClassificationModelResponse:
-        """Get the status of a bootstrap classification model job"""
+        """Get the status of a bootstrap labeled memory data job"""
        pass

     async def GET(
@@ -3278,6 +3290,32 @@ class OrcaAsyncClient(AsyncClient):
         """Get row count from a specific datasource with optional filtering."""
         pass

+    @overload
+    async def POST(
+        self,
+        path: Literal["/datasource/bootstrap_memory_data"],
+        *,
+        params: None = None,
+        json: BootstrapLabeledMemoryDataInput,
+        data: None = None,
+        files: None = None,
+        content: None = None,
+        parse_as: Literal["json"] = "json",
+        headers: HeaderTypes | None = None,
+        cookies: CookieTypes | None = None,
+        auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT,
+        follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT,
+        timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
+        extensions: RequestExtensions | None = None,
+    ) -> BootstrapLabeledMemoryDataResult:
+        """
+        Bootstrap memory data using an AI agent.
+
+        This endpoint uses the bootstrap labeled memory data agent to generate
+        high-quality, diverse training examples for a classification model.
+        """
+        pass
+
     @overload
     async def POST(
         self,
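Hypothetical usage of the new overload (the import path and client construction are assumptions, not shown in this diff; the json payload follows BootstrapLabeledMemoryDataInput):

import asyncio

from orca_sdk.async_client import OrcaAsyncClient  # assumed import path

async def main() -> None:
    client = OrcaAsyncClient()  # constructor arguments omitted; see client docs
    result = await client.POST(
        "/datasource/bootstrap_memory_data",
        json={
            "model_description": "Classify support tickets by urgency",
            "label_names": ["low", "high"],
            "num_examples_per_label": 20,
        },
    )
    print(result["model_name"])

# asyncio.run(main())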
@@ -3526,7 +3564,7 @@ class OrcaAsyncClient(AsyncClient):
         """
         Bootstrap a classification model by creating a memoryset with generated memories and a classification model.

-        This endpoint uses the bootstrap_classification_model agent to generate:
+        This endpoint uses the bootstrap_labeled_memory_data agent to generate:
         1. Memoryset configuration with appropriate settings
         2. Model configuration with optimal parameters
         3. High-quality training memories for each label