orca-sdk 0.0.102__py3-none-any.whl → 0.0.104__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,7 @@
  This module contains metrics for usage with the Hugging Face Trainer.

  IMPORTANT:
- - This is a shared file between OrcaLib and the Orca SDK.
+ - This is a shared file between OrcaLib and the OrcaSDK.
  - Please ensure that it does not have any dependencies on the OrcaLib code.
  - Make sure to edit this file in orcalib/shared and NOT in orca_sdk, since it will be overwritten there.

@@ -147,13 +147,16 @@ def calculate_roc_curve(

  @dataclass
  class ClassificationMetrics:
+     coverage: float
+     """Percentage of predictions that are not none"""
+
      f1_score: float
      """F1 score of the predictions"""

      accuracy: float
      """Accuracy of the predictions"""

-     loss: float
+     loss: float | None
      """Cross-entropy loss of the logits"""

      anomaly_score_mean: float | None = None
@@ -225,12 +228,15 @@ def calculate_classification_metrics(
          raise ValueError("Logits must be 1 or 2 dimensional")

      predictions = np.argmax(probabilities, axis=-1)
+     predictions[np.isnan(probabilities).all(axis=-1)] = -1  # set predictions to -1 for all nan logits

      num_classes_references = len(set(references))
      num_classes_predictions = len(set(predictions))
+     num_none_predictions = np.isnan(probabilities).all(axis=-1).sum()
+     coverage = 1 - num_none_predictions / len(probabilities)

      if average is None:
-         average = "binary" if num_classes_references == 2 else "weighted"
+         average = "binary" if num_classes_references == 2 and num_none_predictions == 0 else "weighted"

      anomaly_score_mean = float(np.mean(anomaly_scores)) if anomaly_scores else None
      anomaly_score_median = float(np.median(anomaly_scores)) if anomaly_scores else None
@@ -240,13 +246,17 @@ def calculate_classification_metrics(
      f1 = sklearn.metrics.f1_score(references, predictions, average=average)
      # Ensure sklearn sees the full class set corresponding to probability columns
      # to avoid errors when y_true does not contain all classes.
-     loss = sklearn.metrics.log_loss(
-         references,
-         probabilities,
-         labels=list(range(probabilities.shape[1])),
+     loss = (
+         sklearn.metrics.log_loss(
+             references,
+             probabilities,
+             labels=list(range(probabilities.shape[1])),
+         )
+         if num_none_predictions == 0
+         else None
      )

-     if num_classes_references == num_classes_predictions:
+     if num_classes_references == num_classes_predictions and num_none_predictions == 0:
          # special case for binary classification: https://github.com/scikit-learn/scikit-learn/issues/20186
          if num_classes_references == 2:
              roc_auc = sklearn.metrics.roc_auc_score(references, logits[:, 1])
@@ -265,9 +275,10 @@ def calculate_classification_metrics(
          roc_curve = None

      return ClassificationMetrics(
+         coverage=coverage,
          accuracy=float(accuracy),
          f1_score=float(f1),
-         loss=float(loss),
+         loss=float(loss) if loss is not None else None,
          anomaly_score_mean=anomaly_score_mean,
          anomaly_score_median=anomaly_score_median,
          anomaly_score_variance=anomaly_score_variance,
@@ -280,6 +291,9 @@ def calculate_classification_metrics(

  @dataclass
  class RegressionMetrics:
+     coverage: float
+     """Percentage of predictions that are not none"""
+
      mse: float
      """Mean squared error of the predictions"""

@@ -351,6 +365,13 @@ def calculate_regression_metrics(
      anomaly_score_median = float(np.median(anomaly_scores)) if anomaly_scores else None
      anomaly_score_variance = float(np.var(anomaly_scores)) if anomaly_scores else None

+     none_prediction_mask = np.isnan(predictions)
+     num_none_predictions = none_prediction_mask.sum()
+     coverage = 1 - num_none_predictions / len(predictions)
+     if num_none_predictions > 0:
+         references = references[~none_prediction_mask]
+         predictions = predictions[~none_prediction_mask]
+
      # Calculate core regression metrics
      mse = float(sklearn.metrics.mean_squared_error(references, predictions))
      rmse = float(np.sqrt(mse))
@@ -359,6 +380,7 @@ def calculate_regression_metrics(
      explained_var = float(sklearn.metrics.explained_variance_score(references, predictions))

      return RegressionMetrics(
+         coverage=coverage,
          mse=mse,
          rmse=rmse,
          mae=mae,
@@ -1,6 +1,6 @@
  """
  IMPORTANT:
- - This is a shared file between OrcaLib and the Orca SDK.
+ - This is a shared file between OrcaLib and the OrcaSDK.
  - Please ensure that it does not have any dependencies on the OrcaLib code.
  - Make sure to edit this file in orcalib/shared and NOT in orca_sdk, since it will be overwritten there.
  """
@@ -101,6 +101,20 @@ def test_softmaxes_logits_if_necessary():
      )


+ def test_handles_nan_logits():
+     logits = np.array([[np.nan, np.nan], [np.nan, np.nan], [0.1, 0.9], [0.2, 0.8]])
+     expected_labels = [0, 1, 0, 1]
+     metrics = calculate_classification_metrics(expected_labels, logits)
+     assert metrics.loss is None
+     assert metrics.accuracy == 0.25
+     assert metrics.f1_score == 0.25
+     assert metrics.roc_auc is None
+     assert metrics.pr_auc is None
+     assert metrics.pr_curve is None
+     assert metrics.roc_curve is None
+     assert metrics.coverage == 0.5
+
+
  def test_precision_recall_curve():
      y_true = np.array([0, 1, 1, 0, 1])
      y_score = np.array([0.1, 0.9, 0.8, 0.6, 0.2])
@@ -153,7 +167,7 @@ def test_log_loss_handles_missing_classes_in_y_true():
      metrics = calculate_classification_metrics(y_true, y_score)
      expected_loss = sklearn.metrics.log_loss(y_true, y_score, labels=[0, 1, 2])

-     assert np.isfinite(metrics.loss)
+     assert metrics.loss is not None
      assert np.allclose(metrics.loss, expected_loss)

@@ -194,8 +208,6 @@ def test_roc_curve_max_length():


  # Regression Metrics Tests
-
-
  def test_perfect_regression_predictions():
      y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
      y_pred = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
@@ -245,3 +257,17 @@ def test_regression_metrics_with_anomaly_scores():
      assert metrics.anomaly_score_mean == pytest.approx(np.mean(anomaly_scores))
      assert metrics.anomaly_score_median == pytest.approx(np.median(anomaly_scores))
      assert metrics.anomaly_score_variance == pytest.approx(np.var(anomaly_scores))
+
+
+ def test_regression_metrics_handles_nans():
+     y_true = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+     y_pred = np.array([1.1, 1.9, np.nan], dtype=np.float32)
+
+     metrics = calculate_regression_metrics(y_true, y_pred)
+
+     assert np.allclose(metrics.coverage, 0.6666666666666666)
+     assert metrics.mse > 0.0
+     assert metrics.rmse > 0.0
+     assert metrics.mae > 0.0
+     assert 0.0 <= metrics.r2 <= 1.0
+     assert 0.0 <= metrics.explained_variance <= 1.0
orca_sdk/_utils/auth.py CHANGED
@@ -53,7 +53,7 @@ def _delete_org(org_id: str) -> None:
  def _authenticate_local_api(org_id: str = _DEFAULT_ORG_ID, api_key_name: str = "local") -> None:
      """Connect to the local API at http://localhost:1584/ and authenticate with a new API key"""
      _delete_api_key(org_id, api_key_name, if_not_exists="ignore")
-     OrcaCredentials.set_base_url("http://localhost:1584")
+     OrcaCredentials.set_api_url("http://localhost:1584")
      OrcaCredentials.set_api_key(_create_api_key(org_id, api_key_name))
      logging.info(f"Authenticated against local API at 'http://localhost:1584' with '{api_key_name}' API key")

@@ -77,7 +77,11 @@ def inspect_prediction_result(prediction_result: PredictionBase):
      dropdown = gr.Dropdown(
          choices=[f"{label_name} ({i})" for i, label_name in enumerate(label_names)],
          label="Label",
-         value=f"{label_names[mem_lookup.label]} ({mem_lookup.label})",
+         value=(
+             f"{label_names[mem_lookup.label]} ({mem_lookup.label})"
+             if mem_lookup.label is not None
+             else "None"
+         ),
          interactive=True,
          container=False,
      )
@@ -343,6 +343,7 @@ class ClassificationModel:
          save_telemetry: TelemetryMode = "on",
          prompt: str | None = None,
          use_lookup_cache: bool = True,
+         timeout_seconds: int = 10,
      ) -> list[ClassificationPrediction]:
          pass

@@ -356,6 +357,7 @@ class ClassificationModel:
          save_telemetry: TelemetryMode = "on",
          prompt: str | None = None,
          use_lookup_cache: bool = True,
+         timeout_seconds: int = 10,
      ) -> ClassificationPrediction:
          pass

@@ -368,6 +370,7 @@ class ClassificationModel:
          save_telemetry: TelemetryMode = "on",
          prompt: str | None = None,
          use_lookup_cache: bool = True,
+         timeout_seconds: int = 10,
      ) -> list[ClassificationPrediction] | ClassificationPrediction:
          """
          Predict label(s) for the given input value(s) grounded in similar memories
@@ -384,10 +387,16 @@ class ClassificationModel:
                  * `"sync"`: Save telemetry synchronously
                  * `"async"`: Save telemetry asynchronously
              prompt: Optional prompt to use for instruction-tuned embedding models
+             use_lookup_cache: Whether to use cached lookup results for faster predictions
+             timeout_seconds: Timeout in seconds for the request, defaults to 10 seconds

          Returns:
              Label prediction or list of label predictions

+         Raises:
+             ValueError: If timeout_seconds is not a positive integer
+             TimeoutError: If the request times out after the specified duration
+
          Examples:
              Predict the label for a single value:
              >>> prediction = model.predict("I am happy", tags={"test"})
@@ -405,6 +414,9 @@ class ClassificationModel:
              ClassificationPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy' })
          """

+         if timeout_seconds <= 0:
+             raise ValueError("timeout_seconds must be a positive integer")
+
          parsed_filters = [
              _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter for filter in filters
          ]
@@ -437,6 +449,7 @@ class ClassificationModel:
                  "prompt": prompt,
                  "use_lookup_cache": use_lookup_cache,
              },
+             timeout=timeout_seconds,
          )

          if telemetry_on and any(p["prediction_id"] is None for p in response):
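
A hedged usage sketch of the new timeout_seconds parameter on ClassificationModel.predict, assuming an already-created model instance (named model here for illustration). Per the validation added above, non-positive values raise ValueError, and requests exceeding the timeout raise TimeoutError.

# model is an existing ClassificationModel instance (illustrative name).
predictions = model.predict(
    ["Do you love soup?", "Are cats cute?"],
    save_telemetry="off",
    timeout_seconds=5,  # request-level timeout in seconds; defaults to 10
)

try:
    model.predict("I am happy", timeout_seconds=0)
except ValueError:
    pass  # timeout_seconds must be a positive integer
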
@@ -557,7 +570,19 @@ class ClassificationModel:
                  params={"model_name_or_id": self.id, "task_id": response["task_id"]},
              )
              assert res["result"] is not None
-             return ClassificationMetrics(**res["result"])
+             return ClassificationMetrics(
+                 coverage=res["result"].get("coverage"),
+                 f1_score=res["result"].get("f1_score"),
+                 accuracy=res["result"].get("accuracy"),
+                 loss=res["result"].get("loss"),
+                 anomaly_score_mean=res["result"].get("anomaly_score_mean"),
+                 anomaly_score_median=res["result"].get("anomaly_score_median"),
+                 anomaly_score_variance=res["result"].get("anomaly_score_variance"),
+                 roc_auc=res["result"].get("roc_auc"),
+                 pr_auc=res["result"].get("pr_auc"),
+                 pr_curve=res["result"].get("pr_curve"),
+                 roc_curve=res["result"].get("roc_curve"),
+             )

          job = Job(response["task_id"], get_value)
          return job if background else job.result()
@@ -571,6 +596,12 @@ class ClassificationModel:
          tags: set[str],
          batch_size: int,
      ) -> ClassificationMetrics:
+         if len(dataset) == 0:
+             raise ValueError("Evaluation dataset cannot be empty")
+
+         if any(x is None for x in dataset[label_column]):
+             raise ValueError("Evaluation dataset cannot contain None values in the label column")
+
          predictions = [
              prediction
              for i in range(0, len(dataset), batch_size)
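
A sketch of the stricter evaluation input checks added above, assuming a hypothetical dataset with a None label; the column names and the model variable are illustrative, and the real label column depends on how the model was configured.

from datasets import Dataset

# Hypothetical dataset with a None value in the label column.
bad_dataset = Dataset.from_dict({"value": ["I am happy"], "label": [None]})

try:
    model.evaluate(bad_dataset, tags={"test"})  # model: existing ClassificationModel
except ValueError as err:
    print(err)  # "Evaluation dataset cannot contain None values in the label column"
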
@@ -3,13 +3,14 @@ from uuid import uuid4

  import numpy as np
  import pytest
- from datasets.arrow_dataset import Dataset
+ from datasets import Dataset

  from .classification_model import ClassificationMetrics, ClassificationModel
  from .conftest import skip_in_ci
  from .datasource import Datasource
  from .embedding_model import PretrainedEmbeddingModel
  from .memoryset import LabeledMemoryset
+ from .telemetry import ClassificationPrediction


  def test_create_model(classification_model: ClassificationModel, readonly_memoryset: LabeledMemoryset):
@@ -193,6 +194,16 @@ def test_evaluate(classification_model, eval_datasource: Datasource, eval_datase
      assert np.allclose(result.roc_curve["true_positive_rates"], [1.0, 0.5, 0.5, 0.0])


+ def test_evaluate_datasource_with_nones_raises_error(classification_model: ClassificationModel, datasource: Datasource):
+     with pytest.raises(ValueError):
+         classification_model.evaluate(datasource, record_predictions=True, tags={"test"})
+
+
+ def test_evaluate_dataset_with_nones_raises_error(classification_model: ClassificationModel, hf_dataset: Dataset):
+     with pytest.raises(ValueError):
+         classification_model.evaluate(hf_dataset, record_predictions=True, tags={"test"})
+
+
  def test_evaluate_with_telemetry(classification_model: ClassificationModel, eval_dataset: Dataset):
      result = classification_model.evaluate(eval_dataset, record_predictions=True, tags={"test"})
      assert result is not None
@@ -223,6 +234,13 @@ def test_predict(classification_model: ClassificationModel, label_names: list[st
      assert predictions[1].logits[0] < predictions[1].logits[1]


+ def test_classification_prediction_has_no_label(classification_model: ClassificationModel):
+     """Ensure optional score is None for classification predictions."""
+     prediction = classification_model.predict("Do you want to go to the beach?")
+     assert isinstance(prediction, ClassificationPrediction)
+     assert prediction.label is None
+
+
  def test_predict_disable_telemetry(classification_model: ClassificationModel, label_names: list[str]):
      predictions = classification_model.predict(["Do you love soup?", "Are cats cute?"], save_telemetry="off")
      assert len(predictions) == 2