orca-sdk 0.0.103__py3-none-any.whl → 0.0.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/_shared/metrics.py +31 -9
- orca_sdk/_shared/metrics_test.py +30 -4
- orca_sdk/_utils/prediction_result_ui.py +5 -1
- orca_sdk/classification_model.py +32 -1
- orca_sdk/classification_model_test.py +18 -0
- orca_sdk/client.py +297 -257
- orca_sdk/conftest.py +12 -0
- orca_sdk/datasource.py +1 -1
- orca_sdk/datasource_test.py +6 -1
- orca_sdk/embedding_model.py +28 -1
- orca_sdk/job_test.py +20 -10
- orca_sdk/memoryset.py +9 -23
- orca_sdk/memoryset_test.py +3 -2
- orca_sdk/regression_model.py +29 -1
- orca_sdk/regression_model_test.py +18 -1
- {orca_sdk-0.0.103.dist-info → orca_sdk-0.0.104.dist-info}/METADATA +14 -14
- {orca_sdk-0.0.103.dist-info → orca_sdk-0.0.104.dist-info}/RECORD +18 -18
- {orca_sdk-0.0.103.dist-info → orca_sdk-0.0.104.dist-info}/WHEEL +1 -1
orca_sdk/_shared/metrics.py
CHANGED
@@ -2,7 +2,7 @@
 This module contains metrics for usage with the Hugging Face Trainer.

 IMPORTANT:
-- This is a shared file between OrcaLib and the
+- This is a shared file between OrcaLib and the OrcaSDK.
 - Please ensure that it does not have any dependencies on the OrcaLib code.
 - Make sure to edit this file in orcalib/shared and NOT in orca_sdk, since it will be overwritten there.

@@ -147,13 +147,16 @@ def calculate_roc_curve(

 @dataclass
 class ClassificationMetrics:
+    coverage: float
+    """Percentage of predictions that are not none"""
+
     f1_score: float
     """F1 score of the predictions"""

     accuracy: float
     """Accuracy of the predictions"""

-    loss: float
+    loss: float | None
     """Cross-entropy loss of the logits"""

     anomaly_score_mean: float | None = None
@@ -225,12 +228,15 @@ def calculate_classification_metrics(
         raise ValueError("Logits must be 1 or 2 dimensional")

     predictions = np.argmax(probabilities, axis=-1)
+    predictions[np.isnan(probabilities).all(axis=-1)] = -1  # set predictions to -1 for all nan logits

     num_classes_references = len(set(references))
     num_classes_predictions = len(set(predictions))
+    num_none_predictions = np.isnan(probabilities).all(axis=-1).sum()
+    coverage = 1 - num_none_predictions / len(probabilities)

     if average is None:
-        average = "binary" if num_classes_references == 2 else "weighted"
+        average = "binary" if num_classes_references == 2 and num_none_predictions == 0 else "weighted"

     anomaly_score_mean = float(np.mean(anomaly_scores)) if anomaly_scores else None
     anomaly_score_median = float(np.median(anomaly_scores)) if anomaly_scores else None
@@ -240,13 +246,17 @@ def calculate_classification_metrics(
     f1 = sklearn.metrics.f1_score(references, predictions, average=average)
     # Ensure sklearn sees the full class set corresponding to probability columns
     # to avoid errors when y_true does not contain all classes.
-    loss = sklearn.metrics.log_loss(
-        references,
-        probabilities,
-        labels=list(range(probabilities.shape[1])),
+    loss = (
+        sklearn.metrics.log_loss(
+            references,
+            probabilities,
+            labels=list(range(probabilities.shape[1])),
+        )
+        if num_none_predictions == 0
+        else None
     )

-    if num_classes_references == num_classes_predictions:
+    if num_classes_references == num_classes_predictions and num_none_predictions == 0:
         # special case for binary classification: https://github.com/scikit-learn/scikit-learn/issues/20186
         if num_classes_references == 2:
             roc_auc = sklearn.metrics.roc_auc_score(references, logits[:, 1])
@@ -265,9 +275,10 @@ def calculate_classification_metrics(
         roc_curve = None

     return ClassificationMetrics(
+        coverage=coverage,
         accuracy=float(accuracy),
         f1_score=float(f1),
-        loss=float(loss),
+        loss=float(loss) if loss is not None else None,
         anomaly_score_mean=anomaly_score_mean,
         anomaly_score_median=anomaly_score_median,
         anomaly_score_variance=anomaly_score_variance,
@@ -280,6 +291,9 @@ def calculate_classification_metrics(

 @dataclass
 class RegressionMetrics:
+    coverage: float
+    """Percentage of predictions that are not none"""
+
     mse: float
     """Mean squared error of the predictions"""

@@ -351,6 +365,13 @@ def calculate_regression_metrics(
     anomaly_score_median = float(np.median(anomaly_scores)) if anomaly_scores else None
     anomaly_score_variance = float(np.var(anomaly_scores)) if anomaly_scores else None

+    none_prediction_mask = np.isnan(predictions)
+    num_none_predictions = none_prediction_mask.sum()
+    coverage = 1 - num_none_predictions / len(predictions)
+    if num_none_predictions > 0:
+        references = references[~none_prediction_mask]
+        predictions = predictions[~none_prediction_mask]
+
     # Calculate core regression metrics
     mse = float(sklearn.metrics.mean_squared_error(references, predictions))
     rmse = float(np.sqrt(mse))
@@ -359,6 +380,7 @@ def calculate_regression_metrics(
     explained_var = float(sklearn.metrics.explained_variance_score(references, predictions))

     return RegressionMetrics(
+        coverage=coverage,
         mse=mse,
         rmse=rmse,
         mae=mae,
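A minimal sketch (not part of the package diff) of how the new coverage field and the NaN handling above behave, assuming the shared module is importable as orca_sdk._shared.metrics; the values mirror test_handles_nan_logits in the test file below.

    import numpy as np
    from orca_sdk._shared.metrics import calculate_classification_metrics

    # Two of the four rows have all-NaN logits, so they count as "no prediction".
    logits = np.array([[np.nan, np.nan], [np.nan, np.nan], [0.1, 0.9], [0.2, 0.8]])
    metrics = calculate_classification_metrics([0, 1, 0, 1], logits)
    print(metrics.coverage)  # 0.5 -> half of the rows produced a usable prediction
    print(metrics.loss)      # None -> log loss is skipped when any row is all NaN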
orca_sdk/_shared/metrics_test.py
CHANGED
@@ -1,6 +1,6 @@
 """
 IMPORTANT:
-- This is a shared file between OrcaLib and the
+- This is a shared file between OrcaLib and the OrcaSDK.
 - Please ensure that it does not have any dependencies on the OrcaLib code.
 - Make sure to edit this file in orcalib/shared and NOT in orca_sdk, since it will be overwritten there.
 """
@@ -101,6 +101,20 @@ def test_softmaxes_logits_if_necessary():
     )


+def test_handles_nan_logits():
+    logits = np.array([[np.nan, np.nan], [np.nan, np.nan], [0.1, 0.9], [0.2, 0.8]])
+    expected_labels = [0, 1, 0, 1]
+    metrics = calculate_classification_metrics(expected_labels, logits)
+    assert metrics.loss is None
+    assert metrics.accuracy == 0.25
+    assert metrics.f1_score == 0.25
+    assert metrics.roc_auc is None
+    assert metrics.pr_auc is None
+    assert metrics.pr_curve is None
+    assert metrics.roc_curve is None
+    assert metrics.coverage == 0.5
+
+
 def test_precision_recall_curve():
     y_true = np.array([0, 1, 1, 0, 1])
     y_score = np.array([0.1, 0.9, 0.8, 0.6, 0.2])
@@ -153,7 +167,7 @@ def test_log_loss_handles_missing_classes_in_y_true():
     metrics = calculate_classification_metrics(y_true, y_score)
     expected_loss = sklearn.metrics.log_loss(y_true, y_score, labels=[0, 1, 2])

-    assert
+    assert metrics.loss is not None
     assert np.allclose(metrics.loss, expected_loss)


@@ -194,8 +208,6 @@ def test_roc_curve_max_length():


 # Regression Metrics Tests
-
-
 def test_perfect_regression_predictions():
     y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
     y_pred = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
@@ -245,3 +257,17 @@ def test_regression_metrics_with_anomaly_scores():
     assert metrics.anomaly_score_mean == pytest.approx(np.mean(anomaly_scores))
     assert metrics.anomaly_score_median == pytest.approx(np.median(anomaly_scores))
     assert metrics.anomaly_score_variance == pytest.approx(np.var(anomaly_scores))
+
+
+def test_regression_metrics_handles_nans():
+    y_true = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+    y_pred = np.array([1.1, 1.9, np.nan], dtype=np.float32)
+
+    metrics = calculate_regression_metrics(y_true, y_pred)
+
+    assert np.allclose(metrics.coverage, 0.6666666666666666)
+    assert metrics.mse > 0.0
+    assert metrics.rmse > 0.0
+    assert metrics.mae > 0.0
+    assert 0.0 <= metrics.r2 <= 1.0
+    assert 0.0 <= metrics.explained_variance <= 1.0
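A short sketch (not part of the package diff) of the regression-side behavior added above, assuming the same import path: NaN predictions are masked out before MSE/RMSE/MAE are computed, and coverage reports the surviving fraction, mirroring test_regression_metrics_handles_nans.

    import numpy as np
    from orca_sdk._shared.metrics import calculate_regression_metrics

    y_true = np.array([1.0, 2.0, 3.0], dtype=np.float32)
    y_pred = np.array([1.1, 1.9, np.nan], dtype=np.float32)  # one missing prediction
    metrics = calculate_regression_metrics(y_true, y_pred)
    print(metrics.coverage)  # ~0.667 -> 2 of 3 predictions were not NaN
    print(metrics.mse)       # computed only over the two non-NaN pairs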
orca_sdk/_utils/prediction_result_ui.py
CHANGED
@@ -77,7 +77,11 @@ def inspect_prediction_result(prediction_result: PredictionBase):
     dropdown = gr.Dropdown(
         choices=[f"{label_name} ({i})" for i, label_name in enumerate(label_names)],
         label="Label",
-        value=f"{label_names[mem_lookup.label]} ({mem_lookup.label})",
+        value=(
+            f"{label_names[mem_lookup.label]} ({mem_lookup.label})"
+            if mem_lookup.label is not None
+            else "None"
+        ),
         interactive=True,
         container=False,
     )
orca_sdk/classification_model.py
CHANGED
@@ -343,6 +343,7 @@ class ClassificationModel:
         save_telemetry: TelemetryMode = "on",
         prompt: str | None = None,
         use_lookup_cache: bool = True,
+        timeout_seconds: int = 10,
     ) -> list[ClassificationPrediction]:
         pass

@@ -356,6 +357,7 @@ class ClassificationModel:
         save_telemetry: TelemetryMode = "on",
         prompt: str | None = None,
         use_lookup_cache: bool = True,
+        timeout_seconds: int = 10,
     ) -> ClassificationPrediction:
         pass

@@ -368,6 +370,7 @@ class ClassificationModel:
         save_telemetry: TelemetryMode = "on",
         prompt: str | None = None,
         use_lookup_cache: bool = True,
+        timeout_seconds: int = 10,
     ) -> list[ClassificationPrediction] | ClassificationPrediction:
         """
         Predict label(s) for the given input value(s) grounded in similar memories
@@ -384,10 +387,16 @@ class ClassificationModel:
                 * `"sync"`: Save telemetry synchronously
                 * `"async"`: Save telemetry asynchronously
             prompt: Optional prompt to use for instruction-tuned embedding models
+            use_lookup_cache: Whether to use cached lookup results for faster predictions
+            timeout_seconds: Timeout in seconds for the request, defaults to 10 seconds

         Returns:
             Label prediction or list of label predictions

+        Raises:
+            ValueError: If timeout_seconds is not a positive integer
+            TimeoutError: If the request times out after the specified duration
+
         Examples:
             Predict the label for a single value:
             >>> prediction = model.predict("I am happy", tags={"test"})
@@ -405,6 +414,9 @@ class ClassificationModel:
             ClassificationPrediction({label: <positive: 1>, confidence: 0.95, anomaly_score: 0.1, input_value: 'I am happy' })
         """

+        if timeout_seconds <= 0:
+            raise ValueError("timeout_seconds must be a positive integer")
+
         parsed_filters = [
             _parse_filter_item_from_tuple(filter) if isinstance(filter, tuple) else filter for filter in filters
         ]
@@ -437,6 +449,7 @@ class ClassificationModel:
                "prompt": prompt,
                "use_lookup_cache": use_lookup_cache,
            },
+           timeout=timeout_seconds,
        )

         if telemetry_on and any(p["prediction_id"] is None for p in response):
@@ -557,7 +570,19 @@ class ClassificationModel:
                 params={"model_name_or_id": self.id, "task_id": response["task_id"]},
             )
             assert res["result"] is not None
-            return ClassificationMetrics(
+            return ClassificationMetrics(
+                coverage=res["result"].get("coverage"),
+                f1_score=res["result"].get("f1_score"),
+                accuracy=res["result"].get("accuracy"),
+                loss=res["result"].get("loss"),
+                anomaly_score_mean=res["result"].get("anomaly_score_mean"),
+                anomaly_score_median=res["result"].get("anomaly_score_median"),
+                anomaly_score_variance=res["result"].get("anomaly_score_variance"),
+                roc_auc=res["result"].get("roc_auc"),
+                pr_auc=res["result"].get("pr_auc"),
+                pr_curve=res["result"].get("pr_curve"),
+                roc_curve=res["result"].get("roc_curve"),
+            )

         job = Job(response["task_id"], get_value)
         return job if background else job.result()
@@ -571,6 +596,12 @@ class ClassificationModel:
         tags: set[str],
         batch_size: int,
     ) -> ClassificationMetrics:
+        if len(dataset) == 0:
+            raise ValueError("Evaluation dataset cannot be empty")
+
+        if any(x is None for x in dataset[label_column]):
+            raise ValueError("Evaluation dataset cannot contain None values in the label column")
+
         predictions = [
             prediction
             for i in range(0, len(dataset), batch_size)
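A hedged usage sketch (not part of the package diff) of the new timeout_seconds argument on ClassificationModel.predict, assuming an existing model handle as in the docstring examples above.

    # Defaults to 10 seconds; pass a larger value for slow batches.
    prediction = model.predict("I am happy", tags={"test"}, timeout_seconds=30)

    # Non-positive values are rejected before any request is made.
    model.predict("I am happy", timeout_seconds=0)  # raises ValueError
    # If the request does not finish within the window, a TimeoutError is raised.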
orca_sdk/classification_model_test.py
CHANGED
@@ -10,6 +10,7 @@ from .conftest import skip_in_ci
 from .datasource import Datasource
 from .embedding_model import PretrainedEmbeddingModel
 from .memoryset import LabeledMemoryset
+from .telemetry import ClassificationPrediction


 def test_create_model(classification_model: ClassificationModel, readonly_memoryset: LabeledMemoryset):
@@ -193,6 +194,16 @@ def test_evaluate(classification_model, eval_datasource: Datasource, eval_datase
     assert np.allclose(result.roc_curve["true_positive_rates"], [1.0, 0.5, 0.5, 0.0])


+def test_evaluate_datasource_with_nones_raises_error(classification_model: ClassificationModel, datasource: Datasource):
+    with pytest.raises(ValueError):
+        classification_model.evaluate(datasource, record_predictions=True, tags={"test"})
+
+
+def test_evaluate_dataset_with_nones_raises_error(classification_model: ClassificationModel, hf_dataset: Dataset):
+    with pytest.raises(ValueError):
+        classification_model.evaluate(hf_dataset, record_predictions=True, tags={"test"})
+
+
 def test_evaluate_with_telemetry(classification_model: ClassificationModel, eval_dataset: Dataset):
     result = classification_model.evaluate(eval_dataset, record_predictions=True, tags={"test"})
     assert result is not None
@@ -223,6 +234,13 @@ def test_predict(classification_model: ClassificationModel, label_names: list[st
     assert predictions[1].logits[0] < predictions[1].logits[1]


+def test_classification_prediction_has_no_label(classification_model: ClassificationModel):
+    """Ensure optional score is None for classification predictions."""
+    prediction = classification_model.predict("Do you want to go to the beach?")
+    assert isinstance(prediction, ClassificationPrediction)
+    assert prediction.label is None
+
+
 def test_predict_disable_telemetry(classification_model: ClassificationModel, label_names: list[str]):
     predictions = classification_model.predict(["Do you love soup?", "Are cats cute?"], save_telemetry="off")
     assert len(predictions) == 2