orca-sdk 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/__init__.py +3 -3
- orca_sdk/_utils/auth.py +2 -3
- orca_sdk/_utils/common.py +24 -1
- orca_sdk/_utils/torch_parsing.py +77 -0
- orca_sdk/_utils/torch_parsing_test.py +142 -0
- orca_sdk/async_client.py +156 -4
- orca_sdk/classification_model.py +202 -65
- orca_sdk/classification_model_test.py +16 -3
- orca_sdk/client.py +156 -4
- orca_sdk/conftest.py +10 -9
- orca_sdk/datasource.py +31 -13
- orca_sdk/embedding_model.py +8 -31
- orca_sdk/embedding_model_test.py +1 -1
- orca_sdk/memoryset.py +236 -321
- orca_sdk/memoryset_test.py +39 -13
- orca_sdk/regression_model.py +185 -64
- orca_sdk/regression_model_test.py +18 -3
- orca_sdk/telemetry.py +15 -6
- {orca_sdk-0.1.11.dist-info → orca_sdk-0.1.12.dist-info}/METADATA +3 -5
- orca_sdk-0.1.12.dist-info/RECORD +38 -0
- orca_sdk/_shared/__init__.py +0 -10
- orca_sdk/_shared/metrics.py +0 -634
- orca_sdk/_shared/metrics_test.py +0 -570
- orca_sdk/_utils/data_parsing.py +0 -137
- orca_sdk/_utils/data_parsing_disk_test.py +0 -91
- orca_sdk/_utils/data_parsing_torch_test.py +0 -159
- orca_sdk-0.1.11.dist-info/RECORD +0 -42
- {orca_sdk-0.1.11.dist-info → orca_sdk-0.1.12.dist-info}/WHEEL +0 -0
orca_sdk/memoryset_test.py
CHANGED

@@ -5,7 +5,6 @@ import pytest
 from datasets.arrow_dataset import Dataset

 from .classification_model import ClassificationModel
-from .conftest import skip_in_ci, skip_in_prod
 from .datasource import Datasource
 from .embedding_model import PretrainedEmbeddingModel
 from .memoryset import (
@@ -126,6 +125,33 @@ def test_create_empty_scored_memoryset():
         ScoredMemoryset.drop(name, if_not_exists="ignore")


+def test_create_empty_partitioned_labeled_memoryset():
+    name = f"test_empty_partitioned_labeled_{uuid4()}"
+    label_names = ["negative", "positive"]
+    try:
+        memoryset = LabeledMemoryset.create(
+            name, label_names=label_names, partitioned=True, description="empty partitioned labeled test"
+        )
+        assert memoryset is not None
+        assert memoryset.name == name
+        assert memoryset.length == 0
+        assert memoryset.partitioned is True
+
+        # inserting with partition_id should work
+        memoryset.insert(dict(value="i love soup", label=1, partition_id="p1"))
+        memoryset.insert(dict(value="cats are cute", label=0, partition_id="p2"))
+        assert memoryset.length == 2
+    finally:
+        LabeledMemoryset.drop(name, if_not_exists="ignore")
+
+
+def test_non_partitioned_memoryset_has_partitioned_false(
+    readonly_partitioned_memoryset: LabeledMemoryset, readonly_memoryset: LabeledMemoryset
+):
+    assert readonly_partitioned_memoryset.partitioned is True
+    assert readonly_memoryset.partitioned is False
+
+
 def test_create_memoryset_unauthenticated(unauthenticated_client, datasource):
     with unauthenticated_client.use():
         with pytest.raises(ValueError, match="Invalid API key"):
@@ -718,7 +744,6 @@ def test_insert_memories(writable_memoryset: LabeledMemoryset):
     assert last_memory.source_id == "test"


-@skip_in_prod("Production memorysets do not have session consistency guarantees")
 def test_update_memories(writable_memoryset: LabeledMemoryset, hf_dataset: Dataset):
     # We've combined the update tests into one to avoid multiple expensive requests for a writable_memoryset

@@ -726,10 +751,9 @@ def test_update_memories(writable_memoryset: LabeledMemoryset, hf_dataset: Datas
     memory_id = writable_memoryset[0].memory_id
     updated_count = writable_memoryset.update(dict(memory_id=memory_id, value="i love soup so much"))
     assert updated_count == 1
-    updated_memory = writable_memoryset.get(memory_id)
+    updated_memory = writable_memoryset.get(memory_id, consistency_level="Strong")
     assert updated_memory.value == "i love soup so much"
     assert updated_memory.label == hf_dataset[0]["label"]
-    assert writable_memoryset.get(memory_id).value == "i love soup so much"

     # test updating a memory instance
     memory = writable_memoryset[0]
@@ -748,11 +772,10 @@ def test_update_memories(writable_memoryset: LabeledMemoryset, hf_dataset: Datas
         batch_size=1,
     )
     assert updated_count == 2
-    assert writable_memoryset.get(memory_ids[0]).value == "i love soup so much"
-    assert writable_memoryset.get(memory_ids[1]).value == "cats are so cute"
+    assert writable_memoryset.get(memory_ids[0], consistency_level="Strong").value == "i love soup so much"
+    assert writable_memoryset.get(memory_ids[1], consistency_level="Strong").value == "cats are so cute"


-@skip_in_prod("Production memorysets do not have session consistency guarantees")
 def test_update_memory_metadata(writable_memoryset: LabeledMemoryset):
     memory = writable_memoryset[0]
     assert memory.metadata["key"] == "g1"
@@ -769,7 +792,7 @@ def test_update_memory_metadata(writable_memoryset: LabeledMemoryset):

     # Can explicitly clear metadata by passing metadata={}
     writable_memoryset.update(dict(memory_id=memory.memory_id, metadata={}))
-    updated = writable_memoryset.get(memory.memory_id)
+    updated = writable_memoryset.get(memory.memory_id, consistency_level="Strong")
     assert updated.metadata == {}, "Metadata should be cleared when explicitly set to {}"


@@ -782,7 +805,9 @@ def test_update_memories_by_filter(writable_memoryset: LabeledMemoryset):
     )
     assert updated_count == 2
     assert len(writable_memoryset) == initial_length
-    updated_memories = writable_memoryset.query(
+    updated_memories = writable_memoryset.query(
+        filters=[("source_id", "in", source_ids_to_update)], consistency_level="Strong"
+    )
     assert len(updated_memories) == 2
     assert all(memory.label == 1 for memory in updated_memories)

@@ -980,15 +1005,16 @@ def test_scored_memoryset(scored_memoryset: ScoredMemoryset):
     assert lookup[0].score < 0.11


-@skip_in_prod("Production memorysets do not have session consistency guarantees")
 def test_update_scored_memory(scored_memoryset: ScoredMemoryset):
     # we are only updating an inconsequential metadata field so that we don't affect other tests
     memory = scored_memoryset[0]
     assert memory.label == 0
     scored_memoryset.update(dict(memory_id=memory.memory_id, label=3))
-
-    memory.
-
+    memory = scored_memoryset.get(memory.memory_id, consistency_level="Strong")
+    assert memory.label == 3
+    memory = memory.update(label=4)
+    memory = scored_memoryset.get(memory.memory_id, consistency_level="Strong")
+    assert memory.label == 4


 @pytest.mark.asyncio
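
The memoryset changes exercised by the tests above add partitioned memorysets and an explicit consistency_level argument for read-after-write scenarios. A minimal usage sketch based only on the calls shown in this diff; the top-level import path is an assumption:

```python
# Sketch only: mirrors the calls exercised in memoryset_test.py above.
# The import path is an assumption; exact signatures may differ.
from uuid import uuid4

from orca_sdk.memoryset import LabeledMemoryset

name = f"demo_partitioned_{uuid4()}"
try:
    # New per this diff: memorysets can be created as partitioned
    memoryset = LabeledMemoryset.create(name, label_names=["negative", "positive"], partitioned=True)
    memoryset.insert(dict(value="i love soup", label=1, partition_id="p1"))

    # Updates are no longer assumed to be immediately visible; pass
    # consistency_level="Strong" to read your own write right away
    memory = memoryset[0]
    memoryset.update(dict(memory_id=memory.memory_id, value="i love soup so much"))
    updated = memoryset.get(memory.memory_id, consistency_level="Strong")
    assert updated.value == "i love soup so much"
finally:
    LabeledMemoryset.drop(name, if_not_exists="ignore")
```

The removal of the skip_in_prod decorators above suggests these tests can now run against production because strongly consistent reads replace the previous session-consistency assumption.
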
orca_sdk/regression_model.py
CHANGED

@@ -1,25 +1,33 @@
 from __future__ import annotations

-import logging
 from contextlib import contextmanager
 from datetime import datetime
-from typing import
-
-
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Generator,
+    Iterable,
+    Literal,
+    Sequence,
+    cast,
+    overload,
+)

-from .
-from ._utils.common import UNSET, CreateMode, DropMode
+from ._utils.common import UNSET, CreateMode, DropMode, logger
 from .client import (
     ListPredictionsRequest,
     OrcaClient,
     PredictiveModelUpdate,
     RARHeadType,
+)
+from .client import RegressionMetrics as RegressionMetricsResponse
+from .client import (
     RegressionModelMetadata,
     RegressionPredictionRequest,
 )
 from .datasource import Datasource
 from .job import Job
-from .memoryset import ScoredMemoryset
+from .memoryset import ConsistencyLevel, ScoredMemoryset
 from .telemetry import (
     RegressionPrediction,
     TelemetryMode,
@@ -27,7 +35,107 @@ from .telemetry import (
     _parse_feedback,
 )

-
+if TYPE_CHECKING:
+    # Peer dependency - user has datasets if they have a Dataset object
+    from datasets import Dataset as HFDataset  # type: ignore
+    from pandas import DataFrame as PandasDataFrame  # type: ignore
+
+
+class RegressionMetrics:
+    """
+    Metrics for evaluating regression model performance.
+
+    Attributes:
+        coverage: Percentage of predictions that are not none
+        mse: Mean squared error of the predictions
+        rmse: Root mean squared error of the predictions
+        mae: Mean absolute error of the predictions
+        r2: R-squared score (coefficient of determination) of the predictions
+        explained_variance: Explained variance score of the predictions
+        loss: Mean squared error loss of the predictions
+        anomaly_score_mean: Mean of anomaly scores across the dataset
+        anomaly_score_median: Median of anomaly scores across the dataset
+        anomaly_score_variance: Variance of anomaly scores across the dataset
+    """
+
+    coverage: float
+    mse: float
+    rmse: float
+    mae: float
+    r2: float
+    explained_variance: float
+    loss: float
+    anomaly_score_mean: float | None
+    anomaly_score_median: float | None
+    anomaly_score_variance: float | None
+
+    def __init__(self, response: RegressionMetricsResponse):
+        self.coverage = response["coverage"]
+        self.mse = response["mse"]
+        self.rmse = response["rmse"]
+        self.mae = response["mae"]
+        self.r2 = response["r2"]
+        self.explained_variance = response["explained_variance"]
+        self.loss = response["loss"]
+        self.anomaly_score_mean = response.get("anomaly_score_mean")
+        self.anomaly_score_median = response.get("anomaly_score_median")
+        self.anomaly_score_variance = response.get("anomaly_score_variance")
+        for warning in response.get("warnings", []):
+            logger.warning(warning)
+
+    def __repr__(self) -> str:
+        return (
+            "RegressionMetrics({\n"
+            + f" mae: {self.mae:.4f},\n"
+            + f" rmse: {self.rmse:.4f},\n"
+            + f" r2: {self.r2:.4f},\n"
+            + (
+                f" anomaly_score: {self.anomaly_score_mean:.4f} ± {self.anomaly_score_variance:.4f},\n"
+                if self.anomaly_score_mean
+                else ""
+            )
+            + "})"
+        )
+
+    @classmethod
+    def compute(
+        cls,
+        predictions: Sequence[RegressionPrediction],
+    ) -> RegressionMetrics:
+        """
+        Compute regression metrics from a list of predictions.
+
+        Params:
+            predictions: List of RegressionPrediction objects with expected_score set
+
+        Returns:
+            RegressionMetrics with computed metrics
+
+        Raises:
+            ValueError: If any prediction is missing expected_score
+        """
+        if len(predictions) > 100_000:
+            raise ValueError("Too many predictions, maximum is 100,000")
+        if any(p.expected_score is None for p in predictions):
+            raise ValueError("All predictions must have expected_score set")
+        expected_scores = [cast(float, p.expected_score) for p in predictions]
+        predicted_scores = [p.score for p in predictions]
+        anomaly_scores = (
+            None
+            if any(p.anomaly_score is None for p in predictions)
+            else [cast(float, p.anomaly_score) for p in predictions]
+        )
+
+        client = OrcaClient._resolve_client()
+        response = client.POST(
+            "/regression_model/metrics",
+            json={
+                "expected_scores": expected_scores,
+                "predicted_scores": predicted_scores,
+                "anomaly_scores": anomaly_scores,
+            },
+        )
+        return cls(response)


 class RegressionModel:
@@ -105,7 +213,7 @@ class RegressionModel:
         is raised.
         """
         if self._last_prediction_was_batch:
-
+            logger.warning(
                 "Last prediction was part of a batch prediction, returning the last prediction from the batch"
             )
         if self._last_prediction is None:
@@ -233,7 +341,7 @@ class RegressionModel:
         try:
             client = OrcaClient._resolve_client()
             client.DELETE("/regression_model/{name_or_id}", params={"name_or_id": name_or_id})
-
+            logger.info(f"Deleted model {name_or_id}")
         except LookupError:
             if if_not_exists == "error":
                 raise
@@ -297,6 +405,7 @@ class RegressionModel:
         ] = "include_global",
         use_gpu: bool = True,
         batch_size: int = 100,
+        consistency_level: ConsistencyLevel = "Bounded",
     ) -> RegressionPrediction: ...

     @overload
@@ -316,6 +425,7 @@ class RegressionModel:
         ] = "include_global",
         use_gpu: bool = True,
         batch_size: int = 100,
+        consistency_level: ConsistencyLevel = "Bounded",
     ) -> list[RegressionPrediction]: ...

     # TODO: add filter support
@@ -335,6 +445,7 @@ class RegressionModel:
         ] = "include_global",
         use_gpu: bool = True,
         batch_size: int = 100,
+        consistency_level: ConsistencyLevel = "Bounded",
     ) -> RegressionPrediction | list[RegressionPrediction]:
         """
         Make predictions using the regression model.
@@ -383,15 +494,13 @@ class RegressionModel:
         client = OrcaClient._resolve_client()

         # Convert to list for batching
-        values = value if isinstance(value,
+        values = [value] if isinstance(value, str) else list(value)
         if isinstance(expected_scores, list) and len(expected_scores) != len(values):
             raise ValueError("Invalid input: \n\texpected_scores must be the same length as values")
         if isinstance(partition_id, list) and len(partition_id) != len(values):
             raise ValueError("Invalid input: \n\tpartition_id must be the same length as values")

-        if isinstance(expected_scores,
-            expected_scores = expected_scores
-        elif expected_scores is not None:
+        if expected_scores is not None and isinstance(expected_scores, (float, int)):
             expected_scores = [float(expected_scores)] * len(values)

         predictions: list[RegressionPrediction] = []
@@ -410,6 +519,7 @@ class RegressionModel:
                 "use_lookup_cache": use_lookup_cache,
                 "ignore_unlabeled": ignore_unlabeled,
                 "partition_filter_mode": partition_filter_mode,
+                "consistency_level": consistency_level,
             }
             if partition_filter_mode != "ignore_partitions":
                 request_json["partition_ids"] = (
@@ -426,6 +536,7 @@ class RegressionModel:
             if telemetry_on and any(p["prediction_id"] is None for p in response):
                 raise RuntimeError("Failed to save prediction to database.")

+            batch_expected = batch_expected_scores or [None] * len(batch_values)
             predictions.extend(
                 RegressionPrediction(
                     prediction_id=prediction["prediction_id"],
@@ -438,8 +549,9 @@ class RegressionModel:
                     model=self,
                     logits=None,
                     input_value=input_value,
+                    expected_score=exp_score,
                 )
-                for prediction, input_value in zip(response, batch_values)
+                for prediction, input_value, exp_score in zip(response, batch_values, batch_expected)
             )

         self._last_prediction_was_batch = isinstance(value, list)
@@ -581,25 +693,14 @@ class RegressionModel:
                 params={"model_name_or_id": self.id, "job_id": response["job_id"]},
             )
             assert res["result"] is not None
-            return RegressionMetrics(
-                coverage=res["result"].get("coverage"),
-                mse=res["result"].get("mse"),
-                rmse=res["result"].get("rmse"),
-                mae=res["result"].get("mae"),
-                r2=res["result"].get("r2"),
-                explained_variance=res["result"].get("explained_variance"),
-                loss=res["result"].get("loss"),
-                anomaly_score_mean=res["result"].get("anomaly_score_mean"),
-                anomaly_score_median=res["result"].get("anomaly_score_median"),
-                anomaly_score_variance=res["result"].get("anomaly_score_variance"),
-            )
+            return RegressionMetrics(res["result"])

         job = Job(response["job_id"], get_value)
         return job if background else job.result()

-    def
+    def _evaluate_local(
         self,
-
+        data: Iterable[dict[str, Any]],
         value_column: str,
         score_column: str,
         record_predictions: bool,
@@ -612,37 +713,42 @@ class RegressionModel:
             "ignore_partitions", "include_global", "exclude_global", "only_global"
         ] = "include_global",
     ) -> RegressionMetrics:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        values: list[str] = []
+        expected_scores: list[float] = []
+        partition_ids: list[str | None] | None = [] if partition_column else None
+
+        for sample in data:
+            if len(values) >= 100_000:
+                raise ValueError("Upload a Datasource to evaluate against more than 100,000 samples.")
+            values.append(sample[value_column])
+            expected_score = sample[score_column]
+            if expected_score is None:
+                raise ValueError("Expected score is required for all samples")
+            expected_scores.append(expected_score)
+            if partition_ids is not None and partition_column:
+                partition_ids.append(sample[partition_column])
+
+        if not values:
+            raise ValueError("Evaluation data cannot be empty")
+
+        predictions = self.predict(
+            values,
+            expected_scores=expected_scores,
+            tags=tags,
+            save_telemetry="sync" if record_predictions else "off",
+            prompt=prompt,
+            ignore_unlabeled=ignore_unlabeled,
+            partition_id=partition_ids,
+            partition_filter_mode=partition_filter_mode,
+            batch_size=batch_size,
        )

+        return RegressionMetrics.compute(predictions)
+
     @overload
     def evaluate(
         self,
-        data: Datasource
+        data: Datasource,
         *,
         value_column: str = "value",
         score_column: str = "score",
@@ -663,7 +769,7 @@ class RegressionModel:
     @overload
     def evaluate(
         self,
-        data: Datasource |
+        data: Datasource | HFDataset | PandasDataFrame | Iterable[dict[str, Any]],
         *,
         value_column: str = "value",
         score_column: str = "score",
@@ -683,7 +789,7 @@ class RegressionModel:

     def evaluate(
         self,
-        data: Datasource |
+        data: Datasource | HFDataset | PandasDataFrame | Iterable[dict[str, Any]],
         *,
         value_column: str = "value",
         score_column: str = "score",
@@ -703,12 +809,13 @@ class RegressionModel:
         Evaluate the regression model on a given dataset or datasource

         Params:
-            data:
+            data: the data to evaluate the model on. This can be an Orca [`Datasource`][orca_sdk.datasource.Datasource],
+                a Hugging Face [`Dataset`][datasets.Dataset], a pandas [`DataFrame`][pandas.DataFrame], or an iterable of dictionaries.
             value_column: Name of the column that contains the input values to the model
             score_column: Name of the column containing the expected scores
             record_predictions: Whether to record [`RegressionPrediction`][orca_sdk.telemetry.RegressionPrediction]s for analysis
             tags: Optional tags to add to the recorded [`RegressionPrediction`][orca_sdk.telemetry.RegressionPrediction]s
-            batch_size: Batch size for processing
+            batch_size: Batch size for processing the data inputs (not used for Datasource inputs)
             prompt: Optional prompt for instruction-tuned embedding models
             subsample: Optional number (int) of rows to sample or fraction (float in (0, 1]) of data to sample for evaluation.
             background: Whether to run the operation in the background and return a job handle
@@ -752,9 +859,25 @@ class RegressionModel:
                 partition_column=partition_column,
                 partition_filter_mode=partition_filter_mode,
             )
-
-
-
+        else:
+            if background:
+                raise ValueError("Background evaluation is only supported for Datasource inputs")
+            try:
+                import pandas as pd  # type: ignore
+
+                if isinstance(data, pd.DataFrame):
+                    data = data.to_dict(orient="records")  # type: ignore
+            except ImportError:
+                pass
+
+            if not hasattr(data, "__iter__"):
+                raise ValueError(
+                    f"Invalid data type: {type(data).__name__}. "
+                    "Expected Iterable[dict], HuggingFace Dataset, or pandas DataFrame."
+                )
+
+            return self._evaluate_local(
+                data=cast(Iterable[dict[str, Any]], data),
                 value_column=value_column,
                 score_column=score_column,
                 record_predictions=record_predictions,
@@ -765,8 +888,6 @@ class RegressionModel:
                 partition_column=partition_column,
                 partition_filter_mode=partition_filter_mode,
             )
-        else:
-            raise ValueError(f"Invalid data type: {type(data)}")

     @contextmanager
     def use_memoryset(self, memoryset_override: ScoredMemoryset) -> Generator[None, None, None]:
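
The net effect of the regression_model.py changes is that evaluate can now run client-side over in-memory data (a Hugging Face Dataset, pandas DataFrame, or iterable of dicts) and that metrics can be computed directly from predictions via the new RegressionMetrics.compute classmethod. A rough sketch of the new paths, assuming `model` is an already-configured RegressionModel; the import path and sample data are illustrative only:

```python
# Illustrative sketch of the new local-evaluation path; `model` is assumed to be an
# existing RegressionModel instance obtained elsewhere.
from orca_sdk.regression_model import RegressionMetrics, RegressionModel

model: RegressionModel = ...  # placeholder: an already-created model

# evaluate() now also accepts a plain iterable of dicts (routed to _evaluate_local)
eval_data = [
    {"value": "This is excellent!", "score": 0.9},
    {"value": "This is terrible!", "score": 0.1},
]
metrics = model.evaluate(eval_data)
print(metrics.mae, metrics.rmse, metrics.r2)

# Metrics can also be computed directly from predictions that carry expected_score,
# which predict() now records when expected_scores are passed in
predictions = model.predict(
    [row["value"] for row in eval_data],
    expected_scores=[row["score"] for row in eval_data],
)
metrics = RegressionMetrics.compute(predictions)
```
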
orca_sdk/regression_model_test.py
CHANGED

@@ -173,10 +173,11 @@ def test_delete_memoryset_with_model_cascade(hf_dataset):
     assert not ScoredMemoryset.exists(memoryset.name)


-@pytest.mark.parametrize("data_type", ["dataset", "datasource"])
+@pytest.mark.parametrize("data_type", ["dataset", "datasource", "list"])
 def test_evaluate(
     regression_model: RegressionModel,
     eval_datasource: Datasource,
+    eval_data: list[dict],
     eval_dataset: Dataset,
     data_type,
 ):
@@ -184,7 +185,11 @@ def test_evaluate(
     result = (
         regression_model.evaluate(eval_dataset)
         if data_type == "dataset"
-        else
+        else (
+            regression_model.evaluate(eval_datasource)
+            if data_type == "datasource"
+            else regression_model.evaluate(eval_data)
+        )
     )

     assert isinstance(result, RegressionMetrics)
@@ -365,6 +370,8 @@ def test_predict(regression_model: RegressionModel):
     assert len(predictions) == 2
     assert predictions[0].prediction_id is not None
     assert predictions[1].prediction_id is not None
+    assert predictions[0].score is not None
+    assert predictions[1].score is not None
     assert np.allclose(predictions[0].score, 0.1)
     assert np.allclose(predictions[1].score, 0.9)
     assert 0 <= predictions[0].confidence <= 1
@@ -501,7 +508,7 @@ def test_record_prediction_feedback(regression_model: RegressionModel):
         {
             "prediction_id": p.prediction_id,
             "category": "accurate",
-            "value": abs(p.score - expected_score) < 0.2,
+            "value": abs(p.score - expected_score) < 0.2 if p.score is not None else False,
         }
         for expected_score, p in zip(expected_scores, predictions)
     )
@@ -538,11 +545,19 @@ def test_predict_with_memoryset_override(regression_model: RegressionModel, hf_d
     with regression_model.use_memoryset(inverted_scored_memoryset):
         override_predictions = regression_model.predict(["This is excellent!", "This is terrible!"])
     # With inverted scores, the predictions should be different
+    assert original_predictions[0].score is not None
+    assert original_predictions[1].score is not None
+    assert override_predictions[0].score is not None
+    assert override_predictions[1].score is not None
     assert abs(override_predictions[0].score - original_predictions[0].score) > 0.1
     assert abs(override_predictions[1].score - original_predictions[1].score) > 0.1

     # After exiting context, predictions should be back to normal
     new_predictions = regression_model.predict(["This is excellent!", "This is terrible!"])
+    assert new_predictions[0].score is not None
+    assert new_predictions[1].score is not None
+    assert original_predictions[0].score is not None
+    assert original_predictions[1].score is not None
     assert abs(new_predictions[0].score - original_predictions[0].score) < 0.1
     assert abs(new_predictions[1].score - original_predictions[1].score) < 0.1

orca_sdk/telemetry.py
CHANGED

@@ -1,6 +1,5 @@
 from __future__ import annotations

-import logging
 import os
 from abc import ABC
 from datetime import datetime
@@ -8,7 +7,7 @@ from typing import TYPE_CHECKING, Any, Iterable, Literal, Self, overload

 from httpx import Timeout

-from ._utils.common import UNSET
+from ._utils.common import UNSET, logger
 from .client import (
     LabelPredictionWithMemoriesAndFeedback,
     OrcaClient,
@@ -118,7 +117,7 @@ class FeedbackCategory:
         """
         client = OrcaClient._resolve_client()
         client.DELETE("/telemetry/feedback_category/{name_or_id}", params={"name_or_id": name})
-
+        logger.info(f"Deleted feedback category {name} with all associated feedback")

     def __repr__(self):
         return "FeedbackCategory({" + f"name: {self.name}, " + f"value_type: {self.value_type}" + "})"
@@ -175,6 +174,8 @@ class PredictionBase(ABC):
         telemetry: LabelPredictionWithMemoriesAndFeedback | ScorePredictionWithMemoriesAndFeedback | None = None,
         logits: list[float] | None = None,
         input_value: str | None = None,
+        expected_label: int | None = None,
+        expected_score: float | None = None,
     ):
         self.prediction_id = prediction_id
         self.label = label
@@ -187,6 +188,8 @@ class PredictionBase(ABC):
         self.__telemetry = telemetry if telemetry else None
         self.logits = logits
         self._input_value = input_value
+        self._expected_label = expected_label
+        self._expected_score = expected_score

     @property
     def _telemetry(self) -> LabelPredictionWithMemoriesAndFeedback | ScorePredictionWithMemoriesAndFeedback:
@@ -537,7 +540,7 @@ class ClassificationPrediction(PredictionBase):
         memoryset: Memoryset that was used to lookup memories to ground the prediction
     """

-    label: int
+    label: int | None
     label_name: str
     logits: list[float] | None
     model: ClassificationModel
@@ -562,11 +565,15 @@ class ClassificationPrediction(PredictionBase):

     @property
     def expected_label(self) -> int | None:
+        if self._expected_label is not None:
+            return self._expected_label
         assert "label" in self._telemetry
         return self._telemetry["expected_label"]

     @property
     def expected_label_name(self) -> str | None:
+        if self._expected_label is not None:
+            return self.memoryset.label_names[self._expected_label]
         assert "label" in self._telemetry
         return self._telemetry["expected_label_name"]

@@ -692,14 +699,14 @@ class RegressionPrediction(PredictionBase):
         memoryset: Memoryset that was used to lookup memories to ground the prediction
     """

-    score: float
+    score: float | None
     model: RegressionModel
     memoryset: ScoredMemoryset

     def __repr__(self):
         return (
             "RegressionPrediction({"
-            + f"score: {self.score:.2f}, "
+            + (f"score: {self.score:.2f}, " if self.score is not None else "score: None, ")
             + f"confidence: {self.confidence:.2f}, "
             + (f"anomaly_score: {self.anomaly_score:.2f}, " if self.anomaly_score is not None else "")
             + f"input_value: '{str(self.input_value)[:100] + '...' if len(str(self.input_value)) > 100 else self.input_value}'"
@@ -720,6 +727,8 @@ class RegressionPrediction(PredictionBase):

     @property
     def expected_score(self) -> float | None:
+        if self._expected_score is not None:
+            return self._expected_score
         assert "score" in self._telemetry
         return self._telemetry["expected_score"]

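
With score now typed as float | None and the new local expected_label / expected_score fallbacks, prediction objects can report their expected values without a telemetry round trip, and callers are expected to guard against score being None (as the updated tests do). A small illustration, assuming `model` is an existing RegressionModel:

```python
# Illustration only; `model` is assumed to be an existing RegressionModel instance.
from orca_sdk.regression_model import RegressionModel

model: RegressionModel = ...  # placeholder: an already-created model

predictions = model.predict(
    ["This is excellent!", "This is terrible!"],
    expected_scores=[0.9, 0.1],
    save_telemetry="off",  # same option _evaluate_local uses when record_predictions is False
)
for p in predictions:
    # expected_score falls back to the value supplied at predict time (self._expected_score)
    # instead of requiring a stored telemetry record; score itself may now be None,
    # so guard before doing arithmetic with it.
    if p.score is not None:
        print(p.score, p.expected_score)
```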