orca-sdk 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,6 @@ import pytest
  from datasets.arrow_dataset import Dataset

  from .classification_model import ClassificationModel
- from .conftest import skip_in_ci, skip_in_prod
  from .datasource import Datasource
  from .embedding_model import PretrainedEmbeddingModel
  from .memoryset import (
@@ -126,6 +125,33 @@ def test_create_empty_scored_memoryset():
          ScoredMemoryset.drop(name, if_not_exists="ignore")


+ def test_create_empty_partitioned_labeled_memoryset():
+     name = f"test_empty_partitioned_labeled_{uuid4()}"
+     label_names = ["negative", "positive"]
+     try:
+         memoryset = LabeledMemoryset.create(
+             name, label_names=label_names, partitioned=True, description="empty partitioned labeled test"
+         )
+         assert memoryset is not None
+         assert memoryset.name == name
+         assert memoryset.length == 0
+         assert memoryset.partitioned is True
+
+         # inserting with partition_id should work
+         memoryset.insert(dict(value="i love soup", label=1, partition_id="p1"))
+         memoryset.insert(dict(value="cats are cute", label=0, partition_id="p2"))
+         assert memoryset.length == 2
+     finally:
+         LabeledMemoryset.drop(name, if_not_exists="ignore")
+
+
+ def test_non_partitioned_memoryset_has_partitioned_false(
+     readonly_partitioned_memoryset: LabeledMemoryset, readonly_memoryset: LabeledMemoryset
+ ):
+     assert readonly_partitioned_memoryset.partitioned is True
+     assert readonly_memoryset.partitioned is False
+
+
  def test_create_memoryset_unauthenticated(unauthenticated_client, datasource):
      with unauthenticated_client.use():
          with pytest.raises(ValueError, match="Invalid API key"):
@@ -718,7 +744,6 @@ def test_insert_memories(writable_memoryset: LabeledMemoryset):
      assert last_memory.source_id == "test"


- @skip_in_prod("Production memorysets do not have session consistency guarantees")
  def test_update_memories(writable_memoryset: LabeledMemoryset, hf_dataset: Dataset):
      # We've combined the update tests into one to avoid multiple expensive requests for a writable_memoryset

@@ -726,10 +751,9 @@ def test_update_memories(writable_memoryset: LabeledMemoryset, hf_dataset: Datas
      memory_id = writable_memoryset[0].memory_id
      updated_count = writable_memoryset.update(dict(memory_id=memory_id, value="i love soup so much"))
      assert updated_count == 1
-     updated_memory = writable_memoryset.get(memory_id)
+     updated_memory = writable_memoryset.get(memory_id, consistency_level="Strong")
      assert updated_memory.value == "i love soup so much"
      assert updated_memory.label == hf_dataset[0]["label"]
-     assert writable_memoryset.get(memory_id).value == "i love soup so much"

      # test updating a memory instance
      memory = writable_memoryset[0]
@@ -748,11 +772,10 @@ def test_update_memories(writable_memoryset: LabeledMemoryset, hf_dataset: Datas
          batch_size=1,
      )
      assert updated_count == 2
-     assert writable_memoryset.get(memory_ids[0]).value == "i love soup so much"
-     assert writable_memoryset.get(memory_ids[1]).value == "cats are so cute"
+     assert writable_memoryset.get(memory_ids[0], consistency_level="Strong").value == "i love soup so much"
+     assert writable_memoryset.get(memory_ids[1], consistency_level="Strong").value == "cats are so cute"


- @skip_in_prod("Production memorysets do not have session consistency guarantees")
  def test_update_memory_metadata(writable_memoryset: LabeledMemoryset):
      memory = writable_memoryset[0]
      assert memory.metadata["key"] == "g1"
@@ -769,7 +792,7 @@ def test_update_memory_metadata(writable_memoryset: LabeledMemoryset):

      # Can explicitly clear metadata by passing metadata={}
      writable_memoryset.update(dict(memory_id=memory.memory_id, metadata={}))
-     updated = writable_memoryset.get(memory.memory_id)
+     updated = writable_memoryset.get(memory.memory_id, consistency_level="Strong")
      assert updated.metadata == {}, "Metadata should be cleared when explicitly set to {}"

@@ -782,7 +805,9 @@ def test_update_memories_by_filter(writable_memoryset: LabeledMemoryset):
      )
      assert updated_count == 2
      assert len(writable_memoryset) == initial_length
-     updated_memories = writable_memoryset.query(filters=[("source_id", "in", source_ids_to_update)])
+     updated_memories = writable_memoryset.query(
+         filters=[("source_id", "in", source_ids_to_update)], consistency_level="Strong"
+     )
      assert len(updated_memories) == 2
      assert all(memory.label == 1 for memory in updated_memories)

@@ -980,15 +1005,16 @@ def test_scored_memoryset(scored_memoryset: ScoredMemoryset):
      assert lookup[0].score < 0.11


- @skip_in_prod("Production memorysets do not have session consistency guarantees")
  def test_update_scored_memory(scored_memoryset: ScoredMemoryset):
      # we are only updating an inconsequential metadata field so that we don't affect other tests
      memory = scored_memoryset[0]
      assert memory.label == 0
      scored_memoryset.update(dict(memory_id=memory.memory_id, label=3))
-     assert scored_memoryset[0].label == 3
-     memory.update(label=4)
-     assert scored_memoryset[0].label == 4
+     memory = scored_memoryset.get(memory.memory_id, consistency_level="Strong")
+     assert memory.label == 3
+     memory = memory.update(label=4)
+     memory = scored_memoryset.get(memory.memory_id, consistency_level="Strong")
+     assert memory.label == 4


  @pytest.mark.asyncio
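
Note on the hunks above: the @skip_in_prod markers are removed because read-after-write assertions no longer rely on session consistency; they request it explicitly via the new consistency_level keyword. A minimal sketch of the pattern, assuming an existing writable LabeledMemoryset bound to `memoryset` and a known `memory_id`:

    # update a memory, then read it back requesting strong consistency so the write is visible
    memoryset.update(dict(memory_id=memory_id, value="i love soup so much"))
    memory = memoryset.get(memory_id, consistency_level="Strong")
    assert memory.value == "i love soup so much"

    # query() accepts the same keyword
    updated = memoryset.query(filters=[("source_id", "in", ["a", "b"])], consistency_level="Strong")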
@@ -1,25 +1,33 @@
  from __future__ import annotations

- import logging
  from contextlib import contextmanager
  from datetime import datetime
- from typing import Any, Generator, Iterable, Literal, cast, overload
-
- from datasets import Dataset
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Generator,
+     Iterable,
+     Literal,
+     Sequence,
+     cast,
+     overload,
+ )

- from ._shared.metrics import RegressionMetrics, calculate_regression_metrics
- from ._utils.common import UNSET, CreateMode, DropMode
+ from ._utils.common import UNSET, CreateMode, DropMode, logger
  from .client import (
      ListPredictionsRequest,
      OrcaClient,
      PredictiveModelUpdate,
      RARHeadType,
+ )
+ from .client import RegressionMetrics as RegressionMetricsResponse
+ from .client import (
      RegressionModelMetadata,
      RegressionPredictionRequest,
  )
  from .datasource import Datasource
  from .job import Job
- from .memoryset import ScoredMemoryset
+ from .memoryset import ConsistencyLevel, ScoredMemoryset
  from .telemetry import (
      RegressionPrediction,
      TelemetryMode,
@@ -27,7 +35,107 @@ from .telemetry import (
      _parse_feedback,
  )

- logger = logging.getLogger(__name__)
+ if TYPE_CHECKING:
+     # Peer dependency - user has datasets if they have a Dataset object
+     from datasets import Dataset as HFDataset  # type: ignore
+     from pandas import DataFrame as PandasDataFrame  # type: ignore
+
+
+ class RegressionMetrics:
+     """
+     Metrics for evaluating regression model performance.
+
+     Attributes:
+         coverage: Percentage of predictions that are not none
+         mse: Mean squared error of the predictions
+         rmse: Root mean squared error of the predictions
+         mae: Mean absolute error of the predictions
+         r2: R-squared score (coefficient of determination) of the predictions
+         explained_variance: Explained variance score of the predictions
+         loss: Mean squared error loss of the predictions
+         anomaly_score_mean: Mean of anomaly scores across the dataset
+         anomaly_score_median: Median of anomaly scores across the dataset
+         anomaly_score_variance: Variance of anomaly scores across the dataset
+     """
+
+     coverage: float
+     mse: float
+     rmse: float
+     mae: float
+     r2: float
+     explained_variance: float
+     loss: float
+     anomaly_score_mean: float | None
+     anomaly_score_median: float | None
+     anomaly_score_variance: float | None
+
+     def __init__(self, response: RegressionMetricsResponse):
+         self.coverage = response["coverage"]
+         self.mse = response["mse"]
+         self.rmse = response["rmse"]
+         self.mae = response["mae"]
+         self.r2 = response["r2"]
+         self.explained_variance = response["explained_variance"]
+         self.loss = response["loss"]
+         self.anomaly_score_mean = response.get("anomaly_score_mean")
+         self.anomaly_score_median = response.get("anomaly_score_median")
+         self.anomaly_score_variance = response.get("anomaly_score_variance")
+         for warning in response.get("warnings", []):
+             logger.warning(warning)
+
+     def __repr__(self) -> str:
+         return (
+             "RegressionMetrics({\n"
+             + f" mae: {self.mae:.4f},\n"
+             + f" rmse: {self.rmse:.4f},\n"
+             + f" r2: {self.r2:.4f},\n"
+             + (
+                 f" anomaly_score: {self.anomaly_score_mean:.4f} ± {self.anomaly_score_variance:.4f},\n"
+                 if self.anomaly_score_mean
+                 else ""
+             )
+             + "})"
+         )
+
+     @classmethod
+     def compute(
+         cls,
+         predictions: Sequence[RegressionPrediction],
+     ) -> RegressionMetrics:
+         """
+         Compute regression metrics from a list of predictions.
+
+         Params:
+             predictions: List of RegressionPrediction objects with expected_score set
+
+         Returns:
+             RegressionMetrics with computed metrics
+
+         Raises:
+             ValueError: If any prediction is missing expected_score
+         """
+         if len(predictions) > 100_000:
+             raise ValueError("Too many predictions, maximum is 100,000")
+         if any(p.expected_score is None for p in predictions):
+             raise ValueError("All predictions must have expected_score set")
+         expected_scores = [cast(float, p.expected_score) for p in predictions]
+         predicted_scores = [p.score for p in predictions]
+         anomaly_scores = (
+             None
+             if any(p.anomaly_score is None for p in predictions)
+             else [cast(float, p.anomaly_score) for p in predictions]
+         )
+
+         client = OrcaClient._resolve_client()
+         response = client.POST(
+             "/regression_model/metrics",
+             json={
+                 "expected_scores": expected_scores,
+                 "predicted_scores": predicted_scores,
+                 "anomaly_scores": anomaly_scores,
+             },
+         )
+         return cls(response)


  class RegressionModel:
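
The client-side RegressionMetrics class added above replaces the old _shared.metrics helpers: it wraps the response of the /regression_model/metrics endpoint and can be computed from predictions that carry expected scores. A minimal usage sketch, assuming an existing RegressionModel bound to `model`:

    values = ["great product", "terrible service"]
    expected = [0.9, 0.1]
    # expected_scores are attached to the returned predictions
    predictions = model.predict(values, expected_scores=expected)
    # POSTs expected/predicted/anomaly scores to /regression_model/metrics
    metrics = RegressionMetrics.compute(predictions)
    print(metrics.mae, metrics.rmse, metrics.r2)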
@@ -105,7 +213,7 @@ class RegressionModel:
          is raised.
          """
          if self._last_prediction_was_batch:
-             logging.warning(
+             logger.warning(
                  "Last prediction was part of a batch prediction, returning the last prediction from the batch"
              )
          if self._last_prediction is None:
@@ -233,7 +341,7 @@ class RegressionModel:
          try:
              client = OrcaClient._resolve_client()
              client.DELETE("/regression_model/{name_or_id}", params={"name_or_id": name_or_id})
-             logging.info(f"Deleted model {name_or_id}")
+             logger.info(f"Deleted model {name_or_id}")
          except LookupError:
              if if_not_exists == "error":
                  raise
@@ -297,6 +405,7 @@ class RegressionModel:
          ] = "include_global",
          use_gpu: bool = True,
          batch_size: int = 100,
+         consistency_level: ConsistencyLevel = "Bounded",
      ) -> RegressionPrediction: ...

      @overload
@@ -316,6 +425,7 @@ class RegressionModel:
          ] = "include_global",
          use_gpu: bool = True,
          batch_size: int = 100,
+         consistency_level: ConsistencyLevel = "Bounded",
      ) -> list[RegressionPrediction]: ...

      # TODO: add filter support
@@ -335,6 +445,7 @@ class RegressionModel:
          ] = "include_global",
          use_gpu: bool = True,
          batch_size: int = 100,
+         consistency_level: ConsistencyLevel = "Bounded",
      ) -> RegressionPrediction | list[RegressionPrediction]:
          """
          Make predictions using the regression model.
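
The three predict signatures above gain a consistency_level keyword (default "Bounded") that is forwarded with each prediction request, mirroring the memoryset read methods. A hedged sketch, assuming an existing RegressionModel bound to `model`:

    # request strong consistency when the backing memoryset was just modified
    prediction = model.predict("i love soup", consistency_level="Strong")
    predictions = model.predict(["i love soup", "cats are cute"], consistency_level="Bounded")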
@@ -383,15 +494,13 @@ class RegressionModel:
          client = OrcaClient._resolve_client()

          # Convert to list for batching
-         values = value if isinstance(value, list) else [value]
+         values = [value] if isinstance(value, str) else list(value)
          if isinstance(expected_scores, list) and len(expected_scores) != len(values):
              raise ValueError("Invalid input: \n\texpected_scores must be the same length as values")
          if isinstance(partition_id, list) and len(partition_id) != len(values):
              raise ValueError("Invalid input: \n\tpartition_id must be the same length as values")

-         if isinstance(expected_scores, list):
-             expected_scores = expected_scores
-         elif expected_scores is not None:
+         if expected_scores is not None and isinstance(expected_scores, (float, int)):
              expected_scores = [float(expected_scores)] * len(values)

          predictions: list[RegressionPrediction] = []
@@ -410,6 +519,7 @@ class RegressionModel:
                  "use_lookup_cache": use_lookup_cache,
                  "ignore_unlabeled": ignore_unlabeled,
                  "partition_filter_mode": partition_filter_mode,
+                 "consistency_level": consistency_level,
              }
              if partition_filter_mode != "ignore_partitions":
                  request_json["partition_ids"] = (
@@ -426,6 +536,7 @@ class RegressionModel:
              if telemetry_on and any(p["prediction_id"] is None for p in response):
                  raise RuntimeError("Failed to save prediction to database.")

+             batch_expected = batch_expected_scores or [None] * len(batch_values)
              predictions.extend(
                  RegressionPrediction(
                      prediction_id=prediction["prediction_id"],
@@ -438,8 +549,9 @@ class RegressionModel:
                      model=self,
                      logits=None,
                      input_value=input_value,
+                     expected_score=exp_score,
                  )
-                 for prediction, input_value in zip(response, batch_values)
+                 for prediction, input_value, exp_score in zip(response, batch_values, batch_expected)
              )

          self._last_prediction_was_batch = isinstance(value, list)
@@ -581,25 +693,14 @@ class RegressionModel:
                  params={"model_name_or_id": self.id, "job_id": response["job_id"]},
              )
              assert res["result"] is not None
-             return RegressionMetrics(
-                 coverage=res["result"].get("coverage"),
-                 mse=res["result"].get("mse"),
-                 rmse=res["result"].get("rmse"),
-                 mae=res["result"].get("mae"),
-                 r2=res["result"].get("r2"),
-                 explained_variance=res["result"].get("explained_variance"),
-                 loss=res["result"].get("loss"),
-                 anomaly_score_mean=res["result"].get("anomaly_score_mean"),
-                 anomaly_score_median=res["result"].get("anomaly_score_median"),
-                 anomaly_score_variance=res["result"].get("anomaly_score_variance"),
-             )
+             return RegressionMetrics(res["result"])

          job = Job(response["job_id"], get_value)
          return job if background else job.result()

-     def _evaluate_dataset(
+     def _evaluate_local(
          self,
-         dataset: Dataset,
+         data: Iterable[dict[str, Any]],
          value_column: str,
          score_column: str,
          record_predictions: bool,
@@ -612,37 +713,42 @@ class RegressionModel:
              "ignore_partitions", "include_global", "exclude_global", "only_global"
          ] = "include_global",
      ) -> RegressionMetrics:
-         if len(dataset) == 0:
-             raise ValueError("Evaluation dataset cannot be empty")
-
-         if any(x is None for x in dataset[score_column]):
-             raise ValueError("Evaluation dataset cannot contain None values in the score column")
-
-         predictions = [
-             prediction
-             for i in range(0, len(dataset), batch_size)
-             for prediction in self.predict(
-                 dataset[i : i + batch_size][value_column],
-                 expected_scores=dataset[i : i + batch_size][score_column],
-                 tags=tags,
-                 save_telemetry="sync" if record_predictions else "off",
-                 prompt=prompt,
-                 ignore_unlabeled=ignore_unlabeled,
-                 partition_id=dataset[i : i + batch_size][partition_column] if partition_column else None,
-                 partition_filter_mode=partition_filter_mode,
-             )
-         ]
-
-         return calculate_regression_metrics(
-             expected_scores=dataset[score_column],
-             predicted_scores=[p.score for p in predictions],
-             anomaly_scores=[p.anomaly_score for p in predictions],
+         values: list[str] = []
+         expected_scores: list[float] = []
+         partition_ids: list[str | None] | None = [] if partition_column else None
+
+         for sample in data:
+             if len(values) >= 100_000:
+                 raise ValueError("Upload a Datasource to evaluate against more than 100,000 samples.")
+             values.append(sample[value_column])
+             expected_score = sample[score_column]
+             if expected_score is None:
+                 raise ValueError("Expected score is required for all samples")
+             expected_scores.append(expected_score)
+             if partition_ids is not None and partition_column:
+                 partition_ids.append(sample[partition_column])
+
+         if not values:
+             raise ValueError("Evaluation data cannot be empty")
+
+         predictions = self.predict(
+             values,
+             expected_scores=expected_scores,
+             tags=tags,
+             save_telemetry="sync" if record_predictions else "off",
+             prompt=prompt,
+             ignore_unlabeled=ignore_unlabeled,
+             partition_id=partition_ids,
+             partition_filter_mode=partition_filter_mode,
+             batch_size=batch_size,
          )

+         return RegressionMetrics.compute(predictions)
+
      @overload
      def evaluate(
          self,
-         data: Datasource | Dataset,
+         data: Datasource,
          *,
          value_column: str = "value",
          score_column: str = "score",
@@ -663,7 +769,7 @@ class RegressionModel:
      @overload
      def evaluate(
          self,
-         data: Datasource | Dataset,
+         data: Datasource | HFDataset | PandasDataFrame | Iterable[dict[str, Any]],
          *,
          value_column: str = "value",
          score_column: str = "score",
@@ -683,7 +789,7 @@ class RegressionModel:

      def evaluate(
          self,
-         data: Datasource | Dataset,
+         data: Datasource | HFDataset | PandasDataFrame | Iterable[dict[str, Any]],
          *,
          value_column: str = "value",
          score_column: str = "score",
@@ -703,12 +809,13 @@ class RegressionModel:
          Evaluate the regression model on a given dataset or datasource

          Params:
-             data: Dataset or Datasource to evaluate the model on
+             data: the data to evaluate the model on. This can be an Orca [`Datasource`][orca_sdk.datasource.Datasource],
+                 a Hugging Face [`Dataset`][datasets.Dataset], a pandas [`DataFrame`][pandas.DataFrame], or an iterable of dictionaries.
              value_column: Name of the column that contains the input values to the model
              score_column: Name of the column containing the expected scores
              record_predictions: Whether to record [`RegressionPrediction`][orca_sdk.telemetry.RegressionPrediction]s for analysis
              tags: Optional tags to add to the recorded [`RegressionPrediction`][orca_sdk.telemetry.RegressionPrediction]s
-             batch_size: Batch size for processing Dataset inputs (only used when input is a Dataset)
+             batch_size: Batch size for processing the data inputs (not used for Datasource inputs)
              prompt: Optional prompt for instruction-tuned embedding models
              subsample: Optional number (int) of rows to sample or fraction (float in (0, 1]) of data to sample for evaluation.
              background: Whether to run the operation in the background and return a job handle
@@ -752,9 +859,25 @@ class RegressionModel:
                  partition_column=partition_column,
                  partition_filter_mode=partition_filter_mode,
              )
-         elif isinstance(data, Dataset):
-             return self._evaluate_dataset(
-                 dataset=data,
+         else:
+             if background:
+                 raise ValueError("Background evaluation is only supported for Datasource inputs")
+             try:
+                 import pandas as pd  # type: ignore
+
+                 if isinstance(data, pd.DataFrame):
+                     data = data.to_dict(orient="records")  # type: ignore
+             except ImportError:
+                 pass
+
+             if not hasattr(data, "__iter__"):
+                 raise ValueError(
+                     f"Invalid data type: {type(data).__name__}. "
+                     "Expected Iterable[dict], HuggingFace Dataset, or pandas DataFrame."
+                 )
+
+             return self._evaluate_local(
+                 data=cast(Iterable[dict[str, Any]], data),
                  value_column=value_column,
                  score_column=score_column,
                  record_predictions=record_predictions,
@@ -765,8 +888,6 @@ class RegressionModel:
                  partition_column=partition_column,
                  partition_filter_mode=partition_filter_mode,
              )
-         else:
-             raise ValueError(f"Invalid data type: {type(data)}")

      @contextmanager
      def use_memoryset(self, memoryset_override: ScoredMemoryset) -> Generator[None, None, None]:
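
With the branch above, evaluate() handles local data directly: a pandas DataFrame is converted to records when pandas is importable, and any other iterable of dicts (including a Hugging Face Dataset) is streamed through _evaluate_local, capped at 100,000 samples and without background=True. A minimal sketch, assuming the default column names and an existing RegressionModel bound to `model`:

    eval_data = [
        {"value": "great product", "score": 0.9},
        {"value": "terrible service", "score": 0.1},
    ]
    # runs predict() locally in batches, then RegressionMetrics.compute()
    metrics = model.evaluate(eval_data)
    print(metrics)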
@@ -173,10 +173,11 @@ def test_delete_memoryset_with_model_cascade(hf_dataset):
      assert not ScoredMemoryset.exists(memoryset.name)


- @pytest.mark.parametrize("data_type", ["dataset", "datasource"])
+ @pytest.mark.parametrize("data_type", ["dataset", "datasource", "list"])
  def test_evaluate(
      regression_model: RegressionModel,
      eval_datasource: Datasource,
+     eval_data: list[dict],
      eval_dataset: Dataset,
      data_type,
  ):
@@ -184,7 +185,11 @@ def test_evaluate(
      result = (
          regression_model.evaluate(eval_dataset)
          if data_type == "dataset"
-         else regression_model.evaluate(eval_datasource)
+         else (
+             regression_model.evaluate(eval_datasource)
+             if data_type == "datasource"
+             else regression_model.evaluate(eval_data)
+         )
      )

      assert isinstance(result, RegressionMetrics)
@@ -365,6 +370,8 @@ def test_predict(regression_model: RegressionModel):
      assert len(predictions) == 2
      assert predictions[0].prediction_id is not None
      assert predictions[1].prediction_id is not None
+     assert predictions[0].score is not None
+     assert predictions[1].score is not None
      assert np.allclose(predictions[0].score, 0.1)
      assert np.allclose(predictions[1].score, 0.9)
      assert 0 <= predictions[0].confidence <= 1
@@ -501,7 +508,7 @@ def test_record_prediction_feedback(regression_model: RegressionModel):
          {
              "prediction_id": p.prediction_id,
              "category": "accurate",
-             "value": abs(p.score - expected_score) < 0.2,
+             "value": abs(p.score - expected_score) < 0.2 if p.score is not None else False,
          }
          for expected_score, p in zip(expected_scores, predictions)
      )
@@ -538,11 +545,19 @@ def test_predict_with_memoryset_override(regression_model: RegressionModel, hf_d
      with regression_model.use_memoryset(inverted_scored_memoryset):
          override_predictions = regression_model.predict(["This is excellent!", "This is terrible!"])
      # With inverted scores, the predictions should be different
+     assert original_predictions[0].score is not None
+     assert original_predictions[1].score is not None
+     assert override_predictions[0].score is not None
+     assert override_predictions[1].score is not None
      assert abs(override_predictions[0].score - original_predictions[0].score) > 0.1
      assert abs(override_predictions[1].score - original_predictions[1].score) > 0.1

      # After exiting context, predictions should be back to normal
      new_predictions = regression_model.predict(["This is excellent!", "This is terrible!"])
+     assert new_predictions[0].score is not None
+     assert new_predictions[1].score is not None
+     assert original_predictions[0].score is not None
+     assert original_predictions[1].score is not None
      assert abs(new_predictions[0].score - original_predictions[0].score) < 0.1
      assert abs(new_predictions[1].score - original_predictions[1].score) < 0.1

orca_sdk/telemetry.py CHANGED
@@ -1,6 +1,5 @@
  from __future__ import annotations

- import logging
  import os
  from abc import ABC
  from datetime import datetime
@@ -8,7 +7,7 @@ from typing import TYPE_CHECKING, Any, Iterable, Literal, Self, overload

  from httpx import Timeout

- from ._utils.common import UNSET
+ from ._utils.common import UNSET, logger
  from .client import (
      LabelPredictionWithMemoriesAndFeedback,
      OrcaClient,
@@ -118,7 +117,7 @@ class FeedbackCategory:
          """
          client = OrcaClient._resolve_client()
          client.DELETE("/telemetry/feedback_category/{name_or_id}", params={"name_or_id": name})
-         logging.info(f"Deleted feedback category {name} with all associated feedback")
+         logger.info(f"Deleted feedback category {name} with all associated feedback")

      def __repr__(self):
          return "FeedbackCategory({" + f"name: {self.name}, " + f"value_type: {self.value_type}" + "})"
@@ -175,6 +174,8 @@ class PredictionBase(ABC):
          telemetry: LabelPredictionWithMemoriesAndFeedback | ScorePredictionWithMemoriesAndFeedback | None = None,
          logits: list[float] | None = None,
          input_value: str | None = None,
+         expected_label: int | None = None,
+         expected_score: float | None = None,
      ):
          self.prediction_id = prediction_id
          self.label = label
@@ -187,6 +188,8 @@ class PredictionBase(ABC):
          self.__telemetry = telemetry if telemetry else None
          self.logits = logits
          self._input_value = input_value
+         self._expected_label = expected_label
+         self._expected_score = expected_score

      @property
      def _telemetry(self) -> LabelPredictionWithMemoriesAndFeedback | ScorePredictionWithMemoriesAndFeedback:
@@ -537,7 +540,7 @@ class ClassificationPrediction(PredictionBase):
          memoryset: Memoryset that was used to lookup memories to ground the prediction
      """

-     label: int
+     label: int | None
      label_name: str
      logits: list[float] | None
      model: ClassificationModel
@@ -562,11 +565,15 @@ class ClassificationPrediction(PredictionBase):

      @property
      def expected_label(self) -> int | None:
+         if self._expected_label is not None:
+             return self._expected_label
          assert "label" in self._telemetry
          return self._telemetry["expected_label"]

      @property
      def expected_label_name(self) -> str | None:
+         if self._expected_label is not None:
+             return self.memoryset.label_names[self._expected_label]
          assert "label" in self._telemetry
          return self._telemetry["expected_label_name"]

@@ -692,14 +699,14 @@ class RegressionPrediction(PredictionBase):
          memoryset: Memoryset that was used to lookup memories to ground the prediction
      """

-     score: float
+     score: float | None
      model: RegressionModel
      memoryset: ScoredMemoryset

      def __repr__(self):
          return (
              "RegressionPrediction({"
-             + f"score: {self.score:.2f}, "
+             + (f"score: {self.score:.2f}, " if self.score is not None else "score: None, ")
              + f"confidence: {self.confidence:.2f}, "
              + (f"anomaly_score: {self.anomaly_score:.2f}, " if self.anomaly_score is not None else "")
              + f"input_value: '{str(self.input_value)[:100] + '...' if len(str(self.input_value)) > 100 else self.input_value}'"
@@ -720,6 +727,8 @@ class RegressionPrediction(PredictionBase):

      @property
      def expected_score(self) -> float | None:
+         if self._expected_score is not None:
+             return self._expected_score
          assert "score" in self._telemetry
          return self._telemetry["expected_score"]