orca-sdk 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
orca_sdk/memoryset.py CHANGED
@@ -4,7 +4,17 @@ import logging
4
4
  from abc import ABC
5
5
  from datetime import datetime, timedelta
6
6
  from os import PathLike
7
- from typing import Any, Generic, Iterable, Literal, Self, TypeVar, cast, overload
7
+ from typing import (
8
+ TYPE_CHECKING,
9
+ Any,
10
+ Generic,
11
+ Iterable,
12
+ Literal,
13
+ Self,
14
+ TypeVar,
15
+ cast,
16
+ overload,
17
+ )
8
18
 
9
19
  import pandas as pd
10
20
  import pyarrow as pa
@@ -29,6 +39,7 @@ from .client import (
29
39
  LabeledMemoryUpdate,
30
40
  LabeledMemoryWithFeedbackMetrics,
31
41
  LabelPredictionMemoryLookup,
42
+ LabelPredictionWithMemoriesAndFeedback,
32
43
  MemoryMetrics,
33
44
  MemorysetAnalysisConfigs,
34
45
  MemorysetMetadata,
@@ -36,6 +47,7 @@ from .client import (
36
47
  MemorysetUpdate,
37
48
  MemoryType,
38
49
  OrcaClient,
50
+ PredictionFeedback,
39
51
  )
40
52
  from .client import ScoredMemory as ScoredMemoryResponse
41
53
  from .client import (
@@ -46,6 +58,7 @@ from .client import (
46
58
  ScoredMemoryUpdate,
47
59
  ScoredMemoryWithFeedbackMetrics,
48
60
  ScorePredictionMemoryLookup,
61
+ ScorePredictionWithMemoriesAndFeedback,
49
62
  TelemetryFilterItem,
50
63
  TelemetrySortOptions,
51
64
  )
@@ -56,6 +69,11 @@ from .embedding_model import (
56
69
  PretrainedEmbeddingModel,
57
70
  )
58
71
  from .job import Job, Status
72
+ from .telemetry import ClassificationPrediction, RegressionPrediction
73
+
74
+ if TYPE_CHECKING:
75
+ from .classification_model import ClassificationModel
76
+ from .regression_model import RegressionModel
59
77
 
60
78
  TelemetrySortItem = tuple[str, Literal["asc", "desc"]]
61
79
  """
@@ -74,7 +92,7 @@ FilterOperation = Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in", "lik
74
92
  Operations that can be used in a filter expression.
75
93
  """
76
94
 
77
- FilterValue = str | int | float | bool | datetime | None | list[str] | list[int] | list[float] | list[bool]
95
+ FilterValue = str | int | float | bool | datetime | None | list[str | None] | list[int] | list[float] | list[bool]
78
96
  """
79
97
  Values that can be used in a filter expression.
80
98
  """
@@ -292,6 +310,110 @@ class MemoryBase(ABC):
292
310
  raise AttributeError(f"{key} is not a valid attribute")
293
311
  return self.metadata[key]
294
312
 
313
def _convert_to_classification_prediction(
    self,
    prediction: LabelPredictionWithMemoriesAndFeedback,
    *,
    memoryset: LabeledMemoryset,
    model: ClassificationModel,
) -> ClassificationPrediction:
    """
    Build a ClassificationPrediction object from a raw prediction record.

    Args:
        prediction: Prediction record to convert, it is also attached to the result as `telemetry`
        memoryset: Memoryset handle that is passed through to the result
        model: Model handle that is passed through to the result

    Returns:
        ClassificationPrediction populated from the record, `score` is always `None`
    """
    raw_input = prediction.get("input_value")
    # Bytes are decoded as UTF-8, any other value (including a missing one) is passed on untouched
    decoded_input: str | None = raw_input.decode("utf-8") if isinstance(raw_input, bytes) else raw_input

    return ClassificationPrediction(
        prediction_id=prediction["prediction_id"],
        label=prediction.get("label"),
        label_name=prediction.get("label_name"),
        score=None,  # scores only exist for regression predictions
        confidence=prediction["confidence"],
        anomaly_score=prediction["anomaly_score"],
        memoryset=memoryset,
        model=model,
        telemetry=prediction,
        logits=prediction.get("logits"),
        input_value=decoded_input,
    )
341
+
342
def _convert_to_regression_prediction(
    self,
    prediction: ScorePredictionWithMemoriesAndFeedback,
    *,
    memoryset: ScoredMemoryset,
    model: RegressionModel,
) -> RegressionPrediction:
    """
    Build a RegressionPrediction object from a raw prediction record.

    Args:
        prediction: Prediction record to convert, it is also attached to the result as `telemetry`
        memoryset: Memoryset handle that is passed through to the result
        model: Model handle that is passed through to the result

    Returns:
        RegressionPrediction populated from the record, `label`, `label_name` and `logits`
        are always `None`
    """
    raw_input = prediction.get("input_value")
    # Bytes are decoded as UTF-8, any other value (including a missing one) is passed on untouched
    decoded_input: str | None = raw_input.decode("utf-8") if isinstance(raw_input, bytes) else raw_input

    return RegressionPrediction(
        prediction_id=prediction["prediction_id"],
        label=None,  # labels only exist for classification predictions
        label_name=None,
        score=prediction.get("score"),
        confidence=prediction["confidence"],
        anomaly_score=prediction["anomaly_score"],
        memoryset=memoryset,
        model=model,
        telemetry=prediction,
        logits=None,
        input_value=decoded_input,
    )
370
+
371
+ def feedback(self) -> dict[str, list[bool] | list[float]]:
372
+ """
373
+ Get feedback metrics computed from predictions that used this memory.
374
+
375
+ Returns a dictionary where:
376
+ - Keys are feedback category names
377
+ - Values are lists of feedback values (you may want to look at mean on the raw data)
378
+ """
379
+ # Collect all feedbacks by category, paginating through all predictions
380
+ feedback_by_category: dict[str, list[bool] | list[float]] = {}
381
+ batch_size = 500
382
+ offset = 0
383
+
384
+ while True:
385
+ predictions_batch = self.predictions(limit=batch_size, offset=offset)
386
+
387
+ if not predictions_batch:
388
+ break
389
+
390
+ for prediction in predictions_batch:
391
+ telemetry = prediction._telemetry
392
+ if "feedbacks" not in telemetry:
393
+ continue
394
+
395
+ for fb in telemetry["feedbacks"]:
396
+ category_name = fb["category_name"]
397
+ value = fb["value"]
398
+ # Convert BINARY (1/0) to boolean, CONTINUOUS to float
399
+ if fb["category_type"] == "BINARY":
400
+ value = bool(value)
401
+ if category_name not in feedback_by_category:
402
+ feedback_by_category[category_name] = []
403
+ cast(list[bool], feedback_by_category[category_name]).append(value)
404
+ else:
405
+ value = float(value)
406
+ if category_name not in feedback_by_category:
407
+ feedback_by_category[category_name] = []
408
+ cast(list[float], feedback_by_category[category_name]).append(value)
409
+
410
+ if len(predictions_batch) < batch_size:
411
+ break
412
+
413
+ offset += batch_size
414
+
415
+ return feedback_by_category
416
+
295
417
  def _update(
296
418
  self,
297
419
  *,
@@ -416,6 +538,75 @@ class LabeledMemory(MemoryBase):
416
538
  self._update(value=value, label=label, source_id=source_id, **metadata)
417
539
  return self
418
540
 
541
def predictions(
    self,
    limit: int = 100,
    offset: int = 0,
    tag: str | None = None,
    sort: list[tuple[Literal["anomaly_score", "confidence", "timestamp"], Literal["asc", "desc"]]] | None = None,
    expected_label_match: bool | None = None,
) -> list[ClassificationPrediction]:
    """
    Get classification predictions that used this memory.

    Args:
        limit: Maximum number of predictions to return (default: 100)
        offset: Number of predictions to skip for pagination (default: 0)
        tag: Optional tag filter to only include predictions with this tag
        sort: List of (field, direction) tuples for sorting results.
            Valid fields: "anomaly_score", "confidence", "timestamp".
            Valid directions: "asc", "desc".
            `None` (default) is sent as an empty sort list
        expected_label_match: Filter by prediction correctness:
            - True: only return correct predictions (label == expected_label)
            - False: only return incorrect predictions (label != expected_label)
            - None: return all predictions (default)

    Returns:
        List of ClassificationPrediction objects that used this memory
    """
    # `None` instead of a shared mutable `[]` default; the request payload is the same either way
    sort_items = [] if sort is None else sort

    client = OrcaClient._resolve_client()
    predictions_data = client.POST(
        "/telemetry/prediction",
        json={
            "memory_id": self.memory_id,
            "limit": limit,
            "offset": offset,
            "sort": [list(sort_item) for sort_item in sort_items],
            "tag": tag,
            "expected_label_match": expected_label_match,
        },
    )

    # Filter to only classification predictions and convert to ClassificationPrediction objects
    classification_predictions = [
        cast(LabelPredictionWithMemoriesAndFeedback, p) for p in predictions_data if "label" in p
    ]

    # Imported here rather than at module level, where it is only imported under TYPE_CHECKING
    from .classification_model import ClassificationModel

    # Per-call caches so every memoryset and model is opened at most once
    memorysets: dict[str, LabeledMemoryset] = {}
    models: dict[str, ClassificationModel] = {}

    def resolve_memoryset(memoryset_id: str) -> LabeledMemoryset:
        if memoryset_id not in memorysets:
            memorysets[memoryset_id] = LabeledMemoryset.open(memoryset_id)
        return memorysets[memoryset_id]

    def resolve_model(model_id: str) -> ClassificationModel:
        if model_id not in models:
            models[model_id] = ClassificationModel.open(model_id)
        return models[model_id]

    return [
        self._convert_to_classification_prediction(
            p,
            memoryset=resolve_memoryset(p["memoryset_id"]),
            model=resolve_model(p["model_id"]),
        )
        for p in classification_predictions
    ]
609
+
419
610
  def to_dict(self) -> dict[str, Any]:
420
611
  """
421
612
  Convert the memory to a dictionary
@@ -457,7 +648,11 @@ class LabeledMemoryLookup(LabeledMemory):
457
648
  lookup_score: float
458
649
  attention_weight: float | None
459
650
 
460
- def __init__(self, memoryset_id: str, memory_lookup: LabeledMemoryLookupResponse | LabelPredictionMemoryLookup):
651
+ def __init__(
652
+ self,
653
+ memoryset_id: str,
654
+ memory_lookup: LabeledMemoryLookupResponse | LabelPredictionMemoryLookup,
655
+ ):
461
656
  # for internal use only, do not document
462
657
  super().__init__(memoryset_id, memory_lookup)
463
658
  self.lookup_score = memory_lookup["lookup_score"]
@@ -553,6 +748,75 @@ class ScoredMemory(MemoryBase):
553
748
  self._update(value=value, score=score, source_id=source_id, **metadata)
554
749
  return self
555
750
 
751
def predictions(
    self,
    limit: int = 100,
    offset: int = 0,
    tag: str | None = None,
    sort: list[tuple[Literal["anomaly_score", "confidence", "timestamp"], Literal["asc", "desc"]]] | None = None,
    expected_label_match: bool | None = None,
) -> list[RegressionPrediction]:
    """
    Get regression predictions that used this memory.

    Args:
        limit: Maximum number of predictions to return (default: 100)
        offset: Number of predictions to skip for pagination (default: 0)
        tag: Optional tag filter to only include predictions with this tag
        sort: List of (field, direction) tuples for sorting results.
            Valid fields: "anomaly_score", "confidence", "timestamp".
            Valid directions: "asc", "desc".
            `None` (default) is sent as an empty sort list
        expected_label_match: Filter by prediction correctness:
            - True: only return correct predictions (score close to expected_score)
            - False: only return incorrect predictions (score differs from expected_score)
            - None: return all predictions (default)
            Note: For regression, "correctness" is based on score proximity to expected_score.

    Returns:
        List of RegressionPrediction objects that used this memory
    """
    # `None` instead of a shared mutable `[]` default; the request payload is the same either way
    sort_items = [] if sort is None else sort

    client = OrcaClient._resolve_client()
    predictions_data = client.POST(
        "/telemetry/prediction",
        json={
            "memory_id": self.memory_id,
            "limit": limit,
            "offset": offset,
            "sort": [list(sort_item) for sort_item in sort_items],
            "tag": tag,
            "expected_label_match": expected_label_match,
        },
    )

    # Filter to only regression predictions and convert to RegressionPrediction objects
    regression_predictions = [
        cast(ScorePredictionWithMemoriesAndFeedback, p) for p in predictions_data if "score" in p
    ]

    # Imported here rather than at module level, where it is only imported under TYPE_CHECKING
    from .regression_model import RegressionModel

    # Per-call caches so every memoryset and model is opened at most once
    memorysets: dict[str, ScoredMemoryset] = {}
    models: dict[str, RegressionModel] = {}

    def resolve_memoryset(memoryset_id: str) -> ScoredMemoryset:
        if memoryset_id not in memorysets:
            memorysets[memoryset_id] = ScoredMemoryset.open(memoryset_id)
        return memorysets[memoryset_id]

    def resolve_model(model_id: str) -> RegressionModel:
        if model_id not in models:
            models[model_id] = RegressionModel.open(model_id)
        return models[model_id]

    return [
        self._convert_to_regression_prediction(
            p,
            memoryset=resolve_memoryset(p["memoryset_id"]),
            model=resolve_model(p["model_id"]),
        )
        for p in regression_predictions
    ]
819
+
556
820
  def to_dict(self) -> dict[str, Any]:
557
821
  """
558
822
  Convert the memory to a dictionary
@@ -589,7 +853,11 @@ class ScoredMemoryLookup(ScoredMemory):
589
853
  lookup_score: float
590
854
  attention_weight: float | None
591
855
 
592
- def __init__(self, memoryset_id: str, memory_lookup: ScoredMemoryLookupResponse | ScorePredictionMemoryLookup):
856
+ def __init__(
857
+ self,
858
+ memoryset_id: str,
859
+ memory_lookup: ScoredMemoryLookupResponse | ScorePredictionMemoryLookup,
860
+ ):
593
861
  # for internal use only, do not document
594
862
  super().__init__(memoryset_id, memory_lookup)
595
863
  self.lookup_score = memory_lookup["lookup_score"]
@@ -737,6 +1005,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
737
1005
  if_exists: CreateMode = "error",
738
1006
  background: bool = False,
739
1007
  hidden: bool = False,
1008
+ subsample: int | float | None = None,
1009
+ memory_type: MemoryType | None = None,
740
1010
  ) -> Self | Job[Self]:
741
1011
  """
742
1012
  Create a new memoryset in the OrcaCloud
@@ -750,8 +1020,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
750
1020
  embedding_model: Embedding model to use for embedding memory values for semantic search.
751
1021
  If not provided, a default embedding model for the memoryset will be used.
752
1022
  value_column: Name of the column in the datasource that contains the memory values
753
- label_column: Name of the column in the datasource that contains the memory labels,
754
- these must be contiguous integers starting from 0
1023
+ label_column: Name of the column in the datasource that contains the memory labels.
1024
+ Must contain categorical values as integers or strings. String labels will be
1025
+ converted to integers with the unique strings extracted as `label_names`
755
1026
  score_column: Name of the column in the datasource that contains the memory scores
756
1027
  source_id_column: Optional name of the column in the datasource that contains the ids in
757
1028
  the system of reference
@@ -759,9 +1030,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
759
1030
  so make sure it is concise and describes the contents of your memoryset not the
760
1031
  datasource or the embedding model.
761
1032
  label_names: List of human-readable names for the labels in the memoryset, must match
762
- the number of labels in the `label_column`. Will be automatically inferred if a
763
- [Dataset][datasets.Dataset] with a [`ClassLabel`][datasets.ClassLabel] feature for
764
- labels is used as the datasource
1033
+ the number of labels in the `label_column`. Will be automatically inferred if string
1034
+ labels are provided or if a [Dataset][datasets.Dataset] with a
1035
+ [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
765
1036
  max_seq_length_override: Maximum sequence length of values in the memoryset, if the
766
1037
  value is longer than this it will be truncated, will default to the model's max
767
1038
  sequence length if not provided
@@ -775,7 +1046,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
775
1046
  `"error"`. Other option is `"open"` to open the existing memoryset.
776
1047
  background: Whether to run the operation non-blocking and return a job handle
777
1048
  hidden: Whether the memoryset should be hidden
778
-
1049
+ subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
1050
+ datasource to insert. Use to limit the size of the initial memoryset.
1051
+ memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
1052
+ and `"SCORED"` if `score_column` is provided, must be specified for other cases.
779
1053
  Returns:
780
1054
  Handle to the new memoryset in the OrcaCloud
781
1055
 
@@ -786,9 +1060,6 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
786
1060
  if embedding_model is None:
787
1061
  embedding_model = PretrainedEmbeddingModel.GTE_BASE
788
1062
 
789
- if label_column is None and score_column is None:
790
- raise ValueError("label_column or score_column must be provided")
791
-
792
1063
  existing = cls._handle_if_exists(
793
1064
  name,
794
1065
  if_exists=if_exists,
@@ -813,6 +1084,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
813
1084
  "index_params": index_params,
814
1085
  "hidden": hidden,
815
1086
  }
1087
+ if memory_type is not None:
1088
+ payload["memory_type"] = memory_type
1089
+ if subsample is not None:
1090
+ payload["subsample"] = subsample
816
1091
  if prompt is not None:
817
1092
  payload["prompt"] = prompt
818
1093
  if isinstance(embedding_model, PretrainedEmbeddingModel):
@@ -823,7 +1098,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
823
1098
  raise ValueError("Invalid embedding model")
824
1099
  client = OrcaClient._resolve_client()
825
1100
  response = client.POST("/memoryset", json=payload)
826
- job = Job(response["insertion_task_id"], lambda: cls.open(response["id"]))
1101
+ job = Job(response["insertion_job_id"], lambda: cls.open(response["id"]))
827
1102
  return job if background else job.result()
828
1103
 
829
1104
  @overload
@@ -1516,7 +1791,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
1516
1791
  client = OrcaClient._resolve_client()
1517
1792
  metadata = client.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
1518
1793
  job = Job(
1519
- metadata["insertion_task_id"],
1794
+ metadata["insertion_job_id"],
1520
1795
  lambda: self.open(metadata["id"]),
1521
1796
  )
1522
1797
  return job if background else job.result()
@@ -2189,11 +2464,11 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
2189
2464
  def get_analysis_result():
2190
2465
  client = OrcaClient._resolve_client()
2191
2466
  return client.GET(
2192
- "/memoryset/{name_or_id}/analysis/{analysis_task_id}",
2193
- params={"name_or_id": self.id, "analysis_task_id": analysis["task_id"]},
2467
+ "/memoryset/{name_or_id}/analysis/{analysis_job_id}",
2468
+ params={"name_or_id": self.id, "analysis_job_id": analysis["job_id"]},
2194
2469
  )["results"]
2195
2470
 
2196
- job = Job(analysis["task_id"], get_analysis_result)
2471
+ job = Job(analysis["job_id"], get_analysis_result)
2197
2472
  return job if background else job.result()
2198
2473
 
2199
2474
  def get_potential_duplicate_groups(self) -> list[list[MemoryT]]:
@@ -2241,7 +2516,7 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2241
2516
  *,
2242
2517
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2243
2518
  value_column: str = "value",
2244
- label_column: str = "label",
2519
+ label_column: str | None = "label",
2245
2520
  source_id_column: str | None = None,
2246
2521
  description: str | None = None,
2247
2522
  label_names: list[str] | None = None,
@@ -2253,6 +2528,7 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2253
2528
  if_exists: CreateMode = "error",
2254
2529
  background: Literal[True],
2255
2530
  hidden: bool = False,
2531
+ subsample: int | float | None = None,
2256
2532
  ) -> Job[Self]:
2257
2533
  pass
2258
2534
 
@@ -2265,7 +2541,7 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2265
2541
  *,
2266
2542
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2267
2543
  value_column: str = "value",
2268
- label_column: str = "label",
2544
+ label_column: str | None = "label",
2269
2545
  source_id_column: str | None = None,
2270
2546
  description: str | None = None,
2271
2547
  label_names: list[str] | None = None,
@@ -2277,6 +2553,7 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2277
2553
  if_exists: CreateMode = "error",
2278
2554
  background: Literal[False] = False,
2279
2555
  hidden: bool = False,
2556
+ subsample: int | float | None = None,
2280
2557
  ) -> Self:
2281
2558
  pass
2282
2559
 
@@ -2288,7 +2565,7 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2288
2565
  *,
2289
2566
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2290
2567
  value_column: str = "value",
2291
- label_column: str = "label",
2568
+ label_column: str | None = "label",
2292
2569
  source_id_column: str | None = None,
2293
2570
  description: str | None = None,
2294
2571
  label_names: list[str] | None = None,
@@ -2300,6 +2577,7 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2300
2577
  if_exists: CreateMode = "error",
2301
2578
  background: bool = False,
2302
2579
  hidden: bool = False,
2580
+ subsample: int | float | None = None,
2303
2581
  ) -> Self | Job[Self]:
2304
2582
  """
2305
2583
  Create a new labeled memoryset in the OrcaCloud
@@ -2313,17 +2591,19 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2313
2591
  embedding_model: Embedding model to use for embedding memory values for semantic search.
2314
2592
  If not provided, a default embedding model for the memoryset will be used.
2315
2593
  value_column: Name of the column in the datasource that contains the memory values
2316
- label_column: Name of the column in the datasource that contains the memory labels,
2317
- these must be contiguous integers starting from 0
2594
+ label_column: Name of the column in the datasource that contains the memory labels.
2595
+ Must contain categorical values as integers or strings. String labels will be
2596
+ converted to integers with the unique strings extracted as `label_names`. To create
2597
+ a memoryset with all none labels, set to `None`.
2318
2598
  source_id_column: Optional name of the column in the datasource that contains the ids in
2319
2599
  the system of reference
2320
2600
  description: Optional description for the memoryset, this will be used in agentic flows,
2321
2601
  so make sure it is concise and describes the contents of your memoryset not the
2322
2602
  datasource or the embedding model.
2323
2603
  label_names: List of human-readable names for the labels in the memoryset, must match
2324
- the number of labels in the `label_column`. Will be automatically inferred if a
2325
- [Dataset][datasets.Dataset] with a [`ClassLabel`][datasets.ClassLabel] feature for
2326
- labels is used as the datasource
2604
+ the number of labels in the `label_column`. Will be automatically inferred if string
2605
+ labels are provided or if a [Dataset][datasets.Dataset] with a
2606
+ [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
2327
2607
  max_seq_length_override: Maximum sequence length of values in the memoryset, if the
2328
2608
  value is longer than this it will be truncated, will default to the model's max
2329
2609
  sequence length if not provided
@@ -2363,6 +2643,8 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
2363
2643
  if_exists=if_exists,
2364
2644
  background=background,
2365
2645
  hidden=hidden,
2646
+ subsample=subsample,
2647
+ memory_type="LABELED",
2366
2648
  )
2367
2649
 
2368
2650
  def display_label_analysis(self):
@@ -2405,7 +2687,7 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2405
2687
  *,
2406
2688
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2407
2689
  value_column: str = "value",
2408
- score_column: str = "score",
2690
+ score_column: str | None = "score",
2409
2691
  source_id_column: str | None = None,
2410
2692
  description: str | None = None,
2411
2693
  max_seq_length_override: int | None = None,
@@ -2416,6 +2698,7 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2416
2698
  if_exists: CreateMode = "error",
2417
2699
  background: Literal[True],
2418
2700
  hidden: bool = False,
2701
+ subsample: int | float | None = None,
2419
2702
  ) -> Job[Self]:
2420
2703
  pass
2421
2704
 
@@ -2427,7 +2710,7 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2427
2710
  datasource: Datasource,
2428
2711
  *,
2429
2712
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2430
- score_column: str = "score",
2713
+ score_column: str | None = "score",
2431
2714
  value_column: str = "value",
2432
2715
  source_id_column: str | None = None,
2433
2716
  description: str | None = None,
@@ -2439,6 +2722,7 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2439
2722
  if_exists: CreateMode = "error",
2440
2723
  background: Literal[False] = False,
2441
2724
  hidden: bool = False,
2725
+ subsample: int | float | None = None,
2442
2726
  ) -> Self:
2443
2727
  pass
2444
2728
 
@@ -2450,7 +2734,7 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2450
2734
  *,
2451
2735
  embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
2452
2736
  value_column: str = "value",
2453
- score_column: str = "score",
2737
+ score_column: str | None = "score",
2454
2738
  source_id_column: str | None = None,
2455
2739
  description: str | None = None,
2456
2740
  max_seq_length_override: int | None = None,
@@ -2461,6 +2745,7 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2461
2745
  if_exists: CreateMode = "error",
2462
2746
  background: bool = False,
2463
2747
  hidden: bool = False,
2748
+ subsample: int | float | None = None,
2464
2749
  ) -> Self | Job[Self]:
2465
2750
  """
2466
2751
  Create a new scored memoryset in the OrcaCloud
@@ -2474,7 +2759,8 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2474
2759
  embedding_model: Embedding model to use for embedding memory values for semantic search.
2475
2760
  If not provided, a default embedding model for the memoryset will be used.
2476
2761
  value_column: Name of the column in the datasource that contains the memory values
2477
- score_column: Name of the column in the datasource that contains the memory scores
2762
+ score_column: Name of the column in the datasource that contains the memory scores. Must
2763
+ contain numerical values. To create a memoryset with all none scores, set to `None`.
2478
2764
  source_id_column: Optional name of the column in the datasource that contains the ids in
2479
2765
  the system of reference
2480
2766
  description: Optional description for the memoryset, this will be used in agentic flows,
@@ -2517,4 +2803,6 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
2517
2803
  if_exists=if_exists,
2518
2804
  background=background,
2519
2805
  hidden=hidden,
2806
+ subsample=subsample,
2807
+ memory_type="SCORED",
2520
2808
  )