orca-sdk 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/async_client.py +448 -301
- orca_sdk/classification_model.py +53 -17
- orca_sdk/client.py +448 -301
- orca_sdk/datasource.py +45 -2
- orca_sdk/datasource_test.py +120 -0
- orca_sdk/embedding_model.py +32 -24
- orca_sdk/job.py +17 -17
- orca_sdk/memoryset.py +318 -30
- orca_sdk/memoryset_test.py +185 -1
- orca_sdk/regression_model.py +38 -4
- orca_sdk/telemetry.py +52 -13
- {orca_sdk-0.1.3.dist-info → orca_sdk-0.1.4.dist-info}/METADATA +1 -1
- {orca_sdk-0.1.3.dist-info → orca_sdk-0.1.4.dist-info}/RECORD +14 -14
- {orca_sdk-0.1.3.dist-info → orca_sdk-0.1.4.dist-info}/WHEEL +0 -0
orca_sdk/memoryset.py
CHANGED
@@ -4,7 +4,17 @@ import logging
 from abc import ABC
 from datetime import datetime, timedelta
 from os import PathLike
-from typing import
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Generic,
+    Iterable,
+    Literal,
+    Self,
+    TypeVar,
+    cast,
+    overload,
+)
 
 import pandas as pd
 import pyarrow as pa
@@ -29,6 +39,7 @@ from .client import (
     LabeledMemoryUpdate,
     LabeledMemoryWithFeedbackMetrics,
     LabelPredictionMemoryLookup,
+    LabelPredictionWithMemoriesAndFeedback,
     MemoryMetrics,
     MemorysetAnalysisConfigs,
     MemorysetMetadata,
@@ -36,6 +47,7 @@ from .client import (
     MemorysetUpdate,
     MemoryType,
     OrcaClient,
+    PredictionFeedback,
 )
 from .client import ScoredMemory as ScoredMemoryResponse
 from .client import (
@@ -46,6 +58,7 @@ from .client import (
     ScoredMemoryUpdate,
     ScoredMemoryWithFeedbackMetrics,
     ScorePredictionMemoryLookup,
+    ScorePredictionWithMemoriesAndFeedback,
     TelemetryFilterItem,
     TelemetrySortOptions,
 )
@@ -56,6 +69,11 @@ from .embedding_model import (
     PretrainedEmbeddingModel,
 )
 from .job import Job, Status
+from .telemetry import ClassificationPrediction, RegressionPrediction
+
+if TYPE_CHECKING:
+    from .classification_model import ClassificationModel
+    from .regression_model import RegressionModel
 
 TelemetrySortItem = tuple[str, Literal["asc", "desc"]]
 """
@@ -74,7 +92,7 @@ FilterOperation = Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in", "lik
 Operations that can be used in a filter expression.
 """
 
-FilterValue = str | int | float | bool | datetime | None | list[str] | list[int] | list[float] | list[bool]
+FilterValue = str | int | float | bool | datetime | None | list[str | None] | list[int] | list[float] | list[bool]
 """
 Values that can be used in a filter expression.
 """
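The only change in this hunk widens `FilterValue` so that a list filter over a string field may also contain `None`. A minimal sketch of what the widened alias now accepts; the import path and the `(field, operation, value)` tuple shape are assumptions for illustration, not something this diff confirms:

    from orca_sdk.memoryset import FilterValue  # assumed import path for the alias defined above

    value: FilterValue = ["doc-1", "doc-2", None]     # list[str | None] is valid as of 0.1.4
    hypothetical_filter = ("source_id", "in", value)  # illustrative (field, operation, value) triple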
@@ -292,6 +310,110 @@ class MemoryBase(ABC):
             raise AttributeError(f"{key} is not a valid attribute")
         return self.metadata[key]
 
+    def _convert_to_classification_prediction(
+        self,
+        prediction: LabelPredictionWithMemoriesAndFeedback,
+        *,
+        memoryset: LabeledMemoryset,
+        model: ClassificationModel,
+    ) -> ClassificationPrediction:
+        """
+        Convert internal prediction TypedDict to ClassificationPrediction object.
+        """
+        input_value = prediction.get("input_value")
+        input_value_str: str | None = None
+        if input_value is not None:
+            input_value_str = input_value.decode("utf-8") if isinstance(input_value, bytes) else input_value
+
+        return ClassificationPrediction(
+            prediction_id=prediction["prediction_id"],
+            label=prediction.get("label"),
+            label_name=prediction.get("label_name"),
+            score=None,
+            confidence=prediction["confidence"],
+            anomaly_score=prediction["anomaly_score"],
+            memoryset=memoryset,
+            model=model,
+            telemetry=prediction,
+            logits=prediction.get("logits"),
+            input_value=input_value_str,
+        )
+
+    def _convert_to_regression_prediction(
+        self,
+        prediction: ScorePredictionWithMemoriesAndFeedback,
+        *,
+        memoryset: ScoredMemoryset,
+        model: RegressionModel,
+    ) -> RegressionPrediction:
+        """
+        Convert internal prediction TypedDict to RegressionPrediction object.
+        """
+        input_value = prediction.get("input_value")
+        input_value_str: str | None = None
+        if input_value is not None:
+            input_value_str = input_value.decode("utf-8") if isinstance(input_value, bytes) else input_value
+
+        return RegressionPrediction(
+            prediction_id=prediction["prediction_id"],
+            label=None,
+            label_name=None,
+            score=prediction.get("score"),
+            confidence=prediction["confidence"],
+            anomaly_score=prediction["anomaly_score"],
+            memoryset=memoryset,
+            model=model,
+            telemetry=prediction,
+            logits=None,
+            input_value=input_value_str,
+        )
+
+    def feedback(self) -> dict[str, list[bool] | list[float]]:
+        """
+        Get feedback metrics computed from predictions that used this memory.
+
+        Returns a dictionary where:
+        - Keys are feedback category names
+        - Values are lists of feedback values (you may want to look at mean on the raw data)
+        """
+        # Collect all feedbacks by category, paginating through all predictions
+        feedback_by_category: dict[str, list[bool] | list[float]] = {}
+        batch_size = 500
+        offset = 0
+
+        while True:
+            predictions_batch = self.predictions(limit=batch_size, offset=offset)
+
+            if not predictions_batch:
+                break
+
+            for prediction in predictions_batch:
+                telemetry = prediction._telemetry
+                if "feedbacks" not in telemetry:
+                    continue
+
+                for fb in telemetry["feedbacks"]:
+                    category_name = fb["category_name"]
+                    value = fb["value"]
+                    # Convert BINARY (1/0) to boolean, CONTINUOUS to float
+                    if fb["category_type"] == "BINARY":
+                        value = bool(value)
+                        if category_name not in feedback_by_category:
+                            feedback_by_category[category_name] = []
+                        cast(list[bool], feedback_by_category[category_name]).append(value)
+                    else:
+                        value = float(value)
+                        if category_name not in feedback_by_category:
+                            feedback_by_category[category_name] = []
+                        cast(list[float], feedback_by_category[category_name]).append(value)
+
+            if len(predictions_batch) < batch_size:
+                break
+
+            offset += batch_size
+
+        return feedback_by_category
+
     def _update(
         self,
         *,
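The new `MemoryBase.feedback()` helper above pages through every prediction that used the memory (500 at a time) and groups raw feedback values by category, converting BINARY feedback to booleans and everything else to floats. A usage sketch, assuming `memory` is a `LabeledMemory` or `ScoredMemory` handle you already hold (how it was obtained is outside this hunk):

    from statistics import mean

    feedback = memory.feedback()  # dict[str, list[bool] | list[float]] keyed by feedback category

    for category, values in feedback.items():
        # BINARY categories yield bools, other categories yield floats; both average cleanly
        print(f"{category}: n={len(values)} mean={mean(float(v) for v in values):.3f}")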
@@ -416,6 +538,75 @@ class LabeledMemory(MemoryBase):
         self._update(value=value, label=label, source_id=source_id, **metadata)
         return self
 
+    def predictions(
+        self,
+        limit: int = 100,
+        offset: int = 0,
+        tag: str | None = None,
+        sort: list[tuple[Literal["anomaly_score", "confidence", "timestamp"], Literal["asc", "desc"]]] = [],
+        expected_label_match: bool | None = None,
+    ) -> list[ClassificationPrediction]:
+        """
+        Get classification predictions that used this memory.
+
+        Args:
+            limit: Maximum number of predictions to return (default: 100)
+            offset: Number of predictions to skip for pagination (default: 0)
+            tag: Optional tag filter to only include predictions with this tag
+            sort: List of (field, direction) tuples for sorting results.
+                Valid fields: "anomaly_score", "confidence", "timestamp".
+                Valid directions: "asc", "desc"
+            expected_label_match: Filter by prediction correctness:
+                - True: only return correct predictions (label == expected_label)
+                - False: only return incorrect predictions (label != expected_label)
+                - None: return all predictions (default)
+
+        Returns:
+            List of ClassificationPrediction objects that used this memory
+        """
+
+        client = OrcaClient._resolve_client()
+        predictions_data = client.POST(
+            "/telemetry/prediction",
+            json={
+                "memory_id": self.memory_id,
+                "limit": limit,
+                "offset": offset,
+                "sort": [list(sort_item) for sort_item in sort],
+                "tag": tag,
+                "expected_label_match": expected_label_match,
+            },
+        )
+
+        # Filter to only classification predictions and convert to ClassificationPrediction objects
+        classification_predictions = [
+            cast(LabelPredictionWithMemoriesAndFeedback, p) for p in predictions_data if "label" in p
+        ]
+
+        from .classification_model import ClassificationModel
+
+        memorysets: dict[str, LabeledMemoryset] = {}
+        models: dict[str, ClassificationModel] = {}
+
+        def resolve_memoryset(memoryset_id: str) -> LabeledMemoryset:
+            if memoryset_id not in memorysets:
+                memorysets[memoryset_id] = LabeledMemoryset.open(memoryset_id)
+            return memorysets[memoryset_id]
+
+        def resolve_model(model_id: str) -> ClassificationModel:
+            if model_id not in models:
+                models[model_id] = ClassificationModel.open(model_id)
+            return models[model_id]
+
+        return [
+            self._convert_to_classification_prediction(
+                p,
+                memoryset=resolve_memoryset(p["memoryset_id"]),
+                model=resolve_model(p["model_id"]),
+            )
+            for p in classification_predictions
+        ]
+
     def to_dict(self) -> dict[str, Any]:
         """
         Convert the memory to a dictionary
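`LabeledMemory.predictions()` exposes the classification predictions that looked this memory up, with paging, sorting, and an `expected_label_match` correctness filter. A sketch based on the signature above; the attribute names on the returned objects mirror the keyword arguments passed to `ClassificationPrediction` in the converter earlier in this diff, and `memory` is again an assumed `LabeledMemory` handle:

    misses = memory.predictions(
        limit=50,
        sort=[("confidence", "desc")],
        expected_label_match=False,  # only predictions whose label differed from the expected label
    )
    for prediction in misses:
        print(prediction.prediction_id, prediction.label_name, prediction.confidence)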
@@ -457,7 +648,11 @@ class LabeledMemoryLookup(LabeledMemory):
     lookup_score: float
     attention_weight: float | None
 
-    def __init__(
+    def __init__(
+        self,
+        memoryset_id: str,
+        memory_lookup: LabeledMemoryLookupResponse | LabelPredictionMemoryLookup,
+    ):
         # for internal use only, do not document
         super().__init__(memoryset_id, memory_lookup)
         self.lookup_score = memory_lookup["lookup_score"]
@@ -553,6 +748,75 @@ class ScoredMemory(MemoryBase):
         self._update(value=value, score=score, source_id=source_id, **metadata)
         return self
 
+    def predictions(
+        self,
+        limit: int = 100,
+        offset: int = 0,
+        tag: str | None = None,
+        sort: list[tuple[Literal["anomaly_score", "confidence", "timestamp"], Literal["asc", "desc"]]] = [],
+        expected_label_match: bool | None = None,
+    ) -> list[RegressionPrediction]:
+        """
+        Get regression predictions that used this memory.
+
+        Args:
+            limit: Maximum number of predictions to return (default: 100)
+            offset: Number of predictions to skip for pagination (default: 0)
+            tag: Optional tag filter to only include predictions with this tag
+            sort: List of (field, direction) tuples for sorting results.
+                Valid fields: "anomaly_score", "confidence", "timestamp".
+                Valid directions: "asc", "desc"
+            expected_label_match: Filter by prediction correctness:
+                - True: only return correct predictions (score close to expected_score)
+                - False: only return incorrect predictions (score differs from expected_score)
+                - None: return all predictions (default)
+                Note: For regression, "correctness" is based on score proximity to expected_score.
+
+        Returns:
+            List of RegressionPrediction objects that used this memory
+        """
+        client = OrcaClient._resolve_client()
+        predictions_data = client.POST(
+            "/telemetry/prediction",
+            json={
+                "memory_id": self.memory_id,
+                "limit": limit,
+                "offset": offset,
+                "sort": [list(sort_item) for sort_item in sort],
+                "tag": tag,
+                "expected_label_match": expected_label_match,
+            },
+        )
+
+        # Filter to only regression predictions and convert to RegressionPrediction objects
+        regression_predictions = [
+            cast(ScorePredictionWithMemoriesAndFeedback, p) for p in predictions_data if "score" in p
+        ]
+
+        from .regression_model import RegressionModel
+
+        memorysets: dict[str, ScoredMemoryset] = {}
+        models: dict[str, RegressionModel] = {}
+
+        def resolve_memoryset(memoryset_id: str) -> ScoredMemoryset:
+            if memoryset_id not in memorysets:
+                memorysets[memoryset_id] = ScoredMemoryset.open(memoryset_id)
+            return memorysets[memoryset_id]
+
+        def resolve_model(model_id: str) -> RegressionModel:
+            if model_id not in models:
+                models[model_id] = RegressionModel.open(model_id)
+            return models[model_id]
+
+        return [
+            self._convert_to_regression_prediction(
+                p,
+                memoryset=resolve_memoryset(p["memoryset_id"]),
+                model=resolve_model(p["model_id"]),
+            )
+            for p in regression_predictions
+        ]
+
     def to_dict(self) -> dict[str, Any]:
         """
         Convert the memory to a dictionary
@@ -589,7 +853,11 @@ class ScoredMemoryLookup(ScoredMemory):
     lookup_score: float
     attention_weight: float | None
 
-    def __init__(
+    def __init__(
+        self,
+        memoryset_id: str,
+        memory_lookup: ScoredMemoryLookupResponse | ScorePredictionMemoryLookup,
+    ):
         # for internal use only, do not document
         super().__init__(memoryset_id, memory_lookup)
         self.lookup_score = memory_lookup["lookup_score"]
@@ -737,6 +1005,8 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         if_exists: CreateMode = "error",
         background: bool = False,
         hidden: bool = False,
+        subsample: int | float | None = None,
+        memory_type: MemoryType | None = None,
     ) -> Self | Job[Self]:
         """
         Create a new memoryset in the OrcaCloud
@@ -750,8 +1020,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             embedding_model: Embedding model to use for embedding memory values for semantic search.
                 If not provided, a default embedding model for the memoryset will be used.
             value_column: Name of the column in the datasource that contains the memory values
-            label_column: Name of the column in the datasource that contains the memory labels
-
+            label_column: Name of the column in the datasource that contains the memory labels.
+                Must contain categorical values as integers or strings. String labels will be
+                converted to integers with the unique strings extracted as `label_names`
             score_column: Name of the column in the datasource that contains the memory scores
             source_id_column: Optional name of the column in the datasource that contains the ids in
                 the system of reference
@@ -759,9 +1030,9 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
                 so make sure it is concise and describes the contents of your memoryset not the
                 datasource or the embedding model.
             label_names: List of human-readable names for the labels in the memoryset, must match
-                the number of labels in the `label_column`. Will be automatically inferred if
-                [Dataset][datasets.Dataset] with a
-                labels is used as the datasource
+                the number of labels in the `label_column`. Will be automatically inferred if string
+                labels are provided or if a [Dataset][datasets.Dataset] with a
+                [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
             max_seq_length_override: Maximum sequence length of values in the memoryset, if the
                 value is longer than this it will be truncated, will default to the model's max
                 sequence length if not provided
@@ -775,7 +1046,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
                 `"error"`. Other option is `"open"` to open the existing memoryset.
             background: Whether to run the operation none blocking and return a job handle
             hidden: Whether the memoryset should be hidden
-
+            subsample: Optional number (int) of rows to insert or fraction (float in (0, 1]) of the
+                datasource to insert. Use to limit the size of the initial memoryset.
+            memory_type: Type of memoryset to create, defaults to `"LABELED"` if `label_column` is provided,
+                and `"SCORED"` if `score_column` is provided, must be specified for other cases.
         Returns:
             Handle to the new memoryset in the OrcaCloud
 
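With `subsample` and `memory_type` on the base `create()`, a memoryset can now be seeded from a slice of a large datasource; the subclasses pin `memory_type` to `"LABELED"`/`"SCORED"` further down in this diff. A hedged sketch (the top-level import path, memoryset name, and datasource handle are placeholders, and the first two positional arguments are assumed to be the name and the datasource, as the overloads suggest):

    from orca_sdk import LabeledMemoryset  # assumed export path

    datasource = ...  # an existing orca_sdk Datasource handle; its creation API is not part of this hunk
    memoryset = LabeledMemoryset.create(
        "tickets-sample",   # placeholder name
        datasource,
        label_column="label",
        subsample=0.1,      # float in (0, 1]: fraction of rows; an int would mean an absolute row count
    )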
@@ -786,9 +1060,6 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         if embedding_model is None:
             embedding_model = PretrainedEmbeddingModel.GTE_BASE
 
-        if label_column is None and score_column is None:
-            raise ValueError("label_column or score_column must be provided")
-
         existing = cls._handle_if_exists(
             name,
             if_exists=if_exists,
@@ -813,6 +1084,10 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             "index_params": index_params,
             "hidden": hidden,
         }
+        if memory_type is not None:
+            payload["memory_type"] = memory_type
+        if subsample is not None:
+            payload["subsample"] = subsample
         if prompt is not None:
             payload["prompt"] = prompt
         if isinstance(embedding_model, PretrainedEmbeddingModel):
@@ -823,7 +1098,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
             raise ValueError("Invalid embedding model")
         client = OrcaClient._resolve_client()
         response = client.POST("/memoryset", json=payload)
-        job = Job(response["
+        job = Job(response["insertion_job_id"], lambda: cls.open(response["id"]))
         return job if background else job.result()
 
     @overload
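The key read from the create response is now `insertion_job_id`, and the non-blocking path still wraps it in a `Job` that resolves to the opened memoryset. A brief sketch of that path, reusing the placeholders from the previous example:

    job = LabeledMemoryset.create("tickets-sample", datasource, background=True)
    memoryset = job.result()  # blocks until the insertion job finishes, then opens the new memoryset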
@@ -1516,7 +1791,7 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         client = OrcaClient._resolve_client()
         metadata = client.POST("/memoryset/{name_or_id}/clone", params={"name_or_id": self.id}, json=payload)
         job = Job(
-            metadata["
+            metadata["insertion_job_id"],
             lambda: self.open(metadata["id"]),
         )
         return job if background else job.result()
@@ -2189,11 +2464,11 @@ class MemorysetBase(Generic[MemoryT, MemoryLookupT], ABC):
         def get_analysis_result():
             client = OrcaClient._resolve_client()
             return client.GET(
-                "/memoryset/{name_or_id}/analysis/{
-                params={"name_or_id": self.id, "
+                "/memoryset/{name_or_id}/analysis/{analysis_job_id}",
+                params={"name_or_id": self.id, "analysis_job_id": analysis["job_id"]},
             )["results"]
 
-        job = Job(analysis["
+        job = Job(analysis["job_id"], get_analysis_result)
         return job if background else job.result()
 
     def get_potential_duplicate_groups(self) -> list[list[MemoryT]]:
@@ -2241,7 +2516,7 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
         *,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
-        label_column: str = "label",
+        label_column: str | None = "label",
         source_id_column: str | None = None,
         description: str | None = None,
         label_names: list[str] | None = None,
@@ -2253,6 +2528,7 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
         if_exists: CreateMode = "error",
         background: Literal[True],
         hidden: bool = False,
+        subsample: int | float | None = None,
     ) -> Job[Self]:
         pass
 
@@ -2265,7 +2541,7 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
         *,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
-        label_column: str = "label",
+        label_column: str | None = "label",
         source_id_column: str | None = None,
         description: str | None = None,
         label_names: list[str] | None = None,
@@ -2277,6 +2553,7 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
         if_exists: CreateMode = "error",
         background: Literal[False] = False,
         hidden: bool = False,
+        subsample: int | float | None = None,
     ) -> Self:
         pass
 
@@ -2288,7 +2565,7 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
         *,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
-        label_column: str = "label",
+        label_column: str | None = "label",
         source_id_column: str | None = None,
         description: str | None = None,
         label_names: list[str] | None = None,
@@ -2300,6 +2577,7 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
         if_exists: CreateMode = "error",
         background: bool = False,
         hidden: bool = False,
+        subsample: int | float | None = None,
     ) -> Self | Job[Self]:
         """
         Create a new labeled memoryset in the OrcaCloud
@@ -2313,17 +2591,19 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
             embedding_model: Embedding model to use for embedding memory values for semantic search.
                 If not provided, a default embedding model for the memoryset will be used.
             value_column: Name of the column in the datasource that contains the memory values
-            label_column: Name of the column in the datasource that contains the memory labels
-
+            label_column: Name of the column in the datasource that contains the memory labels.
+                Must contain categorical values as integers or strings. String labels will be
+                converted to integers with the unique strings extracted as `label_names`. To create
+                a memoryset with all none labels, set to `None`.
             source_id_column: Optional name of the column in the datasource that contains the ids in
                 the system of reference
             description: Optional description for the memoryset, this will be used in agentic flows,
                 so make sure it is concise and describes the contents of your memoryset not the
                 datasource or the embedding model.
             label_names: List of human-readable names for the labels in the memoryset, must match
-                the number of labels in the `label_column`. Will be automatically inferred if
-                [Dataset][datasets.Dataset] with a
-                labels is used as the datasource
+                the number of labels in the `label_column`. Will be automatically inferred if string
+                labels are provided or if a [Dataset][datasets.Dataset] with a
+                [`ClassLabel`][datasets.ClassLabel] feature for labels is used as the datasource
             max_seq_length_override: Maximum sequence length of values in the memoryset, if the
                 value is longer than this it will be truncated, will default to the model's max
                 sequence length if not provided
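Because `label_column` may now be `None`, a labeled memoryset can start out with all-`None` labels and be labeled later through the memory update API shown earlier in this diff. A hedged sketch with the same placeholder conventions as the earlier examples:

    unlabeled = LabeledMemoryset.create(
        "tickets-unlabeled",  # placeholder name
        datasource,           # assumed existing Datasource handle
        label_column=None,    # every memory is created with a None label
    )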
@@ -2363,6 +2643,8 @@ class LabeledMemoryset(MemorysetBase[LabeledMemory, LabeledMemoryLookup]):
             if_exists=if_exists,
             background=background,
             hidden=hidden,
+            subsample=subsample,
+            memory_type="LABELED",
         )
 
     def display_label_analysis(self):
@@ -2405,7 +2687,7 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
         *,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
-        score_column: str = "score",
+        score_column: str | None = "score",
         source_id_column: str | None = None,
         description: str | None = None,
         max_seq_length_override: int | None = None,
@@ -2416,6 +2698,7 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
         if_exists: CreateMode = "error",
         background: Literal[True],
         hidden: bool = False,
+        subsample: int | float | None = None,
     ) -> Job[Self]:
         pass
 
@@ -2427,7 +2710,7 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
         datasource: Datasource,
         *,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
-        score_column: str = "score",
+        score_column: str | None = "score",
         value_column: str = "value",
         source_id_column: str | None = None,
         description: str | None = None,
@@ -2439,6 +2722,7 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
         if_exists: CreateMode = "error",
         background: Literal[False] = False,
         hidden: bool = False,
+        subsample: int | float | None = None,
     ) -> Self:
         pass
 
@@ -2450,7 +2734,7 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
         *,
         embedding_model: FinetunedEmbeddingModel | PretrainedEmbeddingModel | None = None,
         value_column: str = "value",
-        score_column: str = "score",
+        score_column: str | None = "score",
         source_id_column: str | None = None,
         description: str | None = None,
         max_seq_length_override: int | None = None,
@@ -2461,6 +2745,7 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
         if_exists: CreateMode = "error",
         background: bool = False,
         hidden: bool = False,
+        subsample: int | float | None = None,
     ) -> Self | Job[Self]:
         """
         Create a new scored memoryset in the OrcaCloud
@@ -2474,7 +2759,8 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
             embedding_model: Embedding model to use for embedding memory values for semantic search.
                 If not provided, a default embedding model for the memoryset will be used.
             value_column: Name of the column in the datasource that contains the memory values
-            score_column: Name of the column in the datasource that contains the memory scores
+            score_column: Name of the column in the datasource that contains the memory scores. Must
+                contain numerical values. To create a memoryset with all none scores, set to `None`.
             source_id_column: Optional name of the column in the datasource that contains the ids in
                 the system of reference
             description: Optional description for the memoryset, this will be used in agentic flows,
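Symmetrically, `score_column=None` creates a scored memoryset whose memories have no scores yet. A one-step sketch, same placeholders as above:

    unscored = ScoredMemoryset.create(
        "tickets-unscored",  # placeholder name
        datasource,          # assumed existing Datasource handle
        score_column=None,   # every memory is created with a None score
    )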
@@ -2517,4 +2803,6 @@ class ScoredMemoryset(MemorysetBase[ScoredMemory, ScoredMemoryLookup]):
             if_exists=if_exists,
             background=background,
             hidden=hidden,
+            subsample=subsample,
+            memory_type="SCORED",
         )