orca-sdk 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
orca_sdk/client.py CHANGED
@@ -135,6 +135,8 @@ class ClassificationEvaluationRequest(TypedDict):
135
135
  telemetry_tags: NotRequired[list[str] | None]
136
136
  subsample: NotRequired[int | float | None]
137
137
  ignore_unlabeled: NotRequired[bool]
138
+ datasource_partition_column: NotRequired[str | None]
139
+ partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
138
140
 
139
141
 
140
142
  class CleanupResponse(TypedDict):
@@ -315,12 +317,16 @@ class ListMemoriesRequest(TypedDict):
315
317
  offset: NotRequired[int]
316
318
  limit: NotRequired[int]
317
319
  filters: NotRequired[list[FilterItem]]
320
+ partition_id: NotRequired[str | None]
321
+ partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
318
322
 
319
323
 
320
324
  class LookupRequest(TypedDict):
321
325
  query: list[str]
322
326
  count: NotRequired[int]
323
327
  prompt: NotRequired[str | None]
328
+ partition_id: NotRequired[str | list[str | None] | None]
329
+ partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
324
330
 
325
331
 
326
332
  class LookupScoreMetrics(TypedDict):
@@ -547,16 +553,7 @@ class PredictiveModelUpdate(TypedDict):
547
553
 
548
554
 
549
555
  PretrainedEmbeddingModelName = Literal[
550
- "CLIP_BASE",
551
- "GTE_BASE",
552
- "CDE_SMALL",
553
- "DISTILBERT",
554
- "GTE_SMALL",
555
- "MXBAI_LARGE",
556
- "E5_LARGE",
557
- "QWEN2_1_5B",
558
- "BGE_BASE",
559
- "GIST_LARGE",
556
+ "CLIP_BASE", "GTE_BASE", "CDE_SMALL", "DISTILBERT", "GTE_SMALL", "MXBAI_LARGE", "E5_LARGE", "BGE_BASE", "GIST_LARGE"
560
557
  ]
561
558
 
562
559
 
@@ -586,6 +583,8 @@ class RegressionEvaluationRequest(TypedDict):
586
583
  telemetry_tags: NotRequired[list[str] | None]
587
584
  subsample: NotRequired[int | float | None]
588
585
  ignore_unlabeled: NotRequired[bool]
586
+ datasource_partition_column: NotRequired[str | None]
587
+ partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
589
588
 
590
589
 
591
590
  class RegressionMetrics(TypedDict):
@@ -629,6 +628,8 @@ class RegressionPredictionRequest(TypedDict):
629
628
  use_lookup_cache: NotRequired[bool]
630
629
  consistency_level: NotRequired[Literal["Bounded", "Session", "Strong", "Eventual"] | None]
631
630
  ignore_unlabeled: NotRequired[bool]
631
+ partition_ids: NotRequired[str | list[str | None] | None]
632
+ partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
632
633
 
633
634
 
634
635
  class ScorePredictionMemoryLookup(TypedDict):
@@ -1163,7 +1164,14 @@ class BootstrapClassificationModelRequest(TypedDict):
1163
1164
  num_examples_per_label: NotRequired[int]
1164
1165
 
1165
1166
 
1166
- class BootstrapClassificationModelResult(TypedDict):
1167
+ class BootstrapLabeledMemoryDataInput(TypedDict):
1168
+ model_description: str
1169
+ label_names: list[str]
1170
+ initial_examples: NotRequired[list[LabeledExample]]
1171
+ num_examples_per_label: NotRequired[int]
1172
+
1173
+
1174
+ class BootstrapLabeledMemoryDataResult(TypedDict):
1167
1175
  model_description: str
1168
1176
  label_names: list[str]
1169
1177
  model_name: str
@@ -1216,6 +1224,8 @@ class ClassificationPredictionRequest(TypedDict):
1216
1224
  use_lookup_cache: NotRequired[bool]
1217
1225
  consistency_level: NotRequired[Literal["Bounded", "Session", "Strong", "Eventual"] | None]
1218
1226
  ignore_unlabeled: NotRequired[bool]
1227
+ partition_ids: NotRequired[str | list[str | None] | None]
1228
+ partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
1219
1229
 
1220
1230
 
1221
1231
  class CloneMemorysetRequest(TypedDict):
@@ -1269,6 +1279,7 @@ class CreateMemorysetRequest(TypedDict):
1269
1279
  datasource_score_column: NotRequired[str | None]
1270
1280
  datasource_value_column: str
1271
1281
  datasource_source_id_column: NotRequired[str | None]
1282
+ datasource_partition_id_column: NotRequired[str | None]
1272
1283
  remove_duplicates: NotRequired[bool]
1273
1284
  pretrained_embedding_model_name: NotRequired[PretrainedEmbeddingModelName | None]
1274
1285
  finetuned_embedding_model_name_or_id: NotRequired[str | None]
@@ -1539,6 +1550,7 @@ class MemorysetAnalysisRequest(TypedDict):
1539
1550
  batch_size: NotRequired[int]
1540
1551
  clear_metrics: NotRequired[bool]
1541
1552
  configs: MemorysetAnalysisConfigs
1553
+ partition_filter_mode: NotRequired[Literal["ignore_partitions", "include_global", "exclude_global", "only_global"]]
1542
1554
 
1543
1555
 
1544
1556
  class MemorysetConceptMetrics(TypedDict):
@@ -1664,7 +1676,7 @@ class BootstrapClassificationModelMeta(TypedDict):
1664
1676
  datasource_meta: DatasourceMetadata
1665
1677
  memoryset_meta: MemorysetMetadata
1666
1678
  model_meta: ClassificationModelMetadata
1667
- agent_output: BootstrapClassificationModelResult
1679
+ agent_output: BootstrapLabeledMemoryDataResult
1668
1680
 
1669
1681
 
1670
1682
  class BootstrapClassificationModelResponse(TypedDict):
@@ -2554,7 +2566,7 @@ class OrcaClient(Client):
2554
2566
  timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
2555
2567
  extensions: RequestExtensions | None = None,
2556
2568
  ) -> BootstrapClassificationModelResponse:
2557
- """Get the status of a bootstrap classification model job"""
2569
+ """Get the status of a bootstrap labeled memory data job"""
2558
2570
  pass
2559
2571
 
2560
2572
  def GET(
@@ -3276,6 +3288,32 @@ class OrcaClient(Client):
3276
3288
  """Get row count from a specific datasource with optional filtering."""
3277
3289
  pass
3278
3290
 
3291
+ @overload
3292
+ def POST(
3293
+ self,
3294
+ path: Literal["/datasource/bootstrap_memory_data"],
3295
+ *,
3296
+ params: None = None,
3297
+ json: BootstrapLabeledMemoryDataInput,
3298
+ data: None = None,
3299
+ files: None = None,
3300
+ content: None = None,
3301
+ parse_as: Literal["json"] = "json",
3302
+ headers: HeaderTypes | None = None,
3303
+ cookies: CookieTypes | None = None,
3304
+ auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT,
3305
+ follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT,
3306
+ timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
3307
+ extensions: RequestExtensions | None = None,
3308
+ ) -> BootstrapLabeledMemoryDataResult:
3309
+ """
3310
+ Bootstrap memory data using an AI agent.
3311
+
3312
+ This endpoint uses the bootstrap labeled memory data agent to generate
3313
+ high-quality, diverse training examples for a classification model.
3314
+ """
3315
+ pass
3316
+
3279
3317
  @overload
3280
3318
  def POST(
3281
3319
  self,
@@ -3524,7 +3562,7 @@ class OrcaClient(Client):
3524
3562
  """
3525
3563
  Bootstrap a classification model by creating a memoryset with generated memories and a classification model.
3526
3564
 
3527
- This endpoint uses the bootstrap_classification_model agent to generate:
3565
+ This endpoint uses the bootstrap_labeled_memory_data agent to generate:
3528
3566
  1. Memoryset configuration with appropriate settings
3529
3567
  2. Model configuration with optimal parameters
3530
3568
  3. High-quality training memories for each label
orca_sdk/conftest.py CHANGED
@@ -99,34 +99,105 @@ def label_names():
99
99
 
100
100
 
101
101
  SAMPLE_DATA = [
102
- {"value": "i love soup", "label": 0, "key": "g1", "score": 0.1, "source_id": "s1"},
103
- {"value": "cats are cute", "label": 1, "key": "g1", "score": 0.9, "source_id": "s2"},
104
- {"value": "soup is good", "label": 0, "key": "g1", "score": 0.1, "source_id": "s3"},
105
- {"value": "i love cats", "label": 1, "key": "g1", "score": 0.9, "source_id": "s4"},
106
- {"value": "everyone loves cats", "label": 1, "key": "g1", "score": 0.9, "source_id": "s5"},
107
- {"value": "soup is great for the winter", "label": 0, "key": "g1", "score": 0.1, "source_id": "s6"},
108
- {"value": "hot soup on a rainy day!", "label": 0, "key": "g1", "score": 0.1, "source_id": "s7"},
109
- {"value": "cats sleep all day", "label": 1, "key": "g1", "score": 0.9, "source_id": "s8"},
110
- {"value": "homemade soup recipes", "label": 0, "key": "g1", "score": 0.1, "source_id": "s9"},
111
- {"value": "cats purr when happy", "label": 1, "key": "g2", "score": 0.9, "source_id": "s10"},
112
- {"value": "chicken noodle soup is classic", "label": 0, "key": "g1", "score": 0.1, "source_id": "s11"},
113
- {"value": "kittens are baby cats", "label": 1, "key": "g2", "score": 0.9, "source_id": "s12"},
114
- {"value": "soup can be served cold too", "label": 0, "key": "g1", "score": 0.1, "source_id": "s13"},
115
- {"value": "cats have nine lives", "label": 1, "key": "g2", "score": 0.9, "source_id": "s14"},
116
- {"value": "tomato soup with grilled cheese", "label": 0, "key": "g1", "score": 0.1, "source_id": "s15"},
117
- {"value": "cats are independent animals", "label": 1, "key": "g2", "score": 0.9, "source_id": "s16"},
118
- {"value": "the beach is always fun", "label": None, "key": "g3", "score": None, "source_id": "s17"},
119
- {"value": "i love the beach", "label": None, "key": "g3", "score": None, "source_id": "s18"},
120
- {"value": "the ocean is healing", "label": None, "key": "g3", "score": None, "source_id": "s19"},
102
+ {"value": "i love soup", "label": 0, "key": "g1", "score": 0.1, "source_id": "s1", "partition_id": "p1"},
103
+ {"value": "cats are cute", "label": 1, "key": "g1", "score": 0.9, "source_id": "s2", "partition_id": "p1"},
104
+ {"value": "soup is good", "label": 0, "key": "g1", "score": 0.1, "source_id": "s3", "partition_id": "p1"},
105
+ {"value": "i love cats", "label": 1, "key": "g1", "score": 0.9, "source_id": "s4", "partition_id": "p1"},
106
+ {"value": "everyone loves cats", "label": 1, "key": "g1", "score": 0.9, "source_id": "s5", "partition_id": "p1"},
107
+ {
108
+ "value": "soup is great for the winter",
109
+ "label": 0,
110
+ "key": "g1",
111
+ "score": 0.1,
112
+ "source_id": "s6",
113
+ "partition_id": "p1",
114
+ },
115
+ {
116
+ "value": "hot soup on a rainy day!",
117
+ "label": 0,
118
+ "key": "g1",
119
+ "score": 0.1,
120
+ "source_id": "s7",
121
+ "partition_id": "p1",
122
+ },
123
+ {"value": "cats sleep all day", "label": 1, "key": "g1", "score": 0.9, "source_id": "s8", "partition_id": "p1"},
124
+ {"value": "homemade soup recipes", "label": 0, "key": "g1", "score": 0.1, "source_id": "s9", "partition_id": "p2"},
125
+ {"value": "cats purr when happy", "label": 1, "key": "g2", "score": 0.9, "source_id": "s10", "partition_id": "p2"},
126
+ {
127
+ "value": "chicken noodle soup is classic",
128
+ "label": 0,
129
+ "key": "g1",
130
+ "score": 0.1,
131
+ "source_id": "s11",
132
+ "partition_id": "p2",
133
+ },
134
+ {"value": "kittens are baby cats", "label": 1, "key": "g2", "score": 0.9, "source_id": "s12", "partition_id": "p2"},
135
+ {
136
+ "value": "soup can be served cold too",
137
+ "label": 0,
138
+ "key": "g1",
139
+ "score": 0.1,
140
+ "source_id": "s13",
141
+ "partition_id": "p2",
142
+ },
143
+ {"value": "cats have nine lives", "label": 1, "key": "g2", "score": 0.9, "source_id": "s14", "partition_id": "p2"},
144
+ {
145
+ "value": "tomato soup with grilled cheese",
146
+ "label": 0,
147
+ "key": "g1",
148
+ "score": 0.1,
149
+ "source_id": "s15",
150
+ "partition_id": "p2",
151
+ },
152
+ {
153
+ "value": "cats are independent animals",
154
+ "label": 1,
155
+ "key": "g2",
156
+ "score": 0.9,
157
+ "source_id": "s16",
158
+ "partition_id": None,
159
+ },
160
+ {
161
+ "value": "the beach is always fun",
162
+ "label": None,
163
+ "key": "g3",
164
+ "score": None,
165
+ "source_id": "s17",
166
+ "partition_id": None,
167
+ },
168
+ {"value": "i love the beach", "label": None, "key": "g3", "score": None, "source_id": "s18", "partition_id": None},
169
+ {
170
+ "value": "the ocean is healing",
171
+ "label": None,
172
+ "key": "g3",
173
+ "score": None,
174
+ "source_id": "s19",
175
+ "partition_id": None,
176
+ },
121
177
  {
122
178
  "value": "sandy feet, sand between my toes at the beach",
123
179
  "label": None,
124
180
  "key": "g3",
125
181
  "score": None,
126
182
  "source_id": "s20",
183
+ "partition_id": None,
184
+ },
185
+ {
186
+ "value": "i am such a beach bum",
187
+ "label": None,
188
+ "key": "g3",
189
+ "score": None,
190
+ "source_id": "s21",
191
+ "partition_id": None,
192
+ },
193
+ {
194
+ "value": "i will always want to be at the beach",
195
+ "label": None,
196
+ "key": "g3",
197
+ "score": None,
198
+ "source_id": "s22",
199
+ "partition_id": None,
127
200
  },
128
- {"value": "i am such a beach bum", "label": None, "key": "g3", "score": None, "source_id": "s21"},
129
- {"value": "i will always want to be at the beach", "label": None, "key": "g3", "score": None, "source_id": "s22"},
130
201
  ]
131
202
 
132
203
 
@@ -141,6 +212,7 @@ def hf_dataset(label_names: list[str]) -> Dataset:
141
212
  "key": Value("string"),
142
213
  "score": Value("float"),
143
214
  "source_id": Value("string"),
215
+ "partition_id": Value("string"),
144
216
  }
145
217
  ),
146
218
  )
@@ -186,6 +258,18 @@ def readonly_memoryset(datasource: Datasource) -> LabeledMemoryset:
186
258
  return memoryset
187
259
 
188
260
 
261
+ @pytest.fixture(scope="session")
262
+ def readonly_partitioned_memoryset(datasource: Datasource) -> LabeledMemoryset:
263
+ memoryset = LabeledMemoryset.create(
264
+ "test_readonly_partitioned_memoryset",
265
+ datasource=datasource,
266
+ embedding_model=PretrainedEmbeddingModel.GTE_BASE,
267
+ source_id_column="source_id",
268
+ partition_id_column="partition_id",
269
+ )
270
+ return memoryset
271
+
272
+
189
273
  @pytest.fixture(scope="function")
190
274
  def writable_memoryset(datasource: Datasource, api_key: str) -> Generator[LabeledMemoryset, None, None]:
191
275
  """
@@ -237,6 +321,18 @@ def classification_model(readonly_memoryset: LabeledMemoryset) -> Classification
237
321
  return model
238
322
 
239
323
 
324
+ @pytest.fixture(scope="session")
325
+ def partitioned_classification_model(readonly_partitioned_memoryset: LabeledMemoryset) -> ClassificationModel:
326
+ model = ClassificationModel.create(
327
+ "test_partitioned_classification_model",
328
+ readonly_partitioned_memoryset,
329
+ num_classes=2,
330
+ memory_lookup_count=3,
331
+ description="test_partitioned_description",
332
+ )
333
+ return model
334
+
335
+
240
336
  # Add scored memoryset and regression model fixtures
241
337
  @pytest.fixture(scope="session")
242
338
  def scored_memoryset(datasource: Datasource) -> ScoredMemoryset:
@@ -261,3 +357,26 @@ def regression_model(scored_memoryset: ScoredMemoryset) -> RegressionModel:
261
357
  description="test_regression_description",
262
358
  )
263
359
  return model
360
+
361
+
362
+ @pytest.fixture(scope="session")
363
+ def readonly_partitioned_scored_memoryset(datasource: Datasource) -> ScoredMemoryset:
364
+ memoryset = ScoredMemoryset.create(
365
+ "test_readonly_partitioned_scored_memoryset",
366
+ datasource=datasource,
367
+ embedding_model=PretrainedEmbeddingModel.GTE_BASE,
368
+ source_id_column="source_id",
369
+ partition_id_column="partition_id",
370
+ )
371
+ return memoryset
372
+
373
+
374
+ @pytest.fixture(scope="session")
375
+ def partitioned_regression_model(readonly_partitioned_scored_memoryset: ScoredMemoryset) -> RegressionModel:
376
+ model = RegressionModel.create(
377
+ "test_partitioned_regression_model",
378
+ readonly_partitioned_scored_memoryset,
379
+ memory_lookup_count=3,
380
+ description="test_partitioned_regression_description",
381
+ )
382
+ return model
@@ -340,7 +340,6 @@ class PretrainedEmbeddingModel(EmbeddingModelBase):
340
340
  - **`E5_LARGE`**: E5-Large instruction-tuned embedding model from Hugging Face ([intfloat/multilingual-e5-large-instruct](https://huggingface.co/intfloat/multilingual-e5-large-instruct))
341
341
  - **`GIST_LARGE`**: GIST-Large embedding model from Hugging Face ([avsolatorio/GIST-large-Embedding-v0](https://huggingface.co/avsolatorio/GIST-large-Embedding-v0))
342
342
  - **`MXBAI_LARGE`**: Mixedbread's Large embedding model from Hugging Face ([mixedbread-ai/mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1))
343
- - **`QWEN2_1_5B`**: Alibaba's Qwen2-1.5B instruction-tuned embedding model from Hugging Face ([Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct))
344
343
  - **`BGE_BASE`**: BAAI's BGE-Base instruction-tuned embedding model from Hugging Face ([BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5))
345
344
 
346
345
  **Instruction Support:**
@@ -373,7 +372,6 @@ class PretrainedEmbeddingModel(EmbeddingModelBase):
373
372
  E5_LARGE = _ModelDescriptor("E5_LARGE")
374
373
  GIST_LARGE = _ModelDescriptor("GIST_LARGE")
375
374
  MXBAI_LARGE = _ModelDescriptor("MXBAI_LARGE")
376
- QWEN2_1_5B = _ModelDescriptor("QWEN2_1_5B")
377
375
  BGE_BASE = _ModelDescriptor("BGE_BASE")
378
376
 
379
377
  name: PretrainedEmbeddingModelName