orca-sdk 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff shows the changes between publicly available package versions as they were released to a supported registry. It is provided for informational purposes only.
orca_sdk/conftest.py CHANGED
@@ -288,6 +288,7 @@ def writable_memoryset(datasource: Datasource, api_key: str) -> Generator[Labele
         datasource=datasource,
         embedding_model=PretrainedEmbeddingModel.GTE_BASE,
         source_id_column="source_id",
+        partition_id_column="partition_id",
         max_seq_length_override=32,
         if_exists="open",
     )
@@ -297,13 +298,7 @@ def writable_memoryset(datasource: Datasource, api_key: str) -> Generator[Labele
     # Restore the memoryset to a clean state for the next test.
     with OrcaClient(api_key=api_key).use():
         if LabeledMemoryset.exists("test_writable_memoryset"):
-            memoryset.refresh()
-
-            memory_ids = [memoryset[i].memory_id for i in range(len(memoryset))]
-
-            if memory_ids:
-                memoryset.delete(memory_ids)
-            memoryset.refresh()
+            memoryset.truncate()
             assert len(memoryset) == 0
             memoryset.insert(SAMPLE_DATA)
         # If the test dropped the memoryset, do nothing — it will be recreated on the next use.
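
The seven-line refresh/collect/delete dance above collapses into a single `truncate()` call. Assuming `truncate()` empties the memoryset in one server-side call (which the assertion that follows it implies), the fixture reset now reduces to:

    # Sketch of the new reset path; truncate() is assumed to remove
    # every memory at once, as the assertion immediately after implies.
    memoryset.truncate()
    assert len(memoryset) == 0
    memoryset.insert(SAMPLE_DATA)
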
@@ -380,3 +375,88 @@ def partitioned_regression_model(readonly_partitioned_scored_memoryset: ScoredMe
         description="test_partitioned_regression_description",
     )
     return model
+
+
+@pytest.fixture(scope="function")
+def fully_partitioned_classification_resources() -> (
+    Generator[tuple[Datasource, LabeledMemoryset, ClassificationModel], None, None]
+):
+    data = [
+        {"value": "i love soup", "label": 0, "partition_id": "p1"},
+        {"value": "cats are cute", "label": 1, "partition_id": "p1"},
+        {"value": "soup is good", "label": 0, "partition_id": "p1"},
+        {"value": "i love cats", "label": 1, "partition_id": "p2"},
+        {"value": "everyone loves cats", "label": 1, "partition_id": "p2"},
+        {"value": "soup is good", "label": 0, "partition_id": "p1"},
+        {"value": "cats are amazing animals", "label": 1, "partition_id": "p2"},
+        {"value": "tomato soup is delicious", "label": 0, "partition_id": "p1"},
+        {"value": "cats love to play", "label": 1, "partition_id": "p2"},
+        {"value": "i enjoy eating soup", "label": 0, "partition_id": "p1"},
+        {"value": "my cat is fluffy", "label": 1, "partition_id": "p2"},
+        {"value": "chicken soup is tasty", "label": 0, "partition_id": "p1"},
+        {"value": "cats are playful pets", "label": 1, "partition_id": "p2"},
+        {"value": "soup warms the soul", "label": 0, "partition_id": "p1"},
+        {"value": "cats have soft fur", "label": 1, "partition_id": "p2"},
+        {"value": "vegetable soup is healthy", "label": 0, "partition_id": "p1"},
+    ]
+
+    datasource = None
+    memoryset = None
+    classification_model = None
+    try:
+        datasource = Datasource.from_list("fully_partitioned_classification_datasource", data)
+        memoryset = LabeledMemoryset.create(
+            "fully_partitioned_classification_memoryset",
+            datasource=datasource,
+            label_names=["soup", "cats"],
+            partition_id_column="partition_id",
+        )
+        classification_model = ClassificationModel.create("fully_partitioned_classification_model", memoryset=memoryset)
+        yield (datasource, memoryset, classification_model)
+    finally:
+        # Clean up in reverse order of creation
+        ClassificationModel.drop("fully_partitioned_classification_model", if_not_exists="ignore")
+        LabeledMemoryset.drop("fully_partitioned_classification_memoryset", if_not_exists="ignore")
+        Datasource.drop("fully_partitioned_classification_datasource", if_not_exists="ignore")
+
+
+@pytest.fixture(scope="function")
+def fully_partitioned_regression_resources() -> (
+    Generator[tuple[Datasource, ScoredMemoryset, RegressionModel], None, None]
+):
+    data = [
+        {"value": "i love soup", "score": 0.1, "partition_id": "p1"},
+        {"value": "cats are cute", "score": 0.9, "partition_id": "p1"},
+        {"value": "soup is good", "score": 0.1, "partition_id": "p1"},
+        {"value": "i love cats", "score": 0.9, "partition_id": "p2"},
+        {"value": "everyone loves cats", "score": 0.9, "partition_id": "p2"},
+        {"value": "soup is good", "score": 0.1, "partition_id": "p1"},
+        {"value": "cats are amazing animals", "score": 0.9, "partition_id": "p2"},
+        {"value": "tomato soup is delicious", "score": 0.1, "partition_id": "p1"},
+        {"value": "cats love to play", "score": 0.9, "partition_id": "p2"},
+        {"value": "i enjoy eating soup", "score": 0.1, "partition_id": "p1"},
+        {"value": "my cat is fluffy", "score": 0.9, "partition_id": "p2"},
+        {"value": "chicken soup is tasty", "score": 0.1, "partition_id": "p1"},
+        {"value": "cats are playful pets", "score": 0.9, "partition_id": "p2"},
+        {"value": "soup warms the soul", "score": 0.1, "partition_id": "p1"},
+        {"value": "cats have soft fur", "score": 0.9, "partition_id": "p2"},
+        {"value": "vegetable soup is healthy", "score": 0.1, "partition_id": "p1"},
+    ]
+
+    datasource = None
+    memoryset = None
+    regression_model = None
+    try:
+        datasource = Datasource.from_list("fully_partitioned_regression_datasource", data)
+        memoryset = ScoredMemoryset.create(
+            "fully_partitioned_regression_memoryset",
+            datasource=datasource,
+            partition_id_column="partition_id",
+        )
+        regression_model = RegressionModel.create("fully_partitioned_regression_model", memoryset=memoryset)
+        yield (datasource, memoryset, regression_model)
+    finally:
+        # Clean up in reverse order of creation
+        RegressionModel.drop("fully_partitioned_regression_model", if_not_exists="ignore")
+        ScoredMemoryset.drop("fully_partitioned_regression_memoryset", if_not_exists="ignore")
+        Datasource.drop("fully_partitioned_regression_datasource", if_not_exists="ignore")
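
Both new fixtures yield a `(datasource, memoryset, model)` tuple and tear everything down in a `finally` block, so cleanup runs even when a test fails mid-setup. A minimal consumer, as a sketch (the test body is hypothetical; only the tuple shape and row count come from the fixture above):

    def test_fully_partitioned_classification(fully_partitioned_classification_resources):
        datasource, memoryset, model = fully_partitioned_classification_resources
        # The fixture inserts sixteen rows split across partitions "p1" and "p2".
        assert len(memoryset) == 16
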
orca_sdk/credentials.py CHANGED
@@ -1,10 +1,8 @@
-import os
 from datetime import datetime
-from typing import Literal, NamedTuple
+from typing import Literal
 
 import httpx
-from httpx import ConnectError, Headers, HTTPTransport
-from typing_extensions import deprecated
+from httpx import ConnectError, Headers
 
 from .async_client import OrcaAsyncClient
 from .client import OrcaClient
@@ -132,9 +130,6 @@ class OrcaCredentials:
         client = OrcaClient._resolve_client()
         client.DELETE("/auth/api_key/{name_or_id}", params={"name_or_id": name})
 
-    # TODO: remove deprecated methods after 2026-01-01
-
-    @deprecated("Use `OrcaClient.api_key` instead")
     @staticmethod
     def set_api_key(api_key: str, check_validity: bool = True):
         """
@@ -158,21 +153,25 @@
         async_client = OrcaAsyncClient._resolve_client()
         async_client.api_key = api_key
 
-    @deprecated("Use `OrcaClient.base_url` instead")
     @staticmethod
     def get_api_url() -> str:
         """
         Get the base URL of the Orca API that is currently being used
         """
         client = OrcaClient._resolve_client()
+        async_client = OrcaAsyncClient._resolve_client()
+        if client.base_url != async_client.base_url:
+            raise RuntimeError("The base URL of the sync and async clients do not match")
         return str(client.base_url)
 
-    @deprecated("Use `OrcaClient.base_url` instead")
     @staticmethod
     def set_api_url(url: str, check_validity: bool = True):
         """
         Set the base URL for the Orca API
 
+        Note:
+            The base URL can also be provided by setting the `ORCA_API_URL` environment variable
+
         Args:
             url: The base URL to set
             check_validity: Whether to check if there is an API running at the given base URL
@@ -197,7 +196,6 @@
         if check_validity:
             OrcaCredentials.is_healthy()
 
-    @deprecated("Use `OrcaClient.headers` instead")
    @staticmethod
    def set_api_headers(headers: dict[str, str]):
        """
@@ -75,7 +75,7 @@ def test_create_api_key_already_exists():
         OrcaCredentials.create_api_key("orca_sdk_test")
 
 
-def test_set_api_key(api_key):
+def test_use_client(api_key):
     client = OrcaClient(api_key=str(uuid4()))
     with client.use():
         assert not OrcaCredentials.is_authenticated()
@@ -91,17 +91,14 @@ def test_set_base_url(api_key):
         assert client.base_url == "http://localhost:1583"
 
 
-# deprecated methods:
-
-
-def test_deprecated_set_api_key(api_key):
+def test_set_api_key(api_key):
     with OrcaClient(api_key=str(uuid4())).use():
         assert not OrcaCredentials.is_authenticated()
         OrcaCredentials.set_api_key(api_key)
         assert OrcaCredentials.is_authenticated()
 
 
-def test_deprecated_set_invalid_api_key(api_key):
+def test_set_invalid_api_key(api_key):
     with OrcaClient(api_key=api_key).use():
         assert OrcaCredentials.is_authenticated()
         with pytest.raises(ValueError, match="Invalid API key"):
@@ -109,13 +106,13 @@ def test_deprecated_set_invalid_api_key(api_key):
             assert not OrcaCredentials.is_authenticated()
 
 
-def test_deprecated_set_api_url(api_key):
+def test_set_api_url(api_key):
     with OrcaClient(api_key=api_key).use():
         OrcaCredentials.set_api_url("http://api.orcadb.ai")
         assert str(OrcaClient._resolve_client().base_url) == "http://api.orcadb.ai"
 
 
-def test_deprecated_set_invalid_api_url(api_key):
+def test_set_invalid_api_url(api_key):
     with OrcaClient(api_key=api_key).use():
         with pytest.raises(ValueError, match="No API found at http://localhost:1582"):
             OrcaCredentials.set_api_url("http://localhost:1582")
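
With the `@deprecated` decorators removed, `OrcaCredentials.set_api_key` and `set_api_url` are supported again alongside the scoped `OrcaClient(...).use()` pattern, and the tests were renamed to match. A minimal sketch of both styles (the top-level import paths are assumptions; the diff only shows relative imports):

    from orca_sdk.client import OrcaClient
    from orca_sdk.credentials import OrcaCredentials

    # Scoped: the client only applies inside the context manager
    with OrcaClient(api_key="my-api-key").use():
        assert OrcaCredentials.is_authenticated()

    # Global: updates the resolved sync and async clients in place
    OrcaCredentials.set_api_key("my-api-key")
    OrcaCredentials.set_api_url("http://localhost:1583", check_validity=False)

Note that `get_api_url()` now raises `RuntimeError` if the sync and async clients disagree on the base URL, and `set_api_url` documents the `ORCA_API_URL` environment variable as an alternative.
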
orca_sdk/datasource.py CHANGED
@@ -7,15 +7,10 @@ from datetime import datetime
 from io import BytesIO
 from os import PathLike
 from pathlib import Path
-from typing import Any, Literal, Union, cast
+from typing import TYPE_CHECKING, Any, Literal, Union, cast
 
-import pandas as pd
-import pyarrow as pa
 from datasets import Dataset, DatasetDict
 from httpx._types import FileTypes  # type: ignore
-from pyarrow import parquet
-from torch.utils.data import DataLoader as TorchDataLoader
-from torch.utils.data import Dataset as TorchDataset
 from tqdm.auto import tqdm
 
 from ._utils.common import CreateMode, DropMode
@@ -23,6 +18,13 @@ from ._utils.data_parsing import hf_dataset_from_torch
 from ._utils.tqdm_file_reader import TqdmFileReader
 from .client import DatasourceMetadata, OrcaClient
 
+if TYPE_CHECKING:
+    # These are peer dependencies that are used for types only
+    from pandas import DataFrame as PandasDataFrame  # type: ignore
+    from pyarrow import Table as PyArrowTable  # type: ignore
+    from torch.utils.data import DataLoader as TorchDataLoader  # type: ignore
+    from torch.utils.data import Dataset as TorchDataset  # type: ignore
+
 
 def _upload_files_to_datasource(
     name: str,
@@ -312,7 +314,7 @@ class Datasource:
 
     @classmethod
     def from_pandas(
-        cls, name: str, dataframe: pd.DataFrame, if_exists: CreateMode = "error", description: str | None = None
+        cls, name: str, dataframe: PandasDataFrame, if_exists: CreateMode = "error", description: str | None = None
     ) -> Datasource:
         """
         Create a new datasource from a pandas DataFrame
@@ -335,7 +337,7 @@
 
     @classmethod
     def from_arrow(
-        cls, name: str, pyarrow_table: pa.Table, if_exists: CreateMode = "error", description: str | None = None
+        cls, name: str, pyarrow_table: PyArrowTable, if_exists: CreateMode = "error", description: str | None = None
     ) -> Datasource:
         """
         Create a new datasource from a pyarrow Table
@@ -358,6 +360,9 @@
         if existing is not None:
             return existing
 
+        # peer dependency that is guaranteed to exist if the user provided a pyarrow table
+        from pyarrow import parquet  # type: ignore
+
         # Write to bytes buffer
         buffer = BytesIO()
         parquet.write_table(pyarrow_table, buffer)
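
The import changes apply the standard optional-peer-dependency pattern: heavyweight packages (pandas, pyarrow, torch) are imported only under `typing.TYPE_CHECKING` for annotations, and the concrete module is imported lazily inside the function that actually needs it. The pattern in isolation, as a sketch (`to_parquet_bytes` is an illustrative helper, not part of the SDK):

    from __future__ import annotations

    from io import BytesIO
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated only by type checkers, so pyarrow need not be
        # installed just to import this module.
        from pyarrow import Table as PyArrowTable


    def to_parquet_bytes(table: PyArrowTable) -> bytes:
        # Lazy import: guaranteed to succeed if the caller built a pyarrow Table.
        from pyarrow import parquet

        buffer = BytesIO()
        parquet.write_table(table, buffer)
        return buffer.getvalue()
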
@@ -5,8 +5,6 @@ from typing import cast
 from uuid import uuid4
 
 import numpy as np
-import pandas as pd
-import pyarrow as pa
 import pytest
 from datasets import Dataset
 
@@ -137,6 +135,8 @@ def test_from_dict():
 
 
 def test_from_pandas():
+    pd = pytest.importorskip("pandas")
+
     # Test creating datasource from pandas DataFrame
     df = pd.DataFrame(
         {
@@ -152,6 +152,8 @@
 
 
 def test_from_arrow():
+    pa = pytest.importorskip("pyarrow")
+
     # Test creating datasource from pyarrow Table
     table = pa.table(
         {
@@ -205,6 +207,8 @@ def test_from_dict_already_exists():
 
 
 def test_from_pandas_already_exists():
+    pd = pytest.importorskip("pandas")
+
     # Test the if_exists parameter with from_pandas
     df = pd.DataFrame({"column1": [1], "column2": ["a"]})
     name = f"test_pandas_exists_{uuid4()}"
@@ -224,6 +228,8 @@
 
 
 def test_from_arrow_already_exists():
+    pa = pytest.importorskip("pyarrow")
+
     # Test the if_exists parameter with from_arrow
     table = pa.table({"column1": [1], "column2": ["a"]})
     name = f"test_arrow_exists_{uuid4()}"
@@ -691,21 +691,26 @@ class FinetunedEmbeddingModel(EmbeddingModelBase):
         return False
 
     @classmethod
-    def drop(cls, name_or_id: str, *, if_not_exists: DropMode = "error"):
+    def drop(cls, name_or_id: str, *, if_not_exists: DropMode = "error", cascade: bool = False):
         """
         Delete the finetuned embedding model from the OrcaCloud
 
         Params:
             name_or_id: The name or id of the finetuned embedding model
+            if_not_exists: What to do if the finetuned embedding model does not exist, defaults to `"error"`.
+                Other option is `"ignore"` to do nothing if the model does not exist.
+            cascade: If True, also delete all associated memorysets and their predictive models.
+                Defaults to False.
 
         Raises:
             LookupError: If the finetuned embedding model does not exist and `if_not_exists` is `"error"`
+            RuntimeError: If the model has associated memorysets and cascade is False
         """
         try:
             client = OrcaClient._resolve_client()
             client.DELETE(
                 "/finetuned_embedding_model/{name_or_id}",
-                params={"name_or_id": name_or_id},
+                params={"name_or_id": name_or_id, "cascade": cascade},
             )
         except LookupError:
             if if_not_exists == "error":
@@ -172,6 +172,35 @@ def test_drop_finetuned_model(datasource: Datasource):
         FinetunedEmbeddingModel.open("finetuned_model_to_delete")
 
 
+def test_drop_finetuned_model_with_memoryset_cascade(datasource: Datasource):
+    """Test that cascade=False prevents deletion and cascade=True allows it."""
+    finetuned_model = PretrainedEmbeddingModel.DISTILBERT.finetune("finetuned_model_cascade_delete", datasource)
+    memoryset = LabeledMemoryset.create(
+        "test_memoryset_for_finetuned_model_cascade",
+        datasource=datasource,
+        embedding_model=finetuned_model,
+    )
+
+    # Verify memoryset exists and uses the finetuned model
+    assert LabeledMemoryset.open(memoryset.name) is not None
+    assert memoryset.embedding_model == finetuned_model
+
+    # Without cascade, deletion should fail
+    with pytest.raises(RuntimeError):
+        FinetunedEmbeddingModel.drop(finetuned_model.id, cascade=False)
+
+    # Model and memoryset should still exist
+    assert FinetunedEmbeddingModel.exists(finetuned_model.name)
+    assert LabeledMemoryset.exists(memoryset.name)
+
+    # With cascade, deletion should succeed
+    FinetunedEmbeddingModel.drop(finetuned_model.id, cascade=True)
+
+    # Both model and memoryset should be deleted
+    assert not FinetunedEmbeddingModel.exists(finetuned_model.name)
+    assert not LabeledMemoryset.exists(memoryset.name)
+
+
 def test_drop_finetuned_model_unauthenticated(unauthenticated_client, datasource: Datasource):
     with unauthenticated_client.use():
         with pytest.raises(ValueError, match="Invalid API key"):
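
For callers, the new `cascade` flag turns teardown of a finetuned model and everything built on it into a single call. A sketch based on the signature above:

    # Drops the model plus any memorysets (and their predictive models)
    # that depend on it. Without cascade=True, the delete raises
    # RuntimeError while dependent memorysets exist.
    FinetunedEmbeddingModel.drop(
        "finetuned_model_cascade_delete",
        cascade=True,
        if_not_exists="ignore",  # no-op if it was already dropped
    )
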