orca_sdk-0.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. orca_sdk/__init__.py +30 -0
  2. orca_sdk/_shared/__init__.py +10 -0
  3. orca_sdk/_shared/metrics.py +634 -0
  4. orca_sdk/_shared/metrics_test.py +570 -0
  5. orca_sdk/_utils/__init__.py +0 -0
  6. orca_sdk/_utils/analysis_ui.py +196 -0
  7. orca_sdk/_utils/analysis_ui_style.css +51 -0
  8. orca_sdk/_utils/auth.py +65 -0
  9. orca_sdk/_utils/auth_test.py +31 -0
  10. orca_sdk/_utils/common.py +37 -0
  11. orca_sdk/_utils/data_parsing.py +129 -0
  12. orca_sdk/_utils/data_parsing_test.py +244 -0
  13. orca_sdk/_utils/pagination.py +126 -0
  14. orca_sdk/_utils/pagination_test.py +132 -0
  15. orca_sdk/_utils/prediction_result_ui.css +18 -0
  16. orca_sdk/_utils/prediction_result_ui.py +110 -0
  17. orca_sdk/_utils/tqdm_file_reader.py +12 -0
  18. orca_sdk/_utils/value_parser.py +45 -0
  19. orca_sdk/_utils/value_parser_test.py +39 -0
  20. orca_sdk/async_client.py +4104 -0
  21. orca_sdk/classification_model.py +1165 -0
  22. orca_sdk/classification_model_test.py +887 -0
  23. orca_sdk/client.py +4096 -0
  24. orca_sdk/conftest.py +382 -0
  25. orca_sdk/credentials.py +217 -0
  26. orca_sdk/credentials_test.py +121 -0
  27. orca_sdk/datasource.py +576 -0
  28. orca_sdk/datasource_test.py +463 -0
  29. orca_sdk/embedding_model.py +712 -0
  30. orca_sdk/embedding_model_test.py +206 -0
  31. orca_sdk/job.py +343 -0
  32. orca_sdk/job_test.py +108 -0
  33. orca_sdk/memoryset.py +3811 -0
  34. orca_sdk/memoryset_test.py +1150 -0
  35. orca_sdk/regression_model.py +841 -0
  36. orca_sdk/regression_model_test.py +595 -0
  37. orca_sdk/telemetry.py +742 -0
  38. orca_sdk/telemetry_test.py +119 -0
  39. orca_sdk-0.1.9.dist-info/METADATA +98 -0
  40. orca_sdk-0.1.9.dist-info/RECORD +41 -0
  41. orca_sdk-0.1.9.dist-info/WHEEL +4 -0
orca_sdk/credentials_test.py ADDED
@@ -0,0 +1,121 @@
+ from uuid import uuid4
+
+ import pytest
+
+ from .client import OrcaClient
+ from .credentials import OrcaCredentials
+
+
+ def test_is_authenticated():
+     assert OrcaCredentials.is_authenticated()
+
+
+ def test_is_authenticated_false(unauthenticated_client):
+     with unauthenticated_client.use():
+         assert not OrcaCredentials.is_authenticated()
+
+
+ def test_is_healthy():
+     assert OrcaCredentials.is_healthy()
+
+
+ def test_is_healthy_false(api_key):
+     with OrcaClient(api_key=api_key, base_url="http://localhost:1582").use():
+         assert not OrcaCredentials.is_healthy()
+
+
+ def test_list_api_keys():
+     api_keys = OrcaCredentials.list_api_keys()
+     assert len(api_keys) >= 1
+     assert "orca_sdk_test" in [api_key.name for api_key in api_keys]
+
+
+ def test_list_api_keys_unauthenticated(unauthenticated_client):
+     with unauthenticated_client.use():
+         with pytest.raises(ValueError, match="Invalid API key"):
+             OrcaCredentials.list_api_keys()
+
+
+ def test_manage_api_key():
+     api_key_name = f"orca_sdk_test_{uuid4().hex[:8]}"
+     api_key = OrcaCredentials.create_api_key(api_key_name)
+     assert api_key is not None
+     assert len(api_key) > 0
+     assert api_key_name in [aki.name for aki in OrcaCredentials.list_api_keys()]
+     OrcaCredentials.revoke_api_key(api_key_name)
+     assert api_key_name not in [aki.name for aki in OrcaCredentials.list_api_keys()]
+
+
+ def test_create_api_key_unauthenticated(unauthenticated_client):
+     with unauthenticated_client.use():
+         with pytest.raises(ValueError, match="Invalid API key"):
+             OrcaCredentials.create_api_key(f"orca_sdk_test_{uuid4().hex[:8]}")
+
+
+ def test_create_api_key_unauthorized(predict_only_client):
+     with predict_only_client.use():
+         with pytest.raises(PermissionError):
+             OrcaCredentials.create_api_key(f"orca_sdk_test_{uuid4().hex[:8]}")
+
+
+ def test_revoke_api_key_unauthenticated(unauthenticated_client):
+     with unauthenticated_client.use():
+         with pytest.raises(ValueError, match="Invalid API key"):
+             OrcaCredentials.revoke_api_key(f"orca_sdk_test_{uuid4().hex[:8]}")
+
+
+ def test_revoke_api_key_unauthorized(predict_only_client):
+     with predict_only_client.use():
+         with pytest.raises(PermissionError):
+             OrcaCredentials.revoke_api_key(f"orca_sdk_test_{uuid4().hex[:8]}")
+
+
+ def test_create_api_key_already_exists():
+     with pytest.raises(ValueError, match="API key with this name already exists"):
+         OrcaCredentials.create_api_key("orca_sdk_test")
+
+
+ def test_set_api_key(api_key):
+     client = OrcaClient(api_key=str(uuid4()))
+     with client.use():
+         assert not OrcaCredentials.is_authenticated()
+         client.api_key = api_key
+         assert client.api_key == api_key
+         assert OrcaCredentials.is_authenticated()
+
+
+ def test_set_base_url(api_key):
+     client = OrcaClient(base_url="http://localhost:1582")
+     assert client.base_url == "http://localhost:1582"
+     client.base_url = "http://localhost:1583"
+     assert client.base_url == "http://localhost:1583"
+
+
+ # deprecated methods:
+
+
+ def test_deprecated_set_api_key(api_key):
+     with OrcaClient(api_key=str(uuid4())).use():
+         assert not OrcaCredentials.is_authenticated()
+         OrcaCredentials.set_api_key(api_key)
+         assert OrcaCredentials.is_authenticated()
+
+
+ def test_deprecated_set_invalid_api_key(api_key):
+     with OrcaClient(api_key=api_key).use():
+         assert OrcaCredentials.is_authenticated()
+         with pytest.raises(ValueError, match="Invalid API key"):
+             OrcaCredentials.set_api_key(str(uuid4()))
+         assert not OrcaCredentials.is_authenticated()
+
+
+ def test_deprecated_set_api_url(api_key):
+     with OrcaClient(api_key=api_key).use():
+         OrcaCredentials.set_api_url("http://api.orcadb.ai")
+         assert str(OrcaClient._resolve_client().base_url) == "http://api.orcadb.ai"
+
+
+ def test_deprecated_set_invalid_api_url(api_key):
+     with OrcaClient(api_key=api_key).use():
+         with pytest.raises(ValueError, match="No API found at http://localhost:1582"):
+             OrcaCredentials.set_api_url("http://localhost:1582")
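
A minimal usage sketch distilled from the tests above: scoping SDK calls to a client via `use()` and managing API keys. The top-level import path and the exact return types are assumptions inferred from the test file, not documented API.

```python
# Hypothetical end-to-end sketch based on credentials_test.py; the import path
# and key-object attributes are assumptions.
from orca_sdk import OrcaClient, OrcaCredentials

client = OrcaClient(api_key="<your-api-key>", base_url="https://api.orcadb.ai")
with client.use():  # scope all SDK calls in this block to `client`
    assert OrcaCredentials.is_authenticated() and OrcaCredentials.is_healthy()
    secret = OrcaCredentials.create_api_key("ci_key")  # the secret is returned once
    print([key.name for key in OrcaCredentials.list_api_keys()])
    OrcaCredentials.revoke_api_key("ci_key")  # revoke by name
```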
orca_sdk/datasource.py ADDED
@@ -0,0 +1,576 @@
+ from __future__ import annotations
+
+ import logging
+ import tempfile
+ import zipfile
+ from datetime import datetime
+ from io import BytesIO
+ from os import PathLike
+ from pathlib import Path
+ from typing import Any, Literal, Union, cast
+
+ import pandas as pd
+ import pyarrow as pa
+ from datasets import Dataset, DatasetDict
+ from httpx._types import FileTypes  # type: ignore
+ from pyarrow import parquet
+ from torch.utils.data import DataLoader as TorchDataLoader
+ from torch.utils.data import Dataset as TorchDataset
+ from tqdm.auto import tqdm
+
+ from ._utils.common import CreateMode, DropMode
+ from ._utils.data_parsing import hf_dataset_from_torch
+ from ._utils.tqdm_file_reader import TqdmFileReader
+ from .client import DatasourceMetadata, OrcaClient
+
+
+ def _upload_files_to_datasource(
+     name: str,
+     file_paths: list[Path],
+     description: str | None = None,
+ ) -> DatasourceMetadata:
+     """
+     Helper function to upload files to create a datasource using manual HTTP requests.
+
+     This bypasses the generated client because it doesn't handle file uploads properly.
+
+     Params:
+         name: Name for the datasource
+         file_paths: List of file paths to upload
+         description: Optional description for the datasource
+
+     Returns:
+         Metadata for the created datasource
+     """
+     files: list[tuple[Literal["files"], FileTypes]] = []
+
+     # Calculate total size for all files
+     total_size = sum(file_path.stat().st_size for file_path in file_paths)
+
+     with tqdm(total=total_size, unit="B", unit_scale=True, desc="Uploading") as pbar:
+         for file_path in file_paths:
+             buffered_reader = open(file_path, "rb")
+             tqdm_reader = TqdmFileReader(buffered_reader, pbar)
+             files.append(("files", (file_path.name, cast(bytes, tqdm_reader))))
+
+         # Use manual HTTP request for file uploads
+         client = OrcaClient._resolve_client()
+         metadata = client.POST(
+             "/datasource/upload",
+             files=files,
+             data={"name": name, "description": description},
+         )
+
+     return metadata
+
+
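For context, the multipart request this helper issues is roughly equivalent to the plain-httpx sketch below; the real call goes through the resolved client's `POST`, and the base URL and auth header here are placeholders, not the SDK's actual wire format.

```python
import httpx
from pathlib import Path

# A minimal sketch of the multipart upload, assuming an httpx-style API;
# URL and auth scheme are placeholders.
def upload_files(name: str, file_paths: list[Path], api_key: str) -> dict:
    files = [("files", (p.name, p.open("rb"))) for p in file_paths]
    response = httpx.post(
        "https://api.orcadb.ai/datasource/upload",  # placeholder base URL
        headers={"Authorization": f"Bearer {api_key}"},  # auth header assumed
        files=files,
        data={"name": name},
    )
    response.raise_for_status()
    return response.json()
```

Wrapping each reader in `TqdmFileReader` is what lets the progress bar advance as httpx streams the multipart body, rather than only when a whole file finishes.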
+ def _handle_existing_datasource(name: str, if_exists: CreateMode) -> Union["Datasource", None]:
+     """
+     Helper function to handle the common pattern of checking if a datasource exists
+     and taking action based on the if_exists parameter.
+
+     Params:
+         name: Name of the datasource to check
+         if_exists: What to do if a datasource with the same name already exists
+
+     Returns:
+         Datasource instance if opening existing, None if should proceed with creation
+
+     Raises:
+         ValueError: If the datasource already exists and if_exists is "error"
+     """
+     if Datasource.exists(name):
+         if if_exists == "error":
+             raise ValueError(f"Datasource with name {name} already exists")
+         elif if_exists == "open":
+             return Datasource.open(name)
+     return None
+
+
+ class Datasource:
+     """
+     A handle to a datasource in the OrcaCloud
+
+     A Datasource is a collection of data saved to the OrcaCloud that can be used to create a [`Memoryset`][orca_sdk.LabeledMemoryset].
+     It can be created from a Hugging Face Dataset, a PyTorch DataLoader or Dataset, a list of dictionaries, a dictionary of columns, a pandas DataFrame, a pyarrow Table, or a local file.
+
+     Attributes:
+         id: Unique identifier for the datasource
+         name: Unique name of the datasource
+         description: Optional description of the datasource
+         length: Number of rows in the datasource
+         created_at: When the datasource was created
+         updated_at: When the datasource was last updated
+         columns: Dictionary of column names and types
+     """
+
+     id: str
+     name: str
+     description: str | None
+     length: int
+     created_at: datetime
+     updated_at: datetime
+     columns: dict[str, str]
+
+     def __init__(self, metadata: DatasourceMetadata):
+         # for internal use only, do not document
+         self.id = metadata["id"]
+         self.name = metadata["name"]
+         self.length = metadata["length"]
+         self.created_at = datetime.fromisoformat(metadata["created_at"])
+         self.updated_at = datetime.fromisoformat(metadata["updated_at"])
+         self.description = metadata["description"]
+         self.columns = {
+             column["name"]: (
+                 f"enum({', '.join(f'{option!r}' for option in column['enum_options'] or []) if 'enum_options' in column else ''})"
+                 if column["type"] == "ENUM"
+                 else "str" if column["type"] == "STRING" else column["type"].lower()
+             )
+             for column in metadata["columns"]
+         }
+
+     def __eq__(self, other) -> bool:
+         return isinstance(other, Datasource) and self.id == other.id
+
+     def __repr__(self) -> str:
+         return (
+             "Datasource({\n"
+             + f"    name: '{self.name}',\n"
+             + f"    length: {self.length},\n"
+             + "    columns: {\n        "
+             + "\n        ".join([f"{k}: {v}" for k, v in self.columns.items()])
+             + "\n    }\n"
+             + "})"
+         )
+
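With the single braces in `__repr__` (the original `{{`/`}}` in these plain, non-f strings would have printed doubled braces), a small datasource renders along these lines; the output is illustrative and assumes the server infers `str`/`int` column types:

```python
>>> print(Datasource.from_list("my_datasource", [{"text": "Hello, world!", "label": 1}, {"text": "Goodbye", "label": 0}]))
Datasource({
    name: 'my_datasource',
    length: 2,
    columns: {
        text: str
        label: int
    }
})
```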
+     @classmethod
+     def from_hf_dataset(
+         cls, name: str, dataset: Dataset, if_exists: CreateMode = "error", description: str | None = None
+     ) -> Datasource:
+         """
+         Create a new datasource from a Hugging Face Dataset
+
+         Params:
+             name: Required name for the new datasource (must be unique)
+             dataset: The Hugging Face Dataset to create the datasource from
+             if_exists: What to do if a datasource with the same name already exists, defaults to
+                 `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource
+
+         Returns:
+             A handle to the new datasource in the OrcaCloud
+
+         Raises:
+             ValueError: If the datasource already exists and if_exists is `"error"`
+         """
+         # Check if datasource already exists and handle accordingly
+         existing = _handle_existing_datasource(name, if_exists)
+         if existing is not None:
+             return existing
+
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             dataset.save_to_disk(tmp_dir)
+
+             # Get all file paths in the directory
+             file_paths = list(Path(tmp_dir).iterdir())
+
+             # Use the helper function to upload files
+             metadata = _upload_files_to_datasource(name, file_paths, description)
+             return cls(metadata=metadata)
+
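A short sketch of a typical `from_hf_dataset` call: build a Dataset in memory and push it up, with `if_exists="open"` making the call idempotent across runs. The datasource name is hypothetical.

```python
from datasets import Dataset

# Build a tiny Hugging Face Dataset and upload it as a datasource.
dataset = Dataset.from_dict({"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
datasource = Datasource.from_hf_dataset("my_hf_datasource", dataset, if_exists="open")
print(len(datasource), datasource.columns)
```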
+     @classmethod
+     def from_hf_dataset_dict(
+         cls,
+         name: str,
+         dataset_dict: DatasetDict,
+         if_exists: CreateMode = "error",
+         description: dict[str, str | None] | str | None = None,
+     ) -> dict[str, Datasource]:
+         """
+         Create datasources from a Hugging Face DatasetDict
+
+         Params:
+             name: Name prefix for the new datasources, will be suffixed with the dataset name
+             dataset_dict: The Hugging Face DatasetDict to create the datasources from
+             if_exists: What to do if a datasource with the same name already exists, defaults to
+                 `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasources, can be a string or a dictionary of dataset names to descriptions
+
+         Returns:
+             A dictionary of datasource handles, keyed by the dataset name
+
+         Raises:
+             ValueError: If a datasource already exists and if_exists is `"error"`
+         """
+         if description is None or isinstance(description, str):
+             description = {str(dataset_name): description for dataset_name in dataset_dict.keys()}
+         return {
+             str(dataset_name): cls.from_hf_dataset(
+                 f"{name}_{dataset_name}", dataset, if_exists=if_exists, description=description[str(dataset_name)]
+             )
+             for dataset_name, dataset in dataset_dict.items()
+         }
+
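Each split becomes its own datasource suffixed with the split name, so a hypothetical two-split DatasetDict named `reviews` yields `reviews_train` and `reviews_test`:

```python
from datasets import Dataset, DatasetDict

# Hypothetical splits; names and data are illustrative.
splits = DatasetDict({
    "train": Dataset.from_dict({"text": ["a", "b"], "label": [0, 1]}),
    "test": Dataset.from_dict({"text": ["c"], "label": [1]}),
})
datasources = Datasource.from_hf_dataset_dict("reviews", splits, if_exists="open")
assert set(datasources) == {"train", "test"}
```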
+     @classmethod
+     def from_pytorch(
+         cls,
+         name: str,
+         torch_data: TorchDataLoader | TorchDataset,
+         column_names: list[str] | None = None,
+         if_exists: CreateMode = "error",
+         description: str | None = None,
+     ) -> Datasource:
+         """
+         Create a new datasource from a PyTorch DataLoader or Dataset
+
+         Params:
+             name: Required name for the new datasource (must be unique)
+             torch_data: The PyTorch DataLoader or Dataset to create the datasource from
+             column_names: If the provided dataset or data loader returns unnamed tuples, this
+                 argument must be provided to specify the names of the columns.
+             if_exists: What to do if a datasource with the same name already exists, defaults to
+                 `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource
+
+         Returns:
+             A handle to the new datasource in the OrcaCloud
+
+         Raises:
+             ValueError: If the datasource already exists and if_exists is `"error"`
+         """
+         hf_dataset = hf_dataset_from_torch(torch_data, column_names=column_names)
+         return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists, description=description)
+
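A sketch of the unnamed-tuple case the docstring describes: a map-style dataset yielding `(text, label)` pairs needs `column_names` so rows can be converted into named columns. This assumes `hf_dataset_from_torch` accepts map-style datasets of tuples, as the parameter implies.

```python
from torch.utils.data import Dataset as TorchDataset

class PairDataset(TorchDataset):
    """Hypothetical map-style dataset yielding unnamed (text, label) tuples."""

    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.pairs[idx]  # unnamed tuple, hence column_names below

data = PairDataset([("Hello, world!", 1), ("Goodbye", 0)])
datasource = Datasource.from_pytorch("my_torch_datasource", data, column_names=["text", "label"])
```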
+     @classmethod
+     def from_list(
+         cls, name: str, data: list[dict], if_exists: CreateMode = "error", description: str | None = None
+     ) -> Datasource:
+         """
+         Create a new datasource from a list of dictionaries
+
+         Params:
+             name: Required name for the new datasource (must be unique)
+             data: The list of dictionaries to create the datasource from
+             if_exists: What to do if a datasource with the same name already exists, defaults to
+                 `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource
+
+         Returns:
+             A handle to the new datasource in the OrcaCloud
+
+         Raises:
+             ValueError: If the datasource already exists and if_exists is `"error"`
+
+         Examples:
+             >>> Datasource.from_list("my_datasource", [{"text": "Hello, world!", "label": 1}, {"text": "Goodbye", "label": 0}])
+         """
+         # Check if datasource already exists and handle accordingly
+         existing = _handle_existing_datasource(name, if_exists)
+         if existing is not None:
+             return existing
+
+         client = OrcaClient._resolve_client()
+         metadata = client.POST(
+             "/datasource",
+             json={"name": name, "description": description, "content": data},
+         )
+         return cls(metadata=metadata)
+
+     @classmethod
+     def from_dict(
+         cls, name: str, data: dict, if_exists: CreateMode = "error", description: str | None = None
+     ) -> Datasource:
+         """
+         Create a new datasource from a dictionary of columns
+
+         Params:
+             name: Required name for the new datasource (must be unique)
+             data: The dictionary of columns to create the datasource from
+             if_exists: What to do if a datasource with the same name already exists, defaults to
+                 `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource
+
+         Returns:
+             A handle to the new datasource in the OrcaCloud
+
+         Raises:
+             ValueError: If the datasource already exists and if_exists is `"error"`
+
+         Examples:
+             >>> Datasource.from_dict("my_datasource", {"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
+         """
+         # Check if datasource already exists and handle accordingly
+         existing = _handle_existing_datasource(name, if_exists)
+         if existing is not None:
+             return existing
+
+         client = OrcaClient._resolve_client()
+         metadata = client.POST(
+             "/datasource",
+             json={"name": name, "description": description, "content": data},
+         )
+         return cls(metadata=metadata)
+
+     @classmethod
+     def from_pandas(
+         cls, name: str, dataframe: pd.DataFrame, if_exists: CreateMode = "error", description: str | None = None
+     ) -> Datasource:
+         """
+         Create a new datasource from a pandas DataFrame
+
+         Params:
+             name: Required name for the new datasource (must be unique)
+             dataframe: The pandas DataFrame to create the datasource from
+             if_exists: What to do if a datasource with the same name already exists, defaults to
+                 `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource
+
+         Returns:
+             A handle to the new datasource in the OrcaCloud
+
+         Raises:
+             ValueError: If the datasource already exists and if_exists is `"error"`
+         """
+         dataset = Dataset.from_pandas(dataframe)
+         return cls.from_hf_dataset(name, dataset, if_exists=if_exists, description=description)
+
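A minimal pandas round-trip; the DataFrame is converted via `Dataset.from_pandas`, so the same upload path as `from_hf_dataset` applies. The name is hypothetical.

```python
import pandas as pd

df = pd.DataFrame({"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
datasource = Datasource.from_pandas("my_pandas_datasource", df, if_exists="open")
```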
+     @classmethod
+     def from_arrow(
+         cls, name: str, pyarrow_table: pa.Table, if_exists: CreateMode = "error", description: str | None = None
+     ) -> Datasource:
+         """
+         Create a new datasource from a pyarrow Table
+
+         Params:
+             name: Required name for the new datasource (must be unique)
+             pyarrow_table: The pyarrow Table to create the datasource from
+             if_exists: What to do if a datasource with the same name already exists, defaults to
+                 `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource
+
+         Returns:
+             A handle to the new datasource in the OrcaCloud
+
+         Raises:
+             ValueError: If the datasource already exists and if_exists is `"error"`
+         """
+         # Check if datasource already exists and handle accordingly
+         existing = _handle_existing_datasource(name, if_exists)
+         if existing is not None:
+             return existing
+
+         # Write to bytes buffer
+         buffer = BytesIO()
+         parquet.write_table(pyarrow_table, buffer)
+         parquet_bytes = buffer.getvalue()
+
+         client = OrcaClient._resolve_client()
+         metadata = client.POST(
+             "/datasource/upload",
+             files=[("files", ("data.parquet", parquet_bytes))],
+             data={"name": name, "description": description},
+         )
+
+         return cls(metadata=metadata)
+
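Here the table is serialized to Parquet in memory and uploaded as a single file, so no temporary directory is needed. A small illustrative call:

```python
import pyarrow as pa

table = pa.table({"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
datasource = Datasource.from_arrow("my_arrow_datasource", table, if_exists="open")
```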
+     @classmethod
+     def from_disk(
+         cls, name: str, file_path: str | PathLike, if_exists: CreateMode = "error", description: str | None = None
+     ) -> Datasource:
+         """
+         Create a new datasource from a local file
+
+         Params:
+             name: Required name for the new datasource (must be unique)
+             file_path: Path to the file on disk to create the datasource from. The file type will
+                 be inferred from the file extension. The following file types are supported:
+
+                 - .pkl: [`Pickle`][pickle] files containing lists of dictionaries or dictionaries of columns
+                 - .json/.jsonl: [`JSON`][json] and JSON Lines files
+                 - .csv: [`CSV`][csv] files
+                 - .parquet: [`Parquet`][pyarrow.parquet.ParquetFile] files
+                 - dataset directory: Directory containing a saved Hugging Face [`Dataset`][datasets.Dataset]
+
+             if_exists: What to do if a datasource with the same name already exists, defaults to
+                 `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource
+
+         Returns:
+             A handle to the new datasource in the OrcaCloud
+
+         Raises:
+             ValueError: If the datasource already exists and if_exists is `"error"`
+         """
+         # Check if datasource already exists and handle accordingly
+         existing = _handle_existing_datasource(name, if_exists)
+         if existing is not None:
+             return existing
+
+         file_path = Path(file_path)
+
+         # For dataset directories, use the upload endpoint with multiple files
+         if file_path.is_dir():
+             return cls.from_hf_dataset(
+                 name, Dataset.load_from_disk(file_path), if_exists=if_exists, description=description
+             )
+
+         # For single files, use the helper function to upload files
+         metadata = _upload_files_to_datasource(name, [file_path], description)
+
+         return cls(metadata=metadata)
+
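The file type is inferred from the extension, and a directory is treated as a saved Hugging Face dataset. The paths below are hypothetical:

```python
# Single file: extension decides the parser (.csv here).
datasource = Datasource.from_disk("my_csv_datasource", "data/reviews.csv", if_exists="open")

# Directory: loaded with Dataset.load_from_disk and re-uploaded.
hf_datasource = Datasource.from_disk("my_hf_datasource", "data/reviews_dataset")
```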
+     @classmethod
+     def open(cls, name_or_id: str) -> Datasource:
+         """
+         Get a handle to a datasource by name or id in the OrcaCloud
+
+         Params:
+             name_or_id: The name or unique identifier of the datasource to get
+
+         Returns:
+             A handle to the existing datasource in the OrcaCloud
+
+         Raises:
+             LookupError: If the datasource does not exist
+         """
+         client = OrcaClient._resolve_client()
+         return cls(client.GET("/datasource/{name_or_id}", params={"name_or_id": name_or_id}))
+
+     @classmethod
+     def exists(cls, name_or_id: str) -> bool:
+         """
+         Check if a datasource exists in the OrcaCloud
+
+         Params:
+             name_or_id: The name or id of the datasource to check
+
+         Returns:
+             `True` if the datasource exists, `False` otherwise
+         """
+         try:
+             cls.open(name_or_id)
+             return True
+         except LookupError:
+             return False
+
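Since `open` raises `LookupError` for unknown names, `exists` is the natural guard for get-or-create flows:

```python
# Get-or-create pattern built from open()/exists(); name and rows are illustrative.
if Datasource.exists("my_datasource"):
    datasource = Datasource.open("my_datasource")
else:
    datasource = Datasource.from_list("my_datasource", [{"text": "Hello, world!", "label": 1}])
```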
+     @classmethod
+     def all(cls) -> list[Datasource]:
+         """
+         List all datasource handles in the OrcaCloud
+
+         Returns:
+             A list of all datasource handles in the OrcaCloud
+         """
+         client = OrcaClient._resolve_client()
+         return [cls(metadata) for metadata in client.GET("/datasource")]
+
+     @classmethod
+     def drop(cls, name_or_id: str, if_not_exists: DropMode = "error") -> None:
+         """
+         Delete a datasource from the OrcaCloud
+
+         Params:
+             name_or_id: The name or id of the datasource to delete
+             if_not_exists: What to do if the datasource does not exist, defaults to
+                 `"error"`. The other option is `"ignore"` to do nothing.
+
+         Raises:
+             LookupError: If the datasource does not exist and if_not_exists is `"error"`
+         """
+         try:
+             client = OrcaClient._resolve_client()
+             client.DELETE("/datasource/{name_or_id}", params={"name_or_id": name_or_id})
+             logging.info(f"Deleted datasource {name_or_id}")
+         except LookupError:
+             if if_not_exists == "error":
+                 raise
+
+     def __len__(self) -> int:
+         return self.length
+
+     def query(
+         self,
+         offset: int = 0,
+         limit: int = 100,
+         shuffle: bool = False,
+         shuffle_seed: int | None = None,
+         filters: list[tuple[str, Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in", "like"], Any]] = [],
+     ) -> list[dict[str, Any]]:
+         """
+         Query the datasource for rows with pagination and filtering support.
+
+         Params:
+             offset: Number of rows to skip
+             limit: Maximum number of rows to return
+             shuffle: Whether to shuffle the dataset before pagination
+             shuffle_seed: Seed for shuffling (for reproducible results)
+             filters: List of filter tuples. Each tuple contains:
+
+                 - field (str): Column name to filter on
+                 - op (str): Operator ("==", "!=", ">", ">=", "<", "<=", "in", "not in", "like")
+                 - value: Value to compare against
+
+         Returns:
+             List of rows from the datasource
+
+         Examples:
+             >>> datasource.query(filters=[("age", ">", 25)])
+             >>> datasource.query(filters=[("city", "in", ["NYC", "LA"])])
+             >>> datasource.query(filters=[("name", "like", "John")])
+         """
+         client = OrcaClient._resolve_client()
+         response = client.POST(
+             "/datasource/{name_or_id}/rows",
+             params={"name_or_id": self.id},
+             json={
+                 "limit": limit,
+                 "offset": offset,
+                 "shuffle": shuffle,
+                 "shuffle_seed": shuffle_seed,
+                 "filters": [{"field": field, "op": op, "value": value} for field, op, value in filters],
+             },
+         )
+         return response
+
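Because `query` is offset/limit paginated, iterating a full datasource is a simple loop; page size and filter here are illustrative:

```python
# Page through all rows matching a filter, 100 at a time.
page_size = 100
offset = 0
while True:
    rows = datasource.query(offset=offset, limit=page_size, filters=[("label", "==", 1)])
    for row in rows:
        ...  # process row
    if len(rows) < page_size:  # short page means we've reached the end
        break
    offset += page_size
```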
+     def download(
+         self, output_dir: str | PathLike, file_type: Literal["hf_dataset", "json", "csv"] = "hf_dataset"
+     ) -> None:
+         """
+         Download the datasource to the specified directory in the specified format
+
+         Params:
+             output_dir: The local directory where the downloaded file will be saved.
+             file_type: The type of file to download.
+
+         Returns:
+             None
+         """
+         extension = "zip" if file_type == "hf_dataset" else file_type
+         output_path = Path(output_dir) / f"{self.name}.{extension}"
+         with open(output_path, "wb") as download_file:
+             client = OrcaClient._resolve_client()
+             with client.stream("GET", f"/datasource/{self.id}/download", params={"file_type": file_type}) as response:
+                 total_chunks = int(response.headers["X-Total-Chunks"]) if "X-Total-Chunks" in response.headers else None
+                 with tqdm(desc="Downloading", total=total_chunks, disable=total_chunks is None) as progress:
+                     for chunk in response.iter_bytes():
+                         download_file.write(chunk)
+                         progress.update(1)
+
+         # extract the zip file
+         if extension == "zip":
+             extract_dir = Path(output_dir) / self.name
+             with zipfile.ZipFile(output_path, "r") as zip_ref:
+                 zip_ref.extractall(extract_dir)
+             output_path.unlink()  # Remove the zip file after extraction
+             logging.info(f"Downloaded {extract_dir}")
+         else:
+             logging.info(f"Downloaded {output_path}")
+
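The resulting path depends on the format: csv/json produce `<output_dir>/<name>.<ext>`, while the default `hf_dataset` fetches a zip, extracts it to `<output_dir>/<name>/`, and removes the archive. The directory name is hypothetical and must already exist, since the method opens the output file directly:

```python
datasource.download("exports", file_type="csv")  # writes exports/<name>.csv
datasource.download("exports")  # hf_dataset: extracts to exports/<name>/
```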
+     def to_list(self) -> list[dict]:
+         """
+         Convert the datasource to a list of dictionaries.
+
+         Returns:
+             A list-of-dictionaries representation of the datasource.
+         """
+         client = OrcaClient._resolve_client()
+         return client.GET("/datasource/{name_or_id}/download", params={"name_or_id": self.id, "file_type": "json"})