orca_sdk-0.1.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/__init__.py +30 -0
- orca_sdk/_shared/__init__.py +10 -0
- orca_sdk/_shared/metrics.py +634 -0
- orca_sdk/_shared/metrics_test.py +570 -0
- orca_sdk/_utils/__init__.py +0 -0
- orca_sdk/_utils/analysis_ui.py +196 -0
- orca_sdk/_utils/analysis_ui_style.css +51 -0
- orca_sdk/_utils/auth.py +65 -0
- orca_sdk/_utils/auth_test.py +31 -0
- orca_sdk/_utils/common.py +37 -0
- orca_sdk/_utils/data_parsing.py +129 -0
- orca_sdk/_utils/data_parsing_test.py +244 -0
- orca_sdk/_utils/pagination.py +126 -0
- orca_sdk/_utils/pagination_test.py +132 -0
- orca_sdk/_utils/prediction_result_ui.css +18 -0
- orca_sdk/_utils/prediction_result_ui.py +110 -0
- orca_sdk/_utils/tqdm_file_reader.py +12 -0
- orca_sdk/_utils/value_parser.py +45 -0
- orca_sdk/_utils/value_parser_test.py +39 -0
- orca_sdk/async_client.py +4104 -0
- orca_sdk/classification_model.py +1165 -0
- orca_sdk/classification_model_test.py +887 -0
- orca_sdk/client.py +4096 -0
- orca_sdk/conftest.py +382 -0
- orca_sdk/credentials.py +217 -0
- orca_sdk/credentials_test.py +121 -0
- orca_sdk/datasource.py +576 -0
- orca_sdk/datasource_test.py +463 -0
- orca_sdk/embedding_model.py +712 -0
- orca_sdk/embedding_model_test.py +206 -0
- orca_sdk/job.py +343 -0
- orca_sdk/job_test.py +108 -0
- orca_sdk/memoryset.py +3811 -0
- orca_sdk/memoryset_test.py +1150 -0
- orca_sdk/regression_model.py +841 -0
- orca_sdk/regression_model_test.py +595 -0
- orca_sdk/telemetry.py +742 -0
- orca_sdk/telemetry_test.py +119 -0
- orca_sdk-0.1.9.dist-info/METADATA +98 -0
- orca_sdk-0.1.9.dist-info/RECORD +41 -0
- orca_sdk-0.1.9.dist-info/WHEEL +4 -0
orca_sdk/credentials_test.py
ADDED
@@ -0,0 +1,121 @@
+from uuid import uuid4
+
+import pytest
+
+from .client import OrcaClient
+from .credentials import OrcaCredentials
+
+
+def test_is_authenticated():
+    assert OrcaCredentials.is_authenticated()
+
+
+def test_is_authenticated_false(unauthenticated_client):
+    with unauthenticated_client.use():
+        assert not OrcaCredentials.is_authenticated()
+
+
+def test_is_healthy():
+    assert OrcaCredentials.is_healthy()
+
+
+def test_is_healthy_false(api_key):
+    with OrcaClient(api_key=api_key, base_url="http://localhost:1582").use():
+        assert not OrcaCredentials.is_healthy()
+
+
+def test_list_api_keys():
+    api_keys = OrcaCredentials.list_api_keys()
+    assert len(api_keys) >= 1
+    assert "orca_sdk_test" in [api_key.name for api_key in api_keys]
+
+
+def test_list_api_keys_unauthenticated(unauthenticated_client):
+    with unauthenticated_client.use():
+        with pytest.raises(ValueError, match="Invalid API key"):
+            OrcaCredentials.list_api_keys()
+
+
+def test_manage_api_key():
+    api_key_name = f"orca_sdk_test_{uuid4().hex[:8]}"
+    api_key = OrcaCredentials.create_api_key(api_key_name)
+    assert api_key is not None
+    assert len(api_key) > 0
+    assert api_key_name in [aki.name for aki in OrcaCredentials.list_api_keys()]
+    OrcaCredentials.revoke_api_key(api_key_name)
+    assert api_key_name not in [aki.name for aki in OrcaCredentials.list_api_keys()]
+
+
+def test_create_api_key_unauthenticated(unauthenticated_client):
+    with unauthenticated_client.use():
+        with pytest.raises(ValueError, match="Invalid API key"):
+            OrcaCredentials.create_api_key(f"orca_sdk_test_{uuid4().hex[:8]}")
+
+
+def test_create_api_key_unauthorized(predict_only_client):
+    with predict_only_client.use():
+        with pytest.raises(PermissionError):
+            OrcaCredentials.create_api_key(f"orca_sdk_test_{uuid4().hex[:8]}")
+
+
+def test_revoke_api_key_unauthenticated(unauthenticated_client):
+    with unauthenticated_client.use():
+        with pytest.raises(ValueError, match="Invalid API key"):
+            OrcaCredentials.revoke_api_key(f"orca_sdk_test_{uuid4().hex[:8]}")
+
+
+def test_revoke_api_key_unauthorized(predict_only_client):
+    with predict_only_client.use():
+        with pytest.raises(PermissionError):
+            OrcaCredentials.revoke_api_key(f"orca_sdk_test_{uuid4().hex[:8]}")
+
+
+def test_create_api_key_already_exists():
+    with pytest.raises(ValueError, match="API key with this name already exists"):
+        OrcaCredentials.create_api_key("orca_sdk_test")
+
+
+def test_set_api_key(api_key):
+    client = OrcaClient(api_key=str(uuid4()))
+    with client.use():
+        assert not OrcaCredentials.is_authenticated()
+        client.api_key = api_key
+        assert client.api_key == api_key
+        assert OrcaCredentials.is_authenticated()
+
+
+def test_set_base_url(api_key):
+    client = OrcaClient(base_url="http://localhost:1582")
+    assert client.base_url == "http://localhost:1582"
+    client.base_url = "http://localhost:1583"
+    assert client.base_url == "http://localhost:1583"
+
+
+# deprecated methods:
+
+
+def test_deprecated_set_api_key(api_key):
+    with OrcaClient(api_key=str(uuid4())).use():
+        assert not OrcaCredentials.is_authenticated()
+        OrcaCredentials.set_api_key(api_key)
+        assert OrcaCredentials.is_authenticated()
+
+
+def test_deprecated_set_invalid_api_key(api_key):
+    with OrcaClient(api_key=api_key).use():
+        assert OrcaCredentials.is_authenticated()
+        with pytest.raises(ValueError, match="Invalid API key"):
+            OrcaCredentials.set_api_key(str(uuid4()))
+        assert not OrcaCredentials.is_authenticated()
+
+
+def test_deprecated_set_api_url(api_key):
+    with OrcaClient(api_key=api_key).use():
+        OrcaCredentials.set_api_url("http://api.orcadb.ai")
+        assert str(OrcaClient._resolve_client().base_url) == "http://api.orcadb.ai"
+
+
+def test_deprecated_set_invalid_api_url(api_key):
+    with OrcaClient(api_key=api_key).use():
+        with pytest.raises(ValueError, match="No API found at http://localhost:1582"):
+            OrcaCredentials.set_api_url("http://localhost:1582")
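The tests above exercise the context-switching pattern provided by `OrcaClient.use()`: whichever client is active inside the `with` block is the one all `OrcaCredentials` calls resolve to. A minimal sketch of that pattern outside pytest, assuming both names are re-exported from the package root (as the relative imports suggest) and using a placeholder key:

    from orca_sdk import OrcaClient, OrcaCredentials

    # placeholder credentials, not a real key
    client = OrcaClient(api_key="orca_XXXXXXXX", base_url="http://api.orcadb.ai")
    with client.use():
        # every SDK call in this block resolves to `client`
        if OrcaCredentials.is_authenticated() and OrcaCredentials.is_healthy():
            print([key.name for key in OrcaCredentials.list_api_keys()])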
orca_sdk/datasource.py
ADDED
@@ -0,0 +1,576 @@
+from __future__ import annotations
+
+import logging
+import tempfile
+import zipfile
+from datetime import datetime
+from io import BytesIO
+from os import PathLike
+from pathlib import Path
+from typing import Any, Literal, Union, cast
+
+import pandas as pd
+import pyarrow as pa
+from datasets import Dataset, DatasetDict
+from httpx._types import FileTypes  # type: ignore
+from pyarrow import parquet
+from torch.utils.data import DataLoader as TorchDataLoader
+from torch.utils.data import Dataset as TorchDataset
+from tqdm.auto import tqdm
+
+from ._utils.common import CreateMode, DropMode
+from ._utils.data_parsing import hf_dataset_from_torch
+from ._utils.tqdm_file_reader import TqdmFileReader
+from .client import DatasourceMetadata, OrcaClient
+
+
+def _upload_files_to_datasource(
+    name: str,
+    file_paths: list[Path],
+    description: str | None = None,
+) -> DatasourceMetadata:
+    """
+    Helper function to upload files to create a datasource using manual HTTP requests.
+
+    This bypasses the generated client because it doesn't handle file uploads properly.
+
+    Params:
+        name: Name for the datasource
+        file_paths: List of file paths to upload
+        description: Optional description for the datasource
+
+    Returns:
+        Metadata for the created datasource
+    """
+    files: list[tuple[Literal["files"], FileTypes]] = []
+
+    # Calculate total size for all files
+    total_size = sum(file_path.stat().st_size for file_path in file_paths)
+
+    with tqdm(total=total_size, unit="B", unit_scale=True, desc="Uploading") as pbar:
+        for file_path in file_paths:
+            buffered_reader = open(file_path, "rb")
+            tqdm_reader = TqdmFileReader(buffered_reader, pbar)
+            files.append(("files", (file_path.name, cast(bytes, tqdm_reader))))
+
+        # Use manual HTTP request for file uploads
+        client = OrcaClient._resolve_client()
+        metadata = client.POST(
+            "/datasource/upload",
+            files=files,
+            data={"name": name, "description": description},
+        )
+
+    return metadata
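The helper wraps each file handle in a `TqdmFileReader` so the multipart upload advances the shared progress bar as httpx reads the stream. The real wrapper lives in `orca_sdk/_utils/tqdm_file_reader.py` (12 lines, not included in this excerpt); a plausible minimal sketch of the idea, not the actual implementation:

    from io import BufferedReader
    from tqdm.auto import tqdm

    class TqdmFileReader:
        """File-like wrapper that advances a shared tqdm bar as bytes are read (illustrative sketch)."""

        def __init__(self, file: BufferedReader, pbar: tqdm):
            self.file = file
            self.pbar = pbar

        def read(self, size: int = -1) -> bytes:
            chunk = self.file.read(size)
            self.pbar.update(len(chunk))  # report however many bytes were actually read
            return chunk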
+
+
+def _handle_existing_datasource(name: str, if_exists: CreateMode) -> Union["Datasource", None]:
+    """
+    Helper function to handle the common pattern of checking if a datasource exists
+    and taking action based on the if_exists parameter.
+
+    Params:
+        name: Name of the datasource to check
+        if_exists: What to do if a datasource with the same name already exists
+
+    Returns:
+        Datasource instance if opening existing, None if should proceed with creation
+
+    Raises:
+        ValueError: If the datasource already exists and if_exists is "error"
+    """
+    if Datasource.exists(name):
+        if if_exists == "error":
+            raise ValueError(f"Dataset with name {name} already exists")
+        elif if_exists == "open":
+            return Datasource.open(name)
+    return None
+
+
+class Datasource:
+    """
+    A handle to a datasource in the OrcaCloud
+
+    A Datasource is a collection of data saved to the OrcaCloud that can be used to create a [`Memoryset`][orca_sdk.LabeledMemoryset].
+    It can be created from a Hugging Face Dataset, a PyTorch DataLoader or Dataset, a list of dictionaries, a dictionary of columns, a pandas DataFrame, a pyarrow Table, or a local file.
+
+    Attributes:
+        id: Unique identifier for the datasource
+        name: Unique name of the datasource
+        description: Optional description of the datasource
+        length: Number of rows in the datasource
+        created_at: When the datasource was created
+        columns: Dictionary of column names and types
+    """
+
+    id: str
+    name: str
+    description: str | None
+    length: int
+    created_at: datetime
+    updated_at: datetime
+    columns: dict[str, str]
+
+    def __init__(self, metadata: DatasourceMetadata):
+        # for internal use only, do not document
+        self.id = metadata["id"]
+        self.name = metadata["name"]
+        self.length = metadata["length"]
+        self.created_at = datetime.fromisoformat(metadata["created_at"])
+        self.updated_at = datetime.fromisoformat(metadata["updated_at"])
+        self.description = metadata["description"]
+        self.columns = {
+            column["name"]: (
+                f"enum({', '.join(f'{option!r}' for option in column['enum_options'] or []) if 'enum_options' in column else ''})"
+                if column["type"] == "ENUM"
+                else "str" if column["type"] == "STRING" else column["type"].lower()
+            )
+            for column in metadata["columns"]
+        }
+
+    def __eq__(self, other) -> bool:
+        return isinstance(other, Datasource) and self.id == other.id
+
+    def __repr__(self) -> str:
+        return (
+            "Datasource({\n"
+            + f"  name: '{self.name}',\n"
+            + f"  length: {self.length},\n"
+            + "  columns: {{\n    "
+            + "\n    ".join([f"{k}: {v}" for k, v in self.columns.items()])
+            + "\n  }}\n"
+            + "})"
+        )
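Given the type normalization in `__init__`, a `columns` mapping might look like the following. This is purely illustrative; the server-side type names other than `ENUM` and `STRING` are assumptions:

    # hypothetical result of the normalization above
    {
        "text": "str",                  # STRING -> "str"
        "label": "enum('neg', 'pos')",  # ENUM, rendered with its enum_options
        "score": "float",               # any other type name is lowercased
    }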
+
+    @classmethod
+    def from_hf_dataset(
+        cls, name: str, dataset: Dataset, if_exists: CreateMode = "error", description: str | None = None
+    ) -> Datasource:
+        """
+        Create a new datasource from a Hugging Face Dataset
+
+        Params:
+            name: Required name for the new datasource (must be unique)
+            dataset: The Hugging Face Dataset to create the datasource from
+            if_exists: What to do if a datasource with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource
+
+        Returns:
+            A handle to the new datasource in the OrcaCloud
+
+        Raises:
+            ValueError: If the datasource already exists and if_exists is `"error"`
+        """
+        # Check if datasource already exists and handle accordingly
+        existing = _handle_existing_datasource(name, if_exists)
+        if existing is not None:
+            return existing
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            dataset.save_to_disk(tmp_dir)
+
+            # Get all file paths in the directory
+            file_paths = list(Path(tmp_dir).iterdir())
+
+            # Use the helper function to upload files
+            metadata = _upload_files_to_datasource(name, file_paths, description)
+        return cls(metadata=metadata)
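A minimal usage sketch for this constructor, assuming an authenticated client is active:

    from datasets import Dataset

    ds = Dataset.from_dict({"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
    datasource = Datasource.from_hf_dataset("greetings", ds, if_exists="open")
    print(len(datasource), datasource.columns)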
+
+    @classmethod
+    def from_hf_dataset_dict(
+        cls,
+        name: str,
+        dataset_dict: DatasetDict,
+        if_exists: CreateMode = "error",
+        description: dict[str, str | None] | str | None = None,
+    ) -> dict[str, Datasource]:
+        """
+        Create datasources from a Hugging Face DatasetDict
+
+        Params:
+            name: Name prefix for the new datasources, will be suffixed with the dataset name
+            dataset_dict: The Hugging Face DatasetDict to create the datasources from
+            if_exists: What to do if a datasource with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasources, can be a string or a dictionary of dataset names to descriptions
+
+        Returns:
+            A dictionary of datasource handles, keyed by the dataset name
+
+        Raises:
+            ValueError: If a datasource already exists and if_exists is `"error"`
+        """
+        if description is None or isinstance(description, str):
+            description = {str(dataset_name): description for dataset_name in dataset_dict.keys()}
+        return {
+            str(dataset_name): cls.from_hf_dataset(
+                f"{name}_{dataset_name}", dataset, if_exists=if_exists, description=description[str(dataset_name)]
+            )
+            for dataset_name, dataset in dataset_dict.items()
+        }
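Because each split becomes its own datasource named `{name}_{split}`, a DatasetDict with train/test splits yields two handles. A sketch, again assuming an authenticated client:

    from datasets import Dataset, DatasetDict

    splits = DatasetDict({
        "train": Dataset.from_dict({"text": ["a", "b"], "label": [0, 1]}),
        "test": Dataset.from_dict({"text": ["c"], "label": [1]}),
    })
    handles = Datasource.from_hf_dataset_dict("reviews", splits, if_exists="open")
    # creates datasources named "reviews_train" and "reviews_test",
    # returned keyed by split: handles["train"], handles["test"]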
+
+    @classmethod
+    def from_pytorch(
+        cls,
+        name: str,
+        torch_data: TorchDataLoader | TorchDataset,
+        column_names: list[str] | None = None,
+        if_exists: CreateMode = "error",
+        description: str | None = None,
+    ) -> Datasource:
+        """
+        Create a new datasource from a PyTorch DataLoader or Dataset
+
+        Params:
+            name: Required name for the new datasource (must be unique)
+            torch_data: The PyTorch DataLoader or Dataset to create the datasource from
+            column_names: If the provided dataset or data loader returns unnamed tuples, this
+                argument must be provided to specify the names of the columns.
+            if_exists: What to do if a datasource with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource
+
+        Returns:
+            A handle to the new datasource in the OrcaCloud
+
+        Raises:
+            ValueError: If the datasource already exists and if_exists is `"error"`
+        """
+        hf_dataset = hf_dataset_from_torch(torch_data, column_names=column_names)
+        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists, description=description)
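For a tuple-style PyTorch dataset, `column_names` supplies the field names. A sketch; `PairDataset` is a made-up example class, and an authenticated client is assumed:

    from torch.utils.data import Dataset as TorchDataset

    class PairDataset(TorchDataset):
        """Hypothetical dataset that yields unnamed (text, label) tuples."""

        def __init__(self):
            self.rows = [("Hello, world!", 1), ("Goodbye", 0)]

        def __len__(self):
            return len(self.rows)

        def __getitem__(self, idx):
            return self.rows[idx]

    datasource = Datasource.from_pytorch(
        "torch_pairs", PairDataset(), column_names=["text", "label"], if_exists="open"
    )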
+
+    @classmethod
+    def from_list(
+        cls, name: str, data: list[dict], if_exists: CreateMode = "error", description: str | None = None
+    ) -> Datasource:
+        """
+        Create a new datasource from a list of dictionaries
+
+        Params:
+            name: Required name for the new datasource (must be unique)
+            data: The list of dictionaries to create the datasource from
+            if_exists: What to do if a datasource with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource
+
+        Returns:
+            A handle to the new datasource in the OrcaCloud
+
+        Raises:
+            ValueError: If the datasource already exists and if_exists is `"error"`
+
+        Examples:
+            >>> Datasource.from_list("my_datasource", [{"text": "Hello, world!", "label": 1}, {"text": "Goodbye", "label": 0}])
+        """
+        # Check if datasource already exists and handle accordingly
+        existing = _handle_existing_datasource(name, if_exists)
+        if existing is not None:
+            return existing
+
+        client = OrcaClient._resolve_client()
+        metadata = client.POST(
+            "/datasource",
+            json={"name": name, "description": description, "content": data},
+        )
+        return cls(metadata=metadata)
+
+    @classmethod
+    def from_dict(
+        cls, name: str, data: dict, if_exists: CreateMode = "error", description: str | None = None
+    ) -> Datasource:
+        """
+        Create a new datasource from a dictionary of columns
+
+        Params:
+            name: Required name for the new datasource (must be unique)
+            data: The dictionary of columns to create the datasource from
+            if_exists: What to do if a datasource with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource
+
+        Returns:
+            A handle to the new datasource in the OrcaCloud
+
+        Raises:
+            ValueError: If the datasource already exists and if_exists is `"error"`
+
+        Examples:
+            >>> Datasource.from_dict("my_datasource", {"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
+        """
+        # Check if datasource already exists and handle accordingly
+        existing = _handle_existing_datasource(name, if_exists)
+        if existing is not None:
+            return existing
+
+        client = OrcaClient._resolve_client()
+        metadata = client.POST(
+            "/datasource",
+            json={"name": name, "description": description, "content": data},
+        )
+        return cls(metadata=metadata)
+
+    @classmethod
+    def from_pandas(
+        cls, name: str, dataframe: pd.DataFrame, if_exists: CreateMode = "error", description: str | None = None
+    ) -> Datasource:
+        """
+        Create a new datasource from a pandas DataFrame
+
+        Params:
+            name: Required name for the new datasource (must be unique)
+            dataframe: The pandas DataFrame to create the datasource from
+            if_exists: What to do if a datasource with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource
+
+        Returns:
+            A handle to the new datasource in the OrcaCloud
+
+        Raises:
+            ValueError: If the datasource already exists and if_exists is `"error"`
+        """
+        dataset = Dataset.from_pandas(dataframe)
+        return cls.from_hf_dataset(name, dataset, if_exists=if_exists, description=description)
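The pandas path simply round-trips through `Dataset.from_pandas`, so usage mirrors the HF constructor (sketch, assuming an authenticated client):

    import pandas as pd

    df = pd.DataFrame({"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
    datasource = Datasource.from_pandas("pandas_rows", df, if_exists="open")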
+
+    @classmethod
+    def from_arrow(
+        cls, name: str, pyarrow_table: pa.Table, if_exists: CreateMode = "error", description: str | None = None
+    ) -> Datasource:
+        """
+        Create a new datasource from a pyarrow Table
+
+        Params:
+            name: Required name for the new datasource (must be unique)
+            pyarrow_table: The pyarrow Table to create the datasource from
+            if_exists: What to do if a datasource with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource
+
+        Returns:
+            A handle to the new datasource in the OrcaCloud
+
+        Raises:
+            ValueError: If the datasource already exists and if_exists is `"error"`
+        """
+        # Check if datasource already exists and handle accordingly
+        existing = _handle_existing_datasource(name, if_exists)
+        if existing is not None:
+            return existing
+
+        # Write to bytes buffer
+        buffer = BytesIO()
+        parquet.write_table(pyarrow_table, buffer)
+        parquet_bytes = buffer.getvalue()
+
+        client = OrcaClient._resolve_client()
+        metadata = client.POST(
+            "/datasource/upload",
+            files=[("files", ("data.parquet", parquet_bytes))],
+            data={"name": name, "description": description},
+        )
+
+        return cls(metadata=metadata)
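Unlike the other constructors, `from_arrow` serializes the table to Parquet in memory and uploads a single `data.parquet` file. A sketch, assuming an authenticated client:

    import pyarrow as pa

    table = pa.table({"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
    datasource = Datasource.from_arrow("arrow_rows", table, if_exists="open")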
+
+    @classmethod
+    def from_disk(
+        cls, name: str, file_path: str | PathLike, if_exists: CreateMode = "error", description: str | None = None
+    ) -> Datasource:
+        """
+        Create a new datasource from a local file
+
+        Params:
+            name: Required name for the new datasource (must be unique)
+            file_path: Path to the file on disk to create the datasource from. The file type will
+                be inferred from the file extension. The following file types are supported:
+
+                - .pkl: [`Pickle`][pickle] files containing lists of dictionaries or dictionaries of columns
+                - .json/.jsonl: [`JSON`][json] and JSON Lines files
+                - .csv: [`CSV`][csv] files
+                - .parquet: [`Parquet`][pyarrow.parquet.ParquetFile] files
+                - dataset directory: Directory containing a saved HuggingFace [`Dataset`][datasets.Dataset]
+
+            if_exists: What to do if a datasource with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource
+
+        Returns:
+            A handle to the new datasource in the OrcaCloud
+
+        Raises:
+            ValueError: If the datasource already exists and if_exists is `"error"`
+        """
+        # Check if datasource already exists and handle accordingly
+        existing = _handle_existing_datasource(name, if_exists)
+        if existing is not None:
+            return existing
+
+        file_path = Path(file_path)
+
+        # For dataset directories, use the upload endpoint with multiple files
+        if file_path.is_dir():
+            return cls.from_hf_dataset(
+                name, Dataset.load_from_disk(file_path), if_exists=if_exists, description=description
+            )
+
+        # For single files, use the helper function to upload files
+        metadata = _upload_files_to_datasource(name, [file_path], description)
+
+        return cls(metadata=metadata)
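A sketch for a local file; the path is a placeholder, and the extension selects the parser as described above:

    # "data/rows.csv" is a hypothetical path on your machine
    datasource = Datasource.from_disk("csv_rows", "data/rows.csv", if_exists="open")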
+
+    @classmethod
+    def open(cls, name_or_id: str) -> Datasource:
+        """
+        Get a handle to a datasource by name or id in the OrcaCloud
+
+        Params:
+            name_or_id: The name or unique identifier of the datasource to get
+
+        Returns:
+            A handle to the existing datasource in the OrcaCloud
+
+        Raises:
+            LookupError: If the datasource does not exist
+        """
+        client = OrcaClient._resolve_client()
+        return cls(client.GET("/datasource/{name_or_id}", params={"name_or_id": name_or_id}))
+
+    @classmethod
+    def exists(cls, name_or_id: str) -> bool:
+        """
+        Check if a datasource exists in the OrcaCloud
+
+        Params:
+            name_or_id: The name or id of the datasource to check
+
+        Returns:
+            `True` if the datasource exists, `False` otherwise
+        """
+        try:
+            cls.open(name_or_id)
+            return True
+        except LookupError:
+            return False
+
+    @classmethod
+    def all(cls) -> list[Datasource]:
+        """
+        List all datasource handles in the OrcaCloud
+
+        Returns:
+            A list of all datasource handles in the OrcaCloud
+        """
+        client = OrcaClient._resolve_client()
+        return [cls(metadata) for metadata in client.GET("/datasource")]
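Since `open` raises `LookupError` for unknown names and `exists` wraps that check, a guarded lookup can be written explicitly (sketch, assuming an authenticated client):

    if Datasource.exists("greetings"):
        datasource = Datasource.open("greetings")
    else:
        datasource = Datasource.from_list(
            "greetings", [{"text": "Hello, world!", "label": 1}]
        )

    for ds in Datasource.all():
        print(ds.name, len(ds))  # __len__ returns the cached row count, no server call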
+
+    @classmethod
+    def drop(cls, name_or_id: str, if_not_exists: DropMode = "error") -> None:
+        """
+        Delete a datasource from the OrcaCloud
+
+        Params:
+            name_or_id: The name or id of the datasource to delete
+            if_not_exists: What to do if the datasource does not exist, defaults to
+                `"error"`. Other option is `"ignore"` to do nothing.
+
+        Raises:
+            LookupError: If the datasource does not exist and if_not_exists is `"error"`
+        """
+        try:
+            client = OrcaClient._resolve_client()
+            client.DELETE("/datasource/{name_or_id}", params={"name_or_id": name_or_id})
+            logging.info(f"Deleted datasource {name_or_id}")
+        except LookupError:
+            if if_not_exists == "error":
+                raise
+
+    def __len__(self) -> int:
+        return self.length
+
+    def query(
+        self,
+        offset: int = 0,
+        limit: int = 100,
+        shuffle: bool = False,
+        shuffle_seed: int | None = None,
+        filters: list[tuple[str, Literal["==", "!=", ">", ">=", "<", "<=", "in", "not in", "like"], Any]] = [],
+    ) -> list[dict[str, Any]]:
+        """
+        Query the datasource for rows with pagination and filtering support.
+
+        Params:
+            offset: Number of rows to skip
+            limit: Maximum number of rows to return
+            shuffle: Whether to shuffle the dataset before pagination
+            shuffle_seed: Seed for shuffling (for reproducible results)
+            filters: List of filter tuples. Each tuple contains:
+                - field (str): Column name to filter on
+                - op (str): Operator ("==", "!=", ">", ">=", "<", "<=", "in", "not in", "like")
+                - value: Value to compare against
+
+        Returns:
+            List of rows from the datasource
+
+        Examples:
+            >>> datasource.query(filters=[("age", ">", 25)])
+            >>> datasource.query(filters=[("city", "in", ["NYC", "LA"])])
+            >>> datasource.query(filters=[("name", "like", "John")])
+        """
+
+        client = OrcaClient._resolve_client()
+        response = client.POST(
+            "/datasource/{name_or_id}/rows",
+            params={"name_or_id": self.id},
+            json={
+                "limit": limit,
+                "offset": offset,
+                "shuffle": shuffle,
+                "shuffle_seed": shuffle_seed,
+                "filters": [{"field": field, "op": op, "value": value} for field, op, value in filters],
+            },
+        )
+        return response
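Because `query` is offset/limit paginated, scanning a large datasource is a loop that stops when an empty page comes back (sketch; the filter values are illustrative):

    offset = 0
    while True:
        rows = datasource.query(offset=offset, limit=100, filters=[("label", "==", 1)])
        if not rows:
            break
        for row in rows:
            ...  # process each row dict here
        offset += len(rows)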
+
+    def download(
+        self, output_dir: str | PathLike, file_type: Literal["hf_dataset", "json", "csv"] = "hf_dataset"
+    ) -> None:
+        """
+        Download the datasource to the specified directory in the specified format
+
+        Params:
+            output_dir: The local directory where the downloaded file will be saved.
+            file_type: The type of file to download.
+
+        Returns:
+            None
+        """
+        extension = "zip" if file_type == "hf_dataset" else file_type
+        output_path = Path(output_dir) / f"{self.name}.{extension}"
+        with open(output_path, "wb") as download_file:
+            client = OrcaClient._resolve_client()
+            with client.stream("GET", f"/datasource/{self.id}/download", params={"file_type": file_type}) as response:
+                total_chunks = int(response.headers["X-Total-Chunks"]) if "X-Total-Chunks" in response.headers else None
+                with tqdm(desc="Downloading", total=total_chunks, disable=total_chunks is None) as progress:
+                    for chunk in response.iter_bytes():
+                        download_file.write(chunk)
+                        progress.update(1)
+
+        # extract the zip file
+        if extension == "zip":
+            extract_dir = Path(output_dir) / self.name
+            with zipfile.ZipFile(output_path, "r") as zip_ref:
+                zip_ref.extractall(extract_dir)
+            output_path.unlink()  # Remove the zip file after extraction
+            logging.info(f"Downloaded {extract_dir}")
+        else:
+            logging.info(f"Downloaded {output_path}")
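A download sketch; `hf_dataset` arrives as a zip that is extracted to `<output_dir>/<name>/` with the archive removed afterwards, while `json`/`csv` are written as single files. The `exports` directory is a placeholder that is assumed to already exist:

    datasource.download("exports", file_type="csv")         # -> exports/<name>.csv
    datasource.download("exports", file_type="hf_dataset")  # -> exports/<name>/ (extracted)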
+
+    def to_list(self) -> list[dict]:
+        """
+        Convert the datasource to a list of dictionaries.
+
+        Returns:
+            A list of dictionaries representation of the datasource.
+        """
+        client = OrcaClient._resolve_client()
+        return client.GET("/datasource/{name_or_id}/download", params={"name_or_id": self.id, "file_type": "json"})
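`to_list` fetches the JSON export in a single call, which makes it convenient for small datasources, e.g. to pull rows back into pandas locally (sketch):

    import pandas as pd

    rows = datasource.to_list()  # one dict per row, via the JSON export endpoint
    df = pd.DataFrame(rows)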