orca-sdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/__init__.py +19 -0
- orca_sdk/_generated_api_client/__init__.py +3 -0
- orca_sdk/_generated_api_client/api/__init__.py +193 -0
- orca_sdk/_generated_api_client/api/auth/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/auth/check_authentication_auth_get.py +128 -0
- orca_sdk/_generated_api_client/api/auth/create_api_key_auth_api_key_post.py +170 -0
- orca_sdk/_generated_api_client/api/auth/delete_api_key_auth_api_key_name_or_id_delete.py +156 -0
- orca_sdk/_generated_api_client/api/auth/delete_org_auth_org_delete.py +130 -0
- orca_sdk/_generated_api_client/api/auth/list_api_keys_auth_api_key_get.py +127 -0
- orca_sdk/_generated_api_client/api/classification_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/classification_model/create_evaluation_classification_model_model_name_or_id_evaluation_post.py +183 -0
- orca_sdk/_generated_api_client/api/classification_model/create_model_classification_model_post.py +170 -0
- orca_sdk/_generated_api_client/api/classification_model/delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py +168 -0
- orca_sdk/_generated_api_client/api/classification_model/delete_model_classification_model_name_or_id_delete.py +154 -0
- orca_sdk/_generated_api_client/api/classification_model/get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py +170 -0
- orca_sdk/_generated_api_client/api/classification_model/get_model_classification_model_name_or_id_get.py +156 -0
- orca_sdk/_generated_api_client/api/classification_model/list_evaluations_classification_model_model_name_or_id_evaluation_get.py +161 -0
- orca_sdk/_generated_api_client/api/classification_model/list_models_classification_model_get.py +127 -0
- orca_sdk/_generated_api_client/api/classification_model/predict_gpu_classification_model_name_or_id_prediction_post.py +190 -0
- orca_sdk/_generated_api_client/api/datasource/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/datasource/create_datasource_datasource_post.py +167 -0
- orca_sdk/_generated_api_client/api/datasource/delete_datasource_datasource_name_or_id_delete.py +156 -0
- orca_sdk/_generated_api_client/api/datasource/get_datasource_datasource_name_or_id_get.py +156 -0
- orca_sdk/_generated_api_client/api/datasource/list_datasources_datasource_get.py +127 -0
- orca_sdk/_generated_api_client/api/default/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/default/healthcheck_get.py +118 -0
- orca_sdk/_generated_api_client/api/default/healthcheck_gpu_get.py +118 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/create_finetuned_embedding_model_finetuned_embedding_model_post.py +168 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/delete_finetuned_embedding_model_finetuned_embedding_model_name_or_id_delete.py +156 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/embed_with_finetuned_model_gpu_finetuned_embedding_model_name_or_id_embedding_post.py +189 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/get_finetuned_embedding_model_finetuned_embedding_model_name_or_id_get.py +156 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/list_finetuned_embedding_models_finetuned_embedding_model_get.py +127 -0
- orca_sdk/_generated_api_client/api/memoryset/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +181 -0
- orca_sdk/_generated_api_client/api/memoryset/create_analysis_memoryset_name_or_id_analysis_post.py +183 -0
- orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +168 -0
- orca_sdk/_generated_api_client/api/memoryset/delete_memories_memoryset_name_or_id_memories_delete_post.py +181 -0
- orca_sdk/_generated_api_client/api/memoryset/delete_memory_memoryset_name_or_id_memory_memory_id_delete.py +167 -0
- orca_sdk/_generated_api_client/api/memoryset/delete_memoryset_memoryset_name_or_id_delete.py +156 -0
- orca_sdk/_generated_api_client/api/memoryset/get_analysis_memoryset_name_or_id_analysis_analysis_task_id_get.py +169 -0
- orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +188 -0
- orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +169 -0
- orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +156 -0
- orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +184 -0
- orca_sdk/_generated_api_client/api/memoryset/list_analyses_memoryset_name_or_id_analysis_get.py +260 -0
- orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +127 -0
- orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +193 -0
- orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +188 -0
- orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +191 -0
- orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +187 -0
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/embed_with_pretrained_model_gpu_pretrained_embedding_model_model_name_embedding_post.py +188 -0
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/get_pretrained_embedding_model_pretrained_embedding_model_model_name_get.py +157 -0
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/list_pretrained_embedding_models_pretrained_embedding_model_get.py +127 -0
- orca_sdk/_generated_api_client/api/task/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/task/abort_task_task_task_id_abort_delete.py +154 -0
- orca_sdk/_generated_api_client/api/task/get_task_status_task_task_id_status_get.py +156 -0
- orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +243 -0
- orca_sdk/_generated_api_client/api/telemetry/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/telemetry/drop_feedback_category_with_data_telemetry_feedback_category_name_or_id_delete.py +162 -0
- orca_sdk/_generated_api_client/api/telemetry/get_feedback_category_telemetry_feedback_category_name_or_id_get.py +156 -0
- orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +157 -0
- orca_sdk/_generated_api_client/api/telemetry/list_feedback_categories_telemetry_feedback_category_get.py +127 -0
- orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +175 -0
- orca_sdk/_generated_api_client/api/telemetry/record_prediction_feedback_telemetry_prediction_feedback_put.py +171 -0
- orca_sdk/_generated_api_client/api/telemetry/update_prediction_telemetry_prediction_prediction_id_patch.py +181 -0
- orca_sdk/_generated_api_client/client.py +216 -0
- orca_sdk/_generated_api_client/errors.py +38 -0
- orca_sdk/_generated_api_client/models/__init__.py +159 -0
- orca_sdk/_generated_api_client/models/analyze_neighbor_labels_result.py +84 -0
- orca_sdk/_generated_api_client/models/api_key_metadata.py +118 -0
- orca_sdk/_generated_api_client/models/base_model.py +55 -0
- orca_sdk/_generated_api_client/models/body_create_datasource_datasource_post.py +176 -0
- orca_sdk/_generated_api_client/models/classification_evaluation_result.py +114 -0
- orca_sdk/_generated_api_client/models/clone_labeled_memoryset_request.py +150 -0
- orca_sdk/_generated_api_client/models/column_info.py +114 -0
- orca_sdk/_generated_api_client/models/column_type.py +14 -0
- orca_sdk/_generated_api_client/models/conflict_error_response.py +80 -0
- orca_sdk/_generated_api_client/models/create_api_key_request.py +99 -0
- orca_sdk/_generated_api_client/models/create_api_key_response.py +126 -0
- orca_sdk/_generated_api_client/models/create_labeled_memoryset_request.py +259 -0
- orca_sdk/_generated_api_client/models/create_rac_model_request.py +209 -0
- orca_sdk/_generated_api_client/models/datasource_metadata.py +142 -0
- orca_sdk/_generated_api_client/models/delete_memories_request.py +70 -0
- orca_sdk/_generated_api_client/models/embed_request.py +127 -0
- orca_sdk/_generated_api_client/models/embedding_finetuning_method.py +9 -0
- orca_sdk/_generated_api_client/models/evaluation_request.py +180 -0
- orca_sdk/_generated_api_client/models/evaluation_response.py +140 -0
- orca_sdk/_generated_api_client/models/feedback_type.py +9 -0
- orca_sdk/_generated_api_client/models/field_validation_error.py +103 -0
- orca_sdk/_generated_api_client/models/filter_item.py +231 -0
- orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +15 -0
- orca_sdk/_generated_api_client/models/filter_item_field_type_2_item_type_1.py +16 -0
- orca_sdk/_generated_api_client/models/filter_item_op.py +16 -0
- orca_sdk/_generated_api_client/models/find_duplicates_analysis_result.py +70 -0
- orca_sdk/_generated_api_client/models/finetune_embedding_model_request.py +259 -0
- orca_sdk/_generated_api_client/models/finetune_embedding_model_request_training_args.py +66 -0
- orca_sdk/_generated_api_client/models/finetuned_embedding_model_metadata.py +166 -0
- orca_sdk/_generated_api_client/models/get_memories_request.py +70 -0
- orca_sdk/_generated_api_client/models/internal_server_error_response.py +80 -0
- orca_sdk/_generated_api_client/models/label_class_metrics.py +108 -0
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +274 -0
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup_metadata.py +68 -0
- orca_sdk/_generated_api_client/models/label_prediction_result.py +101 -0
- orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +232 -0
- orca_sdk/_generated_api_client/models/labeled_memory.py +197 -0
- orca_sdk/_generated_api_client/models/labeled_memory_insert.py +108 -0
- orca_sdk/_generated_api_client/models/labeled_memory_insert_metadata.py +68 -0
- orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +258 -0
- orca_sdk/_generated_api_client/models/labeled_memory_lookup_metadata.py +68 -0
- orca_sdk/_generated_api_client/models/labeled_memory_metadata.py +68 -0
- orca_sdk/_generated_api_client/models/labeled_memory_metrics.py +277 -0
- orca_sdk/_generated_api_client/models/labeled_memory_update.py +171 -0
- orca_sdk/_generated_api_client/models/labeled_memory_update_metadata_type_0.py +68 -0
- orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +195 -0
- orca_sdk/_generated_api_client/models/list_analyses_memoryset_name_or_id_analysis_get_type_type_0.py +9 -0
- orca_sdk/_generated_api_client/models/list_memories_request.py +104 -0
- orca_sdk/_generated_api_client/models/list_predictions_request.py +234 -0
- orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_0.py +9 -0
- orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_1.py +9 -0
- orca_sdk/_generated_api_client/models/lookup_request.py +81 -0
- orca_sdk/_generated_api_client/models/memoryset_analysis_request.py +83 -0
- orca_sdk/_generated_api_client/models/memoryset_analysis_request_type.py +9 -0
- orca_sdk/_generated_api_client/models/memoryset_analysis_response.py +180 -0
- orca_sdk/_generated_api_client/models/memoryset_analysis_response_config.py +66 -0
- orca_sdk/_generated_api_client/models/memoryset_analysis_response_type.py +9 -0
- orca_sdk/_generated_api_client/models/not_found_error_response.py +100 -0
- orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +20 -0
- orca_sdk/_generated_api_client/models/prediction_feedback.py +157 -0
- orca_sdk/_generated_api_client/models/prediction_feedback_category.py +115 -0
- orca_sdk/_generated_api_client/models/prediction_feedback_request.py +122 -0
- orca_sdk/_generated_api_client/models/prediction_feedback_result.py +102 -0
- orca_sdk/_generated_api_client/models/prediction_request.py +169 -0
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +97 -0
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +11 -0
- orca_sdk/_generated_api_client/models/rac_head_type.py +11 -0
- orca_sdk/_generated_api_client/models/rac_model_metadata.py +191 -0
- orca_sdk/_generated_api_client/models/service_unavailable_error_response.py +80 -0
- orca_sdk/_generated_api_client/models/task.py +198 -0
- orca_sdk/_generated_api_client/models/task_status.py +14 -0
- orca_sdk/_generated_api_client/models/task_status_info.py +133 -0
- orca_sdk/_generated_api_client/models/unauthenticated_error_response.py +72 -0
- orca_sdk/_generated_api_client/models/unauthorized_error_response.py +80 -0
- orca_sdk/_generated_api_client/models/unprocessable_input_error_response.py +94 -0
- orca_sdk/_generated_api_client/models/update_prediction_request.py +93 -0
- orca_sdk/_generated_api_client/py.typed +1 -0
- orca_sdk/_generated_api_client/types.py +56 -0
- orca_sdk/_utils/__init__.py +0 -0
- orca_sdk/_utils/analysis_ui.py +194 -0
- orca_sdk/_utils/analysis_ui_style.css +54 -0
- orca_sdk/_utils/auth.py +63 -0
- orca_sdk/_utils/auth_test.py +31 -0
- orca_sdk/_utils/common.py +37 -0
- orca_sdk/_utils/data_parsing.py +99 -0
- orca_sdk/_utils/data_parsing_test.py +244 -0
- orca_sdk/_utils/prediction_result_ui.css +18 -0
- orca_sdk/_utils/prediction_result_ui.py +64 -0
- orca_sdk/_utils/task.py +73 -0
- orca_sdk/classification_model.py +499 -0
- orca_sdk/classification_model_test.py +266 -0
- orca_sdk/conftest.py +117 -0
- orca_sdk/datasource.py +333 -0
- orca_sdk/datasource_test.py +95 -0
- orca_sdk/embedding_model.py +336 -0
- orca_sdk/embedding_model_test.py +173 -0
- orca_sdk/labeled_memoryset.py +1154 -0
- orca_sdk/labeled_memoryset_test.py +271 -0
- orca_sdk/orca_credentials.py +75 -0
- orca_sdk/orca_credentials_test.py +37 -0
- orca_sdk/telemetry.py +386 -0
- orca_sdk/telemetry_test.py +100 -0
- orca_sdk-0.1.0.dist-info/METADATA +39 -0
- orca_sdk-0.1.0.dist-info/RECORD +175 -0
- orca_sdk-0.1.0.dist-info/WHEEL +4 -0
orca_sdk/datasource.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import tempfile
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from os import PathLike
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import cast
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
import pyarrow as pa
|
|
12
|
+
from datasets import Dataset
|
|
13
|
+
from torch.utils.data import DataLoader as TorchDataLoader
|
|
14
|
+
from torch.utils.data import Dataset as TorchDataset
|
|
15
|
+
|
|
16
|
+
from ._generated_api_client.api import (
|
|
17
|
+
delete_datasource,
|
|
18
|
+
get_datasource,
|
|
19
|
+
list_datasources,
|
|
20
|
+
)
|
|
21
|
+
from ._generated_api_client.api.datasource.create_datasource_datasource_post import (
|
|
22
|
+
_parse_response as parse_create_response,
|
|
23
|
+
)
|
|
24
|
+
from ._generated_api_client.client import get_client
|
|
25
|
+
from ._generated_api_client.models import ColumnType, DatasourceMetadata
|
|
26
|
+
from ._utils.common import CreateMode, DropMode
|
|
27
|
+
from ._utils.data_parsing import hf_dataset_from_disk, hf_dataset_from_torch
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Datasource:
    """
    A Handle to a datasource in the OrcaCloud

    A Datasource is a collection of data saved to the OrcaCloud that can be used to create a [`Memoryset`][orca_sdk.LabeledMemoryset].
    It can be created from a Hugging Face Dataset, a PyTorch DataLoader or Dataset, a list of dictionaries, a dictionary of columns, a pandas DataFrame, a pyarrow Table, or a local file.

    Attributes:
        id: Unique identifier for the datasource
        name: Unique name of the datasource
        length: Number of rows in the datasource
        created_at: When the datasource was created
        updated_at: The `updated_at` timestamp reported by the datasource metadata
        columns: Dictionary of column names and types
    """

    id: str
    name: str
    length: int
    created_at: datetime
    updated_at: datetime
    columns: dict[str, str]

    def __init__(self, metadata: DatasourceMetadata):
        # for internal use only, do not document
        self.id = metadata.id
        self.name = metadata.name
        self.length = metadata.length
        self.created_at = metadata.created_at
        self.updated_at = metadata.updated_at
        self.columns = {column.name: self._describe_column_type(column) for column in metadata.columns}

    @staticmethod
    def _describe_column_type(column) -> str:
        # Render one column's type as the short label stored in `columns`.
        if column.type == ColumnType.ENUM:
            # A falsy `enum_options` (missing or empty) yields an empty option list: "enum()"
            options = ", ".join(repr(option) for option in (column.enum_options or []))
            return f"enum({options})"
        if column.type == ColumnType.STRING:
            return "str"
        return column.type.value.lower()

    def __eq__(self, other) -> bool:
        # Two handles are equal when they refer to the same datasource id
        return isinstance(other, Datasource) and self.id == other.id

    def __hash__(self) -> int:
        # Defining __eq__ alone would make handles unhashable; hash on the same key __eq__ compares
        return hash(self.id)

    def __repr__(self) -> str:
        column_lines = "\n ".join([f"{k}: {v}" for k, v in self.columns.items()])
        # The brace-only pieces are plain strings, so braces must NOT be doubled here
        return (
            "Datasource({\n"
            + f" name: '{self.name}',\n"
            + f" length: {self.length},\n"
            + " columns: {\n "
            + column_lines
            + "\n }\n"
            + "})"
        )

    @classmethod
    def from_hf_dataset(cls, name: str, dataset: Dataset, if_exists: CreateMode = "error") -> Datasource:
        """
        Create a new datasource from a Hugging Face Dataset

        Params:
            name: Required name for the new datasource (must be unique)
            dataset: The Hugging Face Dataset to create the datasource from
            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`
        """
        client = get_client()

        if cls.exists(name):
            if if_exists == "error":
                raise ValueError(f"Dataset with name {name} already exists")
            elif if_exists == "open":
                return cls.open(name)

        with tempfile.TemporaryDirectory() as tmp_dir:
            dataset.save_to_disk(tmp_dir)
            files = []
            try:
                for file_path in Path(tmp_dir).iterdir():
                    files.append(("files", open(file_path, "rb")))

                # Do not use Generated client for this endpoint b/c it does not handle files properly
                metadata = parse_create_response(
                    response=client.get_httpx_client().request(
                        method="post",
                        url="/datasource/",
                        files=files,
                        data={"name": name},
                    )
                )
            finally:
                # Close every upload handle before the temporary directory is removed,
                # whether or not the request succeeded
                for _, buffered_reader in files:
                    buffered_reader.close()
        return cls(metadata=metadata)

    @classmethod
    def from_pytorch(
        cls,
        name: str,
        torch_data: TorchDataLoader | TorchDataset,
        column_names: list[str] | None = None,
        if_exists: CreateMode = "error",
    ) -> Datasource:
        """
        Create a new datasource from a PyTorch DataLoader or Dataset

        Params:
            name: Required name for the new datasource (must be unique)
            torch_data: The PyTorch DataLoader or Dataset to create the datasource from
            column_names: If the provided dataset or data loader returns unnamed tuples, this
                argument must be provided to specify the names of the columns.
            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`
        """
        hf_dataset = hf_dataset_from_torch(torch_data, column_names=column_names)
        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)

    @classmethod
    def from_list(cls, name: str, data: list[dict], if_exists: CreateMode = "error") -> Datasource:
        """
        Create a new datasource from a list of dictionaries

        Params:
            name: Required name for the new datasource (must be unique)
            data: The list of dictionaries to create the datasource from
            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`

        Examples:
            >>> Datasource.from_list("my_datasource", [{"text": "Hello, world!", "label": 1}, {"text": "Goodbye", "label": 0}])
        """
        hf_dataset = Dataset.from_list(data)
        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)

    @classmethod
    def from_dict(cls, name: str, data: dict, if_exists: CreateMode = "error") -> Datasource:
        """
        Create a new datasource from a dictionary of columns

        Params:
            name: Required name for the new datasource (must be unique)
            data: The dictionary of columns to create the datasource from
            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`

        Examples:
            >>> Datasource.from_dict("my_datasource", {"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
        """
        hf_dataset = Dataset.from_dict(data)
        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)

    @classmethod
    def from_pandas(cls, name: str, dataframe: pd.DataFrame, if_exists: CreateMode = "error") -> Datasource:
        """
        Create a new datasource from a pandas DataFrame

        Params:
            name: Required name for the new datasource (must be unique)
            dataframe: The pandas DataFrame to create the datasource from
            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`
        """
        hf_dataset = Dataset.from_pandas(dataframe)
        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)

    @classmethod
    def from_arrow(cls, name: str, pyarrow_table: pa.Table, if_exists: CreateMode = "error") -> Datasource:
        """
        Create a new datasource from a pyarrow Table

        Params:
            name: Required name for the new datasource (must be unique)
            pyarrow_table: The pyarrow Table to create the datasource from
            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`
        """
        hf_dataset = Dataset(pyarrow_table)
        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)

    @classmethod
    def from_disk(cls, name: str, file_path: str | PathLike, if_exists: CreateMode = "error") -> Datasource:
        """
        Create a new datasource from a local file

        Params:
            name: Required name for the new datasource (must be unique)
            file_path: Path to the file on disk to create the datasource from. The file type will
                be inferred from the file extension. The following file types are supported:

                - .pkl: [`Pickle`][pickle] files containing lists of dictionaries or dictionaries of columns
                - .json/.jsonl: [`JSON`][json] and JSON Lines files
                - .csv: [`CSV`][csv] files
                - .parquet: [`Parquet`][pyarrow.parquet.ParquetFile] files
                - dataset directory: Directory containing a saved HuggingFace [`Dataset`][datasets.Dataset]

            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`
        """
        hf_dataset = hf_dataset_from_disk(file_path)
        return cls.from_hf_dataset(name, cast(Dataset, hf_dataset), if_exists=if_exists)

    @classmethod
    def open(cls, name: str) -> Datasource:
        """
        Get a handle to a datasource by name or id in the OrcaCloud

        Params:
            name: The name or unique identifier of the datasource to get

        Returns:
            A handle to the existing datasource in the OrcaCloud

        Raises:
            LookupError: If the datasource does not exist
        """
        return cls(get_datasource(name))

    @classmethod
    def exists(cls, name_or_id: str) -> bool:
        """
        Check if a datasource exists in the OrcaCloud

        Params:
            name_or_id: The name or id of the datasource to check

        Returns:
            `True` if the datasource exists, `False` otherwise
        """
        try:
            cls.open(name_or_id)
            return True
        except LookupError:
            return False

    @classmethod
    def all(cls) -> list[Datasource]:
        """
        List all datasource handles in the OrcaCloud

        Returns:
            A list of all datasource handles in the OrcaCloud
        """
        return [cls(metadata) for metadata in list_datasources()]

    @classmethod
    def drop(cls, name_or_id: str, if_not_exists: DropMode = "error") -> None:
        """
        Delete a datasource from the OrcaCloud

        Params:
            name_or_id: The name or id of the datasource to delete
            if_not_exists: What to do if the datasource does not exist, defaults to
                `"error"`. Other option is `"ignore"` to do nothing.

        Raises:
            LookupError: If the datasource does not exist and if_not_exists is `"error"`
        """
        try:
            delete_datasource(name_or_id)
            logging.info(f"Deleted datasource {name_or_id}")
        except LookupError:
            # Any mode other than "error" swallows the missing-datasource failure
            if if_not_exists == "error":
                raise

    def __len__(self) -> int:
        return self.length
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from uuid import uuid4
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from .datasource import Datasource
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_create_datasource(datasource, hf_dataset):
    """The `datasource` fixture has the expected name and as many rows as `hf_dataset`."""
    expected_rows = len(hf_dataset)
    assert datasource is not None
    assert datasource.name == "test_datasource"
    assert datasource.length == expected_rows
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_create_datasource_unauthenticated(unauthenticated, hf_dataset):
    """Creating a datasource under the `unauthenticated` fixture raises an invalid-API-key ValueError."""
    expect_invalid_key = pytest.raises(ValueError, match="Invalid API key")
    with expect_invalid_key:
        Datasource.from_hf_dataset("test_datasource", hf_dataset)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_create_datasource_already_exists_error(hf_dataset, datasource):
    """Re-creating the fixture's datasource name with `if_exists="error"` raises ValueError."""
    expect_conflict = pytest.raises(ValueError)
    with expect_conflict:
        Datasource.from_hf_dataset("test_datasource", hf_dataset, if_exists="error")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_create_datasource_already_exists_return(hf_dataset, datasource):
    """With `if_exists="open"`, creating an existing name returns a handle instead of raising."""
    reopened = Datasource.from_hf_dataset("test_datasource", hf_dataset, if_exists="open")
    expected_rows = len(hf_dataset)
    assert reopened is not None
    assert reopened.name == "test_datasource"
    assert reopened.length == expected_rows
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_open_datasource(datasource):
    """Opening by name returns a handle with the same name and length as the fixture."""
    target_name = datasource.name
    reopened = Datasource.open(target_name)
    assert reopened is not None
    assert reopened.name == target_name
    assert reopened.length == len(datasource)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_open_datasource_unauthenticated(datasource, unauthenticated):
    """Opening a datasource under the `unauthenticated` fixture raises an invalid-API-key ValueError."""
    expect_invalid_key = pytest.raises(ValueError, match="Invalid API key")
    with expect_invalid_key:
        Datasource.open("test_datasource")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_open_datasource_invalid_input():
    """Opening with the identifier "not valid id" raises a ValueError starting "Invalid input:"."""
    malformed_identifier = "not valid id"
    with pytest.raises(ValueError, match=r"Invalid input:.*"):
        Datasource.open(malformed_identifier)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_open_datasource_not_found():
    """Opening a freshly generated random UUID raises LookupError."""
    unknown_id = str(uuid4())
    with pytest.raises(LookupError):
        Datasource.open(unknown_id)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_open_datasource_unauthorized(datasource, unauthorized):
    """Opening the fixture's datasource id under the `unauthorized` fixture raises LookupError."""
    target_id = datasource.id
    with pytest.raises(LookupError):
        Datasource.open(target_id)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_all_datasources(datasource):
    """`Datasource.all()` returns a non-empty list that includes the fixture's datasource."""
    datasources = Datasource.all()
    assert len(datasources) > 0
    # The loop variable must not reuse the fixture name, or the comparison becomes
    # `x.name == x.name` and passes regardless of what the listing contains.
    assert any(listed.name == datasource.name for listed in datasources)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_all_datasources_unauthenticated(unauthenticated):
    """Listing datasources under the `unauthenticated` fixture raises an invalid-API-key ValueError."""
    expect_invalid_key = pytest.raises(ValueError, match="Invalid API key")
    with expect_invalid_key:
        Datasource.all()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def test_drop_datasource(hf_dataset):
    """A datasource exists after creation and no longer exists after `drop`."""
    doomed_name = "datasource_to_delete"
    Datasource.from_hf_dataset(doomed_name, hf_dataset)
    assert Datasource.exists(doomed_name)
    Datasource.drop(doomed_name)
    assert not Datasource.exists(doomed_name)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_drop_datasource_unauthenticated(datasource, unauthenticated):
    """Dropping a datasource under the `unauthenticated` fixture raises an invalid-API-key ValueError."""
    target_id = datasource.id
    with pytest.raises(ValueError, match="Invalid API key"):
        Datasource.drop(target_id)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_drop_datasource_not_found():
    """Dropping an unknown id raises LookupError by default and is silent with `if_not_exists="ignore"`."""
    unknown_id = str(uuid4())
    with pytest.raises(LookupError):
        Datasource.drop(unknown_id)
    # ignores error if specified
    another_unknown_id = str(uuid4())
    Datasource.drop(another_unknown_id, if_not_exists="ignore")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_drop_datasource_unauthorized(datasource, unauthorized):
    """Dropping the fixture's datasource id under the `unauthorized` fixture raises LookupError."""
    target_id = datasource.id
    with pytest.raises(LookupError):
        Datasource.drop(target_id)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def test_drop_datasource_invalid_input():
    """Dropping with the identifier "not valid id" raises a ValueError starting "Invalid input:"."""
    malformed_identifier = "not valid id"
    with pytest.raises(ValueError, match=r"Invalid input:.*"):
        Datasource.drop(malformed_identifier)
|