orca-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. orca_sdk/__init__.py +19 -0
  2. orca_sdk/_generated_api_client/__init__.py +3 -0
  3. orca_sdk/_generated_api_client/api/__init__.py +193 -0
  4. orca_sdk/_generated_api_client/api/auth/__init__.py +0 -0
  5. orca_sdk/_generated_api_client/api/auth/check_authentication_auth_get.py +128 -0
  6. orca_sdk/_generated_api_client/api/auth/create_api_key_auth_api_key_post.py +170 -0
  7. orca_sdk/_generated_api_client/api/auth/delete_api_key_auth_api_key_name_or_id_delete.py +156 -0
  8. orca_sdk/_generated_api_client/api/auth/delete_org_auth_org_delete.py +130 -0
  9. orca_sdk/_generated_api_client/api/auth/list_api_keys_auth_api_key_get.py +127 -0
  10. orca_sdk/_generated_api_client/api/classification_model/__init__.py +0 -0
  11. orca_sdk/_generated_api_client/api/classification_model/create_evaluation_classification_model_model_name_or_id_evaluation_post.py +183 -0
  12. orca_sdk/_generated_api_client/api/classification_model/create_model_classification_model_post.py +170 -0
  13. orca_sdk/_generated_api_client/api/classification_model/delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py +168 -0
  14. orca_sdk/_generated_api_client/api/classification_model/delete_model_classification_model_name_or_id_delete.py +154 -0
  15. orca_sdk/_generated_api_client/api/classification_model/get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py +170 -0
  16. orca_sdk/_generated_api_client/api/classification_model/get_model_classification_model_name_or_id_get.py +156 -0
  17. orca_sdk/_generated_api_client/api/classification_model/list_evaluations_classification_model_model_name_or_id_evaluation_get.py +161 -0
  18. orca_sdk/_generated_api_client/api/classification_model/list_models_classification_model_get.py +127 -0
  19. orca_sdk/_generated_api_client/api/classification_model/predict_gpu_classification_model_name_or_id_prediction_post.py +190 -0
  20. orca_sdk/_generated_api_client/api/datasource/__init__.py +0 -0
  21. orca_sdk/_generated_api_client/api/datasource/create_datasource_datasource_post.py +167 -0
  22. orca_sdk/_generated_api_client/api/datasource/delete_datasource_datasource_name_or_id_delete.py +156 -0
  23. orca_sdk/_generated_api_client/api/datasource/get_datasource_datasource_name_or_id_get.py +156 -0
  24. orca_sdk/_generated_api_client/api/datasource/list_datasources_datasource_get.py +127 -0
  25. orca_sdk/_generated_api_client/api/default/__init__.py +0 -0
  26. orca_sdk/_generated_api_client/api/default/healthcheck_get.py +118 -0
  27. orca_sdk/_generated_api_client/api/default/healthcheck_gpu_get.py +118 -0
  28. orca_sdk/_generated_api_client/api/finetuned_embedding_model/__init__.py +0 -0
  29. orca_sdk/_generated_api_client/api/finetuned_embedding_model/create_finetuned_embedding_model_finetuned_embedding_model_post.py +168 -0
  30. orca_sdk/_generated_api_client/api/finetuned_embedding_model/delete_finetuned_embedding_model_finetuned_embedding_model_name_or_id_delete.py +156 -0
  31. orca_sdk/_generated_api_client/api/finetuned_embedding_model/embed_with_finetuned_model_gpu_finetuned_embedding_model_name_or_id_embedding_post.py +189 -0
  32. orca_sdk/_generated_api_client/api/finetuned_embedding_model/get_finetuned_embedding_model_finetuned_embedding_model_name_or_id_get.py +156 -0
  33. orca_sdk/_generated_api_client/api/finetuned_embedding_model/list_finetuned_embedding_models_finetuned_embedding_model_get.py +127 -0
  34. orca_sdk/_generated_api_client/api/memoryset/__init__.py +0 -0
  35. orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +181 -0
  36. orca_sdk/_generated_api_client/api/memoryset/create_analysis_memoryset_name_or_id_analysis_post.py +183 -0
  37. orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +168 -0
  38. orca_sdk/_generated_api_client/api/memoryset/delete_memories_memoryset_name_or_id_memories_delete_post.py +181 -0
  39. orca_sdk/_generated_api_client/api/memoryset/delete_memory_memoryset_name_or_id_memory_memory_id_delete.py +167 -0
  40. orca_sdk/_generated_api_client/api/memoryset/delete_memoryset_memoryset_name_or_id_delete.py +156 -0
  41. orca_sdk/_generated_api_client/api/memoryset/get_analysis_memoryset_name_or_id_analysis_analysis_task_id_get.py +169 -0
  42. orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +188 -0
  43. orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +169 -0
  44. orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +156 -0
  45. orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +184 -0
  46. orca_sdk/_generated_api_client/api/memoryset/list_analyses_memoryset_name_or_id_analysis_get.py +260 -0
  47. orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +127 -0
  48. orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +193 -0
  49. orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +188 -0
  50. orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +191 -0
  51. orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +187 -0
  52. orca_sdk/_generated_api_client/api/pretrained_embedding_model/__init__.py +0 -0
  53. orca_sdk/_generated_api_client/api/pretrained_embedding_model/embed_with_pretrained_model_gpu_pretrained_embedding_model_model_name_embedding_post.py +188 -0
  54. orca_sdk/_generated_api_client/api/pretrained_embedding_model/get_pretrained_embedding_model_pretrained_embedding_model_model_name_get.py +157 -0
  55. orca_sdk/_generated_api_client/api/pretrained_embedding_model/list_pretrained_embedding_models_pretrained_embedding_model_get.py +127 -0
  56. orca_sdk/_generated_api_client/api/task/__init__.py +0 -0
  57. orca_sdk/_generated_api_client/api/task/abort_task_task_task_id_abort_delete.py +154 -0
  58. orca_sdk/_generated_api_client/api/task/get_task_status_task_task_id_status_get.py +156 -0
  59. orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +243 -0
  60. orca_sdk/_generated_api_client/api/telemetry/__init__.py +0 -0
  61. orca_sdk/_generated_api_client/api/telemetry/drop_feedback_category_with_data_telemetry_feedback_category_name_or_id_delete.py +162 -0
  62. orca_sdk/_generated_api_client/api/telemetry/get_feedback_category_telemetry_feedback_category_name_or_id_get.py +156 -0
  63. orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +157 -0
  64. orca_sdk/_generated_api_client/api/telemetry/list_feedback_categories_telemetry_feedback_category_get.py +127 -0
  65. orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +175 -0
  66. orca_sdk/_generated_api_client/api/telemetry/record_prediction_feedback_telemetry_prediction_feedback_put.py +171 -0
  67. orca_sdk/_generated_api_client/api/telemetry/update_prediction_telemetry_prediction_prediction_id_patch.py +181 -0
  68. orca_sdk/_generated_api_client/client.py +216 -0
  69. orca_sdk/_generated_api_client/errors.py +38 -0
  70. orca_sdk/_generated_api_client/models/__init__.py +159 -0
  71. orca_sdk/_generated_api_client/models/analyze_neighbor_labels_result.py +84 -0
  72. orca_sdk/_generated_api_client/models/api_key_metadata.py +118 -0
  73. orca_sdk/_generated_api_client/models/base_model.py +55 -0
  74. orca_sdk/_generated_api_client/models/body_create_datasource_datasource_post.py +176 -0
  75. orca_sdk/_generated_api_client/models/classification_evaluation_result.py +114 -0
  76. orca_sdk/_generated_api_client/models/clone_labeled_memoryset_request.py +150 -0
  77. orca_sdk/_generated_api_client/models/column_info.py +114 -0
  78. orca_sdk/_generated_api_client/models/column_type.py +14 -0
  79. orca_sdk/_generated_api_client/models/conflict_error_response.py +80 -0
  80. orca_sdk/_generated_api_client/models/create_api_key_request.py +99 -0
  81. orca_sdk/_generated_api_client/models/create_api_key_response.py +126 -0
  82. orca_sdk/_generated_api_client/models/create_labeled_memoryset_request.py +259 -0
  83. orca_sdk/_generated_api_client/models/create_rac_model_request.py +209 -0
  84. orca_sdk/_generated_api_client/models/datasource_metadata.py +142 -0
  85. orca_sdk/_generated_api_client/models/delete_memories_request.py +70 -0
  86. orca_sdk/_generated_api_client/models/embed_request.py +127 -0
  87. orca_sdk/_generated_api_client/models/embedding_finetuning_method.py +9 -0
  88. orca_sdk/_generated_api_client/models/evaluation_request.py +180 -0
  89. orca_sdk/_generated_api_client/models/evaluation_response.py +140 -0
  90. orca_sdk/_generated_api_client/models/feedback_type.py +9 -0
  91. orca_sdk/_generated_api_client/models/field_validation_error.py +103 -0
  92. orca_sdk/_generated_api_client/models/filter_item.py +231 -0
  93. orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +15 -0
  94. orca_sdk/_generated_api_client/models/filter_item_field_type_2_item_type_1.py +16 -0
  95. orca_sdk/_generated_api_client/models/filter_item_op.py +16 -0
  96. orca_sdk/_generated_api_client/models/find_duplicates_analysis_result.py +70 -0
  97. orca_sdk/_generated_api_client/models/finetune_embedding_model_request.py +259 -0
  98. orca_sdk/_generated_api_client/models/finetune_embedding_model_request_training_args.py +66 -0
  99. orca_sdk/_generated_api_client/models/finetuned_embedding_model_metadata.py +166 -0
  100. orca_sdk/_generated_api_client/models/get_memories_request.py +70 -0
  101. orca_sdk/_generated_api_client/models/internal_server_error_response.py +80 -0
  102. orca_sdk/_generated_api_client/models/label_class_metrics.py +108 -0
  103. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +274 -0
  104. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup_metadata.py +68 -0
  105. orca_sdk/_generated_api_client/models/label_prediction_result.py +101 -0
  106. orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +232 -0
  107. orca_sdk/_generated_api_client/models/labeled_memory.py +197 -0
  108. orca_sdk/_generated_api_client/models/labeled_memory_insert.py +108 -0
  109. orca_sdk/_generated_api_client/models/labeled_memory_insert_metadata.py +68 -0
  110. orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +258 -0
  111. orca_sdk/_generated_api_client/models/labeled_memory_lookup_metadata.py +68 -0
  112. orca_sdk/_generated_api_client/models/labeled_memory_metadata.py +68 -0
  113. orca_sdk/_generated_api_client/models/labeled_memory_metrics.py +277 -0
  114. orca_sdk/_generated_api_client/models/labeled_memory_update.py +171 -0
  115. orca_sdk/_generated_api_client/models/labeled_memory_update_metadata_type_0.py +68 -0
  116. orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +195 -0
  117. orca_sdk/_generated_api_client/models/list_analyses_memoryset_name_or_id_analysis_get_type_type_0.py +9 -0
  118. orca_sdk/_generated_api_client/models/list_memories_request.py +104 -0
  119. orca_sdk/_generated_api_client/models/list_predictions_request.py +234 -0
  120. orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_0.py +9 -0
  121. orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_1.py +9 -0
  122. orca_sdk/_generated_api_client/models/lookup_request.py +81 -0
  123. orca_sdk/_generated_api_client/models/memoryset_analysis_request.py +83 -0
  124. orca_sdk/_generated_api_client/models/memoryset_analysis_request_type.py +9 -0
  125. orca_sdk/_generated_api_client/models/memoryset_analysis_response.py +180 -0
  126. orca_sdk/_generated_api_client/models/memoryset_analysis_response_config.py +66 -0
  127. orca_sdk/_generated_api_client/models/memoryset_analysis_response_type.py +9 -0
  128. orca_sdk/_generated_api_client/models/not_found_error_response.py +100 -0
  129. orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +20 -0
  130. orca_sdk/_generated_api_client/models/prediction_feedback.py +157 -0
  131. orca_sdk/_generated_api_client/models/prediction_feedback_category.py +115 -0
  132. orca_sdk/_generated_api_client/models/prediction_feedback_request.py +122 -0
  133. orca_sdk/_generated_api_client/models/prediction_feedback_result.py +102 -0
  134. orca_sdk/_generated_api_client/models/prediction_request.py +169 -0
  135. orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +97 -0
  136. orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +11 -0
  137. orca_sdk/_generated_api_client/models/rac_head_type.py +11 -0
  138. orca_sdk/_generated_api_client/models/rac_model_metadata.py +191 -0
  139. orca_sdk/_generated_api_client/models/service_unavailable_error_response.py +80 -0
  140. orca_sdk/_generated_api_client/models/task.py +198 -0
  141. orca_sdk/_generated_api_client/models/task_status.py +14 -0
  142. orca_sdk/_generated_api_client/models/task_status_info.py +133 -0
  143. orca_sdk/_generated_api_client/models/unauthenticated_error_response.py +72 -0
  144. orca_sdk/_generated_api_client/models/unauthorized_error_response.py +80 -0
  145. orca_sdk/_generated_api_client/models/unprocessable_input_error_response.py +94 -0
  146. orca_sdk/_generated_api_client/models/update_prediction_request.py +93 -0
  147. orca_sdk/_generated_api_client/py.typed +1 -0
  148. orca_sdk/_generated_api_client/types.py +56 -0
  149. orca_sdk/_utils/__init__.py +0 -0
  150. orca_sdk/_utils/analysis_ui.py +194 -0
  151. orca_sdk/_utils/analysis_ui_style.css +54 -0
  152. orca_sdk/_utils/auth.py +63 -0
  153. orca_sdk/_utils/auth_test.py +31 -0
  154. orca_sdk/_utils/common.py +37 -0
  155. orca_sdk/_utils/data_parsing.py +99 -0
  156. orca_sdk/_utils/data_parsing_test.py +244 -0
  157. orca_sdk/_utils/prediction_result_ui.css +18 -0
  158. orca_sdk/_utils/prediction_result_ui.py +64 -0
  159. orca_sdk/_utils/task.py +73 -0
  160. orca_sdk/classification_model.py +499 -0
  161. orca_sdk/classification_model_test.py +266 -0
  162. orca_sdk/conftest.py +117 -0
  163. orca_sdk/datasource.py +333 -0
  164. orca_sdk/datasource_test.py +95 -0
  165. orca_sdk/embedding_model.py +336 -0
  166. orca_sdk/embedding_model_test.py +173 -0
  167. orca_sdk/labeled_memoryset.py +1154 -0
  168. orca_sdk/labeled_memoryset_test.py +271 -0
  169. orca_sdk/orca_credentials.py +75 -0
  170. orca_sdk/orca_credentials_test.py +37 -0
  171. orca_sdk/telemetry.py +386 -0
  172. orca_sdk/telemetry_test.py +100 -0
  173. orca_sdk-0.1.0.dist-info/METADATA +39 -0
  174. orca_sdk-0.1.0.dist-info/RECORD +175 -0
  175. orca_sdk-0.1.0.dist-info/WHEEL +4 -0
orca_sdk/datasource.py ADDED
@@ -0,0 +1,333 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import tempfile
5
+ from datetime import datetime
6
+ from os import PathLike
7
+ from pathlib import Path
8
+ from typing import cast
9
+
10
+ import pandas as pd
11
+ import pyarrow as pa
12
+ from datasets import Dataset
13
+ from torch.utils.data import DataLoader as TorchDataLoader
14
+ from torch.utils.data import Dataset as TorchDataset
15
+
16
+ from ._generated_api_client.api import (
17
+ delete_datasource,
18
+ get_datasource,
19
+ list_datasources,
20
+ )
21
+ from ._generated_api_client.api.datasource.create_datasource_datasource_post import (
22
+ _parse_response as parse_create_response,
23
+ )
24
+ from ._generated_api_client.client import get_client
25
+ from ._generated_api_client.models import ColumnType, DatasourceMetadata
26
+ from ._utils.common import CreateMode, DropMode
27
+ from ._utils.data_parsing import hf_dataset_from_disk, hf_dataset_from_torch
28
+
29
+
30
class Datasource:
    """
    A handle to a datasource in the OrcaCloud

    A Datasource is a collection of data saved to the OrcaCloud that can be used to create a [`Memoryset`][orca_sdk.LabeledMemoryset].
    It can be created from a Hugging Face Dataset, a PyTorch DataLoader or Dataset, a list of dictionaries, a dictionary of columns, a pandas DataFrame, a pyarrow Table, or a local file.

    Attributes:
        id: Unique identifier for the datasource
        name: Unique name of the datasource
        length: Number of rows in the datasource
        created_at: When the datasource was created
        updated_at: Update timestamp reported in the datasource metadata
        columns: Dictionary of column names and types
    """

    id: str
    name: str
    length: int
    created_at: datetime
    updated_at: datetime
    columns: dict[str, str]

    def __init__(self, metadata: DatasourceMetadata):
        # for internal use only, do not document
        self.id = metadata.id
        self.name = metadata.name
        self.length = metadata.length
        self.created_at = metadata.created_at
        self.updated_at = metadata.updated_at
        self.columns = {column.name: self._describe_column(column) for column in metadata.columns}

    @staticmethod
    def _describe_column(column) -> str:
        # Render a column's type as a short string: `enum('a', 'b')` for enum
        # columns, `str` for string columns, otherwise the lowercased type value.
        if column.type == ColumnType.ENUM:
            options = ", ".join(f"{option!r}" for option in column.enum_options) if column.enum_options else ""
            return f"enum({options})"
        if column.type == ColumnType.STRING:
            return "str"
        return column.type.value.lower()

    def __eq__(self, other) -> bool:
        # Two handles are equal when they point at the same datasource id.
        return isinstance(other, Datasource) and self.id == other.id

    def __hash__(self) -> int:
        # Defining __eq__ alone would make instances unhashable; hash on the
        # same key that equality compares so handles work in sets and dicts.
        return hash(self.id)

    def __repr__(self) -> str:
        column_lines = "\n ".join([f"{k}: {v}" for k, v in self.columns.items()])
        # The brace-only pieces are plain strings (not f-strings), so braces
        # must be written once; doubling them would print "{{" and "}}".
        return (
            "Datasource({\n"
            + f" name: '{self.name}',\n"
            + f" length: {self.length},\n"
            + " columns: {\n "
            + column_lines
            + "\n }\n"
            + "})"
        )

    @classmethod
    def from_hf_dataset(cls, name: str, dataset: Dataset, if_exists: CreateMode = "error") -> Datasource:
        """
        Create a new datasource from a Hugging Face Dataset

        Params:
            name: Required name for the new datasource (must be unique)
            dataset: The Hugging Face Dataset to create the datasource from
            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`
        """
        client = get_client()

        if cls.exists(name):
            if if_exists == "error":
                raise ValueError(f"Dataset with name {name} already exists")
            elif if_exists == "open":
                return cls.open(name)

        with tempfile.TemporaryDirectory() as tmp_dir:
            dataset.save_to_disk(tmp_dir)
            files = []
            try:
                for file_path in Path(tmp_dir).iterdir():
                    files.append(("files", open(file_path, "rb")))

                # Do not use Generated client for this endpoint b/c it does not handle files properly
                metadata = parse_create_response(
                    response=client.get_httpx_client().request(
                        method="post",
                        url="/datasource/",
                        files=files,
                        data={"name": name},
                    )
                )
            finally:
                # Close every upload handle, even when the request fails, so no
                # file descriptors leak and the temporary directory can be
                # removed (open files block deletion on some platforms).
                for _, reader in files:
                    reader.close()
        return cls(metadata=metadata)

    @classmethod
    def from_pytorch(
        cls,
        name: str,
        torch_data: TorchDataLoader | TorchDataset,
        column_names: list[str] | None = None,
        if_exists: CreateMode = "error",
    ) -> Datasource:
        """
        Create a new datasource from a PyTorch DataLoader or Dataset

        Params:
            name: Required name for the new datasource (must be unique)
            torch_data: The PyTorch DataLoader or Dataset to create the datasource from
            column_names: If the provided dataset or data loader returns unnamed tuples, this
                argument must be provided to specify the names of the columns.
            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`
        """
        hf_dataset = hf_dataset_from_torch(torch_data, column_names=column_names)
        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)

    @classmethod
    def from_list(cls, name: str, data: list[dict], if_exists: CreateMode = "error") -> Datasource:
        """
        Create a new datasource from a list of dictionaries

        Params:
            name: Required name for the new datasource (must be unique)
            data: The list of dictionaries to create the datasource from
            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`

        Examples:
            >>> Datasource.from_list("my_datasource", [{"text": "Hello, world!", "label": 1}, {"text": "Goodbye", "label": 0}])
        """
        hf_dataset = Dataset.from_list(data)
        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)

    @classmethod
    def from_dict(cls, name: str, data: dict, if_exists: CreateMode = "error") -> Datasource:
        """
        Create a new datasource from a dictionary of columns

        Params:
            name: Required name for the new datasource (must be unique)
            data: The dictionary of columns to create the datasource from
            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`

        Examples:
            >>> Datasource.from_dict("my_datasource", {"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
        """
        hf_dataset = Dataset.from_dict(data)
        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)

    @classmethod
    def from_pandas(cls, name: str, dataframe: pd.DataFrame, if_exists: CreateMode = "error") -> Datasource:
        """
        Create a new datasource from a pandas DataFrame

        Params:
            name: Required name for the new datasource (must be unique)
            dataframe: The pandas DataFrame to create the datasource from
            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`
        """
        hf_dataset = Dataset.from_pandas(dataframe)
        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)

    @classmethod
    def from_arrow(cls, name: str, pyarrow_table: pa.Table, if_exists: CreateMode = "error") -> Datasource:
        """
        Create a new datasource from a pyarrow Table

        Params:
            name: Required name for the new datasource (must be unique)
            pyarrow_table: The pyarrow Table to create the datasource from
            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`
        """
        hf_dataset = Dataset(pyarrow_table)
        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)

    @classmethod
    def from_disk(cls, name: str, file_path: str | PathLike, if_exists: CreateMode = "error") -> Datasource:
        """
        Create a new datasource from a local file

        Params:
            name: Required name for the new datasource (must be unique)
            file_path: Path to the file on disk to create the datasource from. The file type will
                be inferred from the file extension. The following file types are supported:

                - .pkl: [`Pickle`][pickle] files containing lists of dictionaries or dictionaries of columns
                - .json/.jsonl: [`JSON`][json] and JSON Lines files
                - .csv: [`CSV`][csv] files
                - .parquet: [`Parquet`][pyarrow.parquet.ParquetFile] files
                - dataset directory: Directory containing a saved HuggingFace [`Dataset`][datasets.Dataset]

            if_exists: What to do if a datasource with the same name already exists, defaults to
                `"error"`. Other option is `"open"` to open the existing datasource.

        Returns:
            A handle to the new datasource in the OrcaCloud

        Raises:
            ValueError: If the datasource already exists and if_exists is `"error"`
        """
        hf_dataset = hf_dataset_from_disk(file_path)
        return cls.from_hf_dataset(name, cast(Dataset, hf_dataset), if_exists=if_exists)

    @classmethod
    def open(cls, name: str) -> Datasource:
        """
        Get a handle to a datasource by name or id in the OrcaCloud

        Params:
            name: The name or unique identifier of the datasource to get

        Returns:
            A handle to the existing datasource in the OrcaCloud

        Raises:
            LookupError: If the datasource does not exist
        """
        return cls(get_datasource(name))

    @classmethod
    def exists(cls, name_or_id: str) -> bool:
        """
        Check if a datasource exists in the OrcaCloud

        Params:
            name_or_id: The name or id of the datasource to check

        Returns:
            `True` if the datasource exists, `False` otherwise
        """
        try:
            cls.open(name_or_id)
            return True
        except LookupError:
            return False

    @classmethod
    def all(cls) -> list[Datasource]:
        """
        List all datasource handles in the OrcaCloud

        Returns:
            A list of all datasource handles in the OrcaCloud
        """
        return [cls(metadata) for metadata in list_datasources()]

    @classmethod
    def drop(cls, name_or_id: str, if_not_exists: DropMode = "error") -> None:
        """
        Delete a datasource from the OrcaCloud

        Params:
            name_or_id: The name or id of the datasource to delete
            if_not_exists: What to do if the datasource does not exist, defaults to
                `"error"`. Other option is `"ignore"` to do nothing.

        Raises:
            LookupError: If the datasource does not exist and if_not_exists is `"error"`
        """
        try:
            delete_datasource(name_or_id)
            logging.info("Deleted datasource %s", name_or_id)
        except LookupError:
            # Only swallow the lookup failure when the caller asked to ignore it.
            if if_not_exists == "error":
                raise

    def __len__(self) -> int:
        # Number of rows, as reported by the metadata this handle was built from.
        return self.length
@@ -0,0 +1,95 @@
1
+ from uuid import uuid4
2
+
3
+ import pytest
4
+
5
+ from .datasource import Datasource
6
+
7
+
8
def test_create_datasource(datasource, hf_dataset):
    """The `datasource` argument should carry the expected name and the row count of `hf_dataset`."""
    expected_length = len(hf_dataset)
    assert datasource is not None
    assert datasource.name == "test_datasource"
    assert datasource.length == expected_length
12
+
13
+
14
def test_create_datasource_unauthenticated(unauthenticated, hf_dataset):
    """Creating a datasource with the `unauthenticated` fixture active must raise an invalid API key error."""
    expect_invalid_key = pytest.raises(ValueError, match="Invalid API key")
    with expect_invalid_key:
        Datasource.from_hf_dataset("test_datasource", hf_dataset)
17
+
18
+
19
def test_create_datasource_already_exists_error(hf_dataset, datasource):
    """Re-creating an existing name with `if_exists="error"` must raise ValueError."""
    existing_name = "test_datasource"
    with pytest.raises(ValueError):
        Datasource.from_hf_dataset(existing_name, hf_dataset, if_exists="error")
22
+
23
+
24
def test_create_datasource_already_exists_return(hf_dataset, datasource):
    """Re-creating an existing name with `if_exists="open"` should return a handle to the existing datasource."""
    reopened = Datasource.from_hf_dataset("test_datasource", hf_dataset, if_exists="open")
    assert reopened is not None
    assert reopened.name == "test_datasource"
    expected_length = len(hf_dataset)
    assert reopened.length == expected_length
29
+
30
+
31
def test_open_datasource(datasource):
    """Opening a datasource by name should yield a handle with the same name and length."""
    reopened = Datasource.open(datasource.name)
    assert reopened is not None
    assert reopened.name == datasource.name
    assert reopened.length == len(datasource)
36
+
37
+
38
def test_open_datasource_unauthenticated(datasource, unauthenticated):
    """Opening a datasource with the `unauthenticated` fixture active must raise an invalid API key error."""
    expect_invalid_key = pytest.raises(ValueError, match="Invalid API key")
    with expect_invalid_key:
        Datasource.open("test_datasource")
41
+
42
+
43
def test_open_datasource_invalid_input():
    """An identifier containing spaces should be rejected with an "Invalid input" ValueError."""
    malformed_identifier = "not valid id"
    with pytest.raises(ValueError, match=r"Invalid input:.*"):
        Datasource.open(malformed_identifier)
46
+
47
+
48
def test_open_datasource_not_found():
    """Opening a freshly generated random UUID should raise LookupError."""
    unknown_id = str(uuid4())
    with pytest.raises(LookupError):
        Datasource.open(unknown_id)
51
+
52
+
53
def test_open_datasource_unauthorized(datasource, unauthorized):
    """With the `unauthorized` fixture active, opening the datasource by id should raise LookupError."""
    target_id = datasource.id
    with pytest.raises(LookupError):
        Datasource.open(target_id)
56
+
57
+
58
def test_all_datasources(datasource):
    """`Datasource.all()` should return a non-empty list that includes the `datasource` passed in."""
    datasources = Datasource.all()
    assert len(datasources) > 0
    # Use a distinct loop variable: reusing `datasource` would shadow the
    # argument and compare each item with itself, making the check always pass.
    assert any(listed.name == datasource.name for listed in datasources)
62
+
63
+
64
def test_all_datasources_unauthenticated(unauthenticated):
    """Listing datasources with the `unauthenticated` fixture active must raise an invalid API key error."""
    expect_invalid_key = pytest.raises(ValueError, match="Invalid API key")
    with expect_invalid_key:
        Datasource.all()
67
+
68
+
69
def test_drop_datasource(hf_dataset):
    """A datasource should exist after creation and no longer exist after being dropped."""
    Datasource.from_hf_dataset("datasource_to_delete", hf_dataset)
    created = Datasource.exists("datasource_to_delete")
    assert created
    Datasource.drop("datasource_to_delete")
    still_there = Datasource.exists("datasource_to_delete")
    assert not still_there
74
+
75
+
76
def test_drop_datasource_unauthenticated(datasource, unauthenticated):
    """Dropping a datasource with the `unauthenticated` fixture active must raise an invalid API key error."""
    target_id = datasource.id
    with pytest.raises(ValueError, match="Invalid API key"):
        Datasource.drop(target_id)
79
+
80
+
81
def test_drop_datasource_not_found():
    """Dropping an unknown id raises LookupError by default and is a no-op with `if_not_exists="ignore"`."""
    first_unknown_id = str(uuid4())
    with pytest.raises(LookupError):
        Datasource.drop(first_unknown_id)
    # ignores error if specified
    second_unknown_id = str(uuid4())
    Datasource.drop(second_unknown_id, if_not_exists="ignore")
86
+
87
+
88
def test_drop_datasource_unauthorized(datasource, unauthorized):
    """With the `unauthorized` fixture active, dropping the datasource by id should raise LookupError."""
    target_id = datasource.id
    with pytest.raises(LookupError):
        Datasource.drop(target_id)
91
+
92
+
93
def test_drop_datasource_invalid_input():
    """An identifier containing spaces should be rejected with an "Invalid input" ValueError."""
    malformed_identifier = "not valid id"
    with pytest.raises(ValueError, match=r"Invalid input:.*"):
        Datasource.drop(malformed_identifier)