orca-sdk 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. orca_sdk/__init__.py +10 -4
  2. orca_sdk/_shared/__init__.py +10 -0
  3. orca_sdk/_shared/metrics.py +393 -0
  4. orca_sdk/_shared/metrics_test.py +273 -0
  5. orca_sdk/_utils/analysis_ui.py +12 -10
  6. orca_sdk/_utils/analysis_ui_style.css +0 -3
  7. orca_sdk/_utils/auth.py +31 -29
  8. orca_sdk/_utils/data_parsing.py +28 -2
  9. orca_sdk/_utils/data_parsing_test.py +15 -15
  10. orca_sdk/_utils/pagination.py +126 -0
  11. orca_sdk/_utils/pagination_test.py +132 -0
  12. orca_sdk/_utils/prediction_result_ui.py +67 -21
  13. orca_sdk/_utils/tqdm_file_reader.py +12 -0
  14. orca_sdk/_utils/value_parser.py +45 -0
  15. orca_sdk/_utils/value_parser_test.py +39 -0
  16. orca_sdk/async_client.py +3795 -0
  17. orca_sdk/classification_model.py +601 -129
  18. orca_sdk/classification_model_test.py +415 -117
  19. orca_sdk/client.py +3787 -0
  20. orca_sdk/conftest.py +184 -38
  21. orca_sdk/credentials.py +162 -20
  22. orca_sdk/credentials_test.py +100 -16
  23. orca_sdk/datasource.py +268 -68
  24. orca_sdk/datasource_test.py +266 -18
  25. orca_sdk/embedding_model.py +434 -82
  26. orca_sdk/embedding_model_test.py +66 -33
  27. orca_sdk/job.py +343 -0
  28. orca_sdk/job_test.py +108 -0
  29. orca_sdk/memoryset.py +1690 -324
  30. orca_sdk/memoryset_test.py +456 -119
  31. orca_sdk/regression_model.py +694 -0
  32. orca_sdk/regression_model_test.py +378 -0
  33. orca_sdk/telemetry.py +460 -143
  34. orca_sdk/telemetry_test.py +43 -24
  35. {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.3.dist-info}/METADATA +34 -16
  36. orca_sdk-0.1.3.dist-info/RECORD +41 -0
  37. {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.3.dist-info}/WHEEL +1 -1
  38. orca_sdk/_generated_api_client/__init__.py +0 -3
  39. orca_sdk/_generated_api_client/api/__init__.py +0 -193
  40. orca_sdk/_generated_api_client/api/auth/__init__.py +0 -0
  41. orca_sdk/_generated_api_client/api/auth/check_authentication_auth_get.py +0 -128
  42. orca_sdk/_generated_api_client/api/auth/create_api_key_auth_api_key_post.py +0 -170
  43. orca_sdk/_generated_api_client/api/auth/delete_api_key_auth_api_key_name_or_id_delete.py +0 -156
  44. orca_sdk/_generated_api_client/api/auth/delete_org_auth_org_delete.py +0 -130
  45. orca_sdk/_generated_api_client/api/auth/list_api_keys_auth_api_key_get.py +0 -127
  46. orca_sdk/_generated_api_client/api/classification_model/__init__.py +0 -0
  47. orca_sdk/_generated_api_client/api/classification_model/create_evaluation_classification_model_model_name_or_id_evaluation_post.py +0 -183
  48. orca_sdk/_generated_api_client/api/classification_model/create_model_classification_model_post.py +0 -170
  49. orca_sdk/_generated_api_client/api/classification_model/delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py +0 -168
  50. orca_sdk/_generated_api_client/api/classification_model/delete_model_classification_model_name_or_id_delete.py +0 -154
  51. orca_sdk/_generated_api_client/api/classification_model/get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py +0 -170
  52. orca_sdk/_generated_api_client/api/classification_model/get_model_classification_model_name_or_id_get.py +0 -156
  53. orca_sdk/_generated_api_client/api/classification_model/list_evaluations_classification_model_model_name_or_id_evaluation_get.py +0 -161
  54. orca_sdk/_generated_api_client/api/classification_model/list_models_classification_model_get.py +0 -127
  55. orca_sdk/_generated_api_client/api/classification_model/predict_gpu_classification_model_name_or_id_prediction_post.py +0 -190
  56. orca_sdk/_generated_api_client/api/datasource/__init__.py +0 -0
  57. orca_sdk/_generated_api_client/api/datasource/create_datasource_datasource_post.py +0 -167
  58. orca_sdk/_generated_api_client/api/datasource/delete_datasource_datasource_name_or_id_delete.py +0 -156
  59. orca_sdk/_generated_api_client/api/datasource/get_datasource_datasource_name_or_id_get.py +0 -156
  60. orca_sdk/_generated_api_client/api/datasource/list_datasources_datasource_get.py +0 -127
  61. orca_sdk/_generated_api_client/api/default/__init__.py +0 -0
  62. orca_sdk/_generated_api_client/api/default/healthcheck_get.py +0 -118
  63. orca_sdk/_generated_api_client/api/default/healthcheck_gpu_get.py +0 -118
  64. orca_sdk/_generated_api_client/api/finetuned_embedding_model/__init__.py +0 -0
  65. orca_sdk/_generated_api_client/api/finetuned_embedding_model/create_finetuned_embedding_model_finetuned_embedding_model_post.py +0 -168
  66. orca_sdk/_generated_api_client/api/finetuned_embedding_model/delete_finetuned_embedding_model_finetuned_embedding_model_name_or_id_delete.py +0 -156
  67. orca_sdk/_generated_api_client/api/finetuned_embedding_model/embed_with_finetuned_model_gpu_finetuned_embedding_model_name_or_id_embedding_post.py +0 -189
  68. orca_sdk/_generated_api_client/api/finetuned_embedding_model/get_finetuned_embedding_model_finetuned_embedding_model_name_or_id_get.py +0 -156
  69. orca_sdk/_generated_api_client/api/finetuned_embedding_model/list_finetuned_embedding_models_finetuned_embedding_model_get.py +0 -127
  70. orca_sdk/_generated_api_client/api/memoryset/__init__.py +0 -0
  71. orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +0 -181
  72. orca_sdk/_generated_api_client/api/memoryset/create_analysis_memoryset_name_or_id_analysis_post.py +0 -183
  73. orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +0 -168
  74. orca_sdk/_generated_api_client/api/memoryset/delete_memories_memoryset_name_or_id_memories_delete_post.py +0 -181
  75. orca_sdk/_generated_api_client/api/memoryset/delete_memory_memoryset_name_or_id_memory_memory_id_delete.py +0 -167
  76. orca_sdk/_generated_api_client/api/memoryset/delete_memoryset_memoryset_name_or_id_delete.py +0 -156
  77. orca_sdk/_generated_api_client/api/memoryset/get_analysis_memoryset_name_or_id_analysis_analysis_task_id_get.py +0 -169
  78. orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +0 -188
  79. orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +0 -169
  80. orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +0 -156
  81. orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +0 -184
  82. orca_sdk/_generated_api_client/api/memoryset/list_analyses_memoryset_name_or_id_analysis_get.py +0 -260
  83. orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +0 -127
  84. orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +0 -193
  85. orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +0 -188
  86. orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +0 -191
  87. orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +0 -187
  88. orca_sdk/_generated_api_client/api/pretrained_embedding_model/__init__.py +0 -0
  89. orca_sdk/_generated_api_client/api/pretrained_embedding_model/embed_with_pretrained_model_gpu_pretrained_embedding_model_model_name_embedding_post.py +0 -188
  90. orca_sdk/_generated_api_client/api/pretrained_embedding_model/get_pretrained_embedding_model_pretrained_embedding_model_model_name_get.py +0 -157
  91. orca_sdk/_generated_api_client/api/pretrained_embedding_model/list_pretrained_embedding_models_pretrained_embedding_model_get.py +0 -127
  92. orca_sdk/_generated_api_client/api/task/__init__.py +0 -0
  93. orca_sdk/_generated_api_client/api/task/abort_task_task_task_id_abort_delete.py +0 -154
  94. orca_sdk/_generated_api_client/api/task/get_task_status_task_task_id_status_get.py +0 -156
  95. orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +0 -243
  96. orca_sdk/_generated_api_client/api/telemetry/__init__.py +0 -0
  97. orca_sdk/_generated_api_client/api/telemetry/drop_feedback_category_with_data_telemetry_feedback_category_name_or_id_delete.py +0 -162
  98. orca_sdk/_generated_api_client/api/telemetry/get_feedback_category_telemetry_feedback_category_name_or_id_get.py +0 -156
  99. orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +0 -157
  100. orca_sdk/_generated_api_client/api/telemetry/list_feedback_categories_telemetry_feedback_category_get.py +0 -127
  101. orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +0 -175
  102. orca_sdk/_generated_api_client/api/telemetry/record_prediction_feedback_telemetry_prediction_feedback_put.py +0 -171
  103. orca_sdk/_generated_api_client/api/telemetry/update_prediction_telemetry_prediction_prediction_id_patch.py +0 -181
  104. orca_sdk/_generated_api_client/client.py +0 -216
  105. orca_sdk/_generated_api_client/errors.py +0 -38
  106. orca_sdk/_generated_api_client/models/__init__.py +0 -159
  107. orca_sdk/_generated_api_client/models/analyze_neighbor_labels_result.py +0 -84
  108. orca_sdk/_generated_api_client/models/api_key_metadata.py +0 -118
  109. orca_sdk/_generated_api_client/models/base_model.py +0 -55
  110. orca_sdk/_generated_api_client/models/body_create_datasource_datasource_post.py +0 -176
  111. orca_sdk/_generated_api_client/models/classification_evaluation_result.py +0 -114
  112. orca_sdk/_generated_api_client/models/clone_labeled_memoryset_request.py +0 -150
  113. orca_sdk/_generated_api_client/models/column_info.py +0 -114
  114. orca_sdk/_generated_api_client/models/column_type.py +0 -14
  115. orca_sdk/_generated_api_client/models/conflict_error_response.py +0 -80
  116. orca_sdk/_generated_api_client/models/create_api_key_request.py +0 -99
  117. orca_sdk/_generated_api_client/models/create_api_key_response.py +0 -126
  118. orca_sdk/_generated_api_client/models/create_labeled_memoryset_request.py +0 -259
  119. orca_sdk/_generated_api_client/models/create_rac_model_request.py +0 -209
  120. orca_sdk/_generated_api_client/models/datasource_metadata.py +0 -142
  121. orca_sdk/_generated_api_client/models/delete_memories_request.py +0 -70
  122. orca_sdk/_generated_api_client/models/embed_request.py +0 -127
  123. orca_sdk/_generated_api_client/models/embedding_finetuning_method.py +0 -9
  124. orca_sdk/_generated_api_client/models/evaluation_request.py +0 -180
  125. orca_sdk/_generated_api_client/models/evaluation_response.py +0 -140
  126. orca_sdk/_generated_api_client/models/feedback_type.py +0 -9
  127. orca_sdk/_generated_api_client/models/field_validation_error.py +0 -103
  128. orca_sdk/_generated_api_client/models/filter_item.py +0 -231
  129. orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +0 -15
  130. orca_sdk/_generated_api_client/models/filter_item_field_type_2_item_type_1.py +0 -16
  131. orca_sdk/_generated_api_client/models/filter_item_op.py +0 -16
  132. orca_sdk/_generated_api_client/models/find_duplicates_analysis_result.py +0 -70
  133. orca_sdk/_generated_api_client/models/finetune_embedding_model_request.py +0 -259
  134. orca_sdk/_generated_api_client/models/finetune_embedding_model_request_training_args.py +0 -66
  135. orca_sdk/_generated_api_client/models/finetuned_embedding_model_metadata.py +0 -166
  136. orca_sdk/_generated_api_client/models/get_memories_request.py +0 -70
  137. orca_sdk/_generated_api_client/models/internal_server_error_response.py +0 -80
  138. orca_sdk/_generated_api_client/models/label_class_metrics.py +0 -108
  139. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +0 -274
  140. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup_metadata.py +0 -68
  141. orca_sdk/_generated_api_client/models/label_prediction_result.py +0 -101
  142. orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +0 -232
  143. orca_sdk/_generated_api_client/models/labeled_memory.py +0 -197
  144. orca_sdk/_generated_api_client/models/labeled_memory_insert.py +0 -108
  145. orca_sdk/_generated_api_client/models/labeled_memory_insert_metadata.py +0 -68
  146. orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +0 -258
  147. orca_sdk/_generated_api_client/models/labeled_memory_lookup_metadata.py +0 -68
  148. orca_sdk/_generated_api_client/models/labeled_memory_metadata.py +0 -68
  149. orca_sdk/_generated_api_client/models/labeled_memory_metrics.py +0 -277
  150. orca_sdk/_generated_api_client/models/labeled_memory_update.py +0 -171
  151. orca_sdk/_generated_api_client/models/labeled_memory_update_metadata_type_0.py +0 -68
  152. orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +0 -195
  153. orca_sdk/_generated_api_client/models/list_analyses_memoryset_name_or_id_analysis_get_type_type_0.py +0 -9
  154. orca_sdk/_generated_api_client/models/list_memories_request.py +0 -104
  155. orca_sdk/_generated_api_client/models/list_predictions_request.py +0 -234
  156. orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_0.py +0 -9
  157. orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_1.py +0 -9
  158. orca_sdk/_generated_api_client/models/lookup_request.py +0 -81
  159. orca_sdk/_generated_api_client/models/memoryset_analysis_request.py +0 -83
  160. orca_sdk/_generated_api_client/models/memoryset_analysis_request_type.py +0 -9
  161. orca_sdk/_generated_api_client/models/memoryset_analysis_response.py +0 -180
  162. orca_sdk/_generated_api_client/models/memoryset_analysis_response_config.py +0 -66
  163. orca_sdk/_generated_api_client/models/memoryset_analysis_response_type.py +0 -9
  164. orca_sdk/_generated_api_client/models/not_found_error_response.py +0 -100
  165. orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +0 -20
  166. orca_sdk/_generated_api_client/models/prediction_feedback.py +0 -157
  167. orca_sdk/_generated_api_client/models/prediction_feedback_category.py +0 -115
  168. orca_sdk/_generated_api_client/models/prediction_feedback_request.py +0 -122
  169. orca_sdk/_generated_api_client/models/prediction_feedback_result.py +0 -102
  170. orca_sdk/_generated_api_client/models/prediction_request.py +0 -169
  171. orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +0 -97
  172. orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +0 -11
  173. orca_sdk/_generated_api_client/models/rac_head_type.py +0 -11
  174. orca_sdk/_generated_api_client/models/rac_model_metadata.py +0 -191
  175. orca_sdk/_generated_api_client/models/service_unavailable_error_response.py +0 -80
  176. orca_sdk/_generated_api_client/models/task.py +0 -198
  177. orca_sdk/_generated_api_client/models/task_status.py +0 -14
  178. orca_sdk/_generated_api_client/models/task_status_info.py +0 -133
  179. orca_sdk/_generated_api_client/models/unauthenticated_error_response.py +0 -72
  180. orca_sdk/_generated_api_client/models/unauthorized_error_response.py +0 -80
  181. orca_sdk/_generated_api_client/models/unprocessable_input_error_response.py +0 -94
  182. orca_sdk/_generated_api_client/models/update_prediction_request.py +0 -93
  183. orca_sdk/_generated_api_client/py.typed +0 -1
  184. orca_sdk/_generated_api_client/types.py +0 -56
  185. orca_sdk/_utils/task.py +0 -73
  186. orca_sdk-0.1.1.dist-info/RECORD +0 -175
orca_sdk/datasource.py CHANGED
@@ -1,30 +1,91 @@
 from __future__ import annotations

 import logging
+import os
 import tempfile
+import zipfile
 from datetime import datetime
+from io import BytesIO
 from os import PathLike
 from pathlib import Path
-from typing import cast
+from typing import Literal, Union, cast

 import pandas as pd
 import pyarrow as pa
-from datasets import Dataset
+from datasets import Dataset, DatasetDict
+from httpx._types import FileTypes  # type: ignore
+from pyarrow import parquet
 from torch.utils.data import DataLoader as TorchDataLoader
 from torch.utils.data import Dataset as TorchDataset
+from tqdm.auto import tqdm

-from ._generated_api_client.api import (
-    delete_datasource,
-    get_datasource,
-    list_datasources,
-)
-from ._generated_api_client.api.datasource.create_datasource_datasource_post import (
-    _parse_response as parse_create_response,
-)
-from ._generated_api_client.client import get_client
-from ._generated_api_client.models import ColumnType, DatasourceMetadata
 from ._utils.common import CreateMode, DropMode
-from ._utils.data_parsing import hf_dataset_from_disk, hf_dataset_from_torch
+from ._utils.data_parsing import hf_dataset_from_torch
+from ._utils.tqdm_file_reader import TqdmFileReader
+from .client import DatasourceMetadata, OrcaClient
+
+
+def _upload_files_to_datasource(
+    name: str,
+    file_paths: list[Path],
+    description: str | None = None,
+) -> DatasourceMetadata:
+    """
+    Helper function to upload files to create a datasource using manual HTTP requests.
+
+    This bypasses the generated client because it doesn't handle file uploads properly.
+
+    Params:
+        name: Name for the datasource
+        file_paths: List of file paths to upload
+        description: Optional description for the datasource
+
+    Returns:
+        Metadata for the created datasource
+    """
+    files: list[tuple[Literal["files"], FileTypes]] = []
+
+    # Calculate total size for all files
+    total_size = sum(file_path.stat().st_size for file_path in file_paths)
+
+    with tqdm(total=total_size, unit="B", unit_scale=True, desc="Uploading") as pbar:
+        for file_path in file_paths:
+            buffered_reader = open(file_path, "rb")
+            tqdm_reader = TqdmFileReader(buffered_reader, pbar)
+            files.append(("files", (file_path.name, cast(bytes, tqdm_reader))))
+
+        # Use manual HTTP request for file uploads
+        client = OrcaClient._resolve_client()
+        metadata = client.POST(
+            "/datasource/upload",
+            files=files,
+            data={"name": name, "description": description},
+        )
+
+    return metadata
+
+
+def _handle_existing_datasource(name: str, if_exists: CreateMode) -> Union["Datasource", None]:
+    """
+    Helper function to handle the common pattern of checking if a datasource exists
+    and taking action based on the if_exists parameter.
+
+    Params:
+        name: Name of the datasource to check
+        if_exists: What to do if a datasource with the same name already exists
+
+    Returns:
+        Datasource instance if opening existing, None if should proceed with creation
+
+    Raises:
+        ValueError: If the datasource already exists and if_exists is "error"
+    """
+    if Datasource.exists(name):
+        if if_exists == "error":
+            raise ValueError(f"Dataset with name {name} already exists")
+        elif if_exists == "open":
+            return Datasource.open(name)
+    return None


 class Datasource:
@@ -37,6 +98,7 @@ class Datasource:
     Attributes:
         id: Unique identifier for the datasource
        name: Unique name of the datasource
+        description: Optional description of the datasource
        length: Number of rows in the datasource
        created_at: When the datasource was created
        columns: Dictionary of column names and types
@@ -44,6 +106,7 @@ class Datasource:

     id: str
     name: str
+    description: str | None
     length: int
     created_at: datetime
     updated_at: datetime
@@ -51,20 +114,19 @@

     def __init__(self, metadata: DatasourceMetadata):
         # for internal use only, do not document
-        self.id = metadata.id
-        self.name = metadata.name
-        self.length = metadata.length
-        self.created_at = metadata.created_at
-        self.updated_at = metadata.updated_at
+        self.id = metadata["id"]
+        self.name = metadata["name"]
+        self.length = metadata["length"]
+        self.created_at = datetime.fromisoformat(metadata["created_at"])
+        self.updated_at = datetime.fromisoformat(metadata["updated_at"])
+        self.description = metadata["description"]
         self.columns = {
-            column.name: (
-                f"enum({', '.join(f'{option!r}' for option in column.enum_options) if column.enum_options else ''}"
-                if column.type == ColumnType.ENUM
-                else "str"
-                if column.type == ColumnType.STRING
-                else column.type.value.lower()
+            column["name"]: (
+                f"enum({', '.join(f'{option!r}' for option in column['enum_options'] or []) if 'enum_options' in column else ''})"
+                if column["type"] == "ENUM"
+                else "str" if column["type"] == "STRING" else column["type"].lower()
             )
-            for column in metadata.columns
+            for column in metadata["columns"]
         }

     def __eq__(self, other) -> bool:
@@ -82,7 +144,9 @@ class Datasource:
         )

     @classmethod
-    def from_hf_dataset(cls, name: str, dataset: Dataset, if_exists: CreateMode = "error") -> Datasource:
+    def from_hf_dataset(
+        cls, name: str, dataset: Dataset, if_exists: CreateMode = "error", description: str | None = None
+    ) -> Datasource:
         """
         Create a new datasource from a Hugging Face Dataset

@@ -91,6 +155,7 @@
             dataset: The Hugging Face Dataset to create the datasource from
             if_exists: What to do if a datasource with the same name already exists, defaults to
                 `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource

         Returns:
             A handle to the new datasource in the OrcaCloud
@@ -98,32 +163,54 @@
         Raises:
             ValueError: If the datasource already exists and if_exists is `"error"`
         """
-        client = get_client()
-
-        if cls.exists(name):
-            if if_exists == "error":
-                raise ValueError(f"Dataset with name {name} already exists")
-            elif if_exists == "open":
-                return cls.open(name)
+        # Check if datasource already exists and handle accordingly
+        existing = _handle_existing_datasource(name, if_exists)
+        if existing is not None:
+            return existing

         with tempfile.TemporaryDirectory() as tmp_dir:
             dataset.save_to_disk(tmp_dir)
-            files = []
-            for file_path in Path(tmp_dir).iterdir():
-                buffered_reader = open(file_path, "rb")
-                files.append(("files", buffered_reader))
-
-            # Do not use Generated client for this endpoint b/c it does not handle files properly
-            metadata = parse_create_response(
-                response=client.get_httpx_client().request(
-                    method="post",
-                    url="/datasource/",
-                    files=files,
-                    data={"name": name},
-                )
-            )
+
+            # Get all file paths in the directory
+            file_paths = list(Path(tmp_dir).iterdir())
+
+            # Use the helper function to upload files
+            metadata = _upload_files_to_datasource(name, file_paths, description)
         return cls(metadata=metadata)

+    @classmethod
+    def from_hf_dataset_dict(
+        cls,
+        name: str,
+        dataset_dict: DatasetDict,
+        if_exists: CreateMode = "error",
+        description: dict[str, str | None] | str | None = None,
+    ) -> dict[str, Datasource]:
+        """
+        Create datasources from a Hugging Face DatasetDict
+
+        Params:
+            name: Name prefix for the new datasources, will be suffixed with the dataset name
+            dataset_dict: The Hugging Face DatasetDict to create the datasources from
+            if_exists: What to do if a datasource with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasources, can be a string or a dictionary of dataset names to descriptions
+
+        Returns:
+            A dictionary of datasource handles, keyed by the dataset name
+
+        Raises:
+            ValueError: If a datasource already exists and if_exists is `"error"`
+        """
+        if description is None or isinstance(description, str):
+            description = {dataset_name: description for dataset_name in dataset_dict.keys()}
+        return {
+            dataset_name: cls.from_hf_dataset(
+                f"{name}_{dataset_name}", dataset, if_exists=if_exists, description=description[dataset_name]
+            )
+            for dataset_name, dataset in dataset_dict.items()
+        }
+
     @classmethod
     def from_pytorch(
         cls,
@@ -131,6 +218,7 @@ class Datasource:
         torch_data: TorchDataLoader | TorchDataset,
         column_names: list[str] | None = None,
         if_exists: CreateMode = "error",
+        description: str | None = None,
     ) -> Datasource:
         """
         Create a new datasource from a PyTorch DataLoader or Dataset
@@ -142,6 +230,7 @@
                 argument must be provided to specify the names of the columns.
             if_exists: What to do if a datasource with the same name already exists, defaults to
                 `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource

         Returns:
             A handle to the new datasource in the OrcaCloud
@@ -150,10 +239,12 @@
             ValueError: If the datasource already exists and if_exists is `"error"`
         """
         hf_dataset = hf_dataset_from_torch(torch_data, column_names=column_names)
-        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)
+        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists, description=description)

     @classmethod
-    def from_list(cls, name: str, data: list[dict], if_exists: CreateMode = "error") -> Datasource:
+    def from_list(
+        cls, name: str, data: list[dict], if_exists: CreateMode = "error", description: str | None = None
+    ) -> Datasource:
         """
         Create a new datasource from a list of dictionaries

@@ -162,6 +253,7 @@
             data: The list of dictionaries to create the datasource from
             if_exists: What to do if a datasource with the same name already exists, defaults to
                 `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource

         Returns:
             A handle to the new datasource in the OrcaCloud
@@ -172,11 +264,22 @@
         Examples:
             >>> Datasource.from_list("my_datasource", [{"text": "Hello, world!", "label": 1}, {"text": "Goodbye", "label": 0}])
         """
-        hf_dataset = Dataset.from_list(data)
-        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)
+        # Check if datasource already exists and handle accordingly
+        existing = _handle_existing_datasource(name, if_exists)
+        if existing is not None:
+            return existing
+
+        client = OrcaClient._resolve_client()
+        metadata = client.POST(
+            "/datasource",
+            json={"name": name, "description": description, "content": data},
+        )
+        return cls(metadata=metadata)

     @classmethod
-    def from_dict(cls, name: str, data: dict, if_exists: CreateMode = "error") -> Datasource:
+    def from_dict(
+        cls, name: str, data: dict, if_exists: CreateMode = "error", description: str | None = None
+    ) -> Datasource:
         """
         Create a new datasource from a dictionary of columns

@@ -185,6 +288,7 @@
             data: The dictionary of columns to create the datasource from
             if_exists: What to do if a datasource with the same name already exists, defaults to
                 `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource

         Returns:
             A handle to the new datasource in the OrcaCloud
@@ -195,11 +299,22 @@
         Examples:
             >>> Datasource.from_dict("my_datasource", {"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
         """
-        hf_dataset = Dataset.from_dict(data)
-        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)
+        # Check if datasource already exists and handle accordingly
+        existing = _handle_existing_datasource(name, if_exists)
+        if existing is not None:
+            return existing
+
+        client = OrcaClient._resolve_client()
+        metadata = client.POST(
+            "/datasource",
+            json={"name": name, "description": description, "content": data},
+        )
+        return cls(metadata=metadata)

     @classmethod
-    def from_pandas(cls, name: str, dataframe: pd.DataFrame, if_exists: CreateMode = "error") -> Datasource:
+    def from_pandas(
+        cls, name: str, dataframe: pd.DataFrame, if_exists: CreateMode = "error", description: str | None = None
+    ) -> Datasource:
         """
         Create a new datasource from a pandas DataFrame

@@ -208,6 +323,7 @@
             dataframe: The pandas DataFrame to create the datasource from
             if_exists: What to do if a datasource with the same name already exists, defaults to
                 `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource

         Returns:
             A handle to the new datasource in the OrcaCloud
@@ -215,11 +331,13 @@
         Raises:
             ValueError: If the datasource already exists and if_exists is `"error"`
         """
-        hf_dataset = Dataset.from_pandas(dataframe)
-        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)
+        dataset = Dataset.from_pandas(dataframe)
+        return cls.from_hf_dataset(name, dataset, if_exists=if_exists, description=description)

     @classmethod
-    def from_arrow(cls, name: str, pyarrow_table: pa.Table, if_exists: CreateMode = "error") -> Datasource:
+    def from_arrow(
+        cls, name: str, pyarrow_table: pa.Table, if_exists: CreateMode = "error", description: str | None = None
+    ) -> Datasource:
         """
         Create a new datasource from a pyarrow Table

@@ -228,6 +346,7 @@
             pyarrow_table: The pyarrow Table to create the datasource from
             if_exists: What to do if a datasource with the same name already exists, defaults to
                 `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource

         Returns:
             A handle to the new datasource in the OrcaCloud
@@ -235,11 +354,29 @@
         Raises:
             ValueError: If the datasource already exists and if_exists is `"error"`
         """
-        hf_dataset = Dataset(pyarrow_table)
-        return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)
+        # Check if datasource already exists and handle accordingly
+        existing = _handle_existing_datasource(name, if_exists)
+        if existing is not None:
+            return existing
+
+        # Write to bytes buffer
+        buffer = BytesIO()
+        parquet.write_table(pyarrow_table, buffer)
+        parquet_bytes = buffer.getvalue()
+
+        client = OrcaClient._resolve_client()
+        metadata = client.POST(
+            "/datasource/upload",
+            files=[("files", ("data.parquet", parquet_bytes))],
+            data={"name": name, "description": description},
+        )
+
+        return cls(metadata=metadata)

     @classmethod
-    def from_disk(cls, name: str, file_path: str | PathLike, if_exists: CreateMode = "error") -> Datasource:
+    def from_disk(
+        cls, name: str, file_path: str | PathLike, if_exists: CreateMode = "error", description: str | None = None
+    ) -> Datasource:
         """
         Create a new datasource from a local file

@@ -256,6 +393,7 @@

             if_exists: What to do if a datasource with the same name already exists, defaults to
                 `"error"`. Other option is `"open"` to open the existing datasource.
+            description: Optional description for the datasource

         Returns:
             A handle to the new datasource in the OrcaCloud
@@ -263,16 +401,31 @@
         Raises:
             ValueError: If the datasource already exists and if_exists is `"error"`
         """
-        hf_dataset = hf_dataset_from_disk(file_path)
-        return cls.from_hf_dataset(name, cast(Dataset, hf_dataset), if_exists=if_exists)
+        # Check if datasource already exists and handle accordingly
+        existing = _handle_existing_datasource(name, if_exists)
+        if existing is not None:
+            return existing
+
+        file_path = Path(file_path)
+
+        # For dataset directories, use the upload endpoint with multiple files
+        if file_path.is_dir():
+            return cls.from_hf_dataset(
+                name, Dataset.load_from_disk(file_path), if_exists=if_exists, description=description
+            )
+
+        # For single files, use the helper function to upload files
+        metadata = _upload_files_to_datasource(name, [file_path], description)
+
+        return cls(metadata=metadata)

     @classmethod
-    def open(cls, name: str) -> Datasource:
+    def open(cls, name_or_id: str) -> Datasource:
         """
         Get a handle to a datasource by name or id in the OrcaCloud

         Params:
-            name: The name or unique identifier of the datasource to get
+            name_or_id: The name or unique identifier of the datasource to get

         Returns:
             A handle to the existing datasource in the OrcaCloud
@@ -280,7 +433,8 @@
         Raises:
             LookupError: If the datasource does not exist
         """
-        return cls(get_datasource(name))
+        client = OrcaClient._resolve_client()
+        return cls(client.GET("/datasource/{name_or_id}", params={"name_or_id": name_or_id}))

     @classmethod
     def exists(cls, name_or_id: str) -> bool:
@@ -307,7 +461,8 @@
         Returns:
             A list of all datasource handles in the OrcaCloud
         """
-        return [cls(metadata) for metadata in list_datasources()]
+        client = OrcaClient._resolve_client()
+        return [cls(metadata) for metadata in client.GET("/datasource")]

     @classmethod
     def drop(cls, name_or_id: str, if_not_exists: DropMode = "error") -> None:
@@ -323,7 +478,8 @@
             LookupError: If the datasource does not exist and if_not_exists is `"error"`
         """
         try:
-            delete_datasource(name_or_id)
+            client = OrcaClient._resolve_client()
+            client.DELETE("/datasource/{name_or_id}", params={"name_or_id": name_or_id})
             logging.info(f"Deleted datasource {name_or_id}")
         except LookupError:
             if if_not_exists == "error":
@@ -331,3 +487,47 @@

     def __len__(self) -> int:
         return self.length
+
+    def download(
+        self, output_dir: str | PathLike, file_type: Literal["hf_dataset", "json", "csv"] = "hf_dataset"
+    ) -> None:
+        """
+        Download the datasource to a specified path in the specified format type
+
+        Params:
+            output_dir: The local directory where the downloaded file will be saved.
+            file_type: The type of file to download.
+
+        Returns:
+            None
+        """
+        extension = "zip" if file_type == "hf_dataset" else file_type
+        output_path = Path(output_dir) / f"{self.name}.{extension}"
+        with open(output_path, "wb") as download_file:
+            client = OrcaClient._resolve_client()
+            with client.stream("GET", f"/datasource/{self.id}/download", params={"file_type": file_type}) as response:
+                total_chunks = int(response.headers["X-Total-Chunks"]) if "X-Total-Chunks" in response.headers else None
+                with tqdm(desc="Downloading", total=total_chunks, disable=total_chunks is None) as progress:
+                    for chunk in response.iter_bytes():
+                        download_file.write(chunk)
+                        progress.update(1)
+
+        # extract the zip file
+        if extension == "zip":
+            extract_dir = Path(output_dir) / self.name
+            with zipfile.ZipFile(output_path, "r") as zip_ref:
+                zip_ref.extractall(extract_dir)
+            output_path.unlink()  # Remove the zip file after extraction
+            logging.info(f"Downloaded {extract_dir}")
+        else:
+            logging.info(f"Downloaded {output_path}")
+
+    def to_list(self) -> list[dict]:
+        """
+        Convert the datasource to a list of dictionaries.
+
+        Returns:
+            A list of dictionaries representation of the datasource.
+        """
+        client = OrcaClient._resolve_client()
+        return client.GET("/datasource/{name_or_id}/download", params={"name_or_id": self.id, "file_type": "json"})
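For orientation, the sketch below shows how the reworked Datasource API in 0.1.3 might be exercised end to end. It is a hypothetical usage example assembled only from methods visible in this diff; it assumes `Datasource` is importable from the top-level `orca_sdk` package and that OrcaCloud credentials have already been configured, neither of which is shown in this file.

    from pathlib import Path

    from orca_sdk import Datasource  # assumed top-level re-export; see the orca_sdk/__init__.py changes

    # Every constructor now accepts an optional description (new in 0.1.3)
    ds = Datasource.from_list(
        "my_datasource",
        [{"text": "Hello, world!", "label": 1}, {"text": "Goodbye", "label": 0}],
        if_exists="open",  # reuse the existing datasource instead of raising
        description="toy sentiment examples",
    )

    print(len(ds), ds.columns)  # row count and {column name: type} mapping

    # New download/export helpers added in this release
    Path("exports").mkdir(exist_ok=True)      # download() expects the output directory to exist
    ds.download("exports", file_type="json")  # writes exports/my_datasource.json
    rows = ds.to_list()                       # fetches the rows as a list of dictionaries

    # Handles can be reopened by name or id and dropped when no longer needed
    reopened = Datasource.open("my_datasource")
    Datasource.drop("my_datasource")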