orca-sdk 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. orca_sdk/__init__.py +10 -4
  2. orca_sdk/_shared/__init__.py +10 -0
  3. orca_sdk/_shared/metrics.py +393 -0
  4. orca_sdk/_shared/metrics_test.py +273 -0
  5. orca_sdk/_utils/analysis_ui.py +12 -10
  6. orca_sdk/_utils/analysis_ui_style.css +0 -3
  7. orca_sdk/_utils/auth.py +27 -29
  8. orca_sdk/_utils/data_parsing.py +28 -2
  9. orca_sdk/_utils/data_parsing_test.py +15 -15
  10. orca_sdk/_utils/pagination.py +126 -0
  11. orca_sdk/_utils/pagination_test.py +132 -0
  12. orca_sdk/_utils/prediction_result_ui.py +67 -21
  13. orca_sdk/_utils/tqdm_file_reader.py +12 -0
  14. orca_sdk/_utils/value_parser.py +45 -0
  15. orca_sdk/_utils/value_parser_test.py +39 -0
  16. orca_sdk/classification_model.py +439 -129
  17. orca_sdk/classification_model_test.py +334 -104
  18. orca_sdk/client.py +3747 -0
  19. orca_sdk/conftest.py +164 -19
  20. orca_sdk/credentials.py +120 -18
  21. orca_sdk/credentials_test.py +20 -0
  22. orca_sdk/datasource.py +259 -68
  23. orca_sdk/datasource_test.py +242 -0
  24. orca_sdk/embedding_model.py +425 -82
  25. orca_sdk/embedding_model_test.py +39 -13
  26. orca_sdk/job.py +337 -0
  27. orca_sdk/job_test.py +108 -0
  28. orca_sdk/memoryset.py +1341 -305
  29. orca_sdk/memoryset_test.py +350 -111
  30. orca_sdk/regression_model.py +684 -0
  31. orca_sdk/regression_model_test.py +369 -0
  32. orca_sdk/telemetry.py +449 -143
  33. orca_sdk/telemetry_test.py +43 -24
  34. {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.2.dist-info}/METADATA +34 -16
  35. orca_sdk-0.1.2.dist-info/RECORD +40 -0
  36. {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.2.dist-info}/WHEEL +1 -1
  37. orca_sdk/_generated_api_client/__init__.py +0 -3
  38. orca_sdk/_generated_api_client/api/__init__.py +0 -193
  39. orca_sdk/_generated_api_client/api/auth/__init__.py +0 -0
  40. orca_sdk/_generated_api_client/api/auth/check_authentication_auth_get.py +0 -128
  41. orca_sdk/_generated_api_client/api/auth/create_api_key_auth_api_key_post.py +0 -170
  42. orca_sdk/_generated_api_client/api/auth/delete_api_key_auth_api_key_name_or_id_delete.py +0 -156
  43. orca_sdk/_generated_api_client/api/auth/delete_org_auth_org_delete.py +0 -130
  44. orca_sdk/_generated_api_client/api/auth/list_api_keys_auth_api_key_get.py +0 -127
  45. orca_sdk/_generated_api_client/api/classification_model/__init__.py +0 -0
  46. orca_sdk/_generated_api_client/api/classification_model/create_evaluation_classification_model_model_name_or_id_evaluation_post.py +0 -183
  47. orca_sdk/_generated_api_client/api/classification_model/create_model_classification_model_post.py +0 -170
  48. orca_sdk/_generated_api_client/api/classification_model/delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py +0 -168
  49. orca_sdk/_generated_api_client/api/classification_model/delete_model_classification_model_name_or_id_delete.py +0 -154
  50. orca_sdk/_generated_api_client/api/classification_model/get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py +0 -170
  51. orca_sdk/_generated_api_client/api/classification_model/get_model_classification_model_name_or_id_get.py +0 -156
  52. orca_sdk/_generated_api_client/api/classification_model/list_evaluations_classification_model_model_name_or_id_evaluation_get.py +0 -161
  53. orca_sdk/_generated_api_client/api/classification_model/list_models_classification_model_get.py +0 -127
  54. orca_sdk/_generated_api_client/api/classification_model/predict_gpu_classification_model_name_or_id_prediction_post.py +0 -190
  55. orca_sdk/_generated_api_client/api/datasource/__init__.py +0 -0
  56. orca_sdk/_generated_api_client/api/datasource/create_datasource_datasource_post.py +0 -167
  57. orca_sdk/_generated_api_client/api/datasource/delete_datasource_datasource_name_or_id_delete.py +0 -156
  58. orca_sdk/_generated_api_client/api/datasource/get_datasource_datasource_name_or_id_get.py +0 -156
  59. orca_sdk/_generated_api_client/api/datasource/list_datasources_datasource_get.py +0 -127
  60. orca_sdk/_generated_api_client/api/default/__init__.py +0 -0
  61. orca_sdk/_generated_api_client/api/default/healthcheck_get.py +0 -118
  62. orca_sdk/_generated_api_client/api/default/healthcheck_gpu_get.py +0 -118
  63. orca_sdk/_generated_api_client/api/finetuned_embedding_model/__init__.py +0 -0
  64. orca_sdk/_generated_api_client/api/finetuned_embedding_model/create_finetuned_embedding_model_finetuned_embedding_model_post.py +0 -168
  65. orca_sdk/_generated_api_client/api/finetuned_embedding_model/delete_finetuned_embedding_model_finetuned_embedding_model_name_or_id_delete.py +0 -156
  66. orca_sdk/_generated_api_client/api/finetuned_embedding_model/embed_with_finetuned_model_gpu_finetuned_embedding_model_name_or_id_embedding_post.py +0 -189
  67. orca_sdk/_generated_api_client/api/finetuned_embedding_model/get_finetuned_embedding_model_finetuned_embedding_model_name_or_id_get.py +0 -156
  68. orca_sdk/_generated_api_client/api/finetuned_embedding_model/list_finetuned_embedding_models_finetuned_embedding_model_get.py +0 -127
  69. orca_sdk/_generated_api_client/api/memoryset/__init__.py +0 -0
  70. orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +0 -181
  71. orca_sdk/_generated_api_client/api/memoryset/create_analysis_memoryset_name_or_id_analysis_post.py +0 -183
  72. orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +0 -168
  73. orca_sdk/_generated_api_client/api/memoryset/delete_memories_memoryset_name_or_id_memories_delete_post.py +0 -181
  74. orca_sdk/_generated_api_client/api/memoryset/delete_memory_memoryset_name_or_id_memory_memory_id_delete.py +0 -167
  75. orca_sdk/_generated_api_client/api/memoryset/delete_memoryset_memoryset_name_or_id_delete.py +0 -156
  76. orca_sdk/_generated_api_client/api/memoryset/get_analysis_memoryset_name_or_id_analysis_analysis_task_id_get.py +0 -169
  77. orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +0 -188
  78. orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +0 -169
  79. orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +0 -156
  80. orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +0 -184
  81. orca_sdk/_generated_api_client/api/memoryset/list_analyses_memoryset_name_or_id_analysis_get.py +0 -260
  82. orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +0 -127
  83. orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +0 -193
  84. orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +0 -188
  85. orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +0 -191
  86. orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +0 -187
  87. orca_sdk/_generated_api_client/api/pretrained_embedding_model/__init__.py +0 -0
  88. orca_sdk/_generated_api_client/api/pretrained_embedding_model/embed_with_pretrained_model_gpu_pretrained_embedding_model_model_name_embedding_post.py +0 -188
  89. orca_sdk/_generated_api_client/api/pretrained_embedding_model/get_pretrained_embedding_model_pretrained_embedding_model_model_name_get.py +0 -157
  90. orca_sdk/_generated_api_client/api/pretrained_embedding_model/list_pretrained_embedding_models_pretrained_embedding_model_get.py +0 -127
  91. orca_sdk/_generated_api_client/api/task/__init__.py +0 -0
  92. orca_sdk/_generated_api_client/api/task/abort_task_task_task_id_abort_delete.py +0 -154
  93. orca_sdk/_generated_api_client/api/task/get_task_status_task_task_id_status_get.py +0 -156
  94. orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +0 -243
  95. orca_sdk/_generated_api_client/api/telemetry/__init__.py +0 -0
  96. orca_sdk/_generated_api_client/api/telemetry/drop_feedback_category_with_data_telemetry_feedback_category_name_or_id_delete.py +0 -162
  97. orca_sdk/_generated_api_client/api/telemetry/get_feedback_category_telemetry_feedback_category_name_or_id_get.py +0 -156
  98. orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +0 -157
  99. orca_sdk/_generated_api_client/api/telemetry/list_feedback_categories_telemetry_feedback_category_get.py +0 -127
  100. orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +0 -175
  101. orca_sdk/_generated_api_client/api/telemetry/record_prediction_feedback_telemetry_prediction_feedback_put.py +0 -171
  102. orca_sdk/_generated_api_client/api/telemetry/update_prediction_telemetry_prediction_prediction_id_patch.py +0 -181
  103. orca_sdk/_generated_api_client/client.py +0 -216
  104. orca_sdk/_generated_api_client/errors.py +0 -38
  105. orca_sdk/_generated_api_client/models/__init__.py +0 -159
  106. orca_sdk/_generated_api_client/models/analyze_neighbor_labels_result.py +0 -84
  107. orca_sdk/_generated_api_client/models/api_key_metadata.py +0 -118
  108. orca_sdk/_generated_api_client/models/base_model.py +0 -55
  109. orca_sdk/_generated_api_client/models/body_create_datasource_datasource_post.py +0 -176
  110. orca_sdk/_generated_api_client/models/classification_evaluation_result.py +0 -114
  111. orca_sdk/_generated_api_client/models/clone_labeled_memoryset_request.py +0 -150
  112. orca_sdk/_generated_api_client/models/column_info.py +0 -114
  113. orca_sdk/_generated_api_client/models/column_type.py +0 -14
  114. orca_sdk/_generated_api_client/models/conflict_error_response.py +0 -80
  115. orca_sdk/_generated_api_client/models/create_api_key_request.py +0 -99
  116. orca_sdk/_generated_api_client/models/create_api_key_response.py +0 -126
  117. orca_sdk/_generated_api_client/models/create_labeled_memoryset_request.py +0 -259
  118. orca_sdk/_generated_api_client/models/create_rac_model_request.py +0 -209
  119. orca_sdk/_generated_api_client/models/datasource_metadata.py +0 -142
  120. orca_sdk/_generated_api_client/models/delete_memories_request.py +0 -70
  121. orca_sdk/_generated_api_client/models/embed_request.py +0 -127
  122. orca_sdk/_generated_api_client/models/embedding_finetuning_method.py +0 -9
  123. orca_sdk/_generated_api_client/models/evaluation_request.py +0 -180
  124. orca_sdk/_generated_api_client/models/evaluation_response.py +0 -140
  125. orca_sdk/_generated_api_client/models/feedback_type.py +0 -9
  126. orca_sdk/_generated_api_client/models/field_validation_error.py +0 -103
  127. orca_sdk/_generated_api_client/models/filter_item.py +0 -231
  128. orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +0 -15
  129. orca_sdk/_generated_api_client/models/filter_item_field_type_2_item_type_1.py +0 -16
  130. orca_sdk/_generated_api_client/models/filter_item_op.py +0 -16
  131. orca_sdk/_generated_api_client/models/find_duplicates_analysis_result.py +0 -70
  132. orca_sdk/_generated_api_client/models/finetune_embedding_model_request.py +0 -259
  133. orca_sdk/_generated_api_client/models/finetune_embedding_model_request_training_args.py +0 -66
  134. orca_sdk/_generated_api_client/models/finetuned_embedding_model_metadata.py +0 -166
  135. orca_sdk/_generated_api_client/models/get_memories_request.py +0 -70
  136. orca_sdk/_generated_api_client/models/internal_server_error_response.py +0 -80
  137. orca_sdk/_generated_api_client/models/label_class_metrics.py +0 -108
  138. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +0 -274
  139. orca_sdk/_generated_api_client/models/label_prediction_memory_lookup_metadata.py +0 -68
  140. orca_sdk/_generated_api_client/models/label_prediction_result.py +0 -101
  141. orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +0 -232
  142. orca_sdk/_generated_api_client/models/labeled_memory.py +0 -197
  143. orca_sdk/_generated_api_client/models/labeled_memory_insert.py +0 -108
  144. orca_sdk/_generated_api_client/models/labeled_memory_insert_metadata.py +0 -68
  145. orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +0 -258
  146. orca_sdk/_generated_api_client/models/labeled_memory_lookup_metadata.py +0 -68
  147. orca_sdk/_generated_api_client/models/labeled_memory_metadata.py +0 -68
  148. orca_sdk/_generated_api_client/models/labeled_memory_metrics.py +0 -277
  149. orca_sdk/_generated_api_client/models/labeled_memory_update.py +0 -171
  150. orca_sdk/_generated_api_client/models/labeled_memory_update_metadata_type_0.py +0 -68
  151. orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +0 -195
  152. orca_sdk/_generated_api_client/models/list_analyses_memoryset_name_or_id_analysis_get_type_type_0.py +0 -9
  153. orca_sdk/_generated_api_client/models/list_memories_request.py +0 -104
  154. orca_sdk/_generated_api_client/models/list_predictions_request.py +0 -234
  155. orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_0.py +0 -9
  156. orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_1.py +0 -9
  157. orca_sdk/_generated_api_client/models/lookup_request.py +0 -81
  158. orca_sdk/_generated_api_client/models/memoryset_analysis_request.py +0 -83
  159. orca_sdk/_generated_api_client/models/memoryset_analysis_request_type.py +0 -9
  160. orca_sdk/_generated_api_client/models/memoryset_analysis_response.py +0 -180
  161. orca_sdk/_generated_api_client/models/memoryset_analysis_response_config.py +0 -66
  162. orca_sdk/_generated_api_client/models/memoryset_analysis_response_type.py +0 -9
  163. orca_sdk/_generated_api_client/models/not_found_error_response.py +0 -100
  164. orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +0 -20
  165. orca_sdk/_generated_api_client/models/prediction_feedback.py +0 -157
  166. orca_sdk/_generated_api_client/models/prediction_feedback_category.py +0 -115
  167. orca_sdk/_generated_api_client/models/prediction_feedback_request.py +0 -122
  168. orca_sdk/_generated_api_client/models/prediction_feedback_result.py +0 -102
  169. orca_sdk/_generated_api_client/models/prediction_request.py +0 -169
  170. orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +0 -97
  171. orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +0 -11
  172. orca_sdk/_generated_api_client/models/rac_head_type.py +0 -11
  173. orca_sdk/_generated_api_client/models/rac_model_metadata.py +0 -191
  174. orca_sdk/_generated_api_client/models/service_unavailable_error_response.py +0 -80
  175. orca_sdk/_generated_api_client/models/task.py +0 -198
  176. orca_sdk/_generated_api_client/models/task_status.py +0 -14
  177. orca_sdk/_generated_api_client/models/task_status_info.py +0 -133
  178. orca_sdk/_generated_api_client/models/unauthenticated_error_response.py +0 -72
  179. orca_sdk/_generated_api_client/models/unauthorized_error_response.py +0 -80
  180. orca_sdk/_generated_api_client/models/unprocessable_input_error_response.py +0 -94
  181. orca_sdk/_generated_api_client/models/update_prediction_request.py +0 -93
  182. orca_sdk/_generated_api_client/py.typed +0 -1
  183. orca_sdk/_generated_api_client/types.py +0 -56
  184. orca_sdk/_utils/task.py +0 -73
  185. orca_sdk-0.1.1.dist-info/RECORD +0 -175
orca_sdk/datasource.py CHANGED
@@ -1,30 +1,90 @@
  from __future__ import annotations

  import logging
+ import os
  import tempfile
+ import zipfile
  from datetime import datetime
+ from io import BytesIO
  from os import PathLike
  from pathlib import Path
- from typing import cast
+ from typing import Literal, Union, cast

  import pandas as pd
  import pyarrow as pa
- from datasets import Dataset
+ from datasets import Dataset, DatasetDict
+ from httpx._types import FileTypes  # type: ignore
+ from pyarrow import parquet
  from torch.utils.data import DataLoader as TorchDataLoader
  from torch.utils.data import Dataset as TorchDataset
+ from tqdm.auto import tqdm

- from ._generated_api_client.api import (
-     delete_datasource,
-     get_datasource,
-     list_datasources,
- )
- from ._generated_api_client.api.datasource.create_datasource_datasource_post import (
-     _parse_response as parse_create_response,
- )
- from ._generated_api_client.client import get_client
- from ._generated_api_client.models import ColumnType, DatasourceMetadata
  from ._utils.common import CreateMode, DropMode
- from ._utils.data_parsing import hf_dataset_from_disk, hf_dataset_from_torch
+ from ._utils.data_parsing import hf_dataset_from_torch
+ from ._utils.tqdm_file_reader import TqdmFileReader
+ from .client import DatasourceMetadata, orca_api
+
+
+ def _upload_files_to_datasource(
+     name: str,
+     file_paths: list[Path],
+     description: str | None = None,
+ ) -> DatasourceMetadata:
+     """
+     Helper function to upload files to create a datasource using manual HTTP requests.
+
+     This bypasses the generated client because it doesn't handle file uploads properly.
+
+     Params:
+         name: Name for the datasource
+         file_paths: List of file paths to upload
+         description: Optional description for the datasource
+
+     Returns:
+         Metadata for the created datasource
+     """
+     files: list[tuple[Literal["files"], FileTypes]] = []
+
+     # Calculate total size for all files
+     total_size = sum(file_path.stat().st_size for file_path in file_paths)
+
+     with tqdm(total=total_size, unit="B", unit_scale=True, desc="Uploading") as pbar:
+         for file_path in file_paths:
+             buffered_reader = open(file_path, "rb")
+             tqdm_reader = TqdmFileReader(buffered_reader, pbar)
+             files.append(("files", (file_path.name, cast(bytes, tqdm_reader))))
+
+         # Use manual HTTP request for file uploads
+         metadata = orca_api.POST(
+             "/datasource/upload",
+             files=files,
+             data={"name": name, "description": description},
+         )
+
+     return metadata
+
+
+ def _handle_existing_datasource(name: str, if_exists: CreateMode) -> Union["Datasource", None]:
+     """
+     Helper function to handle the common pattern of checking if a datasource exists
+     and taking action based on the if_exists parameter.
+
+     Params:
+         name: Name of the datasource to check
+         if_exists: What to do if a datasource with the same name already exists
+
+     Returns:
+         Datasource instance if opening existing, None if should proceed with creation
+
+     Raises:
+         ValueError: If the datasource already exists and if_exists is "error"
+     """
+     if Datasource.exists(name):
+         if if_exists == "error":
+             raise ValueError(f"Dataset with name {name} already exists")
+         elif if_exists == "open":
+             return Datasource.open(name)
+     return None


  class Datasource:
@@ -37,6 +97,7 @@ class Datasource:
      Attributes:
          id: Unique identifier for the datasource
          name: Unique name of the datasource
+         description: Optional description of the datasource
          length: Number of rows in the datasource
          created_at: When the datasource was created
          columns: Dictionary of column names and types
@@ -44,6 +105,7 @@ class Datasource:

      id: str
      name: str
+     description: str | None
      length: int
      created_at: datetime
      updated_at: datetime
@@ -51,20 +113,19 @@ class Datasource:

      def __init__(self, metadata: DatasourceMetadata):
          # for internal use only, do not document
-         self.id = metadata.id
-         self.name = metadata.name
-         self.length = metadata.length
-         self.created_at = metadata.created_at
-         self.updated_at = metadata.updated_at
+         self.id = metadata["id"]
+         self.name = metadata["name"]
+         self.length = metadata["length"]
+         self.created_at = datetime.fromisoformat(metadata["created_at"])
+         self.updated_at = datetime.fromisoformat(metadata["updated_at"])
+         self.description = metadata["description"]
          self.columns = {
-             column.name: (
-                 f"enum({', '.join(f'{option!r}' for option in column.enum_options) if column.enum_options else ''}"
-                 if column.type == ColumnType.ENUM
-                 else "str"
-                 if column.type == ColumnType.STRING
-                 else column.type.value.lower()
+             column["name"]: (
+                 f"enum({', '.join(f'{option!r}' for option in column['enum_options'] or []) if 'enum_options' in column else ''})"
+                 if column["type"] == "ENUM"
+                 else "str" if column["type"] == "STRING" else column["type"].lower()
              )
-             for column in metadata.columns
+             for column in metadata["columns"]
          }

      def __eq__(self, other) -> bool:
@@ -82,7 +143,9 @@ class Datasource:
          )

      @classmethod
-     def from_hf_dataset(cls, name: str, dataset: Dataset, if_exists: CreateMode = "error") -> Datasource:
+     def from_hf_dataset(
+         cls, name: str, dataset: Dataset, if_exists: CreateMode = "error", description: str | None = None
+     ) -> Datasource:
          """
          Create a new datasource from a Hugging Face Dataset

@@ -91,6 +154,7 @@ class Datasource:
              dataset: The Hugging Face Dataset to create the datasource from
              if_exists: What to do if a datasource with the same name already exists, defaults to
                  `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource

          Returns:
              A handle to the new datasource in the OrcaCloud
@@ -98,32 +162,54 @@ class Datasource:
          Raises:
              ValueError: If the datasource already exists and if_exists is `"error"`
          """
-         client = get_client()
-
-         if cls.exists(name):
-             if if_exists == "error":
-                 raise ValueError(f"Dataset with name {name} already exists")
-             elif if_exists == "open":
-                 return cls.open(name)
+         # Check if datasource already exists and handle accordingly
+         existing = _handle_existing_datasource(name, if_exists)
+         if existing is not None:
+             return existing

          with tempfile.TemporaryDirectory() as tmp_dir:
              dataset.save_to_disk(tmp_dir)
-             files = []
-             for file_path in Path(tmp_dir).iterdir():
-                 buffered_reader = open(file_path, "rb")
-                 files.append(("files", buffered_reader))
-
-             # Do not use Generated client for this endpoint b/c it does not handle files properly
-             metadata = parse_create_response(
-                 response=client.get_httpx_client().request(
-                     method="post",
-                     url="/datasource/",
-                     files=files,
-                     data={"name": name},
-                 )
-             )
+
+             # Get all file paths in the directory
+             file_paths = list(Path(tmp_dir).iterdir())
+
+             # Use the helper function to upload files
+             metadata = _upload_files_to_datasource(name, file_paths, description)
          return cls(metadata=metadata)

+     @classmethod
+     def from_hf_dataset_dict(
+         cls,
+         name: str,
+         dataset_dict: DatasetDict,
+         if_exists: CreateMode = "error",
+         description: dict[str, str | None] | str | None = None,
+     ) -> dict[str, Datasource]:
+         """
+         Create datasources from a Hugging Face DatasetDict
+
+         Params:
+             name: Name prefix for the new datasources, will be suffixed with the dataset name
+             dataset_dict: The Hugging Face DatasetDict to create the datasources from
+             if_exists: What to do if a datasource with the same name already exists, defaults to
+                 `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasources, can be a string or a dictionary of dataset names to descriptions
+
+         Returns:
+             A dictionary of datasource handles, keyed by the dataset name
+
+         Raises:
+             ValueError: If a datasource already exists and if_exists is `"error"`
+         """
+         if description is None or isinstance(description, str):
+             description = {dataset_name: description for dataset_name in dataset_dict.keys()}
+         return {
+             dataset_name: cls.from_hf_dataset(
+                 f"{name}_{dataset_name}", dataset, if_exists=if_exists, description=description[dataset_name]
+             )
+             for dataset_name, dataset in dataset_dict.items()
+         }
+
      @classmethod
      def from_pytorch(
          cls,
@@ -131,6 +217,7 @@ class Datasource:
          torch_data: TorchDataLoader | TorchDataset,
          column_names: list[str] | None = None,
          if_exists: CreateMode = "error",
+         description: str | None = None,
      ) -> Datasource:
          """
          Create a new datasource from a PyTorch DataLoader or Dataset
@@ -142,6 +229,7 @@ class Datasource:
                  argument must be provided to specify the names of the columns.
              if_exists: What to do if a datasource with the same name already exists, defaults to
                  `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource

          Returns:
              A handle to the new datasource in the OrcaCloud
@@ -150,10 +238,12 @@ class Datasource:
              ValueError: If the datasource already exists and if_exists is `"error"`
          """
          hf_dataset = hf_dataset_from_torch(torch_data, column_names=column_names)
-         return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)
+         return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists, description=description)

      @classmethod
-     def from_list(cls, name: str, data: list[dict], if_exists: CreateMode = "error") -> Datasource:
+     def from_list(
+         cls, name: str, data: list[dict], if_exists: CreateMode = "error", description: str | None = None
+     ) -> Datasource:
          """
          Create a new datasource from a list of dictionaries

@@ -162,6 +252,7 @@ class Datasource:
              data: The list of dictionaries to create the datasource from
              if_exists: What to do if a datasource with the same name already exists, defaults to
                  `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource

          Returns:
              A handle to the new datasource in the OrcaCloud
@@ -172,11 +263,21 @@ class Datasource:
          Examples:
              >>> Datasource.from_list("my_datasource", [{"text": "Hello, world!", "label": 1}, {"text": "Goodbye", "label": 0}])
          """
-         hf_dataset = Dataset.from_list(data)
-         return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)
+         # Check if datasource already exists and handle accordingly
+         existing = _handle_existing_datasource(name, if_exists)
+         if existing is not None:
+             return existing
+
+         metadata = orca_api.POST(
+             "/datasource",
+             json={"name": name, "description": description, "content": data},
+         )
+         return cls(metadata=metadata)

      @classmethod
-     def from_dict(cls, name: str, data: dict, if_exists: CreateMode = "error") -> Datasource:
+     def from_dict(
+         cls, name: str, data: dict, if_exists: CreateMode = "error", description: str | None = None
+     ) -> Datasource:
          """
          Create a new datasource from a dictionary of columns

@@ -185,6 +286,7 @@ class Datasource:
              data: The dictionary of columns to create the datasource from
              if_exists: What to do if a datasource with the same name already exists, defaults to
                  `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource

          Returns:
              A handle to the new datasource in the OrcaCloud
@@ -195,11 +297,21 @@ class Datasource:
          Examples:
              >>> Datasource.from_dict("my_datasource", {"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
          """
-         hf_dataset = Dataset.from_dict(data)
-         return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)
+         # Check if datasource already exists and handle accordingly
+         existing = _handle_existing_datasource(name, if_exists)
+         if existing is not None:
+             return existing
+
+         metadata = orca_api.POST(
+             "/datasource",
+             json={"name": name, "description": description, "content": data},
+         )
+         return cls(metadata=metadata)

      @classmethod
-     def from_pandas(cls, name: str, dataframe: pd.DataFrame, if_exists: CreateMode = "error") -> Datasource:
+     def from_pandas(
+         cls, name: str, dataframe: pd.DataFrame, if_exists: CreateMode = "error", description: str | None = None
+     ) -> Datasource:
          """
          Create a new datasource from a pandas DataFrame

@@ -208,6 +320,7 @@ class Datasource:
              dataframe: The pandas DataFrame to create the datasource from
              if_exists: What to do if a datasource with the same name already exists, defaults to
                  `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource

          Returns:
              A handle to the new datasource in the OrcaCloud
@@ -215,11 +328,13 @@ class Datasource:
          Raises:
              ValueError: If the datasource already exists and if_exists is `"error"`
          """
-         hf_dataset = Dataset.from_pandas(dataframe)
-         return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)
+         dataset = Dataset.from_pandas(dataframe)
+         return cls.from_hf_dataset(name, dataset, if_exists=if_exists, description=description)

      @classmethod
-     def from_arrow(cls, name: str, pyarrow_table: pa.Table, if_exists: CreateMode = "error") -> Datasource:
+     def from_arrow(
+         cls, name: str, pyarrow_table: pa.Table, if_exists: CreateMode = "error", description: str | None = None
+     ) -> Datasource:
          """
          Create a new datasource from a pyarrow Table

@@ -228,6 +343,7 @@ class Datasource:
              pyarrow_table: The pyarrow Table to create the datasource from
              if_exists: What to do if a datasource with the same name already exists, defaults to
                  `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource

          Returns:
              A handle to the new datasource in the OrcaCloud
@@ -235,11 +351,28 @@ class Datasource:
          Raises:
              ValueError: If the datasource already exists and if_exists is `"error"`
          """
-         hf_dataset = Dataset(pyarrow_table)
-         return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)
+         # Check if datasource already exists and handle accordingly
+         existing = _handle_existing_datasource(name, if_exists)
+         if existing is not None:
+             return existing
+
+         # Write to bytes buffer
+         buffer = BytesIO()
+         parquet.write_table(pyarrow_table, buffer)
+         parquet_bytes = buffer.getvalue()
+
+         metadata = orca_api.POST(
+             "/datasource/upload",
+             files=[("files", ("data.parquet", parquet_bytes))],
+             data={"name": name, "description": description},
+         )
+
+         return cls(metadata=metadata)

      @classmethod
-     def from_disk(cls, name: str, file_path: str | PathLike, if_exists: CreateMode = "error") -> Datasource:
+     def from_disk(
+         cls, name: str, file_path: str | PathLike, if_exists: CreateMode = "error", description: str | None = None
+     ) -> Datasource:
          """
          Create a new datasource from a local file

@@ -256,6 +389,7 @@ class Datasource:

              if_exists: What to do if a datasource with the same name already exists, defaults to
                  `"error"`. Other option is `"open"` to open the existing datasource.
+             description: Optional description for the datasource

          Returns:
              A handle to the new datasource in the OrcaCloud
@@ -263,16 +397,31 @@ class Datasource:
          Raises:
              ValueError: If the datasource already exists and if_exists is `"error"`
          """
-         hf_dataset = hf_dataset_from_disk(file_path)
-         return cls.from_hf_dataset(name, cast(Dataset, hf_dataset), if_exists=if_exists)
+         # Check if datasource already exists and handle accordingly
+         existing = _handle_existing_datasource(name, if_exists)
+         if existing is not None:
+             return existing
+
+         file_path = Path(file_path)
+
+         # For dataset directories, use the upload endpoint with multiple files
+         if file_path.is_dir():
+             return cls.from_hf_dataset(
+                 name, Dataset.load_from_disk(file_path), if_exists=if_exists, description=description
+             )
+
+         # For single files, use the helper function to upload files
+         metadata = _upload_files_to_datasource(name, [file_path], description)
+
+         return cls(metadata=metadata)

      @classmethod
-     def open(cls, name: str) -> Datasource:
+     def open(cls, name_or_id: str) -> Datasource:
          """
          Get a handle to a datasource by name or id in the OrcaCloud

          Params:
-             name: The name or unique identifier of the datasource to get
+             name_or_id: The name or unique identifier of the datasource to get

          Returns:
              A handle to the existing datasource in the OrcaCloud
@@ -280,7 +429,7 @@ class Datasource:
          Raises:
              LookupError: If the datasource does not exist
          """
-         return cls(get_datasource(name))
+         return cls(orca_api.GET("/datasource/{name_or_id}", params={"name_or_id": name_or_id}))

      @classmethod
      def exists(cls, name_or_id: str) -> bool:
@@ -307,7 +456,7 @@ class Datasource:
          Returns:
              A list of all datasource handles in the OrcaCloud
          """
-         return [cls(metadata) for metadata in list_datasources()]
+         return [cls(metadata) for metadata in orca_api.GET("/datasource")]

      @classmethod
      def drop(cls, name_or_id: str, if_not_exists: DropMode = "error") -> None:
@@ -323,7 +472,7 @@ class Datasource:
              LookupError: If the datasource does not exist and if_not_exists is `"error"`
          """
          try:
-             delete_datasource(name_or_id)
+             orca_api.DELETE("/datasource/{name_or_id}", params={"name_or_id": name_or_id})
              logging.info(f"Deleted datasource {name_or_id}")
          except LookupError:
              if if_not_exists == "error":
@@ -331,3 +480,45 @@ class Datasource:

      def __len__(self) -> int:
          return self.length
+
+     def download(
+         self, output_dir: str | PathLike, file_type: Literal["hf_dataset", "json", "csv"] = "hf_dataset"
+     ) -> None:
+         """
+         Download the datasource to a specified path in the specified format type
+
+         Params:
+             output_dir: The local directory where the downloaded file will be saved.
+             file_type: The type of file to download.
+
+         Returns:
+             None
+         """
+         extension = "zip" if file_type == "hf_dataset" else file_type
+         output_path = Path(output_dir) / f"{self.name}.{extension}"
+         with open(output_path, "wb") as download_file:
+             with orca_api.stream("GET", f"/datasource/{self.id}/download", params={"file_type": file_type}) as response:
+                 total_chunks = int(response.headers["X-Total-Chunks"]) if "X-Total-Chunks" in response.headers else None
+                 with tqdm(desc="Downloading", total=total_chunks, disable=total_chunks is None) as progress:
+                     for chunk in response.iter_bytes():
+                         download_file.write(chunk)
+                         progress.update(1)
+
+         # extract the zip file
+         if extension == "zip":
+             extract_dir = Path(output_dir) / self.name
+             with zipfile.ZipFile(output_path, "r") as zip_ref:
+                 zip_ref.extractall(extract_dir)
+             output_path.unlink()  # Remove the zip file after extraction
+             logging.info(f"Downloaded {extract_dir}")
+         else:
+             logging.info(f"Downloaded {output_path}")
+
+     def to_list(self) -> list[dict]:
+         """
+         Convert the datasource to a list of dictionaries.
+
+         Returns:
+             A list of dictionaries representation of the datasource.
+         """
+         return orca_api.GET("/datasource/{name_or_id}/download", params={"name_or_id": self.id, "file_type": "json"})
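For orientation, a minimal usage sketch of the reworked Datasource surface shown in this diff (the description argument, from_hf_dataset_dict, download, and to_list are new in 0.1.2). The datasource names and data are illustrative, and it is assumed that Datasource is re-exported from the orca_sdk package root and that API credentials are already configured; this is not a verified snippet from the released package:

from datasets import Dataset, DatasetDict

from orca_sdk import Datasource  # assumption: Datasource is exposed at the package root

train = Dataset.from_dict({"text": ["great", "terrible"], "label": [1, 0]})
test = Dataset.from_dict({"text": ["fine"], "label": [1]})

# Create (or open) a datasource, now with an optional description
ds = Datasource.from_hf_dataset("reviews_train", train, if_exists="open", description="Toy sentiment data")

# New in 0.1.2: create one datasource per split from a DatasetDict
splits = Datasource.from_hf_dataset_dict("reviews", DatasetDict({"train": train, "test": test}), if_exists="open")

# New in 0.1.2: download the datasource locally, or fetch its rows as dictionaries
ds.download("./exports", file_type="json")
rows = ds.to_list()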