orca-sdk 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/__init__.py +10 -4
- orca_sdk/_shared/__init__.py +10 -0
- orca_sdk/_shared/metrics.py +393 -0
- orca_sdk/_shared/metrics_test.py +273 -0
- orca_sdk/_utils/analysis_ui.py +12 -10
- orca_sdk/_utils/analysis_ui_style.css +0 -3
- orca_sdk/_utils/auth.py +31 -29
- orca_sdk/_utils/data_parsing.py +28 -2
- orca_sdk/_utils/data_parsing_test.py +15 -15
- orca_sdk/_utils/pagination.py +126 -0
- orca_sdk/_utils/pagination_test.py +132 -0
- orca_sdk/_utils/prediction_result_ui.py +67 -21
- orca_sdk/_utils/tqdm_file_reader.py +12 -0
- orca_sdk/_utils/value_parser.py +45 -0
- orca_sdk/_utils/value_parser_test.py +39 -0
- orca_sdk/async_client.py +3795 -0
- orca_sdk/classification_model.py +601 -129
- orca_sdk/classification_model_test.py +415 -117
- orca_sdk/client.py +3787 -0
- orca_sdk/conftest.py +184 -38
- orca_sdk/credentials.py +162 -20
- orca_sdk/credentials_test.py +100 -16
- orca_sdk/datasource.py +268 -68
- orca_sdk/datasource_test.py +266 -18
- orca_sdk/embedding_model.py +434 -82
- orca_sdk/embedding_model_test.py +66 -33
- orca_sdk/job.py +343 -0
- orca_sdk/job_test.py +108 -0
- orca_sdk/memoryset.py +1690 -324
- orca_sdk/memoryset_test.py +456 -119
- orca_sdk/regression_model.py +694 -0
- orca_sdk/regression_model_test.py +378 -0
- orca_sdk/telemetry.py +460 -143
- orca_sdk/telemetry_test.py +43 -24
- {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.3.dist-info}/METADATA +34 -16
- orca_sdk-0.1.3.dist-info/RECORD +41 -0
- {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.3.dist-info}/WHEEL +1 -1
- orca_sdk/_generated_api_client/__init__.py +0 -3
- orca_sdk/_generated_api_client/api/__init__.py +0 -193
- orca_sdk/_generated_api_client/api/auth/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/auth/check_authentication_auth_get.py +0 -128
- orca_sdk/_generated_api_client/api/auth/create_api_key_auth_api_key_post.py +0 -170
- orca_sdk/_generated_api_client/api/auth/delete_api_key_auth_api_key_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/auth/delete_org_auth_org_delete.py +0 -130
- orca_sdk/_generated_api_client/api/auth/list_api_keys_auth_api_key_get.py +0 -127
- orca_sdk/_generated_api_client/api/classification_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/classification_model/create_evaluation_classification_model_model_name_or_id_evaluation_post.py +0 -183
- orca_sdk/_generated_api_client/api/classification_model/create_model_classification_model_post.py +0 -170
- orca_sdk/_generated_api_client/api/classification_model/delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py +0 -168
- orca_sdk/_generated_api_client/api/classification_model/delete_model_classification_model_name_or_id_delete.py +0 -154
- orca_sdk/_generated_api_client/api/classification_model/get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py +0 -170
- orca_sdk/_generated_api_client/api/classification_model/get_model_classification_model_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/classification_model/list_evaluations_classification_model_model_name_or_id_evaluation_get.py +0 -161
- orca_sdk/_generated_api_client/api/classification_model/list_models_classification_model_get.py +0 -127
- orca_sdk/_generated_api_client/api/classification_model/predict_gpu_classification_model_name_or_id_prediction_post.py +0 -190
- orca_sdk/_generated_api_client/api/datasource/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/datasource/create_datasource_datasource_post.py +0 -167
- orca_sdk/_generated_api_client/api/datasource/delete_datasource_datasource_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/datasource/get_datasource_datasource_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/datasource/list_datasources_datasource_get.py +0 -127
- orca_sdk/_generated_api_client/api/default/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/default/healthcheck_get.py +0 -118
- orca_sdk/_generated_api_client/api/default/healthcheck_gpu_get.py +0 -118
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/create_finetuned_embedding_model_finetuned_embedding_model_post.py +0 -168
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/delete_finetuned_embedding_model_finetuned_embedding_model_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/embed_with_finetuned_model_gpu_finetuned_embedding_model_name_or_id_embedding_post.py +0 -189
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/get_finetuned_embedding_model_finetuned_embedding_model_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/list_finetuned_embedding_models_finetuned_embedding_model_get.py +0 -127
- orca_sdk/_generated_api_client/api/memoryset/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +0 -181
- orca_sdk/_generated_api_client/api/memoryset/create_analysis_memoryset_name_or_id_analysis_post.py +0 -183
- orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +0 -168
- orca_sdk/_generated_api_client/api/memoryset/delete_memories_memoryset_name_or_id_memories_delete_post.py +0 -181
- orca_sdk/_generated_api_client/api/memoryset/delete_memory_memoryset_name_or_id_memory_memory_id_delete.py +0 -167
- orca_sdk/_generated_api_client/api/memoryset/delete_memoryset_memoryset_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/memoryset/get_analysis_memoryset_name_or_id_analysis_analysis_task_id_get.py +0 -169
- orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +0 -188
- orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +0 -169
- orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +0 -184
- orca_sdk/_generated_api_client/api/memoryset/list_analyses_memoryset_name_or_id_analysis_get.py +0 -260
- orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +0 -127
- orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +0 -193
- orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +0 -188
- orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +0 -191
- orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +0 -187
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/embed_with_pretrained_model_gpu_pretrained_embedding_model_model_name_embedding_post.py +0 -188
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/get_pretrained_embedding_model_pretrained_embedding_model_model_name_get.py +0 -157
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/list_pretrained_embedding_models_pretrained_embedding_model_get.py +0 -127
- orca_sdk/_generated_api_client/api/task/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/task/abort_task_task_task_id_abort_delete.py +0 -154
- orca_sdk/_generated_api_client/api/task/get_task_status_task_task_id_status_get.py +0 -156
- orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +0 -243
- orca_sdk/_generated_api_client/api/telemetry/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/telemetry/drop_feedback_category_with_data_telemetry_feedback_category_name_or_id_delete.py +0 -162
- orca_sdk/_generated_api_client/api/telemetry/get_feedback_category_telemetry_feedback_category_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +0 -157
- orca_sdk/_generated_api_client/api/telemetry/list_feedback_categories_telemetry_feedback_category_get.py +0 -127
- orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +0 -175
- orca_sdk/_generated_api_client/api/telemetry/record_prediction_feedback_telemetry_prediction_feedback_put.py +0 -171
- orca_sdk/_generated_api_client/api/telemetry/update_prediction_telemetry_prediction_prediction_id_patch.py +0 -181
- orca_sdk/_generated_api_client/client.py +0 -216
- orca_sdk/_generated_api_client/errors.py +0 -38
- orca_sdk/_generated_api_client/models/__init__.py +0 -159
- orca_sdk/_generated_api_client/models/analyze_neighbor_labels_result.py +0 -84
- orca_sdk/_generated_api_client/models/api_key_metadata.py +0 -118
- orca_sdk/_generated_api_client/models/base_model.py +0 -55
- orca_sdk/_generated_api_client/models/body_create_datasource_datasource_post.py +0 -176
- orca_sdk/_generated_api_client/models/classification_evaluation_result.py +0 -114
- orca_sdk/_generated_api_client/models/clone_labeled_memoryset_request.py +0 -150
- orca_sdk/_generated_api_client/models/column_info.py +0 -114
- orca_sdk/_generated_api_client/models/column_type.py +0 -14
- orca_sdk/_generated_api_client/models/conflict_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/create_api_key_request.py +0 -99
- orca_sdk/_generated_api_client/models/create_api_key_response.py +0 -126
- orca_sdk/_generated_api_client/models/create_labeled_memoryset_request.py +0 -259
- orca_sdk/_generated_api_client/models/create_rac_model_request.py +0 -209
- orca_sdk/_generated_api_client/models/datasource_metadata.py +0 -142
- orca_sdk/_generated_api_client/models/delete_memories_request.py +0 -70
- orca_sdk/_generated_api_client/models/embed_request.py +0 -127
- orca_sdk/_generated_api_client/models/embedding_finetuning_method.py +0 -9
- orca_sdk/_generated_api_client/models/evaluation_request.py +0 -180
- orca_sdk/_generated_api_client/models/evaluation_response.py +0 -140
- orca_sdk/_generated_api_client/models/feedback_type.py +0 -9
- orca_sdk/_generated_api_client/models/field_validation_error.py +0 -103
- orca_sdk/_generated_api_client/models/filter_item.py +0 -231
- orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +0 -15
- orca_sdk/_generated_api_client/models/filter_item_field_type_2_item_type_1.py +0 -16
- orca_sdk/_generated_api_client/models/filter_item_op.py +0 -16
- orca_sdk/_generated_api_client/models/find_duplicates_analysis_result.py +0 -70
- orca_sdk/_generated_api_client/models/finetune_embedding_model_request.py +0 -259
- orca_sdk/_generated_api_client/models/finetune_embedding_model_request_training_args.py +0 -66
- orca_sdk/_generated_api_client/models/finetuned_embedding_model_metadata.py +0 -166
- orca_sdk/_generated_api_client/models/get_memories_request.py +0 -70
- orca_sdk/_generated_api_client/models/internal_server_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/label_class_metrics.py +0 -108
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +0 -274
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/label_prediction_result.py +0 -101
- orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +0 -232
- orca_sdk/_generated_api_client/models/labeled_memory.py +0 -197
- orca_sdk/_generated_api_client/models/labeled_memory_insert.py +0 -108
- orca_sdk/_generated_api_client/models/labeled_memory_insert_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +0 -258
- orca_sdk/_generated_api_client/models/labeled_memory_lookup_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memory_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memory_metrics.py +0 -277
- orca_sdk/_generated_api_client/models/labeled_memory_update.py +0 -171
- orca_sdk/_generated_api_client/models/labeled_memory_update_metadata_type_0.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +0 -195
- orca_sdk/_generated_api_client/models/list_analyses_memoryset_name_or_id_analysis_get_type_type_0.py +0 -9
- orca_sdk/_generated_api_client/models/list_memories_request.py +0 -104
- orca_sdk/_generated_api_client/models/list_predictions_request.py +0 -234
- orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_0.py +0 -9
- orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_1.py +0 -9
- orca_sdk/_generated_api_client/models/lookup_request.py +0 -81
- orca_sdk/_generated_api_client/models/memoryset_analysis_request.py +0 -83
- orca_sdk/_generated_api_client/models/memoryset_analysis_request_type.py +0 -9
- orca_sdk/_generated_api_client/models/memoryset_analysis_response.py +0 -180
- orca_sdk/_generated_api_client/models/memoryset_analysis_response_config.py +0 -66
- orca_sdk/_generated_api_client/models/memoryset_analysis_response_type.py +0 -9
- orca_sdk/_generated_api_client/models/not_found_error_response.py +0 -100
- orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +0 -20
- orca_sdk/_generated_api_client/models/prediction_feedback.py +0 -157
- orca_sdk/_generated_api_client/models/prediction_feedback_category.py +0 -115
- orca_sdk/_generated_api_client/models/prediction_feedback_request.py +0 -122
- orca_sdk/_generated_api_client/models/prediction_feedback_result.py +0 -102
- orca_sdk/_generated_api_client/models/prediction_request.py +0 -169
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +0 -97
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +0 -11
- orca_sdk/_generated_api_client/models/rac_head_type.py +0 -11
- orca_sdk/_generated_api_client/models/rac_model_metadata.py +0 -191
- orca_sdk/_generated_api_client/models/service_unavailable_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/task.py +0 -198
- orca_sdk/_generated_api_client/models/task_status.py +0 -14
- orca_sdk/_generated_api_client/models/task_status_info.py +0 -133
- orca_sdk/_generated_api_client/models/unauthenticated_error_response.py +0 -72
- orca_sdk/_generated_api_client/models/unauthorized_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/unprocessable_input_error_response.py +0 -94
- orca_sdk/_generated_api_client/models/update_prediction_request.py +0 -93
- orca_sdk/_generated_api_client/py.typed +0 -1
- orca_sdk/_generated_api_client/types.py +0 -56
- orca_sdk/_utils/task.py +0 -73
- orca_sdk-0.1.1.dist-info/RECORD +0 -175
orca_sdk/datasource.py
CHANGED
|
@@ -1,30 +1,91 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
+
import os
|
|
4
5
|
import tempfile
|
|
6
|
+
import zipfile
|
|
5
7
|
from datetime import datetime
|
|
8
|
+
from io import BytesIO
|
|
6
9
|
from os import PathLike
|
|
7
10
|
from pathlib import Path
|
|
8
|
-
from typing import cast
|
|
11
|
+
from typing import Literal, Union, cast
|
|
9
12
|
|
|
10
13
|
import pandas as pd
|
|
11
14
|
import pyarrow as pa
|
|
12
|
-
from datasets import Dataset
|
|
15
|
+
from datasets import Dataset, DatasetDict
|
|
16
|
+
from httpx._types import FileTypes # type: ignore
|
|
17
|
+
from pyarrow import parquet
|
|
13
18
|
from torch.utils.data import DataLoader as TorchDataLoader
|
|
14
19
|
from torch.utils.data import Dataset as TorchDataset
|
|
20
|
+
from tqdm.auto import tqdm
|
|
15
21
|
|
|
16
|
-
from ._generated_api_client.api import (
|
|
17
|
-
delete_datasource,
|
|
18
|
-
get_datasource,
|
|
19
|
-
list_datasources,
|
|
20
|
-
)
|
|
21
|
-
from ._generated_api_client.api.datasource.create_datasource_datasource_post import (
|
|
22
|
-
_parse_response as parse_create_response,
|
|
23
|
-
)
|
|
24
|
-
from ._generated_api_client.client import get_client
|
|
25
|
-
from ._generated_api_client.models import ColumnType, DatasourceMetadata
|
|
26
22
|
from ._utils.common import CreateMode, DropMode
|
|
27
|
-
from ._utils.data_parsing import
|
|
23
|
+
from ._utils.data_parsing import hf_dataset_from_torch
|
|
24
|
+
from ._utils.tqdm_file_reader import TqdmFileReader
|
|
25
|
+
from .client import DatasourceMetadata, OrcaClient
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _upload_files_to_datasource(
    name: str,
    file_paths: list[Path],
    description: str | None = None,
) -> DatasourceMetadata:
    """
    Helper function to upload files to create a datasource using a manual multipart request.

    The files are posted to the `/datasource/upload` endpoint together with the datasource
    name and description as form fields. A byte-based progress bar sized to the combined
    size of all files is shown while the request runs.

    Params:
        name: Name for the datasource
        file_paths: List of file paths to upload
        description: Optional description for the datasource

    Returns:
        Metadata for the created datasource

    Raises:
        OSError: If one of the files cannot be stat-ed or opened for reading
    """
    # Imported locally so this helper does not depend on a module-level import being added
    from contextlib import ExitStack

    # The progress bar counts bytes, so it needs the combined size of every file up front
    total_size = sum(file_path.stat().st_size for file_path in file_paths)

    # The exit stack owns the progress bar and every opened file handle, so all of them are
    # released once the request has finished, and also when opening a later file or the
    # request itself raises. Previously the handles were opened and never closed.
    with ExitStack() as stack:
        pbar = stack.enter_context(tqdm(total=total_size, unit="B", unit_scale=True, desc="Uploading"))

        files: list[tuple[Literal["files"], FileTypes]] = []
        for file_path in file_paths:
            buffered_reader = stack.enter_context(open(file_path, "rb"))
            # NOTE(review): TqdmFileReader presumably advances `pbar` as the HTTP client reads
            # from it; the cast only satisfies the FileTypes annotation - confirm against helper
            tqdm_reader = TqdmFileReader(buffered_reader, pbar)
            files.append(("files", (file_path.name, cast(bytes, tqdm_reader))))

        # The request has to run while the handles are still open because the readers are
        # consumed when the multipart body is sent
        client = OrcaClient._resolve_client()
        metadata = client.POST(
            "/datasource/upload",
            files=files,
            data={"name": name, "description": description},
        )

    return metadata
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _handle_existing_datasource(name: str, if_exists: CreateMode) -> Union["Datasource", None]:
    """
    Resolve what a creation call should do when the requested datasource name may be taken.

    Params:
        name: Name of the datasource to check
        if_exists: What to do if a datasource with the same name already exists

    Returns:
        The existing datasource when it exists and `if_exists` is "open", otherwise None to
        signal that the caller should go ahead and create the datasource

    Raises:
        ValueError: If the datasource already exists and if_exists is "error"
    """
    # Nothing with this name yet, the caller is free to create it
    if not Datasource.exists(name):
        return None

    if if_exists == "open":
        return Datasource.open(name)

    if if_exists == "error":
        raise ValueError(f"Dataset with name {name} already exists")

    # Any other mode falls through to creation, exactly like a missing datasource
    return None
|
|
28
89
|
|
|
29
90
|
|
|
30
91
|
class Datasource:
|
|
@@ -37,6 +98,7 @@ class Datasource:
|
|
|
37
98
|
Attributes:
|
|
38
99
|
id: Unique identifier for the datasource
|
|
39
100
|
name: Unique name of the datasource
|
|
101
|
+
description: Optional description of the datasource
|
|
40
102
|
length: Number of rows in the datasource
|
|
41
103
|
created_at: When the datasource was created
|
|
42
104
|
columns: Dictionary of column names and types
|
|
@@ -44,6 +106,7 @@ class Datasource:
|
|
|
44
106
|
|
|
45
107
|
id: str
|
|
46
108
|
name: str
|
|
109
|
+
description: str | None
|
|
47
110
|
length: int
|
|
48
111
|
created_at: datetime
|
|
49
112
|
updated_at: datetime
|
|
@@ -51,20 +114,19 @@ class Datasource:
|
|
|
51
114
|
|
|
52
115
|
def __init__(self, metadata: DatasourceMetadata):
|
|
53
116
|
# for internal use only, do not document
|
|
54
|
-
self.id = metadata
|
|
55
|
-
self.name = metadata
|
|
56
|
-
self.length = metadata
|
|
57
|
-
self.created_at = metadata
|
|
58
|
-
self.updated_at = metadata
|
|
117
|
+
self.id = metadata["id"]
|
|
118
|
+
self.name = metadata["name"]
|
|
119
|
+
self.length = metadata["length"]
|
|
120
|
+
self.created_at = datetime.fromisoformat(metadata["created_at"])
|
|
121
|
+
self.updated_at = datetime.fromisoformat(metadata["updated_at"])
|
|
122
|
+
self.description = metadata["description"]
|
|
59
123
|
self.columns = {
|
|
60
|
-
column
|
|
61
|
-
f"enum({', '.join(f'{option!r}' for option in column
|
|
62
|
-
if column
|
|
63
|
-
else "str"
|
|
64
|
-
if column.type == ColumnType.STRING
|
|
65
|
-
else column.type.value.lower()
|
|
124
|
+
column["name"]: (
|
|
125
|
+
f"enum({', '.join(f'{option!r}' for option in column['enum_options'] or []) if 'enum_options' in column else ''})"
|
|
126
|
+
if column["type"] == "ENUM"
|
|
127
|
+
else "str" if column["type"] == "STRING" else column["type"].lower()
|
|
66
128
|
)
|
|
67
|
-
for column in metadata
|
|
129
|
+
for column in metadata["columns"]
|
|
68
130
|
}
|
|
69
131
|
|
|
70
132
|
def __eq__(self, other) -> bool:
|
|
@@ -82,7 +144,9 @@ class Datasource:
|
|
|
82
144
|
)
|
|
83
145
|
|
|
84
146
|
@classmethod
|
|
85
|
-
def from_hf_dataset(
|
|
147
|
+
def from_hf_dataset(
|
|
148
|
+
cls, name: str, dataset: Dataset, if_exists: CreateMode = "error", description: str | None = None
|
|
149
|
+
) -> Datasource:
|
|
86
150
|
"""
|
|
87
151
|
Create a new datasource from a Hugging Face Dataset
|
|
88
152
|
|
|
@@ -91,6 +155,7 @@ class Datasource:
|
|
|
91
155
|
dataset: The Hugging Face Dataset to create the datasource from
|
|
92
156
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
93
157
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
158
|
+
description: Optional description for the datasource
|
|
94
159
|
|
|
95
160
|
Returns:
|
|
96
161
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -98,32 +163,54 @@ class Datasource:
|
|
|
98
163
|
Raises:
|
|
99
164
|
ValueError: If the datasource already exists and if_exists is `"error"`
|
|
100
165
|
"""
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
if
|
|
104
|
-
|
|
105
|
-
raise ValueError(f"Dataset with name {name} already exists")
|
|
106
|
-
elif if_exists == "open":
|
|
107
|
-
return cls.open(name)
|
|
166
|
+
# Check if datasource already exists and handle accordingly
|
|
167
|
+
existing = _handle_existing_datasource(name, if_exists)
|
|
168
|
+
if existing is not None:
|
|
169
|
+
return existing
|
|
108
170
|
|
|
109
171
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
110
172
|
dataset.save_to_disk(tmp_dir)
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
metadata = parse_create_response(
|
|
118
|
-
response=client.get_httpx_client().request(
|
|
119
|
-
method="post",
|
|
120
|
-
url="/datasource/",
|
|
121
|
-
files=files,
|
|
122
|
-
data={"name": name},
|
|
123
|
-
)
|
|
124
|
-
)
|
|
173
|
+
|
|
174
|
+
# Get all file paths in the directory
|
|
175
|
+
file_paths = list(Path(tmp_dir).iterdir())
|
|
176
|
+
|
|
177
|
+
# Use the helper function to upload files
|
|
178
|
+
metadata = _upload_files_to_datasource(name, file_paths, description)
|
|
125
179
|
return cls(metadata=metadata)
|
|
126
180
|
|
|
181
|
+
@classmethod
def from_hf_dataset_dict(
    cls,
    name: str,
    dataset_dict: DatasetDict,
    if_exists: CreateMode = "error",
    description: dict[str, str | None] | str | None = None,
) -> dict[str, Datasource]:
    """
    Create datasources from a Hugging Face DatasetDict

    One datasource is created per entry of the dataset dict by delegating to
    `from_hf_dataset` with the name `"{name}_{dataset_name}"`.

    Params:
        name: Name prefix for the new datasources, will be suffixed with the dataset name
        dataset_dict: The Hugging Face DatasetDict to create the datasources from
        if_exists: What to do if a datasource with the same name already exists, defaults to
            `"error"`. Other option is `"open"` to open the existing datasource.
        description: Optional description for the datasources, can be a string that is used
            for every datasource or a dictionary of dataset names to descriptions. Dataset
            names that are missing from the dictionary get no description.

    Returns:
        A dictionary of datasource handles, keyed by the dataset name

    Raises:
        ValueError: If a datasource already exists and if_exists is `"error"`
    """
    # Normalize the two accepted shapes into a per-dataset mapping without rebinding the
    # parameter to a different type
    if description is None or isinstance(description, str):
        descriptions = dict.fromkeys(dataset_dict, description)
    else:
        descriptions = description

    # `.get` rather than indexing: a dictionary that only describes some of the datasets
    # must not raise KeyError halfway through, after earlier datasources were already created
    return {
        dataset_name: cls.from_hf_dataset(
            f"{name}_{dataset_name}",
            dataset,
            if_exists=if_exists,
            description=descriptions.get(dataset_name),
        )
        for dataset_name, dataset in dataset_dict.items()
    }
|
|
213
|
+
|
|
127
214
|
@classmethod
|
|
128
215
|
def from_pytorch(
|
|
129
216
|
cls,
|
|
@@ -131,6 +218,7 @@ class Datasource:
|
|
|
131
218
|
torch_data: TorchDataLoader | TorchDataset,
|
|
132
219
|
column_names: list[str] | None = None,
|
|
133
220
|
if_exists: CreateMode = "error",
|
|
221
|
+
description: str | None = None,
|
|
134
222
|
) -> Datasource:
|
|
135
223
|
"""
|
|
136
224
|
Create a new datasource from a PyTorch DataLoader or Dataset
|
|
@@ -142,6 +230,7 @@ class Datasource:
|
|
|
142
230
|
argument must be provided to specify the names of the columns.
|
|
143
231
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
144
232
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
233
|
+
description: Optional description for the datasource
|
|
145
234
|
|
|
146
235
|
Returns:
|
|
147
236
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -150,10 +239,12 @@ class Datasource:
|
|
|
150
239
|
ValueError: If the datasource already exists and if_exists is `"error"`
|
|
151
240
|
"""
|
|
152
241
|
hf_dataset = hf_dataset_from_torch(torch_data, column_names=column_names)
|
|
153
|
-
return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)
|
|
242
|
+
return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists, description=description)
|
|
154
243
|
|
|
155
244
|
@classmethod
|
|
156
|
-
def from_list(
|
|
245
|
+
def from_list(
|
|
246
|
+
cls, name: str, data: list[dict], if_exists: CreateMode = "error", description: str | None = None
|
|
247
|
+
) -> Datasource:
|
|
157
248
|
"""
|
|
158
249
|
Create a new datasource from a list of dictionaries
|
|
159
250
|
|
|
@@ -162,6 +253,7 @@ class Datasource:
|
|
|
162
253
|
data: The list of dictionaries to create the datasource from
|
|
163
254
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
164
255
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
256
|
+
description: Optional description for the datasource
|
|
165
257
|
|
|
166
258
|
Returns:
|
|
167
259
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -172,11 +264,22 @@ class Datasource:
|
|
|
172
264
|
Examples:
|
|
173
265
|
>>> Datasource.from_list("my_datasource", [{"text": "Hello, world!", "label": 1}, {"text": "Goodbye", "label": 0}])
|
|
174
266
|
"""
|
|
175
|
-
|
|
176
|
-
|
|
267
|
+
# Check if datasource already exists and handle accordingly
|
|
268
|
+
existing = _handle_existing_datasource(name, if_exists)
|
|
269
|
+
if existing is not None:
|
|
270
|
+
return existing
|
|
271
|
+
|
|
272
|
+
client = OrcaClient._resolve_client()
|
|
273
|
+
metadata = client.POST(
|
|
274
|
+
"/datasource",
|
|
275
|
+
json={"name": name, "description": description, "content": data},
|
|
276
|
+
)
|
|
277
|
+
return cls(metadata=metadata)
|
|
177
278
|
|
|
178
279
|
@classmethod
|
|
179
|
-
def from_dict(
|
|
280
|
+
def from_dict(
|
|
281
|
+
cls, name: str, data: dict, if_exists: CreateMode = "error", description: str | None = None
|
|
282
|
+
) -> Datasource:
|
|
180
283
|
"""
|
|
181
284
|
Create a new datasource from a dictionary of columns
|
|
182
285
|
|
|
@@ -185,6 +288,7 @@ class Datasource:
|
|
|
185
288
|
data: The dictionary of columns to create the datasource from
|
|
186
289
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
187
290
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
291
|
+
description: Optional description for the datasource
|
|
188
292
|
|
|
189
293
|
Returns:
|
|
190
294
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -195,11 +299,22 @@ class Datasource:
|
|
|
195
299
|
Examples:
|
|
196
300
|
>>> Datasource.from_dict("my_datasource", {"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
|
|
197
301
|
"""
|
|
198
|
-
|
|
199
|
-
|
|
302
|
+
# Check if datasource already exists and handle accordingly
|
|
303
|
+
existing = _handle_existing_datasource(name, if_exists)
|
|
304
|
+
if existing is not None:
|
|
305
|
+
return existing
|
|
306
|
+
|
|
307
|
+
client = OrcaClient._resolve_client()
|
|
308
|
+
metadata = client.POST(
|
|
309
|
+
"/datasource",
|
|
310
|
+
json={"name": name, "description": description, "content": data},
|
|
311
|
+
)
|
|
312
|
+
return cls(metadata=metadata)
|
|
200
313
|
|
|
201
314
|
@classmethod
|
|
202
|
-
def from_pandas(
|
|
315
|
+
def from_pandas(
|
|
316
|
+
cls, name: str, dataframe: pd.DataFrame, if_exists: CreateMode = "error", description: str | None = None
|
|
317
|
+
) -> Datasource:
|
|
203
318
|
"""
|
|
204
319
|
Create a new datasource from a pandas DataFrame
|
|
205
320
|
|
|
@@ -208,6 +323,7 @@ class Datasource:
|
|
|
208
323
|
dataframe: The pandas DataFrame to create the datasource from
|
|
209
324
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
210
325
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
326
|
+
description: Optional description for the datasource
|
|
211
327
|
|
|
212
328
|
Returns:
|
|
213
329
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -215,11 +331,13 @@ class Datasource:
|
|
|
215
331
|
Raises:
|
|
216
332
|
ValueError: If the datasource already exists and if_exists is `"error"`
|
|
217
333
|
"""
|
|
218
|
-
|
|
219
|
-
return cls.from_hf_dataset(name,
|
|
334
|
+
dataset = Dataset.from_pandas(dataframe)
|
|
335
|
+
return cls.from_hf_dataset(name, dataset, if_exists=if_exists, description=description)
|
|
220
336
|
|
|
221
337
|
@classmethod
|
|
222
|
-
def from_arrow(
|
|
338
|
+
def from_arrow(
|
|
339
|
+
cls, name: str, pyarrow_table: pa.Table, if_exists: CreateMode = "error", description: str | None = None
|
|
340
|
+
) -> Datasource:
|
|
223
341
|
"""
|
|
224
342
|
Create a new datasource from a pyarrow Table
|
|
225
343
|
|
|
@@ -228,6 +346,7 @@ class Datasource:
|
|
|
228
346
|
pyarrow_table: The pyarrow Table to create the datasource from
|
|
229
347
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
230
348
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
349
|
+
description: Optional description for the datasource
|
|
231
350
|
|
|
232
351
|
Returns:
|
|
233
352
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -235,11 +354,29 @@ class Datasource:
|
|
|
235
354
|
Raises:
|
|
236
355
|
ValueError: If the datasource already exists and if_exists is `"error"`
|
|
237
356
|
"""
|
|
238
|
-
|
|
239
|
-
|
|
357
|
+
# Check if datasource already exists and handle accordingly
|
|
358
|
+
existing = _handle_existing_datasource(name, if_exists)
|
|
359
|
+
if existing is not None:
|
|
360
|
+
return existing
|
|
361
|
+
|
|
362
|
+
# Write to bytes buffer
|
|
363
|
+
buffer = BytesIO()
|
|
364
|
+
parquet.write_table(pyarrow_table, buffer)
|
|
365
|
+
parquet_bytes = buffer.getvalue()
|
|
366
|
+
|
|
367
|
+
client = OrcaClient._resolve_client()
|
|
368
|
+
metadata = client.POST(
|
|
369
|
+
"/datasource/upload",
|
|
370
|
+
files=[("files", ("data.parquet", parquet_bytes))],
|
|
371
|
+
data={"name": name, "description": description},
|
|
372
|
+
)
|
|
373
|
+
|
|
374
|
+
return cls(metadata=metadata)
|
|
240
375
|
|
|
241
376
|
@classmethod
|
|
242
|
-
def from_disk(
|
|
377
|
+
def from_disk(
|
|
378
|
+
cls, name: str, file_path: str | PathLike, if_exists: CreateMode = "error", description: str | None = None
|
|
379
|
+
) -> Datasource:
|
|
243
380
|
"""
|
|
244
381
|
Create a new datasource from a local file
|
|
245
382
|
|
|
@@ -256,6 +393,7 @@ class Datasource:
|
|
|
256
393
|
|
|
257
394
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
258
395
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
396
|
+
description: Optional description for the datasource
|
|
259
397
|
|
|
260
398
|
Returns:
|
|
261
399
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -263,16 +401,31 @@ class Datasource:
|
|
|
263
401
|
Raises:
|
|
264
402
|
ValueError: If the datasource already exists and if_exists is `"error"`
|
|
265
403
|
"""
|
|
266
|
-
|
|
267
|
-
|
|
404
|
+
# Check if datasource already exists and handle accordingly
|
|
405
|
+
existing = _handle_existing_datasource(name, if_exists)
|
|
406
|
+
if existing is not None:
|
|
407
|
+
return existing
|
|
408
|
+
|
|
409
|
+
file_path = Path(file_path)
|
|
410
|
+
|
|
411
|
+
# For dataset directories, use the upload endpoint with multiple files
|
|
412
|
+
if file_path.is_dir():
|
|
413
|
+
return cls.from_hf_dataset(
|
|
414
|
+
name, Dataset.load_from_disk(file_path), if_exists=if_exists, description=description
|
|
415
|
+
)
|
|
416
|
+
|
|
417
|
+
# For single files, use the helper function to upload files
|
|
418
|
+
metadata = _upload_files_to_datasource(name, [file_path], description)
|
|
419
|
+
|
|
420
|
+
return cls(metadata=metadata)
|
|
268
421
|
|
|
269
422
|
@classmethod
|
|
270
|
-
def open(cls,
|
|
423
|
+
def open(cls, name_or_id: str) -> Datasource:
|
|
271
424
|
"""
|
|
272
425
|
Get a handle to a datasource by name or id in the OrcaCloud
|
|
273
426
|
|
|
274
427
|
Params:
|
|
275
|
-
|
|
428
|
+
name_or_id: The name or unique identifier of the datasource to get
|
|
276
429
|
|
|
277
430
|
Returns:
|
|
278
431
|
A handle to the existing datasource in the OrcaCloud
|
|
@@ -280,7 +433,8 @@ class Datasource:
|
|
|
280
433
|
Raises:
|
|
281
434
|
LookupError: If the datasource does not exist
|
|
282
435
|
"""
|
|
283
|
-
|
|
436
|
+
client = OrcaClient._resolve_client()
|
|
437
|
+
return cls(client.GET("/datasource/{name_or_id}", params={"name_or_id": name_or_id}))
|
|
284
438
|
|
|
285
439
|
@classmethod
|
|
286
440
|
def exists(cls, name_or_id: str) -> bool:
|
|
@@ -307,7 +461,8 @@ class Datasource:
|
|
|
307
461
|
Returns:
|
|
308
462
|
A list of all datasource handles in the OrcaCloud
|
|
309
463
|
"""
|
|
310
|
-
|
|
464
|
+
client = OrcaClient._resolve_client()
|
|
465
|
+
return [cls(metadata) for metadata in client.GET("/datasource")]
|
|
311
466
|
|
|
312
467
|
@classmethod
|
|
313
468
|
def drop(cls, name_or_id: str, if_not_exists: DropMode = "error") -> None:
|
|
@@ -323,7 +478,8 @@ class Datasource:
|
|
|
323
478
|
LookupError: If the datasource does not exist and if_not_exists is `"error"`
|
|
324
479
|
"""
|
|
325
480
|
try:
|
|
326
|
-
|
|
481
|
+
client = OrcaClient._resolve_client()
|
|
482
|
+
client.DELETE("/datasource/{name_or_id}", params={"name_or_id": name_or_id})
|
|
327
483
|
logging.info(f"Deleted datasource {name_or_id}")
|
|
328
484
|
except LookupError:
|
|
329
485
|
if if_not_exists == "error":
|
|
@@ -331,3 +487,47 @@ class Datasource:
|
|
|
331
487
|
|
|
332
488
|
def __len__(self) -> int:
    """
    Return the handle's `length` attribute so that `len()` can be called on a datasource
    """
    return self.length
|
|
490
|
+
|
|
491
|
+
def download(
    self, output_dir: str | PathLike, file_type: Literal["hf_dataset", "json", "csv"] = "hf_dataset"
) -> None:
    """
    Download the datasource to a specified path in the specified format type

    Params:
        output_dir: The local directory where the downloaded file will be saved. The directory
            (and any missing parents) is created if it does not exist yet.
        file_type: The type of file to download. `"hf_dataset"` is transferred as a zip archive
            that is extracted into `<output_dir>/<name>/` and then removed; `"json"` and `"csv"`
            are written to `<output_dir>/<name>.<file_type>`.

    Returns:
        None
    """
    extension = "zip" if file_type == "hf_dataset" else file_type
    target_dir = Path(output_dir)
    output_path = target_dir / f"{self.name}.{extension}"

    # Resolve the client before touching the filesystem so that a failure here does not
    # leave an empty output file behind.
    client = OrcaClient._resolve_client()
    target_dir.mkdir(parents=True, exist_ok=True)

    try:
        with open(output_path, "wb") as download_file:
            with client.stream(
                "GET", f"/datasource/{self.id}/download", params={"file_type": file_type}
            ) as response:
                # The chunk count header is optional; without it the progress bar is disabled
                total_chunks = (
                    int(response.headers["X-Total-Chunks"]) if "X-Total-Chunks" in response.headers else None
                )
                with tqdm(desc="Downloading", total=total_chunks, disable=total_chunks is None) as progress:
                    for chunk in response.iter_bytes():
                        download_file.write(chunk)
                        progress.update(1)
    except BaseException:
        # Never leave a truncated download on disk where it could be mistaken for a complete one
        output_path.unlink(missing_ok=True)
        raise

    # extract the zip file
    if extension == "zip":
        extract_dir = target_dir / self.name
        with zipfile.ZipFile(output_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        output_path.unlink()  # Remove the zip file after extraction
        logging.info(f"Downloaded {extract_dir}")
    else:
        logging.info(f"Downloaded {output_path}")
|
|
524
|
+
|
|
525
|
+
def to_list(self) -> list[dict]:
    """
    Convert the datasource to a list of dictionaries.

    The rows are requested from the datasource download endpoint with the `"json"` file type and
    the client's response is returned as is.

    Returns:
        A list of dictionaries representation of the datasource.
    """
    orca_client = OrcaClient._resolve_client()
    rows = orca_client.GET(
        "/datasource/{name_or_id}/download",
        params={"name_or_id": self.id, "file_type": "json"},
    )
    return rows
|