orca-sdk 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/__init__.py +10 -4
- orca_sdk/_shared/__init__.py +10 -0
- orca_sdk/_shared/metrics.py +393 -0
- orca_sdk/_shared/metrics_test.py +273 -0
- orca_sdk/_utils/analysis_ui.py +12 -10
- orca_sdk/_utils/analysis_ui_style.css +0 -3
- orca_sdk/_utils/auth.py +27 -29
- orca_sdk/_utils/data_parsing.py +28 -2
- orca_sdk/_utils/data_parsing_test.py +15 -15
- orca_sdk/_utils/pagination.py +126 -0
- orca_sdk/_utils/pagination_test.py +132 -0
- orca_sdk/_utils/prediction_result_ui.py +67 -21
- orca_sdk/_utils/tqdm_file_reader.py +12 -0
- orca_sdk/_utils/value_parser.py +45 -0
- orca_sdk/_utils/value_parser_test.py +39 -0
- orca_sdk/classification_model.py +439 -129
- orca_sdk/classification_model_test.py +334 -104
- orca_sdk/client.py +3747 -0
- orca_sdk/conftest.py +164 -19
- orca_sdk/credentials.py +120 -18
- orca_sdk/credentials_test.py +20 -0
- orca_sdk/datasource.py +259 -68
- orca_sdk/datasource_test.py +242 -0
- orca_sdk/embedding_model.py +425 -82
- orca_sdk/embedding_model_test.py +39 -13
- orca_sdk/job.py +337 -0
- orca_sdk/job_test.py +108 -0
- orca_sdk/memoryset.py +1341 -305
- orca_sdk/memoryset_test.py +350 -111
- orca_sdk/regression_model.py +684 -0
- orca_sdk/regression_model_test.py +369 -0
- orca_sdk/telemetry.py +449 -143
- orca_sdk/telemetry_test.py +43 -24
- {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.2.dist-info}/METADATA +34 -16
- orca_sdk-0.1.2.dist-info/RECORD +40 -0
- {orca_sdk-0.1.1.dist-info → orca_sdk-0.1.2.dist-info}/WHEEL +1 -1
- orca_sdk/_generated_api_client/__init__.py +0 -3
- orca_sdk/_generated_api_client/api/__init__.py +0 -193
- orca_sdk/_generated_api_client/api/auth/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/auth/check_authentication_auth_get.py +0 -128
- orca_sdk/_generated_api_client/api/auth/create_api_key_auth_api_key_post.py +0 -170
- orca_sdk/_generated_api_client/api/auth/delete_api_key_auth_api_key_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/auth/delete_org_auth_org_delete.py +0 -130
- orca_sdk/_generated_api_client/api/auth/list_api_keys_auth_api_key_get.py +0 -127
- orca_sdk/_generated_api_client/api/classification_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/classification_model/create_evaluation_classification_model_model_name_or_id_evaluation_post.py +0 -183
- orca_sdk/_generated_api_client/api/classification_model/create_model_classification_model_post.py +0 -170
- orca_sdk/_generated_api_client/api/classification_model/delete_evaluation_classification_model_model_name_or_id_evaluation_task_id_delete.py +0 -168
- orca_sdk/_generated_api_client/api/classification_model/delete_model_classification_model_name_or_id_delete.py +0 -154
- orca_sdk/_generated_api_client/api/classification_model/get_evaluation_classification_model_model_name_or_id_evaluation_task_id_get.py +0 -170
- orca_sdk/_generated_api_client/api/classification_model/get_model_classification_model_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/classification_model/list_evaluations_classification_model_model_name_or_id_evaluation_get.py +0 -161
- orca_sdk/_generated_api_client/api/classification_model/list_models_classification_model_get.py +0 -127
- orca_sdk/_generated_api_client/api/classification_model/predict_gpu_classification_model_name_or_id_prediction_post.py +0 -190
- orca_sdk/_generated_api_client/api/datasource/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/datasource/create_datasource_datasource_post.py +0 -167
- orca_sdk/_generated_api_client/api/datasource/delete_datasource_datasource_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/datasource/get_datasource_datasource_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/datasource/list_datasources_datasource_get.py +0 -127
- orca_sdk/_generated_api_client/api/default/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/default/healthcheck_get.py +0 -118
- orca_sdk/_generated_api_client/api/default/healthcheck_gpu_get.py +0 -118
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/create_finetuned_embedding_model_finetuned_embedding_model_post.py +0 -168
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/delete_finetuned_embedding_model_finetuned_embedding_model_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/embed_with_finetuned_model_gpu_finetuned_embedding_model_name_or_id_embedding_post.py +0 -189
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/get_finetuned_embedding_model_finetuned_embedding_model_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/finetuned_embedding_model/list_finetuned_embedding_models_finetuned_embedding_model_get.py +0 -127
- orca_sdk/_generated_api_client/api/memoryset/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/memoryset/clone_memoryset_memoryset_name_or_id_clone_post.py +0 -181
- orca_sdk/_generated_api_client/api/memoryset/create_analysis_memoryset_name_or_id_analysis_post.py +0 -183
- orca_sdk/_generated_api_client/api/memoryset/create_memoryset_memoryset_post.py +0 -168
- orca_sdk/_generated_api_client/api/memoryset/delete_memories_memoryset_name_or_id_memories_delete_post.py +0 -181
- orca_sdk/_generated_api_client/api/memoryset/delete_memory_memoryset_name_or_id_memory_memory_id_delete.py +0 -167
- orca_sdk/_generated_api_client/api/memoryset/delete_memoryset_memoryset_name_or_id_delete.py +0 -156
- orca_sdk/_generated_api_client/api/memoryset/get_analysis_memoryset_name_or_id_analysis_analysis_task_id_get.py +0 -169
- orca_sdk/_generated_api_client/api/memoryset/get_memories_memoryset_name_or_id_memories_get_post.py +0 -188
- orca_sdk/_generated_api_client/api/memoryset/get_memory_memoryset_name_or_id_memory_memory_id_get.py +0 -169
- orca_sdk/_generated_api_client/api/memoryset/get_memoryset_memoryset_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/memoryset/insert_memories_gpu_memoryset_name_or_id_memory_post.py +0 -184
- orca_sdk/_generated_api_client/api/memoryset/list_analyses_memoryset_name_or_id_analysis_get.py +0 -260
- orca_sdk/_generated_api_client/api/memoryset/list_memorysets_memoryset_get.py +0 -127
- orca_sdk/_generated_api_client/api/memoryset/memoryset_lookup_gpu_memoryset_name_or_id_lookup_post.py +0 -193
- orca_sdk/_generated_api_client/api/memoryset/query_memoryset_memoryset_name_or_id_memories_post.py +0 -188
- orca_sdk/_generated_api_client/api/memoryset/update_memories_gpu_memoryset_name_or_id_memories_patch.py +0 -191
- orca_sdk/_generated_api_client/api/memoryset/update_memory_gpu_memoryset_name_or_id_memory_patch.py +0 -187
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/embed_with_pretrained_model_gpu_pretrained_embedding_model_model_name_embedding_post.py +0 -188
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/get_pretrained_embedding_model_pretrained_embedding_model_model_name_get.py +0 -157
- orca_sdk/_generated_api_client/api/pretrained_embedding_model/list_pretrained_embedding_models_pretrained_embedding_model_get.py +0 -127
- orca_sdk/_generated_api_client/api/task/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/task/abort_task_task_task_id_abort_delete.py +0 -154
- orca_sdk/_generated_api_client/api/task/get_task_status_task_task_id_status_get.py +0 -156
- orca_sdk/_generated_api_client/api/task/list_tasks_task_get.py +0 -243
- orca_sdk/_generated_api_client/api/telemetry/__init__.py +0 -0
- orca_sdk/_generated_api_client/api/telemetry/drop_feedback_category_with_data_telemetry_feedback_category_name_or_id_delete.py +0 -162
- orca_sdk/_generated_api_client/api/telemetry/get_feedback_category_telemetry_feedback_category_name_or_id_get.py +0 -156
- orca_sdk/_generated_api_client/api/telemetry/get_prediction_telemetry_prediction_prediction_id_get.py +0 -157
- orca_sdk/_generated_api_client/api/telemetry/list_feedback_categories_telemetry_feedback_category_get.py +0 -127
- orca_sdk/_generated_api_client/api/telemetry/list_predictions_telemetry_prediction_post.py +0 -175
- orca_sdk/_generated_api_client/api/telemetry/record_prediction_feedback_telemetry_prediction_feedback_put.py +0 -171
- orca_sdk/_generated_api_client/api/telemetry/update_prediction_telemetry_prediction_prediction_id_patch.py +0 -181
- orca_sdk/_generated_api_client/client.py +0 -216
- orca_sdk/_generated_api_client/errors.py +0 -38
- orca_sdk/_generated_api_client/models/__init__.py +0 -159
- orca_sdk/_generated_api_client/models/analyze_neighbor_labels_result.py +0 -84
- orca_sdk/_generated_api_client/models/api_key_metadata.py +0 -118
- orca_sdk/_generated_api_client/models/base_model.py +0 -55
- orca_sdk/_generated_api_client/models/body_create_datasource_datasource_post.py +0 -176
- orca_sdk/_generated_api_client/models/classification_evaluation_result.py +0 -114
- orca_sdk/_generated_api_client/models/clone_labeled_memoryset_request.py +0 -150
- orca_sdk/_generated_api_client/models/column_info.py +0 -114
- orca_sdk/_generated_api_client/models/column_type.py +0 -14
- orca_sdk/_generated_api_client/models/conflict_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/create_api_key_request.py +0 -99
- orca_sdk/_generated_api_client/models/create_api_key_response.py +0 -126
- orca_sdk/_generated_api_client/models/create_labeled_memoryset_request.py +0 -259
- orca_sdk/_generated_api_client/models/create_rac_model_request.py +0 -209
- orca_sdk/_generated_api_client/models/datasource_metadata.py +0 -142
- orca_sdk/_generated_api_client/models/delete_memories_request.py +0 -70
- orca_sdk/_generated_api_client/models/embed_request.py +0 -127
- orca_sdk/_generated_api_client/models/embedding_finetuning_method.py +0 -9
- orca_sdk/_generated_api_client/models/evaluation_request.py +0 -180
- orca_sdk/_generated_api_client/models/evaluation_response.py +0 -140
- orca_sdk/_generated_api_client/models/feedback_type.py +0 -9
- orca_sdk/_generated_api_client/models/field_validation_error.py +0 -103
- orca_sdk/_generated_api_client/models/filter_item.py +0 -231
- orca_sdk/_generated_api_client/models/filter_item_field_type_0_item.py +0 -15
- orca_sdk/_generated_api_client/models/filter_item_field_type_2_item_type_1.py +0 -16
- orca_sdk/_generated_api_client/models/filter_item_op.py +0 -16
- orca_sdk/_generated_api_client/models/find_duplicates_analysis_result.py +0 -70
- orca_sdk/_generated_api_client/models/finetune_embedding_model_request.py +0 -259
- orca_sdk/_generated_api_client/models/finetune_embedding_model_request_training_args.py +0 -66
- orca_sdk/_generated_api_client/models/finetuned_embedding_model_metadata.py +0 -166
- orca_sdk/_generated_api_client/models/get_memories_request.py +0 -70
- orca_sdk/_generated_api_client/models/internal_server_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/label_class_metrics.py +0 -108
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup.py +0 -274
- orca_sdk/_generated_api_client/models/label_prediction_memory_lookup_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/label_prediction_result.py +0 -101
- orca_sdk/_generated_api_client/models/label_prediction_with_memories_and_feedback.py +0 -232
- orca_sdk/_generated_api_client/models/labeled_memory.py +0 -197
- orca_sdk/_generated_api_client/models/labeled_memory_insert.py +0 -108
- orca_sdk/_generated_api_client/models/labeled_memory_insert_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memory_lookup.py +0 -258
- orca_sdk/_generated_api_client/models/labeled_memory_lookup_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memory_metadata.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memory_metrics.py +0 -277
- orca_sdk/_generated_api_client/models/labeled_memory_update.py +0 -171
- orca_sdk/_generated_api_client/models/labeled_memory_update_metadata_type_0.py +0 -68
- orca_sdk/_generated_api_client/models/labeled_memoryset_metadata.py +0 -195
- orca_sdk/_generated_api_client/models/list_analyses_memoryset_name_or_id_analysis_get_type_type_0.py +0 -9
- orca_sdk/_generated_api_client/models/list_memories_request.py +0 -104
- orca_sdk/_generated_api_client/models/list_predictions_request.py +0 -234
- orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_0.py +0 -9
- orca_sdk/_generated_api_client/models/list_predictions_request_sort_item_item_type_1.py +0 -9
- orca_sdk/_generated_api_client/models/lookup_request.py +0 -81
- orca_sdk/_generated_api_client/models/memoryset_analysis_request.py +0 -83
- orca_sdk/_generated_api_client/models/memoryset_analysis_request_type.py +0 -9
- orca_sdk/_generated_api_client/models/memoryset_analysis_response.py +0 -180
- orca_sdk/_generated_api_client/models/memoryset_analysis_response_config.py +0 -66
- orca_sdk/_generated_api_client/models/memoryset_analysis_response_type.py +0 -9
- orca_sdk/_generated_api_client/models/not_found_error_response.py +0 -100
- orca_sdk/_generated_api_client/models/not_found_error_response_resource_type_0.py +0 -20
- orca_sdk/_generated_api_client/models/prediction_feedback.py +0 -157
- orca_sdk/_generated_api_client/models/prediction_feedback_category.py +0 -115
- orca_sdk/_generated_api_client/models/prediction_feedback_request.py +0 -122
- orca_sdk/_generated_api_client/models/prediction_feedback_result.py +0 -102
- orca_sdk/_generated_api_client/models/prediction_request.py +0 -169
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_metadata.py +0 -97
- orca_sdk/_generated_api_client/models/pretrained_embedding_model_name.py +0 -11
- orca_sdk/_generated_api_client/models/rac_head_type.py +0 -11
- orca_sdk/_generated_api_client/models/rac_model_metadata.py +0 -191
- orca_sdk/_generated_api_client/models/service_unavailable_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/task.py +0 -198
- orca_sdk/_generated_api_client/models/task_status.py +0 -14
- orca_sdk/_generated_api_client/models/task_status_info.py +0 -133
- orca_sdk/_generated_api_client/models/unauthenticated_error_response.py +0 -72
- orca_sdk/_generated_api_client/models/unauthorized_error_response.py +0 -80
- orca_sdk/_generated_api_client/models/unprocessable_input_error_response.py +0 -94
- orca_sdk/_generated_api_client/models/update_prediction_request.py +0 -93
- orca_sdk/_generated_api_client/py.typed +0 -1
- orca_sdk/_generated_api_client/types.py +0 -56
- orca_sdk/_utils/task.py +0 -73
- orca_sdk-0.1.1.dist-info/RECORD +0 -175
orca_sdk/datasource.py
CHANGED
|
@@ -1,30 +1,90 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
+
import os
|
|
4
5
|
import tempfile
|
|
6
|
+
import zipfile
|
|
5
7
|
from datetime import datetime
|
|
8
|
+
from io import BytesIO
|
|
6
9
|
from os import PathLike
|
|
7
10
|
from pathlib import Path
|
|
8
|
-
from typing import cast
|
|
11
|
+
from typing import Literal, Union, cast
|
|
9
12
|
|
|
10
13
|
import pandas as pd
|
|
11
14
|
import pyarrow as pa
|
|
12
|
-
from datasets import Dataset
|
|
15
|
+
from datasets import Dataset, DatasetDict
|
|
16
|
+
from httpx._types import FileTypes # type: ignore
|
|
17
|
+
from pyarrow import parquet
|
|
13
18
|
from torch.utils.data import DataLoader as TorchDataLoader
|
|
14
19
|
from torch.utils.data import Dataset as TorchDataset
|
|
20
|
+
from tqdm.auto import tqdm
|
|
15
21
|
|
|
16
|
-
from ._generated_api_client.api import (
|
|
17
|
-
delete_datasource,
|
|
18
|
-
get_datasource,
|
|
19
|
-
list_datasources,
|
|
20
|
-
)
|
|
21
|
-
from ._generated_api_client.api.datasource.create_datasource_datasource_post import (
|
|
22
|
-
_parse_response as parse_create_response,
|
|
23
|
-
)
|
|
24
|
-
from ._generated_api_client.client import get_client
|
|
25
|
-
from ._generated_api_client.models import ColumnType, DatasourceMetadata
|
|
26
22
|
from ._utils.common import CreateMode, DropMode
|
|
27
|
-
from ._utils.data_parsing import
|
|
23
|
+
from ._utils.data_parsing import hf_dataset_from_torch
|
|
24
|
+
from ._utils.tqdm_file_reader import TqdmFileReader
|
|
25
|
+
from .client import DatasourceMetadata, orca_api
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _upload_files_to_datasource(
|
|
29
|
+
name: str,
|
|
30
|
+
file_paths: list[Path],
|
|
31
|
+
description: str | None = None,
|
|
32
|
+
) -> DatasourceMetadata:
|
|
33
|
+
"""
|
|
34
|
+
Helper function to upload files to create a datasource using manual HTTP requests.
|
|
35
|
+
|
|
36
|
+
This bypasses the generated client because it doesn't handle file uploads properly.
|
|
37
|
+
|
|
38
|
+
Params:
|
|
39
|
+
name: Name for the datasource
|
|
40
|
+
file_paths: List of file paths to upload
|
|
41
|
+
description: Optional description for the datasource
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
Metadata for the created datasource
|
|
45
|
+
"""
|
|
46
|
+
files: list[tuple[Literal["files"], FileTypes]] = []
|
|
47
|
+
|
|
48
|
+
# Calculate total size for all files
|
|
49
|
+
total_size = sum(file_path.stat().st_size for file_path in file_paths)
|
|
50
|
+
|
|
51
|
+
with tqdm(total=total_size, unit="B", unit_scale=True, desc="Uploading") as pbar:
|
|
52
|
+
for file_path in file_paths:
|
|
53
|
+
buffered_reader = open(file_path, "rb")
|
|
54
|
+
tqdm_reader = TqdmFileReader(buffered_reader, pbar)
|
|
55
|
+
files.append(("files", (file_path.name, cast(bytes, tqdm_reader))))
|
|
56
|
+
|
|
57
|
+
# Use manual HTTP request for file uploads
|
|
58
|
+
metadata = orca_api.POST(
|
|
59
|
+
"/datasource/upload",
|
|
60
|
+
files=files,
|
|
61
|
+
data={"name": name, "description": description},
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
return metadata
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _handle_existing_datasource(name: str, if_exists: CreateMode) -> Union["Datasource", None]:
|
|
68
|
+
"""
|
|
69
|
+
Helper function to handle the common pattern of checking if a datasource exists
|
|
70
|
+
and taking action based on the if_exists parameter.
|
|
71
|
+
|
|
72
|
+
Params:
|
|
73
|
+
name: Name of the datasource to check
|
|
74
|
+
if_exists: What to do if a datasource with the same name already exists
|
|
75
|
+
|
|
76
|
+
Returns:
|
|
77
|
+
Datasource instance if opening existing, None if should proceed with creation
|
|
78
|
+
|
|
79
|
+
Raises:
|
|
80
|
+
ValueError: If the datasource already exists and if_exists is "error"
|
|
81
|
+
"""
|
|
82
|
+
if Datasource.exists(name):
|
|
83
|
+
if if_exists == "error":
|
|
84
|
+
raise ValueError(f"Dataset with name {name} already exists")
|
|
85
|
+
elif if_exists == "open":
|
|
86
|
+
return Datasource.open(name)
|
|
87
|
+
return None
|
|
28
88
|
|
|
29
89
|
|
|
30
90
|
class Datasource:
|
|
@@ -37,6 +97,7 @@ class Datasource:
|
|
|
37
97
|
Attributes:
|
|
38
98
|
id: Unique identifier for the datasource
|
|
39
99
|
name: Unique name of the datasource
|
|
100
|
+
description: Optional description of the datasource
|
|
40
101
|
length: Number of rows in the datasource
|
|
41
102
|
created_at: When the datasource was created
|
|
42
103
|
columns: Dictionary of column names and types
|
|
@@ -44,6 +105,7 @@ class Datasource:
|
|
|
44
105
|
|
|
45
106
|
id: str
|
|
46
107
|
name: str
|
|
108
|
+
description: str | None
|
|
47
109
|
length: int
|
|
48
110
|
created_at: datetime
|
|
49
111
|
updated_at: datetime
|
|
@@ -51,20 +113,19 @@ class Datasource:
|
|
|
51
113
|
|
|
52
114
|
def __init__(self, metadata: DatasourceMetadata):
|
|
53
115
|
# for internal use only, do not document
|
|
54
|
-
self.id = metadata
|
|
55
|
-
self.name = metadata
|
|
56
|
-
self.length = metadata
|
|
57
|
-
self.created_at = metadata
|
|
58
|
-
self.updated_at = metadata
|
|
116
|
+
self.id = metadata["id"]
|
|
117
|
+
self.name = metadata["name"]
|
|
118
|
+
self.length = metadata["length"]
|
|
119
|
+
self.created_at = datetime.fromisoformat(metadata["created_at"])
|
|
120
|
+
self.updated_at = datetime.fromisoformat(metadata["updated_at"])
|
|
121
|
+
self.description = metadata["description"]
|
|
59
122
|
self.columns = {
|
|
60
|
-
column
|
|
61
|
-
f"enum({', '.join(f'{option!r}' for option in column
|
|
62
|
-
if column
|
|
63
|
-
else "str"
|
|
64
|
-
if column.type == ColumnType.STRING
|
|
65
|
-
else column.type.value.lower()
|
|
123
|
+
column["name"]: (
|
|
124
|
+
f"enum({', '.join(f'{option!r}' for option in column['enum_options'] or []) if 'enum_options' in column else ''})"
|
|
125
|
+
if column["type"] == "ENUM"
|
|
126
|
+
else "str" if column["type"] == "STRING" else column["type"].lower()
|
|
66
127
|
)
|
|
67
|
-
for column in metadata
|
|
128
|
+
for column in metadata["columns"]
|
|
68
129
|
}
|
|
69
130
|
|
|
70
131
|
def __eq__(self, other) -> bool:
|
|
@@ -82,7 +143,9 @@ class Datasource:
|
|
|
82
143
|
)
|
|
83
144
|
|
|
84
145
|
@classmethod
|
|
85
|
-
def from_hf_dataset(
|
|
146
|
+
def from_hf_dataset(
|
|
147
|
+
cls, name: str, dataset: Dataset, if_exists: CreateMode = "error", description: str | None = None
|
|
148
|
+
) -> Datasource:
|
|
86
149
|
"""
|
|
87
150
|
Create a new datasource from a Hugging Face Dataset
|
|
88
151
|
|
|
@@ -91,6 +154,7 @@ class Datasource:
|
|
|
91
154
|
dataset: The Hugging Face Dataset to create the datasource from
|
|
92
155
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
93
156
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
157
|
+
description: Optional description for the datasource
|
|
94
158
|
|
|
95
159
|
Returns:
|
|
96
160
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -98,32 +162,54 @@ class Datasource:
|
|
|
98
162
|
Raises:
|
|
99
163
|
ValueError: If the datasource already exists and if_exists is `"error"`
|
|
100
164
|
"""
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
if
|
|
104
|
-
|
|
105
|
-
raise ValueError(f"Dataset with name {name} already exists")
|
|
106
|
-
elif if_exists == "open":
|
|
107
|
-
return cls.open(name)
|
|
165
|
+
# Check if datasource already exists and handle accordingly
|
|
166
|
+
existing = _handle_existing_datasource(name, if_exists)
|
|
167
|
+
if existing is not None:
|
|
168
|
+
return existing
|
|
108
169
|
|
|
109
170
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
110
171
|
dataset.save_to_disk(tmp_dir)
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
metadata = parse_create_response(
|
|
118
|
-
response=client.get_httpx_client().request(
|
|
119
|
-
method="post",
|
|
120
|
-
url="/datasource/",
|
|
121
|
-
files=files,
|
|
122
|
-
data={"name": name},
|
|
123
|
-
)
|
|
124
|
-
)
|
|
172
|
+
|
|
173
|
+
# Get all file paths in the directory
|
|
174
|
+
file_paths = list(Path(tmp_dir).iterdir())
|
|
175
|
+
|
|
176
|
+
# Use the helper function to upload files
|
|
177
|
+
metadata = _upload_files_to_datasource(name, file_paths, description)
|
|
125
178
|
return cls(metadata=metadata)
|
|
126
179
|
|
|
180
|
+
@classmethod
|
|
181
|
+
def from_hf_dataset_dict(
|
|
182
|
+
cls,
|
|
183
|
+
name: str,
|
|
184
|
+
dataset_dict: DatasetDict,
|
|
185
|
+
if_exists: CreateMode = "error",
|
|
186
|
+
description: dict[str, str | None] | str | None = None,
|
|
187
|
+
) -> dict[str, Datasource]:
|
|
188
|
+
"""
|
|
189
|
+
Create datasources from a Hugging Face DatasetDict
|
|
190
|
+
|
|
191
|
+
Params:
|
|
192
|
+
name: Name prefix for the new datasources, will be suffixed with the dataset name
|
|
193
|
+
dataset_dict: The Hugging Face DatasetDict to create the datasources from
|
|
194
|
+
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
195
|
+
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
196
|
+
description: Optional description for the datasources, can be a string or a dictionary of dataset names to descriptions
|
|
197
|
+
|
|
198
|
+
Returns:
|
|
199
|
+
A dictionary of datasource handles, keyed by the dataset name
|
|
200
|
+
|
|
201
|
+
Raises:
|
|
202
|
+
ValueError: If a datasource already exists and if_exists is `"error"`
|
|
203
|
+
"""
|
|
204
|
+
if description is None or isinstance(description, str):
|
|
205
|
+
description = {dataset_name: description for dataset_name in dataset_dict.keys()}
|
|
206
|
+
return {
|
|
207
|
+
dataset_name: cls.from_hf_dataset(
|
|
208
|
+
f"{name}_{dataset_name}", dataset, if_exists=if_exists, description=description[dataset_name]
|
|
209
|
+
)
|
|
210
|
+
for dataset_name, dataset in dataset_dict.items()
|
|
211
|
+
}
|
|
212
|
+
|
|
127
213
|
@classmethod
|
|
128
214
|
def from_pytorch(
|
|
129
215
|
cls,
|
|
@@ -131,6 +217,7 @@ class Datasource:
|
|
|
131
217
|
torch_data: TorchDataLoader | TorchDataset,
|
|
132
218
|
column_names: list[str] | None = None,
|
|
133
219
|
if_exists: CreateMode = "error",
|
|
220
|
+
description: str | None = None,
|
|
134
221
|
) -> Datasource:
|
|
135
222
|
"""
|
|
136
223
|
Create a new datasource from a PyTorch DataLoader or Dataset
|
|
@@ -142,6 +229,7 @@ class Datasource:
|
|
|
142
229
|
argument must be provided to specify the names of the columns.
|
|
143
230
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
144
231
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
232
|
+
description: Optional description for the datasource
|
|
145
233
|
|
|
146
234
|
Returns:
|
|
147
235
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -150,10 +238,12 @@ class Datasource:
|
|
|
150
238
|
ValueError: If the datasource already exists and if_exists is `"error"`
|
|
151
239
|
"""
|
|
152
240
|
hf_dataset = hf_dataset_from_torch(torch_data, column_names=column_names)
|
|
153
|
-
return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists)
|
|
241
|
+
return cls.from_hf_dataset(name, hf_dataset, if_exists=if_exists, description=description)
|
|
154
242
|
|
|
155
243
|
@classmethod
|
|
156
|
-
def from_list(
|
|
244
|
+
def from_list(
|
|
245
|
+
cls, name: str, data: list[dict], if_exists: CreateMode = "error", description: str | None = None
|
|
246
|
+
) -> Datasource:
|
|
157
247
|
"""
|
|
158
248
|
Create a new datasource from a list of dictionaries
|
|
159
249
|
|
|
@@ -162,6 +252,7 @@ class Datasource:
|
|
|
162
252
|
data: The list of dictionaries to create the datasource from
|
|
163
253
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
164
254
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
255
|
+
description: Optional description for the datasource
|
|
165
256
|
|
|
166
257
|
Returns:
|
|
167
258
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -172,11 +263,21 @@ class Datasource:
|
|
|
172
263
|
Examples:
|
|
173
264
|
>>> Datasource.from_list("my_datasource", [{"text": "Hello, world!", "label": 1}, {"text": "Goodbye", "label": 0}])
|
|
174
265
|
"""
|
|
175
|
-
|
|
176
|
-
|
|
266
|
+
# Check if datasource already exists and handle accordingly
|
|
267
|
+
existing = _handle_existing_datasource(name, if_exists)
|
|
268
|
+
if existing is not None:
|
|
269
|
+
return existing
|
|
270
|
+
|
|
271
|
+
metadata = orca_api.POST(
|
|
272
|
+
"/datasource",
|
|
273
|
+
json={"name": name, "description": description, "content": data},
|
|
274
|
+
)
|
|
275
|
+
return cls(metadata=metadata)
|
|
177
276
|
|
|
178
277
|
@classmethod
|
|
179
|
-
def from_dict(
|
|
278
|
+
def from_dict(
|
|
279
|
+
cls, name: str, data: dict, if_exists: CreateMode = "error", description: str | None = None
|
|
280
|
+
) -> Datasource:
|
|
180
281
|
"""
|
|
181
282
|
Create a new datasource from a dictionary of columns
|
|
182
283
|
|
|
@@ -185,6 +286,7 @@ class Datasource:
|
|
|
185
286
|
data: The dictionary of columns to create the datasource from
|
|
186
287
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
187
288
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
289
|
+
description: Optional description for the datasource
|
|
188
290
|
|
|
189
291
|
Returns:
|
|
190
292
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -195,11 +297,21 @@ class Datasource:
|
|
|
195
297
|
Examples:
|
|
196
298
|
>>> Datasource.from_dict("my_datasource", {"text": ["Hello, world!", "Goodbye"], "label": [1, 0]})
|
|
197
299
|
"""
|
|
198
|
-
|
|
199
|
-
|
|
300
|
+
# Check if datasource already exists and handle accordingly
|
|
301
|
+
existing = _handle_existing_datasource(name, if_exists)
|
|
302
|
+
if existing is not None:
|
|
303
|
+
return existing
|
|
304
|
+
|
|
305
|
+
metadata = orca_api.POST(
|
|
306
|
+
"/datasource",
|
|
307
|
+
json={"name": name, "description": description, "content": data},
|
|
308
|
+
)
|
|
309
|
+
return cls(metadata=metadata)
|
|
200
310
|
|
|
201
311
|
@classmethod
|
|
202
|
-
def from_pandas(
|
|
312
|
+
def from_pandas(
|
|
313
|
+
cls, name: str, dataframe: pd.DataFrame, if_exists: CreateMode = "error", description: str | None = None
|
|
314
|
+
) -> Datasource:
|
|
203
315
|
"""
|
|
204
316
|
Create a new datasource from a pandas DataFrame
|
|
205
317
|
|
|
@@ -208,6 +320,7 @@ class Datasource:
|
|
|
208
320
|
dataframe: The pandas DataFrame to create the datasource from
|
|
209
321
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
210
322
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
323
|
+
description: Optional description for the datasource
|
|
211
324
|
|
|
212
325
|
Returns:
|
|
213
326
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -215,11 +328,13 @@ class Datasource:
|
|
|
215
328
|
Raises:
|
|
216
329
|
ValueError: If the datasource already exists and if_exists is `"error"`
|
|
217
330
|
"""
|
|
218
|
-
|
|
219
|
-
return cls.from_hf_dataset(name,
|
|
331
|
+
dataset = Dataset.from_pandas(dataframe)
|
|
332
|
+
return cls.from_hf_dataset(name, dataset, if_exists=if_exists, description=description)
|
|
220
333
|
|
|
221
334
|
@classmethod
|
|
222
|
-
def from_arrow(
|
|
335
|
+
def from_arrow(
|
|
336
|
+
cls, name: str, pyarrow_table: pa.Table, if_exists: CreateMode = "error", description: str | None = None
|
|
337
|
+
) -> Datasource:
|
|
223
338
|
"""
|
|
224
339
|
Create a new datasource from a pyarrow Table
|
|
225
340
|
|
|
@@ -228,6 +343,7 @@ class Datasource:
|
|
|
228
343
|
pyarrow_table: The pyarrow Table to create the datasource from
|
|
229
344
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
230
345
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
346
|
+
description: Optional description for the datasource
|
|
231
347
|
|
|
232
348
|
Returns:
|
|
233
349
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -235,11 +351,28 @@ class Datasource:
|
|
|
235
351
|
Raises:
|
|
236
352
|
ValueError: If the datasource already exists and if_exists is `"error"`
|
|
237
353
|
"""
|
|
238
|
-
|
|
239
|
-
|
|
354
|
+
# Check if datasource already exists and handle accordingly
|
|
355
|
+
existing = _handle_existing_datasource(name, if_exists)
|
|
356
|
+
if existing is not None:
|
|
357
|
+
return existing
|
|
358
|
+
|
|
359
|
+
# Write to bytes buffer
|
|
360
|
+
buffer = BytesIO()
|
|
361
|
+
parquet.write_table(pyarrow_table, buffer)
|
|
362
|
+
parquet_bytes = buffer.getvalue()
|
|
363
|
+
|
|
364
|
+
metadata = orca_api.POST(
|
|
365
|
+
"/datasource/upload",
|
|
366
|
+
files=[("files", ("data.parquet", parquet_bytes))],
|
|
367
|
+
data={"name": name, "description": description},
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
return cls(metadata=metadata)
|
|
240
371
|
|
|
241
372
|
@classmethod
|
|
242
|
-
def from_disk(
|
|
373
|
+
def from_disk(
|
|
374
|
+
cls, name: str, file_path: str | PathLike, if_exists: CreateMode = "error", description: str | None = None
|
|
375
|
+
) -> Datasource:
|
|
243
376
|
"""
|
|
244
377
|
Create a new datasource from a local file
|
|
245
378
|
|
|
@@ -256,6 +389,7 @@ class Datasource:
|
|
|
256
389
|
|
|
257
390
|
if_exists: What to do if a datasource with the same name already exists, defaults to
|
|
258
391
|
`"error"`. Other option is `"open"` to open the existing datasource.
|
|
392
|
+
description: Optional description for the datasource
|
|
259
393
|
|
|
260
394
|
Returns:
|
|
261
395
|
A handle to the new datasource in the OrcaCloud
|
|
@@ -263,16 +397,31 @@ class Datasource:
|
|
|
263
397
|
Raises:
|
|
264
398
|
ValueError: If the datasource already exists and if_exists is `"error"`
|
|
265
399
|
"""
|
|
266
|
-
|
|
267
|
-
|
|
400
|
+
# Check if datasource already exists and handle accordingly
|
|
401
|
+
existing = _handle_existing_datasource(name, if_exists)
|
|
402
|
+
if existing is not None:
|
|
403
|
+
return existing
|
|
404
|
+
|
|
405
|
+
file_path = Path(file_path)
|
|
406
|
+
|
|
407
|
+
# For dataset directories, use the upload endpoint with multiple files
|
|
408
|
+
if file_path.is_dir():
|
|
409
|
+
return cls.from_hf_dataset(
|
|
410
|
+
name, Dataset.load_from_disk(file_path), if_exists=if_exists, description=description
|
|
411
|
+
)
|
|
412
|
+
|
|
413
|
+
# For single files, use the helper function to upload files
|
|
414
|
+
metadata = _upload_files_to_datasource(name, [file_path], description)
|
|
415
|
+
|
|
416
|
+
return cls(metadata=metadata)
|
|
268
417
|
|
|
269
418
|
@classmethod
|
|
270
|
-
def open(cls,
|
|
419
|
+
def open(cls, name_or_id: str) -> Datasource:
|
|
271
420
|
"""
|
|
272
421
|
Get a handle to a datasource by name or id in the OrcaCloud
|
|
273
422
|
|
|
274
423
|
Params:
|
|
275
|
-
|
|
424
|
+
name_or_id: The name or unique identifier of the datasource to get
|
|
276
425
|
|
|
277
426
|
Returns:
|
|
278
427
|
A handle to the existing datasource in the OrcaCloud
|
|
@@ -280,7 +429,7 @@ class Datasource:
|
|
|
280
429
|
Raises:
|
|
281
430
|
LookupError: If the datasource does not exist
|
|
282
431
|
"""
|
|
283
|
-
return cls(
|
|
432
|
+
return cls(orca_api.GET("/datasource/{name_or_id}", params={"name_or_id": name_or_id}))
|
|
284
433
|
|
|
285
434
|
@classmethod
|
|
286
435
|
def exists(cls, name_or_id: str) -> bool:
|
|
@@ -307,7 +456,7 @@ class Datasource:
|
|
|
307
456
|
Returns:
|
|
308
457
|
A list of all datasource handles in the OrcaCloud
|
|
309
458
|
"""
|
|
310
|
-
return [cls(metadata) for metadata in
|
|
459
|
+
return [cls(metadata) for metadata in orca_api.GET("/datasource")]
|
|
311
460
|
|
|
312
461
|
@classmethod
|
|
313
462
|
def drop(cls, name_or_id: str, if_not_exists: DropMode = "error") -> None:
|
|
@@ -323,7 +472,7 @@ class Datasource:
|
|
|
323
472
|
LookupError: If the datasource does not exist and if_not_exists is `"error"`
|
|
324
473
|
"""
|
|
325
474
|
try:
|
|
326
|
-
|
|
475
|
+
orca_api.DELETE("/datasource/{name_or_id}", params={"name_or_id": name_or_id})
|
|
327
476
|
logging.info(f"Deleted datasource {name_or_id}")
|
|
328
477
|
except LookupError:
|
|
329
478
|
if if_not_exists == "error":
|
|
@@ -331,3 +480,45 @@ class Datasource:
|
|
|
331
480
|
|
|
    def __len__(self) -> int:
        """Support `len(datasource)` by delegating to `self.length`."""
        return self.length
|
|
483
|
+
|
|
484
|
+
def download(
|
|
485
|
+
self, output_dir: str | PathLike, file_type: Literal["hf_dataset", "json", "csv"] = "hf_dataset"
|
|
486
|
+
) -> None:
|
|
487
|
+
"""
|
|
488
|
+
Download the datasource to a specified path in the specified format type
|
|
489
|
+
|
|
490
|
+
Params:
|
|
491
|
+
output_dir: The local directory where the downloaded file will be saved.
|
|
492
|
+
file_type: The type of file to download.
|
|
493
|
+
|
|
494
|
+
Returns:
|
|
495
|
+
None
|
|
496
|
+
"""
|
|
497
|
+
extension = "zip" if file_type == "hf_dataset" else file_type
|
|
498
|
+
output_path = Path(output_dir) / f"{self.name}.{extension}"
|
|
499
|
+
with open(output_path, "wb") as download_file:
|
|
500
|
+
with orca_api.stream("GET", f"/datasource/{self.id}/download", params={"file_type": file_type}) as response:
|
|
501
|
+
total_chunks = int(response.headers["X-Total-Chunks"]) if "X-Total-Chunks" in response.headers else None
|
|
502
|
+
with tqdm(desc="Downloading", total=total_chunks, disable=total_chunks is None) as progress:
|
|
503
|
+
for chunk in response.iter_bytes():
|
|
504
|
+
download_file.write(chunk)
|
|
505
|
+
progress.update(1)
|
|
506
|
+
|
|
507
|
+
# extract the zip file
|
|
508
|
+
if extension == "zip":
|
|
509
|
+
extract_dir = Path(output_dir) / self.name
|
|
510
|
+
with zipfile.ZipFile(output_path, "r") as zip_ref:
|
|
511
|
+
zip_ref.extractall(extract_dir)
|
|
512
|
+
output_path.unlink() # Remove the zip file after extraction
|
|
513
|
+
logging.info(f"Downloaded {extract_dir}")
|
|
514
|
+
else:
|
|
515
|
+
logging.info(f"Downloaded {output_path}")
|
|
516
|
+
|
|
517
|
+
def to_list(self) -> list[dict]:
|
|
518
|
+
"""
|
|
519
|
+
Convert the datasource to a list of dictionaries.
|
|
520
|
+
|
|
521
|
+
Returns:
|
|
522
|
+
A list of dictionaries representation of the datasource.
|
|
523
|
+
"""
|
|
524
|
+
return orca_api.GET("/datasource/{name_or_id}/download", params={"name_or_id": self.id, "file_type": "json"})
|