orca-sdk 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orca_sdk/__init__.py +3 -3
- orca_sdk/_utils/auth.py +2 -3
- orca_sdk/_utils/common.py +24 -1
- orca_sdk/_utils/torch_parsing.py +77 -0
- orca_sdk/_utils/torch_parsing_test.py +142 -0
- orca_sdk/async_client.py +156 -4
- orca_sdk/classification_model.py +202 -65
- orca_sdk/classification_model_test.py +16 -3
- orca_sdk/client.py +156 -4
- orca_sdk/conftest.py +10 -9
- orca_sdk/datasource.py +31 -13
- orca_sdk/embedding_model.py +8 -31
- orca_sdk/embedding_model_test.py +1 -1
- orca_sdk/memoryset.py +236 -321
- orca_sdk/memoryset_test.py +39 -13
- orca_sdk/regression_model.py +185 -64
- orca_sdk/regression_model_test.py +18 -3
- orca_sdk/telemetry.py +15 -6
- {orca_sdk-0.1.11.dist-info → orca_sdk-0.1.12.dist-info}/METADATA +3 -5
- orca_sdk-0.1.12.dist-info/RECORD +38 -0
- orca_sdk/_shared/__init__.py +0 -10
- orca_sdk/_shared/metrics.py +0 -634
- orca_sdk/_shared/metrics_test.py +0 -570
- orca_sdk/_utils/data_parsing.py +0 -137
- orca_sdk/_utils/data_parsing_disk_test.py +0 -91
- orca_sdk/_utils/data_parsing_torch_test.py +0 -159
- orca_sdk-0.1.11.dist-info/RECORD +0 -42
- {orca_sdk-0.1.11.dist-info → orca_sdk-0.1.12.dist-info}/WHEEL +0 -0
orca_sdk/_utils/data_parsing_disk_test.py
DELETED

@@ -1,91 +0,0 @@
-import json
-import pickle
-import tempfile
-
-from datasets import Dataset
-
-from .data_parsing import hf_dataset_from_disk
-
-
-def test_hf_dataset_from_disk_pickle_list():
-    with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
-        # Given a pickle file with test data that is a list
-        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-        with open(temp_file.name, "wb") as f:
-            pickle.dump(test_data, f)
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_pickle_dict():
-    with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
-        # Given a pickle file with test data that is a dict
-        test_data = {"value": [f"test_{i}" for i in range(30)], "label": [i % 2 for i in range(30)]}
-        with open(temp_file.name, "wb") as f:
-            pickle.dump(test_data, f)
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_json():
-    with tempfile.NamedTemporaryFile(suffix=".json") as temp_file:
-        # Given a JSON file with test data
-        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-        with open(temp_file.name, "w") as f:
-            json.dump(test_data, f)
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_jsonl():
-    with tempfile.NamedTemporaryFile(suffix=".jsonl") as temp_file:
-        # Given a JSONL file with test data
-        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-        with open(temp_file.name, "w") as f:
-            for item in test_data:
-                f.write(json.dumps(item) + "\n")
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_csv():
-    with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
-        # Given a CSV file with test data
-        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-        with open(temp_file.name, "w") as f:
-            f.write("value,label\n")
-            for item in test_data:
-                f.write(f"{item['value']},{item['label']}\n")
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_parquet():
-    with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
-        # Given a Parquet file with test data
-        data = {
-            "value": [f"test_{i}" for i in range(30)],
-            "label": [i % 2 for i in range(30)],
-        }
-        dataset = Dataset.from_dict(data)
-        dataset.to_parquet(temp_file.name)
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
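For context, the deleted tests above pin down the contract of the removed hf_dataset_from_disk helper: it loads a pickle, JSON, JSONL, CSV, or Parquet file into a Hugging Face Dataset based on the file extension. The implementation itself (data_parsing.py, also deleted in this release) is not shown in this diff; the following is only a minimal sketch of the behavior the tests assert, written against the public datasets API.

import pickle
from pathlib import Path

from datasets import Dataset


def hf_dataset_from_disk(path: str) -> Dataset:
    # Sketch only: dispatch on file extension, as the deleted tests imply.
    suffix = Path(path).suffix
    if suffix == ".pkl":
        with open(path, "rb") as f:
            data = pickle.load(f)
        # The tests cover both row-oriented pickles (list of dicts) and
        # column-oriented pickles (dict of lists).
        return Dataset.from_list(data) if isinstance(data, list) else Dataset.from_dict(data)
    if suffix in (".json", ".jsonl"):
        # The datasets JSON loader accepts both JSON arrays and JSON Lines.
        return Dataset.from_json(path)
    if suffix == ".csv":
        return Dataset.from_csv(path)
    if suffix == ".parquet":
        return Dataset.from_parquet(path)
    raise ValueError(f"unsupported file type: {suffix}")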
orca_sdk/_utils/data_parsing_torch_test.py
DELETED

@@ -1,159 +0,0 @@
-from collections import namedtuple
-from dataclasses import dataclass
-
-import pytest
-from datasets import Dataset
-from datasets.exceptions import DatasetGenerationError
-
-from ..conftest import SAMPLE_DATA
-from .data_parsing import hf_dataset_from_torch
-
-pytest.importorskip("torch")
-
-from torch.utils.data import DataLoader as TorchDataLoader  # noqa: E402
-from torch.utils.data import Dataset as TorchDataset  # noqa: E402
-
-
-class PytorchDictDataset(TorchDataset):
-    def __init__(self):
-        self.data = SAMPLE_DATA
-
-    def __getitem__(self, i):
-        return self.data[i]
-
-    def __len__(self):
-        return len(self.data)
-
-
-class PytorchTupleDataset(TorchDataset):
-    def __init__(self):
-        self.data = SAMPLE_DATA
-
-    def __getitem__(self, i):
-        return self.data[i]["value"], self.data[i]["label"]
-
-    def __len__(self):
-        return len(self.data)
-
-
-DatasetTuple = namedtuple("DatasetTuple", ["value", "label"])
-
-
-class PytorchNamedTupleDataset(TorchDataset):
-    def __init__(self):
-        self.data = SAMPLE_DATA
-
-    def __getitem__(self, i):
-        return DatasetTuple(self.data[i]["value"], self.data[i]["label"])
-
-    def __len__(self):
-        return len(self.data)
-
-
-@dataclass
-class DatasetItem:
-    text: str
-    label: int
-
-
-class PytorchDataclassDataset(TorchDataset):
-    def __init__(self):
-        self.data = SAMPLE_DATA
-
-    def __getitem__(self, i):
-        return DatasetItem(text=self.data[i]["value"], label=self.data[i]["label"])
-
-    def __len__(self):
-        return len(self.data)
-
-
-class PytorchInvalidDataset(TorchDataset):
-    def __init__(self):
-        self.data = SAMPLE_DATA
-
-    def __getitem__(self, i):
-        return [self.data[i]["value"], self.data[i]["label"]]
-
-    def __len__(self):
-        return len(self.data)
-
-
-def test_hf_dataset_from_torch_dict():
-    # Given a Pytorch dataset that returns a dictionary for each item
-    dataset = PytorchDictDataset()
-    hf_dataset = hf_dataset_from_torch(dataset)
-    # Then the HF dataset should be created successfully
-    assert isinstance(hf_dataset, Dataset)
-    assert len(hf_dataset) == len(dataset)
-    assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id", "partition_id"}
-
-
-def test_hf_dataset_from_torch_tuple():
-    # Given a Pytorch dataset that returns a tuple for each item
-    dataset = PytorchTupleDataset()
-    # And the correct number of column names passed in
-    hf_dataset = hf_dataset_from_torch(dataset, column_names=["value", "label"])
-    # Then the HF dataset should be created successfully
-    assert isinstance(hf_dataset, Dataset)
-    assert len(hf_dataset) == len(dataset)
-    assert hf_dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_torch_tuple_error():
-    # Given a Pytorch dataset that returns a tuple for each item
-    dataset = PytorchTupleDataset()
-    # Then the HF dataset should raise an error if no column names are passed in
-    with pytest.raises(DatasetGenerationError):
-        hf_dataset_from_torch(dataset)
-
-
-def test_hf_dataset_from_torch_tuple_error_not_enough_columns():
-    # Given a Pytorch dataset that returns a tuple for each item
-    dataset = PytorchTupleDataset()
-    # Then the HF dataset should raise an error if not enough column names are passed in
-    with pytest.raises(DatasetGenerationError):
-        hf_dataset_from_torch(dataset, column_names=["value"])
-
-
-def test_hf_dataset_from_torch_named_tuple():
-    # Given a Pytorch dataset that returns a namedtuple for each item
-    dataset = PytorchNamedTupleDataset()
-    # And no column names are passed in
-    hf_dataset = hf_dataset_from_torch(dataset)
-    # Then the HF dataset should be created successfully
-    assert isinstance(hf_dataset, Dataset)
-    assert len(hf_dataset) == len(dataset)
-    assert hf_dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_torch_dataclass():
-    # Given a Pytorch dataset that returns a dataclass for each item
-    dataset = PytorchDataclassDataset()
-    hf_dataset = hf_dataset_from_torch(dataset)
-    # Then the HF dataset should be created successfully
-    assert isinstance(hf_dataset, Dataset)
-    assert len(hf_dataset) == len(dataset)
-    assert hf_dataset.column_names == ["text", "label"]
-
-
-def test_hf_dataset_from_torch_invalid_dataset():
-    # Given a Pytorch dataset that returns a list for each item
-    dataset = PytorchInvalidDataset()
-    # Then the HF dataset should raise an error
-    with pytest.raises(DatasetGenerationError):
-        hf_dataset_from_torch(dataset)
-
-
-def test_hf_dataset_from_torchdataloader():
-    # Given a Pytorch dataloader that returns a column-oriented batch of items
-    dataset = PytorchDictDataset()
-
-    def collate_fn(x: list[dict]):
-        return {"value": [item["value"] for item in x], "label": [item["label"] for item in x]}
-
-    dataloader = TorchDataLoader(dataset, batch_size=3, collate_fn=collate_fn)
-    hf_dataset = hf_dataset_from_torch(dataloader)
-    # Then the HF dataset should be created successfully
-    assert isinstance(hf_dataset, Dataset)
-    assert len(hf_dataset) == len(dataset)
-    assert hf_dataset.column_names == ["value", "label"]
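Likewise, the torch tests above pin down the removed hf_dataset_from_torch helper: dict, namedtuple, and dataclass items convert directly, plain tuples require matching column_names, lists are rejected, and DataLoaders that yield column-oriented batches are flattened back into rows. Per the file list, this functionality appears to have been reworked into the new _utils/torch_parsing.py. The deleted implementation is not shown here; what follows is only a hedged sketch of the asserted behavior, relying on the fact that errors raised while generating rows surface as DatasetGenerationError from Dataset.from_generator.

import dataclasses

from datasets import Dataset
from torch.utils.data import DataLoader as TorchDataLoader


def _to_row(item, column_names):
    # Normalize one dataset item into a column -> value dict.
    if isinstance(item, dict):
        return item
    if isinstance(item, tuple) and hasattr(item, "_asdict"):  # namedtuple
        return item._asdict()
    if dataclasses.is_dataclass(item):
        return dataclasses.asdict(item)
    if isinstance(item, tuple):
        if column_names is None or len(column_names) != len(item):
            # Wrapped in DatasetGenerationError by Dataset.from_generator.
            raise ValueError("column_names must match the tuple length")
        return dict(zip(column_names, item))
    raise ValueError(f"unsupported item type: {type(item)}")


def hf_dataset_from_torch(data, column_names=None) -> Dataset:
    def generate():
        if isinstance(data, TorchDataLoader):
            # DataLoader batches are column-oriented ({"col": [values]}),
            # so flatten each batch back into per-row dicts.
            for batch in data:
                keys = list(batch.keys())
                for values in zip(*(batch[k] for k in keys)):
                    yield dict(zip(keys, values))
        else:
            # Map-style torch datasets expose __getitem__/__len__.
            for i in range(len(data)):
                yield _to_row(data[i], column_names)

    return Dataset.from_generator(generate)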
orca_sdk-0.1.11.dist-info/RECORD
DELETED
@@ -1,42 +0,0 @@
-orca_sdk/__init__.py,sha256=xyjNwkLQXaX8A-UYgGwYDjv2btOXArT_yiMTfmW7KA8,1003
-orca_sdk/_shared/__init__.py,sha256=3Kt0Hu3QLI5FEp9nqGTxqAm3hAoBJKcagfaGQZ-lbJQ,223
-orca_sdk/_shared/metrics.py,sha256=faeL1B1ftmns1ikfKrIlU3xOn6j0iAGLNUupxvAFza8,24968
-orca_sdk/_shared/metrics_test.py,sha256=vDIXoj8EuuLcdPJz_7EiVPgQ-FXiVT81JG30jxsg9HM,20752
-orca_sdk/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-orca_sdk/_utils/analysis_ui.py,sha256=gXwCR972kffPjPyajcy7m8HgPARSfhXwBJ0rnOmu_7k,9418
-orca_sdk/_utils/analysis_ui_style.css,sha256=q_ba_-_KtgztepHg829zLzypaxKayl7ySC1-oYDzV3k,836
-orca_sdk/_utils/auth.py,sha256=nC252O171_3_wn4KBAN7kg8GNvoZFiQ5Xtzkrm5dWDo,2645
-orca_sdk/_utils/auth_test.py,sha256=ygVWv1Ex53LaxIP7p2hzPHl8l9qYyBD5IGmEFJMps6s,1056
-orca_sdk/_utils/common.py,sha256=wUm2pNDWytEecC5WiDWd02-yCZw3Akx0bIutG4lHsFA,805
-orca_sdk/_utils/data_parsing.py,sha256=tTZwGJQ1xkl12gyq6gQ0dRIgDFaNDy3Sde8A_SIfmxo,5726
-orca_sdk/_utils/data_parsing_disk_test.py,sha256=0IxyUNnawlNkqFwVEzIfXzuOympYkZRUP0rgxXhUrW4,3781
-orca_sdk/_utils/data_parsing_torch_test.py,sha256=LTCd1H9V9OtBARv_SmyLEYMeAYPohf8IHJjUzxenEC8,5155
-orca_sdk/_utils/pagination.py,sha256=986z0QPZixrZeurJWorF6eMgnTRdDF84AagEA6qNbMw,4245
-orca_sdk/_utils/pagination_test.py,sha256=BUylCrcHnwoKEBmMUzVr0lwLpA35ivcCwdBK4rMw9y8,4887
-orca_sdk/_utils/prediction_result_ui.css,sha256=sqBlkRLnovb5X5EcUDdB6iGpH63nVRlTW4uAmXuD0WM,258
-orca_sdk/_utils/prediction_result_ui.py,sha256=N4Cj7PUEx2UeV-4Mhk-ZaPegilssaKtElSACVhVrx1w,4965
-orca_sdk/_utils/tqdm_file_reader.py,sha256=Lw7Cg1UgNuRUoN6jjqZb-IlV00H-kbRcrZLdudr1GxE,324
-orca_sdk/_utils/value_parser.py,sha256=pw-suYXKuZQ7mGV-QUFcD3-fmp6lJKjnyQ3f_Hb3Gg8,2379
-orca_sdk/_utils/value_parser_test.py,sha256=dKni8W7KV2RgwuwK0ZN1SN-lH-W4DSSYkHdIXz52kys,1210
-orca_sdk/async_client.py,sha256=qaDmmXrCjPql_y-_kZMskFR0OlE8n1Y_pxtC1JwGbF0,138668
-orca_sdk/classification_model.py,sha256=rb1TmjCMrXqhpu4YKzT0ZOmzxjCV4_sc0gI9GONTc2o,50208
-orca_sdk/classification_model_test.py,sha256=OXKv4vfnV3NYVcZypuDztVKiEFB_0sYX3S-iU5wui38,39534
-orca_sdk/client.py,sha256=pm_NChTd3qKIwwCYoFUOj4sjZQvZJKYX1fbuT-H7hEc,137755
-orca_sdk/conftest.py,sha256=sPxOUGHU9kFznPJ_JZ6vZxY4m6e290ygvUf47P4pW6I,16926
-orca_sdk/credentials.py,sha256=2SwC3tq5akP-F_u2s4xMZDp8mlsKMUT1T5T9Z99-eSY,6588
-orca_sdk/credentials_test.py,sha256=K_1aNVCE5eoqX-tfh0G0_Vhqzhui4qcfYqWaDXfaqTA,4020
-orca_sdk/datasource.py,sha256=kPp3wOcjhTJsSwi51oK-y7tNYlz6jDAGKX9R7CoqHXs,22720
-orca_sdk/datasource_test.py,sha256=qoePaetnlgQZAx6y5SvCv9JMdBSvB-0TB1ug0_L0FuY,16786
-orca_sdk/embedding_model.py,sha256=hCl6vWpW7LXaM1ovGP6GzEp7sRdyJECS_sNc8kKBsvQ,28495
-orca_sdk/embedding_model_test.py,sha256=CERI3Lk7U32N3qwZyzip41Mw1Yb4sHWPEGeSulsaY88,9368
-orca_sdk/job.py,sha256=wHwVt-s7i-v8udhLGybB-90Kp4dwOLrY806bE4Tam5Q,13092
-orca_sdk/job_test.py,sha256=nRSWxd_1UIfrj9oMVvrXjt6OBkBpddYAjb2y6P-DTUg,4327
-orca_sdk/memoryset.py,sha256=JhIyusMelyg9ZinkoZKGtZIoxLANRk8XGZPlrVtX5ds,164318
-orca_sdk/memoryset_test.py,sha256=3PsqzYkd-QG2nKQiWyW4qxC7QVPY76ytAmxkJ5EWfUs,46407
-orca_sdk/regression_model.py,sha256=KHDVUZfnY5joRsO4HFg62LPeISH9j_cjyWW1SouuPHU,33971
-orca_sdk/regression_model_test.py,sha256=SIVWS8gSnmolVLEdJ4k6AYCV1bY4Hcjej43Ynw-TDzE,27398
-orca_sdk/telemetry.py,sha256=e_FiN3JFkQV62CKygd78BVQwfQwdClAZvV-XvLDEIGI,27828
-orca_sdk/telemetry_test.py,sha256=eT66C5lFdNg-pQdo2I__BP7Tn5fTc9aTkVo9ZhWwhU0,5519
-orca_sdk-0.1.11.dist-info/METADATA,sha256=54iBK4DRJ-rBkhYTnxTrKlnWQhMd7t9dXCY-iQ_XER8,3638
-orca_sdk-0.1.11.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-orca_sdk-0.1.11.dist-info/RECORD,,
{orca_sdk-0.1.11.dist-info → orca_sdk-0.1.12.dist-info}/WHEEL

File without changes