orca-sdk 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- orca_sdk/_utils/data_parsing_test.py
+++ /dev/null
@@ -1,244 +0,0 @@
- import json
- import pickle
- import tempfile
- from collections import namedtuple
- from dataclasses import dataclass
-
- import pandas as pd
- import pytest
- from datasets import Dataset
- from datasets.exceptions import DatasetGenerationError
- from torch.utils.data import DataLoader as TorchDataLoader
- from torch.utils.data import Dataset as TorchDataset
-
- from ..conftest import SAMPLE_DATA
- from .data_parsing import hf_dataset_from_disk, hf_dataset_from_torch
-
-
- class PytorchDictDataset(TorchDataset):
-     def __init__(self):
-         self.data = SAMPLE_DATA
-
-     def __getitem__(self, i):
-         return self.data[i]
-
-     def __len__(self):
-         return len(self.data)
-
-
- def test_hf_dataset_from_torch_dict():
-     # Given a Pytorch dataset that returns a dictionary for each item
-     dataset = PytorchDictDataset()
-     hf_dataset = hf_dataset_from_torch(dataset)
-     # Then the HF dataset should be created successfully
-     assert isinstance(hf_dataset, Dataset)
-     assert len(hf_dataset) == len(dataset)
-     assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id", "partition_id"}
-
-
- class PytorchTupleDataset(TorchDataset):
-     def __init__(self):
-         self.data = SAMPLE_DATA
-
-     def __getitem__(self, i):
-         return self.data[i]["value"], self.data[i]["label"]
-
-     def __len__(self):
-         return len(self.data)
-
-
- def test_hf_dataset_from_torch_tuple():
-     # Given a Pytorch dataset that returns a tuple for each item
-     dataset = PytorchTupleDataset()
-     # And the correct number of column names passed in
-     hf_dataset = hf_dataset_from_torch(dataset, column_names=["value", "label"])
-     # Then the HF dataset should be created successfully
-     assert isinstance(hf_dataset, Dataset)
-     assert len(hf_dataset) == len(dataset)
-     assert hf_dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_torch_tuple_error():
-     # Given a Pytorch dataset that returns a tuple for each item
-     dataset = PytorchTupleDataset()
-     # Then the HF dataset should raise an error if no column names are passed in
-     with pytest.raises(DatasetGenerationError):
-         hf_dataset_from_torch(dataset)
-
-
- def test_hf_dataset_from_torch_tuple_error_not_enough_columns():
-     # Given a Pytorch dataset that returns a tuple for each item
-     dataset = PytorchTupleDataset()
-     # Then the HF dataset should raise an error if not enough column names are passed in
-     with pytest.raises(DatasetGenerationError):
-         hf_dataset_from_torch(dataset, column_names=["value"])
-
-
- DatasetTuple = namedtuple("DatasetTuple", ["value", "label"])
-
-
- class PytorchNamedTupleDataset(TorchDataset):
-     def __init__(self):
-         self.data = SAMPLE_DATA
-
-     def __getitem__(self, i):
-         return DatasetTuple(self.data[i]["value"], self.data[i]["label"])
-
-     def __len__(self):
-         return len(self.data)
-
-
- def test_hf_dataset_from_torch_named_tuple():
-     # Given a Pytorch dataset that returns a namedtuple for each item
-     dataset = PytorchNamedTupleDataset()
-     # And no column names are passed in
-     hf_dataset = hf_dataset_from_torch(dataset)
-     # Then the HF dataset should be created successfully
-     assert isinstance(hf_dataset, Dataset)
-     assert len(hf_dataset) == len(dataset)
-     assert hf_dataset.column_names == ["value", "label"]
-
-
- @dataclass
- class DatasetItem:
-     text: str
-     label: int
-
-
- class PytorchDataclassDataset(TorchDataset):
-     def __init__(self):
-         self.data = SAMPLE_DATA
-
-     def __getitem__(self, i):
-         return DatasetItem(text=self.data[i]["value"], label=self.data[i]["label"])
-
-     def __len__(self):
-         return len(self.data)
-
-
- def test_hf_dataset_from_torch_dataclass():
-     # Given a Pytorch dataset that returns a dataclass for each item
-     dataset = PytorchDataclassDataset()
-     hf_dataset = hf_dataset_from_torch(dataset)
-     # Then the HF dataset should be created successfully
-     assert isinstance(hf_dataset, Dataset)
-     assert len(hf_dataset) == len(dataset)
-     assert hf_dataset.column_names == ["text", "label"]
-
-
- class PytorchInvalidDataset(TorchDataset):
-     def __init__(self):
-         self.data = SAMPLE_DATA
-
-     def __getitem__(self, i):
-         return [self.data[i]["value"], self.data[i]["label"]]
-
-     def __len__(self):
-         return len(self.data)
-
-
- def test_hf_dataset_from_torch_invalid_dataset():
-     # Given a Pytorch dataset that returns a list for each item
-     dataset = PytorchInvalidDataset()
-     # Then the HF dataset should raise an error
-     with pytest.raises(DatasetGenerationError):
-         hf_dataset_from_torch(dataset)
-
-
- def test_hf_dataset_from_torchdataloader():
-     # Given a Pytorch dataloader that returns a column-oriented batch of items
-     dataset = PytorchDictDataset()
-
-     def collate_fn(x: list[dict]):
-         return {"value": [item["value"] for item in x], "label": [item["label"] for item in x]}
-
-     dataloader = TorchDataLoader(dataset, batch_size=3, collate_fn=collate_fn)
-     hf_dataset = hf_dataset_from_torch(dataloader)
-     # Then the HF dataset should be created successfully
-     assert isinstance(hf_dataset, Dataset)
-     assert len(hf_dataset) == len(dataset)
-     assert hf_dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_disk_pickle_list():
-     with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
-         # Given a pickle file with test data that is a list
-         test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-         with open(temp_file.name, "wb") as f:
-             pickle.dump(test_data, f)
-         dataset = hf_dataset_from_disk(temp_file.name)
-         # Then the HF dataset should be created successfully
-         assert isinstance(dataset, Dataset)
-         assert len(dataset) == 30
-         assert dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_disk_pickle_dict():
-     with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
-         # Given a pickle file with test data that is a dict
-         test_data = {"value": [f"test_{i}" for i in range(30)], "label": [i % 2 for i in range(30)]}
-         with open(temp_file.name, "wb") as f:
-             pickle.dump(test_data, f)
-         dataset = hf_dataset_from_disk(temp_file.name)
-         # Then the HF dataset should be created successfully
-         assert isinstance(dataset, Dataset)
-         assert len(dataset) == 30
-         assert dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_disk_json():
-     with tempfile.NamedTemporaryFile(suffix=".json") as temp_file:
-         # Given a JSON file with test data
-         test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-         with open(temp_file.name, "w") as f:
-             json.dump(test_data, f)
-         dataset = hf_dataset_from_disk(temp_file.name)
-         # Then the HF dataset should be created successfully
-         assert isinstance(dataset, Dataset)
-         assert len(dataset) == 30
-         assert dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_disk_jsonl():
-     with tempfile.NamedTemporaryFile(suffix=".jsonl") as temp_file:
-         # Given a JSONL file with test data
-         test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-         with open(temp_file.name, "w") as f:
-             for item in test_data:
-                 f.write(json.dumps(item) + "\n")
-         dataset = hf_dataset_from_disk(temp_file.name)
-         # Then the HF dataset should be created successfully
-         assert isinstance(dataset, Dataset)
-         assert len(dataset) == 30
-         assert dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_disk_csv():
-     with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
-         # Given a CSV file with test data
-         test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-         with open(temp_file.name, "w") as f:
-             f.write("value,label\n")
-             for item in test_data:
-                 f.write(f"{item['value']},{item['label']}\n")
-         dataset = hf_dataset_from_disk(temp_file.name)
-         # Then the HF dataset should be created successfully
-         assert isinstance(dataset, Dataset)
-         assert len(dataset) == 30
-         assert dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_disk_parquet():
-     with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
-         # Given a Parquet file with test data
-         data = {
-             "value": [f"test_{i}" for i in range(30)],
-             "label": [i % 2 for i in range(30)],
-         }
-         df = pd.DataFrame(data)
-         df.to_parquet(temp_file.name)
-         dataset = hf_dataset_from_disk(temp_file.name)
-         # Then the HF dataset should be created successfully
-         assert isinstance(dataset, Dataset)
-         assert len(dataset) == 30
-         assert dataset.column_names == ["value", "label"]
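The removed module tested `hf_dataset_from_torch`, which converts a PyTorch `Dataset` or `DataLoader` into a Hugging Face `datasets.Dataset`, inferring column names from dict, namedtuple, and dataclass items and requiring an explicit `column_names` argument for plain tuples. Below is a minimal sketch of that usage, assuming the function remains importable from `orca_sdk._utils.data_parsing` in 0.1.12; the `ToyDataset` class is a hypothetical stand-in, not part of the package.

```python
# Sketch of the usage exercised by the removed tests; ToyDataset is a
# hypothetical stand-in, not an orca_sdk class.
from torch.utils.data import Dataset as TorchDataset

from orca_sdk._utils.data_parsing import hf_dataset_from_torch


class ToyDataset(TorchDataset):
    def __init__(self):
        # Dict items let hf_dataset_from_torch infer the column names.
        self.data = [{"value": f"test_{i}", "label": i % 2} for i in range(10)]

    def __getitem__(self, i):
        return self.data[i]

    def __len__(self):
        return len(self.data)


hf_dataset = hf_dataset_from_torch(ToyDataset())
assert set(hf_dataset.column_names) == {"value", "label"}

# Tuple items carry no field names, so the removed tests pass them explicitly:
# hf_dataset_from_torch(tuple_dataset, column_names=["value", "label"])
```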
--- orca_sdk-0.1.10.dist-info/RECORD
+++ /dev/null
@@ -1,41 +0,0 @@
- orca_sdk/__init__.py,sha256=xyjNwkLQXaX8A-UYgGwYDjv2btOXArT_yiMTfmW7KA8,1003
- orca_sdk/_shared/__init__.py,sha256=3Kt0Hu3QLI5FEp9nqGTxqAm3hAoBJKcagfaGQZ-lbJQ,223
- orca_sdk/_shared/metrics.py,sha256=faeL1B1ftmns1ikfKrIlU3xOn6j0iAGLNUupxvAFza8,24968
- orca_sdk/_shared/metrics_test.py,sha256=vDIXoj8EuuLcdPJz_7EiVPgQ-FXiVT81JG30jxsg9HM,20752
- orca_sdk/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- orca_sdk/_utils/analysis_ui.py,sha256=R0xc4RyJKyBHJEEF_ztI4Dm5w8Y1uF0Wpkn4LQgXqBE,9258
- orca_sdk/_utils/analysis_ui_style.css,sha256=q_ba_-_KtgztepHg829zLzypaxKayl7ySC1-oYDzV3k,836
- orca_sdk/_utils/auth.py,sha256=nC252O171_3_wn4KBAN7kg8GNvoZFiQ5Xtzkrm5dWDo,2645
- orca_sdk/_utils/auth_test.py,sha256=ygVWv1Ex53LaxIP7p2hzPHl8l9qYyBD5IGmEFJMps6s,1056
- orca_sdk/_utils/common.py,sha256=wUm2pNDWytEecC5WiDWd02-yCZw3Akx0bIutG4lHsFA,805
- orca_sdk/_utils/data_parsing.py,sha256=5vaTpvUOS-ldlcgnSARYw7s9mce-imzkU7kA48-pdIM,5396
- orca_sdk/_utils/data_parsing_test.py,sha256=u7BEjxtsU9gMs3tAZI0lJ--vOLlwKwH3hemdCedzxA0,8826
- orca_sdk/_utils/pagination.py,sha256=986z0QPZixrZeurJWorF6eMgnTRdDF84AagEA6qNbMw,4245
- orca_sdk/_utils/pagination_test.py,sha256=BUylCrcHnwoKEBmMUzVr0lwLpA35ivcCwdBK4rMw9y8,4887
- orca_sdk/_utils/prediction_result_ui.css,sha256=sqBlkRLnovb5X5EcUDdB6iGpH63nVRlTW4uAmXuD0WM,258
- orca_sdk/_utils/prediction_result_ui.py,sha256=Ur_FY7dz3oWNmtPiP3Wl3yRlEMgK8q9UfT-SDu9UPxA,4805
- orca_sdk/_utils/tqdm_file_reader.py,sha256=Lw7Cg1UgNuRUoN6jjqZb-IlV00H-kbRcrZLdudr1GxE,324
- orca_sdk/_utils/value_parser.py,sha256=c3qMABCCDQcIjn9N1orYYnlRwDW9JWdGwW_2TDZPLdI,1286
- orca_sdk/_utils/value_parser_test.py,sha256=OybsiC-Obi32RRi9NIuwrVBRAnlyPMV1xVAaevSrb7M,1079
- orca_sdk/async_client.py,sha256=PM7N-ggmtucfcUF1vQGtTZOCJpSNTOgd7l3LDNF5kP4,137192
- orca_sdk/classification_model.py,sha256=C58euWnNvwXnthR9RtVVCOcgPEbxCjjp3sHMb86V6YA,50197
- orca_sdk/classification_model_test.py,sha256=ElqxtR6gNwwk8dNXwfwAhpT7l0ZIP3H4pHmOyFXyTWk,37370
- orca_sdk/client.py,sha256=SKZv3zGG6OwLe_FlB5wL2cxltOLPCcHvoo2CbMwyKgA,136241
- orca_sdk/conftest.py,sha256=0O1VY-SPKNAvi9fBLdY1RMnYVgZvMjP92y99bNAqqiw,12461
- orca_sdk/credentials.py,sha256=2SwC3tq5akP-F_u2s4xMZDp8mlsKMUT1T5T9Z99-eSY,6588
- orca_sdk/credentials_test.py,sha256=TLbXJMz3IlThvtSrHeLM7jRsKnrncA_ahOTpHg15Ei4,4089
- orca_sdk/datasource.py,sha256=Qn5QloE84UXeyPk2wcy1lWe5wmh1iDBS044eWnxck_E,22371
- orca_sdk/datasource_test.py,sha256=sCk3IcQJbDut5oN4Wf7PXhTxyMwalxMuCXJekSxy9wk,16665
- orca_sdk/embedding_model.py,sha256=vLGnlO9I-cN1lklNBl_LxZ8m9oK3vkegFOpvYYw8u8g,28038
- orca_sdk/embedding_model_test.py,sha256=Lc6fZ0ifT0hh6ldkUfjwMPcP6OgN0Umlzu8XDLs7UO4,8144
- orca_sdk/job.py,sha256=wHwVt-s7i-v8udhLGybB-90Kp4dwOLrY806bE4Tam5Q,13092
- orca_sdk/job_test.py,sha256=nRSWxd_1UIfrj9oMVvrXjt6OBkBpddYAjb2y6P-DTUg,4327
- orca_sdk/memoryset.py,sha256=06v34fHabpkEaOv9VCKc0NhpMi_mNZGbQP_9GiW_nuE,157157
- orca_sdk/memoryset_test.py,sha256=O2o42XETtffXtZy0kbLk2b8cUDXU-w2ZAzXLi5-vDPQ,51278
- orca_sdk/regression_model.py,sha256=AXRzJG15sDJQSiDCDfRdcLnZDNkJWORYjhHqKyyL-Fc,33960
- orca_sdk/regression_model_test.py,sha256=90EyrhaMk1kTf87RFkMNz1PTItmeUISs6AvHmyp08DU,25447
- orca_sdk/telemetry.py,sha256=ZyCMiyyo_SchjadWZH55TlLrC4Ucq5S316NbW26LL4Y,27834
- orca_sdk/telemetry_test.py,sha256=eT66C5lFdNg-pQdo2I__BP7Tn5fTc9aTkVo9ZhWwhU0,5519
- orca_sdk-0.1.10.dist-info/METADATA,sha256=j_TIalbL8oztP39lnXjyAI6Aosvb6rnJKUc3gcuxD0k,3710
- orca_sdk-0.1.10.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
- orca_sdk-0.1.10.dist-info/RECORD,,
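The removed tests also covered `hf_dataset_from_disk`, which loads a Hugging Face `Dataset` from a pickle, JSON, JSONL, CSV, or Parquet file, apparently dispatching on the file extension. A minimal sketch under the same import assumption, mirroring the JSONL case from the removed file:

```python
# Sketch based on the removed tests: hf_dataset_from_disk accepted paths
# ending in .pkl, .json, .jsonl, .csv, and .parquet.
import json
import tempfile

from orca_sdk._utils.data_parsing import hf_dataset_from_disk

with tempfile.NamedTemporaryFile(suffix=".jsonl", mode="w") as temp_file:
    # Write one JSON object per line, then flush so the loader sees the data.
    for i in range(30):
        temp_file.write(json.dumps({"value": f"test_{i}", "label": i % 2}) + "\n")
    temp_file.flush()

    dataset = hf_dataset_from_disk(temp_file.name)
    assert len(dataset) == 30
    assert dataset.column_names == ["value", "label"]
```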