orca-sdk 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
--- orca_sdk/_utils/data_parsing_disk_test.py
+++ /dev/null
@@ -1,91 +0,0 @@
- import json
- import pickle
- import tempfile
-
- from datasets import Dataset
-
- from .data_parsing import hf_dataset_from_disk
-
-
- def test_hf_dataset_from_disk_pickle_list():
-     with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
-         # Given a pickle file with test data that is a list
-         test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-         with open(temp_file.name, "wb") as f:
-             pickle.dump(test_data, f)
-         dataset = hf_dataset_from_disk(temp_file.name)
-         # Then the HF dataset should be created successfully
-         assert isinstance(dataset, Dataset)
-         assert len(dataset) == 30
-         assert dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_disk_pickle_dict():
-     with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
-         # Given a pickle file with test data that is a dict
-         test_data = {"value": [f"test_{i}" for i in range(30)], "label": [i % 2 for i in range(30)]}
-         with open(temp_file.name, "wb") as f:
-             pickle.dump(test_data, f)
-         dataset = hf_dataset_from_disk(temp_file.name)
-         # Then the HF dataset should be created successfully
-         assert isinstance(dataset, Dataset)
-         assert len(dataset) == 30
-         assert dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_disk_json():
-     with tempfile.NamedTemporaryFile(suffix=".json") as temp_file:
-         # Given a JSON file with test data
-         test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-         with open(temp_file.name, "w") as f:
-             json.dump(test_data, f)
-         dataset = hf_dataset_from_disk(temp_file.name)
-         # Then the HF dataset should be created successfully
-         assert isinstance(dataset, Dataset)
-         assert len(dataset) == 30
-         assert dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_disk_jsonl():
-     with tempfile.NamedTemporaryFile(suffix=".jsonl") as temp_file:
-         # Given a JSONL file with test data
-         test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-         with open(temp_file.name, "w") as f:
-             for item in test_data:
-                 f.write(json.dumps(item) + "\n")
-         dataset = hf_dataset_from_disk(temp_file.name)
-         # Then the HF dataset should be created successfully
-         assert isinstance(dataset, Dataset)
-         assert len(dataset) == 30
-         assert dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_disk_csv():
-     with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
-         # Given a CSV file with test data
-         test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-         with open(temp_file.name, "w") as f:
-             f.write("value,label\n")
-             for item in test_data:
-                 f.write(f"{item['value']},{item['label']}\n")
-         dataset = hf_dataset_from_disk(temp_file.name)
-         # Then the HF dataset should be created successfully
-         assert isinstance(dataset, Dataset)
-         assert len(dataset) == 30
-         assert dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_disk_parquet():
-     with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
-         # Given a Parquet file with test data
-         data = {
-             "value": [f"test_{i}" for i in range(30)],
-             "label": [i % 2 for i in range(30)],
-         }
-         dataset = Dataset.from_dict(data)
-         dataset.to_parquet(temp_file.name)
-         dataset = hf_dataset_from_disk(temp_file.name)
-         # Then the HF dataset should be created successfully
-         assert isinstance(dataset, Dataset)
-         assert len(dataset) == 30
-         assert dataset.column_names == ["value", "label"]
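
The removed file above is orca_sdk/_utils/data_parsing_disk_test.py (still listed in the 0.1.11 RECORD below). Its tests pin down the contract of hf_dataset_from_disk: pickles may be row-oriented (list of dicts) or column-oriented (dict of lists), and JSON, JSONL, CSV, and Parquet files all load into a datasets.Dataset with the expected columns. Below is a minimal sketch of that contract, assuming dispatch on file extension; hf_dataset_from_disk_sketch and its error message are hypothetical, and the real implementation in orca_sdk/_utils/data_parsing.py may differ.

```python
# Hypothetical sketch inferred from the deleted tests; not the orca_sdk code.
import pickle

from datasets import Dataset


def hf_dataset_from_disk_sketch(path: str) -> Dataset:
    if path.endswith(".pkl"):
        with open(path, "rb") as f:
            data = pickle.load(f)
        # The tests cover both a list of dicts and a dict of lists.
        return Dataset.from_list(data) if isinstance(data, list) else Dataset.from_dict(data)
    if path.endswith((".json", ".jsonl")):
        # The HF json loader accepts both JSON arrays and JSON Lines.
        return Dataset.from_json(path)
    if path.endswith(".csv"):
        return Dataset.from_csv(path)
    if path.endswith(".parquet"):
        return Dataset.from_parquet(path)
    raise ValueError(f"Unsupported file type: {path}")
```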
--- orca_sdk/_utils/data_parsing_torch_test.py
+++ /dev/null
@@ -1,159 +0,0 @@
- from collections import namedtuple
- from dataclasses import dataclass
-
- import pytest
- from datasets import Dataset
- from datasets.exceptions import DatasetGenerationError
-
- from ..conftest import SAMPLE_DATA
- from .data_parsing import hf_dataset_from_torch
-
- pytest.importorskip("torch")
-
- from torch.utils.data import DataLoader as TorchDataLoader  # noqa: E402
- from torch.utils.data import Dataset as TorchDataset  # noqa: E402
-
-
- class PytorchDictDataset(TorchDataset):
-     def __init__(self):
-         self.data = SAMPLE_DATA
-
-     def __getitem__(self, i):
-         return self.data[i]
-
-     def __len__(self):
-         return len(self.data)
-
-
- class PytorchTupleDataset(TorchDataset):
-     def __init__(self):
-         self.data = SAMPLE_DATA
-
-     def __getitem__(self, i):
-         return self.data[i]["value"], self.data[i]["label"]
-
-     def __len__(self):
-         return len(self.data)
-
-
- DatasetTuple = namedtuple("DatasetTuple", ["value", "label"])
-
-
- class PytorchNamedTupleDataset(TorchDataset):
-     def __init__(self):
-         self.data = SAMPLE_DATA
-
-     def __getitem__(self, i):
-         return DatasetTuple(self.data[i]["value"], self.data[i]["label"])
-
-     def __len__(self):
-         return len(self.data)
-
-
- @dataclass
- class DatasetItem:
-     text: str
-     label: int
-
-
- class PytorchDataclassDataset(TorchDataset):
-     def __init__(self):
-         self.data = SAMPLE_DATA
-
-     def __getitem__(self, i):
-         return DatasetItem(text=self.data[i]["value"], label=self.data[i]["label"])
-
-     def __len__(self):
-         return len(self.data)
-
-
- class PytorchInvalidDataset(TorchDataset):
-     def __init__(self):
-         self.data = SAMPLE_DATA
-
-     def __getitem__(self, i):
-         return [self.data[i]["value"], self.data[i]["label"]]
-
-     def __len__(self):
-         return len(self.data)
-
-
- def test_hf_dataset_from_torch_dict():
-     # Given a Pytorch dataset that returns a dictionary for each item
-     dataset = PytorchDictDataset()
-     hf_dataset = hf_dataset_from_torch(dataset)
-     # Then the HF dataset should be created successfully
-     assert isinstance(hf_dataset, Dataset)
-     assert len(hf_dataset) == len(dataset)
-     assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id", "partition_id"}
-
-
- def test_hf_dataset_from_torch_tuple():
-     # Given a Pytorch dataset that returns a tuple for each item
-     dataset = PytorchTupleDataset()
-     # And the correct number of column names passed in
-     hf_dataset = hf_dataset_from_torch(dataset, column_names=["value", "label"])
-     # Then the HF dataset should be created successfully
-     assert isinstance(hf_dataset, Dataset)
-     assert len(hf_dataset) == len(dataset)
-     assert hf_dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_torch_tuple_error():
-     # Given a Pytorch dataset that returns a tuple for each item
-     dataset = PytorchTupleDataset()
-     # Then the HF dataset should raise an error if no column names are passed in
-     with pytest.raises(DatasetGenerationError):
-         hf_dataset_from_torch(dataset)
-
-
- def test_hf_dataset_from_torch_tuple_error_not_enough_columns():
-     # Given a Pytorch dataset that returns a tuple for each item
-     dataset = PytorchTupleDataset()
-     # Then the HF dataset should raise an error if not enough column names are passed in
-     with pytest.raises(DatasetGenerationError):
-         hf_dataset_from_torch(dataset, column_names=["value"])
-
-
- def test_hf_dataset_from_torch_named_tuple():
-     # Given a Pytorch dataset that returns a namedtuple for each item
-     dataset = PytorchNamedTupleDataset()
-     # And no column names are passed in
-     hf_dataset = hf_dataset_from_torch(dataset)
-     # Then the HF dataset should be created successfully
-     assert isinstance(hf_dataset, Dataset)
-     assert len(hf_dataset) == len(dataset)
-     assert hf_dataset.column_names == ["value", "label"]
-
-
- def test_hf_dataset_from_torch_dataclass():
-     # Given a Pytorch dataset that returns a dataclass for each item
-     dataset = PytorchDataclassDataset()
-     hf_dataset = hf_dataset_from_torch(dataset)
-     # Then the HF dataset should be created successfully
-     assert isinstance(hf_dataset, Dataset)
-     assert len(hf_dataset) == len(dataset)
-     assert hf_dataset.column_names == ["text", "label"]
-
-
- def test_hf_dataset_from_torch_invalid_dataset():
-     # Given a Pytorch dataset that returns a list for each item
-     dataset = PytorchInvalidDataset()
-     # Then the HF dataset should raise an error
-     with pytest.raises(DatasetGenerationError):
-         hf_dataset_from_torch(dataset)
-
-
- def test_hf_dataset_from_torchdataloader():
-     # Given a Pytorch dataloader that returns a column-oriented batch of items
-     dataset = PytorchDictDataset()
-
-     def collate_fn(x: list[dict]):
-         return {"value": [item["value"] for item in x], "label": [item["label"] for item in x]}
-
-     dataloader = TorchDataLoader(dataset, batch_size=3, collate_fn=collate_fn)
-     hf_dataset = hf_dataset_from_torch(dataloader)
-     # Then the HF dataset should be created successfully
-     assert isinstance(hf_dataset, Dataset)
-     assert len(hf_dataset) == len(dataset)
-     assert hf_dataset.column_names == ["value", "label"]
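
The second removed file, orca_sdk/_utils/data_parsing_torch_test.py, fixes the behavior of hf_dataset_from_torch: dict items map straight to columns, tuples require a column_names list of matching length, namedtuples and dataclasses contribute their own field names, plain lists are rejected with a DatasetGenerationError, and a DataLoader yielding column-oriented batches is flattened back into rows. Below is a minimal sketch of that behavior, assuming the conversion goes through Dataset.from_generator (which wraps failures inside the generator in DatasetGenerationError); the helper is hypothetical and the real implementation may differ.

```python
# Hypothetical sketch inferred from the deleted tests; not the orca_sdk code.
from dataclasses import asdict, is_dataclass

from datasets import Dataset
from torch.utils.data import DataLoader as TorchDataLoader


def hf_dataset_from_torch_sketch(data, column_names=None) -> Dataset:
    def to_row(item) -> dict:
        if isinstance(item, dict):
            return item
        if is_dataclass(item):
            return asdict(item)  # dataclass fields become column names
        if isinstance(item, tuple) and hasattr(item, "_asdict"):
            return item._asdict()  # namedtuple fields become column names
        if isinstance(item, tuple):
            if column_names is None or len(column_names) != len(item):
                # Dataset.from_generator surfaces this as a DatasetGenerationError
                raise ValueError("column_names must match the tuple length")
            return dict(zip(column_names, item))
        # e.g. a plain list, as in PytorchInvalidDataset above
        raise ValueError(f"cannot infer column names from {type(item).__name__}")

    if isinstance(data, TorchDataLoader):
        def rows():
            # batches are column-oriented dicts; flatten them back into rows
            for batch in data:
                for values in zip(*batch.values()):
                    yield dict(zip(batch.keys(), values))
    else:
        def rows():
            for item in data:
                yield to_row(item)

    return Dataset.from_generator(rows)
```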
--- orca_sdk-0.1.11.dist-info/RECORD
+++ /dev/null
@@ -1,42 +0,0 @@
- orca_sdk/__init__.py,sha256=xyjNwkLQXaX8A-UYgGwYDjv2btOXArT_yiMTfmW7KA8,1003
- orca_sdk/_shared/__init__.py,sha256=3Kt0Hu3QLI5FEp9nqGTxqAm3hAoBJKcagfaGQZ-lbJQ,223
- orca_sdk/_shared/metrics.py,sha256=faeL1B1ftmns1ikfKrIlU3xOn6j0iAGLNUupxvAFza8,24968
- orca_sdk/_shared/metrics_test.py,sha256=vDIXoj8EuuLcdPJz_7EiVPgQ-FXiVT81JG30jxsg9HM,20752
- orca_sdk/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- orca_sdk/_utils/analysis_ui.py,sha256=gXwCR972kffPjPyajcy7m8HgPARSfhXwBJ0rnOmu_7k,9418
- orca_sdk/_utils/analysis_ui_style.css,sha256=q_ba_-_KtgztepHg829zLzypaxKayl7ySC1-oYDzV3k,836
- orca_sdk/_utils/auth.py,sha256=nC252O171_3_wn4KBAN7kg8GNvoZFiQ5Xtzkrm5dWDo,2645
- orca_sdk/_utils/auth_test.py,sha256=ygVWv1Ex53LaxIP7p2hzPHl8l9qYyBD5IGmEFJMps6s,1056
- orca_sdk/_utils/common.py,sha256=wUm2pNDWytEecC5WiDWd02-yCZw3Akx0bIutG4lHsFA,805
- orca_sdk/_utils/data_parsing.py,sha256=tTZwGJQ1xkl12gyq6gQ0dRIgDFaNDy3Sde8A_SIfmxo,5726
- orca_sdk/_utils/data_parsing_disk_test.py,sha256=0IxyUNnawlNkqFwVEzIfXzuOympYkZRUP0rgxXhUrW4,3781
- orca_sdk/_utils/data_parsing_torch_test.py,sha256=LTCd1H9V9OtBARv_SmyLEYMeAYPohf8IHJjUzxenEC8,5155
- orca_sdk/_utils/pagination.py,sha256=986z0QPZixrZeurJWorF6eMgnTRdDF84AagEA6qNbMw,4245
- orca_sdk/_utils/pagination_test.py,sha256=BUylCrcHnwoKEBmMUzVr0lwLpA35ivcCwdBK4rMw9y8,4887
- orca_sdk/_utils/prediction_result_ui.css,sha256=sqBlkRLnovb5X5EcUDdB6iGpH63nVRlTW4uAmXuD0WM,258
- orca_sdk/_utils/prediction_result_ui.py,sha256=N4Cj7PUEx2UeV-4Mhk-ZaPegilssaKtElSACVhVrx1w,4965
- orca_sdk/_utils/tqdm_file_reader.py,sha256=Lw7Cg1UgNuRUoN6jjqZb-IlV00H-kbRcrZLdudr1GxE,324
- orca_sdk/_utils/value_parser.py,sha256=pw-suYXKuZQ7mGV-QUFcD3-fmp6lJKjnyQ3f_Hb3Gg8,2379
- orca_sdk/_utils/value_parser_test.py,sha256=dKni8W7KV2RgwuwK0ZN1SN-lH-W4DSSYkHdIXz52kys,1210
- orca_sdk/async_client.py,sha256=qaDmmXrCjPql_y-_kZMskFR0OlE8n1Y_pxtC1JwGbF0,138668
- orca_sdk/classification_model.py,sha256=rb1TmjCMrXqhpu4YKzT0ZOmzxjCV4_sc0gI9GONTc2o,50208
- orca_sdk/classification_model_test.py,sha256=OXKv4vfnV3NYVcZypuDztVKiEFB_0sYX3S-iU5wui38,39534
- orca_sdk/client.py,sha256=pm_NChTd3qKIwwCYoFUOj4sjZQvZJKYX1fbuT-H7hEc,137755
- orca_sdk/conftest.py,sha256=sPxOUGHU9kFznPJ_JZ6vZxY4m6e290ygvUf47P4pW6I,16926
- orca_sdk/credentials.py,sha256=2SwC3tq5akP-F_u2s4xMZDp8mlsKMUT1T5T9Z99-eSY,6588
- orca_sdk/credentials_test.py,sha256=K_1aNVCE5eoqX-tfh0G0_Vhqzhui4qcfYqWaDXfaqTA,4020
- orca_sdk/datasource.py,sha256=kPp3wOcjhTJsSwi51oK-y7tNYlz6jDAGKX9R7CoqHXs,22720
- orca_sdk/datasource_test.py,sha256=qoePaetnlgQZAx6y5SvCv9JMdBSvB-0TB1ug0_L0FuY,16786
- orca_sdk/embedding_model.py,sha256=hCl6vWpW7LXaM1ovGP6GzEp7sRdyJECS_sNc8kKBsvQ,28495
- orca_sdk/embedding_model_test.py,sha256=CERI3Lk7U32N3qwZyzip41Mw1Yb4sHWPEGeSulsaY88,9368
- orca_sdk/job.py,sha256=wHwVt-s7i-v8udhLGybB-90Kp4dwOLrY806bE4Tam5Q,13092
- orca_sdk/job_test.py,sha256=nRSWxd_1UIfrj9oMVvrXjt6OBkBpddYAjb2y6P-DTUg,4327
- orca_sdk/memoryset.py,sha256=JhIyusMelyg9ZinkoZKGtZIoxLANRk8XGZPlrVtX5ds,164318
- orca_sdk/memoryset_test.py,sha256=3PsqzYkd-QG2nKQiWyW4qxC7QVPY76ytAmxkJ5EWfUs,46407
- orca_sdk/regression_model.py,sha256=KHDVUZfnY5joRsO4HFg62LPeISH9j_cjyWW1SouuPHU,33971
- orca_sdk/regression_model_test.py,sha256=SIVWS8gSnmolVLEdJ4k6AYCV1bY4Hcjej43Ynw-TDzE,27398
- orca_sdk/telemetry.py,sha256=e_FiN3JFkQV62CKygd78BVQwfQwdClAZvV-XvLDEIGI,27828
- orca_sdk/telemetry_test.py,sha256=eT66C5lFdNg-pQdo2I__BP7Tn5fTc9aTkVo9ZhWwhU0,5519
- orca_sdk-0.1.11.dist-info/METADATA,sha256=54iBK4DRJ-rBkhYTnxTrKlnWQhMd7t9dXCY-iQ_XER8,3638
- orca_sdk-0.1.11.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
- orca_sdk-0.1.11.dist-info/RECORD,,
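
The last hunk removes the 0.1.11 RECORD (a fresh one is written under the 0.1.12 dist-info directory). Per the wheel spec, RECORD is a CSV manifest in which each row holds a path, a hash as algorithm=digest, and a size in bytes; the RECORD entry itself leaves the last two fields empty. A minimal sketch for reading such a file, where read_record is a hypothetical helper rather than part of orca_sdk:

```python
# Hypothetical helper for parsing a wheel RECORD manifest; not orca_sdk code.
import csv


def read_record(path: str) -> list[tuple[str, str, int | None]]:
    with open(path, newline="") as f:
        # Each row: (file path, "sha256=<digest>" or "", size in bytes or "")
        return [
            (name, digest, int(size) if size else None)
            for name, digest, size in csv.reader(f)
        ]
```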