orca-sdk 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,10 @@ import re
 from pathlib import Path
 from typing import TypedDict, cast

-import gradio as gr
+try:
+    import gradio as gr  # type: ignore
+except ImportError as e:
+    raise ImportError("gradio is required for UI features. Install it with: pip install orca_sdk[ui]") from e

 from ..memoryset import LabeledMemory, LabeledMemoryset

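
Note: the hunk above turns gradio into an optional peer dependency. A minimal standalone sketch of the same guard pattern (not orca_sdk code; the helper name is illustrative):

def require_gradio():
    # Optional-dependency guard: re-raise a friendlier error that names the extra
    # to install, instead of surfacing a bare ModuleNotFoundError.
    try:
        import gradio as gr
    except ImportError as e:
        raise ImportError("gradio is required for UI features. Install it with: pip install orca_sdk[ui]") from e
    return gr
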
@@ -1,12 +1,17 @@
+from __future__ import annotations
+
 import pickle
 from dataclasses import asdict, is_dataclass
 from os import PathLike
-from typing import Any, cast
+from typing import TYPE_CHECKING, Any, cast

 from datasets import Dataset
 from datasets.exceptions import DatasetGenerationError
-from torch.utils.data import DataLoader as TorchDataLoader
-from torch.utils.data import Dataset as TorchDataset
+
+if TYPE_CHECKING:
+    # peer dependencies that are used for types only
+    from torch.utils.data import DataLoader as TorchDataLoader  # type: ignore
+    from torch.utils.data import Dataset as TorchDataset  # type: ignore


 def parse_dict_like(item: Any, column_names: list[str] | None = None) -> dict:
@@ -62,6 +67,9 @@ def hf_dataset_from_torch(
     Returns:
         A HuggingFace Dataset object containing the data from the PyTorch DataLoader or Dataset.
     """
+    # peer dependency that is guaranteed to exist if the user provided a torch dataset
+    from torch.utils.data import DataLoader as TorchDataLoader  # type: ignore
+
     if isinstance(torch_data, TorchDataLoader):
         dataloader = torch_data
     else:
@@ -0,0 +1,91 @@
+import json
+import pickle
+import tempfile
+
+from datasets import Dataset
+
+from .data_parsing import hf_dataset_from_disk
+
+
+def test_hf_dataset_from_disk_pickle_list():
+    with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
+        # Given a pickle file with test data that is a list
+        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
+        with open(temp_file.name, "wb") as f:
+            pickle.dump(test_data, f)
+        dataset = hf_dataset_from_disk(temp_file.name)
+        # Then the HF dataset should be created successfully
+        assert isinstance(dataset, Dataset)
+        assert len(dataset) == 30
+        assert dataset.column_names == ["value", "label"]
+
+
+def test_hf_dataset_from_disk_pickle_dict():
+    with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
+        # Given a pickle file with test data that is a dict
+        test_data = {"value": [f"test_{i}" for i in range(30)], "label": [i % 2 for i in range(30)]}
+        with open(temp_file.name, "wb") as f:
+            pickle.dump(test_data, f)
+        dataset = hf_dataset_from_disk(temp_file.name)
+        # Then the HF dataset should be created successfully
+        assert isinstance(dataset, Dataset)
+        assert len(dataset) == 30
+        assert dataset.column_names == ["value", "label"]
+
+
+def test_hf_dataset_from_disk_json():
+    with tempfile.NamedTemporaryFile(suffix=".json") as temp_file:
+        # Given a JSON file with test data
+        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
+        with open(temp_file.name, "w") as f:
+            json.dump(test_data, f)
+        dataset = hf_dataset_from_disk(temp_file.name)
+        # Then the HF dataset should be created successfully
+        assert isinstance(dataset, Dataset)
+        assert len(dataset) == 30
+        assert dataset.column_names == ["value", "label"]
+
+
+def test_hf_dataset_from_disk_jsonl():
+    with tempfile.NamedTemporaryFile(suffix=".jsonl") as temp_file:
+        # Given a JSONL file with test data
+        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
+        with open(temp_file.name, "w") as f:
+            for item in test_data:
+                f.write(json.dumps(item) + "\n")
+        dataset = hf_dataset_from_disk(temp_file.name)
+        # Then the HF dataset should be created successfully
+        assert isinstance(dataset, Dataset)
+        assert len(dataset) == 30
+        assert dataset.column_names == ["value", "label"]
+
+
+def test_hf_dataset_from_disk_csv():
+    with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
+        # Given a CSV file with test data
+        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
+        with open(temp_file.name, "w") as f:
+            f.write("value,label\n")
+            for item in test_data:
+                f.write(f"{item['value']},{item['label']}\n")
+        dataset = hf_dataset_from_disk(temp_file.name)
+        # Then the HF dataset should be created successfully
+        assert isinstance(dataset, Dataset)
+        assert len(dataset) == 30
+        assert dataset.column_names == ["value", "label"]
+
+
+def test_hf_dataset_from_disk_parquet():
+    with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
+        # Given a Parquet file with test data
+        data = {
+            "value": [f"test_{i}" for i in range(30)],
+            "label": [i % 2 for i in range(30)],
+        }
+        dataset = Dataset.from_dict(data)
+        dataset.to_parquet(temp_file.name)
+        dataset = hf_dataset_from_disk(temp_file.name)
+        # Then the HF dataset should be created successfully
+        assert isinstance(dataset, Dataset)
+        assert len(dataset) == 30
+        assert dataset.column_names == ["value", "label"]
@@ -1,18 +1,17 @@
-import json
-import pickle
-import tempfile
 from collections import namedtuple
 from dataclasses import dataclass

-import pandas as pd
 import pytest
 from datasets import Dataset
 from datasets.exceptions import DatasetGenerationError
-from torch.utils.data import DataLoader as TorchDataLoader
-from torch.utils.data import Dataset as TorchDataset

 from ..conftest import SAMPLE_DATA
-from .data_parsing import hf_dataset_from_disk, hf_dataset_from_torch
+from .data_parsing import hf_dataset_from_torch
+
+pytest.importorskip("torch")
+
+from torch.utils.data import DataLoader as TorchDataLoader  # noqa: E402
+from torch.utils.data import Dataset as TorchDataset  # noqa: E402


 class PytorchDictDataset(TorchDataset):
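
Note: pytest.importorskip("torch") skips the whole module at collection time when torch is missing, which is why the torch imports can safely follow it (the noqa: E402 comments silence the imports-not-at-top warning). A self-contained example of the same idiom, unrelated to orca_sdk's tests:

import pytest

# Skips every test in this module during collection if torch is not installed;
# otherwise returns the imported module.
torch = pytest.importorskip("torch")


def test_tensor_roundtrip():
    assert torch.tensor([1, 2, 3]).tolist() == [1, 2, 3]
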
@@ -26,16 +25,6 @@ class PytorchDictDataset(TorchDataset):
         return len(self.data)


-def test_hf_dataset_from_torch_dict():
-    # Given a Pytorch dataset that returns a dictionary for each item
-    dataset = PytorchDictDataset()
-    hf_dataset = hf_dataset_from_torch(dataset)
-    # Then the HF dataset should be created successfully
-    assert isinstance(hf_dataset, Dataset)
-    assert len(hf_dataset) == len(dataset)
-    assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id", "partition_id"}
-
-
 class PytorchTupleDataset(TorchDataset):
     def __init__(self):
         self.data = SAMPLE_DATA
@@ -47,6 +36,58 @@ class PytorchTupleDataset(TorchDataset):
         return len(self.data)


+DatasetTuple = namedtuple("DatasetTuple", ["value", "label"])
+
+
+class PytorchNamedTupleDataset(TorchDataset):
+    def __init__(self):
+        self.data = SAMPLE_DATA
+
+    def __getitem__(self, i):
+        return DatasetTuple(self.data[i]["value"], self.data[i]["label"])
+
+    def __len__(self):
+        return len(self.data)
+
+
+@dataclass
+class DatasetItem:
+    text: str
+    label: int
+
+
+class PytorchDataclassDataset(TorchDataset):
+    def __init__(self):
+        self.data = SAMPLE_DATA
+
+    def __getitem__(self, i):
+        return DatasetItem(text=self.data[i]["value"], label=self.data[i]["label"])
+
+    def __len__(self):
+        return len(self.data)
+
+
+class PytorchInvalidDataset(TorchDataset):
+    def __init__(self):
+        self.data = SAMPLE_DATA
+
+    def __getitem__(self, i):
+        return [self.data[i]["value"], self.data[i]["label"]]
+
+    def __len__(self):
+        return len(self.data)
+
+
+def test_hf_dataset_from_torch_dict():
+    # Given a Pytorch dataset that returns a dictionary for each item
+    dataset = PytorchDictDataset()
+    hf_dataset = hf_dataset_from_torch(dataset)
+    # Then the HF dataset should be created successfully
+    assert isinstance(hf_dataset, Dataset)
+    assert len(hf_dataset) == len(dataset)
+    assert set(hf_dataset.column_names) == {"value", "label", "key", "score", "source_id", "partition_id"}
+
+
 def test_hf_dataset_from_torch_tuple():
     # Given a Pytorch dataset that returns a tuple for each item
     dataset = PytorchTupleDataset()
@@ -74,20 +115,6 @@ def test_hf_dataset_from_torch_tuple_error_not_enough_columns():
         hf_dataset_from_torch(dataset, column_names=["value"])


-DatasetTuple = namedtuple("DatasetTuple", ["value", "label"])
-
-
-class PytorchNamedTupleDataset(TorchDataset):
-    def __init__(self):
-        self.data = SAMPLE_DATA
-
-    def __getitem__(self, i):
-        return DatasetTuple(self.data[i]["value"], self.data[i]["label"])
-
-    def __len__(self):
-        return len(self.data)
-
-
 def test_hf_dataset_from_torch_named_tuple():
     # Given a Pytorch dataset that returns a namedtuple for each item
     dataset = PytorchNamedTupleDataset()
@@ -99,23 +126,6 @@ def test_hf_dataset_from_torch_named_tuple():
     assert hf_dataset.column_names == ["value", "label"]


-@dataclass
-class DatasetItem:
-    text: str
-    label: int
-
-
-class PytorchDataclassDataset(TorchDataset):
-    def __init__(self):
-        self.data = SAMPLE_DATA
-
-    def __getitem__(self, i):
-        return DatasetItem(text=self.data[i]["value"], label=self.data[i]["label"])
-
-    def __len__(self):
-        return len(self.data)
-
-
 def test_hf_dataset_from_torch_dataclass():
     # Given a Pytorch dataset that returns a dataclass for each item
     dataset = PytorchDataclassDataset()
@@ -126,17 +136,6 @@ def test_hf_dataset_from_torch_dataclass():
     assert hf_dataset.column_names == ["text", "label"]


-class PytorchInvalidDataset(TorchDataset):
-    def __init__(self):
-        self.data = SAMPLE_DATA
-
-    def __getitem__(self, i):
-        return [self.data[i]["value"], self.data[i]["label"]]
-
-    def __len__(self):
-        return len(self.data)
-
-
 def test_hf_dataset_from_torch_invalid_dataset():
     # Given a Pytorch dataset that returns a list for each item
     dataset = PytorchInvalidDataset()
@@ -158,87 +157,3 @@ def test_hf_dataset_from_torchdataloader():
     assert isinstance(hf_dataset, Dataset)
     assert len(hf_dataset) == len(dataset)
     assert hf_dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_pickle_list():
-    with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
-        # Given a pickle file with test data that is a list
-        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-        with open(temp_file.name, "wb") as f:
-            pickle.dump(test_data, f)
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_pickle_dict():
-    with tempfile.NamedTemporaryFile(suffix=".pkl") as temp_file:
-        # Given a pickle file with test data that is a dict
-        test_data = {"value": [f"test_{i}" for i in range(30)], "label": [i % 2 for i in range(30)]}
-        with open(temp_file.name, "wb") as f:
-            pickle.dump(test_data, f)
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_json():
-    with tempfile.NamedTemporaryFile(suffix=".json") as temp_file:
-        # Given a JSON file with test data
-        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-        with open(temp_file.name, "w") as f:
-            json.dump(test_data, f)
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_jsonl():
-    with tempfile.NamedTemporaryFile(suffix=".jsonl") as temp_file:
-        # Given a JSONL file with test data
-        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-        with open(temp_file.name, "w") as f:
-            for item in test_data:
-                f.write(json.dumps(item) + "\n")
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_csv():
-    with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
-        # Given a CSV file with test data
-        test_data = [{"value": f"test_{i}", "label": i % 2} for i in range(30)]
-        with open(temp_file.name, "w") as f:
-            f.write("value,label\n")
-            for item in test_data:
-                f.write(f"{item['value']},{item['label']}\n")
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
-
-
-def test_hf_dataset_from_disk_parquet():
-    with tempfile.NamedTemporaryFile(suffix=".parquet") as temp_file:
-        # Given a Parquet file with test data
-        data = {
-            "value": [f"test_{i}" for i in range(30)],
-            "label": [i % 2 for i in range(30)],
-        }
-        df = pd.DataFrame(data)
-        df.to_parquet(temp_file.name)
-        dataset = hf_dataset_from_disk(temp_file.name)
-        # Then the HF dataset should be created successfully
-        assert isinstance(dataset, Dataset)
-        assert len(dataset) == 30
-        assert dataset.column_names == ["value", "label"]
@@ -5,7 +5,10 @@ import re
 from pathlib import Path
 from typing import TYPE_CHECKING

-import gradio as gr
+try:
+    import gradio as gr  # type: ignore
+except ImportError as e:
+    raise ImportError("gradio is required for UI features. Install it with: pip install orca_sdk[ui]") from e

 from ..memoryset import LabeledMemoryLookup, LabeledMemoryset, ScoredMemoryLookup

@@ -1,27 +1,43 @@
+from __future__ import annotations
+
 import base64
 import io
-from typing import cast
+from typing import TYPE_CHECKING, Any

-import numpy as np
-from numpy.typing import NDArray
-from PIL import Image as pil
+if TYPE_CHECKING:
+    # peer dependencies that are used for types only
+    import numpy as np  # type: ignore
+    from numpy.typing import NDArray  # type: ignore
+    from PIL import Image as pil  # type: ignore

-ValueType = str | pil.Image | NDArray[np.float32]
-"""
-The type of a value in a memoryset
+    ValueType = str | pil.Image | NDArray[np.float32]
+    """
+    The type of a value in a memoryset

-- `str`: string
-- `pil.Image`: image
-- `NDArray[np.float32]`: univariate or multivariate timeseries
-"""
+    - `str`: string
+    - `pil.Image`: image
+    - `NDArray[np.float32]`: univariate or multivariate timeseries
+    """
+else:
+    ValueType = Any


 def decode_value(value: str) -> ValueType:
     if value.startswith("data:image"):
+        try:
+            from PIL import Image as pil  # type: ignore
+        except ImportError as e:
+            raise ImportError("Install Pillow to use image values") from e
+
         header, data = value.split(",", 1)
         return pil.open(io.BytesIO(base64.b64decode(data)))

     if value.startswith("data:numpy"):
+        try:
+            import numpy as np  # type: ignore
+        except ImportError as e:
+            raise ImportError("Install numpy to use timeseries values") from e
+
         header, data = value.split(",", 1)
         return np.load(io.BytesIO(base64.b64decode(data)))

@@ -29,17 +45,28 @@ def decode_value(value: str) -> ValueType:
 
 
 def encode_value(value: ValueType) -> str:
-    if isinstance(value, pil.Image):
-        header = f"data:image/{value.format.lower()};base64," if value.format else "data:image;base64,"
+    try:
+        from PIL import Image as pil  # type: ignore
+    except ImportError:
+        pil = None  # type: ignore[assignment]
+
+    try:
+        import numpy as np  # type: ignore
+    except ImportError:
+        np = None  # type: ignore[assignment]
+
+    if pil is not None and isinstance(value, pil.Image):
+        header = f"data:image/{value.format.lower()};base64," if value.format else "data:image;base64,"  # type: ignore[union-attr]
         buffer = io.BytesIO()
-        value.save(buffer, format=value.format)
+        value.save(buffer, format=value.format)  # type: ignore[union-attr]
         bytes = buffer.getvalue()
         return header + base64.b64encode(bytes).decode("utf-8")

-    if isinstance(value, np.ndarray):
-        header = f"data:numpy/{value.dtype.name};base64,"
+    if np is not None and isinstance(value, np.ndarray):
+        header = f"data:numpy/{value.dtype.name};base64,"  # type: ignore[union-attr]
         buffer = io.BytesIO()
         np.save(buffer, value)
         return header + base64.b64encode(buffer.getvalue()).decode("utf-8")

-    return value
+    # Value is already a string, or an unhandled type (fall back to str conversion)
+    return value if isinstance(value, str) else str(value)
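
Note: a self-contained sketch (not orca_sdk code; assumes numpy is installed) of the data-URI-style round trip that encode_value and decode_value implement for numpy payloads:

import base64
import io

import numpy as np

arr = np.arange(6, dtype=np.float32)

# Encode: "data:numpy/<dtype>;base64,<payload>", mirroring encode_value above.
buffer = io.BytesIO()
np.save(buffer, arr)
encoded = f"data:numpy/{arr.dtype.name};base64," + base64.b64encode(buffer.getvalue()).decode("utf-8")

# Decode: split off the header and load the array back, mirroring decode_value.
header, data = encoded.split(",", 1)
decoded = np.load(io.BytesIO(base64.b64decode(data)))
assert np.allclose(decoded, arr)
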
@@ -1,5 +1,4 @@
-import numpy as np
-from PIL import Image as pil
+import pytest

 from .value_parser import decode_value, encode_value

@@ -13,6 +12,7 @@ def test_string_parsing():


 def test_image_parsing():
+    pil = pytest.importorskip("PIL.Image")
     img = pil.new("RGB", (10, 10), color="red")
     img.format = "PNG"

@@ -22,10 +22,11 @@ def test_image_parsing():

     decoded = decode_value(encoded)
     assert isinstance(decoded, pil.Image)
-    assert decoded.size == img.size
+    assert decoded.size == img.size  # type: ignore[union-attr]


 def test_timeseries_parsing():
+    np = pytest.importorskip("numpy")
     timeseries = np.random.rand(20, 3).astype(np.float32)

     encoded = encode_value(timeseries)
@@ -34,6 +35,6 @@ def test_timeseries_parsing():

     decoded = decode_value(encoded)
     assert isinstance(decoded, np.ndarray)
-    assert decoded.shape == timeseries.shape
-    assert decoded.dtype == timeseries.dtype
+    assert decoded.shape == timeseries.shape  # type: ignore[union-attr]
+    assert decoded.dtype == timeseries.dtype  # type: ignore[union-attr]
     assert np.allclose(decoded, timeseries)