heavyedge-dataset 1.0.0.post0__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: heavyedge-dataset
3
- Version: 1.0.0.post0
3
+ Version: 1.1.0
4
4
  Summary: PyTorch-compatible edge profile dataset API
5
5
  Author-email: Jisoo Song <jeesoo9595@snu.ac.kr>
6
6
  License-Expression: MIT
@@ -28,6 +28,7 @@ Provides-Extra: doc
28
28
  Requires-Dist: sphinx; extra == "doc"
29
29
  Requires-Dist: numpydoc; extra == "doc"
30
30
  Requires-Dist: pydata_sphinx_theme; extra == "doc"
31
+ Requires-Dist: matplotlib; extra == "doc"
31
32
  Provides-Extra: dev
32
33
  Requires-Dist: flake8; extra == "dev"
33
34
  Requires-Dist: black; extra == "dev"
@@ -48,7 +49,7 @@ Package to load edge profile data as PyTorch dataset.
48
49
 
49
50
  ## Usage
50
51
 
51
- HeavyEdge-Dataset provides `ProfileDataset` which wraps profile data file.
52
+ HeavyEdge-Dataset provides dataset classes which wrap profile data file.
52
53
 
53
54
  A simple use case to load two-dimensional coordinates of profiles and their lengths:
54
55
 
@@ -11,7 +11,7 @@ Package to load edge profile data as PyTorch dataset.
11
11
 
12
12
  ## Usage
13
13
 
14
- HeavyEdge-Dataset provides `ProfileDataset` which wraps profile data file.
14
+ HeavyEdge-Dataset provides dataset classes which wrap profile data file.
15
15
 
16
16
  A simple use case to load two-dimensional coordinates of profiles and their lengths:
17
17
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "heavyedge-dataset"
7
- version = "1.0.0.post0"
7
+ version = "1.1.0"
8
8
  authors = [
9
9
  {name = "Jisoo Song", email = "jeesoo9595@snu.ac.kr"}
10
10
  ]
@@ -45,6 +45,7 @@ doc = [
45
45
  "sphinx",
46
46
  "numpydoc",
47
47
  "pydata_sphinx_theme",
48
+ "matplotlib",
48
49
  ]
49
50
  dev = [
50
51
  "flake8",
@@ -0,0 +1,272 @@
1
+ """Package to load edge profile data using PyTorch dataset.
2
+
3
+ Refer to `PyTorch tutorial <tutorial_>`_ for information about custom datasets.
4
+
5
+ .. _tutorial: https://docs.pytorch.org/tutorials/beginner/data_loading_tutorial.html
6
+ """
7
+
8
+ import numbers
9
+ from collections.abc import Sequence
10
+
11
+ import numpy as np
12
+ from heavyedge.api import landmarks_type3
13
+ from torch.utils.data import Dataset
14
+
15
+ __all__ = [
16
+ "ProfileDataset",
17
+ "PseudoLandmarkDataset",
18
+ "MathematicalLandmarkDataset",
19
+ ]
20
+
21
+
22
+ class ProfileDataset(Dataset):
23
+ """Edge profile dataset.
24
+
25
+ Loads data as a tuple of two numpy arrays:
26
+
27
+ 1. Profile data, shape: (N, m, L).
28
+ 2. Length of each profile, shape: (N,).
29
+
30
+ N is the number of loaded data, m is dimension of coordinates, and
31
+ L is the maximum length of profiles.
32
+
33
+ Parameters
34
+ ----------
35
+ file : heavyedge.ProfileData
36
+ Open hdf5 file.
37
+ m : {1, 2}
38
+ Profile data dimension.
39
+ 1 means only y coordinates, and 2 means both x and y coordinates.
40
+ transform : callable, optional
41
+ Optional transformation to be applied on samples.
42
+
43
+ Examples
44
+ --------
45
+ >>> from heavyedge import get_sample_path, ProfileData
46
+ >>> from heavyedge_dataset import ProfileDataset
47
+ >>> with ProfileData(get_sample_path("Prep-Type2.h5")) as file:
48
+ ... profiles, lengths = ProfileDataset(file, m=2)[:]
49
+ >>> profiles.shape
50
+ (22, 2, 3200)
51
+ >>> lengths.shape
52
+ (22,)
53
+ >>> import matplotlib.pyplot as plt # doctest: +SKIP
54
+ ... plt.plot(*profiles.transpose(1, 2, 0))
55
+
56
+ Should the dataset be used for :class:`torch.utils.data.DataLoader`,
57
+ ``collate_fn`` argument should be passed to the data loader.
58
+
59
+ >>> from torch.utils.data import DataLoader
60
+ >>> with ProfileData(get_sample_path("Prep-Type2.h5")) as file:
61
+ ... dataset = ProfileDataset(file, m=2)
62
+ ... loader = DataLoader(dataset, collate_fn=lambda x: x)
63
+ ... profiles, lengths = next(iter(loader))
64
+ >>> profiles.shape
65
+ (1, 2, 3200)
66
+ >>> lengths.shape
67
+ (1,)
68
+
69
+ If data should be loaded as :class:`torch.Tensor`, pass ``transform`` argument.
70
+
71
+ >>> import torch
72
+ >>> def to_tensor(sample):
73
+ ... return (torch.from_numpy(sample[0]), torch.from_numpy(sample[1]))
74
+ >>> with ProfileData(get_sample_path("Prep-Type2.h5")) as file:
75
+ ... dataset = ProfileDataset(file, m=2, transform=to_tensor)
76
+ ... loader = DataLoader(dataset, collate_fn=lambda x: x)
77
+ ... profiles, lengths = next(iter(loader))
78
+ >>> type(profiles)
79
+ <class 'torch.Tensor'>
80
+ """
81
+
82
+ def __init__(self, file, m=1, transform=None):
83
+ self.file = file
84
+ self.m = m
85
+ self.transform = transform
86
+ self.x = file.x()
87
+
88
+ def __len__(self):
89
+ return len(self.file)
90
+
91
+ def __getitem__(self, idx):
92
+ if isinstance(idx, numbers.Integral):
93
+ Y, L, _ = self.file[idx]
94
+ Y = Y[np.newaxis, :]
95
+ else:
96
+ # Support multi-indexing
97
+ idxs = idx
98
+ needs_sort = isinstance(idx, (Sequence, np.ndarray))
99
+ if needs_sort:
100
+ # idxs must be sorted for h5py
101
+ idxs = np.array(idxs)
102
+ sort_idx = np.argsort(idxs)
103
+ idxs = idxs[sort_idx]
104
+ Y, L, _ = self.file[idxs]
105
+ if needs_sort:
106
+ reverse_idx = np.argsort(sort_idx)
107
+ Y = Y[reverse_idx]
108
+ L = L[reverse_idx]
109
+ Y = Y[:, np.newaxis, :]
110
+ if self.m == 1:
111
+ pass
112
+ elif self.m == 2:
113
+ x = np.tile(self.x, Y.shape[:-1] + (1,))
114
+ Y = np.concatenate([x, Y], axis=-2)
115
+ else:
116
+ raise ValueError(f"Unsupported dimension: {self.m} (Must be 1 or 2).")
117
+ ret = (Y, L)
118
+ if self.transform is not None:
119
+ ret = self.transform(ret)
120
+ return ret
121
+
122
+ def __getitems__(self, idxs):
123
+ # PyTorch API
124
+ return self.__getitem__(idxs)
125
+
126
+
127
+ class PseudoLandmarkDataset(Dataset):
128
+ """Dataset for pseudo-landmarks of edge profiles.
129
+
130
+ Parameters
131
+ ----------
132
+ file : heavyedge.ProfileData
133
+ Open hdf5 file.
134
+ m : {1, 2}
135
+ Dimension of landmark coordinates.
136
+ k : int
137
+ Number of landmarks to sample.
138
+ transform : callable, optional
139
+ Optional transformation to be applied on samples.
140
+
141
+ Examples
142
+ --------
143
+ >>> from heavyedge import ProfileData, get_sample_path
144
+ >>> from heavyedge_dataset import PseudoLandmarkDataset
145
+ >>> with ProfileData(get_sample_path("Prep-Type1.h5")) as file:
146
+ ... dataset = PseudoLandmarkDataset(file, 1, 10)
147
+ ... data = dataset[:]
148
+ >>> data.shape
149
+ (18, 1, 10)
150
+ >>> import matplotlib.pyplot as plt # doctest: +SKIP
151
+ ... plt.plot(*data.transpose(1, 2, 0))
152
+
153
+ Because sampling pseudo-landmark requires loading full profile data,
154
+ loading a large dataset can cause memory failure even if the output data is manageable.
155
+ This can be avoided by batched loading with :class:`torch.utils.data.DataLoader`.
156
+
157
+ >>> import numpy as np
158
+ >>> from torch.utils.data import DataLoader
159
+ >>> with ProfileData(get_sample_path("Prep-Type1.h5")) as file:
160
+ ... dataset = PseudoLandmarkDataset(file, 1, 10)
161
+ ... loader = DataLoader(dataset, batch_size=10)
162
+ ... data = np.concatenate(list(loader))
163
+ >>> data.shape
164
+ (18, 1, 10)
165
+ """
166
+
167
+ def __init__(self, file, m, k, transform=None):
168
+ self.profiles = ProfileDataset(file, m=m)
169
+ self.k = k
170
+ self.transform = transform
171
+
172
+ def __len__(self):
173
+ return len(self.profiles)
174
+
175
+ def __getitem__(self, idx):
176
+ if isinstance(idx, numbers.Integral):
177
+ Y, L = self.profiles[idx]
178
+ Ys, Ls = [Y], [L]
179
+ else:
180
+ Ys, Ls = self.profiles[idx]
181
+
182
+ X = []
183
+ for Y, L in zip(Ys, Ls):
184
+ idxs = np.linspace(0, L - 1, self.k, dtype=int)
185
+ X.append(Y[:, idxs])
186
+ ret = np.array(X)
187
+ if self.transform is not None:
188
+ ret = self.transform(ret)
189
+ return ret
190
+
191
+ def __getitems__(self, idxs):
192
+ # PyTorch API
193
+ return self.__getitem__(idxs)
194
+
195
+
196
+ class MathematicalLandmarkDataset(Dataset):
197
+ """Dataset for mathematical landmarks of edge profiles.
198
+
199
+ Loads data as a tuple of two numpy arrays:
200
+
201
+ 1. Landmark coordinates, shape: (N, m, 4).
202
+ 2. Average plateau height, shape: (N,).
203
+
204
+ N is the number of loaded data and m is dimension of coordinates.
205
+
206
+ Parameters
207
+ ----------
208
+ file : heavyedge.ProfileData
209
+ Open hdf5 file.
210
+ m : {1, 2}
211
+ Dimension of landmark coordinates.
212
+ sigma : scalar
213
+ Standard deviation of Gaussian kernel for landmark detection.
214
+ transform : callable, optional
215
+ Optional transformation to be applied on samples.
216
+
217
+ Examples
218
+ --------
219
+ >>> from heavyedge import ProfileData, get_sample_path
220
+ >>> from heavyedge_dataset import MathematicalLandmarkDataset
221
+ >>> with ProfileData(get_sample_path("Prep-Type3.h5")) as file:
222
+ ... dataset = MathematicalLandmarkDataset(file, 2, 32)
223
+ ... landmarks, height = dataset[:]
224
+ >>> landmarks.shape
225
+ (35, 2, 4)
226
+ >>> height.shape
227
+ (35,)
228
+
229
+ Because sampling mathematical landmark requires loading full profile data,
230
+ loading a large dataset can cause memory failure even if the output data is manageable.
231
+ This can be avoided by batched loading with :class:`torch.utils.data.DataLoader`.
232
+ Note that ``collate_fn`` argument should be passed to the data loader.
233
+
234
+ >>> import numpy as np
235
+ >>> from torch.utils.data import DataLoader
236
+ >>> with ProfileData(get_sample_path("Prep-Type3.h5")) as file:
237
+ ... dataset = MathematicalLandmarkDataset(file, 2, 32)
238
+ ... loader = DataLoader(dataset, batch_size=10, collate_fn=lambda x: x)
239
+ ... landmarks = np.concatenate([lm for lm, _ in loader])
240
+ >>> landmarks.shape
241
+ (35, 2, 4)
242
+ """
243
+
244
+ def __init__(self, file, m, sigma, transform=None):
245
+ self.profiles = ProfileDataset(file, m=m)
246
+ self.sigma = sigma
247
+ self.transform = transform
248
+
249
+ def __len__(self):
250
+ return len(self.profiles)
251
+
252
+ def __getitem__(self, idx):
253
+ if isinstance(idx, numbers.Integral):
254
+ Y, L = self.profiles[idx]
255
+ Ys, Ls = [Y], [L]
256
+ else:
257
+ Ys, Ls = self.profiles[idx]
258
+
259
+ X, H = [], []
260
+ for Y, L in zip(Ys, Ls):
261
+ idxs = np.flip(landmarks_type3(Y[-1, :L], self.sigma))
262
+ X.append(Y[:, idxs])
263
+ H.append(np.mean(Y[-1, : idxs[0]]))
264
+
265
+ ret = np.array(X), np.array(H)
266
+ if self.transform is not None:
267
+ ret = self.transform(ret)
268
+ return ret
269
+
270
+ def __getitems__(self, idxs):
271
+ # PyTorch API
272
+ return self.__getitem__(idxs)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: heavyedge-dataset
3
- Version: 1.0.0.post0
3
+ Version: 1.1.0
4
4
  Summary: PyTorch-compatible edge profile dataset API
5
5
  Author-email: Jisoo Song <jeesoo9595@snu.ac.kr>
6
6
  License-Expression: MIT
@@ -28,6 +28,7 @@ Provides-Extra: doc
28
28
  Requires-Dist: sphinx; extra == "doc"
29
29
  Requires-Dist: numpydoc; extra == "doc"
30
30
  Requires-Dist: pydata_sphinx_theme; extra == "doc"
31
+ Requires-Dist: matplotlib; extra == "doc"
31
32
  Provides-Extra: dev
32
33
  Requires-Dist: flake8; extra == "dev"
33
34
  Requires-Dist: black; extra == "dev"
@@ -48,7 +49,7 @@ Package to load edge profile data as PyTorch dataset.
48
49
 
49
50
  ## Usage
50
51
 
51
- HeavyEdge-Dataset provides `ProfileDataset` which wraps profile data file.
52
+ HeavyEdge-Dataset provides dataset classes which wrap profile data file.
52
53
 
53
54
  A simple use case to load two-dimensional coordinates of profiles and their lengths:
54
55
 
@@ -11,6 +11,7 @@ heavyedge-dataset[doc,test]
11
11
  sphinx
12
12
  numpydoc
13
13
  pydata_sphinx_theme
14
+ matplotlib
14
15
 
15
16
  [test]
16
17
  pytest
@@ -1,119 +0,0 @@
1
- """Package to load edge profile data using PyTorch dataset.
2
-
3
- Refer to `PyTorch tutorial <tutorial>`_ for information about custom dataset.
4
-
5
- .. _tutorial: https://docs.pytorch.org/tutorials/beginner/data_loading_tutorial.html
6
- """
7
-
8
- import numbers
9
- from collections.abc import Sequence
10
-
11
- import numpy as np
12
- from torch.utils.data import Dataset
13
-
14
- __all__ = [
15
- "ProfileDataset",
16
- ]
17
-
18
-
19
- class ProfileDataset(Dataset):
20
- """Edge profile dataset.
21
-
22
- Loads data as a tuple of two numpy arrays:
23
-
24
- 1. Profile data, shape: (N, m, L).
25
- 2. Length of each profile, shape: (N,).
26
-
27
- N is the number of loaded data, m is dimension of coordinates, and
28
- L is the maximum length of profiles.
29
-
30
- Parameters
31
- ----------
32
- file : heavyedge.ProfileData
33
- Open hdf5 file.
34
- m : {1, 2}
35
- Profile data dimension.
36
- 1 means only y coordinates, and 2 means both x and y coordinates.
37
- transform : callable, optional
38
- Optional transformation to be applied on samples.
39
-
40
- Examples
41
- --------
42
- >>> from heavyedge import get_sample_path, ProfileData
43
- >>> from heavyedge_dataset import ProfileDataset
44
- >>> with ProfileData(get_sample_path("Prep-Type2.h5")) as file:
45
- ... profiles, lengths = ProfileDataset(file, m=2)[:]
46
- >>> profiles.shape
47
- (22, 2, 3200)
48
- >>> lengths.shape
49
- (22,)
50
-
51
- Should this dataset be used for :class:`torch.utils.data.DataLoader`,
52
- ``collate_fn`` argument should be passed to the data loader.
53
-
54
- >>> from torch.utils.data import DataLoader
55
- >>> with ProfileData(get_sample_path("Prep-Type2.h5")) as file:
56
- ... dataset = ProfileDataset(file, m=2)
57
- ... loader = DataLoader(dataset, collate_fn=lambda x: x)
58
- ... profiles, lengths = next(iter(loader))
59
- >>> profiles.shape
60
- (1, 2, 3200)
61
- >>> lengths.shape
62
- (1,)
63
-
64
- If data should be loaded as :class:`torch.Tensor`, pass ``transform`` argument.
65
-
66
- >>> import torch
67
- >>> def to_tensor(sample):
68
- ... return (torch.from_numpy(sample[0]), torch.from_numpy(sample[1]))
69
- >>> with ProfileData(get_sample_path("Prep-Type2.h5")) as file:
70
- ... dataset = ProfileDataset(file, m=2, transform=to_tensor)
71
- ... loader = DataLoader(dataset, collate_fn=lambda x: x)
72
- ... profiles, lengths = next(iter(loader))
73
- >>> type(profiles)
74
- <class 'torch.Tensor'>
75
- """
76
-
77
- def __init__(self, file, m=1, transform=None):
78
- self.file = file
79
- self.m = m
80
- self.transform = transform
81
- self.x = file.x()
82
-
83
- def __len__(self):
84
- return len(self.file)
85
-
86
- def __getitem__(self, idx):
87
- if isinstance(idx, numbers.Integral):
88
- Y, L, _ = self.file[idx]
89
- Y = Y[np.newaxis, :]
90
- else:
91
- # Support multi-indexing
92
- idxs = idx
93
- needs_sort = isinstance(idx, (Sequence, np.ndarray))
94
- if needs_sort:
95
- # idxs must be sorted for h5py
96
- idxs = np.array(idxs)
97
- sort_idx = np.argsort(idxs)
98
- idxs = idxs[sort_idx]
99
- Y, L, _ = self.file[idxs]
100
- if needs_sort:
101
- reverse_idx = np.argsort(sort_idx)
102
- Y = Y[reverse_idx]
103
- L = L[reverse_idx]
104
- Y = Y[:, np.newaxis, :]
105
- if self.m == 1:
106
- pass
107
- elif self.m == 2:
108
- x = np.tile(self.x, Y.shape[:-1] + (1,))
109
- Y = np.concatenate([x, Y], axis=-2)
110
- else:
111
- raise ValueError(f"Unsupported dimension: {self.m} (Must be 1 or 2).")
112
- ret = (Y, L)
113
- if self.transform is not None:
114
- ret = self.transform(ret)
115
- return ret
116
-
117
- def __getitems__(self, idxs):
118
- # PyTorch API
119
- return self.__getitem__(idxs)