simcats-datasets 2.4.0__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- simcats_datasets/__init__.py +1 -1
- simcats_datasets/generation/_create_dataset.py +118 -30
- simcats_datasets/generation/_create_simulated_dataset.py +168 -69
- simcats_datasets/loading/_load_dataset.py +67 -14
- simcats_datasets/loading/load_ground_truth.py +219 -1
- simcats_datasets/loading/pytorch.py +78 -29
- simcats_datasets/support_functions/clip_line_to_rectangle.py +15 -4
- simcats_datasets/support_functions/convert_lines.py +34 -0
- simcats_datasets/support_functions/data_preprocessing.py +112 -1
- simcats_datasets/support_functions/get_coulomb_oscillation_area_boundaries.py +471 -0
- simcats_datasets/support_functions/metadata_utils.py +62 -0
- simcats_datasets/support_functions/pytorch_format_output.py +61 -62
- simcats_datasets-2.6.0.dist-info/METADATA +163 -0
- simcats_datasets-2.6.0.dist-info/RECORD +22 -0
- {simcats_datasets-2.4.0.dist-info → simcats_datasets-2.6.0.dist-info}/WHEEL +1 -1
- simcats_datasets-2.4.0.dist-info/METADATA +0 -837
- simcats_datasets-2.4.0.dist-info/RECORD +0 -20
- {simcats_datasets-2.4.0.dist-info → simcats_datasets-2.6.0.dist-info/licenses}/LICENSE +0 -0
- {simcats_datasets-2.4.0.dist-info → simcats_datasets-2.6.0.dist-info}/top_level.txt +0 -0
simcats_datasets/__init__.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
__all__ = []
|
|
2
|
-
__version__ = "2.
|
|
2
|
+
__version__ = "2.6.0"
|
|
@@ -16,10 +16,13 @@ __all__ = []
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
def create_dataset(dataset_path: str,
|
|
19
|
-
csds: List[np.ndarray],
|
|
19
|
+
csds: Optional[List[np.ndarray]] = None,
|
|
20
|
+
sensor_scans: Optional[List[np.ndarray]] = None,
|
|
20
21
|
occupations: Optional[List[np.ndarray]] = None,
|
|
21
22
|
tct_masks: Optional[List[np.ndarray]] = None,
|
|
22
23
|
ct_by_dot_masks: Optional[List[np.ndarray]] = None,
|
|
24
|
+
sensor_regime_masks: Optional[List[np.ndarray]] = None,
|
|
25
|
+
sensor_peak_center_masks: Optional[List[np.ndarray]] = None,
|
|
23
26
|
line_coordinates: Optional[List[np.ndarray]] = None,
|
|
24
27
|
line_labels: Optional[List[dict]] = None,
|
|
25
28
|
metadata: Optional[List[dict]] = None,
|
|
@@ -27,60 +30,103 @@ def create_dataset(dataset_path: str,
|
|
|
27
30
|
max_len_line_labels_chunk: Optional[int] = None,
|
|
28
31
|
max_len_metadata_chunk: Optional[int] = None,
|
|
29
32
|
dtype_csd: np.dtype = np.float32,
|
|
33
|
+
dtype_sensor_scan: np.dtype = np.float32,
|
|
30
34
|
dtype_occ: np.dtype = np.float32,
|
|
31
35
|
dtype_tct: np.dtype = np.uint8,
|
|
32
36
|
dtype_ct_by_dot: np.dtype = np.uint8,
|
|
37
|
+
dtype_sensor_regime_masks: np.dtype = np.uint8,
|
|
38
|
+
dtype_sensor_peak_center_masks: np.dtype = np.uint8,
|
|
33
39
|
dtype_line_coordinates: np.dtype = np.float32) -> None:
|
|
34
40
|
"""Function for creating simcats_datasets v2 format datasets from given data.
|
|
35
41
|
|
|
36
42
|
Args:
|
|
37
43
|
dataset_path: The path where the new (v2) HDF5 dataset will be stored.
|
|
38
|
-
csds: The list of CSDs to use for creating the dataset.
|
|
44
|
+
csds: The list of CSDs to use for creating the dataset. A dataset can have either CSDs or sensor scans, but
|
|
45
|
+
never both. Default is None.
|
|
46
|
+
sensor_scans: The list of sensor scans to use for creating the dataset. A dataset can have either CSDs or sensor
|
|
47
|
+
scans, but never both. Default is None.
|
|
39
48
|
occupations: List of occupations to use for creating the dataset. Defaults to None.
|
|
40
49
|
tct_masks: List of TCT masks to use for creating the dataset. Defaults to None.
|
|
41
50
|
ct_by_dot_masks: List of CT by dot masks to use for creating the dataset. Defaults to None.
|
|
51
|
+
sensor_regime_masks: List of sensor regime masks to use for creating the dataset. Defaults to None.
|
|
52
|
+
sensor_peak_center_masks: List of sensor peak center masks to use for creating the dataset. Defaults to None.
|
|
42
53
|
line_coordinates: List of line coordinates to use for creating the dataset. Defaults to None.
|
|
43
54
|
line_labels: List of line labels to use for creating the dataset. Defaults to None.
|
|
44
55
|
metadata: List of metadata to use for creating the dataset. Defaults to None.
|
|
45
56
|
max_len_line_coordinates_chunk: The expected maximal length for line coordinates in number of float values (each
|
|
46
|
-
line requires 4 floats). If None, it is set to the largest value of the CSD shape. Default
|
|
57
|
+
line requires 4 floats). If None, it is set to the largest value of the CSD (or sensor scan) shape. Default
|
|
58
|
+
is None.
|
|
47
59
|
max_len_line_labels_chunk: The expected maximal length for line labels in number of uint8/char values (each line
|
|
48
60
|
label, encoded as utf-8 json, should require at most 80 chars). If None, it is set to the largest value of
|
|
49
|
-
the CSD shape * 20 (matching with allowed number of line coords). Default is None.
|
|
61
|
+
the CSD (or sensor scan) shape * 20 (matching with allowed number of line coords). Default is None.
|
|
50
62
|
max_len_metadata_chunk: The expected maximal length for metadata in number of uint8/char values (each metadata
|
|
51
63
|
dict, encoded as utf-8 json, should require at most 8000 chars, expected rather something like 4000, but
|
|
52
64
|
could get larger for dot jumps metadata of high resolution scans). If None, it is set to 8000. Default is
|
|
53
65
|
None.
|
|
54
66
|
dtype_csd: Specifies the dtype to be used for saving CSDs. Default is np.float32.
|
|
67
|
+
dtype_sensor_scan: Specifies the dtype to be used for saving sensor scans. Default is np.float32.
|
|
55
68
|
dtype_occ: Specifies the dtype to be used for saving Occupations. Default is np.float32.
|
|
56
69
|
dtype_tct: Specifies the dtype to be used for saving TCTs. Default is np.uint8.
|
|
57
70
|
dtype_ct_by_dot: Specifies the dtype to be used for saving CT by dot masks. Default is np.uint8.
|
|
71
|
+
dtype_sensor_regime_masks: Specifies the dtype to be used for saving sensor regime masks. Default is np.uint8.
|
|
72
|
+
dtype_sensor_peak_center_masks: Specifies the dtype to be used for saving sensor peak center masks. Default is
|
|
73
|
+
np.uint8.
|
|
58
74
|
dtype_line_coordinates: Specifies the dtype to be used for saving line coordinates. Default is np.float32.
|
|
59
75
|
"""
|
|
60
76
|
# Create path where the dataset will be saved (if folder doesn't exist already)
|
|
61
77
|
Path(dirname(dataset_path)).mkdir(parents=True, exist_ok=True)
|
|
62
78
|
|
|
79
|
+
# check if the dataset to be created is a csd or sensor_scan dataset
|
|
80
|
+
if csds is not None and sensor_scans is None:
|
|
81
|
+
csd_dataset = True
|
|
82
|
+
elif csds is None and sensor_scans is not None:
|
|
83
|
+
csd_dataset = False
|
|
84
|
+
else:
|
|
85
|
+
raise ValueError("A dataset can contain either CSDs or sensor scans but never both! Exactly one of the two has "
|
|
86
|
+
"to be None.")
|
|
87
|
+
|
|
63
88
|
with h5py.File(dataset_path, "a") as hdf5_file:
|
|
64
89
|
# get the number of total ids. This is especially required if a large dataset is loaded and saved step by step
|
|
65
|
-
|
|
90
|
+
if csd_dataset:
|
|
91
|
+
num_ids = len(csds)
|
|
92
|
+
else:
|
|
93
|
+
num_ids = len(sensor_scans)
|
|
66
94
|
|
|
67
|
-
#
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
95
|
+
# get a temp copy of a csd or sensor scan (to get the shape) and retrieve the corresponding HDF5 dataset
|
|
96
|
+
if csd_dataset:
|
|
97
|
+
# process CSDs
|
|
98
|
+
# save an example CSD to get shape and dtype
|
|
99
|
+
temp_data = csds[0].copy()
|
|
100
|
+
# use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to
|
|
101
|
+
# load one image at a time during training)
|
|
102
|
+
ds = hdf5_file.require_dataset(name='csds',
|
|
103
|
+
shape=(0, *temp_data.shape),
|
|
104
|
+
dtype=dtype_csd,
|
|
105
|
+
maxshape=(None, *temp_data.shape))
|
|
106
|
+
else:
|
|
107
|
+
# process sensor scans
|
|
108
|
+
# save an example sensor scan to get shape and dtype
|
|
109
|
+
temp_data = sensor_scans[0].copy()
|
|
110
|
+
# use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to
|
|
111
|
+
# load one image at a time during training)
|
|
112
|
+
ds = hdf5_file.require_dataset(name='sensor_scans',
|
|
113
|
+
shape=(0, *temp_data.shape),
|
|
114
|
+
dtype=dtype_sensor_scan,
|
|
115
|
+
maxshape=(None, *temp_data.shape))
|
|
74
116
|
# determine index offset if there is already data in the dataset
|
|
75
117
|
id_offset = ds.shape[0]
|
|
76
118
|
# resize datasets to fit new data
|
|
77
119
|
ds.resize(ds.shape[0] + num_ids, axis=0)
|
|
78
|
-
|
|
120
|
+
# Add new CSDs or sensor scans to the dataset
|
|
121
|
+
if csd_dataset:
|
|
122
|
+
ds[id_offset:] = np.array(csds).astype(dtype_csd)
|
|
123
|
+
else:
|
|
124
|
+
ds[id_offset:] = np.array(sensor_scans).astype(dtype_sensor_scan)
|
|
79
125
|
if occupations is not None:
|
|
80
126
|
if len(occupations) != num_ids:
|
|
81
127
|
raise ValueError(
|
|
82
|
-
f"Number of new occupation arrays ({len(occupations)}) does not match the number of new CSDs "
|
|
83
|
-
f"({num_ids}).")
|
|
128
|
+
f"Number of new occupation arrays ({len(occupations)}) does not match the number of new CSDs or "
|
|
129
|
+
f"sensor scans ({num_ids}).")
|
|
84
130
|
# process Occupations
|
|
85
131
|
# save an example occ to get shape
|
|
86
132
|
temp_occ = occupations[0].copy()
|
|
@@ -91,15 +137,15 @@ def create_dataset(dataset_path: str,
|
|
|
91
137
|
if ds.shape[0] != id_offset:
|
|
92
138
|
raise ValueError(
|
|
93
139
|
f"Number of already stored occupation arrays ({ds.shape[0]}) does not match the number of already "
|
|
94
|
-
f"stored CSDs ({id_offset}).")
|
|
140
|
+
f"stored CSDs or sensor scans ({id_offset}).")
|
|
95
141
|
# resize datasets to fit new data
|
|
96
142
|
ds.resize(ds.shape[0] + num_ids, axis=0)
|
|
97
143
|
ds[id_offset:] = np.array(occupations).astype(dtype_occ)
|
|
98
144
|
if tct_masks is not None:
|
|
99
145
|
if len(tct_masks) != num_ids:
|
|
100
146
|
raise ValueError(
|
|
101
|
-
f"Number of new TCT mask arrays ({len(tct_masks)}) does not match the number of new CSDs "
|
|
102
|
-
f"({num_ids}).")
|
|
147
|
+
f"Number of new TCT mask arrays ({len(tct_masks)}) does not match the number of new CSDs or sensor "
|
|
148
|
+
f"scans ({num_ids}).")
|
|
103
149
|
# process tct masks
|
|
104
150
|
# save an example tct to get shape and dtype
|
|
105
151
|
temp_tct = tct_masks[0].copy()
|
|
@@ -110,7 +156,7 @@ def create_dataset(dataset_path: str,
|
|
|
110
156
|
if ds.shape[0] != id_offset:
|
|
111
157
|
raise ValueError(
|
|
112
158
|
f"Number of already stored TCT mask arrays ({ds.shape[0]}) does not match the number of already "
|
|
113
|
-
f"stored CSDs ({id_offset}).")
|
|
159
|
+
f"stored CSDs or sensor scans ({id_offset}).")
|
|
114
160
|
# resize datasets to fit new data
|
|
115
161
|
ds.resize(ds.shape[0] + num_ids, axis=0)
|
|
116
162
|
ds[id_offset:] = np.array(tct_masks).astype(dtype_tct)
|
|
@@ -118,7 +164,7 @@ def create_dataset(dataset_path: str,
|
|
|
118
164
|
if len(ct_by_dot_masks) != num_ids:
|
|
119
165
|
raise ValueError(
|
|
120
166
|
f"Number of new CT by dot mask arrays ({len(ct_by_dot_masks)}) does not match the number of new "
|
|
121
|
-
f"CSDs ({num_ids}).")
|
|
167
|
+
f"CSDs or sensor scans ({num_ids}).")
|
|
122
168
|
# process CT by dot masks
|
|
123
169
|
# save an example CT by dot mask to get shape and dtype
|
|
124
170
|
temp_ct_by_dot = ct_by_dot_masks[0].copy()
|
|
@@ -129,19 +175,60 @@ def create_dataset(dataset_path: str,
|
|
|
129
175
|
if ds.shape[0] != id_offset:
|
|
130
176
|
raise ValueError(
|
|
131
177
|
f"Number of already stored CT by dot mask arrays ({ds.shape[0]}) does not match the number of "
|
|
132
|
-
f"already stored CSDs ({id_offset}).")
|
|
178
|
+
f"already stored CSDs or sensor scans ({id_offset}).")
|
|
133
179
|
# resize datasets to fit new data
|
|
134
180
|
ds.resize(ds.shape[0] + num_ids, axis=0)
|
|
135
181
|
ds[id_offset:] = np.array(ct_by_dot_masks).astype(dtype_tct)
|
|
182
|
+
if sensor_regime_masks is not None:
|
|
183
|
+
if len(sensor_regime_masks) != num_ids:
|
|
184
|
+
raise ValueError(
|
|
185
|
+
f"Number of new sensor regime mask arrays ({len(sensor_regime_masks)}) does not match the number "
|
|
186
|
+
f"of new CSDs or sensor scans ({num_ids}).")
|
|
187
|
+
# process sensor regime masks
|
|
188
|
+
# save an example sensor regime mask to get shape and dtype
|
|
189
|
+
temp_sensor_regime_mask = sensor_regime_masks[0].copy()
|
|
190
|
+
# use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to
|
|
191
|
+
# load one image at a time during training)
|
|
192
|
+
ds = hdf5_file.require_dataset(name='sensor_regime_masks', shape=(0, *temp_sensor_regime_mask.shape),
|
|
193
|
+
dtype=dtype_sensor_regime_masks,
|
|
194
|
+
maxshape=(None, *temp_sensor_regime_mask.shape))
|
|
195
|
+
if ds.shape[0] != id_offset:
|
|
196
|
+
raise ValueError(
|
|
197
|
+
f"Number of already stored sensor regime mask arrays ({ds.shape[0]}) does not match the number of "
|
|
198
|
+
f"already stored CSDs or sensor scans ({id_offset}).")
|
|
199
|
+
# resize datasets to fit new data
|
|
200
|
+
ds.resize(ds.shape[0] + num_ids, axis=0)
|
|
201
|
+
ds[id_offset:] = np.array(sensor_regime_masks).astype(dtype_sensor_regime_masks)
|
|
202
|
+
if sensor_peak_center_masks is not None:
|
|
203
|
+
if len(sensor_peak_center_masks) != num_ids:
|
|
204
|
+
raise ValueError(
|
|
205
|
+
f"Number of new sensor peak center mask arrays ({len(sensor_peak_center_masks)}) does not match "
|
|
206
|
+
f"the number of new CSDs or sensor scans ({num_ids}).")
|
|
207
|
+
# process sensor peak center masks
|
|
208
|
+
# save an example sensor peak center mask to get shape and dtype
|
|
209
|
+
temp_sensor_peak_center_mask = sensor_peak_center_masks[0].copy()
|
|
210
|
+
# use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to
|
|
211
|
+
# load one image at a time during training)
|
|
212
|
+
ds = hdf5_file.require_dataset(name='sensor_peak_center_masks',
|
|
213
|
+
shape=(0, *temp_sensor_peak_center_mask.shape),
|
|
214
|
+
dtype=dtype_sensor_peak_center_masks,
|
|
215
|
+
maxshape=(None, *temp_sensor_peak_center_mask.shape))
|
|
216
|
+
if ds.shape[0] != id_offset:
|
|
217
|
+
raise ValueError(
|
|
218
|
+
f"Number of already stored sensor peak center mask arrays ({ds.shape[0]}) does not match the "
|
|
219
|
+
f"number of already stored CSDs or sensor scans ({id_offset}).")
|
|
220
|
+
# resize datasets to fit new data
|
|
221
|
+
ds.resize(ds.shape[0] + num_ids, axis=0)
|
|
222
|
+
ds[id_offset:] = np.array(sensor_peak_center_masks).astype(dtype_sensor_peak_center_masks)
|
|
136
223
|
if line_coordinates is not None:
|
|
137
224
|
if len(line_coordinates) != num_ids:
|
|
138
225
|
raise ValueError(
|
|
139
226
|
f"Number of new line coordinates ({len(line_coordinates)}) does not match the number of new "
|
|
140
|
-
f"CSDs ({num_ids}).")
|
|
227
|
+
f"CSDs or sensor scans ({num_ids}).")
|
|
141
228
|
# retrieve fixed length for chunks
|
|
142
229
|
if max_len_line_coordinates_chunk is None:
|
|
143
230
|
# calculate max expected length (max_number_of_lines * 4 entries, max number estimated as max(shape)/4)
|
|
144
|
-
max_len = max(
|
|
231
|
+
max_len = max(temp_data.shape)
|
|
145
232
|
else:
|
|
146
233
|
max_len = max_len_line_coordinates_chunk
|
|
147
234
|
# use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to
|
|
@@ -151,7 +238,7 @@ def create_dataset(dataset_path: str,
|
|
|
151
238
|
if ds.shape[0] != id_offset:
|
|
152
239
|
raise ValueError(
|
|
153
240
|
f"Number of already stored line coordinates ({ds.shape[0]}) does not match the number of already "
|
|
154
|
-
f"stored CSDs ({id_offset}).")
|
|
241
|
+
f"stored CSDs or sensor scans ({id_offset}).")
|
|
155
242
|
# resize datasets to fit new data
|
|
156
243
|
ds.resize(ds.shape[0] + num_ids, axis=0)
|
|
157
244
|
# process line coordinates
|
|
@@ -163,13 +250,13 @@ def create_dataset(dataset_path: str,
|
|
|
163
250
|
if line_labels is not None:
|
|
164
251
|
if len(line_labels) != num_ids:
|
|
165
252
|
raise ValueError(
|
|
166
|
-
f"Number of new line labels ({len(line_labels)}) does not match the number of new CSDs "
|
|
167
|
-
f"({num_ids}).")
|
|
253
|
+
f"Number of new line labels ({len(line_labels)}) does not match the number of new CSDs or sensor "
|
|
254
|
+
f"scans ({num_ids}).")
|
|
168
255
|
# retrieve fixed length for chunks
|
|
169
256
|
if max_len_line_labels_chunk is None:
|
|
170
257
|
# calculate max expected length (max_number_of_lines * 80 uint8 numbers, max number estimated as
|
|
171
258
|
# max(shape)/4)
|
|
172
|
-
max_len = max(
|
|
259
|
+
max_len = max(temp_data.shape) * 20
|
|
173
260
|
else:
|
|
174
261
|
max_len = max_len_line_labels_chunk
|
|
175
262
|
# use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to
|
|
@@ -179,7 +266,7 @@ def create_dataset(dataset_path: str,
|
|
|
179
266
|
if ds.shape[0] != id_offset:
|
|
180
267
|
raise ValueError(
|
|
181
268
|
f"Number of already stored line labels ({ds.shape[0]}) does not match the number of already stored "
|
|
182
|
-
f"CSDs ({id_offset}).")
|
|
269
|
+
f"CSDs or sensor scans ({id_offset}).")
|
|
183
270
|
# resize datasets to fit new data
|
|
184
271
|
ds.resize(ds.shape[0] + num_ids, axis=0)
|
|
185
272
|
# process line labels
|
|
@@ -193,7 +280,8 @@ def create_dataset(dataset_path: str,
|
|
|
193
280
|
if metadata is not None:
|
|
194
281
|
if len(metadata) != num_ids:
|
|
195
282
|
raise ValueError(
|
|
196
|
-
f"Number of new metadata ({len(metadata)}) does not match the number of new CSDs
|
|
283
|
+
f"Number of new metadata ({len(metadata)}) does not match the number of new CSDs or sensor scans "
|
|
284
|
+
f"({num_ids}).")
|
|
197
285
|
# retrieve fixed length for chunks
|
|
198
286
|
if max_len_metadata_chunk is None:
|
|
199
287
|
# set len to 8000 uint8 numbers, that should already include some extra safety (expected smth. like
|
|
@@ -208,7 +296,7 @@ def create_dataset(dataset_path: str,
|
|
|
208
296
|
if ds.shape[0] != id_offset:
|
|
209
297
|
raise ValueError(
|
|
210
298
|
f"Number of already stored metadata ({ds.shape[0]}) does not match the number of already stored "
|
|
211
|
-
f"CSDs ({id_offset}).")
|
|
299
|
+
f"CSDs or sensor scans ({id_offset}).")
|
|
212
300
|
# resize datasets to fit new data
|
|
213
301
|
ds.resize(ds.shape[0] + num_ids, axis=0)
|
|
214
302
|
# process metadata
|
|
@@ -16,6 +16,14 @@ import numpy as np
|
|
|
16
16
|
|
|
17
17
|
# parallel
|
|
18
18
|
from parallelbar import progress_imap
|
|
19
|
+
from tqdm import tqdm
|
|
20
|
+
|
|
21
|
+
from simcats_datasets.loading import load_dataset
|
|
22
|
+
from simcats_datasets.loading.load_ground_truth import load_ct_by_dot_masks
|
|
23
|
+
# label creation based on line intersection
|
|
24
|
+
from simcats_datasets.support_functions.get_lead_transition_labels import get_lead_transition_labels
|
|
25
|
+
from simcats_datasets.support_functions.get_coulomb_oscillation_area_boundaries import get_coulomb_oscillation_area_boundaries
|
|
26
|
+
from simcats_datasets.support_functions._json_encoders import NumpyEncoder
|
|
19
27
|
|
|
20
28
|
# for SimCATS simulation
|
|
21
29
|
from simcats import Simulation, default_configs
|
|
@@ -25,27 +33,22 @@ from simcats.support_functions import (
|
|
|
25
33
|
NormalSamplingRange,
|
|
26
34
|
UniformSamplingRange, ExponentialSamplingRange,
|
|
27
35
|
)
|
|
28
|
-
from tqdm import tqdm
|
|
29
|
-
|
|
30
|
-
from simcats_datasets.loading import load_dataset
|
|
31
|
-
from simcats_datasets.loading.load_ground_truth import load_ct_by_dot_masks
|
|
32
|
-
# label creation based on line intersection
|
|
33
|
-
from simcats_datasets.support_functions.get_lead_transition_labels import get_lead_transition_labels
|
|
34
|
-
from simcats_datasets.support_functions._json_encoders import NumpyEncoder
|
|
35
36
|
|
|
36
37
|
__all__ = []
|
|
37
38
|
|
|
38
39
|
|
|
39
40
|
def _simulate(args: Tuple) -> Tuple:
|
|
40
|
-
"""Method to simulate a
|
|
41
|
+
"""Method to simulate a CSD or sensor scan with the given args. Required for parallel simulation in create_simulated_dataset.
|
|
41
42
|
|
|
42
43
|
Args:
|
|
43
|
-
args: Tuple of sample_range_g1, sample_range_g2,
|
|
44
|
+
args: Tuple of sample_range_g1, sample_range_g2, sample_range_sensor_g1, sample_range_sensor_g2, volt_range,
|
|
45
|
+
simcats_config, resolution, sensor_scan_dataset, reset_sensor_offset_mu_sens_in_csds.
|
|
44
46
|
|
|
45
47
|
Returns:
|
|
46
|
-
Tuple of
|
|
48
|
+
Tuple of measurement, occupation_mask, lead_transition_mask, metadata, line_points, labels.
|
|
47
49
|
"""
|
|
48
|
-
sample_range_g1, sample_range_g2, volt_range, simcats_config,
|
|
50
|
+
(sample_range_g1, sample_range_g2, sample_range_sensor_g1, sample_range_sensor_g2, volt_range, simcats_config,
|
|
51
|
+
resolution, sensor_scan_dataset, reset_sensor_offset_mu_sens_in_csds) = args
|
|
49
52
|
|
|
50
53
|
# random number generator used for sampling volt ranges.
|
|
51
54
|
# !Must be generated here! Else same for every process!
|
|
@@ -75,23 +78,53 @@ def _simulate(args: Tuple) -> Tuple:
|
|
|
75
78
|
sim = Simulation(**simcats_config)
|
|
76
79
|
|
|
77
80
|
# sample voltage ranges
|
|
78
|
-
g1_start = rng.uniform(low=sample_range_g1[0], high=sample_range_g1[1])
|
|
79
|
-
g2_start = rng.uniform(low=sample_range_g2[0], high=sample_range_g2[1])
|
|
80
|
-
|
|
81
|
-
|
|
81
|
+
g1_start = rng.uniform(low=sample_range_g1[0], high=sample_range_g1[1]) if sample_range_g1 is not None else None
|
|
82
|
+
g2_start = rng.uniform(low=sample_range_g2[0], high=sample_range_g2[1]) if sample_range_g2 is not None else None
|
|
83
|
+
sensor_g1_start = rng.uniform(low=sample_range_sensor_g1[0], high=sample_range_sensor_g1[1]) if sample_range_sensor_g1 is not None else None
|
|
84
|
+
sensor_g2_start = rng.uniform(low=sample_range_sensor_g2[0], high=sample_range_sensor_g2[1]) if sample_range_sensor_g2 is not None else None
|
|
85
|
+
|
|
82
86
|
# perform simulation
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
87
|
+
if not sensor_scan_dataset:
|
|
88
|
+
g1_range = np.array([g1_start, g1_start + volt_range[0]])
|
|
89
|
+
g2_range = np.array([g2_start, g2_start + volt_range[1]])
|
|
90
|
+
if reset_sensor_offset_mu_sens_in_csds:
|
|
91
|
+
# calculate potential to reset offset_mu_sens
|
|
92
|
+
occupations, _ = sim.ideal_csd_config.get_csd_data(volt_limits_g1=g1_range,
|
|
93
|
+
volt_limits_g2=g2_range,
|
|
94
|
+
resolution=2)
|
|
95
|
+
potentials = simcats_config["sensor"].sensor_potential(occupations=occupations,
|
|
96
|
+
volt_limits_g1=g1_range,
|
|
97
|
+
volt_limits_g2=g2_range)
|
|
98
|
+
# the new offset is calculated as follows: offset - (potentials[0] - offset)
|
|
99
|
+
sim.sensor.offset_mu_sens = 2 * simcats_config["sensor"].offset_mu_sens - potentials[0]
|
|
100
|
+
measurement, occ, lead_trans, metadata = sim.measure(
|
|
101
|
+
sweep_range_g1=g1_range,
|
|
102
|
+
sweep_range_g2=g2_range,
|
|
103
|
+
volt_sensor_g1=sensor_g1_start,
|
|
104
|
+
volt_sensor_g2=sensor_g2_start,
|
|
105
|
+
resolution=resolution
|
|
106
|
+
)
|
|
107
|
+
# calculate lead_transition labels
|
|
108
|
+
ideal_csd_conf = metadata["ideal_csd_config"]
|
|
109
|
+
line_points, labels = get_lead_transition_labels(
|
|
110
|
+
sweep_range_g1=g1_range,
|
|
111
|
+
sweep_range_g2=g2_range,
|
|
112
|
+
ideal_csd_config=ideal_csd_conf,
|
|
113
|
+
lead_transition_mask=lead_trans,
|
|
114
|
+
)
|
|
115
|
+
else:
|
|
116
|
+
sensor_g1_range = np.array([sensor_g1_start, sensor_g1_start + volt_range[0]])
|
|
117
|
+
sensor_g2_range = np.array([sensor_g2_start, sensor_g2_start + volt_range[1]])
|
|
118
|
+
measurement, occ, lead_trans, metadata = sim.measure_sensor_scan(
|
|
119
|
+
sweep_range_sensor_g1=sensor_g1_range,
|
|
120
|
+
sweep_range_sensor_g2=sensor_g2_range,
|
|
121
|
+
volt_g1=g1_start,
|
|
122
|
+
volt_g2=g2_start,
|
|
123
|
+
resolution=resolution
|
|
124
|
+
)
|
|
125
|
+
line_points, labels = get_coulomb_oscillation_area_boundaries(metadata)
|
|
126
|
+
|
|
127
|
+
return measurement, occ, lead_trans, metadata, line_points, labels
|
|
95
128
|
|
|
96
129
|
|
|
97
130
|
def create_simulated_dataset(
|
|
@@ -107,28 +140,36 @@ def create_simulated_dataset(
|
|
|
107
140
|
max_len_line_labels_chunk: int = 2000,
|
|
108
141
|
max_len_metadata_chunk: int = 8000,
|
|
109
142
|
dtype_csd: np.dtype = np.float32,
|
|
143
|
+
dtype_sensor_scan: np.dtype = np.float32,
|
|
110
144
|
dtype_occ: np.dtype = np.float32,
|
|
111
145
|
dtype_tct: np.dtype = np.uint8,
|
|
146
|
+
dtype_sensor_regime_masks: np.dtype = np.uint8,
|
|
147
|
+
dtype_sensor_peak_center_masks: np.dtype = np.uint8,
|
|
112
148
|
dtype_line_coordinates: np.dtype = np.float32,
|
|
149
|
+
sensor_scan_dataset: bool = False,
|
|
150
|
+
reset_sensor_offset_mu_sens_in_csds: bool = False,
|
|
113
151
|
) -> None:
|
|
114
152
|
"""Function for generating simulated datasets using SimCATS for simulations.
|
|
115
153
|
|
|
116
|
-
|
|
117
|
-
|
|
154
|
+
Datasets can either contain CSDs or sensor scans.
|
|
155
|
+
|
|
156
|
+
**Warning**: This function expects that the simulation config uses IdealCSDGeometric (from SimCATS) for CSD datasets
|
|
157
|
+
and SensorScanSensorGeneric (from SimCATS) for sensor scan datasets. Other implementations are not guaranteed to
|
|
158
|
+
work.
|
|
118
159
|
|
|
119
160
|
Args:
|
|
120
161
|
dataset_path: The path where the dataset will be stored. Can also be an already existing dataset, to which new
|
|
121
162
|
data is added.
|
|
122
|
-
simcats_config: Configuration for
|
|
123
|
-
n_runs: Number of CSDs to be generated. Default is 10000.
|
|
124
|
-
resolution: Pixel resolution for both axis of the
|
|
125
|
-
Default is np.array([100, 100]). \n
|
|
163
|
+
simcats_config: Configuration for SimCATS simulation class. Default is the GaAs_v1 config provided by SimCATS.
|
|
164
|
+
n_runs: Number of CSDs or sensor scans to be generated. Default is 10000.
|
|
165
|
+
resolution: Pixel resolution for both axes of the measurements, first number of columns (x), then number of rows
|
|
166
|
+
(y). Default is np.array([100, 100]). \n
|
|
126
167
|
Example: \n
|
|
127
168
|
[res_g1, res_g2]
|
|
128
|
-
volt_range: Volt range for both axis of the
|
|
129
|
-
in the voltage space
|
|
130
|
-
30mV x 30mV).
|
|
131
|
-
tags: Additional tags for the data to be simulated, which will be added to the dataset
|
|
169
|
+
volt_range: Volt range for both axes of the measurements. Individual measurements with the specified size are
|
|
170
|
+
randomly sampled in the voltage space (defined by the volt_limits in the SimCATS config). Default is
|
|
171
|
+
np.array([0.03, 0.03]) (usually the scans from RWTH GaAs offler sample are 30mV x 30mV).
|
|
172
|
+
tags: Additional tags for the data to be simulated, which will be added to the dataset metadata. Default is
|
|
132
173
|
None. \n
|
|
133
174
|
Example: \n
|
|
134
175
|
{"tags": "shifted sensor, no noise", "sample": "GaAs"}.
|
|
@@ -139,9 +180,21 @@ def create_simulated_dataset(
|
|
|
139
180
|
max_len_line_labels_chunk: Maximum number of chars for the line label dict. Default is 2000.
|
|
140
181
|
max_len_metadata_chunk: Maximum number of chars for the metadata dict. Default is 8000.
|
|
141
182
|
dtype_csd: Specifies the dtype to be used for saving CSDs. Default is np.float32.
|
|
183
|
+
dtype_sensor_scan: Specifies the dtype to be used for saving sensor scans. Default is np.float32.
|
|
142
184
|
dtype_occ: Specifies the dtype to be used for saving Occupations. Default is np.float32.
|
|
143
185
|
dtype_tct: Specifies the dtype to be used for saving TCTs. Default is np.uint8.
|
|
186
|
+
dtype_sensor_regime_masks: Specifies the dtype to be used for saving sensor regime masks. Default is np.uint8.
|
|
187
|
+
dtype_sensor_peak_center_masks: Specifies the dtype to be used for saving sensor peak center masks. Default is
|
|
188
|
+
np.uint8.
|
|
144
189
|
dtype_line_coordinates: Specifies the dtype to be used for saving line coordinates. Default is np.float32.
|
|
190
|
+
sensor_scan_dataset: Determines whether to generate a sensor scan dataset (contains sensor scans instead of
|
|
191
|
+
CSDs). Default is False.
|
|
192
|
+
reset_sensor_offset_mu_sens_in_csds: Specifies whether to reset the sensor offset_mu_sens parameter before CSD
|
|
193
|
+
measurements. If this is activated, the offset of the sensor potential is reset so that the first pixel of
|
|
194
|
+
the CSD is exactly at the previously defined offset_mu_sens. Thus, this effectively resets the sensor to
|
|
195
|
+
start at the position defined by offset_mu_sens before starting to measure. It is intended to simulate that
|
|
196
|
+
the sensor is retuned to the defined position before each CSD. It has no effect for sensor scan datasets.
|
|
197
|
+
Default is False.
|
|
145
198
|
"""
|
|
146
199
|
# set tags to an empty dict if none were supplied
|
|
147
200
|
if tags is None:
|
|
@@ -150,36 +203,81 @@ def create_simulated_dataset(
|
|
|
150
203
|
# Create path where the dataset will be saved (if folder doesn't exist already)
|
|
151
204
|
Path(Path(dataset_path).parent).mkdir(parents=True, exist_ok=True)
|
|
152
205
|
|
|
206
|
+
# retrieve the allowed sampling ranges from the config and copy them (else we would change the config itself)
|
|
207
|
+
sample_range_g1 = simcats_config.get("volt_limits_g1", None)
|
|
208
|
+
sample_range_g1 = sample_range_g1.astype(np.float32) if sample_range_g1 is not None else None
|
|
209
|
+
sample_range_g2 = simcats_config.get("volt_limits_g2", None)
|
|
210
|
+
sample_range_g2 = sample_range_g2.astype(np.float32) if sample_range_g2 is not None else None
|
|
211
|
+
sample_range_sensor_g1 = simcats_config.get("volt_limits_sensor_g1", None)
|
|
212
|
+
sample_range_sensor_g1 = sample_range_sensor_g1.astype(np.float32) if sample_range_sensor_g1 is not None else None
|
|
213
|
+
sample_range_sensor_g2 = simcats_config.get("volt_limits_sensor_g2", None)
|
|
214
|
+
sample_range_sensor_g2 = sample_range_sensor_g2.astype(np.float32) if sample_range_sensor_g2 is not None else None
|
|
153
215
|
# arrange volt limits so that random sampling gives us a starting point that is at least the defined volt_range below
|
|
154
216
|
# the maximum
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
217
|
+
if not sensor_scan_dataset:
|
|
218
|
+
measurement_type = "csds"
|
|
219
|
+
sample_range_g1[-1] -= volt_range[0]
|
|
220
|
+
sample_range_g2[-1] -= volt_range[1]
|
|
221
|
+
else:
|
|
222
|
+
measurement_type = "sensor_scans"
|
|
223
|
+
sample_range_sensor_g1[-1] -= volt_range[0]
|
|
224
|
+
sample_range_sensor_g2[-1] -= volt_range[1]
|
|
159
225
|
|
|
160
226
|
with h5py.File(dataset_path, "a") as hdf5_file:
|
|
161
227
|
# load datasets or create them if not already there
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
228
|
+
if isinstance(resolution, int):
|
|
229
|
+
measurements = hdf5_file.require_dataset(
|
|
230
|
+
name=measurement_type,
|
|
231
|
+
shape=(0, resolution),
|
|
232
|
+
chunks=(1, resolution),
|
|
233
|
+
dtype=dtype_csd if not sensor_scan_dataset else dtype_sensor_scan,
|
|
234
|
+
maxshape=(None, resolution),
|
|
235
|
+
)
|
|
236
|
+
occupations = hdf5_file.require_dataset(
|
|
237
|
+
name="occupations" if not sensor_scan_dataset else "sensor_regime_masks",
|
|
238
|
+
shape=(0, resolution, 2) if not sensor_scan_dataset else (0, resolution),
|
|
239
|
+
chunks=(1, resolution, 2) if not sensor_scan_dataset else (1, resolution),
|
|
240
|
+
dtype=dtype_occ if not sensor_scan_dataset else dtype_sensor_regime_masks,
|
|
241
|
+
maxshape=(None, resolution, 2) if not sensor_scan_dataset else (None, resolution),
|
|
242
|
+
)
|
|
243
|
+
tct_masks = hdf5_file.require_dataset(
|
|
244
|
+
name="tct_masks" if not sensor_scan_dataset else "sensor_peak_center_masks",
|
|
245
|
+
shape=(0, resolution),
|
|
246
|
+
chunks=(1, resolution),
|
|
247
|
+
dtype=dtype_tct if not sensor_scan_dataset else dtype_sensor_peak_center_masks,
|
|
248
|
+
maxshape=(None, resolution),
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
elif len(resolution) == 2:
|
|
252
|
+
measurements = hdf5_file.require_dataset(
|
|
253
|
+
name=measurement_type,
|
|
254
|
+
shape=(0, resolution[1], resolution[0]),
|
|
255
|
+
chunks=(1, resolution[1], resolution[0]),
|
|
256
|
+
dtype=dtype_csd if not sensor_scan_dataset else dtype_sensor_scan,
|
|
257
|
+
maxshape=(None, resolution[1], resolution[0]),
|
|
258
|
+
)
|
|
259
|
+
occupations = hdf5_file.require_dataset(
|
|
260
|
+
name="occupations" if not sensor_scan_dataset else "sensor_regime_masks",
|
|
261
|
+
shape=(0, resolution[1], resolution[0], 2) if not sensor_scan_dataset else (
|
|
262
|
+
0, resolution[1], resolution[0]),
|
|
263
|
+
chunks=(1, resolution[1], resolution[0], 2) if not sensor_scan_dataset else (
|
|
264
|
+
1, resolution[1], resolution[0]),
|
|
265
|
+
dtype=dtype_occ if not sensor_scan_dataset else dtype_sensor_regime_masks,
|
|
266
|
+
maxshape=(None, resolution[1], resolution[0], 2) if not sensor_scan_dataset else (
|
|
267
|
+
None, resolution[1], resolution[0]),
|
|
268
|
+
)
|
|
269
|
+
tct_masks = hdf5_file.require_dataset(
|
|
270
|
+
name="tct_masks" if not sensor_scan_dataset else "sensor_peak_center_masks",
|
|
271
|
+
shape=(0, resolution[1], resolution[0]),
|
|
272
|
+
chunks=(1, resolution[1], resolution[0]),
|
|
273
|
+
dtype=dtype_tct if not sensor_scan_dataset else dtype_sensor_peak_center_masks,
|
|
274
|
+
maxshape=(None, resolution[1], resolution[0]),
|
|
275
|
+
)
|
|
276
|
+
else:
|
|
277
|
+
raise ValueError(
|
|
278
|
+
"An invalid resolution was given. The resolution should either be an integer or a one dimensional numpy"
|
|
279
|
+
" array with two elements.")
|
|
280
|
+
|
|
183
281
|
line_coords = hdf5_file.require_dataset(
|
|
184
282
|
name="line_coordinates",
|
|
185
283
|
shape=(0, max_len_line_coordinates_chunk),
|
|
@@ -202,10 +300,10 @@ def create_simulated_dataset(
|
|
|
202
300
|
maxshape=(None, max_len_metadata_chunk),
|
|
203
301
|
)
|
|
204
302
|
# determine index offset if there is already data in the dataset
|
|
205
|
-
id_offset =
|
|
303
|
+
id_offset = measurements.shape[0]
|
|
206
304
|
|
|
207
305
|
# resize datasets to fit new data
|
|
208
|
-
|
|
306
|
+
measurements.resize(measurements.shape[0] + n_runs, axis=0)
|
|
209
307
|
occupations.resize(occupations.shape[0] + n_runs, axis=0)
|
|
210
308
|
tct_masks.resize(tct_masks.shape[0] + n_runs, axis=0)
|
|
211
309
|
line_coords.resize(line_coords.shape[0] + n_runs, axis=0)
|
|
@@ -215,10 +313,11 @@ def create_simulated_dataset(
|
|
|
215
313
|
# simulate and save data
|
|
216
314
|
indices = range(id_offset, n_runs + id_offset)
|
|
217
315
|
arguments = itertools.repeat(
|
|
218
|
-
(sample_range_g1, sample_range_g2,
|
|
316
|
+
(sample_range_g1, sample_range_g2, sample_range_sensor_g1, sample_range_sensor_g2, volt_range,
|
|
317
|
+
simcats_config, resolution, sensor_scan_dataset, reset_sensor_offset_mu_sens_in_csds),
|
|
219
318
|
times=len(indices),
|
|
220
319
|
)
|
|
221
|
-
for index, (
|
|
320
|
+
for index, (measurement, occ, lead_trans, metadata, line_points, labels) in zip(
|
|
222
321
|
indices,
|
|
223
322
|
progress_imap(
|
|
224
323
|
func=_simulate,
|
|
@@ -230,9 +329,9 @@ def create_simulated_dataset(
|
|
|
230
329
|
),
|
|
231
330
|
):
|
|
232
331
|
# save data
|
|
233
|
-
|
|
234
|
-
occupations[index] = occ.astype(dtype_occ)
|
|
235
|
-
tct_masks[index] = lead_trans.astype(dtype_tct)
|
|
332
|
+
measurements[index] = measurement.astype(dtype_csd if not sensor_scan_dataset else dtype_sensor_scan)
|
|
333
|
+
occupations[index] = occ.astype(dtype_occ if not sensor_scan_dataset else dtype_sensor_regime_masks)
|
|
334
|
+
tct_masks[index] = lead_trans.astype(dtype_tct if not sensor_scan_dataset else dtype_sensor_peak_center_masks)
|
|
236
335
|
line_coords[index] = np.pad(
|
|
237
336
|
line_points.flatten(),
|
|
238
337
|
((0, max_len_line_coordinates_chunk - line_points.size)),
|