simcats-datasets 2.4.0__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2 +1,2 @@
1
1
  __all__ = []
2
- __version__ = "2.4.0"
2
+ __version__ = "2.6.0"
@@ -16,10 +16,13 @@ __all__ = []
16
16
 
17
17
 
18
18
  def create_dataset(dataset_path: str,
19
- csds: List[np.ndarray],
19
+ csds: Optional[List[np.ndarray]] = None,
20
+ sensor_scans: Optional[List[np.ndarray]] = None,
20
21
  occupations: Optional[List[np.ndarray]] = None,
21
22
  tct_masks: Optional[List[np.ndarray]] = None,
22
23
  ct_by_dot_masks: Optional[List[np.ndarray]] = None,
24
+ sensor_regime_masks: Optional[List[np.ndarray]] = None,
25
+ sensor_peak_center_masks: Optional[List[np.ndarray]] = None,
23
26
  line_coordinates: Optional[List[np.ndarray]] = None,
24
27
  line_labels: Optional[List[dict]] = None,
25
28
  metadata: Optional[List[dict]] = None,
@@ -27,60 +30,103 @@ def create_dataset(dataset_path: str,
27
30
  max_len_line_labels_chunk: Optional[int] = None,
28
31
  max_len_metadata_chunk: Optional[int] = None,
29
32
  dtype_csd: np.dtype = np.float32,
33
+ dtype_sensor_scan: np.dtype = np.float32,
30
34
  dtype_occ: np.dtype = np.float32,
31
35
  dtype_tct: np.dtype = np.uint8,
32
36
  dtype_ct_by_dot: np.dtype = np.uint8,
37
+ dtype_sensor_regime_masks: np.dtype = np.uint8,
38
+ dtype_sensor_peak_center_masks: np.dtype = np.uint8,
33
39
  dtype_line_coordinates: np.dtype = np.float32) -> None:
34
40
  """Function for creating simcats_datasets v2 format datasets from given data.
35
41
 
36
42
  Args:
37
43
  dataset_path: The path where the new (v2) HDF5 dataset will be stored.
38
- csds: The list of CSDs to use for creating the dataset.
44
+ csds: The list of CSDs to use for creating the dataset. A dataset can have either CSDs or sensor scans, but
45
+ never both. Default is None.
46
+ sensor_scans: The list of sensor scans to use for creating the dataset. A dataset can have either CSDs or sensor
47
+ scans, but never both. Default is None.
39
48
  occupations: List of occupations to use for creating the dataset. Defaults to None.
40
49
  tct_masks: List of TCT masks to use for creating the dataset. Defaults to None.
41
50
  ct_by_dot_masks: List of CT by dot masks to use for creating the dataset. Defaults to None.
51
+ sensor_regime_masks: List of sensor regime masks to use for creating the dataset. Defaults to None.
52
+ sensor_peak_center_masks: List of sensor peak center masks to use for creating the dataset. Defaults to None.
42
53
  line_coordinates: List of line coordinates to use for creating the dataset. Defaults to None.
43
54
  line_labels: List of line labels to use for creating the dataset. Defaults to None.
44
55
  metadata: List of metadata to use for creating the dataset. Defaults to None.
45
56
  max_len_line_coordinates_chunk: The expected maximal length for line coordinates in number of float values (each
46
- line requires 4 floats). If None, it is set to the largest value of the CSD shape. Default is None.
57
+ line requires 4 floats). If None, it is set to the largest value of the CSD (or sensor scan) shape. Default
58
+ is None.
47
59
  max_len_line_labels_chunk: The expected maximal length for line labels in number of uint8/char values (each line
48
60
  label, encoded as utf-8 json, should require at most 80 chars). If None, it is set to the largest value of
49
- the CSD shape * 20 (matching with allowed number of line coords). Default is None.
61
+ the CSD (or sensor scan) shape * 20 (matching with allowed number of line coords). Default is None.
50
62
  max_len_metadata_chunk: The expected maximal length for metadata in number of uint8/char values (each metadata
51
63
  dict, encoded as utf-8 json, should require at most 8000 chars, expected rather something like 4000, but
52
64
  could get larger for dot jumps metadata of high resolution scans). If None, it is set to 8000. Default is
53
65
  None.
54
66
  dtype_csd: Specifies the dtype to be used for saving CSDs. Default is np.float32.
67
+ dtype_sensor_scan: Specifies the dtype to be used for saving sensor scans. Default is np.float32.
55
68
  dtype_occ: Specifies the dtype to be used for saving Occupations. Default is np.float32.
56
69
  dtype_tct: Specifies the dtype to be used for saving TCTs. Default is np.uint8.
57
70
  dtype_ct_by_dot: Specifies the dtype to be used for saving CT by dot masks. Default is np.uint8.
71
+ dtype_sensor_regime_masks: Specifies the dtype to be used for saving sensor regime masks. Default is np.uint8.
72
+ dtype_sensor_peak_center_masks: Specifies the dtype to be used for saving sensor peak center masks. Default is
73
+ np.uint8.
58
74
  dtype_line_coordinates: Specifies the dtype to be used for saving line coordinates. Default is np.float32.
59
75
  """
60
76
  # Create path where the dataset will be saved (if folder doesn't exist already)
61
77
  Path(dirname(dataset_path)).mkdir(parents=True, exist_ok=True)
62
78
 
79
+ # check if the dataset to be created is a csd or sensor_scan dataset
80
+ if csds is not None and sensor_scans is None:
81
+ csd_dataset = True
82
+ elif csds is None and sensor_scans is not None:
83
+ csd_dataset = False
84
+ else:
85
+ raise ValueError("A dataset can contain either CSDs or sensor scans but never both! Exactly one of the two has "
86
+ "to be None.")
87
+
63
88
  with h5py.File(dataset_path, "a") as hdf5_file:
64
89
  # get the number of total ids. This is especially required if a large dataset is loaded and saved step by step
65
- num_ids = len(csds)
90
+ if csd_dataset:
91
+ num_ids = len(csds)
92
+ else:
93
+ num_ids = len(sensor_scans)
66
94
 
67
- # process CSDs
68
- # save an example CSD to get shape and dtype
69
- temp_csd = csds[0].copy()
70
- # use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to load
71
- # one image at a time during training)
72
- ds = hdf5_file.require_dataset(name='csds', shape=(0, *temp_csd.shape), dtype=dtype_csd,
73
- maxshape=(None, *temp_csd.shape))
95
+ # get a temp copy of a csd or sensor scan (to get the shape) and retrieve the corresponding HDF5 dataset
96
+ if csd_dataset:
97
+ # process CSDs
98
+ # save an example CSD to get shape and dtype
99
+ temp_data = csds[0].copy()
100
+ # use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to
101
+ # load one image at a time during training)
102
+ ds = hdf5_file.require_dataset(name='csds',
103
+ shape=(0, *temp_data.shape),
104
+ dtype=dtype_csd,
105
+ maxshape=(None, *temp_data.shape))
106
+ else:
107
+ # process sensor scans
108
+ # save an example sensor scan to get shape and dtype
109
+ temp_data = sensor_scans[0].copy()
110
+ # use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to
111
+ # load one image at a time during training)
112
+ ds = hdf5_file.require_dataset(name='sensor_scans',
113
+ shape=(0, *temp_data.shape),
114
+ dtype=dtype_sensor_scan,
115
+ maxshape=(None, *temp_data.shape))
74
116
  # determine index offset if there is already data in the dataset
75
117
  id_offset = ds.shape[0]
76
118
  # resize datasets to fit new data
77
119
  ds.resize(ds.shape[0] + num_ids, axis=0)
78
- ds[id_offset:] = np.array(csds).astype(dtype_csd)
120
+ # Add new CSDs or sensor scans to the dataset
121
+ if csd_dataset:
122
+ ds[id_offset:] = np.array(csds).astype(dtype_csd)
123
+ else:
124
+ ds[id_offset:] = np.array(sensor_scans).astype(dtype_sensor_scan)
79
125
  if occupations is not None:
80
126
  if len(occupations) != num_ids:
81
127
  raise ValueError(
82
- f"Number of new occupation arrays ({len(occupations)}) does not match the number of new CSDs "
83
- f"({num_ids}).")
128
+ f"Number of new occupation arrays ({len(occupations)}) does not match the number of new CSDs or "
129
+ f"sensor scans ({num_ids}).")
84
130
  # process Occupations
85
131
  # save an example occ to get shape
86
132
  temp_occ = occupations[0].copy()
@@ -91,15 +137,15 @@ def create_dataset(dataset_path: str,
91
137
  if ds.shape[0] != id_offset:
92
138
  raise ValueError(
93
139
  f"Number of already stored occupation arrays ({ds.shape[0]}) does not match the number of already "
94
- f"stored CSDs ({id_offset}).")
140
+ f"stored CSDs or sensor scans ({id_offset}).")
95
141
  # resize datasets to fit new data
96
142
  ds.resize(ds.shape[0] + num_ids, axis=0)
97
143
  ds[id_offset:] = np.array(occupations).astype(dtype_occ)
98
144
  if tct_masks is not None:
99
145
  if len(tct_masks) != num_ids:
100
146
  raise ValueError(
101
- f"Number of new TCT mask arrays ({len(tct_masks)}) does not match the number of new CSDs "
102
- f"({num_ids}).")
147
+ f"Number of new TCT mask arrays ({len(tct_masks)}) does not match the number of new CSDs or sensor "
148
+ f"scans ({num_ids}).")
103
149
  # process tct masks
104
150
  # save an example tct to get shape and dtype
105
151
  temp_tct = tct_masks[0].copy()
@@ -110,7 +156,7 @@ def create_dataset(dataset_path: str,
110
156
  if ds.shape[0] != id_offset:
111
157
  raise ValueError(
112
158
  f"Number of already stored TCT mask arrays ({ds.shape[0]}) does not match the number of already "
113
- f"stored CSDs ({id_offset}).")
159
+ f"stored CSDs or sensor scans ({id_offset}).")
114
160
  # resize datasets to fit new data
115
161
  ds.resize(ds.shape[0] + num_ids, axis=0)
116
162
  ds[id_offset:] = np.array(tct_masks).astype(dtype_tct)
@@ -118,7 +164,7 @@ def create_dataset(dataset_path: str,
118
164
  if len(ct_by_dot_masks) != num_ids:
119
165
  raise ValueError(
120
166
  f"Number of new CT by dot mask arrays ({len(ct_by_dot_masks)}) does not match the number of new "
121
- f"CSDs ({num_ids}).")
167
+ f"CSDs or sensor scans ({num_ids}).")
122
168
  # process CT by dot masks
123
169
  # save an example CT by dot mask to get shape and dtype
124
170
  temp_ct_by_dot = ct_by_dot_masks[0].copy()
@@ -129,19 +175,60 @@ def create_dataset(dataset_path: str,
129
175
  if ds.shape[0] != id_offset:
130
176
  raise ValueError(
131
177
  f"Number of already stored CT by dot mask arrays ({ds.shape[0]}) does not match the number of "
132
- f"already stored CSDs ({id_offset}).")
178
+ f"already stored CSDs or sensor scans ({id_offset}).")
133
179
  # resize datasets to fit new data
134
180
  ds.resize(ds.shape[0] + num_ids, axis=0)
135
181
  ds[id_offset:] = np.array(ct_by_dot_masks).astype(dtype_tct)
182
+ if sensor_regime_masks is not None:
183
+ if len(sensor_regime_masks) != num_ids:
184
+ raise ValueError(
185
+ f"Number of new sensor regime mask arrays ({len(sensor_regime_masks)}) does not match the number "
186
+ f"of new CSDs or sensor scans ({num_ids}).")
187
+ # process sensor regime masks
188
+ # save an example sensor regime mask to get shape and dtype
189
+ temp_sensor_regime_mask = sensor_regime_masks[0].copy()
190
+ # use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to
191
+ # load one image at a time during training)
192
+ ds = hdf5_file.require_dataset(name='sensor_regime_masks', shape=(0, *temp_sensor_regime_mask.shape),
193
+ dtype=dtype_sensor_regime_masks,
194
+ maxshape=(None, *temp_sensor_regime_mask.shape))
195
+ if ds.shape[0] != id_offset:
196
+ raise ValueError(
197
+ f"Number of already stored sensor regime mask arrays ({ds.shape[0]}) does not match the number of "
198
+ f"already stored CSDs or sensor scans ({id_offset}).")
199
+ # resize datasets to fit new data
200
+ ds.resize(ds.shape[0] + num_ids, axis=0)
201
+ ds[id_offset:] = np.array(sensor_regime_masks).astype(dtype_sensor_regime_masks)
202
+ if sensor_peak_center_masks is not None:
203
+ if len(sensor_peak_center_masks) != num_ids:
204
+ raise ValueError(
205
+ f"Number of new sensor peak center mask arrays ({len(sensor_peak_center_masks)}) does not match "
206
+ f"the number of new CSDs or sensor scans ({num_ids}).")
207
+ # process sensor peak center masks
208
+ # save an example sensor peak center mask to get shape and dtype
209
+ temp_sensor_peak_center_mask = sensor_peak_center_masks[0].copy()
210
+ # use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to
211
+ # load one image at a time during training)
212
+ ds = hdf5_file.require_dataset(name='sensor_peak_center_masks',
213
+ shape=(0, *temp_sensor_peak_center_mask.shape),
214
+ dtype=dtype_sensor_peak_center_masks,
215
+ maxshape=(None, *temp_sensor_peak_center_mask.shape))
216
+ if ds.shape[0] != id_offset:
217
+ raise ValueError(
218
+ f"Number of already stored sensor peak center mask arrays ({ds.shape[0]}) does not match the "
219
+ f"number of already stored CSDs or sensor scans ({id_offset}).")
220
+ # resize datasets to fit new data
221
+ ds.resize(ds.shape[0] + num_ids, axis=0)
222
+ ds[id_offset:] = np.array(sensor_peak_center_masks).astype(dtype_sensor_peak_center_masks)
136
223
  if line_coordinates is not None:
137
224
  if len(line_coordinates) != num_ids:
138
225
  raise ValueError(
139
226
  f"Number of new line coordinates ({len(line_coordinates)}) does not match the number of new "
140
- f"CSDs ({num_ids}).")
227
+ f"CSDs or sensor scans ({num_ids}).")
141
228
  # retrieve fixed length for chunks
142
229
  if max_len_line_coordinates_chunk is None:
143
230
  # calculate max expected length (max_number_of_lines * 4 entries, max number estimated as max(shape)/4)
144
- max_len = max(temp_csd.shape)
231
+ max_len = max(temp_data.shape)
145
232
  else:
146
233
  max_len = max_len_line_coordinates_chunk
147
234
  # use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to
@@ -151,7 +238,7 @@ def create_dataset(dataset_path: str,
151
238
  if ds.shape[0] != id_offset:
152
239
  raise ValueError(
153
240
  f"Number of already stored line coordinates ({ds.shape[0]}) does not match the number of already "
154
- f"stored CSDs ({id_offset}).")
241
+ f"stored CSDs or sensor scans ({id_offset}).")
155
242
  # resize datasets to fit new data
156
243
  ds.resize(ds.shape[0] + num_ids, axis=0)
157
244
  # process line coordinates
@@ -163,13 +250,13 @@ def create_dataset(dataset_path: str,
163
250
  if line_labels is not None:
164
251
  if len(line_labels) != num_ids:
165
252
  raise ValueError(
166
- f"Number of new line labels ({len(line_labels)}) does not match the number of new CSDs "
167
- f"({num_ids}).")
253
+ f"Number of new line labels ({len(line_labels)}) does not match the number of new CSDs or sensor "
254
+ f"scans ({num_ids}).")
168
255
  # retrieve fixed length for chunks
169
256
  if max_len_line_labels_chunk is None:
170
257
  # calculate max expected length (max_number_of_lines * 80 uint8 numbers, max number estimated as
171
258
  # max(shape)/4)
172
- max_len = max(temp_csd.shape) * 20
259
+ max_len = max(temp_data.shape) * 20
173
260
  else:
174
261
  max_len = max_len_line_labels_chunk
175
262
  # use chunks as this will speed up reading later! One chunk is set to be exactly one image (optimized to
@@ -179,7 +266,7 @@ def create_dataset(dataset_path: str,
179
266
  if ds.shape[0] != id_offset:
180
267
  raise ValueError(
181
268
  f"Number of already stored line labels ({ds.shape[0]}) does not match the number of already stored "
182
- f"CSDs ({id_offset}).")
269
+ f"CSDs or sensor scans ({id_offset}).")
183
270
  # resize datasets to fit new data
184
271
  ds.resize(ds.shape[0] + num_ids, axis=0)
185
272
  # process line labels
@@ -193,7 +280,8 @@ def create_dataset(dataset_path: str,
193
280
  if metadata is not None:
194
281
  if len(metadata) != num_ids:
195
282
  raise ValueError(
196
- f"Number of new metadata ({len(metadata)}) does not match the number of new CSDs ({num_ids}).")
283
+ f"Number of new metadata ({len(metadata)}) does not match the number of new CSDs or sensor scans "
284
+ f"({num_ids}).")
197
285
  # retrieve fixed length for chunks
198
286
  if max_len_metadata_chunk is None:
199
287
  # set len to 8000 uint8 numbers, that should already include some extra safety (expected smth. like
@@ -208,7 +296,7 @@ def create_dataset(dataset_path: str,
208
296
  if ds.shape[0] != id_offset:
209
297
  raise ValueError(
210
298
  f"Number of already stored metadata ({ds.shape[0]}) does not match the number of already stored "
211
- f"CSDs ({id_offset}).")
299
+ f"CSDs or sensor scans ({id_offset}).")
212
300
  # resize datasets to fit new data
213
301
  ds.resize(ds.shape[0] + num_ids, axis=0)
214
302
  # process metadata
@@ -16,6 +16,14 @@ import numpy as np
16
16
 
17
17
  # parallel
18
18
  from parallelbar import progress_imap
19
+ from tqdm import tqdm
20
+
21
+ from simcats_datasets.loading import load_dataset
22
+ from simcats_datasets.loading.load_ground_truth import load_ct_by_dot_masks
23
+ # label creation based on line intersection
24
+ from simcats_datasets.support_functions.get_lead_transition_labels import get_lead_transition_labels
25
+ from simcats_datasets.support_functions.get_coulomb_oscillation_area_boundaries import get_coulomb_oscillation_area_boundaries
26
+ from simcats_datasets.support_functions._json_encoders import NumpyEncoder
19
27
 
20
28
  # for SimCATS simulation
21
29
  from simcats import Simulation, default_configs
@@ -25,27 +33,22 @@ from simcats.support_functions import (
25
33
  NormalSamplingRange,
26
34
  UniformSamplingRange, ExponentialSamplingRange,
27
35
  )
28
- from tqdm import tqdm
29
-
30
- from simcats_datasets.loading import load_dataset
31
- from simcats_datasets.loading.load_ground_truth import load_ct_by_dot_masks
32
- # label creation based on line intersection
33
- from simcats_datasets.support_functions.get_lead_transition_labels import get_lead_transition_labels
34
- from simcats_datasets.support_functions._json_encoders import NumpyEncoder
35
36
 
36
37
  __all__ = []
37
38
 
38
39
 
39
40
  def _simulate(args: Tuple) -> Tuple:
40
- """Method to simulate a csd with the given args. Required for parallel simulation in create_cimulated_dataset.
41
+ """Method to simulate a CSD or sensor scan with the given args. Required for parallel simulation in create_simulated_dataset.
41
42
 
42
43
  Args:
43
- args: Tuple of sample_range_g1, sample_range_g2, volt_range, simcats_config, resolution.
44
+ args: Tuple of sample_range_g1, sample_range_g2, sample_range_sensor_g1, sample_range_sensor_g2, volt_range,
45
+ simcats_config, resolution, sensor_scan_dataset, reset_sensor_offset_mu_sens_in_csds.
44
46
 
45
47
  Returns:
46
- Tuple of csd, occ, lead_trans, metadata, line_points, labels.
48
+ Tuple of measurement, occupation_mask, lead_transition_mask, metadata, line_points, labels.
47
49
  """
48
- sample_range_g1, sample_range_g2, volt_range, simcats_config, resolution = args
50
+ (sample_range_g1, sample_range_g2, sample_range_sensor_g1, sample_range_sensor_g2, volt_range, simcats_config,
51
+ resolution, sensor_scan_dataset, reset_sensor_offset_mu_sens_in_csds) = args
49
52
 
50
53
  # random number generator used for sampling volt ranges.
51
54
  # !Must be generated here! Else same for every process!
@@ -75,23 +78,53 @@ def _simulate(args: Tuple) -> Tuple:
75
78
  sim = Simulation(**simcats_config)
76
79
 
77
80
  # sample voltage ranges
78
- g1_start = rng.uniform(low=sample_range_g1[0], high=sample_range_g1[1])
79
- g2_start = rng.uniform(low=sample_range_g2[0], high=sample_range_g2[1])
80
- g1_range = np.array([g1_start, g1_start + volt_range[0]])
81
- g2_range = np.array([g2_start, g2_start + volt_range[1]])
81
+ g1_start = rng.uniform(low=sample_range_g1[0], high=sample_range_g1[1]) if sample_range_g1 is not None else None
82
+ g2_start = rng.uniform(low=sample_range_g2[0], high=sample_range_g2[1]) if sample_range_g2 is not None else None
83
+ sensor_g1_start = rng.uniform(low=sample_range_sensor_g1[0], high=sample_range_sensor_g1[1]) if sample_range_sensor_g1 is not None else None
84
+ sensor_g2_start = rng.uniform(low=sample_range_sensor_g2[0], high=sample_range_sensor_g2[1]) if sample_range_sensor_g2 is not None else None
85
+
82
86
  # perform simulation
83
- csd, occ, lead_trans, metadata = sim.measure(
84
- sweep_range_g1=g1_range, sweep_range_g2=g2_range, resolution=resolution
85
- )
86
- # calculate lead_transition labels
87
- ideal_csd_conf = metadata["ideal_csd_config"]
88
- line_points, labels = get_lead_transition_labels(
89
- sweep_range_g1=g1_range,
90
- sweep_range_g2=g2_range,
91
- ideal_csd_config=ideal_csd_conf,
92
- lead_transition_mask=lead_trans,
93
- )
94
- return csd, occ, lead_trans, metadata, line_points, labels
87
+ if not sensor_scan_dataset:
88
+ g1_range = np.array([g1_start, g1_start + volt_range[0]])
89
+ g2_range = np.array([g2_start, g2_start + volt_range[1]])
90
+ if reset_sensor_offset_mu_sens_in_csds:
91
+ # calculate potential to reset offset_mu_sens
92
+ occupations, _ = sim.ideal_csd_config.get_csd_data(volt_limits_g1=g1_range,
93
+ volt_limits_g2=g2_range,
94
+ resolution=2)
95
+ potentials = simcats_config["sensor"].sensor_potential(occupations=occupations,
96
+ volt_limits_g1=g1_range,
97
+ volt_limits_g2=g2_range)
98
+ # the new offset is calculated as follows: offset - (potentials[0] - offset)
99
+ sim.sensor.offset_mu_sens = 2 * simcats_config["sensor"].offset_mu_sens - potentials[0]
100
+ measurement, occ, lead_trans, metadata = sim.measure(
101
+ sweep_range_g1=g1_range,
102
+ sweep_range_g2=g2_range,
103
+ volt_sensor_g1=sensor_g1_start,
104
+ volt_sensor_g2=sensor_g2_start,
105
+ resolution=resolution
106
+ )
107
+ # calculate lead_transition labels
108
+ ideal_csd_conf = metadata["ideal_csd_config"]
109
+ line_points, labels = get_lead_transition_labels(
110
+ sweep_range_g1=g1_range,
111
+ sweep_range_g2=g2_range,
112
+ ideal_csd_config=ideal_csd_conf,
113
+ lead_transition_mask=lead_trans,
114
+ )
115
+ else:
116
+ sensor_g1_range = np.array([sensor_g1_start, sensor_g1_start + volt_range[0]])
117
+ sensor_g2_range = np.array([sensor_g2_start, sensor_g2_start + volt_range[1]])
118
+ measurement, occ, lead_trans, metadata = sim.measure_sensor_scan(
119
+ sweep_range_sensor_g1=sensor_g1_range,
120
+ sweep_range_sensor_g2=sensor_g2_range,
121
+ volt_g1=g1_start,
122
+ volt_g2=g2_start,
123
+ resolution=resolution
124
+ )
125
+ line_points, labels = get_coulomb_oscillation_area_boundaries(metadata)
126
+
127
+ return measurement, occ, lead_trans, metadata, line_points, labels
95
128
 
96
129
 
97
130
  def create_simulated_dataset(
@@ -107,28 +140,36 @@ def create_simulated_dataset(
107
140
  max_len_line_labels_chunk: int = 2000,
108
141
  max_len_metadata_chunk: int = 8000,
109
142
  dtype_csd: np.dtype = np.float32,
143
+ dtype_sensor_scan: np.dtype = np.float32,
110
144
  dtype_occ: np.dtype = np.float32,
111
145
  dtype_tct: np.dtype = np.uint8,
146
+ dtype_sensor_regime_masks: np.dtype = np.uint8,
147
+ dtype_sensor_peak_center_masks: np.dtype = np.uint8,
112
148
  dtype_line_coordinates: np.dtype = np.float32,
149
+ sensor_scan_dataset: bool = False,
150
+ reset_sensor_offset_mu_sens_in_csds: bool = False,
113
151
  ) -> None:
114
152
  """Function for generating simulated datasets using SimCATS for simulations.
115
153
 
116
- **Warning**: This function expects that the simulation config uses IdealCSDGeometric from SimCATS. Other
117
- implementations are not guaranteed to work.
154
+ Datasets can either contain CSDs or sensor scans.
155
+
156
+ **Warning**: This function expects that the simulation config uses IdealCSDGeometric (from SimCATS) for CSD datasets
157
+ and SensorScanSensorGeneric (from SimCATS) for sensor scan datasets. Other implementations are not guaranteed to
158
+ work.
118
159
 
119
160
  Args:
120
161
  dataset_path: The path where the dataset will be stored. Can also be an already existing dataset, to which new
121
162
  data is added.
122
- simcats_config: Configuration for simcats simulation class. Default is the GaAs_v1 config provided by simcats.
123
- n_runs: Number of CSDs to be generated. Default is 10000.
124
- resolution: Pixel resolution for both axis of the CSDs, first number of columns (x), then number of rows (y).
125
- Default is np.array([100, 100]). \n
163
+ simcats_config: Configuration for SimCATS simulation class. Default is the GaAs_v1 config provided by SimCATS.
164
+ n_runs: Number of CSDs or sensor scans to be generated. Default is 10000.
165
+ resolution: Pixel resolution for both axes of the measurements, first number of columns (x), then number of rows
166
+ (y). Default is np.array([100, 100]). \n
126
167
  Example: \n
127
168
  [res_g1, res_g2]
128
- volt_range: Volt range for both axis of the CSDs. Individual CSDs with the specified size are randomly sampled
129
- in the voltage space. Default is np.array([0.03, 0.03]) (usually the scans from RWTH GaAs offler sample are
130
- 30mV x 30mV).
131
- tags: Additional tags for the data to be simulated, which will be added to the dataset DataFrame. Default is
169
+ volt_range: Volt range for both axes of the measurements. Individual measurements with the specified size are
170
+ randomly sampled in the voltage space (defined by the volt_limits in the SimCATS config). Default is
171
+ np.array([0.03, 0.03]) (usually the scans from RWTH GaAs offler sample are 30mV x 30mV).
172
+ tags: Additional tags for the data to be simulated, which will be added to the dataset metadata. Default is
132
173
  None. \n
133
174
  Example: \n
134
175
  {"tags": "shifted sensor, no noise", "sample": "GaAs"}.
@@ -139,9 +180,21 @@ def create_simulated_dataset(
139
180
  max_len_line_labels_chunk: Maximum number of chars for the line label dict. Default is 2000.
140
181
  max_len_metadata_chunk: Maximum number of chars for the metadata dict. Default is 8000.
141
182
  dtype_csd: Specifies the dtype to be used for saving CSDs. Default is np.float32.
183
+ dtype_sensor_scan: Specifies the dtype to be used for saving sensor scans. Default is np.float32.
142
184
  dtype_occ: Specifies the dtype to be used for saving Occupations. Default is np.float32.
143
185
  dtype_tct: Specifies the dtype to be used for saving TCTs. Default is np.uint8.
186
+ dtype_sensor_regime_masks: Specifies the dtype to be used for saving sensor regime masks. Default is np.uint8.
187
+ dtype_sensor_peak_center_masks: Specifies the dtype to be used for saving sensor peak center masks. Default is
188
+ np.uint8.
144
189
  dtype_line_coordinates: Specifies the dtype to be used for saving line coordinates. Default is np.float32.
190
+ sensor_scan_dataset: Determines whether to generate a sensor scan dataset (contains sensor scans instead of
191
+ CSDs). Default is False.
192
+ reset_sensor_offset_mu_sens_in_csds: Specifies whether to reset the sensor offset_mu_sens parameter before CSD
193
+ measurements. If this is activated, the offset of the sensor potential is reset so that the first pixel of
194
+ the CSD is exactly at the previously defined offset_mu_sens. Thus, this effectively resets the sensor to
195
+ start at the position defined by offset_mu_sens before starting to measure. It is intended to simulate that
196
+ the sensor is retuned to the defined position before each CSD. It has no effect for sensor scan datasets.
197
+ Default is False.
145
198
  """
146
199
  # set tags to an empty dict if none were supplied
147
200
  if tags is None:
@@ -150,36 +203,81 @@ def create_simulated_dataset(
150
203
  # Create path where the dataset will be saved (if folder doesn't exist already)
151
204
  Path(Path(dataset_path).parent).mkdir(parents=True, exist_ok=True)
152
205
 
206
+ # retrieve the allowed sampling ranges from the config and copy them (else we would change the config itself)
207
+ sample_range_g1 = simcats_config.get("volt_limits_g1", None)
208
+ sample_range_g1 = sample_range_g1.astype(np.float32) if sample_range_g1 is not None else None
209
+ sample_range_g2 = simcats_config.get("volt_limits_g2", None)
210
+ sample_range_g2 = sample_range_g2.astype(np.float32) if sample_range_g2 is not None else None
211
+ sample_range_sensor_g1 = simcats_config.get("volt_limits_sensor_g1", None)
212
+ sample_range_sensor_g1 = sample_range_sensor_g1.astype(np.float32) if sample_range_sensor_g1 is not None else None
213
+ sample_range_sensor_g2 = simcats_config.get("volt_limits_sensor_g2", None)
214
+ sample_range_sensor_g2 = sample_range_sensor_g2.astype(np.float32) if sample_range_sensor_g2 is not None else None
153
215
  # arrange volt limits so that random sampling gives us a starting point that is at least the defined volt_range below
154
216
  # the maximum
155
- sample_range_g1 = simcats_config["volt_limits_g1"].copy()
156
- sample_range_g1[-1] -= volt_range[0]
157
- sample_range_g2 = simcats_config["volt_limits_g2"].copy()
158
- sample_range_g2[-1] -= volt_range[1]
217
+ if not sensor_scan_dataset:
218
+ measurement_type = "csds"
219
+ sample_range_g1[-1] -= volt_range[0]
220
+ sample_range_g2[-1] -= volt_range[1]
221
+ else:
222
+ measurement_type = "sensor_scans"
223
+ sample_range_sensor_g1[-1] -= volt_range[0]
224
+ sample_range_sensor_g2[-1] -= volt_range[1]
159
225
 
160
226
  with h5py.File(dataset_path, "a") as hdf5_file:
161
227
  # load datasets or create them if not already there
162
- csds = hdf5_file.require_dataset(
163
- name="csds",
164
- shape=(0, resolution[1], resolution[0]),
165
- chunks=(1, resolution[1], resolution[0]),
166
- dtype=dtype_csd,
167
- maxshape=(None, resolution[1], resolution[0]),
168
- )
169
- occupations = hdf5_file.require_dataset(
170
- name="occupations",
171
- shape=(0, resolution[1], resolution[0], 2),
172
- chunks=(1, resolution[1], resolution[0], 2),
173
- dtype=dtype_occ,
174
- maxshape=(None, resolution[1], resolution[0], 2),
175
- )
176
- tct_masks = hdf5_file.require_dataset(
177
- name="tct_masks",
178
- shape=(0, resolution[1], resolution[0]),
179
- chunks=(1, resolution[1], resolution[0]),
180
- dtype=dtype_tct,
181
- maxshape=(None, resolution[1], resolution[0]),
182
- )
228
+ if isinstance(resolution, int):
229
+ measurements = hdf5_file.require_dataset(
230
+ name=measurement_type,
231
+ shape=(0, resolution),
232
+ chunks=(1, resolution),
233
+ dtype=dtype_csd if not sensor_scan_dataset else dtype_sensor_scan,
234
+ maxshape=(None, resolution),
235
+ )
236
+ occupations = hdf5_file.require_dataset(
237
+ name="occupations" if not sensor_scan_dataset else "sensor_regime_masks",
238
+ shape=(0, resolution, 2) if not sensor_scan_dataset else (0, resolution),
239
+ chunks=(1, resolution, 2) if not sensor_scan_dataset else (1, resolution),
240
+ dtype=dtype_occ if not sensor_scan_dataset else dtype_sensor_regime_masks,
241
+ maxshape=(None, resolution, 2) if not sensor_scan_dataset else (None, resolution),
242
+ )
243
+ tct_masks = hdf5_file.require_dataset(
244
+ name="tct_masks" if not sensor_scan_dataset else "sensor_peak_center_masks",
245
+ shape=(0, resolution),
246
+ chunks=(1, resolution),
247
+ dtype=dtype_tct if not sensor_scan_dataset else dtype_sensor_peak_center_masks,
248
+ maxshape=(None, resolution),
249
+ )
250
+
251
+ elif len(resolution) == 2:
252
+ measurements = hdf5_file.require_dataset(
253
+ name=measurement_type,
254
+ shape=(0, resolution[1], resolution[0]),
255
+ chunks=(1, resolution[1], resolution[0]),
256
+ dtype=dtype_csd if not sensor_scan_dataset else dtype_sensor_scan,
257
+ maxshape=(None, resolution[1], resolution[0]),
258
+ )
259
+ occupations = hdf5_file.require_dataset(
260
+ name="occupations" if not sensor_scan_dataset else "sensor_regime_masks",
261
+ shape=(0, resolution[1], resolution[0], 2) if not sensor_scan_dataset else (
262
+ 0, resolution[1], resolution[0]),
263
+ chunks=(1, resolution[1], resolution[0], 2) if not sensor_scan_dataset else (
264
+ 1, resolution[1], resolution[0]),
265
+ dtype=dtype_occ if not sensor_scan_dataset else dtype_sensor_regime_masks,
266
+ maxshape=(None, resolution[1], resolution[0], 2) if not sensor_scan_dataset else (
267
+ None, resolution[1], resolution[0]),
268
+ )
269
+ tct_masks = hdf5_file.require_dataset(
270
+ name="tct_masks" if not sensor_scan_dataset else "sensor_peak_center_masks",
271
+ shape=(0, resolution[1], resolution[0]),
272
+ chunks=(1, resolution[1], resolution[0]),
273
+ dtype=dtype_tct if not sensor_scan_dataset else dtype_sensor_peak_center_masks,
274
+ maxshape=(None, resolution[1], resolution[0]),
275
+ )
276
+ else:
277
+ raise ValueError(
278
+ "An invalid resolution was given. The resolution should either be an integer or a one dimensional numpy"
279
+ " array with two elements.")
280
+
183
281
  line_coords = hdf5_file.require_dataset(
184
282
  name="line_coordinates",
185
283
  shape=(0, max_len_line_coordinates_chunk),
@@ -202,10 +300,10 @@ def create_simulated_dataset(
202
300
  maxshape=(None, max_len_metadata_chunk),
203
301
  )
204
302
  # determine index offset if there is already data in the dataset
205
- id_offset = csds.shape[0]
303
+ id_offset = measurements.shape[0]
206
304
 
207
305
  # resize datasets to fit new data
208
- csds.resize(csds.shape[0] + n_runs, axis=0)
306
+ measurements.resize(measurements.shape[0] + n_runs, axis=0)
209
307
  occupations.resize(occupations.shape[0] + n_runs, axis=0)
210
308
  tct_masks.resize(tct_masks.shape[0] + n_runs, axis=0)
211
309
  line_coords.resize(line_coords.shape[0] + n_runs, axis=0)
@@ -215,10 +313,11 @@ def create_simulated_dataset(
215
313
  # simulate and save data
216
314
  indices = range(id_offset, n_runs + id_offset)
217
315
  arguments = itertools.repeat(
218
- (sample_range_g1, sample_range_g2, volt_range, simcats_config, resolution),
316
+ (sample_range_g1, sample_range_g2, sample_range_sensor_g1, sample_range_sensor_g2, volt_range,
317
+ simcats_config, resolution, sensor_scan_dataset, reset_sensor_offset_mu_sens_in_csds),
219
318
  times=len(indices),
220
319
  )
221
- for index, (csd, occ, lead_trans, metadata, line_points, labels) in zip(
320
+ for index, (measurement, occ, lead_trans, metadata, line_points, labels) in zip(
222
321
  indices,
223
322
  progress_imap(
224
323
  func=_simulate,
@@ -230,9 +329,9 @@ def create_simulated_dataset(
230
329
  ),
231
330
  ):
232
331
  # save data
233
- csds[index] = csd.astype(dtype_csd)
234
- occupations[index] = occ.astype(dtype_occ)
235
- tct_masks[index] = lead_trans.astype(dtype_tct)
332
+ measurements[index] = measurement.astype(dtype_csd if not sensor_scan_dataset else dtype_sensor_scan)
333
+ occupations[index] = occ.astype(dtype_occ if not sensor_scan_dataset else dtype_sensor_regime_masks)
334
+ tct_masks[index] = lead_trans.astype(dtype_tct if not sensor_scan_dataset else dtype_sensor_peak_center_masks)
236
335
  line_coords[index] = np.pad(
237
336
  line_points.flatten(),
238
337
  ((0, max_len_line_coordinates_chunk - line_points.size)),