das2numpy 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
das2numpy/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ """ Module for efficient loading of both optasense and silixa data data.
2
+ @author: Erik genthe
3
+ """
4
+
5
+ import os as OS
6
+ import numpy as NP
7
+ from multipledispatch import dispatch
8
+
9
+
10
+
11
+ #@dispatch(str, str, int)
12
def loader(root_path:str, predefined_setup:str, num_worker_threads:int):
    """Create and return a chunk loader for a predefined acquisition setup.

    Args:
        root_path (str): Directory containing the data files (searched recursively).
        predefined_setup (str): Setup name, case-insensitive.
            Supported values: "SILIXA" and "OPTASENSE".
        num_worker_threads (int): Number of worker threads used for loading.
            NOTE(review): only forwarded to the Silixa setup; the Optasense
            init() takes no arguments here — confirm this asymmetry is intended.

    Returns:
        The chunk object produced by the selected setup's init(), ready for
        load_array(...) calls.

    Raises:
        RuntimeError: If predefined_setup is not a known setup name.
    """
    setup = predefined_setup.upper()
    if setup == "SILIXA":
        # Imported lazily so only the required setup module is loaded.
        from .setups import silixa
        chunk = silixa.init(root_path, num_worker_threads)
    elif setup == "OPTASENSE":
        from .setups import optasense_b35idefix
        chunk = optasense_b35idefix.init()
    else:
        # BUGFIX: a single formatted message instead of a two-argument
        # exception, which rendered as a tuple when printed.
        raise RuntimeError(f"Unknown setup: {predefined_setup}")
    return chunk
24
+
25
+
26
+ #@dispatch(str, str, int)
27
+ #def loader(self, root_path:str, predefined_setup:str, num_worker_threads):
28
+ # return chunk
das2numpy/__main__.py ADDED
@@ -0,0 +1,99 @@
1
+ import argparse
2
+ from datetime import datetime
3
+ from time import time
4
+ import numpy as NP
5
+ from . import loader
6
+
7
+
8
def parse_arguments(argv=None):
    """Parse the command line arguments of the das2numpy CLI.

    Args:
        argv: Optional list of argument strings. Defaults to None, in which
            case argparse falls back to sys.argv[1:] — backward compatible
            with the previous zero-argument signature.

    Returns:
        argparse.Namespace with the parsed arguments.
    """
    parser = argparse.ArgumentParser(description="This command line interface is work in progress!\nTODO script description!")
    parser.add_argument(
        "-v", "--verbosity",
        action="count",
        default=0,  # a "count" action should start at an int, not False
        help="Print more information to stdout"
    )

    parser.add_argument(
        "device",
        type=str,
        help="Keyword for selecting the device. TODO unused yet!"
    )
    parser.add_argument(
        "root_path",
        type=str,
        help="The path of the directory containing the data files. Recursive search."
    )
    parser.add_argument(
        "start",
        type=datetime.fromisoformat,  # the function itself is the converter; no lambda needed
        help="Start timestamp in ISO format (YYYY-MM-DDTHH:MM:SS)."
    )
    parser.add_argument(
        "end",
        type=datetime.fromisoformat,
        help="End timestamp in ISO format (YYYY-MM-DDTHH:MM:SS)."
    )
    parser.add_argument(
        "time_step",
        type=int,
        help="Time step as an integer."
    )
    parser.add_argument(
        "channel_start",
        type=int,
        help="Channel start as an integer."
    )
    parser.add_argument(
        "channel_end",
        type=int,
        help="Channel end as an integer."
    )
    parser.add_argument(
        "channel_step",
        type=int,
        help="Channel step as an integer."
    )
    parser.add_argument(
        "output",
        type=str,
        help="The path where to store the numpy file containing the data \"default\" or \"stdout\". "
        + "If \"default\" is given, the file name will be the \"<startime>.npy\". "
        + "If \"stdout\" is given, the data is piped to stdout as binary."
        + "TODO: stdout not implemented yet!"
    )

    return parser.parse_args(argv)
68
+
69
def main():
    """CLI entry point: load the requested data range and save it as a .npy file."""
    args = parse_arguments()
    if args.verbosity: print("Args:", args)

    # Resolve the output file name.
    if args.output == "default":
        fname = args.start.strftime("%Y%m%dT%H%M%S") + ".npy"
    elif args.output == "stdout":
        raise RuntimeError("Not implemented yet")
    else:
        fname = args.output  # BUGFIX: was the undefined name `output` (NameError)

    print("Load...")
    start = time()
    loaderinstance = loader(args.root_path, args.device, num_worker_threads=4)
    data = loaderinstance.load_array(args.start, args.end, args.time_step,
                                     args.channel_start, args.channel_end, args.channel_step)
    if args.verbosity:
        end = time()
        print("Duration", end-start)
        # NOTE(review): the `2.0 * 1000` factor looks like "2 bytes per sample",
        # but the loader allocates float32 (4 bytes) — confirm the intended unit.
        print("Data:", NP.array(data.shape).prod() * 2.0 * 1000 / 1.0e6, "mb")
        print("Rate:", NP.array(data.shape).prod() * 2.0 * 1000 / 1.0e6 / (end-start), "mb/s")
    print("Saving...", fname)
    NP.save(fname, data)


if __name__ == "__main__":
    main()
99
+
das2numpy/chunk.py ADDED
@@ -0,0 +1,239 @@
1
+ """
2
+ See docstring of class Chunk.
3
+
4
+ Benchmark Optasense (measurements in seconds):
5
+ TIME for loading one whole file using h5py: 12.864407300949097
6
+ TIME for loading the first 1000 sensors from 10 files: 6.066787958145142
7
+ TIME for loading with sensor_step=10 from 10 files: 23.70387291908264
8
+ TIME for loading 100 sensors from 100 files 8.697869777679443
9
+ TIME for loading 1000 sensors from 100 files 92.85049629211426
10
+ TIME for loading 40 files completely 278.97754430770874
11
+ """
12
+ from typing import Callable
13
+ from math import floor
14
+ from datetime import datetime
15
+ from random import shuffle
16
+ from multipledispatch import dispatch
17
+ import concurrent.futures as CF
18
+ from concurrent.futures import ThreadPoolExecutor
19
+ from multiprocessing import Pool
20
+ import numpy as NP
21
+ from .filefinder import FileFinder, to_posix_timestamp_ms
22
+
23
+
24
+ SHUFFLE_TASKS = False
25
+
26
+ def _predict_size(start: int, end: int, step: int) -> int:
27
+ diff = end - start
28
+ return int(((diff-1) - (diff-1)%step) / step + 1)
29
+
30
+
31
+
32
+
33
class Chunk():
    """
    Class for efficient loading and storing data.

    After the data is loaded, using one of the load...(...) methods,
    the data and the meta information can be accessed directly by accessing
    the following fields: data, timestamps, geo_positions, channel.
    The ``data`` field is a 2d float32 numpy array: axis 0 is time,
    axis 1 is the channel.
    TODO implement geo_positions, channel, timestamps
    author: ingrabarbosa, Erik Genthe
    """


    def __init__(self,
            file_finder:FileFinder,
            file_channel_amount:int,
            file_time_sample_amount:int,
            multithreaded:bool,
            workers:int,
            workerprocess:bool,
            loading_function:Callable[[str, int, int, int, int, int, int], NP.ndarray]
            ):
        """
        Args:
            file_finder: Resolves (start_timestamp_ms, file_path) pairs for a time range.
            file_channel_amount: Number of channels stored in every data file.
            file_time_sample_amount: Number of time samples stored in every data file.
                NOTE(review): the code adds this value directly to ms timestamps,
                i.e. it assumes one sample per millisecond — confirm.
            multithreaded: If True, files are loaded concurrently via a thread pool.
            workers: Size of the thread pool (only used when multithreaded is True).
            workerprocess: If True, every file is read in a separate worker process
                (see comment in __load_from_file_into_data).
            loading_function: Callable(file_path, t_start, t_end, t_step,
                channel_start, channel_end, channel_step) -> ndarray that reads
                one slice out of a single file.
        """
        self.__file_finder = file_finder
        self.__file_channel_amount = file_channel_amount
        self.__file_time_sample_amount = file_time_sample_amount
        self.__multithreaded = multithreaded
        self.__workerprocess = workerprocess
        self.__loading_function = loading_function
        if multithreaded:
            # Pool is created once and reused across load_array calls.
            self.__executor = ThreadPoolExecutor(workers)
        if not self.__multithreaded:
            print("Warning: Chunk is not in multiprocessing or multithreading mode!")



    def __load_from_file_into_data(self,
            start_timestamp:int,
            file_path:str,
            t_start:int,
            t_end:int,
            t_step:int,
            channel_start:int,
            channel_end:int,
            channel_step:int
            ) -> None:
        """Load the requested slice of one file and write it into self.data.

        Args:
            start_timestamp: POSIX timestamp (ms) of the first sample in the file.
            file_path: Path of the file to read.
            t_start, t_end, t_step: Requested global time range/stride (ms).
            channel_start, channel_end, channel_step: Requested channel range/stride.
        """
        #print("Args: ", start_timestamp, file_path, t_start, t_end, t_step, channel_start, channel_end, channel_step)
        # Check if the whole file shall be loaded. Especially the first and last file could be cut...
        rel_t_start = 0
        rel_t_end = self.__file_time_sample_amount
        if t_start > start_timestamp:
            # Requested range starts inside this file: skip the leading part.
            rel_t_start = t_start - start_timestamp
        if t_end < start_timestamp + self.__file_time_sample_amount: #TODO magicnumber
            # Requested range ends inside this file: cut the trailing part.
            rel_t_end = t_end - start_timestamp
        if rel_t_start == rel_t_end:
            return # Do nothing
        #print("relative start, relative end", rel_t_start, rel_t_end)
        if start_timestamp + self.__file_time_sample_amount <= t_start:
            # The file ends before the requested range even begins — a gap in
            # the recording. The destination rows stay zero-filled.
            print("Warning: File does not contain any parts of the requested data.",
                "This can happen if there are leaks in the data. The corresponding output will be left filled with zeros.\n",
                f" Requested range (Posixtimestamps in ms): [{t_start}, {t_end}[\n",
                f" Filepath: {file_path}.")
            return
        assert rel_t_end > rel_t_start, f"rel_t_start={rel_t_start}, rel_t_end={rel_t_end}."


        # Load h5-data using a different process... There is no other way to make h5py work parallel :(
        data = None
        if self.__workerprocess:
            # One short-lived process per file read.
            pool = Pool(1)
            result = pool.apply_async(self.__loading_function,
                (file_path, rel_t_start, rel_t_end, t_step, channel_start, channel_end, channel_step))
            pool.close()
            result = result.get() # Blocks!
            data = result
        else:
            data = self.__loading_function(file_path, rel_t_start, rel_t_end, t_step, channel_start, channel_end, channel_step)

        # Store loaded data part into all_data.
        # start_index = destination row (in self.data) of this file's first loaded sample.
        start_index = floor((start_timestamp - t_start) / t_step)
        #print(start_index)
        if start_index < 0:
            start_index = 0
        #print("Shape: ", data.shape)
        self.data[start_index : start_index + data.shape[0],:] = data[:,:]

    @dispatch(int, int, int, int, int, int)
    def load_array_posix_ms(self, t_start: int, t_end: int, t_step: int, channel_start: int, channel_end: int, channel_step: int) -> NP.ndarray:
        """ Loading data
        Warning: using a different value than 1 for t_step or channel_step can result in a high cpu-usage.
        Consider using multithreaded=True in the constructor and a high amount of workers if needed.
        Constraints:
            t_start has to be less or equal t_end,
            same for channel_start and channel_end.
            t_step and channel_step have to be greater than 0
        Args:
            t_start (int): A posix timestamp in ms which defines the start of the data to load.
            t_end (int): A posix timestamp in ms which defines the end of the data to load (exclusive).
            t_step (int): If you, for example, only want to load the data of every fourth timestep, use t_step=4.
            channel_start (int): The starting index of sensor in the data (inclusive).
            channel_end (int): The ending index of sensors in the data (exclusive).
                A value of -1 selects all channels up to file_channel_amount.
            channel_step (int): Like t_step, but for the sensor position.
        Returns:
            Data as a 2d numpy array (also kept in self.data);
            rows left uncovered by the files remain zero.
        """

        assert channel_start >= 0
        assert channel_start <= self.__file_channel_amount
        if channel_end == -1:
            channel_end = self.__file_channel_amount
        assert channel_end >= channel_start
        assert channel_end <= self.__file_channel_amount, "channel_end has to be less or equal than self.__file_channel_amount"
        assert t_step > 0
        assert channel_step > 0

        file_pathes = self.__file_finder.get_range_posix(t_start, t_end)
        print(f"Loading data from {len(file_pathes)} files.")
        print("file_pathes", file_pathes)
        # Pre-allocate the full output; each file writes its slice into it.
        data_shape = (
            _predict_size(t_start, t_end, t_step),
            _predict_size(channel_start, channel_end, channel_step)
        )
        self.data = NP.zeros(shape=data_shape, dtype=NP.float32)
        if self.__multithreaded:
            futures = []
            if SHUFFLE_TASKS:
                shuffle(file_pathes)
            for start_timestamp, file_path in file_pathes:
                futures.append(
                    self.__executor.submit(
                        self.__load_from_file_into_data,
                        start_timestamp,
                        file_path,
                        t_start,
                        t_end,
                        t_step,
                        channel_start,
                        channel_end,
                        channel_step
                    )
                )

            for future in CF.as_completed(futures):
                future.result() # Raises possible exceptions

        else:
            for start_timestamp, file_path in file_pathes:
                self.__load_from_file_into_data(
                    start_timestamp,
                    file_path,
                    t_start,
                    t_end,
                    t_step,
                    channel_start,
                    channel_end,
                    channel_step)

        return self.data




    @dispatch(datetime, datetime, int, int)
    def load_array(self, t_start:datetime, t_end:datetime, channel_start:int, channel_end:int) -> NP.ndarray:
        """ Loads data and returns it as a numpy array.
        Convenience overload with t_step=1 and channel_step=1.
        Constraints:
            t_start has to be less or equal t_end,
            same for channel_start and channel_end.
        Args:
            t_start (datetime): datetime object which defines the start of the data to load.
            t_end (datetime): datetime object which defines the end of the data to load.
            channel_start (int): The starting index of sensor in the data (inclusive).
            channel_end (int): The ending index of sensors in the data (exclusive).
        Returns:
            A 2d-numpy-array containing the data.
            The first axis corresponds to the time, the second to the channel
        """
        return self.load_array(t_start, t_end, 1, channel_start, channel_end, 1)


    @dispatch(datetime, datetime, int, int, int, int)
    def load_array(self, t_start:datetime, t_end:datetime, t_step:int, channel_start:int, channel_end:int, channel_step:int) -> NP.ndarray:
        """ Loads data into a numpy array and returns it.
        The array can also be accessed afterwards via the data field of this instance.
        Warning: using a different value than 1 for t_step or channel_step can result in a high cpu-usage.
        Consider using multithreaded=True in the constructor and a high amount of workers if needed.
        Constraints:
            t_start has to be less or equal t_end,
            same for channel_start and channel_end.
            t_step and channel_step have to be greater than 0
        Args:
            t_start (datetime): datetime object which defines the start of the data to load.
            t_end (datetime): datetime object which defines the end of the data to load.
            t_step (int): If you, for example, only want to load the data of every fourth timestep, use t_step=4.
            channel_start (int): The starting index of sensor in the data (inclusive).
            channel_end (int): The ending index of sensors in the data (exclusive).
            channel_step (int): Like t_step, but for the sensor position.
        Returns:
            A 2d-numpy-array containing the data.
            The first axis corresponds to the time, the second to the channel
        """
        return self.load_array_posix_ms(to_posix_timestamp_ms(t_start), to_posix_timestamp_ms(t_end), t_step, channel_start, channel_end, channel_step)


    @dispatch(int, int, int, int)
    def load_array_posix_ms(self, t_start:int, t_end:int, channel_start:int, channel_end:int) -> NP.ndarray:
        # Convenience overload with t_step=1 and channel_step=1.
        return self.load_array_posix_ms(t_start, t_end, 1, channel_start, channel_end, 1)
238
+
239
+
@@ -0,0 +1,115 @@
1
+ """ See class docstring FileFinder """
2
+
3
+
4
+ import pickle as PICKLE
5
+ import os as OS
6
+ import datetime as DT
7
+ from typing import Callable
8
+ from time import time # For debug
9
+
10
+ USE_CACHE_FILE = False
11
+
12
+ def to_posix_timestamp_ms(timestamp:DT.datetime) -> int:
13
+ """
14
+ Takes a datetime-object and returns the posix timestamp in milliseconds.
15
+ """
16
+ return int(timestamp.timestamp()*1000)
17
+
18
+ instance_counter = 0 # Caution: This is a CLASS-Variable.
19
+
20
+ class FileFinder():
21
+ """
22
+ Class for finding the required files for given time-ranges.
23
+ @author: Erik Genthe
24
+ @since: 04.01.2022
25
+ """
26
+
27
+ # Time complexities.
28
+ # Source: https://wiki.python.org/moin/TimeComplexity
29
+ # list append() -> O(1)
30
+ # list len() -> O(1)
31
+ # list get() -> O(1)
32
+
33
+ def __init__(self, root_path:str, file_suffix:str, filename_to_posixtimestamp:Callable[[str], int]):
34
+ global instance_counter
35
+ self.instance_number = instance_counter
36
+ instance_counter += 1
37
+ self.__root_path = root_path
38
+ self.__file_pathes = []
39
+ self.__cache_path = OS.path.dirname(__file__) + "/pathes_cache" + str(self.instance_number)
40
+
41
+ if USE_CACHE_FILE and OS.path.exists(self.__cache_path):
42
+ f = open(self.__cache_path, 'rb')
43
+ self.__file_pathes = PICKLE.load(f)
44
+ f.close()
45
+ else:
46
+ time_start = time()
47
+ for pathlist in OS.walk(root_path):
48
+ for file_name in pathlist[2]:
49
+ if file_name.endswith(file_suffix):
50
+ posix_timestamp_ms = filename_to_posixtimestamp(file_name)
51
+ path = OS.path.join(pathlist[0], file_name)
52
+ self.__file_pathes.append((posix_timestamp_ms, path))
53
+ self.__file_pathes.sort()
54
+ time_end = time()
55
+ print(f"Filefinder: Time used for creating file list: {time_end-time_start} seconds for {len(self.__file_pathes)} files.")
56
+ if USE_CACHE_FILE:
57
+ f = open(self.__cache_path, 'wb')
58
+ PICKLE.dump(self.__file_pathes, f)
59
+ f.close()
60
+
61
+ if self.__file_pathes == []:
62
+ raise RuntimeError(f"Error: No {file_suffix} files found in {root_path} and its subdirectories.")
63
+
64
+
65
+ def __find_nearest_before(self, posix_timestamp_ms: int) -> tuple:
66
+ """Method __find_neares_before(self, posix_timestamp_ms)
67
+ Time complexity: O(n) (n := number of files)
68
+ TODO reduce to O(log(n)). This can be easily done.
69
+
70
+ Args:
71
+ posix_timestamp_ms (int): The posix timestamp in milliseconds to base the search on.
72
+ Returns:
73
+ tuple: A triple (internal_index, posix timestamp in millis of the file start, file path)
74
+ None: If the given time was before any recording was done.
75
+ """
76
+ for i in range(len(self.__file_pathes)-1, 0, -1): # Iterate reverse
77
+ key, value = self.__file_pathes[i]
78
+ if key < posix_timestamp_ms:
79
+ return (i, key, value)
80
+ return None
81
+
82
+
83
+ def get_range(self, t_start:DT.datetime, t_end:DT.datetime) -> list:
84
+ """
85
+ See method get_range_posix.
86
+ """
87
+ return self.get_range_posix(to_posix_timestamp_ms(t_start), to_posix_timestamp_ms(t_end))
88
+
89
+
90
+ def get_range_posix(self, t_start:int, t_end:int) -> list:
91
+ """Gets the files that contain the data for a given time-range.
92
+ Args:
93
+ t_start (int): Starting time of the requested range
94
+ t_end (int): Ending time of the requested range
95
+ Returns:
96
+ tuple: A list containing tuples.
97
+ First element of each tuple is the posix timestamp in ms of the start of the file,
98
+ Second element of each tuple is the path of the file.
99
+ """
100
+ assert isinstance(t_start, int)
101
+ assert isinstance(t_end, int)
102
+ assert t_start <= t_end, f"t_start={t_start} is supposed to be less or equal t_end={t_end}"
103
+ first = self.__find_nearest_before(t_start)
104
+ last = self.__find_nearest_before(t_end)
105
+ if first is None:
106
+ first = (0,)
107
+ if last is None:
108
+ return []
109
+ return self.__file_pathes[ first[0] : last[0] + 1 ]
110
+
111
+
112
+ def get_first(self) -> tuple:
113
+ if len(self.__file_pathes) == 0:
114
+ raise Exception(f"No data files found in root directory: {self.__root_path}")
115
+ return self.__file_pathes[0]