das2numpy 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
das2numpy/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ """ Module for efficient loading of both optasense and silixa data data.
2
+ @author: Erik genthe
3
+ """
4
+
5
+ import os as OS
6
+ import numpy as NP
7
+ from multipledispatch import dispatch
8
+
9
+
10
+
11
+ #@dispatch(str, str, int)
12
def loader(root_path:str, predefined_setup:str, num_worker_threads:int):
    """Create and return a chunk loader for a predefined acquisition setup.

    Args:
        root_path (str): Directory containing the data files (searched recursively).
        predefined_setup (str): Setup name, case-insensitive.
            Supported values: "SILIXA" and "OPTASENSE".
        num_worker_threads (int): Number of worker threads used for loading.
            NOTE(review): only forwarded to the Silixa setup; the Optasense
            init() takes no arguments here — confirm this asymmetry is intended.

    Returns:
        The chunk object produced by the selected setup's init(), ready for
        load_array(...) calls.

    Raises:
        RuntimeError: If predefined_setup is not a known setup name.
    """
    setup = predefined_setup.upper()
    if setup == "SILIXA":
        # Imported lazily so only the required setup module is loaded.
        from .setups import silixa
        chunk = silixa.init(root_path, num_worker_threads)
    elif setup == "OPTASENSE":
        from .setups import optasense_b35idefix
        chunk = optasense_b35idefix.init()
    else:
        # BUGFIX: a single formatted message instead of a two-argument
        # exception, which rendered as a tuple when printed.
        raise RuntimeError(f"Unknown setup: {predefined_setup}")
    return chunk
24
+
25
+
26
+ #@dispatch(str, str, int)
27
+ #def loader(self, root_path:str, predefined_setup:str, num_worker_threads):
28
+ # return chunk
das2numpy/__main__.py ADDED
@@ -0,0 +1,99 @@
1
+ import argparse
2
+ from datetime import datetime
3
+ from time import time
4
+ import numpy as NP
5
+ from . import loader
6
+
7
+
8
def parse_arguments(argv=None):
    """Parse the command line arguments of the das2numpy CLI.

    Args:
        argv: Optional list of argument strings. Defaults to None, in which
            case argparse falls back to sys.argv[1:] — backward compatible
            with the previous zero-argument signature.

    Returns:
        argparse.Namespace with the parsed arguments.
    """
    parser = argparse.ArgumentParser(description="This command line interface is work in progress!\nTODO script description!")
    parser.add_argument(
        "-v", "--verbosity",
        action="count",
        default=0,  # a "count" action should start at an int, not False
        help="Print more information to stdout"
    )

    parser.add_argument(
        "device",
        type=str,
        help="Keyword for selecting the device. TODO unused yet!"
    )
    parser.add_argument(
        "root_path",
        type=str,
        help="The path of the directory containing the data files. Recursive search."
    )
    parser.add_argument(
        "start",
        type=datetime.fromisoformat,  # the function itself is the converter; no lambda needed
        help="Start timestamp in ISO format (YYYY-MM-DDTHH:MM:SS)."
    )
    parser.add_argument(
        "end",
        type=datetime.fromisoformat,
        help="End timestamp in ISO format (YYYY-MM-DDTHH:MM:SS)."
    )
    parser.add_argument(
        "time_step",
        type=int,
        help="Time step as an integer."
    )
    parser.add_argument(
        "channel_start",
        type=int,
        help="Channel start as an integer."
    )
    parser.add_argument(
        "channel_end",
        type=int,
        help="Channel end as an integer."
    )
    parser.add_argument(
        "channel_step",
        type=int,
        help="Channel step as an integer."
    )
    parser.add_argument(
        "output",
        type=str,
        help="The path where to store the numpy file containing the data \"default\" or \"stdout\". "
        + "If \"default\" is given, the file name will be the \"<startime>.npy\". "
        + "If \"stdout\" is given, the data is piped to stdout as binary."
        + "TODO: stdout not implemented yet!"
    )

    return parser.parse_args(argv)
68
+
69
def main():
    """CLI entry point: load the requested data range and save it as a .npy file."""
    args = parse_arguments()
    if args.verbosity: print("Args:", args)

    # Resolve the output file name.
    if args.output == "default":
        fname = args.start.strftime("%Y%m%dT%H%M%S") + ".npy"
    elif args.output == "stdout":
        raise RuntimeError("Not implemented yet")
    else:
        fname = args.output  # BUGFIX: was the undefined name `output` (NameError)

    print("Load...")
    start = time()
    loaderinstance = loader(args.root_path, args.device, num_worker_threads=4)
    data = loaderinstance.load_array(args.start, args.end, args.time_step,
                                     args.channel_start, args.channel_end, args.channel_step)
    if args.verbosity:
        end = time()
        print("Duration", end-start)
        # NOTE(review): the `2.0 * 1000` factor looks like "2 bytes per sample",
        # but the loader allocates float32 (4 bytes) — confirm the intended unit.
        print("Data:", NP.array(data.shape).prod() * 2.0 * 1000 / 1.0e6, "mb")
        print("Rate:", NP.array(data.shape).prod() * 2.0 * 1000 / 1.0e6 / (end-start), "mb/s")
    print("Saving...", fname)
    NP.save(fname, data)


if __name__ == "__main__":
    main()
99
+
das2numpy/chunk.py ADDED
@@ -0,0 +1,239 @@
1
+ """
2
+ See docstring of class Chunk.
3
+
4
+ Benchmark Optasense (measurements in seconds):
5
+ TIME for loading one whole file using h5py: 12.864407300949097
6
+ TIME for loading the first 1000 sensors from 10 files: 6.066787958145142
7
+ TIME for loading with sensor_step=10 from 10 files: 23.70387291908264
8
+ TIME for loading 100 sensors from 100 files 8.697869777679443
9
+ TIME for loading 1000 sensors from 100 files 92.85049629211426
10
+ TIME for loading 40 files completely 278.97754430770874
11
+ """
12
+ from typing import Callable
13
+ from math import floor
14
+ from datetime import datetime
15
+ from random import shuffle
16
+ from multipledispatch import dispatch
17
+ import concurrent.futures as CF
18
+ from concurrent.futures import ThreadPoolExecutor
19
+ from multiprocessing import Pool
20
+ import numpy as NP
21
+ from .filefinder import FileFinder, to_posix_timestamp_ms
22
+
23
+
24
+ SHUFFLE_TASKS = False
25
+
26
+ def _predict_size(start: int, end: int, step: int) -> int:
27
+ diff = end - start
28
+ return int(((diff-1) - (diff-1)%step) / step + 1)
29
+
30
+
31
+
32
+
33
class Chunk():
    """
    Class for efficient loading and storing data.

    After the data is loaded, using one of the load...(...) methods,
    the data and the meta information can be accessed directly by accessing
    the following fields: data, timestamps, geo_positions, channel.
    The ``data`` field is a 2d float32 numpy array: axis 0 is time,
    axis 1 is the channel.
    TODO implement geo_positions, channel, timestamps
    author: ingrabarbosa, Erik Genthe
    """


    def __init__(self,
            file_finder:FileFinder,
            file_channel_amount:int,
            file_time_sample_amount:int,
            multithreaded:bool,
            workers:int,
            workerprocess:bool,
            loading_function:Callable[[str, int, int, int, int, int, int], NP.ndarray]
            ):
        """
        Args:
            file_finder: Resolves (start_timestamp_ms, file_path) pairs for a time range.
            file_channel_amount: Number of channels stored in every data file.
            file_time_sample_amount: Number of time samples stored in every data file.
                NOTE(review): the code adds this value directly to ms timestamps,
                i.e. it assumes one sample per millisecond — confirm.
            multithreaded: If True, files are loaded concurrently via a thread pool.
            workers: Size of the thread pool (only used when multithreaded is True).
            workerprocess: If True, every file is read in a separate worker process
                (see comment in __load_from_file_into_data).
            loading_function: Callable(file_path, t_start, t_end, t_step,
                channel_start, channel_end, channel_step) -> ndarray that reads
                one slice out of a single file.
        """
        self.__file_finder = file_finder
        self.__file_channel_amount = file_channel_amount
        self.__file_time_sample_amount = file_time_sample_amount
        self.__multithreaded = multithreaded
        self.__workerprocess = workerprocess
        self.__loading_function = loading_function
        if multithreaded:
            # Pool is created once and reused across load_array calls.
            self.__executor = ThreadPoolExecutor(workers)
        if not self.__multithreaded:
            print("Warning: Chunk is not in multiprocessing or multithreading mode!")



    def __load_from_file_into_data(self,
            start_timestamp:int,
            file_path:str,
            t_start:int,
            t_end:int,
            t_step:int,
            channel_start:int,
            channel_end:int,
            channel_step:int
            ) -> None:
        """Load the requested slice of one file and write it into self.data.

        Args:
            start_timestamp: POSIX timestamp (ms) of the first sample in the file.
            file_path: Path of the file to read.
            t_start, t_end, t_step: Requested global time range/stride (ms).
            channel_start, channel_end, channel_step: Requested channel range/stride.
        """
        #print("Args: ", start_timestamp, file_path, t_start, t_end, t_step, channel_start, channel_end, channel_step)
        # Check if the whole file shall be loaded. Especially the first and last file could be cut...
        rel_t_start = 0
        rel_t_end = self.__file_time_sample_amount
        if t_start > start_timestamp:
            # Requested range starts inside this file: skip the leading part.
            rel_t_start = t_start - start_timestamp
        if t_end < start_timestamp + self.__file_time_sample_amount: #TODO magicnumber
            # Requested range ends inside this file: cut the trailing part.
            rel_t_end = t_end - start_timestamp
        if rel_t_start == rel_t_end:
            return # Do nothing
        #print("relative start, relative end", rel_t_start, rel_t_end)
        if start_timestamp + self.__file_time_sample_amount <= t_start:
            # The file ends before the requested range even begins — a gap in
            # the recording. The destination rows stay zero-filled.
            print("Warning: File does not contain any parts of the requested data.",
                "This can happen if there are leaks in the data. The corresponding output will be left filled with zeros.\n",
                f" Requested range (Posixtimestamps in ms): [{t_start}, {t_end}[\n",
                f" Filepath: {file_path}.")
            return
        assert rel_t_end > rel_t_start, f"rel_t_start={rel_t_start}, rel_t_end={rel_t_end}."


        # Load h5-data using a different process... There is no other way to make h5py work parallel :(
        data = None
        if self.__workerprocess:
            # One short-lived process per file read.
            pool = Pool(1)
            result = pool.apply_async(self.__loading_function,
                (file_path, rel_t_start, rel_t_end, t_step, channel_start, channel_end, channel_step))
            pool.close()
            result = result.get() # Blocks!
            data = result
        else:
            data = self.__loading_function(file_path, rel_t_start, rel_t_end, t_step, channel_start, channel_end, channel_step)

        # Store loaded data part into all_data.
        # start_index = destination row (in self.data) of this file's first loaded sample.
        start_index = floor((start_timestamp - t_start) / t_step)
        #print(start_index)
        if start_index < 0:
            start_index = 0
        #print("Shape: ", data.shape)
        self.data[start_index : start_index + data.shape[0],:] = data[:,:]

    @dispatch(int, int, int, int, int, int)
    def load_array_posix_ms(self, t_start: int, t_end: int, t_step: int, channel_start: int, channel_end: int, channel_step: int) -> NP.ndarray:
        """ Loading data
        Warning: using a different value than 1 for t_step or channel_step can result in a high cpu-usage.
        Consider using multithreaded=True in the constructor and a high amount of workers if needed.
        Constraints:
            t_start has to be less or equal t_end,
            same for channel_start and channel_end.
            t_step and channel_step have to be greater than 0
        Args:
            t_start (int): A posix timestamp in ms which defines the start of the data to load.
            t_end (int): A posix timestamp in ms which defines the end of the data to load (exclusive).
            t_step (int): If you, for example, only want to load the data of every fourth timestep, use t_step=4.
            channel_start (int): The starting index of sensor in the data (inclusive).
            channel_end (int): The ending index of sensors in the data (exclusive).
                A value of -1 selects all channels up to file_channel_amount.
            channel_step (int): Like t_step, but for the sensor position.
        Returns:
            Data as a 2d numpy array (also kept in self.data);
            rows left uncovered by the files remain zero.
        """

        assert channel_start >= 0
        assert channel_start <= self.__file_channel_amount
        if channel_end == -1:
            channel_end = self.__file_channel_amount
        assert channel_end >= channel_start
        assert channel_end <= self.__file_channel_amount, "channel_end has to be less or equal than self.__file_channel_amount"
        assert t_step > 0
        assert channel_step > 0

        file_pathes = self.__file_finder.get_range_posix(t_start, t_end)
        print(f"Loading data from {len(file_pathes)} files.")
        print("file_pathes", file_pathes)
        # Pre-allocate the full output; each file writes its slice into it.
        data_shape = (
            _predict_size(t_start, t_end, t_step),
            _predict_size(channel_start, channel_end, channel_step)
        )
        self.data = NP.zeros(shape=data_shape, dtype=NP.float32)
        if self.__multithreaded:
            futures = []
            if SHUFFLE_TASKS:
                shuffle(file_pathes)
            for start_timestamp, file_path in file_pathes:
                futures.append(
                    self.__executor.submit(
                        self.__load_from_file_into_data,
                        start_timestamp,
                        file_path,
                        t_start,
                        t_end,
                        t_step,
                        channel_start,
                        channel_end,
                        channel_step
                    )
                )

            for future in CF.as_completed(futures):
                future.result() # Raises possible exceptions

        else:
            for start_timestamp, file_path in file_pathes:
                self.__load_from_file_into_data(
                    start_timestamp,
                    file_path,
                    t_start,
                    t_end,
                    t_step,
                    channel_start,
                    channel_end,
                    channel_step)

        return self.data




    @dispatch(datetime, datetime, int, int)
    def load_array(self, t_start:datetime, t_end:datetime, channel_start:int, channel_end:int) -> NP.ndarray:
        """ Loads data and returns it as a numpy array.
        Convenience overload with t_step=1 and channel_step=1.
        Constraints:
            t_start has to be less or equal t_end,
            same for channel_start and channel_end.
        Args:
            t_start (datetime): datetime object which defines the start of the data to load.
            t_end (datetime): datetime object which defines the end of the data to load.
            channel_start (int): The starting index of sensor in the data (inclusive).
            channel_end (int): The ending index of sensors in the data (exclusive).
        Returns:
            A 2d-numpy-array containing the data.
            The first axis corresponds to the time, the second to the channel
        """
        return self.load_array(t_start, t_end, 1, channel_start, channel_end, 1)


    @dispatch(datetime, datetime, int, int, int, int)
    def load_array(self, t_start:datetime, t_end:datetime, t_step:int, channel_start:int, channel_end:int, channel_step:int) -> NP.ndarray:
        """ Loads data into a numpy array and returns it.
        The array can also be accessed afterwards via the data field of this instance.
        Warning: using a different value than 1 for t_step or channel_step can result in a high cpu-usage.
        Consider using multithreaded=True in the constructor and a high amount of workers if needed.
        Constraints:
            t_start has to be less or equal t_end,
            same for channel_start and channel_end.
            t_step and channel_step have to be greater than 0
        Args:
            t_start (datetime): datetime object which defines the start of the data to load.
            t_end (datetime): datetime object which defines the end of the data to load.
            t_step (int): If you, for example, only want to load the data of every fourth timestep, use t_step=4.
            channel_start (int): The starting index of sensor in the data (inclusive).
            channel_end (int): The ending index of sensors in the data (exclusive).
            channel_step (int): Like t_step, but for the sensor position.
        Returns:
            A 2d-numpy-array containing the data.
            The first axis corresponds to the time, the second to the channel
        """
        return self.load_array_posix_ms(to_posix_timestamp_ms(t_start), to_posix_timestamp_ms(t_end), t_step, channel_start, channel_end, channel_step)


    @dispatch(int, int, int, int)
    def load_array_posix_ms(self, t_start:int, t_end:int, channel_start:int, channel_end:int) -> NP.ndarray:
        # Convenience overload with t_step=1 and channel_step=1.
        return self.load_array_posix_ms(t_start, t_end, 1, channel_start, channel_end, 1)
238
+
239
+
@@ -0,0 +1,115 @@
1
+ """ See class docstring FileFinder """
2
+
3
+
4
+ import pickle as PICKLE
5
+ import os as OS
6
+ import datetime as DT
7
+ from typing import Callable
8
+ from time import time # For debug
9
+
10
+ USE_CACHE_FILE = False
11
+
12
+ def to_posix_timestamp_ms(timestamp:DT.datetime) -> int:
13
+ """
14
+ Takes a datetime-object and returns the posix timestamp in milliseconds.
15
+ """
16
+ return int(timestamp.timestamp()*1000)
17
+
18
+ instance_counter = 0 # Caution: This is a CLASS-Variable.
19
+
20
+ class FileFinder():
21
+ """
22
+ Class for finding the required files for given time-ranges.
23
+ @author: Erik Genthe
24
+ @since: 04.01.2022
25
+ """
26
+
27
+ # Time complexities.
28
+ # Source: https://wiki.python.org/moin/TimeComplexity
29
+ # list append() -> O(1)
30
+ # list len() -> O(1)
31
+ # list get() -> O(1)
32
+
33
+ def __init__(self, root_path:str, file_suffix:str, filename_to_posixtimestamp:Callable[[str], int]):
34
+ global instance_counter
35
+ self.instance_number = instance_counter
36
+ instance_counter += 1
37
+ self.__root_path = root_path
38
+ self.__file_pathes = []
39
+ self.__cache_path = OS.path.dirname(__file__) + "/pathes_cache" + str(self.instance_number)
40
+
41
+ if USE_CACHE_FILE and OS.path.exists(self.__cache_path):
42
+ f = open(self.__cache_path, 'rb')
43
+ self.__file_pathes = PICKLE.load(f)
44
+ f.close()
45
+ else:
46
+ time_start = time()
47
+ for pathlist in OS.walk(root_path):
48
+ for file_name in pathlist[2]:
49
+ if file_name.endswith(file_suffix):
50
+ posix_timestamp_ms = filename_to_posixtimestamp(file_name)
51
+ path = OS.path.join(pathlist[0], file_name)
52
+ self.__file_pathes.append((posix_timestamp_ms, path))
53
+ self.__file_pathes.sort()
54
+ time_end = time()
55
+ print(f"Filefinder: Time used for creating file list: {time_end-time_start} seconds for {len(self.__file_pathes)} files.")
56
+ if USE_CACHE_FILE:
57
+ f = open(self.__cache_path, 'wb')
58
+ PICKLE.dump(self.__file_pathes, f)
59
+ f.close()
60
+
61
+ if self.__file_pathes == []:
62
+ raise RuntimeError(f"Error: No {file_suffix} files found in {root_path} and its subdirectories.")
63
+
64
+
65
+ def __find_nearest_before(self, posix_timestamp_ms: int) -> tuple:
66
+ """Method __find_neares_before(self, posix_timestamp_ms)
67
+ Time complexity: O(n) (n := number of files)
68
+ TODO reduce to O(log(n)). This can be easily done.
69
+
70
+ Args:
71
+ posix_timestamp_ms (int): The posix timestamp in milliseconds to base the search on.
72
+ Returns:
73
+ tuple: A triple (internal_index, posix timestamp in millis of the file start, file path)
74
+ None: If the given time was before any recording was done.
75
+ """
76
+ for i in range(len(self.__file_pathes)-1, 0, -1): # Iterate reverse
77
+ key, value = self.__file_pathes[i]
78
+ if key < posix_timestamp_ms:
79
+ return (i, key, value)
80
+ return None
81
+
82
+
83
+ def get_range(self, t_start:DT.datetime, t_end:DT.datetime) -> list:
84
+ """
85
+ See method get_range_posix.
86
+ """
87
+ return self.get_range_posix(to_posix_timestamp_ms(t_start), to_posix_timestamp_ms(t_end))
88
+
89
+
90
+ def get_range_posix(self, t_start:int, t_end:int) -> list:
91
+ """Gets the files that contain the data for a given time-range.
92
+ Args:
93
+ t_start (int): Starting time of the requested range
94
+ t_end (int): Ending time of the requested range
95
+ Returns:
96
+ tuple: A list containing tuples.
97
+ First element of each tuple is the posix timestamp in ms of the start of the file,
98
+ Second element of each tuple is the path of the file.
99
+ """
100
+ assert isinstance(t_start, int)
101
+ assert isinstance(t_end, int)
102
+ assert t_start <= t_end, f"t_start={t_start} is supposed to be less or equal t_end={t_end}"
103
+ first = self.__find_nearest_before(t_start)
104
+ last = self.__find_nearest_before(t_end)
105
+ if first is None:
106
+ first = (0,)
107
+ if last is None:
108
+ return []
109
+ return self.__file_pathes[ first[0] : last[0] + 1 ]
110
+
111
+
112
+ def get_first(self) -> tuple:
113
+ if len(self.__file_pathes) == 0:
114
+ raise Exception(f"No data files found in root directory: {self.__root_path}")
115
+ return self.__file_pathes[0]