das2numpy 0.0.4__tar.gz → 1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {das2numpy-0.0.4/src/das2numpy.egg-info → das2numpy-1.0}/PKG-INFO +2 -1
  2. {das2numpy-0.0.4 → das2numpy-1.0}/README.md +1 -0
  3. {das2numpy-0.0.4 → das2numpy-1.0}/pyproject.toml +1 -1
  4. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/__init__.py +7 -6
  5. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/chunk.py +49 -48
  6. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/filefinder.py +1 -1
  7. das2numpy-1.0/src/das2numpy/setups/flac_200hz.py +110 -0
  8. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/setups/silixa.py +40 -26
  9. das2numpy-1.0/src/das2numpy/setups/silixa_200hz.py +105 -0
  10. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/utils.py +68 -7
  11. {das2numpy-0.0.4 → das2numpy-1.0/src/das2numpy.egg-info}/PKG-INFO +2 -1
  12. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy.egg-info/SOURCES.txt +4 -1
  13. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy.egg-info/top_level.txt +1 -0
  14. das2numpy-1.0/src/test_downsampled.py +54 -0
  15. {das2numpy-0.0.4 → das2numpy-1.0}/LICENSE +0 -0
  16. {das2numpy-0.0.4 → das2numpy-1.0}/setup.cfg +0 -0
  17. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/__main__.py +0 -0
  18. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/setups/light_tdms_reader.py +0 -0
  19. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/setups/optasense_b35idefix.py +0 -0
  20. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/setups/optasense_b35idefix_fast.py +0 -0
  21. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/test.py +0 -0
  22. {das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy.egg-info/dependency_links.txt +0 -0
  23. {das2numpy-0.0.4 → das2numpy-1.0}/src/example.py +0 -0
{das2numpy-0.0.4/src/das2numpy.egg-info → das2numpy-1.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: das2numpy
-Version: 0.0.4
+Version: 1.0
 Summary: A simple and universal package for loading large amounts of distributed acoustic sensing (DAS) data.
 Author-email: Erik Genthe <erik.genthe@desy.de>
 Project-URL: Homepage, https://git.physnet.uni-hamburg.de/wave/das2numpy
@@ -49,6 +49,7 @@ Returns:
 ```
 
 
+
 #### More detailed interface
 ```python
 def load_array(t_start:datetime, t_end:datetime, t_step:int, channel_start:int, channel_end:int, channel_step:int) -> NP.ndarray:

{das2numpy-0.0.4 → das2numpy-1.0}/README.md

@@ -35,6 +35,7 @@ Returns:
 ```
 
 
+
 #### More detailed interface
 ```python
 def load_array(t_start:datetime, t_end:datetime, t_step:int, channel_start:int, channel_end:int, channel_step:int) -> NP.ndarray:

{das2numpy-0.0.4 → das2numpy-1.0}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "das2numpy"
-version = "0.0.4"
+version = "1.0"
 authors = [
   { name="Erik Genthe", email="erik.genthe@desy.de" },
 ]

{das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/__init__.py

@@ -4,16 +4,20 @@
 
 import os as OS
 import numpy as NP
-from multipledispatch import dispatch
+from . import utils
 
 
-
-#@dispatch(str, str, int)
 def loader(root_path:str, predefined_setup:str, num_worker_threads):
 
     if predefined_setup.upper() == "SILIXA":
         from .setups import silixa
         chunk = silixa.init(root_path, num_worker_threads)
+    elif predefined_setup.upper() == "SILIXA_200HZ":
+        from .setups import silixa_200hz
+        chunk = silixa_200hz.init(root_path, num_worker_threads)
+    elif predefined_setup.upper() == "FLAC_200HZ":
+        from .setups import flac_200hz
+        chunk = flac_200hz.init(root_path, num_worker_threads)
     elif predefined_setup.upper() == "OPTASENSE":
         from .setups import optasense_b35idefix
         chunk = optasense_b35idefix.init()
@@ -23,6 +27,3 @@ def loader(root_path:str, predefined_setup:str, num_worker_threads):
     return chunk
 
 
-#@dispatch(str, str, int)
-#def loader(self, root_path:str, predefined_setup:str, num_worker_threads):
-#    return chunk
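
Note: a minimal usage sketch of the setup names added in 1.0 (the root path below is a placeholder; the 4-argument `load_array` call with datetimes mirrors `src/test_downsampled.py`):

```python
from datetime import datetime
import das2numpy

# "SILIXA_200HZ" and "FLAC_200HZ" are the new predefined_setup values.
chunk = das2numpy.loader("/path/to/downsampled/data", "FLAC_200HZ", 1)
data = chunk.load_array(
    datetime(2025, 10, 14, 2, 58, 59),  # t_start
    datetime(2025, 10, 14, 2, 59, 1),   # t_end
    1000,                               # channel_start
    3000,                               # channel_end
)
```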

{das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/chunk.py

@@ -16,6 +16,7 @@ from random import shuffle
 from multipledispatch import dispatch
 import concurrent.futures as CF
 from concurrent.futures import ThreadPoolExecutor
+from threading import Lock
 from multiprocessing import Pool
 import numpy as NP
 from .filefinder import FileFinder, to_posix_timestamp_ms
@@ -23,7 +24,7 @@ from .filefinder import FileFinder, to_posix_timestamp_ms
 
 SHUFFLE_TASKS = False
 
-def _predict_size(start: int, end: int, step: int) -> int:
+def _calc_size(start: int, end: int, step: int) -> int:
     diff = end - start
     return int(((diff-1) - (diff-1)%step) / step + 1)
 
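
Note: the renamed helper computes how many samples a strided slice yields, i.e. ceil((end - start) / step); a quick check of the formula:

```python
def _calc_size(start: int, end: int, step: int) -> int:
    diff = end - start
    return int(((diff - 1) - (diff - 1) % step) / step + 1)

assert _calc_size(0, 10, 3) == 4   # indices 0, 3, 6, 9
assert _calc_size(0, 10, 1) == 10  # every sample
assert _calc_size(5, 6, 2) == 1    # a single sample
```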
@@ -37,25 +38,25 @@ class Chunk():
     the data and the meta information can be accessed directly by accessing the following fields:
     data, timestamps, geo_positions, channel.
     TODO implement geo_positions, channel, timestamps
-    author: ingrabarbosa, Erik genthe
+    author: Erik genthe
     """
 
 
     def __init__(self,
-            file_finder:FileFinder,
-            file_channel_amount:int,
-            file_time_sample_amount:int,
+            file_finder:FileFinder,
+            sample_rate,
             multithreaded:bool,
             workers:int,
             workerprocess:bool,
             loading_function:Callable[[str, int, int, int, int, int, int], NP.ndarray]
             ):
         self.__file_finder = file_finder
-        self.__file_channel_amount = file_channel_amount
-        self.__file_time_sample_amount = file_time_sample_amount
+        self.__sample_rate = sample_rate
         self.__multithreaded = multithreaded
         self.__workerprocess = workerprocess
         self.__loading_function = loading_function
+        self.__lock = Lock()
+        assert type(sample_rate) == int
         if multithreaded:
             self.__executor = ThreadPoolExecutor(workers)
         if not self.__multithreaded:
@@ -64,7 +65,7 @@ class Chunk():
 
 
     def __load_from_file_into_data(self,
-            start_timestamp:int,
+            file_timestamp:int, # The timestamp retrieved from the filename
             file_path:str,
             t_start:int,
             t_end:int,
@@ -73,50 +74,44 @@ class Chunk():
             channel_end:int,
             channel_step:int
             ) -> None:
-        #print("Args: ", start_timestamp, file_path, t_start, t_end, t_step, channel_start, channel_end, channel_step)
+        #print("Args: ", file_timestamp, file_path, t_start, t_end, t_step, channel_start, channel_end, channel_step)
         # Check if the whole file shall be loaded. Especially the first and last file could be cut...
         print("das2numpy: Loading from", file_path)
-        rel_t_start = 0
-        rel_t_end = self.__file_time_sample_amount
-        if t_start > start_timestamp:
-            rel_t_start = t_start - start_timestamp
-        if t_end < start_timestamp + self.__file_time_sample_amount: #TODO magicnumber
-            rel_t_end = t_end - start_timestamp
-        if rel_t_start == rel_t_end:
-            return # Do nothing
-        #print("relative start, relative end", rel_t_start, rel_t_end)
-        if start_timestamp + self.__file_time_sample_amount <= t_start:
-            print("Warning: File does not contain any parts of the requested data.",
-                "This can happen if there are leaks in the data. The corresponding output will be left filled with zeros.\n",
-                f"    Requested range (Posixtimestamps in ms): [{t_start}, {t_end}[\n",
-                f"    Filepath: {file_path}.")
-            return
-        assert rel_t_end > rel_t_start, f"rel_t_start={rel_t_start}, rel_t_end={rel_t_end}."
-
+
 
         # Load h5-data using a different process... There is no other way to make h5py work parallel :(
         data = None
         if self.__workerprocess:
             pool = Pool(1)
             result = pool.apply_async(self.__loading_function,
-                (file_path, rel_t_start, rel_t_end, t_step, channel_start, channel_end, channel_step))
+                (file_path, file_timestamp, t_start, t_end, t_step, channel_start, channel_end, channel_step))
             pool.close()
-            result = result.get() # Blocks!
-            data = result
+            data = result.get() # Blocks!
         else:
-            data = self.__loading_function(file_path, rel_t_start, rel_t_end, t_step, channel_start, channel_end, channel_step)
+            data = self.__loading_function(file_path, file_timestamp, t_start, t_end, t_step, channel_start, channel_end, channel_step)
 
         # Store loaded data part into all_data
-        start_index = floor((start_timestamp - t_start) / t_step)
+        start_index = int((file_timestamp - t_start) * self.__sample_rate / 1000 / t_step)
         #print(start_index)
         if start_index < 0:
             start_index = 0
         #print("Shape: ", data.shape)
 
+        if data.shape[1] != self.data.shape[1]:
+            print(f"Amount of channels detected in file {file_path} is {data.shape[1]}. The buffer has {self.data.shape[1]}")
+
+        if channel_end == -1:
+            with self.__lock:
+                # If number of channels increased, reallocate the target array.
+                if self.data.shape[1] < data.shape[1]:
+                    print("Reallocating buffer")
+                    old = self.data
+                    self.data = NP.zeros((self.data.shape[0], data.shape[1]), dtype=data.dtype)
+                    self.data[:, :old.shape[1]] = old[:,:]
+                    del old
+
         # To make this a little bit tolerant to a changing amount of channels per file, also the number of channels is given!
         n_channels = min(data.shape[1], self.data.shape[1])
-        if data.shape[1] != self.data.shape[1]:
-            print(f"Warning: Incosistend amount of channels detected in file {file_path}. Expected={self.data.shape[1]}, file={data.shape[1]}. Cropping to fit.")
         self.data[start_index : start_index + data.shape[0], 0:n_channels] = data[:,:n_channels]
 
     @dispatch(int, int, int, int, int, int)
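
Note: the new `start_index` formula converts the millisecond offset between the file's start and the requested start into a sample index via the chunk's sample rate; a quick check of the arithmetic with illustrative values:

```python
sample_rate = 200                  # Hz, as in the new 200 Hz setups
t_start = 1_700_000_000_000        # requested start, posix ms (illustrative)
file_timestamp = t_start + 30_000  # this file begins 30 s after the request
t_step = 1

start_index = int((file_timestamp - t_start) * sample_rate / 1000 / t_step)
assert start_index == 6000  # 30 s * 200 Hz
```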
@@ -140,31 +135,34 @@ class Chunk():
         """
 
         assert channel_start >= 0
-        assert channel_start <= self.__file_channel_amount
-        if channel_end == -1:
-            channel_end = self.__file_channel_amount
-        assert channel_end >= channel_start
-        assert channel_end <= self.__file_channel_amount, "channel_end has to be less or equal than self.__file_channel_amount"
+        #assert channel_start <= self.__file_channel_amount
+        #if channel_end == -1: channel_end = self.__file_channel_amount
+        assert channel_end == -1 or channel_end > 0
+        if channel_end != -1:
+            assert channel_end >= channel_start
+            #assert channel_end <= self.__file_channel_amount, "channel_end has to be less or equal than self.__file_channel_amount"
         assert t_step > 0
         assert channel_step > 0
 
         file_pathes = self.__file_finder.get_range_posix(t_start, t_end)
         print(f"Loading data from {len(file_pathes)} files.")
         #print("file_pathes", file_pathes)
-        data_shape = (
-            _predict_size(t_start, t_end, t_step),
-            _predict_size(channel_start, channel_end, channel_step)
-        )
+        data_shape = [
+            _calc_size(t_start * self.__sample_rate / 1000, t_end * self.__sample_rate / 1000, t_step),
+            _calc_size(channel_start, channel_end, channel_step)
+        ]
+        if channel_end == -1:
+            data_shape[1] = 1
         self.data = NP.zeros(shape=data_shape, dtype=NP.float32)
         if self.__multithreaded:
             futures = []
             if SHUFFLE_TASKS:
                 shuffle(file_pathes)
-            for start_timestamp, file_path in file_pathes:
+            for file_timestamp, file_path in file_pathes:
                 futures.append(
                     self.__executor.submit(
                         self.__load_from_file_into_data,
-                        start_timestamp,
+                        file_timestamp,
                         file_path,
                         t_start,
                         t_end,
@@ -179,9 +177,9 @@ class Chunk():
                 future.result() # Raises possible exceptions
 
         else:
-            for start_timestamp, file_path in file_pathes:
+            for file_timestamp, file_path in file_pathes:
                 self.__load_from_file_into_data(
-                    start_timestamp,
+                    file_timestamp,
                     file_path,
                     t_start,
                     t_end,
@@ -189,8 +187,11 @@ class Chunk():
                     channel_start,
                     channel_end,
                     channel_step)
-
-        return self.data
+
+        # The following is weird, but it solves issues with garbage collection. Otherwise this behaves like a memory leak.
+        data = self.data
+        del self.data
+        return data
 
 
 

{das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/filefinder.py

@@ -73,7 +73,7 @@ class FileFinder():
             tuple: A triple (internal_index, posix timestamp in millis of the file start, file path)
             None: If the given time was before any recording was done.
         """
-        for i in range(len(self.__file_pathes)-1, 0, -1): # Iterate reverse
+        for i in range(len(self.__file_pathes)-1, -1, -1): # Iterate reverse
            key, value = self.__file_pathes[i]
            if key < posix_timestamp_ms:
                return (i, key, value)
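
Note: this fixes an off-by-one in the reverse scan; `range()` excludes its stop value, so with a stop of 0 the first (oldest) file at index 0 was never examined:

```python
# Old bound: index 0 is skipped.
assert list(range(3 - 1, 0, -1)) == [2, 1]
# New bound: the scan reaches index 0 as well.
assert list(range(3 - 1, -1, -1)) == [2, 1, 0]
```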

das2numpy-1.0/src/das2numpy/setups/flac_200hz.py (new file)

@@ -0,0 +1,110 @@
+""" Universal setup file for silixa, that detects sampling rate and number of channels by itself.
+    The root directory shall be supplied by the user via an argument
+"""
+
+import sys as SYS
+import ast as AST
+from os import path as P
+import datetime as DT
+import numpy as NP
+import ffmpeg as FFMPEG
+from ..filefinder import FileFinder, to_posix_timestamp_ms
+from ..chunk import Chunk
+from ..utils import bin
+
+CALIBRATE = True
+
+
+
+sample_rate = 200
+
+
+def init(root_path, num_worker_threads):
+    assert P.isdir(root_path)
+    file_finder = FileFinder(root_path, ".flac", filename_to_posix_timestamp)
+    assert num_worker_threads >= 1
+    multithreaded = num_worker_threads > 1
+    return Chunk(
+        file_finder,
+        sample_rate,
+        multithreaded,
+        num_worker_threads,
+        False,
+        load_file
+    )
+
+
+def filename_to_posix_timestamp(file_name:str) -> int:
+    timestamp_str = file_name.split(".flac")[0]
+    timestamp_dt = DT.datetime.strptime(timestamp_str, "%Y%m%dT%H%M%S")
+    timestamp_ms = to_posix_timestamp_ms(timestamp_dt)
+    return timestamp_ms
+
+
+
+def load_file(file_path, file_timestamp, t_start, t_end, t_step, channel_start, channel_end, channel_step) -> NP.ndarray:
+    """ Loads a single file, trims it, and returns the trimmed data as a numpy array. Downsampling (t_step, channel_step) is also possible!
+    """
+
+    try:
+        probe = FFMPEG.probe(file_path, v='error', select_streams='a:0', show_entries='stream=channels,sample_rate', of='json')
+        shape = AST.literal_eval(probe['format']['tags']['shape'])
+        calibration_factor = float(probe['format']['tags']['calibration_factor'])
+        info = probe['streams'][0]
+
+
+        idx_start = 0
+        if t_start > file_timestamp: # Check if beginning should be trimmed.
+            rel_t_start = t_start - file_timestamp
+            idx_start = int(rel_t_start * sample_rate / 1000.0)
+        idx_end = shape[0]
+        if t_end < file_timestamp + (shape[0] * 1000 / sample_rate): # Check if end should be trimmed
+            rel_t_end = t_end - file_timestamp
+            idx_end = int(rel_t_end * sample_rate / 1000.0)
+        if idx_start == idx_end:
+            return NP.zeros(shape=[0, 0]) # No data should be loaded. Do nothing
+        if file_timestamp + (shape[0] * 1000 / sample_rate) <= t_start:
+            print("Warning: File does not contain any parts of the requested data.",
+                "This can happen if there are leaks in the data or if there are no files for the requested time in the given directory.",
+                "The corresponding output will be left filled with zeros.\n",
+                f"    Requested range (Posixtimestamps in ms): [{t_start}, {t_end}[\n",
+                f"    Filepath: {file_path}.")
+            return NP.zeros(shape=[0, 0])
+        assert idx_end == shape[0] or idx_end > idx_start, f"idx_start={idx_start}, idx_end={idx_end}."
+        assert idx_start < idx_end
+
+        out, err = (
+            FFMPEG
+            .input(file_path)
+            .filter('atrim', start_sample=idx_start*shape[1], end_sample=idx_end*shape[1])
+            .output('pipe:', format='s16le', acodec='pcm_s16le', ac=int(info['channels']), ar=int(info['sample_rate']))
+            .run(capture_stdout=True, capture_stderr=True)
+        )
+        data = NP.frombuffer(out, dtype=NP.int16)
+        data = data.reshape([-1, shape[1]])
+    except FFMPEG.Error as e:
+        raise Exception(e.stderr.decode("utf-8"))
+    except FileNotFoundError as e:
+        raise Exception("ffmpeg not found. Please install ffmpeg. "
+            + "If you are working on the DESY Maxwell cluster, "
+            + "execute 'module load maxwell ffmpeg'")
+
+    # Trim data
+    data = data[:, channel_start:channel_end]
+
+    data = data.astype(NP.float32) # This needs to happen before the binning step!
+
+    # Downsample data
+    if t_step != 1 or channel_step != 1:
+        data = bin(data, (t_step, channel_step))
+    #if t_step != 1:
+    #    data = data[::t_step]
+    #if channel_step != 1:
+    #    data = data[:, ::channel_step]
+    assert len(data) > 0
+
+    if CALIBRATE:
+        data *= calibration_factor
+
+    return data
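
Note: the FLAC setup expects file names that are bare UTC timestamps with a `.flac` suffix; a quick check of the parsing in `filename_to_posix_timestamp` (the file name is a hypothetical example):

```python
import datetime as DT

file_name = "20251014T025859.flac"
timestamp_str = file_name.split(".flac")[0]
timestamp_dt = DT.datetime.strptime(timestamp_str, "%Y%m%dT%H%M%S")
assert timestamp_dt == DT.datetime(2025, 10, 14, 2, 58, 59)
```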

{das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/setups/silixa.py

@@ -20,17 +20,12 @@ CALIBRATE = True
 def init(root_path, num_worker_threads):
     assert P.isdir(root_path)
     file_finder = FileFinder(root_path, ".tdms", filename_to_posix_timestamp)
-    example_file_path = file_finder.get_elem(10)[1] # TODO get first instead of 10th
-    tdms = TdmsReader(example_file_path)
-    shape = tdms.get_mmap().shape
-    file_time_sample_amount = shape[0]
-    channel_amount = shape[1]
     assert num_worker_threads >= 1
     multithreaded = num_worker_threads > 1
+    sample_rate = 1000
     return Chunk(
         file_finder,
-        channel_amount,
-        file_time_sample_amount,
+        sample_rate,
         multithreaded,
         num_worker_threads,
         False,
@@ -44,25 +39,44 @@ def filename_to_posix_timestamp(file_name:str) -> int:
     return timestamp_ms
 
 
-def load_file(file_path, rel_t_start, rel_t_end, t_step, channel_start, channel_end, channel_step) -> NP.ndarray:
-    #assert rel_t_end <= FILE_TIME_SAMPLE_AMOUNT
-    assert rel_t_start != rel_t_end
-    assert rel_t_start < rel_t_end
-    assert rel_t_start >= 0
-
-    tdms = TdmsReader(file_path)
-    data = tdms.get_mmap()
-    data = data[rel_t_start:rel_t_end, channel_start:channel_end]
-    if t_step != 1 or channel_step != 1:
-        data = bin(data, (t_step, channel_step))
-    #if t_step != 1:
-    #    data = data[::t_step]
-    #if channel_step != 1:
-    #    data = data[:, ::channel_step]
-    assert len(data) > 0
-
-    if CALIBRATE:
-        data = calibrate(data)
+def load_file(file_path, file_timestamp, t_start, t_end, t_step, channel_start, channel_end, channel_step) -> NP.ndarray:
+    """ Loads a single file, trims it, and returns the trimmed data as a numpy array. Downsampling (t_step, channel_step) is also possible!
+    """
+
+    with TdmsReader(file_path) as tdms:
+        data = tdms.get_mmap()
+
+
+    # Trim data
+    rel_t_start = 0
+    if t_start > file_timestamp: # Check if beginning should be trimmed.
+        rel_t_start = t_start - file_timestamp
+    rel_t_end = -1
+    if t_end < file_timestamp + data.shape[0]: # Check if end should be trimmed
+        rel_t_end = t_end - file_timestamp
+    if rel_t_start == rel_t_end:
+        return NP.zeros(shape=[0, 0]) # No data should be loaded. Do nothing
+    if file_timestamp + data.shape[0] <= t_start:
+        print("Warning: File does not contain any parts of the requested data.",
+            "This can happen if there are leaks in the data. The corresponding output will be left filled with zeros.\n",
+            f"    Requested range (Posixtimestamps in ms): [{t_start}, {t_end}[\n",
+            f"    Filepath: {file_path}.")
+        return NP.zeros(shape=[0, 0])
+    assert rel_t_end == -1 or rel_t_end > rel_t_start, f"rel_t_start={rel_t_start}, rel_t_end={rel_t_end}."
+    data = data[rel_t_start:rel_t_end, channel_start:channel_end]
+
+
+    # Downsample data
+    if t_step != 1 or channel_step != 1:
+        data = bin(data, (t_step, channel_step))
+    #if t_step != 1:
+    #    data = data[::t_step]
+    #if channel_step != 1:
+    #    data = data[:, ::channel_step]
+    assert len(data) > 0
+
+    if CALIBRATE:
+        data = calibrate(data)
 
     return data
 

das2numpy-1.0/src/das2numpy/setups/silixa_200hz.py (new file)

@@ -0,0 +1,105 @@
+""" Universal setup file for silixa, that detects sampling rate and number of channels by itself.
+    The root directory shall be supplied by the user via an argument
+"""
+
+import sys as SYS
+from os import path as P
+import datetime as DT
+import numpy as NP
+from ..filefinder import FileFinder, to_posix_timestamp_ms
+from ..chunk import Chunk
+from .light_tdms_reader import TdmsReader
+from ..utils import bin
+
+CALIBRATE = True
+
+
+
+sample_rate = 200
+
+
+def init(root_path, num_worker_threads):
+    assert P.isdir(root_path)
+    file_finder = FileFinder(root_path, ".tdms", filename_to_posix_timestamp)
+    assert num_worker_threads >= 1
+    multithreaded = num_worker_threads > 1
+    return Chunk(
+        file_finder,
+        sample_rate,
+        multithreaded,
+        num_worker_threads,
+        False,
+        load_file
+    )
+
+
+def filename_to_posix_timestamp(file_name:str) -> int:
+    timestamp_str = file_name.split("_UTC_")[1][:19]
+    timestamp_dt = DT.datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S.%f")
+    timestamp_ms = to_posix_timestamp_ms(timestamp_dt)
+    return timestamp_ms
+
+
+
+def load_file(file_path, file_timestamp, t_start, t_end, t_step, channel_start, channel_end, channel_step) -> NP.ndarray:
+    """ Loads a single file, trims it, and returns the trimmed data as a numpy array. Downsampling (t_step, channel_step) is also possible!
+    """
+
+    with TdmsReader(file_path) as tdms:
+        data = tdms.get_mmap()
+
+
+    # Trim data
+    idx_start = 0
+    if t_start > file_timestamp: # Check if beginning should be trimmed.
+        rel_t_start = t_start - file_timestamp
+        idx_start = int(rel_t_start * sample_rate / 1000.0)
+    idx_end = data.shape[0]
+    if t_end < file_timestamp + (data.shape[0] * 1000 / sample_rate): # Check if end should be trimmed
+        rel_t_end = t_end - file_timestamp
+        idx_end = int(rel_t_end * sample_rate / 1000.0)
+    if idx_start == idx_end:
+        return NP.zeros(shape=[0, 0]) # No data should be loaded. Do nothing
+    if file_timestamp + (data.shape[0] * 1000 / sample_rate) <= t_start:
+        print("Warning: File does not contain any parts of the requested data.",
+            "This can happen if there are leaks in the data. The corresponding output will be left filled with zeros.\n",
+            f"    Requested range (Posixtimestamps in ms): [{t_start}, {t_end}[\n",
+            f"    Filepath: {file_path}.")
+        return NP.zeros(shape=[0, 0])
+    assert idx_end == data.shape[0] or idx_end > idx_start, f"idx_start={idx_start}, idx_end={idx_end}."
+    print(idx_start, idx_end)
+    data = data[idx_start:idx_end, channel_start:channel_end]
+
+
+    # Downsample data
+    if t_step != 1 or channel_step != 1:
+        data = bin(data, (t_step, channel_step))
+    #if t_step != 1:
+    #    data = data[::t_step]
+    #if channel_step != 1:
+    #    data = data[:, ::channel_step]
+    assert len(data) > 0
+
+    if CALIBRATE:
+        data = calibrate(data)
+
+    return data
+
+
+def calibrate(data:NP.ndarray) -> NP.ndarray:
+    """ Convert raw data to strain rate data.
+        As the resulting values are decimals, the datatype should be float. Otherwise an assertion fails. """
+    #assert data.dtype in (NP.float, NP.float32, NP.float64), f"The data should be floating point. It is {data.dtype}"
+    if data.dtype not in (float, NP.float32, NP.float64):
+        NEW_TYPE = NP.float32
+        #print("Warning: For calibration the data has to be of type float. Converting from {data.dtype} to {NEW_TYPE}")
+        data = data.astype(NEW_TYPE)
+
+    SAMPLE_FREQ = 1000.0 # This remains 1000.0 and not 200 Hz because the original sample rate of the device is relevant here!
+    EICHLAENGE = 10.0
+    factor = 116.0 * 10.0**(-9.0) / 8192.0 * SAMPLE_FREQ / EICHLAENGE
+    return data * factor # Result: 1 / s
+
+
+
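
Note: plugging the constants from `calibrate()` together gives a fixed per-count scale factor; the arithmetic below only confirms the value implied by the code ("EICHLAENGE" is presumably the gauge length in metres):

```python
SAMPLE_FREQ = 1000.0  # native device rate, per the comment in calibrate()
EICHLAENGE = 10.0
factor = 116.0 * 10.0**(-9.0) / 8192.0 * SAMPLE_FREQ / EICHLAENGE
assert abs(factor - 1.416015625e-9) < 1e-18  # strain rate (1/s) per raw count
```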

{das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy/utils.py

@@ -7,6 +7,7 @@ import math as M
 import numpy as NP
 from numba import njit
 import scipy.signal as SS
+import scipy.stats
 
 TIME_AXIS = 0
 
@@ -69,7 +70,22 @@ def butterworth_filter(
 
 
 
-def spectrum_smoothing(frequencies:NP.ndarray, psd:NP.ndarray, n:int):
+def mean_confidence_interval(data, confidence=0.95, min_samples=10):
+    """
+    Calculates the confidence interval for a student-t distribution.
+    From https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
+    Returns: [mean, lower-confidence-limit, upper-confidence-limit]
+    """
+    n = len(data)
+    m = NP.mean(data)
+    if n < min_samples:
+        return m, None, None
+    se = scipy.stats.sem(data)
+    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
+    return m, m-h, m+h
+
+
+def spectrum_smoothing(frequencies:NP.ndarray, psd:NP.ndarray, n:int, mode="median", error_calculation=False):
     """
     Perform 1/n decade smoothing on the power spectral density (PSD) data.
     See also: https://dsp.stackexchange.com/questions/9967/1-n-octave-smoothing
@@ -81,10 +97,17 @@ def spectrum_smoothing(frequencies:NP.ndarray, psd:NP.ndarray, n:int):
         Array containing the power spectral density values corresponding to the frequencies.
     n : int
         The number of divisions per decade (e.g., n=10 for 1/10 decade smoothing).
+    mode : "mean" or "median"
+        How the data points of one bin should be reduced to one point.
+    error_calculation : False, "std", "stderr", or float
+        If False, the function returns only two arrays.
+        If "std", the third array contains the standard deviation of the original data points per frequency bin.
+        If "stderr", the third array contains the standard error of the original data points per frequency bin.
+        If a float in [0.0, 1.0], the third array contains the confidence interval for each frequency bin (EXPERIMENTAL).
 
     Returns:
-        numpy.ndarray, numpy.ndarray
-            Smoothed frequencies and PSD.
+        numpy.ndarray, numpy.ndarray, numpy.ndarray
+            Smoothed frequencies, the PSD, and the error for each bin (the third array only if error_calculation is set).
     """
     frequencies = NP.array(frequencies)
     psd = NP.array(psd)
@@ -105,6 +128,7 @@
 
     freq_new_actual = []
     psd_new = []
+    error = []
     for i in range(len(freq_new)):
         f_log = freq_new_log[i]
         f_lower = 10**(f_log - step_log / 2)
@@ -112,12 +136,49 @@
 
         # Find the indices within this log decade interval
         mask = (frequencies >= f_lower) & (frequencies < f_higher)
-        #print(freq_new[i], f_lower, f_higher, mask)
         if NP.any(mask):
             freq_new_actual.append(NP.mean(frequencies[mask]))
-            psd_new.append(NP.mean(psd[mask]))
-
-    return NP.array(freq_new_actual), NP.array(psd_new)
+            if mode == "mean":
+                mean = NP.mean(psd[mask])
+                psd_new.append(mean)
+            elif mode == "median":
+                psd_new.append(NP.median(psd[mask]))
+            else:
+                raise Exception("Mode should be 'mean' or 'median'!")
+            if error_calculation == False:
+                pass
+            elif error_calculation == "std":
+                if len(psd[mask]) <= 1:
+                    error.append(float("NaN"))
+                else:
+                    error.append(NP.std(psd[mask]))
+            elif error_calculation == "stderr":
+                if len(psd[mask]) <= 1:
+                    error.append(float("NaN"))
+                else:
+                    error.append(NP.std(psd[mask]) / NP.sqrt(len(psd[mask])))
+            elif type(error_calculation) == float:
+                confidence_level = error_calculation
+                assert confidence_level >= 0.5
+                #samples = psd[mask]
+                #n = len(samples)
+                ##h = scipy.stats.sem(psd[mask]) * scipy.stats.t.ppf((1 + confidence_level) / 2., n-1) # From https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
+                #z_low = scipy.stats.rayleigh.ppf((1 - confidence_level) / 2.0) # Rayleigh should be the correct distribution for ASD values
+                #z_high = scipy.stats.rayleigh.ppf(confidence_level / 2.0) # Rayleigh should be the correct distribution for ASD values
+                #mean_or_median = psd_new[-1]
+                #standard_error = samples.std() / NP.sqrt(n)
+                #confidence_interval = [ mean_or_median - standard_error * z_low,
+                #                        mean_or_median + standard_error * z_high]
+                #print(f"-----------> m={mean_or_median} stderr={standard_error} z_low={z_low} stderr*zlow={standard_error * z_low} z+={z_high}")
+                m, lower, upper = mean_confidence_interval(psd[mask], confidence_level)
+                error.append([lower, upper])
+            else:
+                raise Exception(f"Error calculation type {error_calculation} is invalid.")
+
+    if error_calculation:
+        return NP.array(freq_new_actual), NP.array(psd_new), NP.array(error)
+    else:
+        return NP.array(freq_new_actual), NP.array(psd_new)
 
 
 def bin(arr: NP.ndarray, bin_factors:tuple):
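
Note: a short usage sketch of the extended `spectrum_smoothing` signature, fed from `scipy.signal.welch` (the signal is synthetic, and the import path assumes the function lives in `das2numpy.utils` as above):

```python
import numpy as NP
import scipy.signal as SS
from das2numpy.utils import spectrum_smoothing

rng = NP.random.default_rng(0)
signal = rng.standard_normal(200_000)  # synthetic white noise at 1 kHz
freqs, psd = SS.welch(signal, fs=1000.0, nperseg=4096)
freqs, psd = freqs[1:], psd[1:]  # drop the DC bin before log-spaced smoothing

# 1/10-decade smoothing; "stderr" returns a third array with the per-bin error.
f_s, psd_s, err = spectrum_smoothing(freqs, psd, n=10, mode="median",
                                     error_calculation="stderr")
```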

{das2numpy-0.0.4 → das2numpy-1.0/src/das2numpy.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: das2numpy
-Version: 0.0.4
+Version: 1.0
 Summary: A simple and universal package for loading large amounts of distributed acoustic sensing (DAS) data.
 Author-email: Erik Genthe <erik.genthe@desy.de>
 Project-URL: Homepage, https://git.physnet.uni-hamburg.de/wave/das2numpy
@@ -49,6 +49,7 @@ Returns:
 ```
 
 
+
 #### More detailed interface
 ```python
 def load_array(t_start:datetime, t_end:datetime, t_step:int, channel_start:int, channel_end:int, channel_step:int) -> NP.ndarray:

{das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy.egg-info/SOURCES.txt

@@ -2,6 +2,7 @@ LICENSE
 README.md
 pyproject.toml
 src/example.py
+src/test_downsampled.py
 src/das2numpy/__init__.py
 src/das2numpy/__main__.py
 src/das2numpy/chunk.py
@@ -12,7 +13,9 @@ src/das2numpy.egg-info/PKG-INFO
 src/das2numpy.egg-info/SOURCES.txt
 src/das2numpy.egg-info/dependency_links.txt
 src/das2numpy.egg-info/top_level.txt
+src/das2numpy/setups/flac_200hz.py
 src/das2numpy/setups/light_tdms_reader.py
 src/das2numpy/setups/optasense_b35idefix.py
 src/das2numpy/setups/optasense_b35idefix_fast.py
-src/das2numpy/setups/silixa.py
+src/das2numpy/setups/silixa.py
+src/das2numpy/setups/silixa_200hz.py

{das2numpy-0.0.4 → das2numpy-1.0}/src/das2numpy.egg-info/top_level.txt

@@ -1,2 +1,3 @@
 das2numpy
 example
+test_downsampled

das2numpy-1.0/src/test_downsampled.py (new file)

@@ -0,0 +1,54 @@
+import numpy as NP
+import sys
+from datetime import datetime
+import matplotlib.pyplot as PP
+from das2numpy import loader, utils
+
+USE_DOWNSAMPLED = False
+
+print("Load data to numpy-array")
+t_start = datetime(2025, 10, 14, 2, 58, 59)
+t_end = datetime(2025, 10, 14, 2, 59, 1)
+channel_start = 1000
+channel_end = 3000
+
+if USE_DOWNSAMPLED:
+    loader = loader("/pnfs/desy.de/m/project/iDAS/work/derived-data/DOWNSAMPLED_200HZ/2025-10/", "SILIXA_200HZ", 1)
+else:
+    loader = loader("/pnfs/desy.de/m/project/iDAS/raw/2025-DESY/2025-10-14-desy", "SILIXA", 1)
+data = loader.load_array(t_start, t_end, channel_start, channel_end)
+
+print("Reduce data by binning (mean averaging)")
+if USE_DOWNSAMPLED:
+    bin_factors = (1, 1)
+    data = utils.bin(data, bin_factors) # Reduce time sampling and spatial sampling by averaging.
+    sampling_hz = 200.0 / bin_factors[0]
+else:
+    bin_factors = (5, 1)
+    data = utils.bin(data, bin_factors) # Reduce time sampling and spatial sampling by averaging.
+    sampling_hz = 1000.0 / bin_factors[0]
+channel_spacing = 1.0 * bin_factors[1]
+
+NP.save("data.npy", data)
+
+print("Create plot with pyplot")
+PP.title(f"{t_start.isoformat()}")
+PP.imshow(
+    data,
+    cmap = "seismic",
+    aspect = "auto",
+    interpolation = "nearest",
+    vmin = -1e-7,
+    vmax = +1e-7,
+    extent = (
+        channel_start, channel_start + (data.shape[1] * channel_spacing),
+        data.shape[0] / sampling_hz, 0
+    )
+)
+PP.xlabel("Position [m]")
+PP.ylabel("Time [s]")
+PP.colorbar(label="Strain-rate [$\\frac{m}{m \\cdot s}$]")
+if USE_DOWNSAMPLED:
+    PP.savefig("waterfall_downsampled.png")
+else:
+    PP.savefig("waterfall.png")