pymast 0.0.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pymast/predictors.py CHANGED
@@ -1,28 +1,119 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  """
3
- Created on Tue Nov 14 10:52:20 2023
3
+ Predictor functions for radio telemetry classification and filtering.
4
4
 
5
- @author: KNebiolo
5
+ This module provides statistical predictor functions used during Naive Bayes
6
+ classification to distinguish true fish detections from noise. Each predictor
7
+ calculates a feature that helps identify legitimate versus spurious detections.
8
+
9
+ Core Predictors
10
+ ---------------
11
+ - **noise_ratio**: Ratio of miscoded to correctly-coded detections in time window
12
+ - **series_hit**: Whether detection is in-series with previous/next detection
13
+ - **detection_history**: Maximum contiguous sequence of expected detections
14
+ - **factors**: Prime factorization for pulse rate calculations
15
+
16
+ Classification Pipeline
17
+ -----------------------
18
+ These predictors are calculated during data import and used as features in the
19
+ Naive Bayes classifier. They help identify:
20
+ 1. Miscoded tags from environmental noise
21
+ 2. Out-of-series detections from spurious signals
22
+ 3. Detection patterns inconsistent with known pulse rates
23
+
24
+ Typical Usage
25
+ -------------
26
+ >>> import pymast.predictors as predictors
27
+ >>> import numpy as np
28
+ >>>
29
+ >>> # Calculate noise ratio for detections
30
+ >>> noise = predictors.noise_ratio(
31
+ ... duration=300.0,  # seconds (5-minute window)
32
+ ... freq_codes=freq_codes_array,
33
+ ... epochs=epochs_array,
34
+ ... study_tags=['166.380 7', '166.380 12']
35
+ ... )
36
+ >>>
37
+ >>> # Check detection history
38
+ >>> det_hist, hit_ratio, cons_det, max_seq = predictors.detection_history(
+ ... epoch=epoch_array,
+ ... pulse_rate=pulse_rate_array,
+ ... num_detects=5,
+ ... num_channels=num_channels_array,
+ ... scan_time=scan_time_array
+ ... )
45
+
46
+ Notes
47
+ -----
48
+ - Predictors assume VHF pulse-coded tags (frequency + code)
49
+ - Noise ratio is binned over a caller-supplied window (duration, in seconds; e.g. 300 for 5 minutes)
50
+ - Series hit checks for mortality rate changes (active vs expired tags)
51
+ - Detection history accounts for multi-channel scan patterns
52
+
53
+ See Also
54
+ --------
55
+ naive_bayes.train : Classifier training using these predictors
56
+ parsers : Data import where predictors are first calculated
6
57
  """
58
+
59
+ import numba as nb
7
60
  import numpy as np
8
61
  import pandas as pd
62
+ pd.set_option('display.float_format', '{:.10f}'.format)
63
+ np.set_printoptions(suppress=True, precision=10)
9
64
 
10
- def noise_ratio (duration, freq_codes,epochs,study_tags):
11
-
12
- ''' function calculates the ratio of miscoded, pure noise detections, to matching frequency/code
13
- detections within the duration specified.
14
-
15
- In other words, what is the ratio of miscoded to correctly coded detections within the duration specified
16
65
 
17
- duration = moving window length in minutes
18
- data = current data file
19
- study_tags = list or list like object of study tags
20
- '''
66
+ def noise_ratio (duration, freq_codes,epochs,study_tags):
67
+ """
68
+ Calculate ratio of miscoded to total detections within moving time window.
69
+
70
+ Identifies noise by comparing miscoded detections (freq_codes not in study_tags)
71
+ to total detection count within specified duration. High noise ratio indicates
72
+ environmental interference or receiver malfunction.
73
+
74
+ Parameters
75
+ ----------
76
+ duration : float
77
+ Moving window length in seconds (e.g., 300.0 for 5 minutes)
78
+ freq_codes : array_like
79
+ Array of detected frequency-code strings (e.g., ['166.380 7', '166.380 12'])
80
+ epochs : array_like
81
+ Array of detection timestamps (seconds since 1970-01-01)
82
+ study_tags : list of str
83
+ List of valid freq_code tags deployed in study
84
+
85
+ Returns
86
+ -------
87
+ numpy.ndarray
88
+ Array of noise ratios (float32) with same length as input arrays.
89
+ Values range from 0 (no noise) to 1 (all noise).
90
+
91
+ Notes
92
+ -----
93
+ - Bins epochs into duration-sized windows
94
+ - Counts miscoded detections per bin (not in study_tags)
95
+ - Calculates ratio: miscodes / total_detections
96
+ - Ratio propagated to all detections in same bin
97
+
98
+ Examples
99
+ --------
100
+ >>> import numpy as np
101
+ >>> freq_codes = np.array(['166.380 7', '166.380 99', '166.380 7'])
102
+ >>> epochs = np.array([1000.0, 1100.0, 1150.0])
103
+ >>> study_tags = ['166.380 7', '166.380 12']
104
+ >>> predictors.noise_ratio(300.0, freq_codes, epochs, study_tags)
105
+ array([0.33333334, 0.33333334, 0.33333334], dtype=float32)
106
+
107
+ See Also
108
+ --------
109
+ naive_bayes.train : Uses noise_ratio as classification feature
110
+ """
21
111
  # identify miscodes
22
112
  miscode = np.isin(freq_codes, study_tags, invert = True)
23
113
 
24
- # bin everything into nearest 5 min time bin and count miscodes and total number of detections
25
- binned_epoch = epochs//duration
114
+ # bin everything into nearest duration-sized time bin and count miscodes and total number of detections
115
+ # Ensure epochs are integer seconds (or convertible)
116
+ binned_epoch = (epochs // duration).astype('int64')
26
117
 
27
118
  # Now identify the number of unique freq-codes within each bin
28
119
  # Create a DataFrame from the arrays
@@ -46,21 +137,81 @@ def noise_ratio (duration, freq_codes,epochs,study_tags):
46
137
  return df.noise_ratio.values.astype(np.float32)
47
138
 
48
139
def factors(n):
    """
    Return all factors of integer n.

    Used to calculate least common multiplier for pulse rate calculations.
    Helps identify valid pulse intervals when multiple tags share similar rates.

    Parameters
    ----------
    n : int
        Integer to factorize

    Returns
    -------
    list of int
        All factors of n in ascending order (including 1 and n)

    Examples
    --------
    >>> predictors.factors(12)
    [1, 2, 3, 4, 6, 12]

    Notes
    -----
    Simple brute-force factorization, not optimized for large numbers.
    See: http://stackoverflow.com/questions/16996217/prime-factorization-list
    """
    # Use a plain range() instead of np.arange(): avoids a Python-level loop
    # over numpy scalars and returns native ints rather than np.int64 objects.
    return [i for i in range(1, n + 1) if n % i == 0]
57
171
 
172
+
58
173
  def series_hit (lags, pulse_rate, mort_rate, status,):
59
- '''seriesHit is a function for returning whether or not a detection on a specific
60
- frequency/code is in series with the previous or next detection on that same
61
- frequency/code
62
- '''
174
+ """
175
+ Check if detection lag matches expected pulse rate (in-series detection).
176
+
177
+ Determines whether time difference to previous/next detection is consistent
178
+ with tag's programmed pulse rate (active) or mortality rate (expired tag).
179
+
180
+ Parameters
181
+ ----------
182
+ lags : array_like
183
+ Time differences to previous detection (seconds)
184
+ pulse_rate : array_like
185
+ Programmed pulse rate for each tag (seconds)
186
+ mort_rate : array_like
187
+ Mortality pulse rate for each tag (seconds)
188
+ status : array_like
189
+ Tag status ('A' for active, other for expired/mortality)
190
+
191
+ Returns
192
+ -------
193
+ numpy.ndarray
194
+ Binary array: 1 if detection is in-series, 0 if out-of-series
195
+
196
+ Notes
197
+ -----
198
+ - Active tags checked against pulse_rate
199
+ - Expired tags checked against mort_rate
200
+ - Uses modulo to check if lag is multiple of expected rate
201
+
202
+ Examples
203
+ --------
204
+ >>> lags = np.array([5.0, 10.0, 7.5])
205
+ >>> pulse_rate = np.array([5.0, 5.0, 5.0])
206
+ >>> mort_rate = np.array([30.0, 30.0, 30.0])
207
+ >>> status = np.array(['A', 'A', 'A'])
208
+ >>> predictors.series_hit(lags, pulse_rate, mort_rate, status)
209
+ array([1, 1, 0])
63
210
 
211
+ See Also
212
+ --------
213
+ detection_history : More comprehensive in-series detection check
214
+ """
64
215
  # determine if the lag is potentially in series with the correct pulse rate based on status
65
216
  series_hit = np.where(status == 'A',
66
217
  np.where(lags % pulse_rate == 0,
@@ -70,101 +221,136 @@ def series_hit (lags, pulse_rate, mort_rate, status,):
70
221
  1,
71
222
  0)
72
223
  )
224
+
225
+
226
def max_contiguous_sequence(arr):
    """
    Return the length of the longest run of consecutive 1's in a binary array.

    Helper function for detection_history to identify the longest stretch of
    expected in-series detections.

    Parameters
    ----------
    arr : array_like
        Binary array (0s and 1s)

    Returns
    -------
    int
        Length of longest consecutive sequence of 1's (0 if there are none)

    Examples
    --------
    >>> arr = np.array([1, 1, 0, 1, 1, 1, 0, 1])
    >>> predictors.max_contiguous_sequence(arr)
    3
    """
    # Single pass with a running counter: extend the current run on a 1,
    # reset it on a 0, and track the best run seen so far.
    longest = 0
    current = 0
    for value in arr:
        if value == 1:
            current += 1
            if current > longest:
                longest = current
        else:
            current = 0
    return longest
251
+
252
+
253
def detection_history(epoch, pulse_rate, num_detects, num_channels, scan_time):
    """
    Calculate the expected-detection history around each detection.

    Looks forward and backward num_detects positions to determine which
    neighboring detections arrived at the time expected from the tag's
    pulse rate (or, for multi-channel receivers, the channel scan cycle).

    Parameters
    ----------
    epoch : numpy.ndarray
        Detection timestamps (seconds since 1970-01-01)
    pulse_rate : numpy.ndarray
        Programmed pulse rate for each detection's tag (seconds)
    num_detects : int
        Number of detections to look forward/backward (window half-width)
    num_channels : numpy.ndarray
        Number of receiver channels for each detection
    scan_time : numpy.ndarray
        Scan duration per channel (seconds) for each detection

    Returns
    -------
    detection_history : numpy.ndarray of int32, shape (n, 2*num_detects + 1)
        1 where the shifted neighbor fell inside the expected time window
    hit_ratio : numpy.ndarray
        Row-wise fraction of expected detections actually heard
    cons_det : numpy.ndarray of int32
        1 where at least one immediately adjacent (previous/next) slot hit
    max_count : numpy.ndarray
        Longest contiguous run of hits in each row

    Notes
    -----
    - The center column (shift 0, the detection itself) is always set to 1
    - When scan_time > 2 * pulse_rate a multi-channel receiver still hears
      every burst, so pulse rate drives the expected interval; otherwise the
      full scan cycle (scan_time * num_channels) does
    - NaNs from shifting off either end of the series never satisfy the
      window comparison, so edge rows simply score 0 at those positions

    Examples
    --------
    >>> epochs = np.array([100.0, 105.0, 110.0, 115.0, 120.0])
    >>> pulse_rate = np.array([5.0] * 5)
    >>> num_channels = np.array([1] * 5)
    >>> scan_time = np.array([1.0] * 5)
    >>> _, _, _, max_count = predictors.detection_history(
    ...     epochs, pulse_rate, 2, num_channels, scan_time)
    >>> max_count
    array([3, 4, 5, 4, 3])

    See Also
    --------
    max_contiguous_sequence : Standalone helper for finding the longest run
    series_hit : Simpler in-series detection check
    """
    shifts = np.arange(-num_detects, num_detects + 1)
    m = len(shifts)

    # Shifted epochs for each window position (NaN where the shift runs off
    # either end of the series).
    shifted_df = pd.DataFrame({f'Shift_{s}': pd.Series(epoch).shift(s) for s in (-shifts)})
    shifted_epochs = shifted_df.to_numpy()

    # Expand per-detection arrays across the window columns so every
    # operation below is elementwise on (n, m) matrices.
    epoch_expanded = np.tile(epoch[:, None], (1, m))
    scan_time_expanded = np.tile(scan_time[:, None], (1, m))
    num_channels_expanded = np.tile(num_channels[:, None], (1, m))
    # FIX: pulse_rate must be expanded down columns like the other arrays.
    # The bare (n,) vector broadcast against the (m,) shifts vector, which is
    # only shape-compatible when n == m and then pairs the wrong elements.
    pulse_rate_expanded = np.tile(pulse_rate[:, None], (1, m))

    # Expected arrival time for each shift position.
    expected_epoch = np.where(
        num_channels_expanded == 1,
        epoch_expanded + shifts * pulse_rate_expanded,
        np.where(
            scan_time_expanded > 2 * pulse_rate_expanded,
            epoch_expanded + shifts * pulse_rate_expanded,
            epoch_expanded + shifts * scan_time_expanded * num_channels_expanded
        )
    )

    # Tolerance window around each expected arrival.
    window_size = np.where(
        num_channels == 1,
        np.where(pulse_rate > 10, 1, pulse_rate),
        scan_time / 2.0
    )
    window_size_expanded = np.tile(window_size[:, None], (1, m))

    lower_limits = expected_epoch - window_size_expanded
    upper_limits = expected_epoch + window_size_expanded

    # Elementwise containment test: did each neighbor arrive on schedule?
    hits = (
        (shifted_epochs >= lower_limits) &
        (shifted_epochs <= upper_limits)
    ).astype(np.int32)

    # The record itself (shift 0) always counts as detected.
    hits[:, num_detects] = 1

    hit_ratio = hits.sum(axis=1) / hits.shape[1]

    # FIX: "consecutive detection" means an adjacent (previous or next) slot
    # also hit, as in the original implementation; slicing [:, 1:-1] summed
    # every interior column and only matched that when num_detects == 2.
    adjacent = hits[:, num_detects - 1] + hits[:, num_detects] + hits[:, num_detects + 1]
    cons_det = (adjacent > 1).astype(np.int32)

    def _longest_run(row):
        # Longest run of consecutive 1s in a binary row.
        best = run = 0
        for v in row:
            run = run + 1 if v else 0
            if run > best:
                best = run
        return best

    max_count = np.array([_longest_run(row) for row in hits])

    return hits, hit_ratio, cons_det, max_count
352
+
353
+
354
+
169
355
 
170
356