pymast 0.0.6__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pymast/__init__.py +31 -2
- pymast/fish_history.py +59 -6
- pymast/formatter.py +886 -548
- pymast/logger.py +58 -0
- pymast/naive_bayes.py +116 -9
- pymast/overlap_removal.py +2327 -490
- pymast/parsers.py +1111 -239
- pymast/predictors.py +302 -116
- pymast/radio_project.py +1382 -512
- pymast/validation.py +224 -0
- pymast-1.0.0.dist-info/METADATA +636 -0
- pymast-1.0.0.dist-info/RECORD +15 -0
- {pymast-0.0.6.dist-info → pymast-1.0.0.dist-info}/WHEEL +1 -1
- pymast/table_merge.py +0 -154
- pymast-0.0.6.dist-info/METADATA +0 -19
- pymast-0.0.6.dist-info/RECORD +0 -14
- {pymast-0.0.6.dist-info → pymast-1.0.0.dist-info/licenses}/LICENSE.txt +0 -0
- {pymast-0.0.6.dist-info → pymast-1.0.0.dist-info}/top_level.txt +0 -0
pymast/predictors.py
CHANGED
|
@@ -1,28 +1,119 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
"""
|
|
3
|
-
|
|
3
|
+
Predictor functions for radio telemetry classification and filtering.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
This module provides statistical predictor functions used during Naive Bayes
|
|
6
|
+
classification to distinguish true fish detections from noise. Each predictor
|
|
7
|
+
calculates a feature that helps identify legitimate versus spurious detections.
|
|
8
|
+
|
|
9
|
+
Core Predictors
|
|
10
|
+
---------------
|
|
11
|
+
- **noise_ratio**: Ratio of miscoded to correctly-coded detections in time window
|
|
12
|
+
- **series_hit**: Whether detection is in-series with previous/next detection
|
|
13
|
+
- **detection_history**: Maximum contiguous sequence of expected detections
|
|
14
|
+
- **factors**: Prime factorization for pulse rate calculations
|
|
15
|
+
|
|
16
|
+
Classification Pipeline
|
|
17
|
+
-----------------------
|
|
18
|
+
These predictors are calculated during data import and used as features in the
|
|
19
|
+
Naive Bayes classifier. They help identify:
|
|
20
|
+
1. Miscoded tags from environmental noise
|
|
21
|
+
2. Out-of-series detections from spurious signals
|
|
22
|
+
3. Detection patterns inconsistent with known pulse rates
|
|
23
|
+
|
|
24
|
+
Typical Usage
|
|
25
|
+
-------------
|
|
26
|
+
>>> import pymast.predictors as predictors
|
|
27
|
+
>>> import numpy as np
|
|
28
|
+
>>>
|
|
29
|
+
>>> # Calculate noise ratio for detections
|
|
30
|
+
>>> noise = predictors.noise_ratio(
|
|
31
|
+
... duration=5.0,
|
|
32
|
+
... freq_codes=freq_codes_array,
|
|
33
|
+
... epochs=epochs_array,
|
|
34
|
+
... study_tags=['166.380 7', '166.380 12']
|
|
35
|
+
... )
|
|
36
|
+
>>>
|
|
37
|
+
>>> # Check detection history
|
|
38
|
+
>>> max_seq = predictors.detection_history(
|
|
39
|
+
... epoch=epoch_array,
|
|
40
|
+
... pulse_rate=pulse_rate_array,
|
|
41
|
+
... num_detects=5,
|
|
42
|
+
... num_channels=1,
|
|
43
|
+
... scan_time=1.0
|
|
44
|
+
... )
|
|
45
|
+
|
|
46
|
+
Notes
|
|
47
|
+
-----
|
|
48
|
+
- Predictors assume VHF pulse-coded tags (frequency + code)
|
|
49
|
+
- Noise ratio uses 5-minute moving window by default
|
|
50
|
+
- Series hit checks for mortality rate changes (active vs expired tags)
|
|
51
|
+
- Detection history accounts for multi-channel scan patterns
|
|
52
|
+
|
|
53
|
+
See Also
|
|
54
|
+
--------
|
|
55
|
+
naive_bayes.train : Classifier training using these predictors
|
|
56
|
+
parsers : Data import where predictors are first calculated
|
|
6
57
|
"""
|
|
58
|
+
|
|
59
|
+
import numba as nb
|
|
7
60
|
import numpy as np
|
|
8
61
|
import pandas as pd
|
|
62
|
+
pd.set_option('display.float_format', '{:.10f}'.format)
|
|
63
|
+
np.set_printoptions(suppress=True, precision=10)
|
|
9
64
|
|
|
10
|
-
def noise_ratio (duration, freq_codes,epochs,study_tags):
|
|
11
|
-
|
|
12
|
-
''' function calculates the ratio of miscoded, pure noise detections, to matching frequency/code
|
|
13
|
-
detections within the duration specified.
|
|
14
|
-
|
|
15
|
-
In other words, what is the ratio of miscoded to correctly coded detections within the duration specified
|
|
16
65
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
66
|
+
def noise_ratio (duration, freq_codes,epochs,study_tags):
|
|
67
|
+
"""
|
|
68
|
+
Calculate ratio of miscoded to total detections within moving time window.
|
|
69
|
+
|
|
70
|
+
Identifies noise by comparing miscoded detections (freq_codes not in study_tags)
|
|
71
|
+
to total detection count within specified duration. High noise ratio indicates
|
|
72
|
+
environmental interference or receiver malfunction.
|
|
73
|
+
|
|
74
|
+
Parameters
|
|
75
|
+
----------
|
|
76
|
+
duration : float
|
|
77
|
+
Moving window length in seconds (e.g., 300.0 for 5 minutes)
|
|
78
|
+
freq_codes : array_like
|
|
79
|
+
Array of detected frequency-code strings (e.g., ['166.380 7', '166.380 12'])
|
|
80
|
+
epochs : array_like
|
|
81
|
+
Array of detection timestamps (seconds since 1970-01-01)
|
|
82
|
+
study_tags : list of str
|
|
83
|
+
List of valid freq_code tags deployed in study
|
|
84
|
+
|
|
85
|
+
Returns
|
|
86
|
+
-------
|
|
87
|
+
numpy.ndarray
|
|
88
|
+
Array of noise ratios (float32) with same length as input arrays.
|
|
89
|
+
Values range from 0 (no noise) to 1 (all noise).
|
|
90
|
+
|
|
91
|
+
Notes
|
|
92
|
+
-----
|
|
93
|
+
- Bins epochs into duration-sized windows
|
|
94
|
+
- Counts miscoded detections per bin (not in study_tags)
|
|
95
|
+
- Calculates ratio: miscodes / total_detections
|
|
96
|
+
- Ratio propagated to all detections in same bin
|
|
97
|
+
|
|
98
|
+
Examples
|
|
99
|
+
--------
|
|
100
|
+
>>> import numpy as np
|
|
101
|
+
>>> freq_codes = np.array(['166.380 7', '166.380 99', '166.380 7'])
|
|
102
|
+
>>> epochs = np.array([1000.0, 1100.0, 1200.0])
|
|
103
|
+
>>> study_tags = ['166.380 7', '166.380 12']
|
|
104
|
+
>>> predictors.noise_ratio(300.0, freq_codes, epochs, study_tags)
|
|
105
|
+
array([0.33333334, 0.33333334, 0.33333334], dtype=float32)
|
|
106
|
+
|
|
107
|
+
See Also
|
|
108
|
+
--------
|
|
109
|
+
naive_bayes.train : Uses noise_ratio as classification feature
|
|
110
|
+
"""
|
|
21
111
|
# identify miscodes
|
|
22
112
|
miscode = np.isin(freq_codes, study_tags, invert = True)
|
|
23
113
|
|
|
24
|
-
# bin everything into nearest
|
|
25
|
-
|
|
114
|
+
# bin everything into nearest duration-sized time bin and count miscodes and total number of detections
|
|
115
|
+
# Ensure epochs are integer seconds (or convertible)
|
|
116
|
+
binned_epoch = (epochs // duration).astype('int64')
|
|
26
117
|
|
|
27
118
|
# Now identify the number of unique freq-codes within each bin
|
|
28
119
|
# Create a DataFrame from the arrays
|
|
@@ -46,21 +137,81 @@ def noise_ratio (duration, freq_codes,epochs,study_tags):
|
|
|
46
137
|
return df.noise_ratio.values.astype(np.float32)
|
|
47
138
|
|
|
48
139
|
def factors(n):
    """
    Return all factors of integer n in ascending order.

    Used to calculate least common multiplier for pulse rate calculations.
    Helps identify valid pulse intervals when multiple tags share similar rates.

    Parameters
    ----------
    n : int
        Integer to factorize. Non-positive input yields an empty list
        (same result as the previous brute-force scan).

    Returns
    -------
    list of int
        All factors of n (including 1 and n), sorted ascending, as plain
        Python ints.

    Examples
    --------
    >>> factors(12)
    [1, 2, 3, 4, 6, 12]

    Notes
    -----
    Trial division up to sqrt(n): each divisor i <= sqrt(n) pairs with n // i,
    so the scan is O(sqrt(n)) instead of the previous O(n) pass over every
    candidate.
    """
    n = int(n)
    small = []  # divisors <= sqrt(n), found in ascending order
    large = []  # paired divisors  > sqrt(n), ascending here, reversed on return
    i = 1
    while i * i <= n:
        if n % i == 0:
            small.append(i)
            partner = n // i
            if partner != i:  # avoid duplicating a perfect-square root
                large.append(partner)
        i += 1
    return small + large[::-1]
|
|
57
171
|
|
|
172
|
+
|
|
58
173
|
def series_hit (lags, pulse_rate, mort_rate, status,):
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
174
|
+
"""
|
|
175
|
+
Check if detection lag matches expected pulse rate (in-series detection).
|
|
176
|
+
|
|
177
|
+
Determines whether time difference to previous/next detection is consistent
|
|
178
|
+
with tag's programmed pulse rate (active) or mortality rate (expired tag).
|
|
179
|
+
|
|
180
|
+
Parameters
|
|
181
|
+
----------
|
|
182
|
+
lags : array_like
|
|
183
|
+
Time differences to previous detection (seconds)
|
|
184
|
+
pulse_rate : array_like
|
|
185
|
+
Programmed pulse rate for each tag (seconds)
|
|
186
|
+
mort_rate : array_like
|
|
187
|
+
Mortality pulse rate for each tag (seconds)
|
|
188
|
+
status : array_like
|
|
189
|
+
Tag status ('A' for active, other for expired/mortality)
|
|
190
|
+
|
|
191
|
+
Returns
|
|
192
|
+
-------
|
|
193
|
+
numpy.ndarray
|
|
194
|
+
Binary array: 1 if detection is in-series, 0 if out-of-series
|
|
195
|
+
|
|
196
|
+
Notes
|
|
197
|
+
-----
|
|
198
|
+
- Active tags checked against pulse_rate
|
|
199
|
+
- Expired tags checked against mort_rate
|
|
200
|
+
- Uses modulo to check if lag is multiple of expected rate
|
|
201
|
+
|
|
202
|
+
Examples
|
|
203
|
+
--------
|
|
204
|
+
>>> lags = np.array([5.0, 10.0, 7.5])
|
|
205
|
+
>>> pulse_rate = np.array([5.0, 5.0, 5.0])
|
|
206
|
+
>>> mort_rate = np.array([30.0, 30.0, 30.0])
|
|
207
|
+
>>> status = np.array(['A', 'A', 'A'])
|
|
208
|
+
>>> predictors.series_hit(lags, pulse_rate, mort_rate, status)
|
|
209
|
+
array([1, 1, 0])
|
|
63
210
|
|
|
211
|
+
See Also
|
|
212
|
+
--------
|
|
213
|
+
detection_history : More comprehensive in-series detection check
|
|
214
|
+
"""
|
|
64
215
|
# determine if the lag is potentially in series with the correct pulse rate based on status
|
|
65
216
|
series_hit = np.where(status == 'A',
|
|
66
217
|
np.where(lags % pulse_rate == 0,
|
|
@@ -70,101 +221,136 @@ def series_hit (lags, pulse_rate, mort_rate, status,):
|
|
|
70
221
|
1,
|
|
71
222
|
0)
|
|
72
223
|
)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def max_contiguous_sequence(arr):
    """
    Find maximum number of consecutive 1's in binary array.

    Helper function for detection_history to identify longest run of
    expected in-series detections.

    Parameters
    ----------
    arr : array_like
        Binary array (0s and 1s)

    Returns
    -------
    int
        Length of longest consecutive sequence of 1's

    Examples
    --------
    >>> arr = np.array([1, 1, 0, 1, 1, 1, 0, 1])
    >>> predictors.max_contiguous_sequence(arr)
    3
    """
    # Track the current run of non-zero entries and the best run seen so far.
    longest = 0
    current = 0
    for value in arr:
        if value:
            current += 1
            if current > longest:
                longest = current
        else:
            # A zero terminates the run; start counting again.
            current = 0
    return longest
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def detection_history(epoch, pulse_rate, num_detects, num_channels, scan_time):
    """
    Build a detection-history window and summary metrics for each detection.

    Looks forward and backward ``num_detects`` detections and, for each slot
    in the resulting window, marks whether a detection occurred when the tag's
    pulse rate (or the receiver's multi-channel scan pattern) predicted one.

    Parameters
    ----------
    epoch : numpy.ndarray
        Detection timestamps (seconds since 1970-01-01).
    pulse_rate : array_like
        Programmed pulse rate (seconds). NOTE(review): ``shifts * pulse_rate``
        below only broadcasts cleanly when pulse_rate is a scalar (or happens
        to have the same length as the shift vector); a per-detection array of
        length len(epoch) would need ``pulse_rate[:, None]`` — TODO confirm
        against callers.
    num_detects : int
        Number of detections to look forward/backward; the window has
        ``2 * num_detects + 1`` slots.
    num_channels : numpy.ndarray
        Number of receiver channels, one value per detection.
    scan_time : numpy.ndarray
        Scan duration per channel (seconds), one value per detection.

    Returns
    -------
    detection_history : numpy.ndarray of int32, shape (n, 2 * num_detects + 1)
        1 where a neighbouring detection fell inside the expected time window
        for that shift; the center column (the detection itself) is forced to 1.
    hit_ratio : numpy.ndarray of float
        Fraction of window slots containing a hit, per detection.
    cons_det : numpy.ndarray of int32
        1 if more than one hit occurs in the interior of the window
        (excluding the first and last slot), else 0.
    max_count : numpy.ndarray of int
        Longest run of consecutive hits in each detection's window.

    Examples
    --------
    >>> hist, hit_ratio, cons_det, max_count = predictors.detection_history(
    ...     epochs, pulse_rate, 2, num_channels, scan_time)

    Notes
    -----
    - When ``scan_time > 2 * pulse_rate`` on a multi-channel receiver, the
      expected spacing falls back to the pulse rate; otherwise it is
      ``scan_time * num_channels`` per shift (detection aliasing).
    - Fully vectorized except for the final per-row run-length scan.

    See Also
    --------
    max_contiguous_sequence : Helper function for finding longest run
    series_hit : Simpler in-series detection check
    """
    # Window of relative shifts: -num_detects .. +num_detects (center is 0).
    shifts = np.arange(-num_detects, num_detects + 1)

    # Create shifted epochs for each detection window (NaNs for out-of-range
    # shifts). pandas shift(-s) pulls the epoch s rows ahead into this row.
    shifted_df = pd.DataFrame({f'Shift_{s}': pd.Series(epoch).shift(s) for s in (-shifts)})
    shifted_epochs = shifted_df.to_numpy()

    # Expand per-detection arrays to (n, m) for vectorized operations.
    m = len(shifts)
    epoch_expanded = np.tile(epoch[:, None], (1, m))
    scan_time_expanded = np.tile(scan_time[:, None], (1, m))
    num_channels_expanded = np.tile(num_channels[:, None], (1, m))

    # Expected epoch per shift: single-channel receivers expect one pulse-rate
    # spacing per shift; multi-channel receivers expect scan-pattern spacing
    # unless the scan is much longer than the pulse rate.
    expected_epoch = np.where(
        num_channels_expanded == 1,
        epoch_expanded + shifts * pulse_rate,
        np.where(
            scan_time_expanded > 2 * pulse_rate,
            epoch_expanded + shifts * pulse_rate,
            epoch_expanded + shifts * scan_time_expanded * num_channels_expanded
        )
    )

    # Tolerance window around each expected epoch; capped at 1 s for slow
    # (pulse_rate > 10) single-channel tags, half a scan otherwise.
    window_size = np.where(
        num_channels == 1,
        np.where(pulse_rate > 10, 1, pulse_rate),
        scan_time / 2.0
    )
    window_size_expanded = np.tile(window_size[:, None], (1, m))

    # Elementwise window limits around each expected epoch.
    lower_limits = expected_epoch - window_size_expanded
    upper_limits = expected_epoch + window_size_expanded

    # Elementwise comparison: a slot is a hit when the actually observed
    # (shifted) epoch lands inside its own expected window. NaNs from
    # out-of-range shifts compare False and so count as misses.
    detection_history = (
        (shifted_epochs >= lower_limits) &
        (shifted_epochs <= upper_limits)
    ).astype(np.int32)

    # The center slot is the detection itself — always a hit.
    detection_history[:, num_detects] = 1

    # Summary metrics per detection.
    hit_ratio = detection_history.sum(axis=1) / detection_history.shape[1]
    cons_det = (detection_history[:, 1:-1].sum(axis=1) > 1).astype(np.int32)

    # Longest run of consecutive hits in each row's window.
    max_count = np.array([max_contiguous_sequence(hist) for hist in detection_history])

    return detection_history, hit_ratio, cons_det, max_count
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
|
|
169
355
|
|
|
170
356
|
|