pymast 0.0.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pymast/predictors.py CHANGED
@@ -1,28 +1,119 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  """
3
- Created on Tue Nov 14 10:52:20 2023
3
+ Predictor functions for radio telemetry classification and filtering.
4
4
 
5
- @author: KNebiolo
5
+ This module provides statistical predictor functions used during Naive Bayes
6
+ classification to distinguish true fish detections from noise. Each predictor
7
+ calculates a feature that helps identify legitimate versus spurious detections.
8
+
9
+ Core Predictors
10
+ ---------------
11
+ - **noise_ratio**: Ratio of miscoded to correctly-coded detections in time window
12
+ - **series_hit**: Whether detection is in-series with previous/next detection
13
+ - **detection_history**: Maximum contiguous sequence of expected detections
14
+ - **factors**: Prime factorization for pulse rate calculations
15
+
16
+ Classification Pipeline
17
+ -----------------------
18
+ These predictors are calculated during data import and used as features in the
19
+ Naive Bayes classifier. They help identify:
20
+ 1. Miscoded tags from environmental noise
21
+ 2. Out-of-series detections from spurious signals
22
+ 3. Detection patterns inconsistent with known pulse rates
23
+
24
+ Typical Usage
25
+ -------------
26
+ >>> import pymast.predictors as predictors
27
+ >>> import numpy as np
28
+ >>>
29
+ >>> # Calculate noise ratio for detections
30
+ >>> noise = predictors.noise_ratio(
31
+ ... duration=300.0,  # seconds (5-minute window)
32
+ ... freq_codes=freq_codes_array,
33
+ ... epochs=epochs_array,
34
+ ... study_tags=['166.380 7', '166.380 12']
35
+ ... )
36
+ >>>
37
+ >>> # Check detection history
38
+ >>> det_hist, hit_ratio, cons_det, max_seq = predictors.detection_history(
+ ... epoch=epoch_array,
+ ... pulse_rate=pulse_rate_array,
+ ... num_detects=5,
+ ... num_channels=num_channels_array,
+ ... scan_time=scan_time_array
+ ... )
45
+
46
+ Notes
47
+ -----
48
+ - Predictors assume VHF pulse-coded tags (frequency + code)
49
+ - Noise ratio is binned over a caller-supplied window (duration, in seconds; e.g. 300 for 5 minutes)
50
+ - Series hit checks for mortality rate changes (active vs expired tags)
51
+ - Detection history accounts for multi-channel scan patterns
52
+
53
+ See Also
54
+ --------
55
+ naive_bayes.train : Classifier training using these predictors
56
+ parsers : Data import where predictors are first calculated
6
57
  """
58
+
59
+ import numba as nb
7
60
  import numpy as np
8
61
  import pandas as pd
62
+ pd.set_option('display.float_format', '{:.10f}'.format)
63
+ np.set_printoptions(suppress=True, precision=10)
9
64
 
10
- def noise_ratio (duration, freq_codes,epochs,study_tags):
11
-
12
- ''' function calculates the ratio of miscoded, pure noise detections, to matching frequency/code
13
- detections within the duration specified.
14
-
15
- In other words, what is the ratio of miscoded to correctly coded detections within the duration specified
16
65
 
17
- duration = moving window length in minutes
18
- data = current data file
19
- study_tags = list or list like object of study tags
20
- '''
66
+ def noise_ratio (duration, freq_codes,epochs,study_tags):
67
+ """
68
+ Calculate ratio of miscoded to total detections within moving time window.
69
+
70
+ Identifies noise by comparing miscoded detections (freq_codes not in study_tags)
71
+ to total detection count within specified duration. High noise ratio indicates
72
+ environmental interference or receiver malfunction.
73
+
74
+ Parameters
75
+ ----------
76
+ duration : float
77
+ Moving window length in seconds (e.g., 300.0 for 5 minutes)
78
+ freq_codes : array_like
79
+ Array of detected frequency-code strings (e.g., ['166.380 7', '166.380 12'])
80
+ epochs : array_like
81
+ Array of detection timestamps (seconds since 1970-01-01)
82
+ study_tags : list of str
83
+ List of valid freq_code tags deployed in study
84
+
85
+ Returns
86
+ -------
87
+ numpy.ndarray
88
+ Array of noise ratios (float32) with same length as input arrays.
89
+ Values range from 0 (no noise) to 1 (all noise).
90
+
91
+ Notes
92
+ -----
93
+ - Bins epochs into duration-sized windows
94
+ - Counts miscoded detections per bin (not in study_tags)
95
+ - Calculates ratio: miscodes / total_detections
96
+ - Ratio propagated to all detections in same bin
97
+
98
+ Examples
99
+ --------
100
+ >>> import numpy as np
101
+ >>> freq_codes = np.array(['166.380 7', '166.380 99', '166.380 7'])
102
+ >>> epochs = np.array([1000.0, 1100.0, 1150.0])
103
+ >>> study_tags = ['166.380 7', '166.380 12']
104
+ >>> predictors.noise_ratio(300.0, freq_codes, epochs, study_tags)
105
+ array([0.33333334, 0.33333334, 0.33333334], dtype=float32)
106
+
107
+ See Also
108
+ --------
109
+ naive_bayes.train : Uses noise_ratio as classification feature
110
+ """
21
111
  # identify miscodes
22
112
  miscode = np.isin(freq_codes, study_tags, invert = True)
23
113
 
24
- # bin everything into nearest 5 min time bin and count miscodes and total number of detections
25
- binned_epoch = epochs//duration
114
+ # bin everything into nearest duration-sized time bin and count miscodes and total number of detections
115
+ # Ensure epochs are integer seconds (or convertible)
116
+ binned_epoch = (epochs // duration).astype('int64')
26
117
 
27
118
  # Now identify the number of unique freq-codes within each bin
28
119
  # Create a DataFrame from the arrays
@@ -46,21 +137,81 @@ def noise_ratio (duration, freq_codes,epochs,study_tags):
46
137
  return df.noise_ratio.values.astype(np.float32)
47
138
 
48
139
def factors(n):
    """
    Return all factors of integer n.

    Used to calculate least common multiplier for pulse rate calculations.
    Helps identify valid pulse intervals when multiple tags share similar rates.

    Parameters
    ----------
    n : int
        Integer to factorize

    Returns
    -------
    list of int
        All factors of n in ascending order (including 1 and n)

    Examples
    --------
    >>> predictors.factors(12)
    [1, 2, 3, 4, 6, 12]

    Notes
    -----
    Simple brute-force factorization, not optimized for large numbers.
    See: http://stackoverflow.com/questions/16996217/prime-factorization-list
    """
    # Use a plain range() instead of np.arange(): avoids a Python-level loop
    # over numpy scalars and returns native ints rather than np.int64 objects.
    return [i for i in range(1, n + 1) if n % i == 0]
57
171
 
172
+
58
173
  def series_hit (lags, pulse_rate, mort_rate, status,):
59
- '''seriesHit is a function for returning whether or not a detection on a specific
60
- frequency/code is in series with the previous or next detection on that same
61
- frequency/code
62
- '''
174
+ """
175
+ Check if detection lag matches expected pulse rate (in-series detection).
176
+
177
+ Determines whether time difference to previous/next detection is consistent
178
+ with tag's programmed pulse rate (active) or mortality rate (expired tag).
179
+
180
+ Parameters
181
+ ----------
182
+ lags : array_like
183
+ Time differences to previous detection (seconds)
184
+ pulse_rate : array_like
185
+ Programmed pulse rate for each tag (seconds)
186
+ mort_rate : array_like
187
+ Mortality pulse rate for each tag (seconds)
188
+ status : array_like
189
+ Tag status ('A' for active, other for expired/mortality)
190
+
191
+ Returns
192
+ -------
193
+ numpy.ndarray
194
+ Binary array: 1 if detection is in-series, 0 if out-of-series
195
+
196
+ Notes
197
+ -----
198
+ - Active tags checked against pulse_rate
199
+ - Expired tags checked against mort_rate
200
+ - Uses modulo to check if lag is multiple of expected rate
201
+
202
+ Examples
203
+ --------
204
+ >>> lags = np.array([5.0, 10.0, 7.5])
205
+ >>> pulse_rate = np.array([5.0, 5.0, 5.0])
206
+ >>> mort_rate = np.array([30.0, 30.0, 30.0])
207
+ >>> status = np.array(['A', 'A', 'A'])
208
+ >>> predictors.series_hit(lags, pulse_rate, mort_rate, status)
209
+ array([1, 1, 0])
63
210
 
211
+ See Also
212
+ --------
213
+ detection_history : More comprehensive in-series detection check
214
+ """
64
215
  # determine if the lag is potentially in series with the correct pulse rate based on status
65
216
  series_hit = np.where(status == 'A',
66
217
  np.where(lags % pulse_rate == 0,
@@ -70,101 +221,136 @@ def series_hit (lags, pulse_rate, mort_rate, status,):
70
221
  1,
71
222
  0)
72
223
  )
224
+
225
+
226
def max_contiguous_sequence(arr):
    """
    Return the length of the longest run of consecutive 1's in a binary array.

    Helper function for detection_history to identify the longest stretch of
    expected in-series detections.

    Parameters
    ----------
    arr : array_like
        Binary array (0s and 1s)

    Returns
    -------
    int
        Length of longest consecutive sequence of 1's (0 if there are none)

    Examples
    --------
    >>> arr = np.array([1, 1, 0, 1, 1, 1, 0, 1])
    >>> predictors.max_contiguous_sequence(arr)
    3
    """
    # Single pass with a running counter: extend the current run on a 1,
    # reset it on a 0, and track the best run seen so far.
    longest = 0
    current = 0
    for value in arr:
        if value == 1:
            current += 1
            if current > longest:
                longest = current
        else:
            current = 0
    return longest
251
+
252
+
253
def detection_history(epoch, pulse_rate, num_detects, num_channels, scan_time):
    """
    Calculate the expected-detection history around each detection.

    Looks forward and backward num_detects positions to determine which
    neighboring detections arrived at the time expected from the tag's
    pulse rate (or, for multi-channel receivers, the channel scan cycle).

    Parameters
    ----------
    epoch : numpy.ndarray
        Detection timestamps (seconds since 1970-01-01)
    pulse_rate : numpy.ndarray
        Programmed pulse rate for each detection's tag (seconds)
    num_detects : int
        Number of detections to look forward/backward (window half-width)
    num_channels : numpy.ndarray
        Number of receiver channels for each detection
    scan_time : numpy.ndarray
        Scan duration per channel (seconds) for each detection

    Returns
    -------
    detection_history : numpy.ndarray of int32, shape (n, 2*num_detects + 1)
        1 where the shifted neighbor fell inside the expected time window
    hit_ratio : numpy.ndarray
        Row-wise fraction of expected detections actually heard
    cons_det : numpy.ndarray of int32
        1 where at least one immediately adjacent (previous/next) slot hit
    max_count : numpy.ndarray
        Longest contiguous run of hits in each row

    Notes
    -----
    - The center column (shift 0, the detection itself) is always set to 1
    - When scan_time > 2 * pulse_rate a multi-channel receiver still hears
      every burst, so pulse rate drives the expected interval; otherwise the
      full scan cycle (scan_time * num_channels) does
    - NaNs from shifting off either end of the series never satisfy the
      window comparison, so edge rows simply score 0 at those positions

    Examples
    --------
    >>> epochs = np.array([100.0, 105.0, 110.0, 115.0, 120.0])
    >>> pulse_rate = np.array([5.0] * 5)
    >>> num_channels = np.array([1] * 5)
    >>> scan_time = np.array([1.0] * 5)
    >>> _, _, _, max_count = predictors.detection_history(
    ...     epochs, pulse_rate, 2, num_channels, scan_time)
    >>> max_count
    array([3, 4, 5, 4, 3])

    See Also
    --------
    max_contiguous_sequence : Standalone helper for finding the longest run
    series_hit : Simpler in-series detection check
    """
    shifts = np.arange(-num_detects, num_detects + 1)
    m = len(shifts)

    # Shifted epochs for each window position (NaN where the shift runs off
    # either end of the series).
    shifted_df = pd.DataFrame({f'Shift_{s}': pd.Series(epoch).shift(s) for s in (-shifts)})
    shifted_epochs = shifted_df.to_numpy()

    # Expand per-detection arrays across the window columns so every
    # operation below is elementwise on (n, m) matrices.
    epoch_expanded = np.tile(epoch[:, None], (1, m))
    scan_time_expanded = np.tile(scan_time[:, None], (1, m))
    num_channels_expanded = np.tile(num_channels[:, None], (1, m))
    # FIX: pulse_rate must be expanded down columns like the other arrays.
    # The bare (n,) vector broadcast against the (m,) shifts vector, which is
    # only shape-compatible when n == m and then pairs the wrong elements.
    pulse_rate_expanded = np.tile(pulse_rate[:, None], (1, m))

    # Expected arrival time for each shift position.
    expected_epoch = np.where(
        num_channels_expanded == 1,
        epoch_expanded + shifts * pulse_rate_expanded,
        np.where(
            scan_time_expanded > 2 * pulse_rate_expanded,
            epoch_expanded + shifts * pulse_rate_expanded,
            epoch_expanded + shifts * scan_time_expanded * num_channels_expanded
        )
    )

    # Tolerance window around each expected arrival.
    window_size = np.where(
        num_channels == 1,
        np.where(pulse_rate > 10, 1, pulse_rate),
        scan_time / 2.0
    )
    window_size_expanded = np.tile(window_size[:, None], (1, m))

    lower_limits = expected_epoch - window_size_expanded
    upper_limits = expected_epoch + window_size_expanded

    # Elementwise containment test: did each neighbor arrive on schedule?
    hits = (
        (shifted_epochs >= lower_limits) &
        (shifted_epochs <= upper_limits)
    ).astype(np.int32)

    # The record itself (shift 0) always counts as detected.
    hits[:, num_detects] = 1

    hit_ratio = hits.sum(axis=1) / hits.shape[1]

    # FIX: "consecutive detection" means an adjacent (previous or next) slot
    # also hit, as in the original implementation; slicing [:, 1:-1] summed
    # every interior column and only matched that when num_detects == 2.
    adjacent = hits[:, num_detects - 1] + hits[:, num_detects] + hits[:, num_detects + 1]
    cons_det = (adjacent > 1).astype(np.int32)

    def _longest_run(row):
        # Longest run of consecutive 1s in a binary row.
        best = run = 0
        for v in row:
            run = run + 1 if v else 0
            if run > best:
                best = run
        return best

    max_count = np.array([_longest_run(row) for row in hits])

    return hits, hit_ratio, cons_det, max_count
352
+
353
+
354
+
169
355
 
170
356