pymast 1.0.0__tar.gz → 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {pymast-1.0.0 → pymast-1.0.2}/PKG-INFO +1 -1
  2. {pymast-1.0.0 → pymast-1.0.2}/pymast/__init__.py +1 -1
  3. {pymast-1.0.0 → pymast-1.0.2}/pymast/parsers.py +197 -186
  4. {pymast-1.0.0 → pymast-1.0.2}/pymast/radio_project.py +164 -66
  5. {pymast-1.0.0 → pymast-1.0.2}/pymast.egg-info/PKG-INFO +1 -1
  6. {pymast-1.0.0 → pymast-1.0.2}/pyproject.toml +19 -19
  7. {pymast-1.0.0 → pymast-1.0.2}/LICENSE.txt +0 -0
  8. {pymast-1.0.0 → pymast-1.0.2}/README.md +0 -0
  9. {pymast-1.0.0 → pymast-1.0.2}/pymast/fish_history.py +0 -0
  10. {pymast-1.0.0 → pymast-1.0.2}/pymast/formatter.py +0 -0
  11. {pymast-1.0.0 → pymast-1.0.2}/pymast/logger.py +0 -0
  12. {pymast-1.0.0 → pymast-1.0.2}/pymast/naive_bayes.py +0 -0
  13. {pymast-1.0.0 → pymast-1.0.2}/pymast/overlap_removal.py +0 -0
  14. {pymast-1.0.0 → pymast-1.0.2}/pymast/predictors.py +0 -0
  15. {pymast-1.0.0 → pymast-1.0.2}/pymast/validation.py +0 -0
  16. {pymast-1.0.0 → pymast-1.0.2}/pymast.egg-info/SOURCES.txt +0 -0
  17. {pymast-1.0.0 → pymast-1.0.2}/pymast.egg-info/dependency_links.txt +0 -0
  18. {pymast-1.0.0 → pymast-1.0.2}/pymast.egg-info/requires.txt +0 -0
  19. {pymast-1.0.0 → pymast-1.0.2}/pymast.egg-info/top_level.txt +0 -0
  20. {pymast-1.0.0 → pymast-1.0.2}/setup.cfg +0 -0
  21. {pymast-1.0.0 → pymast-1.0.2}/setup.py +0 -0
  22. {pymast-1.0.0 → pymast-1.0.2}/tests/test_basic.py +0 -0
  23. {pymast-1.0.0 → pymast-1.0.2}/tests/test_csv_pit.py +0 -0
  24. {pymast-1.0.0 → pymast-1.0.2}/tests/test_formatter_tte.py +0 -0
  25. {pymast-1.0.0 → pymast-1.0.2}/tests/test_initial_state_release.py +0 -0
  26. {pymast-1.0.0 → pymast-1.0.2}/tests/test_overlap_hdf5_integration.py +0 -0
  27. {pymast-1.0.0 → pymast-1.0.2}/tests/test_overlap_loading.py +0 -0
  28. {pymast-1.0.0 → pymast-1.0.2}/tests/test_overlap_small.py +0 -0
  29. {pymast-1.0.0 → pymast-1.0.2}/tests/test_overlap_unit.py +0 -0
  30. {pymast-1.0.0 → pymast-1.0.2}/tests/test_parsers_basic.py +0 -0
  31. {pymast-1.0.0 → pymast-1.0.2}/tests/test_pit_multiple_parser.py +0 -0
  32. {pymast-1.0.0 → pymast-1.0.2}/tests/test_pit_parser.py +0 -0
  33. {pymast-1.0.0 → pymast-1.0.2}/tests/test_unified_pit.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pymast
- Version: 1.0.0
+ Version: 1.0.2
  Summary: Movement Analysis Software for Telemetry (MAST) - False positive removal and movement analysis for radio telemetry data
  Author: Theodore Castro-Santos
  Author-email: "Kevin P. Nebiolo" <kevin.nebiolo@kleinschmidtgroup.com>
@@ -28,7 +28,7 @@ from .parsers import *
  from .radio_project import *

  # Version
- __version__ = '1.0.0'
+ __version__ = '1.0.2'

  # Define what's available when using "from pymast import *"
  __all__ = [
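Note: the only change to pymast/__init__.py is the version bump. A minimal check of which release is active, assuming pymast is installed in the current environment:

    import pymast
    print(pymast.__version__)   # expected to print '1.0.2' after upgrading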
@@ -80,32 +80,32 @@ predictors.noise_ratio : Miscoded detection ratio calculation
  import pandas as pd
  import numpy as np
  import datetime
- import os
- import pymast.predictors as predictors
- import sys
-
- def _append_raw_data(db_dir, telem_dat, data_columns=None):
- with pd.HDFStore(db_dir, mode='a') as store:
- append_kwargs = {
- 'key': 'raw_data',
- 'value': telem_dat,
- 'format': 'table',
- 'index': False,
- 'min_itemsize': {
- 'freq_code': 20,
- 'rec_type': 20,
- 'rec_id': 20,
- },
- 'append': True,
- 'chunksize': 1000000,
- }
- if data_columns is not None:
- append_kwargs['data_columns'] = data_columns
- store.append(**append_kwargs)
-
- def ares(file_name,
- db_dir,
- rec_id,
+ import os
+ import pymast.predictors as predictors
+ import sys
+
+ def _append_raw_data(db_dir, telem_dat, data_columns=None):
+ with pd.HDFStore(db_dir, mode='a') as store:
+ append_kwargs = {
+ 'key': 'raw_data',
+ 'value': telem_dat,
+ 'format': 'table',
+ 'index': False,
+ 'min_itemsize': {
+ 'freq_code': 20,
+ 'rec_type': 20,
+ 'rec_id': 20,
+ },
+ 'append': True,
+ 'chunksize': 1000000,
+ }
+ if data_columns is not None:
+ append_kwargs['data_columns'] = data_columns
+ store.append(**append_kwargs)
+
+ def ares(file_name,
+ db_dir,
+ rec_id,
  study_tags,
  scan_time = 1,
  channels = 1,
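Note: the `_append_raw_data` helper above is only re-indented in 1.0.2; it centralizes every `store.append` call onto the shared /raw_data table. A minimal usage sketch, assuming the helper is in scope (as in pymast.parsers), a hypothetical HDF5 path 'project.h5', and an illustrative one-row frame; the real parsers pass the full raw-data schema shown later in this diff:

    demo = pd.DataFrame({'freq_code': ['164.480'],
                         'rec_type': ['orion'],
                         'rec_id': ['R01'],
                         'power': [np.float32(-78.0)]})
    # appends onto the shared /raw_data table, creating it on first use;
    # data_columns=True makes the string columns queryable with HDFStore 'where' clauses
    _append_raw_data('project.h5', demo, data_columns=True)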
@@ -229,26 +229,26 @@ def ares(file_name,
  telem_dat.epoch.values,
  study_tags)

- telem_dat = telem_dat.astype({'power':'float32',
- 'freq_code':'object',
- 'time_stamp':'datetime64[ns]',
- 'scan_time':'float32',
- 'channels':'int32',
- 'rec_type':'object',
- 'epoch':'int64',
- 'noise_ratio':'float32',
- 'rec_id':'object'})
-
- _append_raw_data(db_dir, telem_dat)
-
-
- def orion_import(file_name,
- db_dir,
- rec_id,
- study_tags,
- scan_time = 1.,
- channels = 1,
- ant_to_rec_dict = None):
+ telem_dat = telem_dat.astype({'power':'float32',
+ 'freq_code':'object',
+ 'time_stamp':'datetime64[ns]',
+ 'scan_time':'float32',
+ 'channels':'int32',
+ 'rec_type':'object',
+ 'epoch':'int64',
+ 'noise_ratio':'float32',
+ 'rec_id':'object'})
+
+ _append_raw_data(db_dir, telem_dat)
+
+
+ def orion_import(file_name,
+ db_dir,
+ rec_id,
+ study_tags,
+ scan_time = 1.,
+ channels = 1,
+ ant_to_rec_dict = None):
  """
  Import Sigma Eight Orion receiver data into MAST HDF5 database.

@@ -334,33 +334,33 @@ def orion_import(file_name,
  telem_dat['Freq'] = telem_dat['Freq'].apply(lambda x: f"{x:.3f}")


- def _write_orion_subset(df, receiver_id, epoch_dtype):
- df = df.copy()
- df['rec_id'] = np.repeat(receiver_id, len(df))
- df.drop(['Ant'], axis = 1, inplace = True)
- df = df.astype({'power':'float32',
- 'freq_code':'object',
- 'time_stamp':'datetime64[ns]',
- 'scan_time':'float32',
- 'channels':'int32',
- 'rec_type':'object',
- 'epoch': epoch_dtype,
- 'noise_ratio':'float32',
- 'rec_id':'object'})
-
- df = df[['power',
- 'time_stamp',
- 'epoch',
- 'freq_code',
- 'noise_ratio',
- 'scan_time',
- 'channels',
- 'rec_id',
- 'rec_type']]
-
- _append_raw_data(db_dir, df, data_columns=True)
-
- if len(telem_dat) > 0:
+ def _write_orion_subset(df, receiver_id, epoch_dtype):
+ df = df.copy()
+ df['rec_id'] = np.repeat(receiver_id, len(df))
+ df.drop(['Ant'], axis = 1, inplace = True)
+ df = df.astype({'power':'float32',
+ 'freq_code':'object',
+ 'time_stamp':'datetime64[ns]',
+ 'scan_time':'float32',
+ 'channels':'int32',
+ 'rec_type':'object',
+ 'epoch': epoch_dtype,
+ 'noise_ratio':'float32',
+ 'rec_id':'object'})
+
+ df = df[['power',
+ 'time_stamp',
+ 'epoch',
+ 'freq_code',
+ 'noise_ratio',
+ 'scan_time',
+ 'channels',
+ 'rec_id',
+ 'rec_type']]
+
+ _append_raw_data(db_dir, df, data_columns=True)
+
+ if len(telem_dat) > 0:
  # add file name to data
  #['fileName'] = np.repeat(file_name,len(telem_dat)) #Note I'm going back here to the actual file name without the path. Is that OK? I prefer it, but it's a potential source of confusion

@@ -389,18 +389,18 @@ def orion_import(file_name,
  telem_dat.epoch.values,
  study_tags)

- # if there is no antenna to receiver dictionary
- if ant_to_rec_dict == None:
- _write_orion_subset(telem_dat, rec_id, 'int64')
- # if there is an antenna to receiver dictionary
- else:
- for i in ant_to_rec_dict.keys():
- # get site from dictionary
- site = ant_to_rec_dict[i]
-
- # get telemetryt data associated with this site
- telem_dat_sub = telem_dat[telem_dat.Ant == 1]
- _write_orion_subset(telem_dat_sub, site, 'float32')
+ # if there is no antenna to receiver dictionary
+ if ant_to_rec_dict == None:
+ _write_orion_subset(telem_dat, rec_id, 'int64')
+ # if there is an antenna to receiver dictionary
+ else:
+ for i in ant_to_rec_dict.keys():
+ # get site from dictionary
+ site = ant_to_rec_dict[i]
+
+ # get telemetryt data associated with this site
+ telem_dat_sub = telem_dat[telem_dat.Ant == 1]
+ _write_orion_subset(telem_dat_sub, site, 'float32')
  else:
  raise ValueError("Invalid import parameters, no data returned")
  sys.exit()
@@ -492,7 +492,7 @@ def vr2_import(file_name,db_dir,study_tags, rec_id):
  'noise_ratio':'float32',
  'rec_id':'object'})

- _append_raw_data(db_dir, telem_dat)
+ _append_raw_data(db_dir, telem_dat)

  def srx1200(file_name,
  db_dir,
@@ -785,7 +785,7 @@ def srx1200(file_name,
  'rec_id',
  'rec_type']]

- _append_raw_data(db_dir, telem_dat, data_columns=True)
+ _append_raw_data(db_dir, telem_dat, data_columns=True)

  # if the data doesn't have a header
  else:
@@ -857,7 +857,7 @@ def srx1200(file_name,
  'rec_id',
  'rec_type']]

- _append_raw_data(db_dir, telem_dat, data_columns=True)
+ _append_raw_data(db_dir, telem_dat, data_columns=True)

  def srx800(file_name,
  db_dir,
@@ -1146,16 +1146,16 @@ def srx800(file_name,
  telem_dat_sub['epoch'] = np.round((telem_dat_sub.time_stamp - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s'),6)

  # get setup number for every row
- try:
- telem_dat_sub['setup'] = get_setup(
- telem_dat_sub.epoch.values,
- setup_df.epoch.values
- )
- except (ValueError, TypeError, IndexError) as e:
- raise ValueError(
- f"Failed to compute setup mapping for antenna '{ant}' at site '{site}'. "
- "Check setup table epoch alignment and input data integrity."
- ) from e
+ try:
+ telem_dat_sub['setup'] = get_setup(
+ telem_dat_sub.epoch.values,
+ setup_df.epoch.values
+ )
+ except (ValueError, TypeError, IndexError) as e:
+ raise ValueError(
+ f"Failed to compute setup mapping for antenna '{ant}' at site '{site}'. "
+ "Check setup table epoch alignment and input data integrity."
+ ) from e

  # get frequency from channel
  telem_dat_sub['Frequency'] = get_frequency(telem_dat_sub.setup.values,
@@ -1488,7 +1488,7 @@ def srx600(file_name,
  'noise_ratio':'float32',
  'rec_id':'object'})

- _append_raw_data(db_dir, telem_dat_sub, data_columns=True)
+ _append_raw_data(db_dir, telem_dat_sub, data_columns=True)
  else:
  telem_dat = pd.read_fwf(file_name,
  colspecs = [(0,9),(9,19),(19,29),(29,36),(36,44),(44,52)],
@@ -1553,7 +1553,7 @@ def srx600(file_name,
  'noise_ratio':'float32',
  'rec_id':'object'})

- _append_raw_data(db_dir, telem_dat_sub)
+ _append_raw_data(db_dir, telem_dat_sub)



@@ -1644,13 +1644,13 @@ def PIT(file_name,
  # First, analyze the file to determine format
  def analyze_file_format(file_name):
  """Dynamically determine PIT file format and header structure"""
- with open(file_name, 'r') as file:
- lines = []
- for _ in range(20): # Read first 20 lines to analyze format
- line = file.readline()
- if not line:
- break
- lines.append(line.rstrip('\n'))
+ with open(file_name, 'r') as file:
+ lines = []
+ for _ in range(20): # Read first 20 lines to analyze format
+ line = file.readline()
+ if not line:
+ break
+ lines.append(line.rstrip('\n'))

  # Check if CSV format (look for commas in sample lines)
  csv_indicators = 0
@@ -1711,10 +1711,10 @@ def PIT(file_name,
  telem_dat = pd.read_csv(file_name, dtype=str)
  print(f"Auto-detected columns: {list(telem_dat.columns)}")

- except (pd.errors.ParserError, UnicodeDecodeError, ValueError) as e:
- raise ValueError(
- f"CSV auto-detection failed for PIT file '{file_name}': {e}"
- ) from e
+ except (pd.errors.ParserError, UnicodeDecodeError, ValueError) as e:
+ raise ValueError(
+ f"CSV auto-detection failed for PIT file '{file_name}': {e}"
+ ) from e

  # Find timestamp column dynamically
  timestamp_col = find_column_by_patterns(telem_dat, ['timestamp', 'time stamp', 'date', 'scan date', 'detected'])
@@ -1732,8 +1732,8 @@ def PIT(file_name,
  if not telem_dat["time_stamp"].isna().all():
  print(f"Successfully parsed timestamps using format: {fmt or 'auto-detect'}")
  break
- except (ValueError, TypeError) as e:
- continue
+ except (ValueError, TypeError) as e:
+ continue
  else:
  raise ValueError("Could not find timestamp column")

@@ -1773,14 +1773,14 @@ def PIT(file_name,
  # Fixed-Width Format Parsing (original logic)

  # Read header information for format detection
- with open(file_name, 'r') as file:
- header_lines = []
- for _ in range(max(skiprows, 10)):
- line = file.readline()
- if not line:
- break
- header_lines.append(line.rstrip('\n'))
- header_text = " ".join(header_lines).lower()
+ with open(file_name, 'r') as file:
+ header_lines = []
+ for _ in range(max(skiprows, 10)):
+ line = file.readline()
+ if not line:
+ break
+ header_lines.append(line.rstrip('\n'))
+ header_text = " ".join(header_lines).lower()

  # Define colspecs for different fixed-width formats
  if 'latitude' in header_text or 'longitude' in header_text:
@@ -1842,7 +1842,15 @@ def PIT(file_name,
  antenna_col = None
  for col in telem_dat.columns:
  col_lower = str(col).lower().strip()
- if col_lower in ('antenna id', 'antenna', 'ant', 'antennae', 'antennae id'):
+ if col_lower in (
+ 'antenna id',
+ 'antenna',
+ 'ant',
+ 'antennae',
+ 'antennae id',
+ 'reader id',
+ 'readerid',
+ ):
  antenna_col = col
  break

@@ -1854,12 +1862,12 @@ def PIT(file_name,
  telem_dat['antenna_num'] = pd.to_numeric(telem_dat['antenna_num'], errors='coerce')

  # Prepare mapping dict keys as strings and ints for robust lookup
- ant_map = {}
- for k, v in ant_to_rec_dict.items():
- key_str = str(k).strip()
- if key_str.isdigit():
- ant_map[int(key_str)] = v
- ant_map[key_str] = v
+ ant_map = {}
+ for k, v in ant_to_rec_dict.items():
+ key_str = str(k).strip()
+ if key_str.isdigit():
+ ant_map[int(key_str)] = v
+ ant_map[key_str] = v

  # Map by numeric antenna if possible, else by raw string
  telem_dat['rec_id'] = telem_dat['antenna_num'].map(ant_map)
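Note: the block above keeps the dual-keyed lookup used for fixed-width PIT files; every antenna key is registered both as a string and, when numeric, as an int, so the later `.map(ant_map)` works whether `antenna_num` parsed cleanly or not. A small self-contained worked example (the receiver IDs are hypothetical):

    ant_to_rec_dict = {'01': 'R01', 2: 'R02'}
    ant_map = {}
    for k, v in ant_to_rec_dict.items():
        key_str = str(k).strip()
        if key_str.isdigit():
            ant_map[int(key_str)] = v
        ant_map[key_str] = v
    # ant_map == {1: 'R01', '01': 'R01', 2: 'R02', '2': 'R02'}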
@@ -1877,7 +1885,10 @@ def PIT(file_name,
  # drop detections that do not map to a known receiver
  telem_dat = telem_dat.dropna(subset=['rec_id'])
  else:
- raise ValueError('Multi-antenna fixed-width PIT file requires an antenna column but none was found')
+ raise ValueError(
+ 'Multi-antenna fixed-width PIT file requires an antenna/reader column '
+ '(e.g., "Antenna ID" or "Reader ID"), but none was found'
+ )

  # Data cleaning - remove invalid entries
  print(f"\nCleaning data - original records: {len(telem_dat)}")
@@ -1899,49 +1910,49 @@ def PIT(file_name,
  telem_dat = telem_dat[telem_dat['freq_code'].str.len() > 3]
  telem_dat = telem_dat[~telem_dat['freq_code'].isna()]

- # Finalize fields and append to HDF5 /raw_data
- if len(telem_dat) == 0:
- print('No valid PIT rows after cleaning; nothing to append')
- return
-
- if 'power' not in telem_dat.columns:
- telem_dat['power'] = np.nan
-
- # compute epoch as int64 seconds and other derived fields
- telem_dat['epoch'] = (pd.to_datetime(telem_dat['time_stamp']).astype('int64') // 10**9).astype('int64')
+ # Finalize fields and append to HDF5 /raw_data
+ if len(telem_dat) == 0:
+ print('No valid PIT rows after cleaning; nothing to append')
+ return
+
+ if 'power' not in telem_dat.columns:
+ telem_dat['power'] = np.nan
+
+ # compute epoch as int64 seconds and other derived fields
+ telem_dat['epoch'] = (pd.to_datetime(telem_dat['time_stamp']).astype('int64') // 10**9).astype('int64')
  telem_dat['channels'] = np.repeat(channels, len(telem_dat))
  telem_dat['scan_time'] = np.repeat(scan_time, len(telem_dat))
  telem_dat['rec_type'] = np.repeat(rec_type, len(telem_dat))

  # compute noise ratio if study_tags provided
- try:
- telem_dat['noise_ratio'] = predictors.noise_ratio(
- 5.0,
- telem_dat.freq_code.values,
- telem_dat.epoch.values,
- study_tags
- )
- except (ValueError, TypeError, KeyError, IndexError) as e:
- raise ValueError(f"Failed to compute noise_ratio for PIT data: {e}") from e
-
- # ensure dtypes
- telem_dat = telem_dat.astype({'time_stamp': 'datetime64[ns]',
- 'epoch': 'int64',
- 'freq_code': 'object',
- 'power': 'float32',
- 'rec_id': 'object',
- 'rec_type': 'object',
- 'scan_time': 'float32',
- 'channels': 'int32',
- 'noise_ratio': 'float32'})
+ try:
+ telem_dat['noise_ratio'] = predictors.noise_ratio(
+ 5.0,
+ telem_dat.freq_code.values,
+ telem_dat.epoch.values,
+ study_tags
+ )
+ except (ValueError, TypeError, KeyError, IndexError) as e:
+ raise ValueError(f"Failed to compute noise_ratio for PIT data: {e}") from e
+
+ # ensure dtypes
+ telem_dat = telem_dat.astype({'time_stamp': 'datetime64[ns]',
+ 'epoch': 'int64',
+ 'freq_code': 'object',
+ 'power': 'float32',
+ 'rec_id': 'object',
+ 'rec_type': 'object',
+ 'scan_time': 'float32',
+ 'channels': 'int32',
+ 'noise_ratio': 'float32'})

  # reorder columns to match expected schema
  cols = ['time_stamp', 'epoch', 'freq_code', 'power', 'noise_ratio', 'scan_time', 'channels', 'rec_id', 'rec_type']
  cols_existing = [c for c in cols if c in telem_dat.columns]

- _append_raw_data(db_dir, telem_dat[cols_existing], data_columns=True)
- with pd.HDFStore(db_dir, mode='a') as store:
- print('Store keys after append:', store.keys())
+ _append_raw_data(db_dir, telem_dat[cols_existing], data_columns=True)
+ with pd.HDFStore(db_dir, mode='a') as store:
+ print('Store keys after append:', store.keys())


  def PIT_Multiple(
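Note: the epoch conversion in the block above turns parsed timestamps into integer seconds since the Unix epoch by flooring the nanosecond representation. A quick standalone check of what that produces, assuming pandas is imported as in the module header (naive timestamps are treated as UTC):

    ts = pd.Series(pd.to_datetime(['2024-01-01 00:00:30']))
    epoch = (ts.astype('int64') // 10**9).astype('int64')
    # epoch[0] == 1704067230  (seconds since 1970-01-01)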
@@ -2019,29 +2030,29 @@ def PIT_Multiple(
  "LocationDetail", "Type", "Recapture", "Sex", "GeneticSampleID", "Comments"
  ]

- # Read the CSV into a DataFrame, skipping rows if needed
- telem_dat = pd.read_csv(file_name, names=col_names, header=0, skiprows=skiprows, dtype=str)
-
- mode_str = "multi-antenna"
- if ant_to_rec_dict is None:
- raise ValueError("ant_to_rec_dict is required for PIT_Multiple")
-
- # Convert "TimeStamp" to datetime with explicit format
- telem_dat["time_stamp"] = pd.to_datetime(telem_dat["TimeStamp"], format="%m/%d/%Y %H:%M", errors="coerce")
-
- # Ensure "Tag1Dec" and "Tag1Hex" are treated as strings (avoid scientific notation issues)
- telem_dat["Tag1Dec"] = telem_dat["Tag1Dec"].astype(str)
- telem_dat["Tag1Hex"] = telem_dat["Tag1Hex"].astype(str)
-
- telem_dat["freq_code"] = telem_dat["Tag1Hex"].astype(str).str.strip()
- antenna_raw = telem_dat["Antennae"].astype(str).str.strip()
- antenna_num = pd.to_numeric(antenna_raw.str.extract(r"(\d+)")[0], errors="coerce")
- rec_id = antenna_num.map(ant_to_rec_dict)
- if rec_id.isna().any():
- rec_id = rec_id.fillna(antenna_raw.map(ant_to_rec_dict))
- telem_dat["rec_id"] = rec_id
- telem_dat = telem_dat.dropna(subset=["rec_id"])
-
+ # Read the CSV into a DataFrame, skipping rows if needed
+ telem_dat = pd.read_csv(file_name, names=col_names, header=0, skiprows=skiprows, dtype=str)
+
+ mode_str = "multi-antenna"
+ if ant_to_rec_dict is None:
+ raise ValueError("ant_to_rec_dict is required for PIT_Multiple")
+
+ # Convert "TimeStamp" to datetime with explicit format
+ telem_dat["time_stamp"] = pd.to_datetime(telem_dat["TimeStamp"], format="%m/%d/%Y %H:%M", errors="coerce")
+
+ # Ensure "Tag1Dec" and "Tag1Hex" are treated as strings (avoid scientific notation issues)
+ telem_dat["Tag1Dec"] = telem_dat["Tag1Dec"].astype(str)
+ telem_dat["Tag1Hex"] = telem_dat["Tag1Hex"].astype(str)
+
+ telem_dat["freq_code"] = telem_dat["Tag1Hex"].astype(str).str.strip()
+ antenna_raw = telem_dat["Antennae"].astype(str).str.strip()
+ antenna_num = pd.to_numeric(antenna_raw.str.extract(r"(\d+)")[0], errors="coerce")
+ rec_id = antenna_num.map(ant_to_rec_dict)
+ if rec_id.isna().any():
+ rec_id = rec_id.fillna(antenna_raw.map(ant_to_rec_dict))
+ telem_dat["rec_id"] = rec_id
+ telem_dat = telem_dat.dropna(subset=["rec_id"])
+
  # if after_cleanup == 0:
  # raise ValueError(f"No valid records found in {file_name}")

@@ -2101,4 +2112,4 @@ def PIT_Multiple(



-
+
@@ -95,21 +95,21 @@ import pymast.predictors as predictors
  import matplotlib.pyplot as plt
  from matplotlib import rcParams
  from scipy import interpolate
- try:
- from tqdm import tqdm
- except ImportError:
- def tqdm(iterable, **kwargs):
- return iterable
+ try:
+ from tqdm import tqdm
+ except ImportError:
+ def tqdm(iterable, **kwargs):
+ return iterable
  import shutil
  import warnings
  import dask.dataframe as dd
  import dask.array as da
- try:
- from dask_ml.cluster import KMeans
- _KMEANS_IMPL = 'dask'
- except ImportError:
- from sklearn.cluster import KMeans
- _KMEANS_IMPL = 'sklearn'
+ try:
+ from dask_ml.cluster import KMeans
+ _KMEANS_IMPL = 'dask'
+ except ImportError:
+ from sklearn.cluster import KMeans
+ _KMEANS_IMPL = 'sklearn'

  # Initialize logger
  logger = logging.getLogger('pymast.radio_project')
@@ -415,12 +415,12 @@ class radio_project():
  if self.non_interactive:
  logger.debug(f"Non-interactive mode: auto-answering '{prompt_text}' with '{default}'")
  return default
- try:
- return input(prompt_text)
- except (EOFError, OSError) as exc:
- raise RuntimeError(
- "Input prompt failed. Set project.non_interactive = True to use defaults."
- ) from exc
+ try:
+ return input(prompt_text)
+ except (EOFError, OSError) as exc:
+ raise RuntimeError(
+ "Input prompt failed. Set project.non_interactive = True to use defaults."
+ ) from exc

  def telem_data_import(self,
  rec_id,
@@ -496,9 +496,19 @@ class radio_project():

  logger.info(f" Found {len(tFiles)} file(s) to import")

+ # Track detections per file for statistics
+ detections_per_file = []
+
  # for every file call the correct text parser and import
  for i, f in enumerate(tqdm(tFiles, desc=f"Importing {rec_id}", unit="file"), 1):
  logger.debug(f" Processing file {i}/{len(tFiles)}: {f}")
+
+ # Count detections before import
+ try:
+ pre_count = len(pd.read_hdf(self.db, key='raw_data', where=f'rec_id = "{rec_id}"'))
+ except (KeyError, FileNotFoundError):
+ pre_count = 0
+
  # get the complete file directory
  f_dir = os.path.join(file_dir,f)

@@ -533,8 +543,91 @@ class radio_project():
  else:
  logger.error(f"No import routine for receiver type: {rec_type}")
  raise ValueError(f"No import routine available for receiver type: {rec_type}")
+
+ # Count detections after import
+ try:
+ post_count = len(pd.read_hdf(self.db, key='raw_data', where=f'rec_id = "{rec_id}"'))
+ detections_this_file = post_count - pre_count
+ detections_per_file.append(detections_this_file)
+ except (KeyError, FileNotFoundError):
+ detections_per_file.append(0)

  logger.info(f"✓ Import complete for receiver {rec_id}: {len(tFiles)} file(s) processed")
+
+ # Calculate and display import statistics
+ try:
+ raw_data = pd.read_hdf(self.db, key='raw_data', where=f'rec_id = "{rec_id}"')
+
+ # Total Detection Count
+ total_detections = len(raw_data)
+ logger.info(f"\n{'='*60}")
+ logger.info(f"IMPORT STATISTICS FOR {rec_id}")
+ logger.info(f"{'='*60}")
+ logger.info(f"Total Detection Count: {total_detections:,}")
+
+ if total_detections > 0:
+ # Detection count summary statistics
+ logger.info(f"\nDetection Summary Statistics:")
+ logger.info(f" Mean detections per file: {total_detections / len(tFiles):.1f}")
+ logger.info(f" Files processed: {len(tFiles)}")
+
+ # 5-number summary for detections per file
+ if len(detections_per_file) > 0:
+ det_array = np.array(detections_per_file)
+ logger.info(f"\nDetections Per File (5-number summary):")
+ logger.info(f" Min: {np.min(det_array):,.0f}")
+ logger.info(f" Q1: {np.percentile(det_array, 25):,.0f}")
+ logger.info(f" Median: {np.median(det_array):,.0f}")
+ logger.info(f" Q3: {np.percentile(det_array, 75):,.0f}")
+ logger.info(f" Max: {np.max(det_array):,.0f}")
+
+ # Unique Tag Count
+ unique_tags = raw_data['freq_code'].nunique()
+ logger.info(f"\nUnique Tag Count: {unique_tags}")
+
+ # Duplicate Tag Count and IDs
+ # Check for detections at the exact same timestamp (true duplicates)
+ if 'time_stamp' in raw_data.columns:
+ dup_mask = raw_data.duplicated(subset=['freq_code', 'time_stamp'], keep=False)
+ duplicate_count = dup_mask.sum()
+
+ if duplicate_count > 0:
+ duplicate_tags = raw_data.loc[dup_mask, 'freq_code'].unique()
+ logger.info(f"\nDuplicate Detection Count (same timestamp): {duplicate_count:,}")
+ logger.info(f"Duplicate Tag IDs ({len(duplicate_tags)} tags):")
+ for tag in sorted(duplicate_tags)[:10]: # Show first 10
+ tag_dups = dup_mask & (raw_data['freq_code'] == tag)
+ logger.info(f" {tag}: {tag_dups.sum()} duplicate(s)")
+ if len(duplicate_tags) > 10:
+ logger.info(f" ... and {len(duplicate_tags) - 10} more")
+ else:
+ logger.info(f"\nDuplicate Detection Count: 0 (no exact timestamp duplicates)")
+
+ # Time Coverage
+ if 'time_stamp' in raw_data.columns:
+ raw_data['time_stamp'] = pd.to_datetime(raw_data['time_stamp'])
+ start_time = raw_data['time_stamp'].min()
+ end_time = raw_data['time_stamp'].max()
+ duration = end_time - start_time
+
+ logger.info(f"\nTime Coverage:")
+ logger.info(f" Start: {start_time}")
+ logger.info(f" End: {end_time}")
+ logger.info(f" Duration: {duration.days} days, {duration.seconds // 3600} hours")
+
+ # Detection rate
+ if duration.total_seconds() > 0:
+ det_per_hour = total_detections / (duration.total_seconds() / 3600)
+ logger.info(f" Detection rate: {det_per_hour:.1f} detections/hour")
+
+ logger.info(f"{'='*60}\n")
+ else:
+ logger.warning(f"No detections found for receiver {rec_id}")
+
+ except KeyError:
+ logger.warning(f"Could not retrieve statistics - raw_data table not found in database")
+ except Exception as e:
+ logger.warning(f"Error calculating import statistics: {e}")

  def get_fish(self, rec_id, train = True, reclass_iter = None):
  logger.info(f"Getting fish for receiver {rec_id}")
@@ -1576,16 +1669,16 @@ class radio_project():
  node_path = node._v_pathname
  print(f" Copying {node_path}...")

- try:
- # Use recursive=True to copy entire subtree (Groups, Tables, Arrays, etc.)
- h5in.copy_node(
- where=node_path,
- newparent=h5out.root,
- recursive=True,
- filters=filters
- )
- except (tables.NodeError, tables.HDF5ExtError, OSError, ValueError) as e:
- raise RuntimeError(f"Failed to copy HDF5 node {node_path}: {e}") from e
+ try:
+ # Use recursive=True to copy entire subtree (Groups, Tables, Arrays, etc.)
+ h5in.copy_node(
+ where=node_path,
+ newparent=h5out.root,
+ recursive=True,
+ filters=filters
+ )
+ except (tables.NodeError, tables.HDF5ExtError, OSError, ValueError) as e:
+ raise RuntimeError(f"Failed to copy HDF5 node {node_path}: {e}") from e

  # Get new size
  new_size = os.path.getsize(output_path)
@@ -1603,26 +1696,29 @@ class radio_project():
  def make_recaptures_table(self, export=True, pit_study=False):
  '''Creates a recaptures key in the HDF5 file, iterating over receivers to manage memory.'''
  logger.info("Creating recaptures table")
+ logger.info(f" PIT study mode: {pit_study}")
  logger.info(f" Processing {len(self.receivers)} receiver(s)")
  # prepare a heartbeat log so long runs can be monitored (one-line per receiver)
  heartbeat_dir = os.path.join(self.project_dir, 'build')
- try:
- os.makedirs(heartbeat_dir, exist_ok=True)
- except OSError as e:
- raise RuntimeError(
- f"Failed to create heartbeat directory '{heartbeat_dir}': {e}"
- ) from e
+ try:
+ os.makedirs(heartbeat_dir, exist_ok=True)
+ except OSError as e:
+ raise RuntimeError(
+ f"Failed to create heartbeat directory '{heartbeat_dir}': {e}"
+ ) from e
  heartbeat_path = os.path.join(heartbeat_dir, 'recaptures_heartbeat.log')
  print(f"Starting recaptures: {len(self.receivers)} receivers. Heartbeat -> {heartbeat_path}")
- try:
- with open(heartbeat_path, 'a') as _hb:
- _hb.write(f"START {datetime.datetime.now().isoformat()} receivers={len(self.receivers)}\n")
- except OSError as e:
- raise RuntimeError(
- f"Failed to write heartbeat start to '{heartbeat_path}': {e}"
- ) from e
-
- if pit_study==False:
+ try:
+ with open(heartbeat_path, 'a') as _hb:
+ _hb.write(f"START {datetime.datetime.now().isoformat()} receivers={len(self.receivers)}\n")
+ except OSError as e:
+ raise RuntimeError(
+ f"Failed to write heartbeat start to '{heartbeat_path}': {e}"
+ ) from e
+
+ if not pit_study:
+ # RADIO STUDY PATH
+ logger.info(" Using RADIO study processing path")
  # Convert release dates to datetime if not already done
  self.tags['rel_date'] = pd.to_datetime(self.tags['rel_date'])
  tags_copy = self.tags.copy()
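Note: the heartbeat file written above is a plain append-only log, one line per receiver, that can be tailed to monitor long recaptures runs. Illustrative contents built from the write formats shown in this diff (timestamps, receiver IDs, and row counts are made up; the export path is whatever self.output_dir resolves to):

    START 2024-05-01T10:15:02.123456 receivers=3
    2024-05-01T10:18:40.551203 rec=R01 rows=48210
    2024-05-01T10:22:05.004911 rec=R02 rows=39977
    DONE 2024-05-01T10:30:12.770045 export=<output_dir>/recaptures.csv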
@@ -1787,15 +1883,17 @@ class radio_project():
  logger.info(f" ✓ Recaps for {rec} compiled and written to HDF5")
  print(f"[recaptures] ✓ {rec} written to database", flush=True)
  # append heartbeat line
- try:
- with open(heartbeat_path, 'a') as _hb:
- _hb.write(f"{datetime.datetime.now().isoformat()} rec={rec} rows={len(rec_dat)}\n")
- except OSError as e:
- raise RuntimeError(
- f"Failed to write heartbeat for receiver {rec} to '{heartbeat_path}': {e}"
- ) from e
+ try:
+ with open(heartbeat_path, 'a') as _hb:
+ _hb.write(f"{datetime.datetime.now().isoformat()} rec={rec} rows={len(rec_dat)}\n")
+ except OSError as e:
+ raise RuntimeError(
+ f"Failed to write heartbeat for receiver {rec} to '{heartbeat_path}': {e}"
+ ) from e

  else:
+ # PIT STUDY PATH
+ logger.info(" Using PIT study processing path")
  # Loop over each receiver in self.receivers
  for rec in tqdm(self.receivers.index, desc="Processing PIT receivers", unit="receiver"):
  logger.info(f" Processing {rec} (PIT study)...")
@@ -1917,13 +2015,13 @@ class radio_project():

  logger.info(f" ✓ PIT recaps for {rec} compiled and written to HDF5")
  print(f"[recaptures] ✓ {rec} PIT data written to database", flush=True)
- try:
- with open(heartbeat_path, 'a') as _hb:
- _hb.write(f"{datetime.datetime.now().isoformat()} pit_rec={rec} rows={len(pit_data)}\n")
- except OSError as e:
- raise RuntimeError(
- f"Failed to write PIT heartbeat for receiver {rec} to '{heartbeat_path}': {e}"
- ) from e
+ try:
+ with open(heartbeat_path, 'a') as _hb:
+ _hb.write(f"{datetime.datetime.now().isoformat()} pit_rec={rec} rows={len(pit_data)}\n")
+ except OSError as e:
+ raise RuntimeError(
+ f"Failed to write PIT heartbeat for receiver {rec} to '{heartbeat_path}': {e}"
+ ) from e


  if export:
@@ -1933,16 +2031,16 @@ class radio_project():
  rec_data.to_csv(os.path.join(self.output_dir,'recaptures.csv'), index=False)
  logger.info(f" ✓ Export complete: {os.path.join(self.output_dir,'recaptures.csv')}")
  print(f"[recaptures] ✓ Export complete: {os.path.join(self.output_dir,'recaptures.csv')}", flush=True)
- try:
- with open(heartbeat_path, 'a') as _hb:
- _hb.write(
- f"DONE {datetime.datetime.now().isoformat()} export="
- f"{os.path.join(self.output_dir, 'recaptures.csv')}\n"
- )
- except OSError as e:
- raise RuntimeError(
- f"Failed to write heartbeat completion to '{heartbeat_path}': {e}"
- ) from e
+ try:
+ with open(heartbeat_path, 'a') as _hb:
+ _hb.write(
+ f"DONE {datetime.datetime.now().isoformat()} export="
+ f"{os.path.join(self.output_dir, 'recaptures.csv')}\n"
+ )
+ except OSError as e:
+ raise RuntimeError(
+ f"Failed to write heartbeat completion to '{heartbeat_path}': {e}"
+ ) from e


  def undo_recaptures(self):
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pymast
- Version: 1.0.0
+ Version: 1.0.2
  Summary: Movement Analysis Software for Telemetry (MAST) - False positive removal and movement analysis for radio telemetry data
  Author: Theodore Castro-Santos
  Author-email: "Kevin P. Nebiolo" <kevin.nebiolo@kleinschmidtgroup.com>
@@ -1,23 +1,23 @@
- [build-system]
- requires = ["setuptools>=61", "wheel"]
- build-backend = "setuptools.build_meta"
+ [build-system]
+ requires = ["setuptools>=61", "wheel"]
+ build-backend = "setuptools.build_meta"

  [project]
  name = "pymast"
- version = "1.0.0"
+ version = "1.0.2"
  description = "Movement Analysis Software for Telemetry (MAST) - False positive removal and movement analysis for radio telemetry data"
  readme = "README.md"
  authors = [
  {name = "Kevin P. Nebiolo", email = "kevin.nebiolo@kleinschmidtgroup.com"},
  {name = "Theodore Castro-Santos"}
  ]
- license = "MIT"
- classifiers = [
- "Development Status :: 4 - Beta",
- "Intended Audience :: Science/Research",
- "Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.8",
- "Programming Language :: Python :: 3.9",
+ license = "MIT"
+ classifiers = [
+ "Development Status :: 4 - Beta",
+ "Intended Audience :: Science/Research",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Topic :: Scientific/Engineering :: Bio-Informatics",
@@ -30,14 +30,14 @@ dependencies = [
  "matplotlib>=3.4.0",
  "statsmodels>=0.12.0",
  "networkx>=2.5",
- "scipy>=1.7.1",
- "scikit-learn>=0.24.0",
- "h5py>=3.0.0",
- "dask>=2021.3.0",
- "dask-ml>=1.9.0",
- "distributed>=2021.3.0",
- "numba>=0.53.0",
- "tables>=3.8.0",
+ "scipy>=1.7.1",
+ "scikit-learn>=0.24.0",
+ "h5py>=3.0.0",
+ "dask>=2021.3.0",
+ "dask-ml>=1.9.0",
+ "distributed>=2021.3.0",
+ "numba>=0.53.0",
+ "tables>=3.8.0",
  "intervaltree>=3.1.0",
  ]
