pymast 0.0.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pymast/parsers.py CHANGED
@@ -1,18 +1,169 @@
1
1
  # -*- coding: utf-8 -*-
2
+ """
3
+ Data parsers for radio telemetry receiver file formats.
4
+
5
+ This module provides parser functions to import raw detection data from various
6
+ radio telemetry receiver manufacturers into the MAST HDF5 database. Each parser
7
+ handles manufacturer-specific file formats and standardizes the data into a common
8
+ schema for downstream processing.
9
+
10
+ Supported Receiver Types
11
+ ------------------------
12
+ - **ARES**: Lotek Advanced Radio Telemetry Systems
13
+ - **Orion**: Sigma Eight Orion receivers
14
+ - **SRX-1200**: Lotek SRX 1200 receivers (fixed-width format)
15
+ - **SRX-800**: Lotek SRX 800 receivers (fixed-width format)
16
+ - **SRX-600**: Lotek SRX 600 receivers (fixed-width format)
17
+ - **VR2**: Vemco VR2 acoustic receivers (CSV format)
18
+ - **PIT**: Passive Integrated Transponder readers
19
+
20
+ Common Data Pipeline
21
+ --------------------
22
+ All parsers follow this workflow:
23
+ 1. Read raw receiver file (CSV, fixed-width, or vendor format)
24
+ 2. Parse timestamps, frequencies, codes, power, antenna information
25
+ 3. Calculate derived fields: epoch, noise_ratio
26
+ 4. Standardize column names and data types
27
+ 5. Append to HDF5 `/raw_data` table
28
+
29
+ Standardized Output Schema
30
+ --------------------------
31
+ All parsers produce these columns:
32
+ - `time_stamp` : datetime64 - Detection timestamp
33
+ - `epoch` : float32 - Seconds since 1970-01-01
34
+ - `freq_code` : object - Frequency + code (e.g., "166.380 7")
35
+ - `power` : float32 - Signal power (dB or raw)
36
+ - `rec_id` : object - Receiver identifier
37
+ - `rec_type` : object - Receiver type (ares, orion, srx1200, etc.)
38
+ - `channels` : int32 - Number of receiver channels
39
+ - `scan_time` : float32 - Scan duration per channel (seconds)
40
+ - `noise_ratio` : float32 - Ratio of miscoded to total detections
41
+
42
+ Typical Usage
43
+ -------------
44
+ >>> import pymast.parsers as parsers
45
+ >>>
46
+ >>> # Import ARES data
47
+ >>> parsers.ares(
48
+ ... file_name='receiver_001.csv',
49
+ ... db_dir='project.h5',
50
+ ... rec_id='REC001',
51
+ ... study_tags=['166.380 7', '166.380 12'],
52
+ ... scan_time=1.0,
53
+ ... channels=1
54
+ ... )
55
+ >>>
56
+ >>> # Import SRX-1200 data
57
+ >>> parsers.srx1200(
58
+ ... file_name='srx_detections.txt',
59
+ ... db_dir='project.h5',
60
+ ... rec_id='SRX123',
61
+ ... study_tags=['166.380 7'],
62
+ ... scan_time=2.5,
63
+ ... channels=1
64
+ ... )
65
+
66
+ Notes
67
+ -----
68
+ - Frequency values are rounded to nearest 5 kHz then converted to MHz with 3 decimal precision
69
+ - Noise ratio calculated using 5-minute moving window (see `predictors.noise_ratio`)
70
+ - All parsers append to existing HDF5 `/raw_data` table (mode='a')
71
+ - Timestamps assumed to be in UTC or project-specific timezone
72
+ - PIT readers have different schemas due to antenna-based detection logic
73
+
74
+ See Also
75
+ --------
76
+ radio_project.import_data : High-level batch import interface
77
+ predictors.noise_ratio : Miscoded detection ratio calculation
78
+ """
2
79
 
3
80
  import pandas as pd
4
81
  import numpy as np
5
82
  import datetime
6
- import os
7
- import pymast.predictors as predictors
8
-
9
- def ares(file_name,
10
- db_dir,
11
- rec_id,
83
+ import os
84
+ import pymast.predictors as predictors
85
+ import sys
86
+
87
+ def _append_raw_data(db_dir, telem_dat, data_columns=None):
88
+ with pd.HDFStore(db_dir, mode='a') as store:
89
+ append_kwargs = {
90
+ 'key': 'raw_data',
91
+ 'value': telem_dat,
92
+ 'format': 'table',
93
+ 'index': False,
94
+ 'min_itemsize': {
95
+ 'freq_code': 20,
96
+ 'rec_type': 20,
97
+ 'rec_id': 20,
98
+ },
99
+ 'append': True,
100
+ 'chunksize': 1000000,
101
+ }
102
+ if data_columns is not None:
103
+ append_kwargs['data_columns'] = data_columns
104
+ store.append(**append_kwargs)
105
+
106
+ def ares(file_name,
107
+ db_dir,
108
+ rec_id,
12
109
  study_tags,
13
110
  scan_time = 1,
14
111
  channels = 1,
15
112
  ant_to_rec_dict = None):
113
+ """
114
+ Import Lotek ARES receiver data into MAST HDF5 database.
115
+
116
+ Parses CSV format detection files from Lotek Advanced Radio Telemetry Systems
117
+ (ARES) receivers. Automatically detects file format variant based on header row
118
+ and standardizes data into common schema.
119
+
120
+ Parameters
121
+ ----------
122
+ file_name : str
123
+ Absolute path to ARES CSV file
124
+ db_dir : str
125
+ Absolute path to project HDF5 database
126
+ rec_id : str
127
+ Unique receiver identifier (e.g., 'REC001', 'SITE_A')
128
+ study_tags : list of str
129
+ List of valid freq_code tags deployed in study (e.g., ['166.380 7', '166.380 12'])
130
+ Used to calculate noise_ratio
131
+ scan_time : float, optional
132
+ Scan duration per channel in seconds (default: 1.0)
133
+ channels : int, optional
134
+ Number of receiver channels (default: 1)
135
+ ant_to_rec_dict : dict, optional
136
+ Mapping of antenna IDs to receiver IDs (not currently used)
137
+
138
+ Returns
139
+ -------
140
+ None
141
+ Data appended directly to HDF5 `/raw_data` table
142
+
143
+ Notes
144
+ -----
145
+ - Handles two ARES file format variants (detected via header row)
146
+ - Frequencies rounded to nearest 5 kHz, formatted as 3-decimal MHz
147
+ - Calculates noise_ratio using 5-minute moving window
148
+ - All timestamps converted to epoch (seconds since 1970-01-01)
149
+
150
+ Examples
151
+ --------
152
+ >>> import pymast.parsers as parsers
153
+ >>> parsers.ares(
154
+ ... file_name='C:/data/ares_001.csv',
155
+ ... db_dir='C:/project/study.h5',
156
+ ... rec_id='ARES001',
157
+ ... study_tags=['166.380 7', '166.380 12', '166.380 19'],
158
+ ... scan_time=1.0,
159
+ ... channels=1
160
+ ... )
161
+
162
+ See Also
163
+ --------
164
+ radio_project.import_data : High-level batch import
165
+ predictors.noise_ratio : Noise ratio calculation
166
+ """
16
167
  # identify the receiver type
17
168
  rec_type = 'ares'
18
169
 
@@ -67,7 +218,8 @@ def ares(file_name,
67
218
  inplace = True)
68
219
 
69
220
  # now do this stuff to files regardless of type
70
- telem_dat['epoch'] = np.round((telem_dat.time_stamp - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s'),6)
221
+ # compute epoch as integer seconds (int64) to avoid floating precision loss
222
+ telem_dat['epoch'] = (telem_dat.time_stamp.astype('int64') // 10**9).astype('int64')
71
223
  telem_dat['rec_type'] = np.repeat(rec_type,len(telem_dat))
72
224
  telem_dat['rec_id'] = np.repeat(rec_id,len(telem_dat))
73
225
  telem_dat['channels'] = np.repeat(channels,len(telem_dat))
@@ -77,40 +229,78 @@ def ares(file_name,
77
229
  telem_dat.epoch.values,
78
230
  study_tags)
79
231
 
80
- telem_dat = telem_dat.astype({'power':'float32',
81
- 'freq_code':'object',
82
- 'time_stamp':'datetime64[ns]',
83
- 'scan_time':'float32',
84
- 'channels':'int32',
85
- 'rec_type':'object',
86
- 'epoch':'float32',
87
- 'noise_ratio':'float32',
88
- 'rec_id':'object'})
89
-
90
- with pd.HDFStore(db_dir, mode='a') as store:
91
- store.append(key = 'raw_data',
92
- value = telem_dat,
93
- format = 'table',
94
- index = False,
95
- min_itemsize = {'freq_code':20,
96
- 'rec_type':20,
97
- 'rec_id':20},
98
- append = True,
99
- chunksize = 1000000)
100
-
101
-
102
- def orion_import(file_name,
103
- db_dir,
104
- rec_id,
105
- study_tags,
106
- scan_time = 1,
107
- channels = 1,
108
- ant_to_rec_dict = None):
109
- '''Function imports raw Sigma Eight orion data.
232
+ telem_dat = telem_dat.astype({'power':'float32',
233
+ 'freq_code':'object',
234
+ 'time_stamp':'datetime64[ns]',
235
+ 'scan_time':'float32',
236
+ 'channels':'int32',
237
+ 'rec_type':'object',
238
+ 'epoch':'int64',
239
+ 'noise_ratio':'float32',
240
+ 'rec_id':'object'})
241
+
242
+ _append_raw_data(db_dir, telem_dat)
110
243
 
111
- Text parser uses simple column fixed column widths.
112
244
 
113
- '''
245
+ def orion_import(file_name,
246
+ db_dir,
247
+ rec_id,
248
+ study_tags,
249
+ scan_time = 1.,
250
+ channels = 1,
251
+ ant_to_rec_dict = None):
252
+ """
253
+ Import Sigma Eight Orion receiver data into MAST HDF5 database.
254
+
255
+ Parses fixed-width format detection files from Sigma Eight Orion receivers.
256
+ Automatically detects firmware version based on header row and adjusts
257
+ column parsing accordingly.
258
+
259
+ Parameters
260
+ ----------
261
+ file_name : str
262
+ Absolute path to Orion fixed-width text file
263
+ db_dir : str
264
+ Absolute path to project HDF5 database
265
+ rec_id : str
266
+ Unique receiver identifier (e.g., 'ORION_01')
267
+ study_tags : list of str
268
+ List of valid freq_code tags deployed in study
269
+ scan_time : float, optional
270
+ Scan duration per channel in seconds (default: 1.0)
271
+ channels : int, optional
272
+ Number of receiver channels (default: 1)
273
+ ant_to_rec_dict : dict, optional
274
+ Mapping of antenna IDs to receiver IDs (not currently used)
275
+
276
+ Returns
277
+ -------
278
+ None
279
+ Data appended directly to HDF5 `/raw_data` table
280
+
281
+ Notes
282
+ -----
283
+ - Handles two Orion firmware variants: with/without 'Type' column
284
+ - Fixed-width column parsing using pandas read_fwf
285
+ - Filters out 'STATUS' messages (firmware-specific)
286
+ - Frequencies formatted as 3-decimal MHz
287
+
288
+ Examples
289
+ --------
290
+ >>> parsers.orion_import(
291
+ ... file_name='C:/data/orion_site1.txt',
292
+ ... db_dir='C:/project/study.h5',
293
+ ... rec_id='ORION_SITE1',
294
+ ... study_tags=['166.380 7'],
295
+ ... scan_time=1.0,
296
+ ... channels=1
297
+ ... )
298
+
299
+ See Also
300
+ --------
301
+ ares : Similar parser for Lotek ARES receivers
302
+ srx1200 : Parser for Lotek SRX 1200 receivers
303
+ """
114
304
  # identify the receiver type
115
305
  rec_type = 'orion'
116
306
 
@@ -124,19 +314,53 @@ def orion_import(file_name,
124
314
  # with our data row, extract information using pandas fwf import procedure
125
315
  telem_dat = pd.read_fwf(file_name,colspecs = [(0,12),(13,23),(24,30),(31,35),(36,45),(46,54),(55,60),(61,65)],
126
316
  names = ['Date','Time','Site','Ant','Freq','Type','Code','power'],
127
- skiprows = 1,
128
- dtype = {'Date':str,'Time':str,'Site':np.int32,'Ant':str,'Freq':str,'Type':str,'Code':str,'power':np.float64})
317
+ skiprows = 1)#,
318
+ #dtype = {'Date':str,'Time':str,'Site':np.int32,'Ant':str,'Freq':str,'Type':str,'Code':str,'power':np.float64})
129
319
  telem_dat = telem_dat[telem_dat.Type != 'STATUS']
320
+ telem_dat['Freq'] = telem_dat.Freq.astype('float32')
321
+
322
+ telem_dat['Freq'] = telem_dat['Freq'].apply(lambda x: f"{x:.3f}")
323
+ telem_dat['Ant'] = telem_dat.Ant.astype('object')
130
324
  telem_dat.drop(['Type'], axis = 1, inplace = True)
131
325
 
132
326
  else:
133
327
  # with our data row, extract information using pandas fwf import procedure
134
328
  telem_dat = pd.read_fwf(file_name,colspecs = [(0,11),(11,20),(20,26),(26,30),(30,37),(37,42),(42,48)],
135
329
  names = ['Date','Time','Site','Ant','Freq','Code','power'],
136
- skiprows = 1,
137
- dtype = {'Date':str,'Time':str,'Site':str,'Ant':str,'Freq':str,'Code':str,'power':str})
330
+ skiprows = 1)#,
331
+ #dtype = {'Date':str,'Time':str,'Site':str,'Ant':str,'Freq':str,'Code':str,'power':str})
332
+ telem_dat['Ant'] = telem_dat.Ant.astype('object')
333
+ telem_dat['Freq'] = telem_dat.Freq.astype('float32')
334
+ telem_dat['Freq'] = telem_dat['Freq'].apply(lambda x: f"{x:.3f}")
138
335
 
139
- if len(telem_dat) > 0:
336
+
337
+ def _write_orion_subset(df, receiver_id, epoch_dtype):
338
+ df = df.copy()
339
+ df['rec_id'] = np.repeat(receiver_id, len(df))
340
+ df.drop(['Ant'], axis = 1, inplace = True)
341
+ df = df.astype({'power':'float32',
342
+ 'freq_code':'object',
343
+ 'time_stamp':'datetime64[ns]',
344
+ 'scan_time':'float32',
345
+ 'channels':'int32',
346
+ 'rec_type':'object',
347
+ 'epoch': epoch_dtype,
348
+ 'noise_ratio':'float32',
349
+ 'rec_id':'object'})
350
+
351
+ df = df[['power',
352
+ 'time_stamp',
353
+ 'epoch',
354
+ 'freq_code',
355
+ 'noise_ratio',
356
+ 'scan_time',
357
+ 'channels',
358
+ 'rec_id',
359
+ 'rec_type']]
360
+
361
+ _append_raw_data(db_dir, df, data_columns=True)
362
+
363
+ if len(telem_dat) > 0:
140
364
  # add file name to data
141
365
  #['fileName'] = np.repeat(file_name,len(telem_dat)) #Note I'm going back here to the actual file name without the path. Is that OK? I prefer it, but it's a potential source of confusion
142
366
 
@@ -153,8 +377,8 @@ def orion_import(file_name,
153
377
  if len(telem_dat) == 0:
154
378
  print ("Invalid timestamps in raw data, cannot import")
155
379
  else:
156
- # create epoch
157
- telem_dat['epoch'] = np.round((telem_dat.time_stamp - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s'),6)
380
+ # create epoch as int64 seconds
381
+ telem_dat['epoch'] = (telem_dat.time_stamp.astype('int64') // 10**9).astype('int64')
158
382
 
159
383
  # drop unnecessary columns
160
384
  telem_dat.drop (['Date','Time','Freq','Code','Site'],axis = 1, inplace = True)
@@ -165,99 +389,66 @@ def orion_import(file_name,
165
389
  telem_dat.epoch.values,
166
390
  study_tags)
167
391
 
168
- # if there is no antenna to receiver dictionary
169
- if ant_to_rec_dict == None:
170
- # drop the antenna column - we don't need it anymore
171
- telem_dat.drop(['Ant'], axis = 1, inplace = True)
172
-
173
- # add receiver id
174
- telem_dat['rec_id'] = np.repeat(rec_id,len(telem_dat))
175
-
176
- telem_dat = telem_dat.astype({'power':'float32',
177
- 'freq_code':'object',
178
- 'time_stamp':'datetime64[ns]',
179
- 'scan_time':'float32',
180
- 'channels':'int32',
181
- 'rec_type':'object',
182
- 'epoch':'float32',
183
- 'noise_ratio':'float32',
184
- 'rec_id':'object'})
185
-
186
- telem_dat = telem_dat[['power',
187
- 'time_stamp',
188
- 'epoch',
189
- 'freq_code',
190
- 'noise_ratio',
191
- 'scan_time',
192
- 'channels',
193
- 'rec_id',
194
- 'rec_type']]
195
-
196
- with pd.HDFStore(db_dir, mode='a') as store:
197
- store.append(key = 'raw_data',
198
- value = telem_dat,
199
- format = 'table',
200
- index = False,
201
- min_itemsize = {'freq_code':20,
202
- 'rec_type':20,
203
- 'rec_id':20},
204
- append = True,
205
- chunksize = 1000000,
206
- data_columns = True)
207
-
208
- # if there is an antenna to receiver dictionary
209
- else:
210
- for i in ant_to_rec_dict:
211
- # get site from dictionary
212
- site = ant_to_rec_dict[i]
213
-
214
- # get telemetryt data associated with this site
215
- telem_dat_sub = telem_dat[telem_dat.Ant == str(i)]
216
-
217
- # add receiver ID
218
- telem_dat_sub['rec_id'] = np.repeat(site,len(telem_dat_sub))
219
-
220
- # remove exctranneous columns
221
- telem_dat_sub.drop(['Ant'], axis = 1, inplace = True)
222
-
223
- telem_dat_sub = telem_dat_sub.astype({'power':'float32',
224
- 'freq_code':'object',
225
- 'time_stamp':'datetime64[ns]',
226
- 'scan_time':'float32',
227
- 'channels':'int32',
228
- 'rec_type':'object',
229
- 'epoch':'float32',
230
- 'noise_ratio':'float32',
231
- 'rec_id':'object'})
232
-
233
- telem_dat_sub = telem_dat_sub[['power',
234
- 'time_stamp',
235
- 'epoch',
236
- 'freq_code',
237
- 'noise_ratio',
238
- 'scan_time',
239
- 'channels',
240
- 'rec_id',
241
- 'rec_type']]
242
-
243
- with pd.HDFStore(db_dir, mode='a') as store:
244
- store.append(key = 'raw_data',
245
- value = telem_dat_sub,
246
- format = 'table',
247
- index = False,
248
- min_itemsize = {'freq_code':20,
249
- 'rec_type':20,
250
- 'rec_id':20},
251
- append = True,
252
- chunksize = 1000000,
253
- data_columns = True)
254
-
255
-
392
+ # if there is no antenna to receiver dictionary
393
+ if ant_to_rec_dict == None:
394
+ _write_orion_subset(telem_dat, rec_id, 'int64')
395
+ # if there is an antenna to receiver dictionary
396
+ else:
397
+ for i in ant_to_rec_dict.keys():
398
+ # get site from dictionary
399
+ site = ant_to_rec_dict[i]
400
+
401
+ # get telemetryt data associated with this site
402
+ telem_dat_sub = telem_dat[telem_dat.Ant == 1]
403
+ _write_orion_subset(telem_dat_sub, site, 'float32')
404
+ else:
405
+ raise ValueError("Invalid import parameters, no data returned")
406
+ sys.exit()
407
+
256
408
 
257
409
  def vr2_import(file_name,db_dir,study_tags, rec_id):
258
- '''Function imports raw VEMCO VR2 acoustic data.
259
-
260
- '''
410
+ """
411
+ Import Vemco VR2 acoustic receiver data into MAST HDF5 database.
412
+
413
+ Parses CSV format detection files from Vemco VR2 acoustic receivers.
414
+ VR2 data uses acoustic tags instead of radio frequencies, with different
415
+ field names and data structure.
416
+
417
+ Parameters
418
+ ----------
419
+ file_name : str
420
+ Absolute path to VR2 CSV file
421
+ db_dir : str
422
+ Absolute path to project HDF5 database
423
+ study_tags : list of str
424
+ List of valid acoustic tag codes deployed in study
425
+ rec_id : str
426
+ Unique receiver identifier
427
+
428
+ Returns
429
+ -------
430
+ None
431
+ Data appended directly to HDF5 `/raw_data` table
432
+
433
+ Notes
434
+ -----
435
+ - Acoustic receivers use different schema than radio receivers
436
+ - VR2 files typically have standardized CSV format from Vemco software
437
+ - Converts acoustic tag IDs to freq_code format for consistency
438
+
439
+ Examples
440
+ --------
441
+ >>> parsers.vr2_import(
442
+ ... file_name='C:/data/vr2_001.csv',
443
+ ... db_dir='C:/project/acoustic_study.h5',
444
+ ... study_tags=['A69-1601-12345', 'A69-1601-12346'],
445
+ ... rec_id='VR2_001'
446
+ ... )
447
+
448
+ See Also
449
+ --------
450
+ ares : Parser for radio telemetry receivers
451
+ """
261
452
 
262
453
  recType = 'vr2'
263
454
 
@@ -281,7 +472,7 @@ def vr2_import(file_name,db_dir,study_tags, rec_id):
281
472
  telem_dat['transmitter'] = telem_dat['transmitter'].str.split("-", n = 2, expand = True)[2]
282
473
  telem_dat['transmitter'] = telem_dat.transmitter.astype(str)
283
474
  telem_dat.rename(columns = {'Receiver':'rec_id','transmitter':'freq_code'}, inplace = True)
284
- telem_dat['epoch'] = np.round((telem_dat.time_stamp - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s'),6)
475
+ telem_dat['epoch'] = (telem_dat.time_stamp.astype('int64') // 10**9).astype('int64')
285
476
  try:
286
477
  telem_dat.drop (['Date and Time (UTC)', 'Transmitter Name','Transmitter Serial','Sensor Value','Sensor Unit','Station Name','Latitude','Longitude','Transmitter Type','Sensor Precision'],axis = 1, inplace = True)
287
478
  except KeyError:
@@ -292,32 +483,80 @@ def vr2_import(file_name,db_dir,study_tags, rec_id):
292
483
  # telem_dat.set_index(index,inplace = True,drop = False)
293
484
 
294
485
  telem_dat = telem_dat.astype({'power':'float32',
295
- 'freq_code':'object',
296
- 'time_stamp':'datetime64[ns]',
297
- 'scan_time':'float32',
298
- 'channels':'int32',
299
- 'rec_type':'object',
300
- 'epoch':'float32',
301
- 'noise_ratio':'float32',
302
- 'rec_id':'object'})
486
+ 'freq_code':'object',
487
+ 'time_stamp':'datetime64[ns]',
488
+ 'scan_time':'float32',
489
+ 'channels':'int32',
490
+ 'rec_type':'object',
491
+ 'epoch':'int64',
492
+ 'noise_ratio':'float32',
493
+ 'rec_id':'object'})
303
494
 
304
- with pd.HDFStore(db_dir, mode='a') as store:
305
- store.append(key = 'raw_data',
306
- value = telem_dat,
307
- format = 'table',
308
- index = False,
309
- min_itemsize = {'freq_code':20,
310
- 'rec_type':20,
311
- 'rec_id':20},
312
- append = True,
313
- chunksize = 1000000)
495
+ _append_raw_data(db_dir, telem_dat)
496
+
314
497
  def srx1200(file_name,
315
498
  db_dir,
316
499
  rec_id,
317
500
  study_tags,
318
501
  scan_time = 1,
319
502
  channels = 1,
320
- ant_to_rec_dict = None):
503
+ ant_to_rec_dict = None,
504
+ ka_format = False):
505
+ """
506
+ Import Lotek SRX-1200 receiver data into MAST HDF5 database.
507
+
508
+ Parses fixed-width format detection files from Lotek SRX-1200 receivers.
509
+ Supports both standard Lotek format and custom Kleinschmidt Associates (KA) format.
510
+
511
+ Parameters
512
+ ----------
513
+ file_name : str
514
+ Absolute path to SRX-1200 fixed-width text file
515
+ db_dir : str
516
+ Absolute path to project HDF5 database
517
+ rec_id : str
518
+ Unique receiver identifier (e.g., 'SRX1200_001')
519
+ study_tags : list of str
520
+ List of valid freq_code tags deployed in study
521
+ scan_time : float, optional
522
+ Scan duration per channel in seconds (default: 1.0)
523
+ channels : int, optional
524
+ Number of receiver channels (default: 1)
525
+ ant_to_rec_dict : dict, optional
526
+ Mapping of antenna IDs to receiver IDs for multi-antenna setups
527
+ ka_format : bool, optional
528
+ If True, parse Kleinschmidt Associates custom format (default: False)
529
+
530
+ Returns
531
+ -------
532
+ None
533
+ Data appended directly to HDF5 `/raw_data` table
534
+
535
+ Notes
536
+ -----
537
+ - Fixed-width column parsing optimized for SRX-1200 output
538
+ - Handles multi-antenna configurations via ant_to_rec_dict
539
+ - KA format includes additional metadata fields
540
+ - Power values typically in dB
541
+
542
+ Examples
543
+ --------
544
+ >>> parsers.srx1200(
545
+ ... file_name='C:/data/srx1200_site1.txt',
546
+ ... db_dir='C:/project/study.h5',
547
+ ... rec_id='SRX1200_SITE1',
548
+ ... study_tags=['166.380 7', '166.380 12'],
549
+ ... scan_time=2.5,
550
+ ... channels=1,
551
+ ... ka_format=False
552
+ ... )
553
+
554
+ See Also
555
+ --------
556
+ srx800 : Parser for SRX-800 receivers
557
+ srx600 : Parser for SRX-600 receivers
558
+ ares : Parser for ARES receivers
559
+ """
321
560
  rec_type = 'srx1200'
322
561
 
323
562
  # create empty dictionary to hold Lotek header data indexed by line number - to be imported to Pandas dataframe
@@ -467,12 +706,20 @@ def srx1200(file_name,
467
706
 
468
707
  # read in telemetry data
469
708
  if new_split == None:
470
- telem_dat = pd.read_fwf(file_name,
471
- colspecs = [(0,7),(7,25),(25,35),(35,46),(46,57),(57,68),(68,80),(80,90),(90,102),(102,110),(110,130),(130,143),(143,153)],
472
- names = ['Index','Rx Serial Number','Date','Time','[uSec]','Tag/BPM','Freq [MHz]','Codeset','Antenna','Gain','RSSI','Latitude','Longitude'],
473
- skiprows = dataRow,
474
- skipfooter = eof - dataEnd)
475
- telem_dat.drop(columns = ['Index'], inplace = True)
709
+ if ka_format == False:
710
+ telem_dat = pd.read_fwf(file_name,
711
+ colspecs = [(0,7),(7,25),(25,35),(35,46),(46,57),(57,68),(68,80),(80,90),(90,102),(102,110),(110,130),(130,143),(143,153)],
712
+ names = ['Index','Rx Serial Number','Date','Time','[uSec]','Tag/BPM','Freq [MHz]','Codeset','Antenna','Gain','RSSI','Latitude','Longitude'],
713
+ skiprows = dataRow,
714
+ skipfooter = eof - dataEnd)
715
+ telem_dat.drop(columns = ['Index'], inplace = True)
716
+ else:
717
+ telem_dat = pd.read_fwf(file_name,
718
+ colspecs = [(0,5),(6,20),(20,32),(32,43),(43,53),(53,65),(65,72),(72,85),(85,93),(93,101)],
719
+ names = ['Index','Date','Time','[uSec]','Tag/BPM','Freq [MHz]','Codeset','Antenna','Gain','RSSI'],
720
+ skiprows = dataRow,
721
+ skipfooter = eof - dataEnd)
722
+ telem_dat.drop(columns = ['Index'], inplace = True)
476
723
 
477
724
  else:
478
725
  telem_dat = pd.read_csv(file_name,
@@ -489,8 +736,8 @@ def srx1200(file_name,
489
736
 
490
737
  telem_dat['time_stamp'] = pd.to_datetime(telem_dat.time_stamp)
491
738
 
492
- # calculate Epoch
493
- telem_dat['epoch'] = (telem_dat.time_stamp - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s')
739
+ # calculate Epoch as int64 seconds
740
+ telem_dat['epoch'] = (telem_dat.time_stamp.astype('int64') // 10**9).astype('int64')
494
741
 
495
742
  # format frequency code
496
743
  telem_dat['FreqNo'] = telem_dat['Freq [MHz]'].apply(lambda x: f"{x:.3f}" )
@@ -519,30 +766,26 @@ def srx1200(file_name,
519
766
  telem_dat.reset_index(inplace = True)
520
767
 
521
768
  telem_dat = telem_dat.astype({'power':'float32',
522
- 'freq_code':'object',
523
- 'time_stamp':'datetime64[ns]',
524
- 'scan_time':'int32',
525
- 'channels':'int32',
526
- 'rec_type':'object',
527
- 'epoch':'float32',
528
- 'noise_ratio':'float32',
529
- 'rec_id':'object'})
769
+ 'freq_code':'object',
770
+ 'time_stamp':'datetime64[ns]',
771
+ 'scan_time':'float32',
772
+ 'channels':'int32',
773
+ 'rec_type':'object',
774
+ 'epoch':'int64',
775
+ 'noise_ratio':'float32',
776
+ 'rec_id':'object'})
530
777
 
531
- if new_split != None:
532
- telem_dat.drop(columns = ['index'], inplace = True)
533
- print ('fuck')
534
-
535
- with pd.HDFStore(db_dir, mode='a') as store:
536
- store.append(key = 'raw_data',
537
- value = telem_dat,
538
- format = 'table',
539
- index = False,
540
- append = True,
541
- min_itemsize = {'freq_code':20,
542
- 'rec_type':20,
543
- 'rec_id':20},
544
- chunksize = 1000000,
545
- data_columns = True,)
778
+ telem_dat = telem_dat[['power',
779
+ 'time_stamp',
780
+ 'epoch',
781
+ 'freq_code',
782
+ 'noise_ratio',
783
+ 'scan_time',
784
+ 'channels',
785
+ 'rec_id',
786
+ 'rec_type']]
787
+
788
+ _append_raw_data(db_dir, telem_dat, data_columns=True)
546
789
 
547
790
  # if the data doesn't have a header
548
791
  else:
@@ -566,7 +809,7 @@ def srx1200(file_name,
566
809
  telem_dat['time_stamp'] = pd.to_datetime(telem_dat.time_stamp)
567
810
 
568
811
  # calculate Epoch
569
- telem_dat['epoch'] = np.round((telem_dat.time_stamp - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s'),6)
812
+ telem_dat['epoch'] = (telem_dat.time_stamp.astype('int64') // 10**9).astype('int64')
570
813
 
571
814
  # format frequency code
572
815
  telem_dat['FreqNo'] = telem_dat['Freq [MHz]'].apply(lambda x: f"{x:.3f}" )
@@ -595,26 +838,26 @@ def srx1200(file_name,
595
838
  telem_dat.reset_index(inplace = True, drop = True)
596
839
 
597
840
  telem_dat = telem_dat.astype({'power':'float32',
598
- 'freq_code':'object',
599
- 'time_stamp':'datetime64[ns]',
600
- 'scan_time':'float32',
601
- 'channels':'int32',
602
- 'rec_type':'object',
603
- 'epoch':'float32',
604
- 'noise_ratio':'float32',
605
- 'rec_id':'object'})
606
-
607
- with pd.HDFStore(db_dir, mode='a') as store:
608
- store.append(key = 'raw_data',
609
- value = telem_dat,
610
- format = 'table',
611
- index = False,
612
- min_itemsize = {'freq_code':20,
613
- 'rec_type':20,
614
- 'rec_id':20},
615
- append = True,
616
- chunksize = 1000000,
617
- data_columns = True)
841
+ 'freq_code':'object',
842
+ 'time_stamp':'datetime64[ns]',
843
+ 'scan_time':'float32',
844
+ 'channels':'int32',
845
+ 'rec_type':'object',
846
+ 'epoch':'int64',
847
+ 'noise_ratio':'float32',
848
+ 'rec_id':'object'})
849
+
850
+ telem_dat = telem_dat[['power',
851
+ 'time_stamp',
852
+ 'epoch',
853
+ 'freq_code',
854
+ 'noise_ratio',
855
+ 'scan_time',
856
+ 'channels',
857
+ 'rec_id',
858
+ 'rec_type']]
859
+
860
+ _append_raw_data(db_dir, telem_dat, data_columns=True)
618
861
 
619
862
  def srx800(file_name,
620
863
  db_dir,
@@ -623,6 +866,57 @@ def srx800(file_name,
623
866
  scan_time = 1,
624
867
  channels = 1,
625
868
  ant_to_rec_dict = None):
869
+ """
870
+ Import Lotek SRX-800 receiver data into MAST HDF5 database.
871
+
872
+ Parses fixed-width format detection files from Lotek SRX-800 receivers.
873
+ Similar to SRX-1200 but with different column widths and firmware-specific
874
+ header parsing.
875
+
876
+ Parameters
877
+ ----------
878
+ file_name : str
879
+ Absolute path to SRX-800 fixed-width text file
880
+ db_dir : str
881
+ Absolute path to project HDF5 database
882
+ rec_id : str
883
+ Unique receiver identifier
884
+ study_tags : list of str
885
+ List of valid freq_code tags deployed in study
886
+ scan_time : float, optional
887
+ Scan duration per channel in seconds (default: 1.0)
888
+ channels : int, optional
889
+ Number of receiver channels (default: 1)
890
+ ant_to_rec_dict : dict, optional
891
+ Mapping of antenna IDs to receiver IDs
892
+
893
+ Returns
894
+ -------
895
+ None
896
+ Data appended directly to HDF5 `/raw_data` table
897
+
898
+ Notes
899
+ -----
900
+ - Parses SRX-800 specific header format for scan configuration
901
+ - Fixed-width column parsing adjusted for SRX-800 output
902
+ - Handles multi-antenna configurations
903
+
904
+ Examples
905
+ --------
906
+ >>> parsers.srx800(
907
+ ... file_name='C:/data/srx800_detections.txt',
908
+ ... db_dir='C:/project/study.h5',
909
+ ... rec_id='SRX800_001',
910
+ ... study_tags=['166.380 7'],
911
+ ... scan_time=2.0,
912
+ ... channels=1
913
+ ... )
914
+
915
+ See Also
916
+ --------
917
+ srx1200 : Parser for SRX-1200 receivers
918
+ srx600 : Parser for SRX-600 receivers
919
+ """
626
920
 
627
921
  rec_type = 'srx800'
628
922
 
@@ -786,6 +1080,7 @@ def srx800(file_name,
786
1080
  names = ['DayNumber','Time','ChannelID','TagID','Antenna','power'],
787
1081
  skiprows = dataRow,
788
1082
  dtype = {'ChannelID':str,'TagID':str,'Antenna':str})
1083
+ telem_dat = telem_dat.iloc[:-1]
789
1084
  telem_dat['day0'] = np.repeat(pd.to_datetime("1900-01-01"),len(telem_dat))
790
1085
  telem_dat['Date'] = telem_dat['day0'] + pd.to_timedelta(telem_dat['DayNumber'].astype(int), unit='d')
791
1086
  telem_dat['Date'] = telem_dat.Date.astype('str')
@@ -851,11 +1146,16 @@ def srx800(file_name,
851
1146
  telem_dat_sub['epoch'] = np.round((telem_dat_sub.time_stamp - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s'),6)
852
1147
 
853
1148
  # get setup number for every row
854
- try:
855
- telem_dat_sub['setup'] = get_setup(telem_dat_sub.epoch.values,
856
- setup_df.epoch.values)
857
- except:
858
- print ('why you fail?')
1149
+ try:
1150
+ telem_dat_sub['setup'] = get_setup(
1151
+ telem_dat_sub.epoch.values,
1152
+ setup_df.epoch.values
1153
+ )
1154
+ except (ValueError, TypeError, IndexError) as e:
1155
+ raise ValueError(
1156
+ f"Failed to compute setup mapping for antenna '{ant}' at site '{site}'. "
1157
+ "Check setup table epoch alignment and input data integrity."
1158
+ ) from e
859
1159
 
860
1160
  # get frequency from channel
861
1161
  telem_dat_sub['Frequency'] = get_frequency(telem_dat_sub.setup.values,
@@ -934,6 +1234,57 @@ def srx600(file_name,
934
1234
  scan_time = 1,
935
1235
  channels = 1,
936
1236
  ant_to_rec_dict = None):
1237
+ """
1238
+ Import Lotek SRX-600 receiver data into MAST HDF5 database.
1239
+
1240
+ Parses fixed-width format detection files from Lotek SRX-600 receivers.
1241
+ Similar to SRX-800/1200 but with SRX-600 specific column widths and
1242
+ header structure.
1243
+
1244
+ Parameters
1245
+ ----------
1246
+ file_name : str
1247
+ Absolute path to SRX-600 fixed-width text file
1248
+ db_dir : str
1249
+ Absolute path to project HDF5 database
1250
+ rec_id : str
1251
+ Unique receiver identifier
1252
+ study_tags : list of str
1253
+ List of valid freq_code tags deployed in study
1254
+ scan_time : float, optional
1255
+ Scan duration per channel in seconds (default: 1.0)
1256
+ channels : int, optional
1257
+ Number of receiver channels (default: 1)
1258
+ ant_to_rec_dict : dict, optional
1259
+ Mapping of antenna IDs to receiver IDs
1260
+
1261
+ Returns
1262
+ -------
1263
+ None
1264
+ Data appended directly to HDF5 `/raw_data` table
1265
+
1266
+ Notes
1267
+ -----
1268
+ - Parses SRX-600 specific header format
1269
+ - Fixed-width column parsing adjusted for SRX-600 output
1270
+ - Older receiver model with slightly different data structure
1271
+
1272
+ Examples
1273
+ --------
1274
+ >>> parsers.srx600(
1275
+ ... file_name='C:/data/srx600_detections.txt',
1276
+ ... db_dir='C:/project/study.h5',
1277
+ ... rec_id='SRX600_001',
1278
+ ... study_tags=['166.380 7'],
1279
+ ... scan_time=1.5,
1280
+ ... channels=1
1281
+ ... )
1282
+
1283
+ See Also
1284
+ --------
1285
+ srx1200 : Parser for SRX-1200 receivers
1286
+ srx800 : Parser for SRX-800 receivers
1287
+ """
937
1288
 
938
1289
  rec_type = 'srx600'
939
1290
 
@@ -1095,8 +1446,8 @@ def srx600(file_name,
1095
1446
  telem_dat_sub['time_stamp'] = pd.to_datetime(telem_dat_sub['Date'] + ' ' + telem_dat_sub['Time'])
1096
1447
  telem_dat_sub.drop(['day0','DayNumber'],axis = 1, inplace = True)
1097
1448
 
1098
- # calculate unix epoch
1099
- telem_dat['epoch'] = np.round((telem_dat.time_stamp - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s'),6)
1449
+ # calculate unix epoch as int64 seconds
1450
+ telem_dat_sub['epoch'] = (telem_dat_sub.time_stamp.astype('int64') // 10**9).astype('int64')
1100
1451
 
1101
1452
  # clean up some more
1102
1453
  telem_dat_sub.drop (['Date','Time','Frequency','TagID','ChannelID','Antenna'],axis = 1, inplace = True)
@@ -1137,17 +1488,7 @@ def srx600(file_name,
1137
1488
  'noise_ratio':'float32',
1138
1489
  'rec_id':'object'})
1139
1490
 
1140
- with pd.HDFStore(db_dir, mode='a') as store:
1141
- store.append(key = 'raw_data',
1142
- value = telem_dat_sub,
1143
- format = 'table',
1144
- index = False,
1145
- min_itemsize = {'freq_code':20,
1146
- 'rec_type':20,
1147
- 'rec_id':20},
1148
- append = True,
1149
- chunksize = 1000000,
1150
- data_columns = True)
1491
+ _append_raw_data(db_dir, telem_dat_sub, data_columns=True)
1151
1492
  else:
1152
1493
  telem_dat = pd.read_fwf(file_name,
1153
1494
  colspecs = [(0,9),(9,19),(19,29),(29,36),(36,44),(44,52)],
@@ -1212,21 +1553,552 @@ def srx600(file_name,
1212
1553
  'noise_ratio':'float32',
1213
1554
  'rec_id':'object'})
1214
1555
 
1215
- # write to SQL
1216
- with pd.HDFStore(db_dir, mode='a') as store:
1217
- store.append(key = 'raw_data',
1218
- value = telem_dat_sub,
1219
- format = 'table',
1220
- index = False,
1221
- min_itemsize = {'freq_code':20,
1222
- 'rec_type':20,
1223
- 'rec_id':20},
1224
- append = True,
1225
- chunksize = 1000000)
1556
+ _append_raw_data(db_dir, telem_dat_sub)
1557
+
1558
+
1559
+
1560
+
1561
+
1562
def PIT(file_name,
        db_dir,
        rec_id=None,
        study_tags=None,
        skiprows=6,
        scan_time=0,
        channels=0,
        rec_type="PIT",
        ant_to_rec_dict=None):
    """
    Import PIT (Passive Integrated Transponder) reader data into MAST HDF5 database.

    Parses detection files from PIT tag readers. PIT systems use RFID
    technology rather than radio telemetry, but detections are standardized
    into the same ``/raw_data`` schema so they can be analyzed with the
    same downstream tools.

    Parameters
    ----------
    file_name : str
        Absolute path to PIT reader CSV or fixed-width text file.
    db_dir : str
        Absolute path to project HDF5 database.
    rec_id : str, optional
        Unique reader identifier (required for single-antenna files).
    study_tags : list of str, optional
        List of valid PIT tag IDs deployed in study; used for the
        noise-ratio calculation.
    skiprows : int, optional
        Number of header rows to skip for fixed-width files (default: 6).
        Ignored for CSV files, where the header row is auto-detected.
    scan_time : float, optional
        Not used for PIT readers (default: 0).
    channels : int, optional
        Not used for PIT readers (default: 0).
    rec_type : str, optional
        Reader type identifier (default: 'PIT').
    ant_to_rec_dict : dict, optional
        Mapping of antenna IDs to reader IDs. When provided the parser
        runs in multi-antenna mode and assigns each detection to the
        mapped receiver.

    Returns
    -------
    None
        Data appended directly to HDF5 `/raw_data` table.

    Raises
    ------
    ValueError
        If the file cannot be decoded, no timestamp or tag-ID column can
        be located, multi-antenna mode is requested but no antenna column
        exists, or the noise-ratio computation fails.

    Notes
    -----
    - File format (CSV vs. fixed-width) is detected automatically by
      sampling the first 20 lines.
    - Tag IDs are stored in `freq_code` for consistency with radio data.
    - scan_time and channels are carried through but are not meaningful
      for PIT technology.

    Examples
    --------
    >>> parsers.PIT(
    ...     file_name='C:/data/pit_reader_001.csv',
    ...     db_dir='C:/project/pit_study.h5',
    ...     rec_id='PIT_WEIR_01',
    ...     study_tags=['3D9.1BF3C5A8B2', '3D9.1BF3C5A8C1'],
    ...     skiprows=6,
    ...     rec_type='PIT_Array'
    ... )

    See Also
    --------
    PIT_Multiple : Parser for multi-antenna PIT arrays
    """
    import re

    # Determine mode based on parameters
    is_multi_antenna = ant_to_rec_dict is not None
    mode_str = "multi-antenna" if is_multi_antenna else "single antenna"
    print(f"Parsing PIT file ({mode_str}): {file_name}")

    # Helper: locate the first column whose lowercase name contains any pattern.
    def find_column_by_patterns(df, patterns):
        for col in df.columns:
            col_lower = str(col).lower().strip()
            for pattern in patterns:
                if pattern in col_lower:
                    return col
        return None

    # Helper: sample the file head to decide CSV vs. fixed-width and where data starts.
    def analyze_file_format(file_name):
        """Dynamically determine PIT file format and header structure"""
        with open(file_name, 'r') as file:
            lines = []
            for _ in range(20):  # Read first 20 lines to analyze format
                line = file.readline()
                if not line:
                    break
                lines.append(line.rstrip('\n'))

        # Check if CSV format (look for commas in sample lines)
        csv_indicators = 0
        for line in lines[max(0, len(lines) - 10):]:  # Check last 10 lines for data
            if line.count(',') > 3:  # More than 3 commas suggests CSV
                csv_indicators += 1

        is_csv = csv_indicators > 2  # If most lines have commas, it's CSV

        # For CSV, look for header row
        actual_skiprows = 0
        if is_csv:
            for i, line in enumerate(lines):
                line_lower = line.lower()
                # Look for column headers (must contain text headers, not just data)
                if any(header in line_lower for header in ['tag', 'time', 'date', 'antenna', 'detected', 'site', 'reader']):
                    if ',' in line and not re.search(r'^\d{4}-\d{2}-\d{2}|\d{1,2}/\d{1,2}/\d{4}', line.strip()):
                        # It's a header row (has keywords but no date pattern at start)
                        actual_skiprows = 0  # Keep headers, don't skip them
                        break
            # If no header found, assume no header (skiprows = 0)
        else:
            # For fixed-width, look for data start
            for i, line in enumerate(lines):
                line_lower = line.lower()
                if 'version' in line_lower or 'ver' in line_lower:
                    print(f"Found version info: {line}")

                # Look for data start indicators
                if any(indicator in line_lower for indicator in ['scan date', 'date', 'timestamp', 'tag id']):
                    if i > 0:  # If this looks like a header row
                        actual_skiprows = i + 1
                        break

                # Check if this looks like a data line
                if re.search(r'\d{1,2}/\d{1,2}/\d{4}|\d{4}-\d{2}-\d{2}', line):
                    actual_skiprows = i
                    break

        return is_csv, actual_skiprows, lines

    # Analyze file format
    is_csv_format, detected_skiprows, sample_lines = analyze_file_format(file_name)

    # Use detected skiprows for CSV, keep provided for fixed-width
    if is_csv_format:
        skiprows = detected_skiprows
        print(f"Detected CSV format, using skiprows: {skiprows}")
    else:
        print(f"Detected fixed-width format, using skiprows: {skiprows}")

    # Parse the file based on detected format
    if is_csv_format:
        # CSV Format Parsing
        try:
            # Read CSV - pandas automatically uses the first row as headers
            telem_dat = pd.read_csv(file_name, dtype=str)
            print(f"Auto-detected columns: {list(telem_dat.columns)}")

        except (pd.errors.ParserError, UnicodeDecodeError, ValueError) as e:
            raise ValueError(
                f"CSV auto-detection failed for PIT file '{file_name}': {e}"
            ) from e

        # Find timestamp column dynamically
        timestamp_col = find_column_by_patterns(telem_dat, ['timestamp', 'time stamp', 'date', 'scan date', 'detected'])
        if timestamp_col:
            print(f"Found timestamp column: {timestamp_col}")
            # Try multiple datetime formats; None means pandas auto-detection
            for fmt in ["%m/%d/%Y %H:%M", "%Y-%m-%d %H:%M:%S", "%m/%d/%Y", "%Y-%m-%d", None]:
                try:
                    if fmt:
                        telem_dat["time_stamp"] = pd.to_datetime(telem_dat[timestamp_col], format=fmt, errors="coerce")
                    else:
                        telem_dat["time_stamp"] = pd.to_datetime(telem_dat[timestamp_col], errors="coerce")

                    # Check if parsing was successful
                    if not telem_dat["time_stamp"].isna().all():
                        print(f"Successfully parsed timestamps using format: {fmt or 'auto-detect'}")
                        break
                except (ValueError, TypeError):
                    continue
        else:
            raise ValueError("Could not find timestamp column")

        # Find tag ID column dynamically; prefer HEX IDs, fall back to decimal.
        # (Previously hard-coded to 'Tag1Hex', which raised KeyError on any
        # CSV using a different tag-column name.)
        hex_tag_col = find_column_by_patterns(telem_dat, ['hex', 'tag1hex', 'tag id', 'tagid', 'tag'])
        dec_tag_col = find_column_by_patterns(telem_dat, ['dec', 'tag1dec', 'decimal'])

        if hex_tag_col:
            print(f"Found HEX tag column: {hex_tag_col}")
            telem_dat["freq_code"] = telem_dat[hex_tag_col].astype(str).str.strip()
        elif dec_tag_col:
            print(f"Found DEC tag column: {dec_tag_col}")
            telem_dat["freq_code"] = telem_dat[dec_tag_col].astype(str).str.strip()
        else:
            raise ValueError("Could not find tag ID column")

        # Handle antenna mapping for multi-antenna files
        if is_multi_antenna:
            antenna_col = find_column_by_patterns(telem_dat, ['antenna', 'antennae', 'ant'])
            if antenna_col:
                print(f"Found antenna column: {antenna_col}")
                # Convert antenna column to integer and apply mapping
                telem_dat["antenna_clean"] = telem_dat[antenna_col].astype(str).str.extract(r'(\d+)')[0]
                telem_dat["antenna_clean"] = pd.to_numeric(telem_dat["antenna_clean"], errors='coerce').astype("Int64")
                telem_dat["rec_id"] = telem_dat["antenna_clean"].map(ant_to_rec_dict)
                # Drop rows where antenna values don't match known receivers
                telem_dat = telem_dat.dropna(subset=["rec_id"])
            else:
                raise ValueError("Multi-antenna mode requires antenna column, but none found")
        else:
            # Single antenna mode - use provided rec_id
            telem_dat["rec_id"] = rec_id

    else:
        # Fixed-Width Format Parsing

        # Read header information for format detection
        with open(file_name, 'r') as file:
            header_lines = []
            for _ in range(max(skiprows, 10)):
                line = file.readline()
                if not line:
                    break
                header_lines.append(line.rstrip('\n'))
        header_text = " ".join(header_lines).lower()

        # Define colspecs for different fixed-width formats
        if 'latitude' in header_text or 'longitude' in header_text:
            colspecs = [(0, 12), (12, 26), (26, 41), (41, 56), (56, 59), (66, 70),
                        (79, 95), (95, 112), (113, 120), (120, 131), (138, 145),
                        (145, 155), (155, 166), (166, 175)]
            col_names = ["Scan Date", "Scan Time", "Download Date", "Download Time",
                         "Reader ID", "Antenna ID", "HEX Tag ID", "DEC Tag ID",
                         "Temperature_C", "Signal_mV", "Is Duplicate", "Latitude",
                         "Longitude", "File Name"]
            print("Using format with latitude/longitude")
        else:
            colspecs = [(0, 12), (12, 26), (26, 41), (41, 56), (56, 62), (62, 73),
                        (73, 89), (89, 107), (107, 122), (122, 132), (136, 136)]
            col_names = ["Scan Date", "Scan Time", "Download Date", "Download Time",
                         "S/N", "Reader ID", "HEX Tag ID", "DEC Tag ID",
                         "Temperature_C", "Signal_mV", "Is Duplicate"]
            print("Using format without latitude/longitude")

        # Try different encodings if UTF-8 fails
        encodings_to_try = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        telem_dat = None

        for encoding in encodings_to_try:
            try:
                print(f"Attempting to read file with encoding: {encoding}")
                telem_dat = pd.read_fwf(
                    file_name,
                    colspecs=colspecs,
                    names=col_names,
                    skiprows=skiprows,
                    encoding=encoding
                )
                print(f"Successfully read with encoding: {encoding}")
                break
            except UnicodeDecodeError:
                print(f"Failed with {encoding}, trying next...")
                continue

        if telem_dat is None:
            raise ValueError(f"Could not read file with any supported encoding: {encodings_to_try}")

        print(f"Fixed-width parsing complete. Shape: {telem_dat.shape}")

        # Build timestamp from Scan Date + Scan Time
        telem_dat["time_stamp"] = pd.to_datetime(
            telem_dat["Scan Date"] + " " + telem_dat["Scan Time"],
            errors="coerce"
        )

        # Use HEX Tag ID as freq_code
        telem_dat["freq_code"] = telem_dat["HEX Tag ID"].str.strip()

        # For fixed-width, assign rec_id or map antennas if multi-antenna mapping provided
        if ant_to_rec_dict is None:
            telem_dat["rec_id"] = rec_id
        else:
            # try to find an antenna column in the fixed-width frame
            antenna_col = None
            for col in telem_dat.columns:
                col_lower = str(col).lower().strip()
                if col_lower in ('antenna id', 'antenna', 'ant', 'antennae', 'antennae id'):
                    antenna_col = col
                    break

            if antenna_col is not None:
                # extract numeric antenna identifier and map using provided dictionary
                telem_dat['antenna_raw'] = telem_dat[antenna_col].astype(str).str.strip()
                # Try numeric extraction first, then fall back to raw string mapping
                telem_dat['antenna_num'] = telem_dat['antenna_raw'].str.extract(r'(\d+)')[0]
                telem_dat['antenna_num'] = pd.to_numeric(telem_dat['antenna_num'], errors='coerce')

                # Prepare mapping dict keys as strings and ints for robust lookup
                ant_map = {}
                for k, v in ant_to_rec_dict.items():
                    key_str = str(k).strip()
                    if key_str.isdigit():
                        ant_map[int(key_str)] = v
                    ant_map[key_str] = v

                # Map by numeric antenna if possible, else by raw string
                telem_dat['rec_id'] = telem_dat['antenna_num'].map(ant_map)
                missing_mask = telem_dat['rec_id'].isna()
                if missing_mask.any():
                    # try mapping by raw string for missing ones
                    telem_dat.loc[missing_mask, 'rec_id'] = telem_dat.loc[missing_mask, 'antenna_raw'].map(ant_map)

                # report mapping summary for debugging
                unique_antennas = telem_dat['antenna_raw'].unique()[:20]
                print('Detected antenna values (sample):', unique_antennas)
                mapped_counts = telem_dat['rec_id'].notna().sum()
                print(f'Mapped {mapped_counts} / {len(telem_dat)} rows to receivers via ant_to_rec_dict')

                # drop detections that do not map to a known receiver
                telem_dat = telem_dat.dropna(subset=['rec_id'])
            else:
                raise ValueError('Multi-antenna fixed-width PIT file requires an antenna column but none was found')

    # Data cleaning - remove invalid entries
    print(f"\nCleaning data - original records: {len(telem_dat)}")

    # Remove header artifacts that may have been parsed as data rows
    header_patterns = ['HEX Tag ID', 'DEC Tag ID', '----', '====', 'Tag ID', 'Scan Date']
    for pattern in header_patterns:
        telem_dat = telem_dat[telem_dat['freq_code'] != pattern]

    # Remove separator lines
    telem_dat = telem_dat[~telem_dat['freq_code'].str.match(r'^-+$', na=False)]

    # Remove rows with invalid timestamps
    telem_dat = telem_dat[~telem_dat['time_stamp'].isna()]

    # Remove rows with invalid freq_codes
    telem_dat = telem_dat[telem_dat['freq_code'].str.len() > 3]
    telem_dat = telem_dat[~telem_dat['freq_code'].isna()]

    # Finalize fields and append to HDF5 /raw_data
    if len(telem_dat) == 0:
        print('No valid PIT rows after cleaning; nothing to append')
        return

    if 'power' not in telem_dat.columns:
        telem_dat['power'] = np.nan

    # compute epoch as int64 seconds and other derived fields
    telem_dat['epoch'] = (pd.to_datetime(telem_dat['time_stamp']).astype('int64') // 10**9).astype('int64')
    telem_dat['channels'] = np.repeat(channels, len(telem_dat))
    telem_dat['scan_time'] = np.repeat(scan_time, len(telem_dat))
    telem_dat['rec_type'] = np.repeat(rec_type, len(telem_dat))

    # compute noise ratio if study_tags provided
    try:
        telem_dat['noise_ratio'] = predictors.noise_ratio(
            5.0,
            telem_dat.freq_code.values,
            telem_dat.epoch.values,
            study_tags
        )
    except (ValueError, TypeError, KeyError, IndexError) as e:
        raise ValueError(f"Failed to compute noise_ratio for PIT data: {e}") from e

    # ensure dtypes
    telem_dat = telem_dat.astype({'time_stamp': 'datetime64[ns]',
                                  'epoch': 'int64',
                                  'freq_code': 'object',
                                  'power': 'float32',
                                  'rec_id': 'object',
                                  'rec_type': 'object',
                                  'scan_time': 'float32',
                                  'channels': 'int32',
                                  'noise_ratio': 'float32'})

    # reorder columns to match expected schema
    cols = ['time_stamp', 'epoch', 'freq_code', 'power', 'noise_ratio', 'scan_time', 'channels', 'rec_id', 'rec_type']
    cols_existing = [c for c in cols if c in telem_dat.columns]

    _append_raw_data(db_dir, telem_dat[cols_existing], data_columns=True)
    print(f"Successfully parsed {file_name}: appended {len(telem_dat)} records to {db_dir}")
1945
+
1946
+
1947
def PIT_Multiple(
        file_name,
        db_dir,
        study_tags=None,
        skiprows=0,
        scan_time=0,
        channels=0,
        rec_type="PIT_Multiple",
        ant_to_rec_dict=None
):
    """
    Import multi-antenna PIT array data into MAST HDF5 database.

    Parses detection files from PIT reader arrays with multiple antennas at
    a single location. Handles antenna-to-receiver mapping and converts
    multi-antenna detections to individual receiver records.

    Parameters
    ----------
    file_name : str
        Absolute path to PIT array CSV file.
    db_dir : str
        Absolute path to project HDF5 database.
    study_tags : list of str, optional
        List of valid PIT tag IDs deployed in study.
    skiprows : int, optional
        Number of header rows to skip (default: 0).
    scan_time : float, optional
        Not used for PIT readers (default: 0).
    channels : int, optional
        Not used for PIT readers (default: 0).
    rec_type : str, optional
        Reader type identifier (default: 'PIT_Multiple').
    ant_to_rec_dict : dict
        Mapping of antenna IDs to receiver IDs. REQUIRED — each antenna
        becomes a virtual "receiver" in the output.

    Returns
    -------
    None
        Data appended directly to HDF5 `/raw_data` table.

    Raises
    ------
    ValueError
        If `ant_to_rec_dict` is not provided.

    Notes
    -----
    - Designed for PIT arrays with multiple antennas at a single location.
    - Expects the fixed 24-column fish-metadata CSV layout (species,
      weight, length, capture method, ...); the hex tag ID (`Tag1Hex`)
      becomes `freq_code`.
    - Rows whose antenna value maps to no known receiver, or whose
      timestamp cannot be parsed as ``%m/%d/%Y %H:%M``, are dropped.

    Examples
    --------
    >>> ant_map = {
    ...     'Antenna1': 'PIT_WEIR_DOWNSTREAM',
    ...     'Antenna2': 'PIT_WEIR_UPSTREAM',
    ...     'Antenna3': 'PIT_WEIR_LADDER'
    ... }
    >>> parsers.PIT_Multiple(
    ...     file_name='C:/data/pit_array_detections.csv',
    ...     db_dir='C:/project/pit_study.h5',
    ...     study_tags=['3D9.1BF3C5A8B2'],
    ...     rec_type='PIT_Multiple',
    ...     ant_to_rec_dict=ant_map
    ... )

    See Also
    --------
    PIT : Parser for single PIT readers
    """
    # Fail fast before any file I/O: the antenna map is mandatory here.
    if ant_to_rec_dict is None:
        raise ValueError("ant_to_rec_dict is required for PIT_Multiple")
    mode_str = "multi-antenna"

    # Define column names based on the expected structure of the CSV
    col_names = [
        "FishId", "Tag1Dec", "Tag1Hex", "Tag2Dec", "Tag2Hex", "FloyTag", "RadioTag",
        "Location", "Source", "FishSpecies", "TimeStamp", "Weight", "Length",
        "Antennae", "Latitude", "Longitude", "SampleDate", "CaptureMethod",
        "LocationDetail", "Type", "Recapture", "Sex", "GeneticSampleID", "Comments"
    ]

    # Read the CSV into a DataFrame, skipping rows if needed
    telem_dat = pd.read_csv(file_name, names=col_names, header=0, skiprows=skiprows, dtype=str)

    # Convert "TimeStamp" to datetime with explicit format
    telem_dat["time_stamp"] = pd.to_datetime(telem_dat["TimeStamp"], format="%m/%d/%Y %H:%M", errors="coerce")

    # Ensure "Tag1Dec" and "Tag1Hex" are treated as strings (avoid scientific notation issues)
    telem_dat["Tag1Dec"] = telem_dat["Tag1Dec"].astype(str)
    telem_dat["Tag1Hex"] = telem_dat["Tag1Hex"].astype(str)

    telem_dat["freq_code"] = telem_dat["Tag1Hex"].astype(str).str.strip()

    # Map antennas to receivers: try the extracted numeric antenna ID first,
    # then fall back to the raw string for any rows that did not match.
    antenna_raw = telem_dat["Antennae"].astype(str).str.strip()
    antenna_num = pd.to_numeric(antenna_raw.str.extract(r"(\d+)")[0], errors="coerce")
    rec_id = antenna_num.map(ant_to_rec_dict)
    if rec_id.isna().any():
        rec_id = rec_id.fillna(antenna_raw.map(ant_to_rec_dict))
    telem_dat["rec_id"] = rec_id
    telem_dat = telem_dat.dropna(subset=["rec_id"])

    # Standardize columns (PIT has no radio power/noise; fill with zeros)
    telem_dat["power"] = 0.0
    telem_dat["noise_ratio"] = 0.0
    telem_dat["scan_time"] = scan_time
    telem_dat["channels"] = channels
    telem_dat["rec_type"] = rec_type

    # Calculate epoch time (seconds since 1970-01-01) as int64.
    # Use integer seconds to avoid float32 precision loss for large epoch values.
    # Ensure time_stamp has no NaT rows before converting.
    telem_dat = telem_dat[~telem_dat["time_stamp"].isna()].copy()
    telem_dat["epoch"] = telem_dat["time_stamp"].astype('int64') // 10**9

    # Convert to standard data types
    telem_dat = telem_dat.astype({
        "power": "float32",
        "freq_code": "object",
        "time_stamp": "datetime64[ns]",
        "scan_time": "float32",
        "channels": "int32",
        "rec_type": "object",
        "epoch": "int64",
        "noise_ratio": "float32",
        "rec_id": "object"
    })

    # Keep only standard columns
    telem_dat = telem_dat[
        ["power", "time_stamp", "epoch", "freq_code", "noise_ratio",
         "scan_time", "channels", "rec_id", "rec_type"]
    ]

    # Append to HDF5 store via the shared helper (same path as the other
    # parsers in this module, which sets format/table, min_itemsize, etc.)
    _append_raw_data(db_dir, telem_dat, data_columns=True)

    print(f"\nSuccessfully parsed {file_name} and appended to {db_dir}!")
    print(f"Imported {len(telem_dat)} records in {mode_str} mode")
2098
+
2099
+
2100
+
2101
+
2102
+
2103
+
2104
+