pymast 0.0.6__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pymast/radio_project.py CHANGED
@@ -1,7 +1,86 @@
1
1
  # -*- coding: utf-8 -*-
2
+ """
3
+ Radio telemetry project management and HDF5 database operations.
2
4
 
3
- '''
4
- Module contains all of the functions to create a radio telemetry project.'''
5
+ This module provides the `radio_project` class, the central object for managing
6
+ radio telemetry studies. It handles project initialization, data import, metadata
7
+ storage, and database operations using HDF5 format.
8
+
9
+ Core Responsibilities
10
+ ---------------------
11
+ - **Project Initialization**: Create HDF5 database with standardized table structure
12
+ - **Data Import**: Batch import from multiple receiver types and file formats
13
+ - **Metadata Management**: Store tags, receivers, recaptures, nodes, lines
14
+ - **Recapture Generation**: Process raw detections into spatiotemporal recaptures
15
+ - **Query Interface**: Retrieve fish tracks, detection statistics, project metadata
16
+
17
+ HDF5 Database Structure
18
+ -----------------------
19
+ The project database contains these primary tables:
20
+
21
+ - `/raw_data`: Imported receiver detections (time_stamp, freq_code, power, etc.)
22
+ - `/tblMasterTag`: Tag metadata (freq_code, pulse_rate, tag_type, release info)
23
+ - `/tblMasterReceiver`: Receiver metadata (rec_id, rec_type, latitude, longitude)
24
+ - `/recaptures`: Processed detections linked to spatial locations and tags
25
+ - `/nodes`: Spatial nodes for state-space modeling
26
+ - `/lines`: Connectivity between nodes for movement modeling
27
+
28
+ Classification and Filtering Tables:
29
+
30
+ - `/training`: Hand-labeled detections for classifier training
31
+ - `/test`: Detections scored by Naive Bayes classifier
32
+ - `/overlapping`: Overlapping detection decisions from overlap_reduction
33
+ - `/bouts`: Bout summaries from DBSCAN clustering
34
+ - `/presence`: Presence/absence by bout and receiver
35
+
36
+ Statistical Model Tables:
37
+
38
+ - `/cjs`: Cormack-Jolly-Seber capture history
39
+ - `/lrdr`: Live-recapture dead-recovery format
40
+ - `/tte`: Time-to-event format for survival analysis
41
+
42
+ Typical Usage
43
+ -------------
44
+ >>> from pymast.radio_project import radio_project
45
+ >>>
46
+ >>> # Initialize new project (tags, receivers, and nodes are pandas DataFrames)
47
+ >>> proj = radio_project(
48
+ ... project_dir='C:/projects/my_study',
49
+ ... db_name='my_study',
50
+ ... detection_count=5,
51
+ ... duration=1.0,
52
+ ... tag_data=tags,
53
+ ... receiver_data=receivers,
54
+ ... nodes_data=nodes
+ ... )
55
+ >>>
56
+ >>> # Import raw receiver data
57
+ >>> proj.telem_data_import(
58
+ ... rec_id='REC001',
59
+ ... rec_type='ares',
60
+ ... file_dir='C:/projects/my_study/Training_Files',
61
+ ... db_dir=proj.db,
62
+ ... scan_time=1.0,
63
+ ... channels=1
+ ... )
64
+ >>>
65
+ >>> # Generate recaptures table
66
+ >>> proj.make_recaptures_table()
67
+ >>>
68
+ >>> # Query fish tracks
69
+ >>> tracks = proj.get_fish_tracks(freq_code='166.380 7')
70
+
71
+ Notes
72
+ -----
73
+ - HDF5 format provides fast queries, compression, and hierarchical organization
74
+ - All tables use indexed columns for performance (freq_code, rec_id, time_stamp)
75
+ - Receiver imports are append-only (no overwrites unless db_dir deleted)
76
+ - Project metadata stored in HDF5 attributes for provenance
77
+
78
+ See Also
79
+ --------
80
+ parsers : Data import from various receiver formats
81
+ overlap_removal : Detection filtering and bout analysis
82
+ formatter : Statistical model output generation
83
+ """
5
84
 
6
85
  # import modules required for function dependencies
7
86
  import numpy as np
@@ -9,19 +88,38 @@ import pandas as pd
9
88
  import os
10
89
  import h5py
11
90
  import datetime
91
+ import logging
12
92
  import pymast.naive_bayes as naive_bayes
13
93
  import pymast.parsers as parsers
14
94
  import pymast.predictors as predictors
15
95
  import matplotlib.pyplot as plt
16
96
  from matplotlib import rcParams
17
97
  from scipy import interpolate
98
+ try:
99
+ from tqdm import tqdm
100
+ except ImportError:
101
+ def tqdm(iterable, **kwargs):
102
+ return iterable
103
+ import shutil
18
104
  import warnings
19
- warnings.filterwarnings("ignore")
105
+ import dask.dataframe as dd
106
+ import dask.array as da
107
+ try:
108
+ from dask_ml.cluster import KMeans
109
+ _KMEANS_IMPL = 'dask'
110
+ except ImportError:
111
+ from sklearn.cluster import KMeans
112
+ _KMEANS_IMPL = 'sklearn'
113
+
114
+ # Initialize logger
115
+ logger = logging.getLogger('pymast.radio_project')
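Because the module now reports progress through the standard logging package instead of print, callers only see these messages if they configure a handler. A minimal sketch for a script or notebook:

import logging

# send pymast's progress messages (logger name 'pymast.radio_project') to the console;
# use logging.DEBUG instead of INFO for per-file and per-fish detail
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(name)s %(levelname)s: %(message)s')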
20
116
 
21
117
  font = {'family': 'serif','size': 6}
22
118
  rcParams['font.size'] = 6
23
119
  rcParams['font.family'] = 'serif'
24
120
 
121
+ push = 'push'
122
+
25
123
  class radio_project():
26
124
  '''
27
125
  A class to manage and organize data and parameters for a Radio Telemetry project.
@@ -46,28 +144,91 @@ class radio_project():
46
144
  '''
47
145
 
48
146
  def __init__(self, project_dir, db_name, detection_count, duration, tag_data, receiver_data, nodes_data = None):
49
- '''
50
- Initializes the radio_project class with project parameters and datasets.
147
+ """
148
+ Initialize a radio telemetry project for data management and analysis.
51
149
 
52
- Sets up the project directory structure, initializes the project database, and stores the provided datasets.
150
+ This constructor sets up the complete project infrastructure including:
151
+ - Directory structure for data, training files, and outputs
152
+ - HDF5 database for efficient data storage
153
+ - Tag, receiver, and node metadata
53
154
 
54
- Parameters:
55
- - project_dir (str): The root directory for the project.
56
- - db_name (str): The name of the database file to be created or used.
57
- - det (DataFrame or similar): Data containing detection information.
58
- - duration (int or float): The duration of the project or a related parameter.
59
- - tag_data (DataFrame or similar): Data containing information about the tags.
60
- - receiver_data (DataFrame or similar): Data containing information about the receivers.
61
- - nodes_data (DataFrame or similar, optional): Data containing information about the nodes, if applicable.
62
-
63
- The method creates the necessary directories for the project, initializes the HDF5 database, and sets up the class attributes.
64
- '''
155
+ Parameters
156
+ ----------
157
+ project_dir : str
158
+ Root directory for the project. Recommended to avoid spaces in path.
159
+ db_name : str
160
+ Name of the HDF5 database file (without .h5 extension).
161
+ detection_count : int
162
+ Number of detections to include in detection history window for
163
+ predictor calculation. Typical values: 3-7.
164
+ duration : float
165
+ Time window in seconds for noise ratio calculation.
166
+ Typical values: 1.0-5.0 seconds.
167
+ tag_data : pandas.DataFrame
168
+ Master tag table with required columns:
169
+ - freq_code (str): Unique frequency-code combination
170
+ - pulse_rate (float): Seconds between tag pulses
171
+ - tag_type (str): 'study', 'BEACON', or 'TEST'
172
+ - rel_date (datetime): Release date and time
173
+ See docs/API_REFERENCE.md for complete schema.
174
+ receiver_data : pandas.DataFrame
175
+ Master receiver table with required columns:
176
+ - rec_id (str): Unique receiver identifier
177
+ - rec_type (str): Receiver type ('srx600', 'srx800', etc.)
178
+ - node (str): Associated network node ID
179
+ nodes_data : pandas.DataFrame, optional
180
+ Network nodes table with columns:
181
+ - node (str): Unique node identifier
182
+ - X (int): X coordinate for visualization
183
+ - Y (int): Y coordinate for visualization
184
+ Required for movement analysis and overlap removal.
185
+
186
+ Raises
187
+ ------
188
+ ValueError
189
+ If required columns are missing from input DataFrames.
190
+ OSError
191
+ If project directory cannot be created.
192
+
193
+ Examples
194
+ --------
195
+ >>> import pandas as pd
196
+ >>> from pymast.radio_project import radio_project
197
+ >>>
198
+ >>> # Load input data
199
+ >>> tags = pd.read_csv('tblMasterTag.csv')
200
+ >>> receivers = pd.read_csv('tblMasterReceiver.csv')
201
+ >>> nodes = pd.read_csv('tblNodes.csv')
202
+ >>>
203
+ >>> # Create project
204
+ >>> project = radio_project(
205
+ ... project_dir='/path/to/project',
206
+ ... db_name='my_study',
207
+ ... detection_count=5,
208
+ ... duration=1.0,
209
+ ... tag_data=tags,
210
+ ... receiver_data=receivers,
211
+ ... nodes_data=nodes
212
+ ... )
213
+
214
+ Notes
215
+ -----
216
+ The project directory structure will be created as:
217
+ - project_dir/
218
+ - Data/ (raw data storage)
219
+ - Training_Files/ (receiver data files)
220
+ - Output/ (processed data and exports)
221
+ - Figures/ (generated plots)
222
+ - my_study.h5 (HDF5 database)
223
+ """
65
224
  # set model parameters
66
225
  self.project_dir = project_dir
67
226
  self.db_name = db_name
68
227
  self.db = os.path.join(project_dir,'%s.h5'%(db_name))
69
228
  self.tags = tag_data
70
229
  self.study_tags = self.tags[self.tags.tag_type == 'study'].freq_code.values
230
+ self.test_tags = self.tags[self.tags.tag_type == 'TEST'].freq_code.values
231
+ self.beacon_tags = self.tags[self.tags.tag_type == 'BEACON'].freq_code.values
71
232
  self.tags.set_index('freq_code', inplace = True)
72
233
  self.receivers = receiver_data
73
234
  self.receivers.set_index('rec_id', inplace = True)
@@ -96,6 +257,10 @@ class radio_project():
96
257
  os.makedirs(self.figures_dir)
97
258
  self.figure_ws = os.path.join(project_dir,'Output','Figures')
98
259
 
260
+ # When running in automated/non-interactive mode, set this flag to True to avoid input() prompts.
261
+ # By default it is left interactive (False) so the user can respond to prompts.
262
+ self.non_interactive = False
263
+
99
264
  # create a project database and write initial arrays to HDF
100
265
  self.initialize_hdf5()
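For reference, a minimal sketch of input tables that satisfy the column requirements documented above; all values are hypothetical, the full schema may include additional fields (see docs/API_REFERENCE.md), and real studies normally read these from CSV files as in the Examples section:

import pandas as pd
from pymast.radio_project import radio_project

tags = pd.DataFrame({'freq_code': ['164.123 45'],
                     'pulse_rate': [3.0],
                     'tag_type': ['study'],
                     'rel_date': [pd.Timestamp('2024-05-01 08:00:00')]})
receivers = pd.DataFrame({'rec_id': ['R01'], 'rec_type': ['orion'], 'node': ['N1']})
nodes = pd.DataFrame({'node': ['N1'], 'X': [0], 'Y': [0]})

proj = radio_project('C:/projects/demo', 'demo', 5, 1.0, tags, receivers, nodes)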
101
266
 
@@ -111,6 +276,115 @@ class radio_project():
111
276
  self.tags.to_hdf(self.db, key='/project_setup/tags', mode='a')
112
277
  self.receivers.to_hdf(self.db, key='/project_setup/receivers', mode='a')
113
278
  self.nodes.to_hdf(self.db, key='/project_setup/nodes', mode='a')
279
+ else:
280
+ # Project already exists - check for new tags and merge if needed
281
+ try:
282
+ existing_tags = pd.read_hdf(self.db, key='/project_setup/tags')
283
+
284
+ # Reset index on incoming tags for comparison (it gets set later in __init__)
285
+ incoming_tags = self.tags.copy()
286
+ if incoming_tags.index.name == 'freq_code':
287
+ incoming_tags = incoming_tags.reset_index()
288
+
289
+ # Find new tags not in existing database
290
+ if 'freq_code' in existing_tags.columns:
291
+ existing_freq_codes = set(existing_tags['freq_code'])
292
+ else:
293
+ existing_freq_codes = set(existing_tags.index)
294
+
295
+ incoming_freq_codes = set(incoming_tags['freq_code'])
296
+ new_freq_codes = incoming_freq_codes - existing_freq_codes
297
+
298
+ if new_freq_codes:
299
+ print(f"Found {len(new_freq_codes)} new tags to add to database: {sorted(new_freq_codes)}")
300
+
301
+ # Merge existing and new tags
302
+ new_tags_only = incoming_tags[incoming_tags['freq_code'].isin(new_freq_codes)]
303
+
304
+ # Ensure existing_tags has freq_code as column, not index
305
+ if existing_tags.index.name == 'freq_code':
306
+ existing_tags = existing_tags.reset_index()
307
+
308
+ merged_tags = pd.concat([existing_tags, new_tags_only], ignore_index=True)
309
+
310
+ # Remove the old tags table and write merged version
311
+ with pd.HDFStore(self.db, mode='a') as store:
312
+ if '/project_setup/tags' in store:
313
+ store.remove('/project_setup/tags')
314
+ store.put('/project_setup/tags',
315
+ merged_tags,
316
+ format='table',
317
+ data_columns=True)
318
+
319
+ # Update self.tags with merged data
320
+ self.tags = merged_tags.copy()
321
+ self.tags.set_index('freq_code', inplace=True)
322
+
323
+ # Update tag type arrays
324
+ self.study_tags = self.tags[self.tags.tag_type == 'study'].index.values
325
+ self.test_tags = self.tags[self.tags.tag_type == 'TEST'].index.values
326
+ self.beacon_tags = self.tags[self.tags.tag_type == 'BEACON'].index.values
327
+
328
+ print(f"Successfully added {len(new_freq_codes)} new tags to database.")
329
+ else:
330
+ print("No new tags found - database is up to date.")
331
+
332
+ except (KeyError, FileNotFoundError):
333
+ # Tags table doesn't exist yet, write it
334
+ print("Tags table not found in database, creating it now.")
335
+ self.tags.to_hdf(self.db, key='/project_setup/tags', mode='a')
336
+
337
+ # Check for new receivers and merge if needed
338
+ try:
339
+ existing_receivers = pd.read_hdf(self.db, key='/project_setup/receivers')
340
+
341
+ # Reset index on incoming receivers for comparison
342
+ incoming_receivers = self.receivers.copy()
343
+ if incoming_receivers.index.name == 'rec_id':
344
+ incoming_receivers = incoming_receivers.reset_index()
345
+
346
+ # Find new receivers not in existing database
347
+ if 'rec_id' in existing_receivers.columns:
348
+ existing_rec_ids = set(existing_receivers['rec_id'])
349
+ else:
350
+ existing_rec_ids = set(existing_receivers.index)
351
+
352
+ incoming_rec_ids = set(incoming_receivers['rec_id'])
353
+ new_rec_ids = incoming_rec_ids - existing_rec_ids
354
+
355
+ if new_rec_ids:
356
+ print(f"Found {len(new_rec_ids)} new receivers to add to database: {sorted(new_rec_ids)}")
357
+
358
+ # Merge existing and new receivers
359
+ new_receivers_only = incoming_receivers[incoming_receivers['rec_id'].isin(new_rec_ids)]
360
+
361
+ # Ensure existing_receivers has rec_id as column, not index
362
+ if existing_receivers.index.name == 'rec_id':
363
+ existing_receivers = existing_receivers.reset_index()
364
+
365
+ merged_receivers = pd.concat([existing_receivers, new_receivers_only], ignore_index=True)
366
+
367
+ # Remove the old receivers table and write merged version
368
+ with pd.HDFStore(self.db, mode='a') as store:
369
+ if '/project_setup/receivers' in store:
370
+ store.remove('/project_setup/receivers')
371
+ store.put('/project_setup/receivers',
372
+ merged_receivers,
373
+ format='table',
374
+ data_columns=True)
375
+
376
+ # Update self.receivers with merged data
377
+ self.receivers = merged_receivers.copy()
378
+ self.receivers.set_index('rec_id', inplace=True)
379
+
380
+ print(f"Successfully added {len(new_rec_ids)} new receivers to database.")
381
+ else:
382
+ print("No new receivers found - database is up to date.")
383
+
384
+ except (KeyError, FileNotFoundError):
385
+ # Receivers table doesn't exist yet, write it
386
+ print("Receivers table not found in database, creating it now.")
387
+ self.receivers.to_hdf(self.db, key='/project_setup/receivers', mode='a')
114
388
 
115
389
  if 'raw_data' not in hdf5:
116
390
  hdf5.create_group("raw_data")
@@ -131,6 +405,22 @@ class radio_project():
131
405
  hdf5.create_group('recaptures')
132
406
 
133
407
  hdf5.close()
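The merge logic above means an existing project can simply be re-instantiated when new tags or receivers are deployed; only records whose freq_code or rec_id is missing from /project_setup are appended. A sketch, assuming the same project_dir and db_name used when the project was first created (file names follow the class docstring example):

import pandas as pd
from pymast.radio_project import radio_project

tags = pd.read_csv('tblMasterTag.csv')            # now includes a few newly deployed tags
receivers = pd.read_csv('tblMasterReceiver.csv')  # receivers already in the database are left untouched
proj = radio_project('C:/projects/demo', 'demo', 5, 1.0, tags, receivers)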
408
+
409
+ def _prompt(self, prompt_text, default="no"):
410
+ """Centralized prompt helper — returns default ONLY when non_interactive is True.
411
+
412
+ By default (non_interactive=False), this will prompt the user interactively.
413
+ Set project.non_interactive = True to auto-answer with defaults.
414
+ """
415
+ if self.non_interactive:
416
+ logger.debug(f"Non-interactive mode: auto-answering '{prompt_text}' with '{default}'")
417
+ return default
418
+ try:
419
+ return input(prompt_text)
420
+ except (EOFError, OSError) as exc:
421
+ raise RuntimeError(
422
+ "Input prompt failed. Set project.non_interactive = True to use defaults."
423
+ ) from exc
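For unattended batch runs, the flag set in __init__ can be flipped so this helper returns its default instead of blocking on input(). A short sketch, assuming an initialized project proj:

proj.non_interactive = True     # auto-answer every prompt with its default
answer = proj._prompt("Do you need another classification iteration? (yes/no): ", default="no")
assert answer == "no"           # returned immediately, no console interaction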
134
424
 
135
425
  def telem_data_import(self,
136
426
  rec_id,
@@ -139,13 +429,76 @@ class radio_project():
139
429
  db_dir,
140
430
  scan_time = 1,
141
431
  channels = 1,
142
- ant_to_rec_dict = None):
432
+ ant_to_rec_dict = None,
433
+ ka_format = False):
434
+ """
435
+ Import raw telemetry data from receiver files into the project database.
436
+
437
+ Parameters
438
+ ----------
439
+ rec_id : str
440
+ Receiver ID (must exist in receiver_data)
441
+ rec_type : str
442
+ Receiver type. Supported: 'srx600', 'srx800', 'srx1200',
443
+ 'orion', 'ares', 'VR2', 'PIT', and 'PIT_Multiple'
444
+ file_dir : str
445
+ Directory containing raw data files
446
+ db_dir : str
447
+ Path to HDF5 database file
448
+ scan_time : float, optional
449
+ Channel scan time in seconds (default: 1)
450
+ channels : int, optional
451
+ Number of channels (default: 1)
452
+ ant_to_rec_dict : dict, optional
453
+ Mapping of antenna IDs to receiver IDs
454
+ ka_format : bool, optional
455
+ Use Kleinschmidt Associates format (default: False)
456
+
457
+ Raises
458
+ ------
459
+ ValueError
460
+ If rec_type is not supported or rec_id not found
461
+ FileNotFoundError
462
+ If file_dir doesn't exist or contains no data files
463
+ """
464
+ # Validate receiver type
465
+ VALID_REC_TYPES = ['srx600', 'srx800', 'srx1200', 'orion', 'ares', 'VR2', 'PIT', 'PIT_Multiple']
466
+ if rec_type not in VALID_REC_TYPES:
467
+ raise ValueError(
468
+ f"Unsupported receiver type: '{rec_type}'. "
469
+ f"Supported types: {', '.join(VALID_REC_TYPES)}"
470
+ )
471
+
472
+ # Validate receiver ID
473
+ if rec_id not in self.receivers.index:
474
+ raise ValueError(
475
+ f"Receiver '{rec_id}' not found in receiver_data. "
476
+ f"Available receivers: {', '.join(self.receivers.index)}"
477
+ )
478
+
479
+ # Validate directory exists
480
+ if not os.path.exists(file_dir):
481
+ logger.error(f"Data directory not found: {file_dir}")
482
+ raise FileNotFoundError(
483
+ f"Data directory not found: {file_dir}. "
484
+ f"Expected location: {self.training_dir}"
485
+ )
486
+
487
+ logger.info(f"Importing data for receiver {rec_id} (type: {rec_type})")
488
+ logger.info(f" Data directory: {file_dir}")
489
+
143
490
  # list raw data files
144
491
  tFiles = os.listdir(file_dir)
145
492
 
493
+ if not tFiles:
494
+ logger.warning(f"No files found in {file_dir}")
495
+ return
496
+
497
+ logger.info(f" Found {len(tFiles)} file(s) to import")
498
+
146
499
  # for every file call the correct text parser and import
147
- for f in tFiles:
148
- print ("start importing file %s"%(f))
500
+ for i, f in enumerate(tqdm(tFiles, desc=f"Importing {rec_id}", unit="file"), 1):
501
+ logger.debug(f" Processing file {i}/{len(tFiles)}: {f}")
149
502
  # get the complete file directory
150
503
  f_dir = os.path.join(file_dir,f)
151
504
 
@@ -156,7 +509,7 @@ class radio_project():
156
509
  parsers.srx800(f_dir, db_dir, rec_id, self.study_tags, scan_time = scan_time, channels = channels, ant_to_rec_dict = ant_to_rec_dict)
157
510
 
158
511
  elif rec_type == 'srx1200':
159
- parsers.srx1200(f_dir, db_dir, rec_id, self.study_tags, scan_time = scan_time, channels = channels, ant_to_rec_dict = ant_to_rec_dict)
512
+ parsers.srx1200(f_dir, db_dir, rec_id, self.study_tags, scan_time = scan_time, channels = channels, ant_to_rec_dict = ant_to_rec_dict, ka_format = ka_format)
160
513
 
161
514
  elif rec_type == 'orion':
162
515
  parsers.orion_import(f_dir,db_dir,rec_id, self.study_tags, scan_time = scan_time, channels = channels, ant_to_rec_dict = ant_to_rec_dict)
@@ -166,14 +519,26 @@ class radio_project():
166
519
 
167
520
  elif rec_type == 'ares':
168
521
  parsers.ares(f_dir,db_dir,rec_id, self.study_tags, scan_time = scan_time, channels = channels, ant_to_rec_dict = ant_to_rec_dict)
522
+
523
+ elif rec_type == 'PIT':
524
+ parsers.PIT(f_dir,db_dir,rec_id, self.study_tags, scan_time = scan_time, channels = channels, ant_to_rec_dict = ant_to_rec_dict)
525
+
526
+ elif rec_type == 'PIT_Multiple':
527
+ parsers.PIT_Multiple(f_dir, db_dir,
528
+ study_tags=self.study_tags,
529
+ ant_to_rec_dict=ant_to_rec_dict,
530
+ scan_time=scan_time,
531
+ channels=channels)
532
+
169
533
  else:
170
- print ("There currently is not an import routine created for this receiver type. Please try again")
171
-
172
- print ("File %s imported"%(f))
534
+ logger.error(f"No import routine for receiver type: {rec_type}")
535
+ raise ValueError(f"No import routine available for receiver type: {rec_type}")
173
536
 
174
- print ("Raw Telemetry Data Import Completed")
537
+ logger.info(f" Import complete for receiver {rec_id}: {len(tFiles)} file(s) processed")
175
538
 
176
539
  def get_fish(self, rec_id, train = True, reclass_iter = None):
540
+ logger.info(f"Getting fish for receiver {rec_id}")
541
+ logger.debug(f" Mode: {'training' if train else 'classification'}, Iteration: {reclass_iter}")
177
542
 
178
543
  tags_no_idx = self.tags.reset_index(drop = False)
179
544
 
@@ -182,6 +547,7 @@ class radio_project():
182
547
  key = 'raw_data',
183
548
  where = f'rec_id = "{rec_id}"')
184
549
  dat = pd.merge(dat, tags_no_idx, on='freq_code', how='left')
550
+ dat = dat[(dat.tag_type != 'TEST') & (dat.tag_type != 'BEACON')]
185
551
  dat = dat[(dat.tag_type != 'beacon') & (dat.tag_type != 'test')]
186
552
 
187
553
  elif reclass_iter == None and train == False:
@@ -189,7 +555,7 @@ class radio_project():
189
555
  key = 'raw_data',
190
556
  where = f'rec_id = "{rec_id}"')
191
557
  dat = pd.merge(dat, tags_no_idx, on='freq_code', how='left')
192
- dat = dat[dat.tag_type == 'study']
558
+ dat = dat[(dat.tag_type == 'study') | (dat.tag_type == 'STUDY')]
193
559
 
194
560
  else:
195
561
  itr = reclass_iter -1
@@ -199,104 +565,153 @@ class radio_project():
199
565
  dat = pd.merge(dat, tags_no_idx, on='freq_code', how='left')
200
566
  dat = dat[dat.tag_type == 'study']
201
567
 
202
- return dat.freq_code.unique()
568
+ fish_list = dat.freq_code.unique()
569
+ logger.info(f" Found {len(fish_list)} unique fish")
570
+ return fish_list
571
+
572
+ def orphan_tags(self, return_rows=False):
573
+ """Return orphan tags or their recapture rows.
574
+
575
+ By default returns a sorted list of orphan `freq_code` strings (tags
576
+ present in `/recaptures` but missing from `/project_setup/tags`). If
577
+ `return_rows=True` returns the recaptures DataFrame rows for those tags.
578
+ """
579
+ recaps = pd.read_hdf(self.db, 'recaptures')
580
+ recaps['freq_code'] = recaps['freq_code'].astype(str)
581
+
582
+ master = self.tags.copy()
583
+ if master.index.name == 'freq_code':
584
+ master_codes = set(master.index.astype(str))
585
+ else:
586
+ master_codes = set(master['freq_code'].astype(str))
587
+
588
+ recap_codes = set(recaps['freq_code'].unique())
589
+ orphans = sorted(list(recap_codes - master_codes))
590
+
591
+ if return_rows:
592
+ if not orphans:
593
+ return pd.DataFrame(columns=recaps.columns)
594
+ return recaps[recaps['freq_code'].isin(orphans)].copy()
595
+ return orphans
203
596
 
204
597
  def train(self, freq_code, rec_id):
205
- '''A class object for a training dataframe and related data objects.
206
-
207
- This class object creates a training dataframe for animal i at site j.
598
+ """
599
+ Train the Naive Bayes classifier using a specific tag at a receiver.
600
+
601
+ This method calculates predictor variables for all detections of the
602
+ specified tag and stores them in the training dataset. Training data
603
+ includes both known true positives (from beacon/test tags) and known
604
+ false positives (miscoded detections).
605
+
606
+ Parameters
607
+ ----------
608
+ freq_code : str
609
+ Frequency-code combination to train on (e.g., '164.123 45').
610
+ Must exist in the tag_data provided during initialization.
611
+ rec_id : str
612
+ Receiver ID where training data was collected.
613
+ Must exist in the receiver_data provided during initialization.
614
+
615
+ Returns
616
+ -------
617
+ None
618
+ Training data is written to HDF5 database at /trained key.
619
+
620
+ Raises
621
+ ------
622
+ KeyError
623
+ If freq_code or rec_id not found in project data.
624
+ ValueError
625
+ If insufficient data for training (e.g., no detections).
626
+
627
+ Examples
628
+ --------
629
+ >>> # Train on a single tag
630
+ >>> project.train('164.123 45', 'R01')
631
+
632
+ >>> # Train on all tags at a receiver
633
+ >>> fishes = project.get_fish(rec_id='R01')
634
+ >>> for fish in fishes:
635
+ ... project.train(fish, 'R01')
636
+
637
+ See Also
638
+ --------
639
+ training_summary : Generate statistics and plots from training data
640
+ reclassify : Apply trained classifier to classify detections
641
+
642
+ Notes
643
+ -----
644
+ Predictor variables calculated:
645
+ - hit_ratio: Proportion of expected detections received
646
+ - cons_length: Maximum consecutive detection length
647
+ - noise_ratio: Ratio of miscoded to total detections
648
+ - power: Signal strength
649
+ - lag_diff: Second-order difference in detection timing
650
+ """
208
652
 
209
- when class is initialized, we will extract information for this animal (i)
210
- at reciever (site) from the project database (projectDB).
211
- '''
212
- # pull raw data
653
+ # Pull raw data
213
654
  train_dat = pd.read_hdf(self.db,
214
655
  'raw_data',
215
- where = f'(freq_code == "{freq_code}") & (rec_id == "{rec_id}")')
216
-
217
- # do some data management when importing training dataframe
656
+ where=f'(freq_code == "{freq_code}") & (rec_id == "{rec_id}")')
657
+
658
+ # Data management
218
659
  train_dat['time_stamp'] = pd.to_datetime(train_dat.time_stamp)
219
- train_dat['epoch'] = np.round((train_dat.time_stamp - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s'),6)
220
- train_dat.sort_values(by = 'epoch', inplace = True)
660
+ train_dat['epoch'] = (train_dat.time_stamp.astype('int64') // 10**9).astype('int64')
661
+ train_dat.sort_values(by='epoch', inplace=True)
221
662
 
222
- train_dat.drop_duplicates(subset = 'time_stamp',
223
- keep = 'first',
224
- inplace = True)
663
+ train_dat.drop_duplicates(subset='time_stamp', keep='first', inplace=True)
225
664
 
226
- # set some object variables
227
- rec_type = self.receivers.at[rec_id,'rec_type']
228
-
229
- # for training data, we know the tag's detection class ahead of time,
230
- # if the tag is in the study tag list, it is a known detection class, if
231
- # it is a beacon tag, it is definite, if it is a study tag, it's plausible
665
+ # Object variables
666
+ if self.receivers.index.dtype != 'object':
667
+ rec_id = np.int64(rec_id)
668
+ rec_type = self.receivers.at[rec_id, 'rec_type']
669
+
670
+ # Detection class
232
671
  if freq_code in self.study_tags:
233
672
  plausible = 1
234
673
  else:
235
674
  plausible = 0
236
- # get rate
675
+
676
+ # Get rate
237
677
  if freq_code in self.tags.index:
238
- pulse_rate = self.tags.at[freq_code,'pulse_rate']
678
+ pulse_rate = self.tags.at[freq_code, 'pulse_rate']
239
679
  else:
240
- pulse_rate = 3.0
241
- # if self.tags.at[freq_code,'mort_rate'] == np.nan or self.tags.at[freq_code,'mort_rate'] == 0:
242
- # mort_rate = 9999.0
243
- # else:
244
- # mort_rate = self.tags.at[freq_code,'mort_rate']
245
-
680
+ pulse_rate = 673.
681
+
246
682
  mort_rate = 8888.
247
- # calculate predictors
248
- # if plausible == 1:
249
- # print ('debug check det hist')
250
- train_dat['detection'] = np.repeat(plausible,len(train_dat))
683
+ train_dat['detection'] = np.repeat(plausible, len(train_dat))
251
684
  train_dat['lag'] = train_dat.epoch.diff()
252
685
  train_dat['lag_diff'] = train_dat.lag.diff()
253
686
 
254
- det_hist_dict = {}
255
- hit_ratio_dict = {}
256
- cons_det_dict = {}
257
- max_count_dict = {}
258
-
259
- for ch in train_dat.channels.unique():
260
- train_dat_sub = train_dat[train_dat.channels == ch]
261
- det_hist_string, hit_ratio, cons_det, max_count \
262
- = predictors.detection_history(train_dat_sub.epoch.values,
263
- pulse_rate,
264
- self.det_count,
265
- ch,
266
- train_dat_sub.scan_time.values,
267
- train_dat_sub.channels.values)
268
-
269
- det_hist_dict[ch] = det_hist_string
270
- hit_ratio_dict[ch] = hit_ratio
271
- cons_det_dict[ch] = cons_det
272
- max_count_dict[ch] = max_count
273
-
274
- det_hist_string_arrs = list(det_hist_dict.values())
275
- det_hist_string_arr = np.hstack(det_hist_string_arrs)
276
- hit_ratio_arrs = list(hit_ratio_dict.values())
277
- hit_ratio_arr = np.hstack(hit_ratio_arrs)
278
- cons_det_arrs = list(cons_det_dict.values())
279
- cons_det_arr = np.hstack(cons_det_arrs)
280
- max_count_arrs = list(max_count_dict.values())
281
- max_count_arr = np.hstack(max_count_arrs)
687
+ # if freq_code in self.tags.index:
688
+ # print ('check')
689
+
690
+ # Apply the optimized detection history function to the entire dataset at once
691
+ detection_history, hit_ratio_arr, cons_det_arr, max_count_arr = predictors.detection_history(
692
+ train_dat['epoch'].values,
693
+ pulse_rate,
694
+ self.det_count,
695
+ train_dat['channels'].values,
696
+ train_dat['scan_time'].values,
697
+ )
698
+
699
+ # Convert detection history arrays to concatenated strings outside Numba
700
+ det_hist_string_arr = np.array([''.join(row.astype(str)) for row in detection_history])
282
701
 
702
+ # Assign back to the DataFrame
283
703
  train_dat['det_hist'] = det_hist_string_arr
284
704
  train_dat['hit_ratio'] = hit_ratio_arr
285
705
  train_dat['cons_det'] = cons_det_arr
286
706
  train_dat['cons_length'] = max_count_arr
287
- # train_dat['series_hit'] = predictors.series_hit(train_dat.lag.values,
288
- # pulse_rate,
289
- # mort_rate,
290
- # 'A')
291
- # if plausible == 1:
292
- # print ('debug why det hist not right?')
293
- train_dat.fillna(value=9999999, inplace=True)
294
-
295
- # make sure data types are correct - these next steps are critical
707
+
708
+ train_dat.fillna(value=9999999, inplace=True)
709
+
710
+ # Ensure data types are correct
296
711
  try:
297
712
  train_dat = train_dat.astype({'power': 'float32',
298
713
  'time_stamp': 'datetime64[ns]',
299
- 'epoch': 'float32',
714
+ 'epoch': 'int64',
300
715
  'freq_code': 'object',
301
716
  'noise_ratio': 'float32',
302
717
  'scan_time': 'int32',
@@ -311,82 +726,83 @@ class radio_project():
311
726
  'cons_det': 'int32',
312
727
  'cons_length': 'float32'})
313
728
  except ValueError:
314
- print ('debug - check datatypes')
315
-
316
- # append to hdf5
317
- with pd.HDFStore(self.db, mode='a') as store:
318
- store.append(key = 'trained',
319
- value = train_dat,
320
- format = 'table',
321
- index = False,
322
- min_itemsize = {'freq_code':20,
323
- 'rec_type':20,
324
- 'rec_id':20,
325
- 'det_hist':20},
326
- append = True,
327
- chunksize = 1000000)
729
+ logger.debug(f" Data type conversion issue for {freq_code} at {rec_id}")
328
730
 
329
-
330
-
331
- print ('Fish %s trained at receiver %s, plausibiity: %s'%(freq_code, rec_id, plausible))
731
+ # Append to HDF5
732
+ with pd.HDFStore(self.db, mode='a') as store:
733
+ store.append(key='trained',
734
+ value=train_dat,
735
+ format='table',
736
+ index=False,
737
+ min_itemsize={'freq_code': 20,
738
+ 'rec_type': 20,
739
+ 'rec_id': 20,
740
+ 'det_hist': 20},
741
+ append=True,
742
+ chunksize=1000000)
743
+
744
+ logger.info(f"✓ Training complete: {freq_code} at {rec_id} - Plausibility: {plausible:.2f}")
332
745
 
333
746
  def training_summary(self,rec_type,site = None):
334
- # initialize some variables
335
-
747
+ logger.info(f"Generating training summary for {rec_type}")
748
+
336
749
  # connect to database and get data
337
-
338
750
  trained_dat = pd.read_hdf(self.db,key = 'trained')#, mode = 'r')
339
751
  trained_dat = trained_dat[(trained_dat.rec_type == rec_type)]
340
- #train_dat.reset_index(inplace = True)
341
-
342
- # if site != None:
343
- # for rec_id in site:
344
- # trained_dat = trained_dat[(trained_dat.rec_id == rec_id)]
752
+
753
+ logger.info(f" Loaded {len(trained_dat)} detections from {len(trained_dat.rec_id.unique())} receivers")
345
754
 
346
755
  det_class_count = trained_dat.groupby('detection')['detection'].count().to_frame()
347
756
 
348
- print ("")
349
- print ("Training summary statistics report")
350
- print ("The algorithm collected %s detections from %s %s receivers"%(len(trained_dat),len(trained_dat.rec_id.unique()),rec_type))
351
- print ("----------------------------------------------------------------------------------")
352
- print ("")
353
- print ("%s detection clas statistics:"%(rec_type) )
757
+ logger.info("")
758
+ logger.info("Training Summary Statistics Report")
759
+ logger.info("="*80)
760
+ logger.info(f"Collected {len(trained_dat)} detections from {len(trained_dat.rec_id.unique())} {rec_type} receivers")
761
+ logger.info("="*80)
762
+ logger.info("")
763
+ logger.info(f"{rec_type} detection class statistics:")
354
764
  try:
355
- print ("The prior probability that a detection was true was %s"%((round(float(det_class_count.at[1,'detection'])/float(det_class_count.sum()),3))))
765
+ prior_true = round(float(det_class_count.at[1,'detection'])/float(det_class_count.sum()),3)
766
+ logger.info(f" Prior P(true detection) = {prior_true}")
356
767
  except KeyError:
357
- print ("No known true detections found")
768
+ logger.warning(" No known true detections found")
358
769
  pass
359
770
  try:
360
- print ("The prior probability that a detection was false positive was %s"%((round(float(det_class_count.at[0,'detection'])/float(det_class_count.sum()),3))))
771
+ prior_false = round(float(det_class_count.at[0,'detection'])/float(det_class_count.sum()),3)
772
+ logger.info(f" Prior P(false positive) = {prior_false}")
361
773
  except KeyError:
362
- print ("No known true detections found")
774
+ logger.warning(" No known false positives found")
363
775
  pass
364
776
 
365
- print ("")
366
- print ("----------------------------------------------------------------------------------")
367
- print ("")
777
+ logger.info("")
778
+ logger.info("="*80)
779
+ logger.info("")
368
780
  trained_dat['detection'] = trained_dat.detection.astype('str')
369
781
  sta_class_count = trained_dat.groupby(['rec_id','detection'])['detection'].count().rename('det_class_count').to_frame().reset_index()
370
782
  recs = sorted(sta_class_count.rec_id.unique())
371
- print ("Detection Class Counts Across Stations")
372
- print (" Known Known")
373
- print (" False True")
374
- print (" ______________________________")
375
- print (" | | |")
783
+ logger.info("Detection Class Counts Across Stations")
784
+ logger.info(" Known Known")
785
+ logger.info(" False True")
786
+ logger.info(" ______________________________")
787
+ logger.info(" | | |")
376
788
  for i in recs:
377
789
  trues = sta_class_count[(sta_class_count.rec_id == i) & (sta_class_count.detection == '1')]
378
790
  falses = sta_class_count[(sta_class_count.rec_id == i) & (sta_class_count.detection == '0')]
379
791
  if len(trues) > 0 and len(falses) > 0:
380
- print ("%6s| %8s | %8s |"%(i,falses.det_class_count.values[0],trues.det_class_count.values[0]))
792
+ logger.info("%6s| %8s | %8s |"%(i,falses.det_class_count.values[0],trues.det_class_count.values[0]))
381
793
  elif len(trues) == 0 and len(falses) > 0:
382
- print ("%6s| %8s | %8s |"%(i,falses.det_class_count.values[0],0))
794
+ logger.info("%6s| %8s | %8s |"%(i,falses.det_class_count.values[0],0))
383
795
  else:
384
- print ("%6s| %8s | %8s |"%(i,0,trues.det_clas_count.values[0]))
796
+ try:
797
+ logger.info("%6s| %8s | %8s |"%(i,0,trues.det_clas_count.values[0]))
798
+
799
+ except AttributeError:
800
+ logger.info("%6s| %8s | %8s |"%(i,0,0))
385
801
 
386
- print (" |______________|______________|")
387
- print ("")
388
- print ("----------------------------------------------------------------------------------")
389
- print ("Compiling Figures")
802
+ logger.info(" |______________|______________|")
803
+ logger.info("")
804
+ logger.info("="*80)
805
+ logger.info("Compiling training figures...")
390
806
  # get data by detection class for side by side histograms
391
807
  trained_dat['power']= trained_dat.power.astype(float)
392
808
  trained_dat['lag_diff'] = trained_dat.lag_diff.astype(float)
@@ -514,50 +930,177 @@ class radio_project():
514
930
  'rec_type':20,
515
931
  'rec_id':20})
516
932
 
517
- def create_training_data(self, rec_type, reclass_iter = None, rec_list = None):
518
- '''Function creates training dataset for current round of classification -
519
- if we only do this once, this time suck goes away'''
933
+ def create_training_data(self, rec_type=None, reclass_iter=None, rec_list=None):
934
+ """
935
+ Function to create a training dataset for the current round of classification.
936
+ The function supports multiple pathways for generating training data, including
937
+ using a receiver list (rec_list) and incorporating reclassification methods.
520
938
 
521
- #get training data
522
- '''
523
- Reclassification code contributed by T Castro-Santos
524
- '''
525
- # get training data and restrict it to the receiver type - we can't have orion's diagnose srx800's now can we?
526
- train_dat = pd.read_hdf(self.db,
527
- 'trained',
528
- where = f'rec_type == "{rec_type}"')
529
-
530
- # then if we are further restricting to a subset of that receiver type
531
- if rec_list != None:
532
- train_dat = train_dat[train_dat['rec_id'].isin(rec_list)]
533
-
534
- # if this is not the first classification - we need known falses from training and assumed true from last classification
535
- if reclass_iter != None:
536
- last_class = reclass_iter - 1
537
-
538
- class_dat = pd.read_hdf(self.db,
539
- 'classified',
540
- where = f'iter == "{last_class}"')
541
-
542
- class_dat = class_dat[class_dat['rec_id'].isin(rec_list)]
543
- class_dat = class_dat[class_dat.iter == last_class]
544
-
545
- columns = ['test', 'freq_code','power','noise_ratio','lag', 'lag_diff',
546
- 'cons_length','cons_det','det_hist','hit_ratio','rec_type','epoch']
547
-
548
- class_dat = class_dat[columns]
549
-
550
- class_dat.rename(columns = {'test':'detection'},
551
- inplace = True)
939
+ Parameters
940
+ ----------
941
+ rec_type : str, optional
942
+ The type of receiver to restrict the training data to (e.g., 'orion',
943
+ 'srx800'). Ignored when `rec_list` is provided, because the receiver
944
+ list takes precedence.
945
+ reclass_iter : int, optional
946
+ Iteration number for reclassification. If provided, the function pulls the
947
+ previous classification data and incorporates known false positives and
948
+ assumed true positives.
949
+ rec_list : list of str, optional
950
+ A list of receiver IDs to filter the data by. If provided, the function
951
+ queries the HDF database using this list directly rather than the receiver
952
+ type (`rec_type`).
953
+
954
+ Returns
955
+ -------
956
+ pandas.DataFrame
957
+ A DataFrame containing the training data for the classification process,
958
+ incorporating any previous classifications if applicable.
959
+
960
+ Notes
961
+ -----
962
+ - If both `rec_type` and `rec_list` are provided, the function will prioritize
963
+ the `rec_list` to restrict the training data.
964
+ - Reclassification logic is based on contributions from T. Castro-Santos.
965
+ """
966
+ logger.debug(f" Creating training data (rec_type={rec_type}, iter={reclass_iter}, rec_list={rec_list})")
552
967
 
553
- train_dat = train_dat[train_dat.detection==0]
554
- class_dat = class_dat[class_dat.detection==1]
968
+ if rec_list is not None:
969
+ # Construct the query for multiple receiver IDs using the OR operator
970
+ rec_query = ' | '.join([f'rec_id == "{rec_id}"' for rec_id in rec_list])
971
+ train_dat = pd.read_hdf(self.db, 'trained', where=rec_query)
972
+ elif rec_type is not None:
973
+ # Query based on receiver type directly
974
+ train_dat = pd.read_hdf(self.db, 'trained', where=f'rec_type == "{rec_type}"')
975
+ else:
976
+ raise ValueError("Either 'rec_type' or 'rec_list' must be provided to create training data.")
977
+
978
+ # Handling reclassification if this is not the first iteration
979
+ if reclass_iter is not None:
980
+ last_class = reclass_iter - 1
981
+
982
+ # Load the classified dataset and filter by iteration
983
+ class_dat = pd.read_hdf(self.db, 'classified', where=f'iter == {last_class}')
984
+
985
+ # Further restrict classified data to the receiver list if rec_list is provided
986
+ if rec_list is not None:
987
+ class_query = ' | '.join([f'rec_id == "{rec_id}"' for rec_id in rec_list])
988
+ class_dat = class_dat.query(class_query)
989
+
990
+ # Selecting relevant columns for the training dataset
991
+ columns = ['test', 'freq_code', 'power', 'noise_ratio', 'lag',
992
+ 'lag_diff', 'cons_length', 'cons_det', 'det_hist',
993
+ 'hit_ratio', 'rec_type', 'epoch']
994
+
995
+ class_dat = class_dat[columns]
996
+ class_dat.rename(columns={'test': 'detection'}, inplace=True)
997
+
998
+ # Separate known falses (train_dat) and assumed trues (class_dat)
999
+ train_dat = train_dat[train_dat['detection'] == 0]
1000
+ class_dat = class_dat[class_dat['detection'] == 1]
555
1001
 
556
- #Next we append the classdf to the traindf
557
- train_dat = train_dat.append(class_dat)
1002
+ # Append the classified data to the training data
1003
+ train_dat = pd.concat([train_dat, class_dat], ignore_index=True)
1004
+ logger.debug(f" Combined training data: {len(train_dat)} detections ({sum(train_dat['detection']==0)} false, {sum(train_dat['detection']==1)} true)")
1005
+ else:
1006
+ logger.debug(f" Training data: {len(train_dat)} detections")
558
1007
 
559
1008
  return train_dat
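A short sketch of the two query pathways described above, assuming a project proj whose trained table (and, for the second call, classified table) is populated; receiver IDs are hypothetical:

# first pass: known-class training detections for a subset of receivers
train0 = proj.create_training_data(rec_list=['R01', 'R02'])

# later pass: known falses from training plus assumed trues from iteration 1
train1 = proj.create_training_data(rec_list=['R01', 'R02'], reclass_iter=2)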
1009
+
1010
+
1011
+
1012
+
1013
+ def reclassify(self, project, rec_id, threshold_ratio, likelihood_model, rec_type=None, rec_list=None):
1014
+ """
1015
+ Reclassifies fish in a project based on user-defined criteria and threshold ratios.
560
1016
 
1017
+ Parameters
1018
+ ----------
1019
+ project : object
1020
+ The project object that contains methods for managing and classifying fish data.
1021
+
1022
+ rec_id : int or str
1023
+ The unique identifier for the receiver to be reclassified.
1024
+
1025
+ threshold_ratio : float
1026
+ The threshold ratio used for determining classification criteria.
1027
+
1028
+ likelihood_model : list of str
1029
+ The fields to use as the likelihood model for classification.
1030
+
1031
+ rec_type : str, optional
1032
+ The type of receiver being processed (e.g., 'srx1200', 'orion').
1033
+
1034
+ rec_list : list of str, optional
1035
+ A list of receiver IDs to filter the data by, used for creating training data.
1036
+
1037
+ Notes
1038
+ -----
1039
+ - The classification process involves interactive user input to determine if additional
1040
+ iterations are needed.
1041
+ - The fields used for classification are hardcoded as ['hit_ratio', 'cons_length',
1042
+ 'noise_ratio', 'power', 'lag_diff'].
1043
+ """
1044
+ logger.info(f"Starting classification for receiver {rec_id}")
1045
+ logger.info(f" Threshold ratio: {threshold_ratio}")
1046
+ logger.info(f" Likelihood model: {', '.join(likelihood_model)}")
1047
+
1048
+ # Validate inputs
1049
+ if rec_id not in self.receivers.index:
1050
+ logger.error(f"Receiver {rec_id} not found")
1051
+ raise ValueError(f"Receiver '{rec_id}' not found in receiver_data")
1052
+
1053
+ valid_predictors = ['hit_ratio', 'cons_length', 'noise_ratio', 'power', 'lag_diff']
1054
+ invalid = set(likelihood_model) - set(valid_predictors)
1055
+ if invalid:
1056
+ logger.error(f"Invalid predictors: {invalid}")
1057
+ raise ValueError(f"Invalid predictors: {', '.join(invalid)}. Valid: {', '.join(valid_predictors)}")
1058
+
1059
+ class_iter = None
1060
+
1061
+ while True:
1062
+ iter_label = f"iteration {class_iter}" if class_iter else "initial classification"
1063
+ logger.info(f"Running {iter_label}...")
1064
+
1065
+ # Get a list of fish to iterate over
1066
+ fishes = project.get_fish(rec_id=rec_id, train=False, reclass_iter=class_iter)
1067
+ logger.info(f" Found {len(fishes)} fish to classify")
1068
+
1069
+ # Generate training data for the classifier
1070
+ logger.info(" Creating training data...")
1071
+ training_data = project.create_training_data(rec_type=rec_type, reclass_iter=class_iter, rec_list=rec_list)
1072
+ logger.info(f" Training data: {len(training_data)} detections")
1073
+
1074
+ # Iterate over fish and classify with progress bar
1075
+ logger.info(" Classifying detections...")
1076
+ for fish in tqdm(fishes, desc=f" Classifying {rec_id}", unit="fish"):
1077
+ project.classify(fish, rec_id, likelihood_model, training_data, class_iter, threshold_ratio)
1078
+
1079
+ # Generate summary statistics
1080
+ logger.info(" Generating classification summary...")
1081
+ project.classification_summary(rec_id, class_iter)
1082
+
1083
+ # Show the figures and block execution until they are closed
1084
+ plt.show(block=True)
1085
+
1086
+ # Ask the user if they need another iteration (use _prompt helper)
1087
+ user_input = str(self._prompt("\nDo you need another classification iteration? (yes/no): ", default="no")).strip().lower()
1088
+
1089
+ if user_input in ['yes', 'y']:
1090
+ # If yes, increase class_iter and reclassify
1091
+ if class_iter is None:
1092
+ class_iter = 2
1093
+ else:
1094
+ class_iter += 1
1095
+ logger.info(f"Starting iteration {class_iter}")
1096
+ elif user_input in ['no', 'n']:
1097
+ # If no, break the loop
1098
+ logger.info(f"✓ Classification complete for {rec_id}")
1099
+ break
1100
+ else:
1101
+ logger.warning("Invalid input, please enter 'yes' or 'no'")
1102
+
1103
+
561
1104
  def classify(self,
562
1105
  freq_code,
563
1106
  rec_id,
@@ -565,6 +1108,7 @@ class radio_project():
565
1108
  training_data,
566
1109
  reclass_iter = None,
567
1110
  threshold_ratio = None):
1111
+ logger.debug(f" Classifying {freq_code} at {rec_id} (iter: {reclass_iter})")
568
1112
 
569
1113
  # get rates
570
1114
  try:
@@ -594,28 +1138,40 @@ class radio_project():
594
1138
  columns = ['freq_code','epoch','rec_id','time_stamp','power','noise_ratio','scan_time','channels','rec_type']
595
1139
  class_dat = class_dat[columns]
596
1140
 
1141
+ class_dat = class_dat.drop_duplicates()
1142
+
597
1143
  if len(class_dat) > 0:
598
1144
  # do some data management when importing training dataframe
599
1145
  class_dat['time_stamp'] = pd.to_datetime(class_dat['time_stamp'])
1146
+ class_dat['epoch'] = (class_dat.time_stamp.astype('int64') // 10**9).astype('int64')
1147
+
600
1148
  class_dat.sort_values(by = 'time_stamp', inplace = True)
601
- class_dat['epoch'] = class_dat.epoch.values.astype(np.int32)
1149
+ class_dat['epoch'] = class_dat.epoch.values.astype(np.int64)
602
1150
  class_dat = class_dat.drop_duplicates(subset = 'time_stamp')
603
1151
 
604
1152
  # calculate predictors
605
1153
  class_dat['lag'] = class_dat.epoch.diff()
606
1154
  class_dat['lag_diff'] = class_dat.lag.diff()
607
1155
  class_dat.fillna(value = 99999999, inplace = True)
608
- det_hist_string, det_hist, cons_det, max_count \
609
- = predictors.detection_history(class_dat.epoch.values,
610
- pulse_rate,
611
- self.det_count,
612
- 2,
613
- class_dat.scan_time.values,
614
- class_dat.channels)
615
- class_dat['det_hist'] = det_hist_string
616
- class_dat['hit_ratio'] = det_hist
617
- class_dat['cons_det'] = cons_det
618
- class_dat['cons_length'] = max_count
1156
+
1157
+ # Apply the optimized detection history function to the entire dataset at once
1158
+ detection_history, hit_ratio_arr, cons_det_arr, max_count_arr = predictors.detection_history(
1159
+ class_dat['epoch'].values,
1160
+ pulse_rate,
1161
+ self.det_count,
1162
+ class_dat['channels'].values,
1163
+ class_dat['scan_time'].values
1164
+ )
1165
+
1166
+ # Convert detection history arrays to concatenated strings outside Numba
1167
+ det_hist_string_arr = np.array([''.join(row.astype(str)) for row in detection_history])
1168
+
1169
+ # Assign back to the DataFrame
1170
+ class_dat['det_hist'] = det_hist_string_arr
1171
+ class_dat['hit_ratio'] = hit_ratio_arr
1172
+ class_dat['cons_det'] = cons_det_arr
1173
+ class_dat['cons_length'] = max_count_arr
1174
+
619
1175
  # class_dat['series_hit'] = predictors.series_hit(class_dat.lag.values,
620
1176
  # pulse_rate,
621
1177
  # mort_rate,
@@ -706,7 +1262,7 @@ class radio_project():
706
1262
 
707
1263
  # keep it tidy cuz hdf is fragile
708
1264
  class_dat = class_dat.astype({'freq_code': 'object',
709
- 'epoch': 'float32',
1265
+ 'epoch': 'int64',
710
1266
  'rec_id': 'object',
711
1267
  'time_stamp': 'datetime64[ns]',
712
1268
  'power': 'float32',
@@ -741,9 +1297,9 @@ class radio_project():
741
1297
  chunksize = 1000000)
742
1298
 
743
1299
  # export
744
- class_dat.to_csv(os.path.join(self.output_dir,'freq_code_%s_rec_%s_class_%s.csv'%(freq_code, rec_id, reclass_iter)))
1300
+ #class_dat.to_csv(os.path.join(self.output_dir,'freq_code_%s_rec_%s_class_%s.csv'%(freq_code, rec_id, reclass_iter)))
745
1301
 
746
- print ('Fish %s at receiver %s classified'%(freq_code,rec_id))
1302
+ logger.debug(f" ✓ {freq_code} at {rec_id}: {sum(classification)} true, {len(classification)-sum(classification)} false")
747
1303
  # next step looks at results
748
1304
 
749
1305
  # else:
@@ -752,6 +1308,9 @@ class radio_project():
752
1308
  def classification_summary(self,rec_id,reclass_iter = None):
753
1309
  '''if this is not the initial classification we need the trues from the last
754
1310
  last classification and falses from the first'''
1311
+
1312
+ iter_label = f"iteration {reclass_iter}" if reclass_iter else "initial classification"
1313
+ logger.info(f"Generating classification summary for {rec_id} ({iter_label})")
755
1314
 
756
1315
  if reclass_iter == None:
757
1316
  classified_dat = pd.read_hdf(self.db,
@@ -761,229 +1320,140 @@ class radio_project():
761
1320
  classified_dat = pd.read_hdf(self.db,
762
1321
  key = 'classified',
763
1322
  where = f'(iter == {reclass_iter}) & (rec_id == "{rec_id}")')
1323
+
1324
+ logger.info(f" Loaded {len(classified_dat)} classified detections")
764
1325
 
765
- print ("")
766
- print ("Classification summary statistics report %s"%(rec_id))
767
- print ("----------------------------------------------------------------------------------")
1326
+ logger.info("")
1327
+ logger.info(f"Classification Summary Report: {rec_id}")
1328
+ logger.info("="*80)
768
1329
  det_class_count = classified_dat.groupby('test')['test'].count().to_frame()
769
1330
  if len(det_class_count)>1:
770
- print ("")
771
- print ("%s detection class statistics:"%(rec_id))
772
- print ("The probability that a detection was classified as true was %s"%((round(float(det_class_count.at[1,'test'])/float(det_class_count.sum()),3))))
773
- print ("The probability that a detection was classified as false positive was %s"%((round(float(det_class_count.at[0,'test'])/float(det_class_count.sum()),3))))
774
- print ("")
775
- print ("----------------------------------------------------------------------------------")
776
- print ("")
1331
+ logger.info("")
1332
+ logger.info(f"{rec_id} detection class statistics:")
1333
+ prob_true = round(float(det_class_count.at[1,'test'])/float(det_class_count.sum()),3)
1334
+ prob_false = round(float(det_class_count.at[0,'test'])/float(det_class_count.sum()),3)
1335
+ logger.info(f" P(classified as true) = {prob_true}")
1336
+ logger.info(f" P(classified as false positive) = {prob_false}")
1337
+ logger.info("")
1338
+ logger.info("="*80)
1339
+ logger.info("")
777
1340
  sta_class_count = classified_dat.groupby(['rec_id','test'])['test'].count().to_frame()#.reset_index(drop = False)
778
1341
  recs = list(set(sta_class_count.index.levels[0]))
779
- print ("Detection Class Counts Across Stations")
780
- print (" Classified Classified")
781
- print (" False True")
782
- print (" ______________________________")
783
- print (" | | |")
1342
+ logger.info("Detection Class Counts Across Stations")
1343
+ logger.info(" Classified Classified")
1344
+ logger.info(" False True")
1345
+ logger.info(" ______________________________")
1346
+ logger.info(" | | |")
784
1347
  for i in recs:
785
- print ("%6s| %8s | %8s |"%(i,sta_class_count.loc[(i,0)].values[0],sta_class_count.loc[(i,1)].values[0]))
786
- print (" |______________|______________|")
787
- print ("")
788
- print ("----------------------------------------------------------------------------------")
789
- print ("----------------------------------------------------------------------------------")
1348
+ logger.info("%6s| %8s | %8s |"%(i,sta_class_count.loc[(i,0)].values[0],sta_class_count.loc[(i,1)].values[0]))
1349
+ logger.info(" |______________|______________|")
1350
+ logger.info("")
1351
+ logger.info("="*80)
1352
+ logger.info("Compiling classification figures...")
790
1353
 
791
- # print ("Compiling Figures")
792
-
793
- # plot the log likelihood ratio
1354
+ # Plot the log likelihood ratio
794
1355
  classified_dat['log_posterior_ratio'] = np.log10(classified_dat.posterior_T / classified_dat.posterior_F)
795
- minLogRatio = classified_dat.log_posterior_ratio.min()//1 * 1
796
- maxLogRatio = classified_dat.log_posterior_ratio.max()//1 * 1
797
- ratio_range = maxLogRatio - minLogRatio
798
- ratio_bins =np.linspace(minLogRatio,maxLogRatio+1,100)
799
1356
 
800
- # hit ratio bins
801
- hit_ratio_bins =np.linspace(0,1.0,11)
802
-
803
- # plot signal power histograms by detection class
804
- min_power = classified_dat.power.min()//5 * 5
805
- max_power = classified_dat.power.max()//5 * 5
806
- power_bins =np.arange(min_power,max_power+20,10)
807
-
808
- # Lag Back Differences - how steady are detection lags?
809
- lag_bins =np.arange(-100,110,20)
810
-
811
- # Consecutive Record Length
812
- con_length_bins =np.arange(1,12,1)
813
-
814
- # Noise Ratio
815
- noise_bins =np.arange(0,1.1,0.1)
816
-
817
- # plot the log of the posterior ratio
818
- classified_dat['log_post_ratio'] = np.log(classified_dat.posterior_T/classified_dat.posterior_F)
819
- minPostRatio = classified_dat.log_post_ratio.min()
820
- maxPostRatio = classified_dat.log_post_ratio.max()
821
- post_ratio_bins = np.linspace(minPostRatio,maxPostRatio,10)
822
-
1357
+ # Binning and other parameters
1358
+ hit_ratio_bins = np.linspace(0, 1.0, 11)
1359
+ con_length_bins = np.arange(1, 12, 1)
1360
+ power_bins = np.arange(50, 110, 10)
1361
+ noise_bins = np.linspace(0, 1.1, 11)
1362
+ lag_bins = np.arange(-100, 110, 20)
1363
+ post_ratio_bins = np.linspace(classified_dat.log_posterior_ratio.min(), classified_dat.log_posterior_ratio.max(), 10)
1364
+
823
1365
  trues = classified_dat[classified_dat.test == 1]
824
1366
  falses = classified_dat[classified_dat.test == 0]
825
-
826
- # make lattice plot for pubs
827
-
828
- # hit ratio
829
- fig = plt.figure(figsize = (4, 2), dpi = 300, layout = 'tight')
830
-
831
- ax1 = fig.add_subplot(1,2,1)
832
- ax1.hist(falses.hit_ratio.values,
833
- hit_ratio_bins,
834
- density = True,
835
- color = 'grey',
836
- edgecolor='black',
837
- linewidth=1.2)
838
- ax1.set_xlabel('Hit Ratio')
839
- ax1.set_title('False Positive')
840
- ax1.set_ylabel('Probability Density')
841
-
842
- ax2 = fig.add_subplot(1,2,2)
843
- ax2.hist(trues.hit_ratio.values,
844
- hit_ratio_bins,
845
- density = True,
846
- color = 'grey',
847
- edgecolor='black',
848
- linewidth=1.2)
849
- ax2.set_title('Valid')
850
- ax2.set_xlabel('Hit Ratio')
851
-
852
- plt.show()
853
1367
 
854
- # consecutive record length
855
- fig = plt.figure(figsize = (4, 2), dpi = 300, layout = 'tight')
856
-
857
- ax1 = fig.add_subplot(1,2,1)
858
- ax1.hist(falses.cons_length.values,
859
- con_length_bins,
860
- density = True,
861
- color = 'grey',
862
- edgecolor='black',
863
- linewidth=1.2)
864
- ax1.set_xlabel('Consecutive Hit Length')
865
- ax1.set_title('False Positive')
866
- ax1.set_ylabel('Probability Density')
867
-
868
- ax2 = fig.add_subplot(1,2,2)
869
- ax2.hist(trues.cons_length.values,
870
- con_length_bins,
871
- density = True,
872
- color = 'grey',
873
- edgecolor='black',
874
- linewidth=1.2)
875
- ax2.set_title('Valid')
876
- ax2.set_xlabel('Consecutive Hit Length')
1368
+ # Create a grid of subplots (3 rows x 4 columns)
1369
+ fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(15, 10), dpi=300)
877
1370
 
878
- plt.show()
1371
+ # Function to set font sizes
1372
+ def set_fontsize(ax, fontsize=6):
1373
+ for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
1374
+ ax.get_xticklabels() + ax.get_yticklabels()):
1375
+ item.set_fontsize(fontsize)
879
1376
 
880
- # power
881
- fig = plt.figure(figsize = (4, 2), dpi = 300, layout = 'tight')
882
-
883
- ax1 = fig.add_subplot(1,2,1)
884
- ax1.hist(falses.power.values,
885
- power_bins,
886
- density = True,
887
- color = 'grey',
888
- edgecolor='black',
889
- linewidth=1.2)
890
- ax1.set_xlabel('Signal Power')
891
- ax1.set_ylabel('Probability Density')
892
- ax1.set_title('False Positive')
893
-
894
- ax2 = fig.add_subplot(1,2,2)
895
- ax2.hist(trues.power.values,
896
- power_bins,
897
- density = True,
898
- color = 'grey',
899
- edgecolor='black',
900
- linewidth=1.2)
901
- ax2.set_xlabel('Signal Power')
902
- ax2.set_title('Valid')
1377
+ # Plot hit ratio
1378
+ axes[0, 0].hist(falses.hit_ratio.values, hit_ratio_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
1379
+ axes[0, 0].set_xlabel('Hit Ratio')
1380
+ axes[0, 0].set_ylabel('Probability Density')
1381
+ axes[0, 0].set_title('Hit Ratio - False Positive')
1382
+ set_fontsize(axes[0, 0])
903
1383
 
904
- plt.show()
905
-
906
- # noise ratio
907
- fig = plt.figure(figsize = (4, 2), dpi = 300, layout = 'tight')
908
-
909
- ax1 = fig.add_subplot(1,2,1)
910
- ax1.hist(falses.noise_ratio.values,
911
- noise_bins,
912
- density = True,
913
- color = 'grey',
914
- edgecolor='black',
915
- linewidth=1.2)
916
- ax1.set_xlabel('Noise Ratio')
917
- ax1.set_ylabel('Probability Density')
918
- ax1.set_title('False Positive')
919
-
920
- ax2 = fig.add_subplot(1,2,2)
921
- ax2.hist(trues.noise_ratio.values,
922
- noise_bins,
923
- density = True,
924
- color = 'grey',
925
- edgecolor='black',
926
- linewidth=1.2)
927
- ax2.set_xlabel('Noise Ratio')
928
- ax2.set_title('Valid')
1384
+ axes[0, 1].hist(trues.hit_ratio.values, hit_ratio_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
1385
+ axes[0, 1].set_xlabel('Hit Ratio')
1386
+ axes[0, 1].set_title('Hit Ratio - Valid')
1387
+ set_fontsize(axes[0, 1])
929
1388
 
930
- plt.show()
931
-
932
- # lag diff
933
- fig = plt.figure(figsize = (4, 2), dpi = 300, layout = 'tight')
934
-
935
- ax1 = fig.add_subplot(1,2,1)
936
- ax1.hist(falses.lag_diff.values,
937
- lag_bins,
938
- density = True,
939
- color = 'grey',
940
- edgecolor='black',
941
- linewidth=1.2)
942
- ax1.set_xlabel('Lag Differences')
943
- ax1.set_ylabel('Probability Density')
944
- ax1.set_title('False Positive')
945
-
946
- ax2 = fig.add_subplot(1,2,2)
947
- ax2.hist(trues.lag_diff.values,
948
- lag_bins,
949
- density = True,
950
- color = 'grey',
951
- edgecolor='black',
952
- linewidth=1.2)
953
- ax2.set_xlabel('Lag Differences')
954
- ax2.set_title('Valid')
1389
+ # Plot consecutive record length
1390
+ axes[0, 2].hist(falses.cons_length.values, con_length_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
1391
+ axes[0, 2].set_xlabel('Consecutive Hit Length')
1392
+ axes[0, 2].set_ylabel('Probability Density')
1393
+ axes[0, 2].set_title('Consecutive Hit Length - False Positive')
1394
+ set_fontsize(axes[0, 2])
955
1395
 
956
- plt.show()
1396
+ axes[0, 3].hist(trues.cons_length.values, con_length_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
1397
+ axes[0, 3].set_xlabel('Consecutive Hit Length')
1398
+ axes[0, 3].set_title('Consecutive Hit Length - Valid')
1399
+ set_fontsize(axes[0, 3])
1400
+
1401
+ # Plot power
1402
+ axes[1, 0].hist(falses.power.values, power_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
1403
+ axes[1, 0].set_xlabel('Signal Power')
1404
+ axes[1, 0].set_ylabel('Probability Density')
1405
+ axes[1, 0].set_title('Signal Power - False Positive')
1406
+ set_fontsize(axes[1, 0])
1407
+
1408
+ axes[1, 1].hist(trues.power.values, power_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
1409
+ axes[1, 1].set_xlabel('Signal Power')
1410
+ axes[1, 1].set_title('Signal Power - Valid')
1411
+ set_fontsize(axes[1, 1])
1412
+
1413
+ # Plot noise ratio
1414
+ axes[1, 2].hist(falses.noise_ratio.values, noise_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
1415
+ axes[1, 2].set_xlabel('Noise Ratio')
1416
+ axes[1, 2].set_ylabel('Probability Density')
1417
+ axes[1, 2].set_title('Noise Ratio - False Positive')
1418
+ set_fontsize(axes[1, 2])
1419
+
1420
+ axes[1, 3].hist(trues.noise_ratio.values, noise_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
1421
+ axes[1, 3].set_xlabel('Noise Ratio')
1422
+ axes[1, 3].set_title('Noise Ratio - Valid')
1423
+ set_fontsize(axes[1, 3])
1424
+
1425
+ # Plot lag differences
1426
+ axes[2, 0].hist(falses.lag_diff.values, lag_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
1427
+ axes[2, 0].set_xlabel('Lag Differences')
1428
+ axes[2, 0].set_ylabel('Probability Density')
1429
+ axes[2, 0].set_title('Lag Differences - False Positive')
1430
+ set_fontsize(axes[2, 0])
1431
+
1432
+ axes[2, 1].hist(trues.lag_diff.values, lag_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
1433
+ axes[2, 1].set_xlabel('Lag Differences')
1434
+ axes[2, 1].set_title('Lag Differences - Valid')
1435
+ set_fontsize(axes[2, 1])
957
1436
 
958
- # log posterior ratio
959
- fig = plt.figure(figsize = (4, 2), dpi = 300, layout = 'tight')
960
-
961
- ax1 = fig.add_subplot(1,2,1)
962
- ax1.hist(falses.log_posterior_ratio.values,
963
- bins = 20,
964
- density = True,
965
- color = 'grey',
966
- edgecolor='black',
967
- linewidth=1.2)
968
- ax1.set_xlabel('Log Posterior Ratio')
969
- ax1.set_ylabel('Probability Density')
970
- ax1.set_title('False Positive')
971
-
972
- ax2 = fig.add_subplot(1,2,2)
973
- ax2.hist(trues.log_posterior_ratio.values,
974
- bins = 20,
975
- density = True,
976
- color = 'grey',
977
- edgecolor='black',
978
- linewidth=1.2)
979
- ax2.set_xlabel('Log Posterior Ratio')
980
- ax2.set_title('Valid')
1437
+ # Plot log posterior ratio
1438
+ axes[2, 2].hist(falses.log_posterior_ratio.values, bins=20, density=True, color='grey', edgecolor='black', linewidth=1.2)
1439
+ axes[2, 2].set_xlabel('Log Posterior Ratio')
1440
+ axes[2, 2].set_ylabel('Probability Density')
1441
+ axes[2, 2].set_title('Log Posterior Ratio - False Positive')
1442
+ set_fontsize(axes[2, 2])
1443
+
1444
+ axes[2, 3].hist(trues.log_posterior_ratio.values, bins=20, density=True, color='grey', edgecolor='black', linewidth=1.2)
1445
+ axes[2, 3].set_xlabel('Log Posterior Ratio')
1446
+ axes[2, 3].set_title('Log Posterior Ratio - Valid')
1447
+ set_fontsize(axes[2, 3])
981
1448
 
1449
+ # Adjust layout
1450
+ plt.tight_layout()
1451
+
1452
+ # Show the plot
982
1453
  plt.show()
983
-
984
1454
  else:
985
- print("There were insufficient data to quantify summary statistics")
986
- print("All remaining were classified as %s suggesting there is no more improvement in the model"%(det_class_count.index[0]))
1455
+ logger.warning("Insufficient data to quantify summary statistics")
1456
+ logger.warning(f"All remaining detections were classified as {det_class_count.index[0]}; no further model improvement is expected")
987
1457
 
988
1458
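The lattice above is only displayed with plt.show(), so nothing is written to disk and the call blocks until the window is closed on interactive backends. A minimal off-screen sketch of saving such a grid (throwaway data, hypothetical file name):

    import matplotlib
    matplotlib.use('Agg')                      # render without a display
    import matplotlib.pyplot as plt
    import numpy as np

    fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(15, 10), dpi=300)
    axes[0, 0].hist(np.random.rand(500), np.linspace(0, 1.0, 11), density=True)
    plt.tight_layout()
    fig.savefig('classification_summary.png', bbox_inches='tight')   # placeholder name
    plt.close(fig)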
  def undo_classification(self, rec_id, freq_code=None, class_iter=None):
989
1459
  # Read the table from the HDF5 file
@@ -1017,21 +1487,29 @@ class radio_project():
1017
1487
  data_columns=True,
1018
1488
  append = False)
1019
1489
 
1020
- def undo_bouts(self, rec_id):
1490
+ def undo_bouts(self, rec_id=None):
1491
+ """
1492
+ Remove bouts from the presence table.
1493
+
1494
+ Args:
1495
+ rec_id (str, optional): Specific receiver ID to remove bouts for.
1496
+ If None, removes all bouts.
1497
+ """
1021
1498
  # Read the table from the HDF5 file
1022
1499
  with pd.HDFStore(self.db, 'r+') as store:
1023
1500
  if 'presence' in store:
1024
1501
  df = store['presence']
1025
1502
 
1026
1503
  # Build the condition based on provided arguments
1027
- condition = (df['rec_id'] == rec_id)
1028
-
1029
- # Update or delete the rows based on your requirement
1030
- # For deletion:
1031
- df = df[~condition]
1504
+ if rec_id is not None:
1505
+ condition = (df['rec_id'] == rec_id)
1506
+ df = df[~condition]
1507
+ else:
1508
+ # Remove all presence data
1509
+ df = pd.DataFrame(columns=df.columns)
1032
1510
 
1033
1511
  df = df.astype({'freq_code': 'object',
1034
- 'epoch': 'float32',
1512
+ 'epoch': 'int64',
1035
1513
  'rec_id': 'object',
1036
1514
  'class': 'object',
1037
1515
  'bout_no':'int32',
@@ -1049,118 +1527,510 @@ class radio_project():
1049
1527
  'rec_id':20,
1050
1528
  'class':20},
1051
1529
  data_columns=True,
1052
- append = False)
1053
-
1054
- def make_recaptures_table(self):
1055
- '''method creates a recaptures key in the hdf file'''
1056
-
1057
- # iterate over fish, get last classificaiton, presences, and overlapping detections
1058
- for fish in self.tags[self.tags.tag_type == 'study'].index:
1059
- for rec in self.receivers.index:
1060
- # get this receivers data from the classified key
1061
- rec_dat = pd.read_hdf(self.db,
1062
- key = 'classified',
1063
- where = f'(freq_code == "{fish}") & (rec_id == "{rec}")')
1064
- try:
1065
- presence_dat = pd.read_hdf(self.db,
1066
- key = 'presence',
1067
- where = f'(freq_code == "{fish}") & (rec_id == "{rec}")')
1068
- except:
1069
- presence_dat = []
1530
+ append = False)
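A usage sketch for the reworked undo_bouts, assuming an existing radio_project instance named proj (the receiver ID is a placeholder):

    proj.undo_bouts(rec_id='R001')   # drop bouts for a single receiver
    proj.undo_bouts()                # rec_id=None now clears the entire presence table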
1070
1531
 
1071
- try:
1072
- # get data from overlapping associated with this fish and receiver
1073
- overlap_dat = pd.read_hdf(self.db,
1074
- key = 'overlapping',
1075
- where = f'(freq_code == "{fish}") & (rec_id == "{rec}")')
1532
+ def repack_database(self, output_path=None):
1533
+ """
1534
+ Repack HDF5 database to reclaim disk space and improve performance.
1535
+
1536
+ Uses PyTables to copy all nodes (Groups, Tables, Arrays) recursively
1537
+ with compression enabled, reclaiming the space left behind by repeated append operations.
1538
+
1539
+ Args:
1540
+ output_path (str, optional): Path for repacked database.
1541
+ If None, uses '{db_name}_repacked.h5'
1542
+
1543
+ Returns:
1544
+ str: Path to the repacked database
1545
+ """
1546
+ import tables
1547
+ import logging
1548
+ import time
1549
+
1550
+ logger = logging.getLogger(__name__)
1551
+
1552
+ if output_path is None:
1553
+ base_name = os.path.splitext(self.db)[0]
1554
+ output_path = f"{base_name}_repacked.h5"
1555
+
1556
+ logger.info(f"Repacking database: {self.db} → {output_path}")
1557
+ print(f"[repack] Starting database repack...")
1558
+ print(f" Source: {self.db}")
1559
+ print(f" Target: {output_path}")
1560
+
1561
+ # Get original size
1562
+ orig_size = os.path.getsize(self.db)
1563
+ print(f" Original size: {orig_size / (1024**3):.2f} GB")
1564
+
1565
+ start_time = time.time()
1566
+
1567
+ # Open both files with PyTables
1568
+ with tables.open_file(self.db, mode='r') as h5in:
1569
+ with tables.open_file(output_path, mode='w') as h5out:
1570
+
1571
+ # Set compression filters
1572
+ filters = tables.Filters(complevel=5, complib='blosc:zstd')
1573
+
1574
+ # Copy all top-level nodes recursively
1575
+ for node in h5in.root:
1576
+ node_path = node._v_pathname
1577
+ print(f" Copying {node_path}...")
1076
1578
 
1077
- except:
1078
- overlap_dat = []
1579
+ try:
1580
+ # Use recursive=True to copy entire subtree (Groups, Tables, Arrays, etc.)
1581
+ h5in.copy_node(
1582
+ where=node_path,
1583
+ newparent=h5out.root,
1584
+ recursive=True,
1585
+ filters=filters
1586
+ )
1587
+ except (tables.NodeError, tables.HDF5ExtError, OSError, ValueError) as e:
1588
+ raise RuntimeError(f"Failed to copy HDF5 node {node_path}: {e}") from e
1589
+
1590
+ # Get new size
1591
+ new_size = os.path.getsize(output_path)
1592
+ savings = (1 - new_size / orig_size) * 100
1593
+ elapsed = time.time() - start_time
1594
+
1595
+ print(f"\n[repack] ✓ Repack complete in {elapsed:.1f} seconds")
1596
+ print(f" New size: {new_size / (1024**3):.2f} GB")
1597
+ print(f" Savings: {savings:.1f}%")
1598
+
1599
+ logger.info(f"Repack complete: {new_size / (1024**3):.2f} GB ({savings:.1f}% reduction)")
1600
+
1601
+ return output_path
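A usage sketch for repack_database with the same assumed proj instance; the method returns the path of the compressed copy but does not repoint self.db, so the caller swaps it in (the explicit output path below is a placeholder). PyTables also ships a ptrepack command-line utility that can do a similar job outside Python.

    packed = proj.repack_database()                        # defaults to '<db name>_repacked.h5'
    packed = proj.repack_database(output_path='study_repacked.h5')
    proj.db = packed                                       # continue against the smaller file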
1602
+
1603
+ def make_recaptures_table(self, export=True, pit_study=False):
1604
+ '''Creates a recaptures key in the HDF5 file, iterating over receivers to manage memory.'''
1605
+ logger.info("Creating recaptures table")
1606
+ logger.info(f" Processing {len(self.receivers)} receiver(s)")
1607
+ # prepare a heartbeat log so long runs can be monitored (one line per receiver)
1608
+ heartbeat_dir = os.path.join(self.project_dir, 'build')
1609
+ try:
1610
+ os.makedirs(heartbeat_dir, exist_ok=True)
1611
+ except OSError as e:
1612
+ raise RuntimeError(
1613
+ f"Failed to create heartbeat directory '{heartbeat_dir}': {e}"
1614
+ ) from e
1615
+ heartbeat_path = os.path.join(heartbeat_dir, 'recaptures_heartbeat.log')
1616
+ print(f"Starting recaptures: {len(self.receivers)} receivers. Heartbeat -> {heartbeat_path}")
1617
+ try:
1618
+ with open(heartbeat_path, 'a') as _hb:
1619
+ _hb.write(f"START {datetime.datetime.now().isoformat()} receivers={len(self.receivers)}\n")
1620
+ except OSError as e:
1621
+ raise RuntimeError(
1622
+ f"Failed to write heartbeat start to '{heartbeat_path}': {e}"
1623
+ ) from e
1624
+
1625
+ if not pit_study:
1626
+ # Convert release dates to datetime if not already done
1627
+ self.tags['rel_date'] = pd.to_datetime(self.tags['rel_date'])
1628
+ tags_copy = self.tags.copy()
1629
+
1630
+ for rec in tqdm(self.receivers.index, desc="Processing receivers", unit="receiver"):
1631
+ logger.info(f" Processing receiver {rec}...")
1632
+ print(f"[recaptures] processing receiver {rec}...", flush=True)
1633
+
1634
+ # Lazily read the classified detections with Dask;
1635
+ # the whole 'classified' key is opened and filtered to this receiver before compute()
1636
+ rec_dat = dd.read_hdf(self.db, key='classified')
1079
1637
 
1080
- if len(rec_dat) > 0:
1081
- rec_dat = rec_dat[rec_dat.iter == rec_dat.iter.max()]
1082
- rec_dat = rec_dat[rec_dat.test == 1]
1083
- if len(rec_dat) > 0:
1084
- rec_dat.set_index('epoch')
1085
-
1086
- if len(presence_dat) > 0:
1087
- presence_dat.set_index('epoch')
1088
-
1089
- rec_dat = pd.merge(rec_dat,
1090
- presence_dat,
1091
- how = 'left')
1092
-
1093
- else:
1094
- rec_dat['bout_no'] = np.zeros(len(rec_dat))
1638
+ # Filter for specific rec_id and convert to pandas DataFrame
1639
+ rec_dat = rec_dat[rec_dat['rec_id'] == rec].compute()
1640
+
1641
+ # Convert 'timestamp' column to datetime
1642
+ rec_dat['time_stamp'] = pd.to_datetime(rec_dat['time_stamp'])
1643
+
1644
+ # Calculate seconds since Unix epoch
1645
+ rec_dat['epoch'] = (rec_dat['time_stamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
1646
+ logger.debug(f" Initial load: {len(rec_dat)} detections")
1647
+
1648
+ # Merge with release dates to filter out data before release
1649
+ rec_dat = rec_dat.merge(tags_copy, left_on='freq_code', right_index=True)
1650
+ rec_dat = rec_dat[rec_dat['time_stamp'] >= rec_dat['rel_date']]
1651
+ logger.debug(f" After release date filter: {len(rec_dat)} detections")
1652
+
1653
+ # Reset index to avoid ambiguity between index and column labels
1654
+ if 'freq_code' in rec_dat.columns and 'freq_code' in rec_dat.index.names:
1655
+ rec_dat = rec_dat.reset_index(drop=True)
1656
+
1657
+ # Filter by latest iteration and valid test
1658
+ idxmax_values = rec_dat.iter.max()
1659
+ rec_dat = rec_dat[rec_dat.iter == idxmax_values]
1660
+ rec_dat = rec_dat[rec_dat['test'] == 1]
1661
+ logger.debug(f" After filtering (iter={idxmax_values}, test=1): {len(rec_dat)} detections")
1662
+
1663
+ # Check if 'presence' exists before trying to read it
1664
+ try:
1665
+ presence_data = dd.read_hdf(self.db, key='presence')
1666
+ # Filter immediately instead of checking len() which triggers expensive compute
1667
+ presence_data = presence_data[presence_data['rec_id'] == rec]
1668
+ presence_exists = True
1669
+ except (KeyError, FileNotFoundError):
1670
+ presence_exists = False
1671
+
1672
+ if presence_exists:
1673
+ try:
1674
+ presence_data = presence_data.compute()
1675
+ presence_data = presence_data[presence_data['freq_code'].isin(self.tags[self.tags.tag_type=='study'].index)]
1676
+ presence_data = presence_data[['freq_code', 'epoch', 'rec_id', 'bout_no']]
1677
+ logger.debug(f" Presence data: {len(presence_data)} records")
1678
+
1679
+ except KeyError:
1680
+ logger.warning(f" No presence data found for {rec}, skipping presence merge")
1681
+ else:
1682
+ logger.warning(" 'presence' key not found in HDF5, skipping presence merge")
1095
1683
 
1096
- if len(overlap_dat) > 0:
1097
- overlap_dat.set_index('epoch')
1098
-
1099
- rec_dat = pd.merge(rec_dat,
1100
- overlap_dat,
1101
- how = 'left')
1102
-
1103
- rec_dat = rec_dat[rec_dat.overlapping != 0]
1104
-
1684
+ # Read overlap data - filter immediately to avoid expensive len() compute
1685
+ try:
1686
+ overlap_data = dd.read_hdf(self.db, key='overlapping')
1687
+ # Filter to this receiver first before checking anything
1688
+ overlap_data = overlap_data[overlap_data['rec_id'] == rec]
1689
+ overlap_exists = True
1690
+ except (KeyError, FileNotFoundError):
1691
+ overlap_exists = False
1692
+
1693
+ if overlap_exists:
1694
+ try:
1695
+ overlap_data = overlap_data.compute()
1696
+ overlap_data = overlap_data[overlap_data['freq_code'].isin(self.tags[self.tags.tag_type=='study'].index)]
1697
+ # Aggregate both overlapping and ambiguous_overlap columns
1698
+ if 'ambiguous_overlap' in overlap_data.columns:
1699
+ overlap_data = overlap_data.groupby(['freq_code', 'epoch', 'rec_id']).agg({
1700
+ 'overlapping': 'max',
1701
+ 'ambiguous_overlap': 'max'
1702
+ }).reset_index()
1105
1703
  else:
1106
- rec_dat['overlapping'] = np.zeros(len(rec_dat))
1107
-
1108
- rec_dat.reset_index(inplace = True)
1109
- rec_dat['bout_no'] = rec_dat['bout_no'].fillna(0)
1704
+ overlap_data = overlap_data.groupby(['freq_code', 'epoch', 'rec_id'])['overlapping'].max().reset_index()
1705
+ logger.debug(f" Overlap data: {len(overlap_data)} records")
1706
+
1707
+ except KeyError:
1708
+ logger.warning(f" No overlap data found for {rec}, skipping overlap merge")
1709
+ else:
1710
+ logger.warning(" 'overlapping' key not found in HDF5, skipping overlap merge")
1110
1711
 
1111
- # keep certain columns and write to hdf
1112
- rec_dat = rec_dat[['freq_code',
1113
- 'rec_id',
1114
- 'epoch',
1115
- 'time_stamp',
1116
- 'power',
1117
- 'noise_ratio',
1118
- 'lag',
1119
- 'det_hist',
1120
- 'hit_ratio',
1121
- 'cons_det',
1122
- 'cons_length',
1123
- 'likelihood_T',
1124
- 'likelihood_F',
1125
- 'bout_no',
1126
- 'overlapping']]
1127
-
1128
- # keep it tidy cuz hdf is fragile
1129
- rec_dat = rec_dat.astype({'freq_code': 'object',
1130
- 'epoch': 'float32',
1131
- 'rec_id': 'object',
1132
- 'time_stamp': 'datetime64[ns]',
1133
- 'power': 'float32',
1134
- 'noise_ratio': 'float32',
1135
- 'lag': 'float32',
1136
- 'det_hist': 'object',
1137
- 'hit_ratio': 'float32',
1138
- 'cons_det': 'int32',
1139
- 'cons_length': 'float32',
1140
- 'likelihood_T': 'float32',
1141
- 'likelihood_F': 'float32',
1142
- 'bout_no':'int32',
1143
- 'overlapping':'int32'})
1712
+ # Merge with presence data
1713
+ if presence_exists:
1714
+ rec_dat = rec_dat.merge(presence_data, on=['freq_code', 'epoch', 'rec_id'], how='left')
1715
+ rec_dat['bout_no'] = rec_dat['bout_no'].fillna(0).astype(int)
1716
+ else:
1717
+ rec_dat['bout_no'] = 0
1718
+
1719
+ # Merge with overlap data
1720
+ if overlap_exists:
1721
+ rec_dat = rec_dat.merge(overlap_data, on=['freq_code', 'epoch', 'rec_id'], how='left')
1722
+ rec_dat['overlapping'] = rec_dat['overlapping'].fillna(0).astype(int)
1723
+ # Add ambiguous_overlap if it exists in overlap data
1724
+ if 'ambiguous_overlap' in overlap_data.columns:
1725
+ rec_dat['ambiguous_overlap'] = rec_dat['ambiguous_overlap'].fillna(0).astype('float32')
1726
+ else:
1727
+ rec_dat['ambiguous_overlap'] = np.float32(0)
1728
+ else:
1729
+ rec_dat['overlapping'] = 0
1730
+ rec_dat['ambiguous_overlap'] = np.float32(0)
1731
+
1732
+ # Filter out overlapping detections (keep only overlapping=0)
1733
+ before_filter = len(rec_dat)
1734
+ rec_dat = rec_dat[rec_dat['overlapping'] != 1]
1735
+ after_filter = len(rec_dat)
1736
+ logger.debug(f" Filtered {before_filter - after_filter} overlapping detections")
1737
+
1738
+ logger.debug(f" After presence/overlap merge: {len(rec_dat)} detections")
1739
+
1740
+ # Check for required columns
1741
+ required_columns = ['freq_code', 'rec_id', 'epoch', 'time_stamp', 'power', 'noise_ratio',
1742
+ 'lag', 'det_hist', 'hit_ratio', 'cons_det', 'cons_length',
1743
+ 'likelihood_T', 'likelihood_F', 'bout_no', 'overlapping', 'ambiguous_overlap']
1744
+
1745
+ missing_columns = [col for col in required_columns if col not in rec_dat.columns]
1746
+ if missing_columns:
1747
+ logger.error(f" Required columns missing: {missing_columns}")
1748
+ continue
1749
+
1750
+ # Sort by freq code and epoch
1751
+ rec_dat = rec_dat.sort_values(by=['freq_code', 'epoch'], ascending=[True, True])
1752
+
1753
+ # Keep only the necessary columns (including handling missing columns)
1754
+ available_columns = [col for col in required_columns if col in rec_dat.columns]
1755
+ rec_dat = rec_dat[available_columns]
1756
+
1757
+ # Ensure correct data types
1758
+ rec_dat = rec_dat.astype({
1759
+ 'freq_code': 'object',
1760
+ 'epoch': 'int64',
1761
+ 'rec_id': 'object',
1762
+ 'time_stamp': 'datetime64[ns]',
1763
+ 'power': 'float32',
1764
+ 'noise_ratio': 'float32',
1765
+ 'lag': 'float32',
1766
+ 'det_hist': 'object',
1767
+ 'hit_ratio': 'float32',
1768
+ 'cons_det': 'int32',
1769
+ 'cons_length': 'float32',
1770
+ 'likelihood_T': 'float32',
1771
+ 'likelihood_F': 'float32',
1772
+ 'bout_no': 'int32',
1773
+ 'overlapping': 'int32',
1774
+ 'ambiguous_overlap': 'float32'
1775
+ })
1776
+
1777
+ # Show record counts
1778
+ logger.debug(f" Final: {len(rec_dat)} detections for {rec}")
1779
+ print(f"[recaptures] {rec}: compiled {len(rec_dat)} rows (overlapping={rec_dat['overlapping'].sum()}, bouts={rec_dat['bout_no'].max()})", flush=True)
1780
+
1781
+ # Append to the HDF5 file
1782
+ with pd.HDFStore(self.db, mode='a') as store:
1783
+ store.append(key='recaptures', value=rec_dat, format='table',
1784
+ index=False, min_itemsize={'freq_code': 20, 'rec_id': 20, 'det_hist': 20},
1785
+ append=True, chunksize=1000000, data_columns=True)
1786
+
1787
+ logger.info(f" ✓ Recaps for {rec} compiled and written to HDF5")
1788
+ print(f"[recaptures] ✓ {rec} written to database", flush=True)
1789
+ # append heartbeat line
1790
+ try:
1791
+ with open(heartbeat_path, 'a') as _hb:
1792
+ _hb.write(f"{datetime.datetime.now().isoformat()} rec={rec} rows={len(rec_dat)}\n")
1793
+ except OSError as e:
1794
+ raise RuntimeError(
1795
+ f"Failed to write heartbeat for receiver {rec} to '{heartbeat_path}': {e}"
1796
+ ) from e
1797
+
1798
+ else:
1799
+ # Loop over each receiver in self.receivers
1800
+ for rec in tqdm(self.receivers.index, desc="Processing PIT receivers", unit="receiver"):
1801
+ logger.info(f" Processing {rec} (PIT study)...")
1802
+
1803
+ # Read PIT data (already parsed from your text files) from /raw_data in HDF5
1804
+ try:
1805
+ pit_data = pd.read_hdf(self.db, key='raw_data')
1806
+ except KeyError:
1807
+ logger.error(" No 'raw_data' key found in HDF5 file")
1808
+ continue
1809
+
1810
+ # Filter rows so that only the specified receiver is kept
1811
+ pit_data = pit_data[pit_data['rec_id'] == rec]
1812
+ logger.debug(f" Filtered PIT data: {len(pit_data)} detections")
1813
+
1814
+ # Add any missing columns to align with the acoustic (non-PIT) columns
1815
+ missing_cols = [
1816
+ 'lag', 'det_hist', 'hit_ratio', 'cons_det', 'cons_length',
1817
+ 'likelihood_T', 'likelihood_F', 'bout_no', 'overlapping', 'ambiguous_overlap'
1818
+ ]
1819
+ for col in missing_cols:
1820
+ if col not in pit_data.columns:
1821
+ if col == 'ambiguous_overlap':
1822
+ pit_data[col] = np.float32(0)
1823
+ else:
1824
+ pit_data[col] = 0
1825
+
1826
+ # Check if 'presence' exists before trying to read it
1827
+ with pd.HDFStore(self.db, mode='r') as store:
1828
+ presence_exists = 'presence' in store.keys()
1829
+
1830
+ if presence_exists:
1831
+ try:
1832
+ presence_data = dd.read_hdf(self.db, key='presence')
1833
+ presence_data = presence_data[presence_data['rec_id'] == rec].compute()
1834
+ presence_data = presence_data[presence_data['freq_code'].isin(self.tags[self.tags.tag_type=='study'].index)]
1835
+ presence_data = presence_data[['freq_code', 'epoch', 'rec_id', 'bout_no']]
1144
1836
 
1145
- with pd.HDFStore(self.db, mode='a') as store:
1146
- store.append(key = 'recaptures',
1147
- value = rec_dat,
1148
- format = 'table',
1149
- index = False,
1150
- min_itemsize = {'freq_code':20,
1151
- 'rec_id':20,
1152
- 'det_hist':20},
1153
- append = True,
1154
- chunksize = 1000000,
1155
- data_columns = True)
1156
-
1157
- print ('recaps for fish %s at receiver %s compiled' %(fish,rec))
1158
-
1159
- tbl_recaptures = pd.read_hdf(self.db,key = 'recaptures')
1160
- tbl_recaptures.to_csv(os.path.join(self.output_dir,'recaptures.csv'))
1837
+ if not presence_data.empty:
1838
+ pit_data = pit_data.merge(presence_data, on=['freq_code', 'epoch', 'rec_id'], how='left')
1839
+ pit_data['bout_no'] = pit_data['bout_no'].fillna(0).astype(int)
1840
+ except KeyError:
1841
+ logger.warning(f" No presence data found for {rec}, skipping presence merge")
1842
+ else:
1843
+ logger.warning(" 'presence' key not found in HDF5, skipping presence merge")
1844
+
1845
+ # Check if 'overlapping' exists before trying to read it
1846
+ with pd.HDFStore(self.db, mode='r') as store:
1847
+ overlap_exists = 'overlapping' in store.keys()
1848
+
1849
+ if overlap_exists:
1850
+ try:
1851
+ overlap_data = dd.read_hdf(self.db, key='overlapping')
1852
+ overlap_data = overlap_data[overlap_data['rec_id'] == rec].compute()
1853
+ overlap_data = overlap_data[overlap_data['freq_code'].isin(self.tags[self.tags.tag_type=='study'].index)]
1854
+ # Aggregate: take max for both overlapping and ambiguous_overlap
1855
+ agg_dict = {'overlapping': 'max'}
1856
+ if 'ambiguous_overlap' in overlap_data.columns:
1857
+ agg_dict['ambiguous_overlap'] = 'max'
1858
+ overlap_data = overlap_data.groupby(['freq_code', 'epoch', 'rec_id']).agg(agg_dict).reset_index()
1859
+
1860
+ if not overlap_data.empty:
1861
+ pit_data = pit_data.merge(overlap_data, on=['freq_code', 'epoch', 'rec_id'], how='left')
1862
+ pit_data['overlapping'] = pit_data['overlapping'].fillna(0).astype(int)
1863
+ if 'ambiguous_overlap' in overlap_data.columns:
1864
+ pit_data['ambiguous_overlap'] = pit_data['ambiguous_overlap'].fillna(0).astype('float32')
1865
+ except KeyError:
1866
+ logger.warning(f" No overlap data found for {rec}, skipping overlap merge")
1867
+ else:
1868
+ logger.warning(" 'overlapping' key not found in HDF5, skipping overlap merge")
1869
+
1870
+ # Sort PIT data by freq_code and epoch
1871
+ pit_data = pit_data.sort_values(['freq_code', 'epoch'])
1872
+
1873
+ # Keep only the columns needed in `recaptures`
1874
+ required_columns = [
1875
+ 'freq_code', 'rec_id', 'epoch', 'time_stamp', 'power', 'noise_ratio', 'lag', 'det_hist',
1876
+ 'hit_ratio', 'cons_det', 'cons_length', 'likelihood_T', 'likelihood_F', 'bout_no', 'overlapping', 'ambiguous_overlap'
1877
+ ]
1878
+ pit_data = pit_data[[c for c in required_columns if c in pit_data.columns]]
1879
+
1880
+ # Convert each column to the correct dtype
1881
+ dtypes_map = {
1882
+ 'freq_code': 'object', 'rec_id': 'object', 'epoch': 'int64',
1883
+ 'time_stamp': 'datetime64[ns]', 'power': 'float32', 'noise_ratio': 'float32',
1884
+ 'lag': 'float32', 'det_hist': 'object', 'hit_ratio': 'float32',
1885
+ 'cons_det': 'int32', 'cons_length': 'float32', 'likelihood_T': 'float32',
1886
+ 'likelihood_F': 'float32', 'bout_no': 'int32', 'overlapping': 'int32', 'ambiguous_overlap': 'float32'
1887
+ }
1888
+ for col, dt in dtypes_map.items():
1889
+ if col in pit_data.columns:
1890
+ pit_data[col] = pit_data[col].astype(dt)
1891
+
1892
+ # Show record counts BEFORE prompting
1893
+ print(f"[recaptures] {rec}: compiled {len(pit_data)} PIT rows (overlapping={pit_data['overlapping'].sum()}, bouts={pit_data['bout_no'].max()})", flush=True)
1161
1894
 
1895
+ # Confirm with user before appending PIT data into 'recaptures'
1896
+ confirm = str(self._prompt("Import PIT data? (yes/no): ", default="no")).strip().lower()
1897
+ if confirm != 'yes':
1898
+ logger.info("PIT data import canceled by user")
1899
+ return
1900
+
1901
+ # Convert 'det_hist' to string to avoid serialization issues
1902
+ if 'det_hist' in pit_data.columns:
1903
+ pit_data['det_hist'] = pit_data['det_hist'].astype(str)
1904
+
1905
+ # Append PIT data to 'recaptures' in HDF5
1906
+ with pd.HDFStore(self.db, mode='a') as store:
1907
+ store.append(
1908
+ key='recaptures',
1909
+ value=pit_data,
1910
+ format='table',
1911
+ index=False,
1912
+ min_itemsize={'freq_code': 20, 'rec_id': 20, 'det_hist': 20},
1913
+ append=True,
1914
+ chunksize=1000000,
1915
+ data_columns=True
1916
+ )
1917
+
1918
+ logger.info(f" ✓ PIT recaps for {rec} compiled and written to HDF5")
1919
+ print(f"[recaptures] ✓ {rec} PIT data written to database", flush=True)
1920
+ try:
1921
+ with open(heartbeat_path, 'a') as _hb:
1922
+ _hb.write(f"{datetime.datetime.now().isoformat()} pit_rec={rec} rows={len(pit_data)}\n")
1923
+ except OSError as e:
1924
+ raise RuntimeError(
1925
+ f"Failed to write PIT heartbeat for receiver {rec} to '{heartbeat_path}': {e}"
1926
+ ) from e
1927
+
1928
+
1929
+ if export:
1930
+ logger.info("Exporting recaptures to CSV...")
1931
+ print("[recaptures] exporting recaptures to CSV...", flush=True)
1932
+ rec_data = dd.read_hdf(self.db, 'recaptures').compute()
1933
+ rec_data.to_csv(os.path.join(self.output_dir,'recaptures.csv'), index=False)
1934
+ logger.info(f" ✓ Export complete: {os.path.join(self.output_dir,'recaptures.csv')}")
1935
+ print(f"[recaptures] ✓ Export complete: {os.path.join(self.output_dir,'recaptures.csv')}", flush=True)
1936
+ try:
1937
+ with open(heartbeat_path, 'a') as _hb:
1938
+ _hb.write(
1939
+ f"DONE {datetime.datetime.now().isoformat()} export="
1940
+ f"{os.path.join(self.output_dir, 'recaptures.csv')}\n"
1941
+ )
1942
+ except OSError as e:
1943
+ raise RuntimeError(
1944
+ f"Failed to write heartbeat completion to '{heartbeat_path}': {e}"
1945
+ ) from e
1946
+
1162
1947
 
1948
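Because the recaptures table is appended with data_columns=True, it can be filtered inside the HDF5 store instead of being loaded whole; a read-back sketch with the same assumed proj instance and a placeholder tag code. Progress of a long build can meanwhile be followed by tailing build/recaptures_heartbeat.log under the project directory.

    import pandas as pd
    recaps = pd.read_hdf(proj.db, key='recaptures',
                         where='(freq_code == "164.480 25") & (overlapping == 0)')
    print(recaps.groupby('rec_id')['epoch'].agg(['min', 'max', 'count']))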
+ def undo_recaptures(self):
1949
+ """
1950
+ Remove recaptures data from HDF5 file.
1951
+ Note: File size won't shrink until you manually repack the database.
1952
+ """
1953
+ logger.info("Removing recaptures from database")
1954
+ with pd.HDFStore(self.db, mode='a') as store:
1955
+ if 'recaptures' in store:
1956
+ store.remove('recaptures')
1957
+ logger.info(" ✓ Recaptures key removed")
1958
+ else:
1959
+ logger.info(" No recaptures key found")
1960
+
1961
+ logger.info(" Data logically deleted (file size unchanged)")
1962
+ logger.info(" To reclaim disk space, manually repack after all deletions complete")
1963
+
1964
+ def undo_overlap(self):
1965
+ """
1966
+ Remove overlapping data from HDF5 file.
1967
+ Note: File size won't shrink until you manually repack the database.
1968
+ """
1969
+ logger.info("Removing overlapping from database")
1970
+ with pd.HDFStore(self.db, mode='a') as store:
1971
+ if 'overlapping' in store:
1972
+ store.remove('overlapping')
1973
+ logger.info(" ✓ Overlapping key removed")
1974
+ else:
1975
+ logger.info(" No overlapping key found")
1976
+
1977
+ logger.info(" Data logically deleted (file size unchanged)")
1978
+ logger.info(" To reclaim disk space, manually repack after all deletions complete")
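Both undo helpers only delete keys logically, so a typical cleanup pairs them with a repack; a sketch with the same assumed proj instance:

    proj.undo_recaptures()
    proj.undo_overlap()
    proj.db = proj.repack_database()   # rewrite the file to actually reclaim the space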
1163
1979
 
1980
+ def new_db_version(self, output_h5):
1981
+ """
1982
+ Create a new version of the working HDF5 database.
1983
+
1984
+ This function creates a copy of the existing working database, allowing you to
1985
+ backtrack or branch your analysis. If there are keys that are in error or conflict
1986
+ with the current understanding of the system, this function helps you remove them
1987
+ from the new version of the database.
1988
+
1989
+ Parameters:
1990
+ output_h5 (str): The file name for the new HDF5 file.
1991
+ """
1992
+ logger.info(f"Creating new database version: {output_h5}")
1993
+
1994
+ # Copy the HDF5 file
1995
+ shutil.copyfile(self.db, output_h5)
1996
+ logger.info(" Database copied")
1997
+
1998
+ # Open the copied HDF5 file
1999
+ with h5py.File(output_h5, 'r+') as hdf:
2000
+ # List all keys in the file
2001
+ keys = list(hdf.keys())
2002
+ logger.info(f" Keys in HDF5 file: {', '.join(keys)}")
2003
+
2004
+ # Ask the user to input the keys they want to modify
2005
+ selected_keys = str(self._prompt("Enter the keys you want to modify, separated by commas: ", default="")).split(',')
2006
+
2007
+ # Clean up the input (remove whitespace)
2008
+ selected_keys = [key.strip() for key in selected_keys]
2009
+
2010
+ for key in selected_keys:
2011
+ if key in hdf:
2012
+ logger.info(f" Processing key: '{key}'...")
2013
+
2014
+ # If it's a group, recursively delete all datasets (subkeys)
2015
+ if isinstance(hdf[key], h5py.Group):
2016
+ logger.info(f" Key '{key}' is a group, deleting all subkeys...")
2017
+ for subkey in list(hdf[key].keys()):
2018
+ logger.debug(f" Removing subkey: '{key}/{subkey}'")
2019
+ del hdf[key][subkey]
2020
+ logger.info(f" All subkeys under '{key}' deleted")
2021
+ else:
2022
+ # It's a dataset, clear the data in the DataFrame
2023
+ logger.info(f" Clearing data for dataset key: '{key}'")
2024
+ df = pd.read_hdf(output_h5, key)
2025
+ df.drop(df.index, inplace=True)
2026
+ df.to_hdf(output_h5, key, mode='a', format='table', data_columns=True)
2027
+ logger.info(f" Data cleared for key: '{key}'")
2028
+ else:
2029
+ logger.warning(f" Key '{key}' not found in HDF5 file")
2030
+
2031
+ # Update the project's database to the new copied database
2032
+ self.db = output_h5
2033
+ logger.info(f"✓ New database version created: {output_h5}")
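A usage sketch for new_db_version with the same assumed proj instance; the file name is a placeholder, and the method prompts interactively for the keys to clear before repointing proj.db at the copy:

    proj.new_db_version('my_study_v2.h5')   # answer the prompt with e.g. 'recaptures, presence'
    # proj.db now refers to the new copy; the original file is left untouched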
1164
2034
 
1165
2035
 
1166
2036