pymast 0.0.6__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pymast/__init__.py +31 -2
- pymast/fish_history.py +59 -6
- pymast/formatter.py +886 -548
- pymast/logger.py +58 -0
- pymast/naive_bayes.py +116 -9
- pymast/overlap_removal.py +2327 -490
- pymast/parsers.py +1091 -208
- pymast/predictors.py +302 -116
- pymast/radio_project.py +1382 -512
- pymast/validation.py +224 -0
- pymast-1.0.1.dist-info/METADATA +636 -0
- pymast-1.0.1.dist-info/RECORD +15 -0
- {pymast-0.0.6.dist-info → pymast-1.0.1.dist-info}/WHEEL +1 -1
- pymast/table_merge.py +0 -154
- pymast-0.0.6.dist-info/METADATA +0 -19
- pymast-0.0.6.dist-info/RECORD +0 -14
- {pymast-0.0.6.dist-info → pymast-1.0.1.dist-info/licenses}/LICENSE.txt +0 -0
- {pymast-0.0.6.dist-info → pymast-1.0.1.dist-info}/top_level.txt +0 -0
pymast/radio_project.py
CHANGED
@@ -1,7 +1,86 @@
 # -*- coding: utf-8 -*-
+"""
+Radio telemetry project management and HDF5 database operations.
 
-
-
+This module provides the `radio_project` class, the central object for managing
+radio telemetry studies. It handles project initialization, data import, metadata
+storage, and database operations using HDF5 format.
+
+Core Responsibilities
+---------------------
+- **Project Initialization**: Create HDF5 database with standardized table structure
+- **Data Import**: Batch import from multiple receiver types and file formats
+- **Metadata Management**: Store tags, receivers, recaptures, nodes, lines
+- **Recapture Generation**: Process raw detections into spatiotemporal recaptures
+- **Query Interface**: Retrieve fish tracks, detection statistics, project metadata
+
+HDF5 Database Structure
+-----------------------
+The project database contains these primary tables:
+
+- `/raw_data`: Imported receiver detections (time_stamp, freq_code, power, etc.)
+- `/tblMasterTag`: Tag metadata (freq_code, pulse_rate, tag_type, release info)
+- `/tblMasterReceiver`: Receiver metadata (rec_id, rec_type, latitude, longitude)
+- `/recaptures`: Processed detections linked to spatial locations and tags
+- `/nodes`: Spatial nodes for state-space modeling
+- `/lines`: Connectivity between nodes for movement modeling
+
+Classification and Filtering Tables:
+
+- `/training`: Hand-labeled detections for classifier training
+- `/test`: Detections scored by Naive Bayes classifier
+- `/overlapping`: Overlapping detection decisions from overlap_reduction
+- `/bouts`: Bout summaries from DBSCAN clustering
+- `/presence`: Presence/absence by bout and receiver
+
+Statistical Model Tables:
+
+- `/cjs`: Cormack-Jolly-Seber capture history
+- `/lrdr`: Live-recapture dead-recovery format
+- `/tte`: Time-to-event format for survival analysis
+
+Typical Usage
+-------------
+>>> from pymast.radio_project import radio_project
+>>>
+>>> # Initialize new project
+>>> proj = radio_project(
+...     project_dir='C:/projects/my_study',
+...     db_name='my_study.h5',
+...     rec_list='receivers.csv',
+...     tag_list='tags.csv',
+...     node_list='nodes.csv',
+...     line_list='lines.csv'
+... )
+>>>
+>>> # Import raw receiver data
+>>> proj.import_data(
+...     file_name='receiver_001.csv',
+...     receiver_make='ares',
+...     rec_id='REC001',
+...     scan_time=1.0,
+...     channels=1
+... )
+>>>
+>>> # Generate recaptures table
+>>> proj.make_recaptures_table()
+>>>
+>>> # Query fish tracks
+>>> tracks = proj.get_fish_tracks(freq_code='166.380 7')
+
+Notes
+-----
+- HDF5 format provides fast queries, compression, and hierarchical organization
+- All tables use indexed columns for performance (freq_code, rec_id, time_stamp)
+- Receiver imports are append-only (no overwrites unless db_dir deleted)
+- Project metadata stored in HDF5 attributes for provenance
+
+See Also
+--------
+parsers : Data import from various receiver formats
+overlap_removal : Detection filtering and bout analysis
+formatter : Statistical model output generation
+"""
 
 # import modules required for function dependencies
 import numpy as np
@@ -9,19 +88,38 @@ import pandas as pd
 import os
 import h5py
 import datetime
+import logging
 import pymast.naive_bayes as naive_bayes
 import pymast.parsers as parsers
 import pymast.predictors as predictors
 import matplotlib.pyplot as plt
 from matplotlib import rcParams
 from scipy import interpolate
+try:
+    from tqdm import tqdm
+except ImportError:
+    def tqdm(iterable, **kwargs):
+        return iterable
+import shutil
 import warnings
-
+import dask.dataframe as dd
+import dask.array as da
+try:
+    from dask_ml.cluster import KMeans
+    _KMEANS_IMPL = 'dask'
+except ImportError:
+    from sklearn.cluster import KMeans
+    _KMEANS_IMPL = 'sklearn'
+
+# Initialize logger
+logger = logging.getLogger('pymast.radio_project')
 
 font = {'family': 'serif','size': 6}
 rcParams['font.size'] = 6
 rcParams['font.family'] = 'serif'
 
+push = 'push'
+
 class radio_project():
     '''
     A class to manage and organize data and parameters for a Radio Telemetry project.
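Two of the new imports above are wrapped in fallbacks: tqdm degrades to a pass-through wrapper and KMeans is taken from dask_ml when available, otherwise from scikit-learn. A minimal sketch of the same optional-dependency pattern, runnable on its own (the demo loop is illustrative, not part of pymast):

    # Optional-dependency fallback pattern used by the new import block (sketch).
    try:
        from tqdm import tqdm              # real progress bar when tqdm is installed
    except ImportError:
        def tqdm(iterable, **kwargs):      # no-op stand-in: the same call sites keep working
            return iterable

    for _ in tqdm(range(3), desc="files", unit="file"):
        pass                               # body runs identically with or without tqdm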
@@ -46,28 +144,91 @@ class radio_project():
     '''
 
     def __init__(self, project_dir, db_name, detection_count, duration, tag_data, receiver_data, nodes_data = None):
-
-
+        """
+        Initialize a radio telemetry project for data management and analysis.
 
-
+        This constructor sets up the complete project infrastructure including:
+        - Directory structure for data, training files, and outputs
+        - HDF5 database for efficient data storage
+        - Tag, receiver, and node metadata
 
-        Parameters
-
-
-
-
-
-
-
-
-
-
+        Parameters
+        ----------
+        project_dir : str
+            Root directory for the project. Recommended to avoid spaces in path.
+        db_name : str
+            Name of the HDF5 database file (without .h5 extension).
+        detection_count : int
+            Number of detections to include in detection history window for
+            predictor calculation. Typical values: 3-7.
+        duration : float
+            Time window in seconds for noise ratio calculation.
+            Typical values: 1.0-5.0 seconds.
+        tag_data : pandas.DataFrame
+            Master tag table with required columns:
+            - freq_code (str): Unique frequency-code combination
+            - pulse_rate (float): Seconds between tag pulses
+            - tag_type (str): 'study', 'BEACON', or 'TEST'
+            - rel_date (datetime): Release date and time
+            See docs/API_REFERENCE.md for complete schema.
+        receiver_data : pandas.DataFrame
+            Master receiver table with required columns:
+            - rec_id (str): Unique receiver identifier
+            - rec_type (str): Receiver type ('srx600', 'srx800', etc.)
+            - node (str): Associated network node ID
+        nodes_data : pandas.DataFrame, optional
+            Network nodes table with columns:
+            - node (str): Unique node identifier
+            - X (int): X coordinate for visualization
+            - Y (int): Y coordinate for visualization
+            Required for movement analysis and overlap removal.
+
+        Raises
+        ------
+        ValueError
+            If required columns are missing from input DataFrames.
+        OSError
+            If project directory cannot be created.
+
+        Examples
+        --------
+        >>> import pandas as pd
+        >>> from pymast.radio_project import radio_project
+        >>>
+        >>> # Load input data
+        >>> tags = pd.read_csv('tblMasterTag.csv')
+        >>> receivers = pd.read_csv('tblMasterReceiver.csv')
+        >>> nodes = pd.read_csv('tblNodes.csv')
+        >>>
+        >>> # Create project
+        >>> project = radio_project(
+        ...     project_dir='/path/to/project',
+        ...     db_name='my_study',
+        ...     detection_count=5,
+        ...     duration=1.0,
+        ...     tag_data=tags,
+        ...     receiver_data=receivers,
+        ...     nodes_data=nodes
+        ... )
+
+        Notes
+        -----
+        The project directory structure will be created as:
+        - project_dir/
+            - Data/ (raw data storage)
+            - Training_Files/ (receiver data files)
+            - Output/ (processed data and exports)
+                - Figures/ (generated plots)
+            - my_study.h5 (HDF5 database)
+        """
         # set model parameters
         self.project_dir = project_dir
         self.db_name = db_name
         self.db = os.path.join(project_dir,'%s.h5'%(db_name))
         self.tags = tag_data
         self.study_tags = self.tags[self.tags.tag_type == 'study'].freq_code.values
+        self.test_tags = self.tags[self.tags.tag_type == 'TEST'].freq_code.values
+        self.beacon_tags = self.tags[self.tags.tag_type == 'BEACON'].freq_code.values
         self.tags.set_index('freq_code', inplace = True)
         self.receivers = receiver_data
         self.receivers.set_index('rec_id', inplace = True)
@@ -96,6 +257,10 @@ class radio_project():
             os.makedirs(self.figures_dir)
         self.figure_ws = os.path.join(project_dir,'Output','Figures')
 
+        # When running in automated/non-interactive mode set this flag True to avoid input() prompts
+        # By default, leave interactive (False) so user can respond to prompts
+        self.non_interactive = False
+
         # create a project database and write initial arrays to HDF
         self.initialize_hdf5()
 
@@ -111,6 +276,115 @@ class radio_project():
             self.tags.to_hdf(self.db, key='/project_setup/tags', mode='a')
             self.receivers.to_hdf(self.db, key='/project_setup/receivers', mode='a')
             self.nodes.to_hdf(self.db, key='/project_setup/nodes', mode='a')
+        else:
+            # Project already exists - check for new tags and merge if needed
+            try:
+                existing_tags = pd.read_hdf(self.db, key='/project_setup/tags')
+
+                # Reset index on incoming tags for comparison (it gets set later in __init__)
+                incoming_tags = self.tags.copy()
+                if incoming_tags.index.name == 'freq_code':
+                    incoming_tags = incoming_tags.reset_index()
+
+                # Find new tags not in existing database
+                if 'freq_code' in existing_tags.columns:
+                    existing_freq_codes = set(existing_tags['freq_code'])
+                else:
+                    existing_freq_codes = set(existing_tags.index)
+
+                incoming_freq_codes = set(incoming_tags['freq_code'])
+                new_freq_codes = incoming_freq_codes - existing_freq_codes
+
+                if new_freq_codes:
+                    print(f"Found {len(new_freq_codes)} new tags to add to database: {sorted(new_freq_codes)}")
+
+                    # Merge existing and new tags
+                    new_tags_only = incoming_tags[incoming_tags['freq_code'].isin(new_freq_codes)]
+
+                    # Ensure existing_tags has freq_code as column, not index
+                    if existing_tags.index.name == 'freq_code':
+                        existing_tags = existing_tags.reset_index()
+
+                    merged_tags = pd.concat([existing_tags, new_tags_only], ignore_index=True)
+
+                    # Remove the old tags table and write merged version
+                    with pd.HDFStore(self.db, mode='a') as store:
+                        if '/project_setup/tags' in store:
+                            store.remove('/project_setup/tags')
+                        store.put('/project_setup/tags',
+                                  merged_tags,
+                                  format='table',
+                                  data_columns=True)
+
+                    # Update self.tags with merged data
+                    self.tags = merged_tags.copy()
+                    self.tags.set_index('freq_code', inplace=True)
+
+                    # Update tag type arrays
+                    self.study_tags = self.tags[self.tags.tag_type == 'study'].index.values
+                    self.test_tags = self.tags[self.tags.tag_type == 'TEST'].index.values
+                    self.beacon_tags = self.tags[self.tags.tag_type == 'BEACON'].index.values
+
+                    print(f"Successfully added {len(new_freq_codes)} new tags to database.")
+                else:
+                    print("No new tags found - database is up to date.")
+
+            except (KeyError, FileNotFoundError):
+                # Tags table doesn't exist yet, write it
+                print("Tags table not found in database, creating it now.")
+                self.tags.to_hdf(self.db, key='/project_setup/tags', mode='a')
+
+            # Check for new receivers and merge if needed
+            try:
+                existing_receivers = pd.read_hdf(self.db, key='/project_setup/receivers')
+
+                # Reset index on incoming receivers for comparison
+                incoming_receivers = self.receivers.copy()
+                if incoming_receivers.index.name == 'rec_id':
+                    incoming_receivers = incoming_receivers.reset_index()
+
+                # Find new receivers not in existing database
+                if 'rec_id' in existing_receivers.columns:
+                    existing_rec_ids = set(existing_receivers['rec_id'])
+                else:
+                    existing_rec_ids = set(existing_receivers.index)
+
+                incoming_rec_ids = set(incoming_receivers['rec_id'])
+                new_rec_ids = incoming_rec_ids - existing_rec_ids
+
+                if new_rec_ids:
+                    print(f"Found {len(new_rec_ids)} new receivers to add to database: {sorted(new_rec_ids)}")
+
+                    # Merge existing and new receivers
+                    new_receivers_only = incoming_receivers[incoming_receivers['rec_id'].isin(new_rec_ids)]
+
+                    # Ensure existing_receivers has rec_id as column, not index
+                    if existing_receivers.index.name == 'rec_id':
+                        existing_receivers = existing_receivers.reset_index()
+
+                    merged_receivers = pd.concat([existing_receivers, new_receivers_only], ignore_index=True)
+
+                    # Remove the old receivers table and write merged version
+                    with pd.HDFStore(self.db, mode='a') as store:
+                        if '/project_setup/receivers' in store:
+                            store.remove('/project_setup/receivers')
+                        store.put('/project_setup/receivers',
+                                  merged_receivers,
+                                  format='table',
+                                  data_columns=True)
+
+                    # Update self.receivers with merged data
+                    self.receivers = merged_receivers.copy()
+                    self.receivers.set_index('rec_id', inplace=True)
+
+                    print(f"Successfully added {len(new_rec_ids)} new receivers to database.")
+                else:
+                    print("No new receivers found - database is up to date.")
+
+            except (KeyError, FileNotFoundError):
+                # Receivers table doesn't exist yet, write it
+                print("Receivers table not found in database, creating it now.")
+                self.receivers.to_hdf(self.db, key='/project_setup/receivers', mode='a')
 
         if 'raw_data' not in hdf5:
             hdf5.create_group("raw_data")
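The new else branch above grows the stored /project_setup tables instead of overwriting them: it takes the set difference of key columns, appends only unseen rows, and rewrites the node in table format. A compact sketch of that merge step with in-memory frames (the example rows are made up):

    import pandas as pd

    existing = pd.DataFrame({'freq_code': ['164.123 45'], 'pulse_rate': [3.0]})
    incoming = pd.DataFrame({'freq_code': ['164.123 45', '166.380 7'],
                             'pulse_rate': [3.0, 5.0]})

    # Keys supplied now but not yet stored
    new_codes = set(incoming['freq_code']) - set(existing['freq_code'])

    # Append only the genuinely new rows; stored rows are left untouched
    merged = pd.concat([existing, incoming[incoming['freq_code'].isin(new_codes)]],
                       ignore_index=True)

    print(sorted(new_codes))  # ['166.380 7']
    print(len(merged))        # 2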
@@ -131,6 +405,22 @@ class radio_project():
             hdf5.create_group('recaptures')
 
         hdf5.close()
+
+    def _prompt(self, prompt_text, default="no"):
+        """Centralized prompt helper — returns default ONLY when non_interactive is True.
+
+        By default (non_interactive=False), this will prompt the user interactively.
+        Set project.non_interactive = True to auto-answer with defaults.
+        """
+        if self.non_interactive:
+            logger.debug(f"Non-interactive mode: auto-answering '{prompt_text}' with '{default}'")
+            return default
+        try:
+            return input(prompt_text)
+        except (EOFError, OSError) as exc:
+            raise RuntimeError(
+                "Input prompt failed. Set project.non_interactive = True to use defaults."
+            ) from exc
 
     def telem_data_import(self,
                           rec_id,
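The _prompt helper above funnels every interactive question through one method, so scripted runs can set non_interactive = True and receive the default answer instead of blocking on input(). A self-contained stand-in showing the contract (the _Demo class is illustrative only):

    import logging

    logger = logging.getLogger('pymast.radio_project')

    class _Demo:
        """Illustrative stand-in with the same _prompt contract as radio_project."""
        non_interactive = True

        def _prompt(self, prompt_text, default="no"):
            if self.non_interactive:
                logger.debug("auto-answering '%s' with '%s'", prompt_text, default)
                return default
            return input(prompt_text)

    print(_Demo()._prompt("Do you need another classification iteration? (yes/no): "))  # -> no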
@@ -139,13 +429,76 @@ class radio_project():
                           db_dir,
                           scan_time = 1,
                           channels = 1,
-                          ant_to_rec_dict = None
+                          ant_to_rec_dict = None,
+                          ka_format = False):
+        """
+        Import raw telemetry data from receiver files into the project database.
+
+        Parameters
+        ----------
+        rec_id : str
+            Receiver ID (must exist in receiver_data)
+        rec_type : str
+            Receiver type. Supported: 'srx600', 'srx800', 'srx1200',
+            'orion', 'ares', 'VR2'
+        file_dir : str
+            Directory containing raw data files
+        db_dir : str
+            Path to HDF5 database file
+        scan_time : float, optional
+            Channel scan time in seconds (default: 1)
+        channels : int, optional
+            Number of channels (default: 1)
+        ant_to_rec_dict : dict, optional
+            Mapping of antenna IDs to receiver IDs
+        ka_format : bool, optional
+            Use Kleinschmidt Associates format (default: False)
+
+        Raises
+        ------
+        ValueError
+            If rec_type is not supported or rec_id not found
+        FileNotFoundError
+            If file_dir doesn't exist or contains no data files
+        """
+        # Validate receiver type
+        VALID_REC_TYPES = ['srx600', 'srx800', 'srx1200', 'orion', 'ares', 'VR2','PIT']
+        if rec_type not in VALID_REC_TYPES:
+            raise ValueError(
+                f"Unsupported receiver type: '{rec_type}'. "
+                f"Supported types: {', '.join(VALID_REC_TYPES)}"
+            )
+
+        # Validate receiver ID
+        if rec_id not in self.receivers.index:
+            raise ValueError(
+                f"Receiver '{rec_id}' not found in receiver_data. "
+                f"Available receivers: {', '.join(self.receivers.index)}"
+            )
+
+        # Validate directory exists
+        if not os.path.exists(file_dir):
+            logger.error(f"Data directory not found: {file_dir}")
+            raise FileNotFoundError(
+                f"Data directory not found: {file_dir}. "
+                f"Expected location: {self.training_dir}"
+            )
+
+        logger.info(f"Importing data for receiver {rec_id} (type: {rec_type})")
+        logger.info(f" Data directory: {file_dir}")
+
         # list raw data files
         tFiles = os.listdir(file_dir)
 
+        if not tFiles:
+            logger.warning(f"No files found in {file_dir}")
+            return
+
+        logger.info(f" Found {len(tFiles)} file(s) to import")
+
         # for every file call the correct text parser and import
-        for f in tFiles:
-
+        for i, f in enumerate(tqdm(tFiles, desc=f"Importing {rec_id}", unit="file"), 1):
+            logger.debug(f" Processing file {i}/{len(tFiles)}: {f}")
             # get the complete file directory
             f_dir = os.path.join(file_dir,f)
 
@@ -156,7 +509,7 @@ class radio_project():
                 parsers.srx800(f_dir, db_dir, rec_id, self.study_tags, scan_time = scan_time, channels = channels, ant_to_rec_dict = ant_to_rec_dict)
 
             elif rec_type == 'srx1200':
-                parsers.srx1200(f_dir, db_dir, rec_id, self.study_tags, scan_time = scan_time, channels = channels, ant_to_rec_dict = ant_to_rec_dict)
+                parsers.srx1200(f_dir, db_dir, rec_id, self.study_tags, scan_time = scan_time, channels = channels, ant_to_rec_dict = ant_to_rec_dict, ka_format = 'True')
 
             elif rec_type == 'orion':
                 parsers.orion_import(f_dir,db_dir,rec_id, self.study_tags, scan_time = scan_time, channels = channels, ant_to_rec_dict = ant_to_rec_dict)
@@ -166,14 +519,26 @@ class radio_project():
 
             elif rec_type == 'ares':
                 parsers.ares(f_dir,db_dir,rec_id, self.study_tags, scan_time = scan_time, channels = channels, ant_to_rec_dict = ant_to_rec_dict)
+
+            elif rec_type == 'PIT':
+                parsers.PIT(f_dir,db_dir,rec_id, self.study_tags, scan_time = scan_time, channels = channels, ant_to_rec_dict = ant_to_rec_dict)
+
+            elif rec_type == 'PIT_Multiple':
+                parsers.PIT_Multiple(f_dir, db_dir,
+                                     study_tags=self.study_tags,
+                                     ant_to_rec_dict=ant_to_rec_dict,
+                                     scan_time=scan_time,
+                                     channels=channels)
+
             else:
-
-
-                print ("File %s imported"%(f))
+                logger.error(f"No import routine for receiver type: {rec_type}")
+                raise ValueError(f"No import routine available for receiver type: {rec_type}")
 
-
+        logger.info(f"✓ Import complete for receiver {rec_id}: {len(tFiles)} file(s) processed")
 
     def get_fish(self, rec_id, train = True, reclass_iter = None):
+        logger.info(f"Getting fish for receiver {rec_id}")
+        logger.debug(f" Mode: {'training' if train else 'classification'}, Iteration: {reclass_iter}")
 
         tags_no_idx = self.tags.reset_index(drop = False)
 
@@ -182,6 +547,7 @@ class radio_project():
                               key = 'raw_data',
                               where = f'rec_id = "{rec_id}"')
             dat = pd.merge(dat, tags_no_idx, on='freq_code', how='left')
+            dat = dat[(dat.tag_type != 'TEST') & (dat.tag_type != 'BEACON')]
             dat = dat[(dat.tag_type != 'beacon') & (dat.tag_type != 'test')]
 
         elif reclass_iter == None and train == False:
@@ -189,7 +555,7 @@ class radio_project():
                               key = 'raw_data',
                               where = f'rec_id = "{rec_id}"')
             dat = pd.merge(dat, tags_no_idx, on='freq_code', how='left')
-            dat = dat[dat.tag_type == 'study']
+            dat = dat[(dat.tag_type == 'study') | (dat.tag_type == 'STUDY')]
 
         else:
             itr = reclass_iter -1
@@ -199,104 +565,153 @@ class radio_project():
             dat = pd.merge(dat, tags_no_idx, on='freq_code', how='left')
             dat = dat[dat.tag_type == 'study']
 
-
+        fish_list = dat.freq_code.unique()
+        logger.info(f" Found {len(fish_list)} unique fish")
+        return fish_list
+
+    def orphan_tags(self, return_rows=False):
+        """Return orphan tags or their recapture rows.
+
+        By default returns a sorted list of orphan `freq_code` strings (tags
+        present in `/recaptures` but missing from `/project_setup/tags`). If
+        `return_rows=True` returns the recaptures DataFrame rows for those tags.
+        """
+        recaps = pd.read_hdf(self.db, 'recaptures')
+        recaps['freq_code'] = recaps['freq_code'].astype(str)
+
+        master = self.tags.copy()
+        if master.index.name == 'freq_code':
+            master_codes = set(master.index.astype(str))
+        else:
+            master_codes = set(master['freq_code'].astype(str))
+
+        recap_codes = set(recaps['freq_code'].unique())
+        orphans = sorted(list(recap_codes - master_codes))
+
+        if return_rows:
+            if not orphans:
+                return pd.DataFrame(columns=recaps.columns)
+            return recaps[recaps['freq_code'].isin(orphans)].copy()
+        return orphans
 
     def train(self, freq_code, rec_id):
-
-
-
+        """
+        Train the Naive Bayes classifier using a specific tag at a receiver.
+
+        This method calculates predictor variables for all detections of the
+        specified tag and stores them in the training dataset. Training data
+        includes both known true positives (from beacon/test tags) and known
+        false positives (miscoded detections).
+
+        Parameters
+        ----------
+        freq_code : str
+            Frequency-code combination to train on (e.g., '164.123 45').
+            Must exist in the tag_data provided during initialization.
+        rec_id : str
+            Receiver ID where training data was collected.
+            Must exist in the receiver_data provided during initialization.
+
+        Returns
+        -------
+        None
+            Training data is written to HDF5 database at /trained key.
+
+        Raises
+        ------
+        KeyError
+            If freq_code or rec_id not found in project data.
+        ValueError
+            If insufficient data for training (e.g., no detections).
+
+        Examples
+        --------
+        >>> # Train on a single tag
+        >>> project.train('164.123 45', 'R01')
+
+        >>> # Train on all tags at a receiver
+        >>> fishes = project.get_fish(rec_id='R01')
+        >>> for fish in fishes:
+        ...     project.train(fish, 'R01')
+
+        See Also
+        --------
+        training_summary : Generate statistics and plots from training data
+        reclassify : Apply trained classifier to classify detections
+
+        Notes
+        -----
+        Predictor variables calculated:
+        - hit_ratio: Proportion of expected detections received
+        - cons_length: Maximum consecutive detection length
+        - noise_ratio: Ratio of miscoded to total detections
+        - power: Signal strength
+        - lag_diff: Second-order difference in detection timing
+        """
+        '''A class object for a training dataframe and related data objects.'''
 
-
-        at reciever (site) from the project database (projectDB).
-        '''
-        # pull raw data
+        # Pull raw data
         train_dat = pd.read_hdf(self.db,
                                 'raw_data',
-                                where
-
-        #
+                                where=f'(freq_code == "{freq_code}") & (rec_id == "{rec_id}")')
+
+        # Data management
         train_dat['time_stamp'] = pd.to_datetime(train_dat.time_stamp)
-        train_dat['epoch'] =
-        train_dat.sort_values(by
+        train_dat['epoch'] = (train_dat.time_stamp.astype('int64') // 10**9).astype('int64')
+        train_dat.sort_values(by='epoch', inplace=True)
 
-        train_dat.drop_duplicates(subset
-                                  keep = 'first',
-                                  inplace = True)
+        train_dat.drop_duplicates(subset='time_stamp', keep='first', inplace=True)
 
-        #
-
-
-
-
-        #
+        # Object variables
+        if self.receivers.index.dtype != 'object':
+            rec_id = np.int64(rec_id)
+        rec_type = self.receivers.at[rec_id, 'rec_type']
+
+        # Detection class
         if freq_code in self.study_tags:
             plausible = 1
         else:
             plausible = 0
-
+
+        # Get rate
         if freq_code in self.tags.index:
-            pulse_rate = self.tags.at[freq_code,'pulse_rate']
+            pulse_rate = self.tags.at[freq_code, 'pulse_rate']
         else:
-            pulse_rate =
-
-            # mort_rate = 9999.0
-        # else:
-        #     mort_rate = self.tags.at[freq_code,'mort_rate']
-
+            pulse_rate = 673.
+
         mort_rate = 8888.
-
-        # if plausible == 1:
-        #     print ('debug check det hist')
-        train_dat['detection'] = np.repeat(plausible,len(train_dat))
+        train_dat['detection'] = np.repeat(plausible, len(train_dat))
         train_dat['lag'] = train_dat.epoch.diff()
         train_dat['lag_diff'] = train_dat.lag.diff()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            det_hist_dict[ch] = det_hist_string
-            hit_ratio_dict[ch] = hit_ratio
-            cons_det_dict[ch] = cons_det
-            max_count_dict[ch] = max_count
-
-        det_hist_string_arrs = list(det_hist_dict.values())
-        det_hist_string_arr = np.hstack(det_hist_string_arrs)
-        hit_ratio_arrs = list(hit_ratio_dict.values())
-        hit_ratio_arr = np.hstack(hit_ratio_arrs)
-        cons_det_arrs = list(cons_det_dict.values())
-        cons_det_arr = np.hstack(cons_det_arrs)
-        max_count_arrs = list(max_count_dict.values())
-        max_count_arr = np.hstack(max_count_arrs)
+        # if freq_code in self.tags.index:
+        #     print ('check')
+
+        # Apply the optimized detection history function to the entire dataset at once
+        detection_history, hit_ratio_arr, cons_det_arr, max_count_arr = predictors.detection_history(
+            train_dat['epoch'].values,
+            pulse_rate,
+            self.det_count,
+            train_dat['channels'].values,
+            train_dat['scan_time'].values,
+        )
+
+        # Convert detection history arrays to concatenated strings outside Numba
+        det_hist_string_arr = np.array([''.join(row.astype(str)) for row in detection_history])
 
+        # Assign back to the DataFrame
         train_dat['det_hist'] = det_hist_string_arr
         train_dat['hit_ratio'] = hit_ratio_arr
         train_dat['cons_det'] = cons_det_arr
         train_dat['cons_length'] = max_count_arr
-
-
-
-        #
-        # if plausible == 1:
-        #     print ('debug why det hist not right?')
-        train_dat.fillna(value=9999999, inplace=True)
-
-        # make sure data types are correct - these next steps are critical
+
+        train_dat.fillna(value=9999999, inplace=True)
+
+        # Ensure data types are correct
         try:
             train_dat = train_dat.astype({'power': 'float32',
                                           'time_stamp': 'datetime64[ns]',
-                                          'epoch': '
+                                          'epoch': 'int64',
                                           'freq_code': 'object',
                                           'noise_ratio': 'float32',
                                           'scan_time': 'int32',
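Both train above and classify further below convert pandas timestamps to integer Unix seconds with astype('int64') // 10**9 before differencing them into the lag predictors. The same conversion on a toy series (values are illustrative):

    import pandas as pd

    ts = pd.Series(pd.to_datetime(['2024-01-01 00:00:00', '2024-01-01 00:00:03']))

    # datetime64[ns] -> nanoseconds since the Unix epoch -> whole seconds
    epoch = (ts.astype('int64') // 10**9).astype('int64')

    print(epoch.tolist())         # [1704067200, 1704067203]
    print(epoch.diff().tolist())  # [nan, 3.0] -> the 'lag' between successive detections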
@@ -311,82 +726,83 @@ class radio_project():
                                           'cons_det': 'int32',
                                           'cons_length': 'float32'})
         except ValueError:
-
-
-        # append to hdf5
-        with pd.HDFStore(self.db, mode='a') as store:
-            store.append(key = 'trained',
-                         value = train_dat,
-                         format = 'table',
-                         index = False,
-                         min_itemsize = {'freq_code':20,
-                                         'rec_type':20,
-                                         'rec_id':20,
-                                         'det_hist':20},
-                         append = True,
-                         chunksize = 1000000)
+            logger.debug(f" Data type conversion issue for {freq_code} at {rec_id}")
 
-
-
-
+        # Append to HDF5
+        with pd.HDFStore(self.db, mode='a') as store:
+            store.append(key='trained',
+                         value=train_dat,
+                         format='table',
+                         index=False,
+                         min_itemsize={'freq_code': 20,
+                                       'rec_type': 20,
+                                       'rec_id': 20,
+                                       'det_hist': 20},
+                         append=True,
+                         chunksize=1000000)
+
+        logger.info(f"✓ Training complete: {freq_code} at {rec_id} - Plausibility: {plausible:.2f}")
 
     def training_summary(self,rec_type,site = None):
-
-
+        logger.info(f"Generating training summary for {rec_type}")
+
         # connect to database and get data
-
         trained_dat = pd.read_hdf(self.db,key = 'trained')#, mode = 'r')
         trained_dat = trained_dat[(trained_dat.rec_type == rec_type)]
-
-
-        # if site != None:
-        #     for rec_id in site:
-        #         trained_dat = trained_dat[(trained_dat.rec_id == rec_id)]
+
+        logger.info(f" Loaded {len(trained_dat)} detections from {len(trained_dat.rec_id.unique())} receivers")
 
         det_class_count = trained_dat.groupby('detection')['detection'].count().to_frame()
 
-
-
-
-
-
-
+        logger.info("")
+        logger.info("Training Summary Statistics Report")
+        logger.info("="*80)
+        logger.info(f"Collected {len(trained_dat)} detections from {len(trained_dat.rec_id.unique())} {rec_type} receivers")
+        logger.info("="*80)
+        logger.info("")
+        logger.info(f"{rec_type} detection class statistics:")
         try:
-
+            prior_true = round(float(det_class_count.at[1,'detection'])/float(det_class_count.sum()),3)
+            logger.info(f" Prior P(true detection) = {prior_true}")
         except KeyError:
-
+            logger.warning(" No known true detections found")
             pass
         try:
-
+            prior_false = round(float(det_class_count.at[0,'detection'])/float(det_class_count.sum()),3)
+            logger.info(f" Prior P(false positive) = {prior_false}")
         except KeyError:
-
+            logger.warning(" No known false positives found")
            pass
 
-
-
-
+        logger.info("")
+        logger.info("="*80)
+        logger.info("")
         trained_dat['detection'] = trained_dat.detection.astype('str')
         sta_class_count = trained_dat.groupby(['rec_id','detection'])['detection'].count().rename('det_class_count').to_frame().reset_index()
         recs = sorted(sta_class_count.rec_id.unique())
-
-
-
-
-
+        logger.info("Detection Class Counts Across Stations")
+        logger.info(" Known Known")
+        logger.info(" False True")
+        logger.info(" ______________________________")
+        logger.info(" | | |")
         for i in recs:
             trues = sta_class_count[(sta_class_count.rec_id == i) & (sta_class_count.detection == '1')]
             falses = sta_class_count[(sta_class_count.rec_id == i) & (sta_class_count.detection == '0')]
             if len(trues) > 0 and len(falses) > 0:
-
+                logger.info("%6s| %8s | %8s |"%(i,falses.det_class_count.values[0],trues.det_class_count.values[0]))
             elif len(trues) == 0 and len(falses) > 0:
-
+                logger.info("%6s| %8s | %8s |"%(i,falses.det_class_count.values[0],0))
             else:
-
+                try:
+                    logger.info("%6s| %8s | %8s |"%(i,0,trues.det_clas_count.values[0]))
+
+                except AttributeError:
+                    logger.info("%6s| %8s | %8s |"%(i,0,0))
 
-
-
-
-
+        logger.info(" |______________|______________|")
+        logger.info("")
+        logger.info("="*80)
+        logger.info("Compiling training figures...")
         # get data by detection class for side by side histograms
         trained_dat['power']= trained_dat.power.astype(float)
         trained_dat['lag_diff'] = trained_dat.lag_diff.astype(float)
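training_summary above reports the classifier priors as the share of known-true (detection == 1) and known-false (detection == 0) rows in the trained table. The same arithmetic on a toy frame (counts are made up):

    import pandas as pd

    trained = pd.DataFrame({'detection': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]})
    counts = trained.groupby('detection')['detection'].count()

    prior_true = round(float(counts.loc[1]) / float(counts.sum()), 3)
    prior_false = round(float(counts.loc[0]) / float(counts.sum()), 3)
    print(prior_true, prior_false)  # 0.3 0.7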
@@ -514,50 +930,177 @@ class radio_project():
                                          'rec_type':20,
                                          'rec_id':20})
 
-    def create_training_data(self, rec_type, reclass_iter
-
-
+    def create_training_data(self, rec_type=None, reclass_iter=None, rec_list=None):
+        """
+        Function to create a training dataset for the current round of classification.
+        The function supports multiple pathways for generating training data, including
+        using a receiver list (rec_list) and incorporating reclassification methods.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        class_dat.rename(columns = {'test':'detection'},
-                         inplace = True)
+        Parameters
+        ----------
+        rec_type : str, optional
+            The type of receiver to filter the data by. This restricts the training data
+            to a specific receiver type (e.g., 'orion', 'srx800'). If not provided and
+            `rec_list` is used, it is ignored.
+        reclass_iter : int, optional
+            Iteration number for reclassification. If provided, the function pulls the
+            previous classification data and incorporates known false positives and
+            assumed true positives.
+        rec_list : list of str, optional
+            A list of receiver IDs to filter the data by. If provided, the function
+            queries the HDF database using this list directly rather than the receiver
+            type (`rec_type`).
+
+        Returns
+        -------
+        pandas.DataFrame
+            A DataFrame containing the training data for the classification process,
+            incorporating any previous classifications if applicable.
+
+        Notes
+        -----
+        - If both `rec_type` and `rec_list` are provided, the function will prioritize
+          the `rec_list` to restrict the training data.
+        - Reclassification logic is based on contributions from T. Castro-Santos.
+        """
+        logger.debug(f" Creating training data (rec_type={rec_type}, iter={reclass_iter}, rec_list={rec_list})")
 
-
-
+        if rec_list is not None:
+            # Construct the query for multiple receiver IDs using the OR operator
+            rec_query = ' | '.join([f'rec_id == "{rec_id}"' for rec_id in rec_list])
+            train_dat = pd.read_hdf(self.db, 'trained', where=rec_query)
+        elif rec_type is not None:
+            # Query based on receiver type directly
+            train_dat = pd.read_hdf(self.db, 'trained', where=f'rec_type == "{rec_type}"')
+        else:
+            raise ValueError("Either 'rec_type' or 'rec_list' must be provided to create training data.")
+
+        # Handling reclassification if this is not the first iteration
+        if reclass_iter is not None:
+            last_class = reclass_iter - 1
+
+            # Load the classified dataset and filter by iteration
+            class_dat = pd.read_hdf(self.db, 'classified', where=f'iter == {last_class}')
+
+            # Further restrict classified data to the receiver list if rec_list is provided
+            if rec_list is not None:
+                class_query = ' | '.join([f'rec_id == "{rec_id}"' for rec_id in rec_list])
+                class_dat = class_dat.query(class_query)
+
+            # Selecting relevant columns for the training dataset
+            columns = ['test', 'freq_code', 'power', 'noise_ratio', 'lag',
+                       'lag_diff', 'cons_length', 'cons_det', 'det_hist',
+                       'hit_ratio', 'rec_type', 'epoch']
+
+            class_dat = class_dat[columns]
+            class_dat.rename(columns={'test': 'detection'}, inplace=True)
+
+            # Separate known falses (train_dat) and assumed trues (class_dat)
+            train_dat = train_dat[train_dat['detection'] == 0]
+            class_dat = class_dat[class_dat['detection'] == 1]
 
-
-
+            # Append the classified data to the training data
+            train_dat = pd.concat([train_dat, class_dat], ignore_index=True)
+            logger.debug(f" Combined training data: {len(train_dat)} detections ({sum(train_dat['detection']==0)} false, {sum(train_dat['detection']==1)} true)")
+        else:
+            logger.debug(f" Training data: {len(train_dat)} detections")
 
         return train_dat
+
+
+
+
+    def reclassify(self, project, rec_id, threshold_ratio, likelihood_model, rec_type=None, rec_list=None):
+        """
+        Reclassifies fish in a project based on user-defined criteria and threshold ratios.
 
+        Parameters
+        ----------
+        project : object
+            The project object that contains methods for managing and classifying fish data.
+
+        rec_id : int or str
+            The unique identifier for the receiver to be reclassified.
+
+        threshold_ratio : float
+            The threshold ratio used for determining classification criteria.
+
+        likelihood_model : list of str
+            The fields to use as the likelihood model for classification.
+
+        rec_type : str, optional
+            The type of receiver being processed (e.g., 'srx1200', 'orion').
+
+        rec_list : list of str, optional
+            A list of receiver IDs to filter the data by, used for creating training data.
+
+        Notes
+        -----
+        - The classification process involves interactive user input to determine if additional
+          iterations are needed.
+        - The fields used for classification are hardcoded as ['hit_ratio', 'cons_length',
+          'noise_ratio', 'power', 'lag_diff'].
+        """
+        logger.info(f"Starting classification for receiver {rec_id}")
+        logger.info(f" Threshold ratio: {threshold_ratio}")
+        logger.info(f" Likelihood model: {', '.join(likelihood_model)}")
+
+        # Validate inputs
+        if rec_id not in self.receivers.index:
+            logger.error(f"Receiver {rec_id} not found")
+            raise ValueError(f"Receiver '{rec_id}' not found in receiver_data")
+
+        valid_predictors = ['hit_ratio', 'cons_length', 'noise_ratio', 'power', 'lag_diff']
+        invalid = set(likelihood_model) - set(valid_predictors)
+        if invalid:
+            logger.error(f"Invalid predictors: {invalid}")
+            raise ValueError(f"Invalid predictors: {', '.join(invalid)}. Valid: {', '.join(valid_predictors)}")
+
+        class_iter = None
+
+        while True:
+            iter_label = f"iteration {class_iter}" if class_iter else "initial classification"
+            logger.info(f"Running {iter_label}...")
+
+            # Get a list of fish to iterate over
+            fishes = project.get_fish(rec_id=rec_id, train=False, reclass_iter=class_iter)
+            logger.info(f" Found {len(fishes)} fish to classify")
+
+            # Generate training data for the classifier
+            logger.info(" Creating training data...")
+            training_data = project.create_training_data(rec_type=rec_type, reclass_iter=class_iter, rec_list=rec_list)
+            logger.info(f" Training data: {len(training_data)} detections")
+
+            # Iterate over fish and classify with progress bar
+            logger.info(" Classifying detections...")
+            for fish in tqdm(fishes, desc=f" Classifying {rec_id}", unit="fish"):
+                project.classify(fish, rec_id, likelihood_model, training_data, class_iter, threshold_ratio)
+
+            # Generate summary statistics
+            logger.info(" Generating classification summary...")
+            project.classification_summary(rec_id, class_iter)
+
+            # Show the figures and block execution until they are closed
+            plt.show(block=True)
+
+            # Ask the user if they need another iteration (use _prompt helper)
+            user_input = str(self._prompt("\nDo you need another classification iteration? (yes/no): ", default="no")).strip().lower()
+
+            if user_input in ['yes', 'y']:
+                # If yes, increase class_iter and reclassify
+                if class_iter is None:
+                    class_iter = 2
+                else:
+                    class_iter += 1
+                logger.info(f"Starting iteration {class_iter}")
+            elif user_input in ['no', 'n']:
+                # If no, break the loop
+                logger.info(f"✓ Classification complete for {rec_id}")
+                break
+            else:
+                logger.warning("Invalid input, please enter 'yes' or 'no'")
+
+
     def classify(self,
                  freq_code,
                  rec_id,
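create_training_data above filters the trained table with a PyTables where-string built by OR-ing one equality clause per receiver. A sketch of the query construction (the receiver IDs and store path are hypothetical; where-filtering only works on tables written with format='table' and rec_id as a data column):

    import pandas as pd

    rec_list = ['R01', 'R02']

    # One equality clause per receiver, OR-ed together as in create_training_data
    rec_query = ' | '.join([f'rec_id == "{rec_id}"' for rec_id in rec_list])
    print(rec_query)  # rec_id == "R01" | rec_id == "R02"

    # Hypothetical read against a project database written in table format:
    # train_dat = pd.read_hdf('my_study.h5', 'trained', where=rec_query)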
@@ -565,6 +1108,7 @@ class radio_project():
                 training_data,
                 reclass_iter = None,
                 threshold_ratio = None):
+        logger.debug(f" Classifying {freq_code} at {rec_id} (iter: {reclass_iter})")
 
        # get rates
        try:
@@ -594,28 +1138,40 @@ class radio_project():
         columns = ['freq_code','epoch','rec_id','time_stamp','power','noise_ratio','scan_time','channels','rec_type']
         class_dat = class_dat[columns]
 
+        class_dat = class_dat.drop_duplicates()
+
         if len(class_dat) > 0:
             # do some data management when importing training dataframe
             class_dat['time_stamp'] = pd.to_datetime(class_dat['time_stamp'])
+            class_dat['epoch'] = (class_dat.time_stamp.astype('int64') // 10**9).astype('int64')
+
             class_dat.sort_values(by = 'time_stamp', inplace = True)
-            class_dat['epoch'] = class_dat.epoch.values.astype(np.
+            class_dat['epoch'] = class_dat.epoch.values.astype(np.int64)
             class_dat = class_dat.drop_duplicates(subset = 'time_stamp')
 
             # calculate predictors
             class_dat['lag'] = class_dat.epoch.diff()
             class_dat['lag_diff'] = class_dat.lag.diff()
             class_dat.fillna(value = 99999999, inplace = True)
-
-
-
-
-
-
-
-
-
-
-
+
+            # Apply the optimized detection history function to the entire dataset at once
+            detection_history, hit_ratio_arr, cons_det_arr, max_count_arr = predictors.detection_history(
+                class_dat['epoch'].values,
+                pulse_rate,
+                self.det_count,
+                class_dat['channels'].values,
+                class_dat['scan_time'].values
+            )
+
+            # Convert detection history arrays to concatenated strings outside Numba
+            det_hist_string_arr = np.array([''.join(row.astype(str)) for row in detection_history])
+
+            # Assign back to the DataFrame
+            class_dat['det_hist'] = det_hist_string_arr
+            class_dat['hit_ratio'] = hit_ratio_arr
+            class_dat['cons_det'] = cons_det_arr
+            class_dat['cons_length'] = max_count_arr
+
             # class_dat['series_hit'] = predictors.series_hit(class_dat.lag.values,
             #                                                 pulse_rate,
             #                                                 mort_rate,
@@ -706,7 +1262,7 @@ class radio_project():
 
             # keep it tidy cuz hdf is fragile
             class_dat = class_dat.astype({'freq_code': 'object',
-                                          'epoch': '
+                                          'epoch': 'int64',
                                           'rec_id': 'object',
                                           'time_stamp': 'datetime64[ns]',
                                           'power': 'float32',
@@ -741,9 +1297,9 @@ class radio_project():
                              chunksize = 1000000)
 
             # export
-            class_dat.to_csv(os.path.join(self.output_dir,'freq_code_%s_rec_%s_class_%s.csv'%(freq_code, rec_id, reclass_iter)))
+            #class_dat.to_csv(os.path.join(self.output_dir,'freq_code_%s_rec_%s_class_%s.csv'%(freq_code, rec_id, reclass_iter)))
 
-
+            logger.debug(f" ✓ {freq_code} at {rec_id}: {sum(classification)} true, {len(classification)-sum(classification)} false")
             # next step looks at results
 
         # else:
@@ -752,6 +1308,9 @@ class radio_project():
     def classification_summary(self,rec_id,reclass_iter = None):
         '''if this is not the initial classification we need the trues from the last
         last classification and falses from the first'''
+
+        iter_label = f"iteration {reclass_iter}" if reclass_iter else "initial classification"
+        logger.info(f"Generating classification summary for {rec_id} ({iter_label})")
 
         if reclass_iter == None:
             classified_dat = pd.read_hdf(self.db,
@@ -761,229 +1320,140 @@ class radio_project():
             classified_dat = pd.read_hdf(self.db,
                                          key = 'classified',
                                          where = f'(iter == {reclass_iter}) & (rec_id == "{rec_id}")')
+
+        logger.info(f" Loaded {len(classified_dat)} classified detections")
 
-
-
-
+        logger.info("")
+        logger.info(f"Classification Summary Report: {rec_id}")
+        logger.info("="*80)
         det_class_count = classified_dat.groupby('test')['test'].count().to_frame()
         if len(det_class_count)>1:
-
-
-
-
-
-
-
+            logger.info("")
+            logger.info(f"{rec_id} detection class statistics:")
+            prob_true = round(float(det_class_count.at[1,'test'])/float(det_class_count.sum()),3)
+            prob_false = round(float(det_class_count.at[0,'test'])/float(det_class_count.sum()),3)
+            logger.info(f" P(classified as true) = {prob_true}")
+            logger.info(f" P(classified as false positive) = {prob_false}")
+            logger.info("")
+            logger.info("="*80)
+            logger.info("")
             sta_class_count = classified_dat.groupby(['rec_id','test'])['test'].count().to_frame()#.reset_index(drop = False)
             recs = list(set(sta_class_count.index.levels[0]))
-
-
-
-
-
+            logger.info("Detection Class Counts Across Stations")
+            logger.info(" Classified Classified")
+            logger.info(" False True")
+            logger.info(" ______________________________")
+            logger.info(" | | |")
             for i in recs:
-
-
-
-
-
+                logger.info("%6s| %8s | %8s |"%(i,sta_class_count.loc[(i,0)].values[0],sta_class_count.loc[(i,1)].values[0]))
+            logger.info(" |______________|______________|")
+            logger.info("")
+            logger.info("="*80)
+            logger.info("Compiling classification figures...")
 
-            #
-
-            # plot the log likelihood ratio
+            # Plot the log likelihood ratio
             classified_dat['log_posterior_ratio'] = np.log10(classified_dat.posterior_T / classified_dat.posterior_F)
-            minLogRatio = classified_dat.log_posterior_ratio.min()//1 * 1
-            maxLogRatio = classified_dat.log_posterior_ratio.max()//1 * 1
-            ratio_range = maxLogRatio - minLogRatio
-            ratio_bins =np.linspace(minLogRatio,maxLogRatio+1,100)
 
-            #
-            hit_ratio_bins =np.linspace(0,1.0,11)
-
-
-
-
-
-
-            # Lag Back Differences - how steady are detection lags?
-            lag_bins =np.arange(-100,110,20)
-
-            # Consecutive Record Length
-            con_length_bins =np.arange(1,12,1)
-
-            # Noise Ratio
-            noise_bins =np.arange(0,1.1,0.1)
-
-            # plot the log of the posterior ratio
-            classified_dat['log_post_ratio'] = np.log(classified_dat.posterior_T/classified_dat.posterior_F)
-            minPostRatio = classified_dat.log_post_ratio.min()
-            maxPostRatio = classified_dat.log_post_ratio.max()
-            post_ratio_bins = np.linspace(minPostRatio,maxPostRatio,10)
-
+            # Binning and other parameters
+            hit_ratio_bins = np.linspace(0, 1.0, 11)
+            con_length_bins = np.arange(1, 12, 1)
+            power_bins = np.arange(50, 110, 10)
+            noise_bins = np.linspace(0, 1.1, 11)
+            lag_bins = np.arange(-100, 110, 20)
+            post_ratio_bins = np.linspace(classified_dat.log_posterior_ratio.min(), classified_dat.log_posterior_ratio.max(), 10)
+
             trues = classified_dat[classified_dat.test == 1]
             falses = classified_dat[classified_dat.test == 0]
-
-            # make lattice plot for pubs
-
-            # hit ratio
-            fig = plt.figure(figsize = (4, 2), dpi = 300, layout = 'tight')
-
-            ax1 = fig.add_subplot(1,2,1)
-            ax1.hist(falses.hit_ratio.values,
-                     hit_ratio_bins,
-                     density = True,
-                     color = 'grey',
-                     edgecolor='black',
-                     linewidth=1.2)
-            ax1.set_xlabel('Hit Ratio')
-            ax1.set_title('False Positive')
-            ax1.set_ylabel('Probability Density')
-
-            ax2 = fig.add_subplot(1,2,2)
-            ax2.hist(trues.hit_ratio.values,
-                     hit_ratio_bins,
-                     density = True,
-                     color = 'grey',
-                     edgecolor='black',
-                     linewidth=1.2)
-            ax2.set_title('Valid')
-            ax2.set_xlabel('Hit Ratio')
-
-            plt.show()
 
-            #
-            fig = plt.
-
-            ax1 = fig.add_subplot(1,2,1)
-            ax1.hist(falses.cons_length.values,
-                     con_length_bins,
-                     density = True,
-                     color = 'grey',
-                     edgecolor='black',
-                     linewidth=1.2)
-            ax1.set_xlabel('Consecutive Hit Length')
-            ax1.set_title('False Positive')
-            ax1.set_ylabel('Probability Density')
-
-            ax2 = fig.add_subplot(1,2,2)
-            ax2.hist(trues.cons_length.values,
-                     con_length_bins,
-                     density = True,
-                     color = 'grey',
-                     edgecolor='black',
-                     linewidth=1.2)
-            ax2.set_title('Valid')
-            ax2.set_xlabel('Consecutive Hit Length')
+            # Create a grid of subplots (3 rows x 4 columns)
+            fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(15, 10), dpi=300)
 
-
+            # Function to set font sizes
+            def set_fontsize(ax, fontsize=6):
+                for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
+                             ax.get_xticklabels() + ax.get_yticklabels()):
+                    item.set_fontsize(fontsize)
 
-            #
-
-
-
-
-
-                     density = True,
-                     color = 'grey',
-                     edgecolor='black',
-                     linewidth=1.2)
-            ax1.set_xlabel('Signal Power')
-            ax1.set_ylabel('Probability Density')
-            ax1.set_title('False Positive')
-
-            ax2 = fig.add_subplot(1,2,2)
-            ax2.hist(trues.power.values,
-                     power_bins,
-                     density = True,
-                     color = 'grey',
-                     edgecolor='black',
-                     linewidth=1.2)
-            ax2.set_xlabel('Signal Power')
-            ax2.set_title('Valid')
+            # Plot hit ratio
+            axes[0, 0].hist(falses.hit_ratio.values, hit_ratio_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
+            axes[0, 0].set_xlabel('Hit Ratio')
+            axes[0, 0].set_ylabel('Probability Density')
+            axes[0, 0].set_title('Hit Ratio - False Positive')
+            set_fontsize(axes[0, 0])
 
-
-
-
-
-
-            ax1 = fig.add_subplot(1,2,1)
-            ax1.hist(falses.noise_ratio.values,
-                     noise_bins,
-                     density = True,
-                     color = 'grey',
-                     edgecolor='black',
-                     linewidth=1.2)
-            ax1.set_xlabel('Noise Ratio')
-            ax1.set_ylabel('Probability Density')
-            ax1.set_title('False Positive')
-
-            ax2 = fig.add_subplot(1,2,2)
-            ax2.hist(trues.noise_ratio.values,
-                     noise_bins,
-                     density = True,
-                     color = 'grey',
-                     edgecolor='black',
-                     linewidth=1.2)
-            ax2.set_xlabel('Noise Ratio')
-            ax2.set_title('Valid')
+            axes[0, 1].hist(trues.hit_ratio.values, hit_ratio_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
+            axes[0, 1].set_xlabel('Hit Ratio')
+            axes[0, 1].set_title('Hit Ratio - Valid')
+            set_fontsize(axes[0, 1])
 
-
-
-
-
-
-
-            ax1.hist(falses.lag_diff.values,
-                     lag_bins,
-                     density = True,
-                     color = 'grey',
-                     edgecolor='black',
-                     linewidth=1.2)
-            ax1.set_xlabel('Lag Differences')
-            ax1.set_ylabel('Probability Density')
-            ax1.set_title('False Positive')
-
-            ax2 = fig.add_subplot(1,2,2)
-            ax2.hist(trues.lag_diff.values,
-                     lag_bins,
-                     density = True,
-                     color = 'grey',
-                     edgecolor='black',
-                     linewidth=1.2)
-            ax2.set_xlabel('Lag Differences')
-            ax2.set_title('Valid')
+            # Plot consecutive record length
+            axes[0, 2].hist(falses.cons_length.values, con_length_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
+            axes[0, 2].set_xlabel('Consecutive Hit Length')
+            axes[0, 2].set_ylabel('Probability Density')
1393
|
+
axes[0, 2].set_title('Consecutive Hit Length - False Positive')
|
|
1394
|
+
set_fontsize(axes[0, 2])
|
|
955
1395
|
|
|
956
|
-
|
|
1396
|
+
axes[0, 3].hist(trues.cons_length.values, con_length_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
|
|
1397
|
+
axes[0, 3].set_xlabel('Consecutive Hit Length')
|
|
1398
|
+
axes[0, 3].set_title('Consecutive Hit Length - Valid')
|
|
1399
|
+
set_fontsize(axes[0, 3])
|
|
1400
|
+
|
|
1401
|
+
# Plot power
|
|
1402
|
+
axes[1, 0].hist(falses.power.values, power_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
|
|
1403
|
+
axes[1, 0].set_xlabel('Signal Power')
|
|
1404
|
+
axes[1, 0].set_ylabel('Probability Density')
|
|
1405
|
+
axes[1, 0].set_title('Signal Power - False Positive')
|
|
1406
|
+
set_fontsize(axes[1, 0])
|
|
1407
|
+
|
|
1408
|
+
axes[1, 1].hist(trues.power.values, power_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
|
|
1409
|
+
axes[1, 1].set_xlabel('Signal Power')
|
|
1410
|
+
axes[1, 1].set_title('Signal Power - Valid')
|
|
1411
|
+
set_fontsize(axes[1, 1])
|
|
1412
|
+
|
|
1413
|
+
# Plot noise ratio
|
|
1414
|
+
axes[1, 2].hist(falses.noise_ratio.values, noise_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
|
|
1415
|
+
axes[1, 2].set_xlabel('Noise Ratio')
|
|
1416
|
+
axes[1, 2].set_ylabel('Probability Density')
|
|
1417
|
+
axes[1, 2].set_title('Noise Ratio - False Positive')
|
|
1418
|
+
set_fontsize(axes[1, 2])
|
|
1419
|
+
|
|
1420
|
+
axes[1, 3].hist(trues.noise_ratio.values, noise_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
|
|
1421
|
+
axes[1, 3].set_xlabel('Noise Ratio')
|
|
1422
|
+
axes[1, 3].set_title('Noise Ratio - Valid')
|
|
1423
|
+
set_fontsize(axes[1, 3])
|
|
1424
|
+
|
|
1425
|
+
# Plot lag differences
|
|
1426
|
+
axes[2, 0].hist(falses.lag_diff.values, lag_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
|
|
1427
|
+
axes[2, 0].set_xlabel('Lag Differences')
|
|
1428
|
+
axes[2, 0].set_ylabel('Probability Density')
|
|
1429
|
+
axes[2, 0].set_title('Lag Differences - False Positive')
|
|
1430
|
+
set_fontsize(axes[2, 0])
|
|
1431
|
+
|
|
1432
|
+
axes[2, 1].hist(trues.lag_diff.values, lag_bins, density=True, color='grey', edgecolor='black', linewidth=1.2)
|
|
1433
|
+
axes[2, 1].set_xlabel('Lag Differences')
|
|
1434
|
+
axes[2, 1].set_title('Lag Differences - Valid')
|
|
1435
|
+
set_fontsize(axes[2, 1])
|
|
957
1436
|
|
|
958
|
-
# log posterior ratio
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
ax1.set_ylabel('Probability Density')
|
|
970
|
-
ax1.set_title('False Positive')
|
|
971
|
-
|
|
972
|
-
ax2 = fig.add_subplot(1,2,2)
|
|
973
|
-
ax2.hist(trues.log_posterior_ratio.values,
|
|
974
|
-
bins = 20,
|
|
975
|
-
density = True,
|
|
976
|
-
color = 'grey',
|
|
977
|
-
edgecolor='black',
|
|
978
|
-
linewidth=1.2)
|
|
979
|
-
ax2.set_xlabel('Log Posterior Ratio')
|
|
980
|
-
ax2.set_title('Valid')
|
|
1437
|
+
# Plot log posterior ratio
|
|
1438
|
+
axes[2, 2].hist(falses.log_posterior_ratio.values, bins=20, density=True, color='grey', edgecolor='black', linewidth=1.2)
|
|
1439
|
+
axes[2, 2].set_xlabel('Log Posterior Ratio')
|
|
1440
|
+
axes[2, 2].set_ylabel('Probability Density')
|
|
1441
|
+
axes[2, 2].set_title('Log Posterior Ratio - False Positive')
|
|
1442
|
+
set_fontsize(axes[2, 2])
|
|
1443
|
+
|
|
1444
|
+
axes[2, 3].hist(trues.log_posterior_ratio.values, bins=20, density=True, color='grey', edgecolor='black', linewidth=1.2)
|
|
1445
|
+
axes[2, 3].set_xlabel('Log Posterior Ratio')
|
|
1446
|
+
axes[2, 3].set_title('Log Posterior Ratio - Valid')
|
|
1447
|
+
set_fontsize(axes[2, 3])
|
|
981
1448
|
|
|
1449
|
+
# Adjust layout
|
|
1450
|
+
plt.tight_layout()
|
|
1451
|
+
|
|
1452
|
+
# Show the plot
|
|
982
1453
|
plt.show()
|
|
983
|
-
|
|
984
1454
|
else:
|
|
985
|
-
|
|
986
|
-
|
|
1455
|
+
logger.warning("Insufficient data to quantify summary statistics")
|
|
1456
|
+
logger.warning(f"All remaining classified as {det_class_count.index[0]} - no more model improvement expected")
|
|
987
1457
|
|
|
988
1458
|
def undo_classification(self, rec_id, freq_code=None, class_iter=None):
|
|
989
1459
|
# Read the table from the HDF5 file
|
|
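The hunk above replaces the six standalone 1x2 figures from 0.0.6 with a single 3x4 lattice: one pair of panels per predictor (hit ratio, consecutive hit length, signal power, noise ratio, lag differences, log posterior ratio), with the false-positive distribution on the left of each pair and the valid distribution on the right. A minimal standalone sketch of that layout, assuming only a pandas DataFrame with the columns used in the hunk; the helper name plot_diagnostic_grid is illustrative and not part of pymast:

    import numpy as np
    import matplotlib.pyplot as plt

    def plot_diagnostic_grid(classified_dat):
        # (metric column, bins, axis label) - bins mirror those defined in the hunk above
        metrics = [
            ('hit_ratio', np.linspace(0, 1.0, 11), 'Hit Ratio'),
            ('cons_length', np.arange(1, 12, 1), 'Consecutive Hit Length'),
            ('power', np.arange(50, 110, 10), 'Signal Power'),
            ('noise_ratio', np.linspace(0, 1.1, 11), 'Noise Ratio'),
            ('lag_diff', np.arange(-100, 110, 20), 'Lag Differences'),
            ('log_posterior_ratio', 20, 'Log Posterior Ratio'),
        ]
        trues = classified_dat[classified_dat.test == 1]
        falses = classified_dat[classified_dat.test == 0]
        fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(15, 10), dpi=300)
        for i, (col, bins, label) in enumerate(metrics):
            row, half = divmod(i, 2)  # two metrics per row, two panels per metric
            for j, (subset, name) in enumerate([(falses, 'False Positive'), (trues, 'Valid')]):
                ax = axes[row, half * 2 + j]
                ax.hist(subset[col].values, bins, density=True,
                        color='grey', edgecolor='black', linewidth=1.2)
                ax.set_xlabel(label)
                ax.set_title(f'{label} - {name}', fontsize=6)
        plt.tight_layout()
        plt.show()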
@@ -1017,21 +1487,29 @@ class radio_project():
                     data_columns=True,
                     append = False)

-    def undo_bouts(self, rec_id):
+    def undo_bouts(self, rec_id=None):
+        """
+        Remove bouts from the presence table.
+
+        Args:
+            rec_id (str, optional): Specific receiver ID to remove bouts for.
+                If None, removes all bouts.
+        """
        # Read the table from the HDF5 file
        with pd.HDFStore(self.db, 'r+') as store:
            if 'presence' in store:
                df = store['presence']

                # Build the condition based on provided arguments
-
-
-
-
-
+                if rec_id is not None:
+                    condition = (df['rec_id'] == rec_id)
+                    df = df[~condition]
+                else:
+                    # Remove all presence data
+                    df = pd.DataFrame(columns=df.columns)

                df = df.astype({'freq_code': 'object',
-                                'epoch': '
+                                'epoch': 'int64',
                                'rec_id': 'object',
                                'class': 'object',
                                'bout_no':'int32',
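With rec_id now optional, undo_bouts can clear the presence table selectively or wholesale. A usage sketch, assuming project is an existing radio_project instance and 'R01' is an illustrative receiver ID:

    project.undo_bouts(rec_id='R01')   # drop bouts recorded at one receiver
    project.undo_bouts()               # rec_id=None: clear all bouts from the presence table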
@@ -1049,118 +1527,510 @@ class radio_project():
                              'rec_id':20,
                              'class':20},
                              data_columns=True,
-                              append = False)
-
-    def make_recaptures_table(self):
-        '''method creates a recaptures key in the hdf file'''
-
-        # iterate over fish, get last classificaiton, presences, and overlapping detections
-        for fish in self.tags[self.tags.tag_type == 'study'].index:
-            for rec in self.receivers.index:
-                # get this receivers data from the classified key
-                rec_dat = pd.read_hdf(self.db,
-                                      key = 'classified',
-                                      where = f'(freq_code == "{fish}") & (rec_id == "{rec}")')
-                try:
-                    presence_dat = pd.read_hdf(self.db,
-                                               key = 'presence',
-                                               where = f'(freq_code == "{fish}") & (rec_id == "{rec}")')
-                except:
-                    presence_dat = []
+                              append = False)

-
-
-
-
-
+    def repack_database(self, output_path=None):
+        """
+        Repack HDF5 database to reclaim disk space and improve performance.
+
+        Uses PyTables to copy all nodes (Groups, Tables, Arrays) recursively
+        with compression enabled. This fixes the bloat from repeated append operations.
+
+        Args:
+            output_path (str, optional): Path for repacked database.
+                If None, uses '{db_name}_repacked.h5'
+
+        Returns:
+            str: Path to the repacked database
+        """
+        import tables
+        import logging
+        import time
+
+        logger = logging.getLogger(__name__)
+
+        if output_path is None:
+            base_name = os.path.splitext(self.db)[0]
+            output_path = f"{base_name}_repacked.h5"
+
+        logger.info(f"Repacking database: {self.db} → {output_path}")
+        print(f"[repack] Starting database repack...")
+        print(f"  Source: {self.db}")
+        print(f"  Target: {output_path}")
+
+        # Get original size
+        orig_size = os.path.getsize(self.db)
+        print(f"  Original size: {orig_size / (1024**3):.2f} GB")
+
+        start_time = time.time()
+
+        # Open both files with PyTables
+        with tables.open_file(self.db, mode='r') as h5in:
+            with tables.open_file(output_path, mode='w') as h5out:
+
+                # Set compression filters
+                filters = tables.Filters(complevel=5, complib='blosc:zstd')
+
+                # Copy all top-level nodes recursively
+                for node in h5in.root:
+                    node_path = node._v_pathname
+                    print(f"  Copying {node_path}...")

-
-
+                    try:
+                        # Use recursive=True to copy entire subtree (Groups, Tables, Arrays, etc.)
+                        h5in.copy_node(
+                            where=node_path,
+                            newparent=h5out.root,
+                            recursive=True,
+                            filters=filters
+                        )
+                    except (tables.NodeError, tables.HDF5ExtError, OSError, ValueError) as e:
+                        raise RuntimeError(f"Failed to copy HDF5 node {node_path}: {e}") from e
+
+        # Get new size
+        new_size = os.path.getsize(output_path)
+        savings = (1 - new_size / orig_size) * 100
+        elapsed = time.time() - start_time
+
+        print(f"\n[repack] ✓ Repack complete in {elapsed:.1f} seconds")
+        print(f"  New size: {new_size / (1024**3):.2f} GB")
+        print(f"  Savings: {savings:.1f}%")
+
+        logger.info(f"Repack complete: {new_size / (1024**3):.2f} GB ({savings:.1f}% reduction)")
+
+        return output_path
+
+    def make_recaptures_table(self, export=True, pit_study=False):
+        '''Creates a recaptures key in the HDF5 file, iterating over receivers to manage memory.'''
+        logger.info("Creating recaptures table")
+        logger.info(f"  Processing {len(self.receivers)} receiver(s)")
+        # prepare a heartbeat log so long runs can be monitored (one-line per receiver)
+        heartbeat_dir = os.path.join(self.project_dir, 'build')
+        try:
+            os.makedirs(heartbeat_dir, exist_ok=True)
+        except OSError as e:
+            raise RuntimeError(
+                f"Failed to create heartbeat directory '{heartbeat_dir}': {e}"
+            ) from e
+        heartbeat_path = os.path.join(heartbeat_dir, 'recaptures_heartbeat.log')
+        print(f"Starting recaptures: {len(self.receivers)} receivers. Heartbeat -> {heartbeat_path}")
+        try:
+            with open(heartbeat_path, 'a') as _hb:
+                _hb.write(f"START {datetime.datetime.now().isoformat()} receivers={len(self.receivers)}\n")
+        except OSError as e:
+            raise RuntimeError(
+                f"Failed to write heartbeat start to '{heartbeat_path}': {e}"
+            ) from e
+
+        if pit_study==False:
+            # Convert release dates to datetime if not already done
+            self.tags['rel_date'] = pd.to_datetime(self.tags['rel_date'])
+            tags_copy = self.tags.copy()
+
+            for rec in tqdm(self.receivers.index, desc="Processing receivers", unit="receiver"):
+                logger.info(f"  Processing receiver {rec}...")
+                print(f"[recaptures] processing receiver {rec}...", flush=True)
+
+                # Read classified data for this receiver as a Dask DataFrame
+                # Reading the data (assuming self.db and rec are predefined variables)
+                rec_dat = dd.read_hdf(self.db, key='classified')

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Filter for specific rec_id and convert to pandas DataFrame
+                rec_dat = rec_dat[rec_dat['rec_id'] == rec].compute()
+
+                # Convert 'timestamp' column to datetime
+                rec_dat['time_stamp'] = pd.to_datetime(rec_dat['time_stamp'])
+
+                # Calculate seconds since Unix epoch
+                rec_dat['epoch'] = (rec_dat['time_stamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
+                logger.debug(f"  Initial load: {len(rec_dat)} detections")
+
+                # Merge with release dates to filter out data before release
+                rec_dat = rec_dat.merge(tags_copy, left_on='freq_code', right_index=True)
+                rec_dat = rec_dat[rec_dat['time_stamp'] >= rec_dat['rel_date']]
+                logger.debug(f"  After release date filter: {len(rec_dat)} detections")
+
+                # Reset index to avoid ambiguity between index and column labels
+                if 'freq_code' in rec_dat.columns and 'freq_code' in rec_dat.index.names:
+                    rec_dat = rec_dat.reset_index(drop=True)
+
+                # Filter by latest iteration and valid test
+                idxmax_values = rec_dat.iter.max()
+                rec_dat = rec_dat[rec_dat.iter == idxmax_values]
+                rec_dat = rec_dat[rec_dat['test'] == 1]
+                logger.debug(f"  After filtering (iter={idxmax_values}, test=1): {len(rec_dat)} detections")
+
+                # Check if 'presence' exists before trying to read it
+                try:
+                    presence_data = dd.read_hdf(self.db, key='presence')
+                    # Filter immediately instead of checking len() which triggers expensive compute
+                    presence_data = presence_data[presence_data['rec_id'] == rec]
+                    presence_exists = True
+                except (KeyError, FileNotFoundError):
+                    presence_exists = False
+
+                if presence_exists:
+                    try:
+                        presence_data = presence_data.compute()
+                        presence_data = presence_data[presence_data['freq_code'].isin(self.tags[self.tags.tag_type=='study'].index)]
+                        presence_data = presence_data[['freq_code', 'epoch', 'rec_id', 'bout_no']]
+                        logger.debug(f"  Presence data: {len(presence_data)} records")
+
+                    except KeyError:
+                        logger.warning(f"  No presence data found for {rec}, skipping presence merge")
+                else:
+                    logger.warning("  'presence' key not found in HDF5, skipping presence merge")

-
-
-
-
-
-
-
-
-
+                # Read overlap data - filter immediately to avoid expensive len() compute
+                try:
+                    overlap_data = dd.read_hdf(self.db, key='overlapping')
+                    # Filter to this receiver first before checking anything
+                    overlap_data = overlap_data[overlap_data['rec_id'] == rec]
+                    overlap_exists = True
+                except (KeyError, FileNotFoundError):
+                    overlap_exists = False
+
+                if overlap_exists:
+                    try:
+                        overlap_data = overlap_data.compute()
+                        overlap_data = overlap_data[overlap_data['freq_code'].isin(self.tags[self.tags.tag_type=='study'].index)]
+                        # Aggregate both overlapping and ambiguous_overlap columns
+                        if 'ambiguous_overlap' in overlap_data.columns:
+                            overlap_data = overlap_data.groupby(['freq_code', 'epoch', 'rec_id']).agg({
+                                'overlapping': 'max',
+                                'ambiguous_overlap': 'max'
+                            }).reset_index()
                        else:
-
-
-
-
+                            overlap_data = overlap_data.groupby(['freq_code', 'epoch', 'rec_id'])['overlapping'].max().reset_index()
+                        logger.debug(f"  Overlap data: {len(overlap_data)} records")
+
+                    except KeyError:
+                        logger.warning(f"  No overlap data found for {rec}, skipping overlap merge")
+                else:
+                    logger.warning("  'overlapping' key not found in HDF5, skipping overlap merge")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Merge with presence data
+                if presence_exists:
+                    rec_dat = rec_dat.merge(presence_data, on=['freq_code', 'epoch', 'rec_id'], how='left')
+                    rec_dat['bout_no'] = rec_dat['bout_no'].fillna(0).astype(int)
+                else:
+                    rec_dat['bout_no'] = 0
+
+                # Merge with overlap data
+                if overlap_exists:
+                    rec_dat = rec_dat.merge(overlap_data, on=['freq_code', 'epoch', 'rec_id'], how='left')
+                    rec_dat['overlapping'] = rec_dat['overlapping'].fillna(0).astype(int)
+                    # Add ambiguous_overlap if it exists in overlap data
+                    if 'ambiguous_overlap' in overlap_data.columns:
+                        rec_dat['ambiguous_overlap'] = rec_dat['ambiguous_overlap'].fillna(0).astype('float32')
+                    else:
+                        rec_dat['ambiguous_overlap'] = np.float32(0)
+                else:
+                    rec_dat['overlapping'] = 0
+                    rec_dat['ambiguous_overlap'] = np.float32(0)
+
+                # Filter out overlapping detections (keep only overlapping=0)
+                before_filter = len(rec_dat)
+                rec_dat = rec_dat[rec_dat['overlapping'] != 1]
+                after_filter = len(rec_dat)
+                logger.debug(f"  Filtered {before_filter - after_filter} overlapping detections")
+
+                logger.debug(f"  After presence/overlap merge: {len(rec_dat)} detections")
+
+                # Check for required columns
+                required_columns = ['freq_code', 'rec_id', 'epoch', 'time_stamp', 'power', 'noise_ratio',
+                                    'lag', 'det_hist', 'hit_ratio', 'cons_det', 'cons_length',
+                                    'likelihood_T', 'likelihood_F', 'bout_no', 'overlapping', 'ambiguous_overlap']
+
+                missing_columns = [col for col in required_columns if col not in rec_dat.columns]
+                if missing_columns:
+                    logger.error(f"  Required columns missing: {missing_columns}")
+                    continue
+
+                # Sort by freq code and epoch
+                rec_dat = rec_dat.sort_values(by=['freq_code', 'epoch'], ascending=[True, True])
+
+                # Keep only the necessary columns (including handling missing columns)
+                available_columns = [col for col in required_columns if col in rec_dat.columns]
+                rec_dat = rec_dat[available_columns]
+
+                # Ensure correct data types
+                rec_dat = rec_dat.astype({
+                    'freq_code': 'object',
+                    'epoch': 'int64',
+                    'rec_id': 'object',
+                    'time_stamp': 'datetime64[ns]',
+                    'power': 'float32',
+                    'noise_ratio': 'float32',
+                    'lag': 'float32',
+                    'det_hist': 'object',
+                    'hit_ratio': 'float32',
+                    'cons_det': 'int32',
+                    'cons_length': 'float32',
+                    'likelihood_T': 'float32',
+                    'likelihood_F': 'float32',
+                    'bout_no': 'int32',
+                    'overlapping': 'int32',
+                    'ambiguous_overlap': 'float32'
+                })
+
+                # Show record counts
+                logger.debug(f"  Final: {len(rec_dat)} detections for {rec}")
+                print(f"[recaptures] {rec}: compiled {len(rec_dat)} rows (overlapping={rec_dat['overlapping'].sum()}, bouts={rec_dat['bout_no'].max()})", flush=True)
+
+                # Append to the HDF5 file
+                with pd.HDFStore(self.db, mode='a') as store:
+                    store.append(key='recaptures', value=rec_dat, format='table',
+                                 index=False, min_itemsize={'freq_code': 20, 'rec_id': 20, 'det_hist': 20},
+                                 append=True, chunksize=1000000, data_columns=True)
+
+                logger.info(f"  ✓ Recaps for {rec} compiled and written to HDF5")
+                print(f"[recaptures] ✓ {rec} written to database", flush=True)
+                # append heartbeat line
+                try:
+                    with open(heartbeat_path, 'a') as _hb:
+                        _hb.write(f"{datetime.datetime.now().isoformat()} rec={rec} rows={len(rec_dat)}\n")
+                except OSError as e:
+                    raise RuntimeError(
+                        f"Failed to write heartbeat for receiver {rec} to '{heartbeat_path}': {e}"
+                    ) from e
+
+        else:
+            # Loop over each receiver in self.receivers
+            for rec in tqdm(self.receivers.index, desc="Processing PIT receivers", unit="receiver"):
+                logger.info(f"  Processing {rec} (PIT study)...")
+
+                # Read PIT data (already parsed from your text files) from /raw_data in HDF5
+                try:
+                    pit_data = pd.read_hdf(self.db, key='raw_data')
+                except KeyError:
+                    logger.error("  No 'raw_data' key found in HDF5 file")
+                    continue
+
+                # Filter rows so that only the specified receiver is kept
+                pit_data = pit_data[pit_data['rec_id'] == rec]
+                logger.debug(f"  Filtered PIT data: {len(pit_data)} detections")
+
+                # Add any missing columns to align with the acoustic (non-PIT) columns
+                missing_cols = [
+                    'lag', 'det_hist', 'hit_ratio', 'cons_det', 'cons_length',
+                    'likelihood_T', 'likelihood_F', 'bout_no', 'overlapping', 'ambiguous_overlap'
+                ]
+                for col in missing_cols:
+                    if col not in pit_data.columns:
+                        if col == 'ambiguous_overlap':
+                            pit_data[col] = np.float32(0)
+                        else:
+                            pit_data[col] = 0
+
+                # Check if 'presence' exists before trying to read it
+                with pd.HDFStore(self.db, mode='r') as store:
+                    presence_exists = 'presence' in store.keys()
+
+                if presence_exists:
+                    try:
+                        presence_data = dd.read_hdf(self.db, key='presence')
+                        presence_data = presence_data[presence_data['rec_id'] == rec].compute()
+                        presence_data = presence_data[presence_data['freq_code'].isin(self.tags[self.tags.tag_type=='study'].index)]
+                        presence_data = presence_data[['freq_code', 'epoch', 'rec_id', 'bout_no']]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        if not presence_data.empty:
+                            pit_data = pit_data.merge(presence_data, on=['freq_code', 'epoch', 'rec_id'], how='left')
+                            pit_data['bout_no'] = pit_data['bout_no'].fillna(0).astype(int)
+                    except KeyError:
+                        logger.warning(f"  No presence data found for {rec}, skipping presence merge")
+                else:
+                    logger.warning(" 'presence' key not found in HDF5, skipping presence merge")
+
+                # Check if 'overlapping' exists before trying to read it
+                with pd.HDFStore(self.db, mode='r') as store:
+                    overlap_exists = 'overlapping' in store.keys()
+
+                if overlap_exists:
+                    try:
+                        overlap_data = dd.read_hdf(self.db, key='overlapping')
+                        overlap_data = overlap_data[overlap_data['rec_id'] == rec].compute()
+                        overlap_data = overlap_data[overlap_data['freq_code'].isin(self.tags[self.tags.tag_type=='study'].index)]
+                        # Aggregate: take max for both overlapping and ambiguous_overlap
+                        agg_dict = {'overlapping': 'max'}
+                        if 'ambiguous_overlap' in overlap_data.columns:
+                            agg_dict['ambiguous_overlap'] = 'max'
+                        overlap_data = overlap_data.groupby(['freq_code', 'epoch', 'rec_id']).agg(agg_dict).reset_index()
+
+                        if not overlap_data.empty:
+                            pit_data = pit_data.merge(overlap_data, on=['freq_code', 'epoch', 'rec_id'], how='left')
+                            pit_data['overlapping'] = pit_data['overlapping'].fillna(0).astype(int)
+                            if 'ambiguous_overlap' in overlap_data.columns:
+                                pit_data['ambiguous_overlap'] = pit_data['ambiguous_overlap'].fillna(0).astype('float32')
+                    except KeyError:
+                        logger.warning(f"  No overlap data found for {rec}, skipping overlap merge")
+                else:
+                    logger.warning(" 'overlapping' key not found in HDF5, skipping overlap merge")
+
+                # Sort PIT data by freq_code and epoch
+                pit_data = pit_data.sort_values(['freq_code', 'epoch'])
+
+                # Keep only the columns needed in `recaptures`
+                required_columns = [
+                    'freq_code', 'rec_id', 'epoch', 'time_stamp', 'power', 'noise_ratio', 'lag', 'det_hist',
+                    'hit_ratio', 'cons_det', 'cons_length', 'likelihood_T', 'likelihood_F', 'bout_no', 'overlapping', 'ambiguous_overlap'
+                ]
+                pit_data = pit_data[[c for c in required_columns if c in pit_data.columns]]
+
+                # Convert each column to the correct dtype
+                dtypes_map = {
+                    'freq_code': 'object', 'rec_id': 'object', 'epoch': 'int64',
+                    'time_stamp': 'datetime64[ns]', 'power': 'float32', 'noise_ratio': 'float32',
+                    'lag': 'float32', 'det_hist': 'object', 'hit_ratio': 'float32',
+                    'cons_det': 'int32', 'cons_length': 'float32', 'likelihood_T': 'float32',
+                    'likelihood_F': 'float32', 'bout_no': 'int32', 'overlapping': 'int32', 'ambiguous_overlap': 'float32'
+                }
+                for col, dt in dtypes_map.items():
+                    if col in pit_data.columns:
+                        pit_data[col] = pit_data[col].astype(dt)
+
+                # Show record counts BEFORE prompting
+                print(f"[recaptures] {rec}: compiled {len(pit_data)} PIT rows (overlapping={pit_data['overlapping'].sum()}, bouts={pit_data['bout_no'].max()})", flush=True)

+                # Confirm with user before appending PIT data into 'recaptures'
+                confirm = str(self._prompt("Import PIT data? (yes/no): ", default="no")).strip().lower()
+                if confirm != 'yes':
+                    logger.info("PIT data import canceled by user")
+                    return
+
+                # Convert 'det_hist' to string to avoid serialization issues
+                if 'det_hist' in pit_data.columns:
+                    pit_data['det_hist'] = pit_data['det_hist'].astype(str)
+
+                # Append PIT data to 'recaptures' in HDF5
+                with pd.HDFStore(self.db, mode='a') as store:
+                    store.append(
+                        key='recaptures',
+                        value=pit_data,
+                        format='table',
+                        index=False,
+                        min_itemsize={'freq_code': 20, 'rec_id': 20, 'det_hist': 20},
+                        append=True,
+                        chunksize=1000000,
+                        data_columns=True
+                    )
+
+                logger.info(f"  ✓ PIT recaps for {rec} compiled and written to HDF5")
+                print(f"[recaptures] ✓ {rec} PIT data written to database", flush=True)
+                try:
+                    with open(heartbeat_path, 'a') as _hb:
+                        _hb.write(f"{datetime.datetime.now().isoformat()} pit_rec={rec} rows={len(pit_data)}\n")
+                except OSError as e:
+                    raise RuntimeError(
+                        f"Failed to write PIT heartbeat for receiver {rec} to '{heartbeat_path}': {e}"
+                    ) from e
+
+
+        if export:
+            logger.info("Exporting recaptures to CSV...")
+            print("[recaptures] exporting recaptures to CSV...", flush=True)
+            rec_data = dd.read_hdf(self.db, 'recaptures').compute()
+            rec_data.to_csv(os.path.join(self.output_dir,'recaptures.csv'), index=False)
+            logger.info(f"  ✓ Export complete: {os.path.join(self.output_dir,'recaptures.csv')}")
+            print(f"[recaptures] ✓ Export complete: {os.path.join(self.output_dir,'recaptures.csv')}", flush=True)
+            try:
+                with open(heartbeat_path, 'a') as _hb:
+                    _hb.write(
+                        f"DONE {datetime.datetime.now().isoformat()} export="
+                        f"{os.path.join(self.output_dir, 'recaptures.csv')}\n"
+                    )
+            except OSError as e:
+                raise RuntimeError(
+                    f"Failed to write heartbeat completion to '{heartbeat_path}': {e}"
+                ) from e

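Because the new make_recaptures_table writes the recaptures key with format='table' and data_columns=True, the table can be queried in place once it is built. A usage sketch, assuming project is an existing radio_project instance; the tag code shown is illustrative:

    import pandas as pd

    project.make_recaptures_table(export=True)   # pit_study=True routes through the PIT branch instead
    recaps = pd.read_hdf(project.db, key='recaptures',
                         where='freq_code == "164.480 25"')
    print(recaps[['rec_id', 'time_stamp', 'hit_ratio']].head())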
+    def undo_recaptures(self):
+        """
+        Remove recaptures data from HDF5 file.
+        Note: File size won't shrink until you manually repack the database.
+        """
+        logger.info("Removing recaptures from database")
+        with pd.HDFStore(self.db, mode='a') as store:
+            if 'recaptures' in store:
+                store.remove('recaptures')
+                logger.info("  ✓ Recaptures key removed")
+            else:
+                logger.info("  No recaptures key found")
+
+        logger.info("  Data logically deleted (file size unchanged)")
+        logger.info("  To reclaim disk space, manually repack after all deletions complete")
+
+    def undo_overlap(self):
+        """
+        Remove overlapping data from HDF5 file.
+        Note: File size won't shrink until you manually repack the database.
+        """
+        logger.info("Removing overlapping from database")
+        with pd.HDFStore(self.db, mode='a') as store:
+            if 'overlapping' in store:
+                store.remove('overlapping')
+                logger.info("  ✓ Overlapping key removed")
+            else:
+                logger.info("  No overlapping key found")
+
+        logger.info("  Data logically deleted (file size unchanged)")
+        logger.info("  To reclaim disk space, manually repack after all deletions complete")

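The undo methods only delete keys logically, as their docstrings note, so disk space is reclaimed by the repack_database method added earlier in this hunk. A sketch of that delete-then-repack pattern, assuming project is an existing radio_project instance; adopting the repacked file as the working database is the caller's choice, not something the library does automatically:

    import os

    project.undo_overlap()               # drops the 'overlapping' key; file size unchanged
    project.undo_recaptures()            # drops the 'recaptures' key; file size unchanged
    packed = project.repack_database()   # rewrites remaining nodes with compression, returns the new path
    os.replace(packed, project.db)       # optional: promote the compact copy to the working file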
+    def new_db_version(self, output_h5):
+        """
+        Create a new version of the working HDF5 database.
+
+        This function creates a copy of the existing working database, allowing you to
+        backtrack or branch your analysis. If there are keys that are in error or conflict
+        with the current understanding of the system, this function helps you remove them
+        from the new version of the database.
+
+        Parameters:
+            output_h5 (str): The file name for the new HDF5 file.
+        """
+        logger.info(f"Creating new database version: {output_h5}")
+
+        # Copy the HDF5 file
+        shutil.copyfile(self.db, output_h5)
+        logger.info("  Database copied")
+
+        # Open the copied HDF5 file
+        with h5py.File(output_h5, 'r+') as hdf:
+            # List all keys in the file
+            keys = list(hdf.keys())
+            logger.info(f"  Keys in HDF5 file: {', '.join(keys)}")
+
+            # Ask the user to input the keys they want to modify
+            selected_keys = str(self._prompt("Enter the keys you want to modify, separated by commas: ", default="")).split(',')
+
+            # Clean up the input (remove whitespace)
+            selected_keys = [key.strip() for key in selected_keys]
+
+            for key in selected_keys:
+                if key in hdf:
+                    logger.info(f"  Processing key: '{key}'...")
+
+                    # If it's a group, recursively delete all datasets (subkeys)
+                    if isinstance(hdf[key], h5py.Group):
+                        logger.info(f"  Key '{key}' is a group, deleting all subkeys...")
+                        for subkey in list(hdf[key].keys()):
+                            logger.debug(f"  Removing subkey: '{key}/{subkey}'")
+                            del hdf[key][subkey]
+                        logger.info(f"  All subkeys under '{key}' deleted")
+                    else:
+                        # It's a dataset, clear the data in the DataFrame
+                        logger.info(f"  Clearing data for dataset key: '{key}'")
+                        df = pd.read_hdf(output_h5, key)
+                        df.drop(df.index, inplace=True)
+                        df.to_hdf(output_h5, key, mode='a', format='table', data_columns=True)
+                        logger.info(f"  Data cleared for key: '{key}'")
+                else:
+                    logger.warning(f"  Key '{key}' not found in HDF5 file")
+
+        # Update the project's database to the new copied database
+        self.db = output_h5
+        logger.info(f"✓ New database version created: {output_h5}")

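A usage sketch for the new_db_version method above, assuming project is an existing radio_project instance; the file name is illustrative:

    project.new_db_version('project_db_v2.h5')
    # Copies the working database, prompts for the keys to clear (e.g. "recaptures, overlapping"),
    # empties or deletes those keys in the copy, and points project.db at the new file.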