pymast 0.0.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pymast/__init__.py +31 -2
- pymast/fish_history.py +59 -6
- pymast/formatter.py +886 -548
- pymast/logger.py +58 -0
- pymast/naive_bayes.py +116 -9
- pymast/overlap_removal.py +2327 -490
- pymast/parsers.py +1111 -239
- pymast/predictors.py +302 -116
- pymast/radio_project.py +1382 -512
- pymast/validation.py +224 -0
- pymast-1.0.0.dist-info/METADATA +636 -0
- pymast-1.0.0.dist-info/RECORD +15 -0
- {pymast-0.0.5.dist-info → pymast-1.0.0.dist-info}/WHEEL +1 -1
- pymast/table_merge.py +0 -154
- pymast-0.0.5.dist-info/METADATA +0 -19
- pymast-0.0.5.dist-info/RECORD +0 -14
- {pymast-0.0.5.dist-info → pymast-1.0.0.dist-info/licenses}/LICENSE.txt +0 -0
- {pymast-0.0.5.dist-info → pymast-1.0.0.dist-info}/top_level.txt +0 -0
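For orientation, the module docstring added in this release documents the intended 1.0.0 workflow. A minimal sketch, assuming that docstring reflects the shipped API (the project path and receiver IDs below are placeholders):

>>> import pymast
>>> project = pymast.radio_project('path/to/project')          # placeholder path
>>> receiver_to_state = {'R01': 1, 'R02': 2, 'R03': 9}         # placeholder receiver IDs
>>> tte = pymast.formatter.time_to_event(receiver_to_state, project)
>>> tte.data_prep(project, adjacency_filter=[(9, 1), (9, 2)])  # drop illegal movements
>>> stats = tte.summary()
>>> tte.master_state_table.to_csv('output.csv')                # export for R/msm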
pymast/formatter.py
CHANGED
@@ -1,10 +1,63 @@
 # -*- coding: utf-8 -*-
 """
-
-
-
-
-
+Statistical Model Data Formatting Module
+========================================
+
+This module contains classes for formatting cleaned telemetry data
+into input files for various statistical survival and movement models.
+
+Classes
+-------
+cjs_data_prep : Cormack-Jolly-Seber (CJS) mark-recapture formatting
+    Creates encounter history matrices for Program MARK survival analysis.
+
+lrdr_data_prep : Live Recapture Dead Recovery (LRDR) formatting
+    Combines live detections with mobile tracking mortality surveys.
+    Used for post-passage survival estimation with recovery data.
+
+time_to_event : Multi-state time-to-event (competing risks) formatting
+    Creates counting-process style data for multi-state survival models.
+    Most commonly used class for fish passage and movement studies.
+
+Typical Usage
+-------------
+>>> import pymast
+>>> project = pymast.radio_project('path/to/project')
+>>>
+>>> # Time-to-event model (most common)
+>>> receiver_to_state = {'R01': 1, 'R02': 2, 'R03': 9}  # Map receivers to states
+>>> tte = pymast.formatter.time_to_event(receiver_to_state, project)
+>>> tte.data_prep(project, adjacency_filter=[(9,1), (9,2)])  # Remove illegal movements
+>>> stats = tte.summary()  # Print movement statistics
+>>> tte.master_state_table.to_csv('output.csv')  # Export for R/msm
+>>>
+>>> # CJS model
+>>> receiver_to_recap = {'R01': 'R00', 'R02': 'R01', 'R03': 'R02'}
+>>> cjs = pymast.formatter.cjs_data_prep(receiver_to_recap, project)
+>>> cjs.input_file('model_name', 'output_dir')  # Creates .inp for MARK
+
+Notes
+-----
+- All classes expect data from a pymast.radio_project with complete pipeline:
+    1. Data imported (parsers)
+    2. Classified (naive_bayes)
+    3. Bouts detected (overlap_removal.bout)
+    4. Overlaps removed (overlap_removal.overlap_reduction)
+    5. Recaptures built (radio_project.make_recaptures_table)
+
+- The time_to_event class automatically filters:
+    - Overlapping detections (overlapping == 1)
+    - Ambiguous overlaps (ambiguous_overlap == 1)
+    - Small bouts (< 3 detections)
+
+- Adjacency filter removes biologically impossible state transitions
+  based on study site geography (e.g., downstream → upstream without ladder).
+
+See Also
+--------
+pymast.radio_project : Project management and database
+pymast.overlap_removal : Bout detection and overlap resolution
+pymast.naive_bayes : Classification of true detections
 """
 
 # import modules required for function dependencies
@@ -48,18 +101,18 @@ class cjs_data_prep():
                                         #where = qry)
 
         #self.recap_data.set_index('freq_code', inplace = True)
-        project.tags.reset_index('freq_code', inplace = True)
+        #project.tags.reset_index('freq_code', inplace = True)
         self.recap_data = pd.merge(self.recap_data,
                                    project.tags,
                                    left_on = 'freq_code',
                                    right_on = 'freq_code',
                                    how = 'left')
-        self.recap_data.reset_index(
+        self.recap_data.reset_index(inplace = True)
         #project.tags.reset_index(drop = False, inplace = True)
 
         # filter out tag data we don't want mucking up our staistical model
         if species != None:
-            self.recap_data = self.recap_data[self.recap_data.
+            self.recap_data = self.recap_data[self.recap_data.species == species]
         if rel_loc != None:
             self.recap_data = self.recap_data[self.recap_data.RelLoc == rel_loc]
         if cap_loc != None:
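This hunk completes a previously truncated species filter and keeps the left merge onto the tags table. A self-contained pandas sketch of the same join-then-filter pattern, with made-up freq_code and species values:

>>> import pandas as pd
>>> recaps = pd.DataFrame({'freq_code': ['164.480 25', '164.480 26'],
...                        'rec_id': ['R01', 'R02']})
>>> tags = pd.DataFrame({'freq_code': ['164.480 25', '164.480 26'],
...                      'species': ['Shad', 'Eel']})
>>> recaps = pd.merge(recaps, tags, left_on='freq_code',
...                   right_on='freq_code', how='left')
>>> recaps = recaps[recaps.species == 'Shad']  # the filter the diff completes
>>> len(recaps)
1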
@@ -91,7 +144,13 @@ class cjs_data_prep():
            rel_dat.rename(columns = {'rel_date':'time_stamp'}, inplace = True)
            rel_dat['recap_occasion'] = np.repeat('R00',len(rel_dat))
            rel_dat['overlapping'] = np.zeros(len(rel_dat))
-
+           # Check if the index is the default integer index
+           has_default_index = isinstance(rel_dat.index, pd.RangeIndex)
+
+           # If the DataFrame has a default index, reset it
+           if not has_default_index:
+               rel_dat.reset_index(inplace = True)
+           self.recap_data = pd.concat([self.recap_data,rel_dat])
 
        else:
            print ("Starting Initial Recap Release Procedure")
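The new guard resets an index only when it is not the default RangeIndex, so release rows keep whatever key they carry. The check itself is plain pandas:

>>> import pandas as pd
>>> rel_dat = pd.DataFrame({'freq_code': ['164.480 25'], 'rel_date': ['2023-04-01']})
>>> isinstance(rel_dat.index, pd.RangeIndex)          # default integer index
True
>>> isinstance(rel_dat.set_index('freq_code').index, pd.RangeIndex)
False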
@@ -346,36 +405,193 @@ class lrdr_data_prep():
 
 
 
+
        print (self.inp.head())
        # Check your work
        self.inp.to_csv(os.path.join(outputWS,'%s_lrdr.csv'%(modelName)))
 
-class time_to_event()
-
-
-
-
-
-
-
-
-
-
-
+class time_to_event():
+    """
+    Multi-state Time-to-Event Data Formatter
+
+    Formats radio telemetry recapture data into counting-process style records
+    for multi-state survival models (competing risks, Cox proportional hazards).
+
+    This is the primary class for fish passage and movement studies using
+    state-based survival models in R (msm, mstate packages).
+
+    Parameters
+    ----------
+    receiver_to_state : dict
+        Mapping of receiver IDs to integer state codes.
+        Example: {'R_forebay': 1, 'R_powerhouse': 2, 'R_tailrace': 9}
+
+    project : pymast.radio_project
+        Project object with completed data pipeline (imported, classified,
+        bouts detected, overlaps removed, recaptures built).
+
+    input_type : str, optional
+        Legacy parameter, always uses 'query' from HDF5. Default 'query'.
+
+    initial_state_release : bool, optional
+        If True, models movement from release point (state 0).
+        If False, models from first recapture at state 1. Default False.
+
+    last_presence_time0 : bool, optional
+        If True, uses last presence at initial state as time_0 (for
+        downstream migration studies where fish linger upstream). Default False.
+
+    cap_loc : str, optional
+        Filter by capture location. Default None (all fish).
+
+    rel_loc : str, optional
+        Filter by release location. Default None (all fish).
+
+    species : str, optional
+        Filter by species code. Default None (all species).
+
+    rel_date : str, optional
+        Filter fish released after this date (YYYY-MM-DD). Default None.
+
+    recap_date : str, optional
+        Filter recaptures after this date (YYYY-MM-DD). Default None.
+
+    Attributes
+    ----------
+    recap_data : pd.DataFrame
+        Loaded recapture data after all filtering (overlapping, ambiguous,
+        bout size, species/location filters).
+
+    master_state_table : pd.DataFrame
+        Final formatted output with columns:
+        - freq_code : Fish ID
+        - species : Species code
+        - start_state : State at time_0 (integer)
+        - end_state : State at time_1 (integer)
+        - presence : Presence number for this fish
+        - time_stamp : Datetime of transition
+        - time_delta : Duration in state (seconds)
+        - first_obs : Binary flag for first observation
+        - time_0 : Entry time (epoch seconds)
+        - time_1 : Exit time (epoch seconds)
+        - transition : Tuple (start_state, end_state)
+
+    fish : np.ndarray
+        Array of unique fish IDs in dataset.
+
+    start_times : pd.DataFrame
+        First recapture time per fish (for time_0 initialization).
+
+    Methods
+    -------
+    data_prep(project, unknown_state=None, bucket_length_min=15, adjacency_filter=None)
+        Process recapture data into state transitions. Applies adjacency filter
+        if provided to remove biologically impossible movements.
+
+    summary()
+        Calculate and print movement statistics (transition counts, durations).
+        Returns dict with statistics and saves CSV files to output directory.
+
+    Examples
+    --------
+    >>> import pymast
+    >>> project = pymast.radio_project('path/to/project')
+    >>>
+    >>> # Define state mapping
+    >>> receiver_to_state = {
+    ...     'R_release': 0,     # Release location
+    ...     'R_forebay': 1,     # Upstream of dam
+    ...     'R_powerhouse': 2,  # At turbines
+    ...     'R_tailrace': 9     # Downstream of dam
+    ... }
+    >>>
+    >>> # Initialize formatter
+    >>> tte = pymast.formatter.time_to_event(
+    ...     receiver_to_state,
+    ...     project,
+    ...     species='Chinook',
+    ...     initial_state_release=True
+    ... )
+    >>>
+    >>> # Define illegal movements (downstream can't go upstream without ladder)
+    >>> adjacency_filter = [
+    ...     (9, 1),  # Tailrace to forebay
+    ...     (9, 2),  # Tailrace to powerhouse
+    ...     (2, 1),  # Powerhouse to forebay
+    ... ]
+    >>>
+    >>> # Process data
+    >>> tte.data_prep(project, adjacency_filter=adjacency_filter)
+    >>>
+    >>> # Get statistics
+    >>> stats = tte.summary()
+    >>>
+    >>> # Export for R analysis
+    >>> tte.master_state_table.to_csv('tte_data.csv', index=False)
+    >>>
+    >>> # In R:
+    >>> # library(msm)
+    >>> # data <- read.csv('tte_data.csv')
+    >>> # model <- msm(end_state ~ time_1, subject=freq_code,
+    >>> #              data=data, qmatrix=Q, ...)
+
+    Notes
+    -----
+    **Automatic Filtering**:
+
+    The class automatically removes:
+    - Detections with overlapping == 1 (removed during overlap resolution)
+    - Detections with ambiguous_overlap == 1 (couldn't resolve receiver conflict)
+    - Bouts with < 3 detections (likely false positives)
+
+    **Adjacency Filter**:
+
+    Removes biologically impossible state transitions based on site geography.
+    Iteratively processes each fish until all illegal movements are removed.
+
+    Works by:
+    1. Finding rows with illegal transitions
+    2. Carrying forward the start_state and time_0 to next row
+    3. Deleting the illegal row
+    4. Recalculating transitions
+    5. Repeating until clean
+
+    **State Coding**:
+
+    Common conventions:
+    - 0: Release location (if initial_state_release=True)
+    - 1: Initial study area state
+    - 2-8: Intermediate states
+    - 9: Terminal/downstream state
+
+    **Time Format**:
+
+    All times are Unix epoch (seconds since 1970-01-01 00:00:00 UTC).
+    Convert in R: `as.POSIXct(time_0, origin='1970-01-01')`
+
+    See Also
+    --------
+    pymast.radio_project.make_recaptures_table : Builds input recaptures table
+    pymast.overlap_removal.overlap_reduction : Marks overlapping detections
+    cjs_data_prep : For standard CJS survival models in Program MARK
+    """
     def __init__(self,
                  receiver_to_state,
                  project,
                  input_type = 'query',
                  initial_state_release = False,
                  last_presence_time0 = False,
+                 hit_ratio_filter = False,
                  cap_loc = None,
-                 rel_loc = None,
-                 species = None
+                 rel_loc = None,
+                 species = None,
+                 rel_date = None,
+                 recap_date = None):
        # Import Data From Project HDF
        self.rel_loc = rel_loc
        self.cap_loc = cap_loc
-
+
+        self.initial_state_release = initial_state_release
        # get recaptures, but first build a query
        query_parts = []
        for key in receiver_to_state:
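The constructor grows hit_ratio_filter, rel_date, and recap_date parameters in 1.0.0. Continuing the earlier sketch, a hedged example of a call with the expanded signature (species and dates are illustrative):

>>> tte = pymast.formatter.time_to_event(
...     receiver_to_state,
...     project,
...     initial_state_release=True,
...     hit_ratio_filter=True,      # new: drop detections with hit_ratio <= 0.1
...     species='Chinook',          # illustrative species code
...     rel_date='2023-04-01',      # keep fish released on/after this date
...     recap_date='2023-04-15')    # keep recaptures on/after this date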
@@ -384,8 +600,18 @@ class time_to_event():#inputFile,outputFile,time_dependent_covariates = False, c
 
        self.recap_data = pd.read_hdf(project.db,
                                      'recaptures',
-                                      where = qry)
+                                      where = qry)#"rec_id == 'R11'")
+
+        # Debug: check if ambiguous_overlap column exists
+        print(f"[TTE] Columns in recaptures table: {self.recap_data.columns.tolist()}")
+        print(f"[TTE] Has ambiguous_overlap: {'ambiguous_overlap' in self.recap_data.columns}")
 
+        if hit_ratio_filter == True:
+            self.recap_data = self.recap_data[self.recap_data.hit_ratio > 0.1]
+
+        if 'power' not in self.recap_data.columns:
+            self.recap_data['power'] = np.nan
+
        self.recap_data.drop(columns = ['power',
                                        'noise_ratio',
                                        'det_hist',
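Recaptures are pulled with an HDFStore where clause assembled from the receiver map (the query_parts list seen earlier in the constructor). A minimal sketch of building such a clause for pandas.read_hdf; the store name is hypothetical, and the store must be in table format for where to apply:

>>> receiver_to_state = {'R01': 1, 'R02': 2}
>>> qry = ' | '.join("(rec_id == '%s')" % rec for rec in receiver_to_state)
>>> qry
"(rec_id == 'R01') | (rec_id == 'R02')"
>>> # import pandas as pd
>>> # recaps = pd.read_hdf('project.h5', 'recaptures', where=qry)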
@@ -398,72 +624,84 @@ class time_to_event():#inputFile,outputFile,time_dependent_covariates = False, c
                                        inplace = True)
 
        self.recap_data.set_index('freq_code', inplace = True)
+
        self.recap_data = pd.merge(self.recap_data,
                                   project.tags,
                                   how = 'left',
                                   left_index = True,
                                   right_index = True)
 
-
+        # remove any row where df['your_column'] is NaN
+        self.recap_data.dropna(subset=['rel_date'], inplace=True)
+        df = self.recap_data
+
+        # Filter out overlapping detections (keep only overlapping=0)
+        self.recap_data = self.recap_data[self.recap_data.overlapping == 0]
+
+        # Filter out ambiguous overlaps if column exists (keep only ambiguous_overlap=0)
+        # For time-to-event models, fish can only be in one place at once
+        if 'ambiguous_overlap' in self.recap_data.columns:
+            before_ambig_filter = len(self.recap_data)
+            self.recap_data = self.recap_data[self.recap_data.ambiguous_overlap == 0]
+            after_ambig_filter = len(self.recap_data)
+            print(f"[TTE] Filtered {before_ambig_filter - after_ambig_filter} ambiguous overlap detections")
+
+        # Filter out detections from small bouts (likely false positives)
+        # Calculate bout size for each bout and filter
+        if 'bout_no' in self.recap_data.columns:
+            # Count detections per bout (per fish, per receiver, per bout_no)
+            bout_sizes = self.recap_data.groupby(['freq_code', 'rec_id', 'bout_no']).size().reset_index(name='bout_size')
+            self.recap_data = self.recap_data.merge(bout_sizes, on=['freq_code', 'rec_id', 'bout_no'], how='left')
+
+            # # Filter out bouts with < 3 detections (single spurious detections)
+            # min_bout_size = 3
+            # before_bout_filter = len(self.recap_data)
+            # self.recap_data = self.recap_data[self.recap_data['bout_size'] >= min_bout_size]
+            # after_bout_filter = len(self.recap_data)
+            # print(f"[TTE] Filtered {before_bout_filter - after_bout_filter} detections from bouts with < {min_bout_size} detections")
+
+            # Drop the bout_size column (no longer needed)
+            self.recap_data = self.recap_data.drop(columns=['bout_size'])
+
+        self.recap_data['rel_date'] = pd.to_datetime(self.recap_data.rel_date,format = 'mixed')
+        if 'pulse_rate' not in self.recap_data.columns:
+            self.recap_data['pulse_rate'] = np.nan
+
        self.recap_data.drop(columns = ['pulse_rate',
                                        'tag_type',
-                                        'rel_date',
                                        'length'],
                                       axis = 'columns',
                                       inplace = True)
-
-
-        if species
-            self.recap_data =
+        # store requested species filter on the instance; apply later in data_prep
+        self.species = species
+        if "species" not in self.recap_data.columns:
+            self.recap_data["species"] = np.nan  # or any default value you want
        if rel_loc != None:
-            self.recap_data = self.recap_data[self.recap_data.
+            self.recap_data = self.recap_data[self.recap_data.rel_loc == rel_loc]
        if cap_loc != None:
-            self.recap_data = self.recap_data[self.recap_data.
+            self.recap_data = self.recap_data[self.recap_data.cap_loc == cap_loc]
+        if rel_date != None:
+            self.recap_data = self.recap_data[self.recap_data.rel_date >= pd.to_datetime(rel_date)]
+        if recap_date != None:
+            self.recap_data = self.recap_data[self.recap_data.time_stamp >= pd.to_datetime(recap_date)]
 
-        self.recap_data['state'] = self.recap_data.rec_id.map(receiver_to_state)
 
+        self.recap_data['state'] = self.recap_data.rec_id.map(receiver_to_state)
+        self.recap_data.reset_index(inplace = True)
+
        self.recap_data = self.recap_data.astype({'freq_code':'object',
                                                  'rec_id':'object',
                                                  'epoch':'float32',
-                                                  'time_stamp':'datetime64',
+                                                  'time_stamp':'datetime64[ns]',
                                                  'lag':'float32',
                                                  'cap_loc':'object',
                                                  'rel_loc':'object',
                                                  'state':'int32'})
 
        print ("Unique states in Returned data:%s"%(self.recap_data.state.unique()))
-
-
-
-               than the initial location in our competing risks model. This allows
-               us to quantify fall back. If a fish never makes it to the intial
-               spoke, then its fall back.
-
-               If we care about modeling from the release point, we need to query
-               release times of each fish, morph data into a recaptures file and
-               merge it to self.data'''
-
-            # get data
-            release_dat = project.tags
-
-            # do some data management
-            release_dat['rel_date'] = pd.to_datetime(release_dat.rel_date)
-            release_dat['epoch'] = np.round((release_dat.rel_date - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s'),6)
-            release_dat.rename(columns = {'rel_date':'time_stamp'}, inplace = True)
-            release_dat['rec_id'] = np.repeat('rel', len(release_dat))
-            release_dat['state'] = np.zeros(len(release_dat))
-            release_dat['presence_number'] = np.zeros(len(release_dat))
-
-
-            # filter out tag data we don't want mucking up our staistical model
-            if species != None:
-                release_dat = release_dat[release_dat.Species == species]
-            if rel_loc != None:
-                release_dat = release_dat[release_dat.RelLoc == rel_loc]
-            if cap_loc != None:
-                release_dat = release_dat[release_dat.CapLoc == cap_loc]
-
-            self.recap_data = self.recap_data.append(release_dat)
+
+        # identify unique fish to loop through
+        self.fish = self.recap_data.freq_code.unique()
 
        if last_presence_time0 == True:
            ''' sometimes when we are modeling downstream movement, it would be
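The bout-size step above is a groupby size merged back onto the detections, so anything can then be filtered on bout_size. A toy reproduction of the pattern:

>>> import pandas as pd
>>> det = pd.DataFrame({'freq_code': ['A', 'A', 'A', 'B'],
...                     'rec_id':    ['R01', 'R01', 'R01', 'R02'],
...                     'bout_no':   [0, 0, 0, 0]})
>>> sizes = det.groupby(['freq_code', 'rec_id', 'bout_no']).size().reset_index(name='bout_size')
>>> det = det.merge(sizes, on=['freq_code', 'rec_id', 'bout_no'], how='left')
>>> det['bout_size'].tolist()
[3, 3, 3, 1]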
@@ -519,494 +757,594 @@ class time_to_event():#inputFile,outputFile,time_dependent_covariates = False, c
|
|
|
519
757
|
if i not in fish_at_start:
|
|
520
758
|
self.recap_data.drop(self.recap_data[self.recap_data.freq_code == i].index,
|
|
521
759
|
inplace = True)
|
|
522
|
-
|
|
523
|
-
# Identify first recapture times
|
|
524
|
-
self.start_times = self.recap_data[self.recap_data.state == 1].\
|
|
525
|
-
groupby(['freq_code'])['epoch'].min().\
|
|
526
|
-
to_frame()
|
|
527
760
|
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
# we only care about movements from the initial sstate - this is a competing risks model
|
|
534
|
-
if fish not in self.start_times.index:
|
|
535
|
-
self.recap_data = self.recap_data[self.recap_data.freq_code != fish]
|
|
536
|
-
|
|
537
|
-
# identify unique fish to loop through
|
|
538
|
-
self.fish = self.recap_data.freq_code.unique()
|
|
539
|
-
|
|
540
|
-
def data_prep(self,
|
|
541
|
-
project,
|
|
542
|
-
time_dependent_covariates = False,
|
|
543
|
-
unknown_state = None,
|
|
544
|
-
bucket_length_min = 15,
|
|
545
|
-
adjacency_filter = None):
|
|
546
|
-
|
|
547
|
-
if unknown_state != None:
|
|
548
|
-
'''It may be beneficial to allow fish to enter into an unknown state
|
|
549
|
-
rather than become censored at their last recapture in the initial state.
|
|
550
|
-
This way the Nelson-Aalen will match empirical expectations. If we have
|
|
551
|
-
a lot of censored fish we lose information from the denominator and
|
|
552
|
-
numerator. If we put fish into an unknown state rather than censoring
|
|
553
|
-
them we still have informative data. For this to work, we only need to
|
|
554
|
-
know the last recapture of any fish in the initial state. We will
|
|
555
|
-
assess absorbption into the unknown state with a Boolean statement later on.'''
|
|
556
|
-
|
|
557
|
-
last_epoch = self.recap_data[self.recap_data.state == 1].epoch.max()
|
|
761
|
+
if initial_state_release == True:
|
|
762
|
+
'''we are modeling movement from the initial release location rather
|
|
763
|
+
than the initial location in our competing risks model. This allows
|
|
764
|
+
us to quantify fall back. If a fish never makes it to the intial
|
|
765
|
+
spoke, then its fall back.
|
|
558
766
|
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
tables.
|
|
767
|
+
If we care about modeling from the release point, we need to query
|
|
768
|
+
release times of each fish, morph data into a recaptures file and
|
|
769
|
+
merge it to self.data'''
|
|
563
770
|
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
time_delta = time_1 - time_0
|
|
597
|
-
|
|
598
|
-
# creat a row and add it to the state table
|
|
599
|
-
row_arr = [i,
|
|
600
|
-
fish_dat.state.values[0],
|
|
601
|
-
presence,
|
|
602
|
-
time_1,
|
|
603
|
-
time_delta,
|
|
604
|
-
time_0,
|
|
605
|
-
first_obs] # create initial row for state table
|
|
606
|
-
|
|
607
|
-
row = pd.DataFrame(np.array([row_arr]),columns = columns)
|
|
608
|
-
state_table = state_table.append(row)
|
|
609
|
-
first_obs = 0 # no other recapture can be the first
|
|
610
|
-
|
|
611
|
-
# give rows an arbitrary index and find the index of our last row
|
|
612
|
-
fish_dat['idx'] = np.arange(0,len(fish_dat),1)
|
|
613
|
-
max_idx = fish_dat.idx.iloc[-1]
|
|
614
|
-
|
|
615
|
-
# loop through rows, if it is a new state enter transition data
|
|
616
|
-
for j in fish_dat.iterrows():
|
|
617
|
-
row_idx = j[1]['idx']
|
|
618
|
-
state = j[1]['state']
|
|
619
|
-
prev_state = j[1]['prev_state']
|
|
620
|
-
|
|
621
|
-
if state != prev_state or row_idx == max_idx:
|
|
622
|
-
time_1 = j[1]['epoch']
|
|
623
|
-
if unknown_state != None \
|
|
624
|
-
and row_idx == max_idx \
|
|
625
|
-
and state == 1 \
|
|
626
|
-
and time_1 < last_epoch:
|
|
627
|
-
state = unknown_state
|
|
628
|
-
|
|
629
|
-
time_delta = time_1 - time_0
|
|
630
|
-
presence = presence + 1
|
|
631
|
-
row_arr = [i,state,presence,time_1,time_delta,time_0,first_obs]
|
|
632
|
-
|
|
633
|
-
row = pd.DataFrame(row_arr,columns = columns)
|
|
634
|
-
row['state'] = pd.to_numeric(row.state)
|
|
635
|
-
row = row.astype({'freq_code':'object',
|
|
636
|
-
'state':'int32',
|
|
637
|
-
'presence':'int32',
|
|
638
|
-
'epoch':'float32',
|
|
639
|
-
'time_delta':'float32',
|
|
640
|
-
'time_0': 'float32',
|
|
641
|
-
'first_obs':'int32'})
|
|
642
|
-
|
|
643
|
-
state_table = state_table.append(row)
|
|
644
|
-
time_0 = j[1]['epoch']
|
|
645
|
-
|
|
646
|
-
print ("State Table Completed for Fish %s"%(i))
|
|
647
|
-
|
|
648
|
-
# identify transitions and write to the state table
|
|
649
|
-
from_rec = state_table['state'].shift(1)
|
|
650
|
-
|
|
651
|
-
to_rec = state_table['state'].astype(np.int32)
|
|
652
|
-
trans = tuple(zip(from_rec,to_rec))
|
|
653
|
-
state_table['transition'] = trans
|
|
654
|
-
state_table['start_state'] = from_rec
|
|
655
|
-
state_table['end_state'] = to_rec
|
|
656
|
-
|
|
657
|
-
# get time all sorted out
|
|
658
|
-
state_table['t0'] = np.zeros(len(state_table))
|
|
659
|
-
state_table['t1'] = state_table['epoch'] - state_table['time_0']
|
|
771
|
+
# get data
|
|
772
|
+
release_dat = project.tags.copy()
|
|
773
|
+
|
|
774
|
+
# do some data management
|
|
775
|
+
rel_raw = release_dat['rel_date']
|
|
776
|
+
rel_parsed = pd.to_datetime(rel_raw, format="%m/%d/%y %H:%M", errors="coerce")
|
|
777
|
+
if rel_parsed.isna().any():
|
|
778
|
+
alt_parsed = pd.to_datetime(rel_raw, format="%m/%d/%Y %H:%M", errors="coerce")
|
|
779
|
+
rel_parsed = rel_parsed.fillna(alt_parsed)
|
|
780
|
+
if rel_parsed.isna().any():
|
|
781
|
+
missing = int(rel_parsed.isna().sum())
|
|
782
|
+
raise ValueError(f"[TTE] Failed to parse {missing} release dates in tags table.")
|
|
783
|
+
release_dat['rel_date'] = rel_parsed
|
|
784
|
+
release_dat['epoch'] = np.round((release_dat.rel_date - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s'),6)
|
|
785
|
+
release_dat.rename(columns = {'rel_date':'time_stamp'}, inplace = True)
|
|
786
|
+
release_dat['rec_id'] = np.repeat('rel', len(release_dat))
|
|
787
|
+
release_dat['state'] = np.zeros(len(release_dat))
|
|
788
|
+
release_dat['presence_number'] = np.zeros(len(release_dat))
|
|
789
|
+
|
|
790
|
+
|
|
791
|
+
# filter out tag data we don't want mucking up our staistical model
|
|
792
|
+
# NOTE: do not apply species filter here; defer to data_prep to keep
|
|
793
|
+
# master_state_table construction deterministic regardless of
|
|
794
|
+
# whether species filtering is requested.
|
|
795
|
+
if rel_loc != None:
|
|
796
|
+
release_dat = release_dat[release_dat.rel_loc == rel_loc]
|
|
797
|
+
if cap_loc != None:
|
|
798
|
+
release_dat = release_dat[release_dat.cap_loc == cap_loc]
|
|
799
|
+
if rel_date != None:
|
|
800
|
+
release_dat = release_dat[release_dat.time_stamp >= pd.to_datetime(rel_date)]
|
|
801
|
+
if recap_date != None:
|
|
802
|
+
release_dat = release_dat[release_dat.time_stamp >= pd.to_datetime(recap_date)]
|
|
660
803
|
|
|
661
|
-
|
|
662
|
-
|
|
804
|
+
release_dat.reset_index(inplace = True)
|
|
805
|
+
release_dat = release_dat[release_dat['freq_code'].isin(self.fish)]
|
|
663
806
|
|
|
664
|
-
|
|
807
|
+
# add to recaptures table and create a start times table
|
|
808
|
+
self.recap_data = pd.concat([self.recap_data, release_dat], axis=0, ignore_index=True)
|
|
809
|
+
self.start_times = release_dat[['freq_code', 'epoch']].copy()
|
|
810
|
+
self.start_times = self.start_times.rename(columns={'epoch': 'first_recapture'}).set_index('freq_code')
|
|
665
811
|
else:
|
|
666
|
-
|
|
812
|
+
# movement from state 1
|
|
813
|
+
self.start_times = self.recap_data[self.recap_data.state == 1].\
|
|
814
|
+
groupby(['freq_code'])['epoch'].min().\
|
|
815
|
+
to_frame()
|
|
667
816
|
|
|
668
|
-
|
|
669
|
-
self.
|
|
817
|
+
# movement from first recapture
|
|
818
|
+
# self.start_times = (
|
|
819
|
+
# self.recap_data
|
|
820
|
+
# .groupby(['freq_code'])['epoch']
|
|
821
|
+
# .min()
|
|
822
|
+
# .to_frame()
|
|
823
|
+
# )
|
|
824
|
+
|
|
825
|
+
self.start_times.rename(columns = {'epoch':'first_recapture'},
|
|
826
|
+
inplace = True)
|
|
827
|
+
|
|
828
|
+
# Clean Up stuff that doesn't make sense
|
|
829
|
+
for fish in self.recap_data.freq_code.unique():
|
|
830
|
+
# we only care about movements from the initial sstate - this is a competing risks model
|
|
831
|
+
if fish not in self.start_times.index:
|
|
832
|
+
self.recap_data = self.recap_data[self.recap_data.freq_code != fish]
|
|
833
|
+
|
|
834
|
+
def _apply_adjacency_filter(self, adjacency_filter):
|
|
835
|
+
'''When the truth value of a detection is assessed, a detection
|
|
836
|
+
may be valid for a fish that is not present.
|
|
837
|
+
|
|
838
|
+
In some instances, especially when using Yagi antennas, back-lobes
|
|
839
|
+
may develop where a fish in the tailrace of a powerhouse is
|
|
840
|
+
detected in the forebay antenna. In these instances, a downstream
|
|
841
|
+
migrating fish would not have migrated up through the powerhouse.
|
|
842
|
+
|
|
843
|
+
From a false positive perspective, these records are valid detections.
|
|
844
|
+
However, from a movement perspective, these series of detections
|
|
845
|
+
could not occur and should be removed.
|
|
846
|
+
|
|
847
|
+
This function repeatedly removes rows with 'illegal' movements
|
|
848
|
+
until there are none left. Rows with 'illegal' transitions are
|
|
849
|
+
identified with a list that is passed to the function.
|
|
850
|
+
|
|
851
|
+
input = list of illegal transitions stored as (from, to) tuples
|
|
852
|
+
'''
|
|
853
|
+
fish = self.master_state_table.freq_code.unique()
|
|
854
|
+
filtered = pd.DataFrame()
|
|
855
|
+
for i in fish:
|
|
856
|
+
fish_dat = self.master_state_table[self.master_state_table.freq_code == i]
|
|
857
|
+
|
|
858
|
+
# create a condition, we're running this filter because we know illogical movements are present
|
|
859
|
+
bad_moves_present = True
|
|
860
|
+
|
|
861
|
+
# while there are illogical movements, keep filtering
|
|
862
|
+
while bad_moves_present == True:
|
|
863
|
+
# let's keep count of the number of rows we are filtering
|
|
864
|
+
filtered_rows = 0.0
|
|
865
|
+
|
|
866
|
+
# for every known bad movement
|
|
867
|
+
for j in adjacency_filter:
|
|
868
|
+
# find those rows where this movement exists
|
|
869
|
+
if 'transition' not in fish_dat.columns:
|
|
870
|
+
raise KeyError(
|
|
871
|
+
f"[TTE] Missing 'transition' column while filtering fish {i}."
|
|
872
|
+
)
|
|
873
|
+
fish_dat['transition_filter'] = np.where(fish_dat.transition == j, 1, 0)
|
|
874
|
+
#fish_dat.set_index(['time_0'], inplace = True)
|
|
875
|
+
|
|
876
|
+
if fish_dat.transition_filter.sum() > 0:
|
|
877
|
+
# add up those rows
|
|
878
|
+
filtered_rows = filtered_rows + fish_dat.transition_filter.sum()
|
|
879
|
+
print ('%s rows found with %s movements'%(fish_dat.transition_filter.sum(),j))
|
|
880
|
+
|
|
881
|
+
# do some data management, we need to take the start state and t0 of the affected rows and place them on the subsequent row
|
|
882
|
+
idx = fish_dat.index[fish_dat['transition_filter']==1]
|
|
883
|
+
time0 = fish_dat.iloc[0]['time_0']
|
|
884
|
+
|
|
885
|
+
for k in idx:
|
|
886
|
+
idx_int = fish_dat.index.get_loc(k)
|
|
887
|
+
t0_col = fish_dat.columns.get_loc('time_0')
|
|
888
|
+
start_col = fish_dat.columns.get_loc('start_state')
|
|
889
|
+
|
|
890
|
+
# get start time and start state
|
|
891
|
+
start = fish_dat.iloc[idx_int]['start_state']
|
|
892
|
+
t0 = fish_dat.iloc[idx_int]['time_0']
|
|
893
|
+
|
|
894
|
+
# write it to next row
|
|
895
|
+
idx1 = idx_int + 1
|
|
896
|
+
try:
|
|
897
|
+
fish_dat.iloc[idx1, start_col] = start
|
|
898
|
+
fish_dat.iloc[idx1, t0_col] = t0
|
|
899
|
+
except IndexError:
|
|
900
|
+
# when this occurs, there is no extra row - this last row will be deleted
|
|
901
|
+
continue
|
|
902
|
+
|
|
903
|
+
# remove those rows
|
|
904
|
+
fish_dat = fish_dat[fish_dat.transition_filter != 1]
|
|
905
|
+
# NOTE: do NOT assign a single time0 to the entire DataFrame
|
|
906
|
+
# (that overwrites time_0 for all remaining rows). The
|
|
907
|
+
# intended behavior is to copy start/time to the next row
|
|
908
|
+
# above (done in the loop that writes to idx1), so we
|
|
909
|
+
# leave the remaining time_0 values intact.
|
|
910
|
+
|
|
911
|
+
# create a new transition field
|
|
912
|
+
fish_dat['transition'] = list(zip(fish_dat.start_state.values.astype(int),
|
|
913
|
+
fish_dat.end_state.values.astype(int))) # Fixed: was tuple, should be list
|
|
914
|
+
|
|
915
|
+
#fish_dat.reset_index(inplace = True)
|
|
916
|
+
else:
|
|
917
|
+
pass # No illegal movements for this transition type
|
|
918
|
+
#fish_dat.reset_index(inplace = True)
|
|
919
|
+
|
|
920
|
+
if filtered_rows == 0.0:
|
|
921
|
+
# stop that loop
|
|
922
|
+
bad_moves_present = False
|
|
923
|
+
else:
|
|
924
|
+
pass # Continue filtering
|
|
925
|
+
|
|
926
|
+
# we can only have 1 transmission to point of no return - let's grab the last recapture
|
|
927
|
+
equal_rows = fish_dat[fish_dat['start_state'] == fish_dat['end_state']]
|
|
928
|
+
|
|
929
|
+
# Step 2: Get the index of the last occurrence where column1 equals column2
|
|
930
|
+
if fish_dat.empty:
|
|
931
|
+
raise ValueError(f"[TTE] No transitions remain for fish {i} after filtering.")
|
|
932
|
+
last_index = fish_dat.index[-1]
|
|
933
|
+
|
|
934
|
+
# Step 3: Drop all rows where column1 equals column2 except the last one
|
|
935
|
+
if len(equal_rows) > 1 and equal_rows.index[-1] == last_index:
|
|
936
|
+
fish_dat = fish_dat.drop(equal_rows.index[:-1])
|
|
937
|
+
elif len(equal_rows) > 1 and equal_rows.index[-1] != last_index:
|
|
938
|
+
fish_dat = fish_dat.drop(equal_rows.index)
|
|
939
|
+
elif len(equal_rows) == 1:
|
|
940
|
+
fish_dat = fish_dat.drop(equal_rows.index)
|
|
941
|
+
|
|
942
|
+
fish_dat.drop(labels = ['transition_filter'], axis = 1, inplace = True)
|
|
943
|
+
filtered = pd.concat([filtered, fish_dat])
|
|
944
|
+
|
|
945
|
+
# Print summary statistics
|
|
946
|
+
initial_transitions = len(self.master_state_table)
|
|
947
|
+
final_transitions = len(filtered)
|
|
948
|
+
removed = initial_transitions - final_transitions
|
|
949
|
+
if initial_transitions == 0:
|
|
950
|
+
raise ValueError("[TTE] Adjacency filter cannot run: master_state_table is empty.")
|
|
951
|
+
print(f"\n[TTE] Adjacency filter complete:")
|
|
952
|
+
print(f" Initial transitions: {initial_transitions:,}")
|
|
953
|
+
print(f" Final transitions: {final_transitions:,}")
|
|
954
|
+
print(f" Removed illegal movements: {removed:,} ({removed/initial_transitions*100:.1f}%)")
|
|
955
|
+
|
|
956
|
+
if self.initial_state_release == False:
|
|
957
|
+
self.master_state_table
|
|
958
|
+
|
|
959
|
+
self.master_state_table = filtered
|
|
960
|
+
|
|
961
|
+
def data_prep(self, project, unknown_state=None, bucket_length_min=15, adjacency_filter=None):
|
|
962
|
+
self.project = project
|
|
963
|
+
if unknown_state is not None:
|
|
964
|
+
last_epoch = self.recap_data[self.recap_data.state == 1].epoch.max()
|
|
965
|
+
|
|
966
|
+
columns = ['freq_code','species', 'start_state', 'end_state', 'presence', 'time_stamp',
|
|
967
|
+
'time_delta', 'first_obs', 'time_0', 'time_1', 'transition'] # Include 'transition' here
|
|
968
|
+
|
|
969
|
+
self.master_state_table = pd.DataFrame()
|
|
970
|
+
self.bucket_length = bucket_length_min
|
|
971
|
+
|
|
972
|
+
# Sorting recap_data by freq_code and epoch for efficient processing
|
|
973
|
+
self.recap_data.sort_values(by=['freq_code', 'epoch'], ascending=True, inplace=True)
|
|
974
|
+
|
|
975
|
+
# if self.initial_state_release == True:
|
|
976
|
+
# # Merge start_times into recap_data based on freq_code
|
|
977
|
+
# self.recap_data = self.recap_data.merge(
|
|
978
|
+
# self.start_times[['first_recapture']].reset_index(),
|
|
979
|
+
# on='freq_code',
|
|
980
|
+
# how='left'
|
|
981
|
+
# )
|
|
670
982
|
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
983
|
+
self.recap_data = self.recap_data.merge(
|
|
984
|
+
self.start_times[['first_recapture']].reset_index(),
|
|
985
|
+
on='freq_code',
|
|
986
|
+
how='left')
|
|
987
|
+
|
|
988
|
+
# Create a boolean array to mark the start of a new fish
|
|
989
|
+
fish_start_mask = self.recap_data['freq_code'] != self.recap_data['freq_code'].shift(1)
|
|
990
|
+
|
|
991
|
+
# Initialize state tracking columns
|
|
992
|
+
self.recap_data['prev_state'] = self.recap_data.groupby('freq_code')['state'].shift(1).fillna(0).astype(int)
|
|
993
|
+
if self.initial_state_release == False:
|
|
994
|
+
self.recap_data = self.recap_data[self.recap_data.prev_state > 0]
|
|
995
|
+
# Set time_0 to the previous epoch or first_recapture if it's the first observation
|
|
996
|
+
self.recap_data['time_0'] = self.recap_data.groupby('freq_code')['epoch'].shift(1)
|
|
997
|
+
self.recap_data['time_0'] = self.recap_data['time_0'].fillna(
|
|
998
|
+
self.recap_data['first_recapture']
|
|
999
|
+
)
|
|
1000
|
+
|
|
1001
|
+
self.recap_data['time_delta'] = self.recap_data['epoch'] - self.recap_data['time_0']
|
|
1002
|
+
|
|
1003
|
+
# Identify the rows where state changes or the fish changes (new fish)
|
|
1004
|
+
state_change_mask = self.recap_data['state'] != self.recap_data['prev_state']
|
|
1005
|
+
last_recapture_mask = self.recap_data.groupby('freq_code')['epoch'].transform('max') == self.recap_data['epoch']
|
|
1006
|
+
mask = state_change_mask | last_recapture_mask
|
|
1007
|
+
|
|
1008
|
+
# Filter rows to keep only those where state changes or it's the last record for the fish
|
|
1009
|
+
state_table = self.recap_data[mask].copy()
|
|
1010
|
+
|
|
1011
|
+
# Fill in the remaining columns
|
|
1012
|
+
state_table['start_state'] = state_table['prev_state'].astype('int32')
|
|
1013
|
+
state_table['end_state'] = state_table['state'].astype('int32')
|
|
1014
|
+
# drop duplicates
|
|
1015
|
+
state_table = state_table.drop_duplicates(subset=['time_0', 'end_state', 'start_state'])
|
|
1016
|
+
|
|
1017
|
+
state_table['presence'] = state_table.groupby('freq_code').cumcount()
|
|
1018
|
+
state_table['first_obs'] = fish_start_mask.astype(int)
|
|
1019
|
+
state_table['time_1'] = state_table['epoch']
|
|
1020
|
+
|
|
1021
|
+
# Create the 'transition' column by zipping 'start_state' and 'end_state'
|
|
1022
|
+
state_table['transition'] = list(zip(state_table['start_state'].astype('int32'), state_table['end_state'].astype('int32')))
|
|
1023
|
+
|
|
1024
|
+
# Add flow period for time-dependent variables
|
|
1025
|
+
state_table['flow_period'] = state_table['time_stamp'].dt.round('30min')
|
|
1026
|
+
|
|
1027
|
+
|
|
1028
|
+
# Write state table to master state table
|
|
1029
|
+
self.master_state_table = pd.concat([self.master_state_table, state_table[columns]], axis=0, ignore_index=True)
|
|
1030
|
+
|
|
1031
|
+
# handle initial release logic in helper functions to improve readability
|
|
1032
|
+
if self.initial_state_release:
|
|
1033
|
+
try:
|
|
1034
|
+
self._convert_release_rows()
|
|
1035
|
+
self._ensure_diagonals()
|
|
1036
|
+
self._insert_missing_releases(project)
|
|
1037
|
+
self._normalize_master()
|
|
1038
|
+
except (KeyError, ValueError, TypeError, IndexError) as e:
|
|
1039
|
+
raise RuntimeError(
|
|
1040
|
+
"[TTE] Failed to apply initial release normalization. "
|
|
1041
|
+
"Check release rows and state transitions for validity."
|
|
1042
|
+
) from e
|
|
1043
|
+
|
|
1044
|
+
if adjacency_filter is not None:
|
|
1045
|
+
self._apply_adjacency_filter(adjacency_filter)
|
|
1046
|
+
# Apply species filter to master_state_table if requested (defer filtering until master built)
|
|
1047
|
+
if getattr(self, 'species', None) is not None:
|
|
1048
|
+
if 'species' not in self.master_state_table.columns:
|
|
1049
|
+
raise ValueError("[TTE] Species filter requested but 'species' column is missing.")
|
|
1050
|
+
before = len(self.master_state_table)
|
|
1051
|
+
self.master_state_table = self.master_state_table[self.master_state_table.species == self.species].copy()
|
|
1052
|
+
after = len(self.master_state_table)
|
|
1053
|
+
print(f"[TTE] Applied species filter '{self.species}': {before} -> {after} rows")
|
|
1054
|
+
#self.master_stateTable = self.master_stateTable[self.master_stateTable.firstObs == 0]
|
|
1055
|
+
#self.master_state_table.to_csv(os.path.join(project.output_dir,'state_table.csv')
|
|
1056
|
+
|
|
1057
|
+
# generate summary statistics
|
|
1058
|
+
def summary(self, print_summary = True):
|
|
1059
|
+
"""Prepare the data needed for summarization."""
|
|
1060
|
+
self.master_state_table.dropna(subset = ['time_delta'],inplace = True)
|
|
1061
|
+
self.master_state_table = self.master_state_table.astype({'freq_code':'object',
|
|
1062
|
+
'species':'object',
|
|
1063
|
+
'start_state':'int32',
|
|
1064
|
+
'end_state':'int32',
|
|
1065
|
+
'presence':'int32',
|
|
1066
|
+
'time_stamp':'datetime64[ns]',
|
|
1067
|
+
'time_delta':'int32',
|
|
1068
|
+
'first_obs':'int32',
|
|
1069
|
+
'time_0':'int32',
|
|
1070
|
+
'time_1':'int32',
|
|
1071
|
+
'transition':'object'})
|
|
1072
|
+
|
|
1073
|
+
self.master_state_table['dur'] = (
|
|
1074
|
+
self.master_state_table['time_1'].astype('int32') -
|
|
1075
|
+
self.master_state_table['time_0'].astype('int32')
|
|
1076
|
+
)
|
|
1077
|
+
|
|
1078
|
+
self.unique_fish_count = len(self.master_state_table['freq_code'].unique())
|
|
1079
|
+
self.count_per_state = self.master_state_table.groupby('end_state')['freq_code'].nunique()
|
|
1080
|
+
self.msm_state_table = pd.crosstab(self.master_state_table['start_state'], self.master_state_table['end_state'])
|
|
1081
|
+
self.count_table = self.master_state_table.groupby(['start_state', 'end_state'])['freq_code'].nunique().unstack().fillna(0).astype('int32')
|
|
1082
|
+
self.fish_trans_count = self.master_state_table.groupby(['freq_code', 'transition']).size().unstack(fill_value=0)
|
|
1083
|
+
|
|
1084
|
+
grouped_stats = (
|
|
1085
|
+
self.master_state_table
|
|
1086
|
+
.groupby('transition')
|
|
1087
|
+
.agg({
|
|
1088
|
+
'dur': [
|
|
1089
|
+
'min',
|
|
1090
|
+
'median',
|
|
1091
|
+
'max'
|
|
1092
|
+
]
|
|
1093
|
+
})
|
|
1094
|
+
)
|
|
1095
|
+
self.move_summ = grouped_stats
|
|
1096
|
+
|
|
1097
|
+
"""Generate summary statistics as a dictionary."""
|
|
1098
|
+
min_trans_count = self.fish_trans_count.min()
|
|
1099
|
+
med_trans_count = self.fish_trans_count.median()
|
|
1100
|
+
max_trans_count = self.fish_trans_count.max()
|
|
1101
|
+
|
|
1102
|
+
summary_stats = {
|
|
1103
|
+
"unique_fish_count": self.unique_fish_count,
|
|
1104
|
+
"count_per_state": self.count_per_state,
|
|
1105
|
+
"state_transition_table": self.msm_state_table,
|
|
1106
|
+
"movement_count_table": self.count_table,
|
|
1107
|
+
"min_transition_count": min_trans_count,
|
|
1108
|
+
"median_transition_count": med_trans_count,
|
|
1109
|
+
"max_transition_count": max_trans_count,
|
|
1110
|
+
"movement_duration_summary": self.move_summ
|
|
1111
|
+
}
|
|
1112
|
+
# Print stats
|
|
1113
|
+
|
|
1114
|
+
print("-" * 110)
|
|
1115
|
+
print("Time To Event Data Manage Complete")
|
|
1116
|
+
print("-" * 110 + "\n")
|
|
1117
|
+
|
|
1118
|
+
print("--------------------------------------- MOVEMENT SUMMARY STATISTICS -----------------------------------------\n")
|
|
1119
|
+
print(f"In total, there were {summary_stats['unique_fish_count']} unique fish within this competing risks model.\n")
|
|
1120
|
+
|
|
1121
|
+
print(f"{summary_stats['unique_fish_count']} fish made the movements as enumerated in the state transition table:")
|
|
1122
|
+
print(summary_stats['state_transition_table'])
|
|
1123
|
+
print("The table should read movement from a row to a column.\n")
|
|
1124
|
+
|
|
1125
|
+
print("The number of unique fish to make these movements are found in the following count table:")
|
|
1126
|
+
print(summary_stats['movement_count_table'], "\n")
|
|
688
1127
|
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
time_1 = fish_dat.epoch.iloc[0]
|
|
693
|
-
|
|
694
|
-
# calculate seconds since releaes
|
|
695
|
-
time_delta = time_1 - time_0
|
|
696
|
-
|
|
697
|
-
# creat a row and add it to the state table
|
|
698
|
-
row_arr = [i,
|
|
699
|
-
0,
|
|
700
|
-
fish_dat.state.values[0],
|
|
701
|
-
presence,
|
|
702
|
-
fish_dat.time_stamp.values[-1],
|
|
703
|
-
first_obs,
|
|
704
|
-
time_0,
|
|
705
|
-
time_1] # create initial row for state table
|
|
706
|
-
|
|
707
|
-
row = pd.DataFrame(np.array([row_arr]),columns = columns)
|
|
708
|
-
state_table = state_table.append(row)
|
|
709
|
-
|
|
710
|
-
# create arbitrary index and get the maximum
|
|
711
|
-
fish_dat['idx'] = np.arange(0,len(fish_dat),1)
|
|
712
|
-
max_idx = fish_dat.idx.iloc[-1]
|
|
713
|
-
|
|
714
|
-
# for each row, if it's a new presence add data to state table
|
|
715
|
-
for j in fish_dat.iterrows(): # for every row in fish data
|
|
716
|
-
row_idx = j[1]['idx'] # what's the row number?
|
|
717
|
-
state_1 = int(j[1]['prev_state']) # what's the state
|
|
718
|
-
state_2 = int(j[1]['state']) # what was the previous state
|
|
719
|
-
ts = j[1]['time_stamp']
|
|
720
|
-
|
|
721
|
-
# if it's a new state or the end add a row
|
|
722
|
-
if state_1 != state_2 or row_idx == max_idx: # if the present state does not equal the previous state or if we reach the end of the dataframe...
|
|
723
|
-
time_1 = j[1]['epoch'] # what time is it?
|
|
724
|
-
time_delta = time_1 - time_0 # calculate difference in seconds between current time and release # if it's a new state
|
|
725
|
-
presence = presence + 1 # oh snap new observation for new state
|
|
726
|
-
row_arr = [i,
|
|
727
|
-
state_1,
|
|
728
|
-
state_2,
|
|
729
|
-
presence,
|
|
730
|
-
ts,
|
|
731
|
-
first_obs,
|
|
732
|
-
time_0,
|
|
733
|
-
time_1] # start a new row
|
|
734
|
-
row = pd.DataFrame(np.array([row_arr]),
|
|
735
|
-
columns = columns)
|
|
736
|
-
state_table = state_table.append(row) # add the row to the state table data frame
|
|
737
|
-
time_0 = j[1]['epoch']
|
|
738
|
-
first_obs = 0
|
|
739
|
-
|
|
740
|
-
print ("State Table Completed for Fish %s"%(i))
|
|
741
|
-
|
|
742
|
-
state_table.sort_values(by = 'time_0',
|
|
743
|
-
ascending = True,
|
|
744
|
-
inplace = True) # sort by exposure time
|
|
745
|
-
|
|
746
|
-
# put time into increments to match time series variables
|
|
747
|
-
time_bucket = self.bucket_length*60*1000000000 # time bucket in nanoseconds
|
|
748
|
-
state_table['flow_period'] = (state_table['time_0'].\
|
|
749
|
-
astype(np.int64)//time_bucket+1) * time_bucket # round to nearest 15 minute period
|
|
750
|
-
state_table['flow_period'] = pd.to_datetime(state_table['flow_period']) # turn it into a datetime object so we can use pandas to expand and fill
|
|
751
|
-
|
|
752
|
-
# create arbitrary index
|
|
753
|
-
row_num = np.arange(0,len(state_table),1)
|
|
754
|
-
state_table['row_num'] = row_num
|
|
755
|
-
|
|
756
|
-
# create an expanded state table
|
|
757
|
-
exp_state_table = pd.DataFrame()
|
|
758
|
-
|
|
759
|
-
# build expanded state table
|
|
760
|
-
for row in state_table.iterrows():
|
|
761
|
-
row_idx = row[1]['row_num']
|
|
762
|
-
t0 = row[1]['flow_period']
|
|
763
|
-
t1 = row[1]['t1']
|
|
764
|
-
|
|
765
|
-
# try expanding, if interval not large enough return nothing
|
|
766
|
-
try:
|
|
767
|
-
expand = pd.date_range(t0,
|
|
768
|
-
t1,
|
|
769
|
-
freq = '%smin'%(self.bucket_length))
|
|
770
|
-
except ValueError:
|
|
771
|
-
expand = []
|
|
772
|
-
except AttributeError:
|
|
773
|
-
expand = []
|
|
774
|
-
|
|
775
|
-
# if we can expand create intervals
|
|
776
|
-
if len(expand) > 0:
|
|
777
|
-
# create a series using expanded time stamps
|
|
778
|
-
series = pd.Series(expand,
|
|
779
|
-
index = expand,
|
|
780
|
-
name = 'flow_period')
|
|
781
|
-
|
|
782
|
-
# convert series to invterval dataframe and perform data management
|
|
783
|
-
intervals = series.to_frame()
|
|
784
|
-
intervals.reset_index(inplace = True, drop = True)
|
|
785
|
-
intervals['t0'] = row[1]['t0']
|
|
786
|
-
intervals['t1'] = row[1]['t1']
|
|
787
|
-
intervals['startState'] = row[1]['startState']
|
|
788
|
-
intervals['endState'] = row[1]['endState']
|
|
789
|
-
intervals['timeStamp'] = row[1]['timeStamp']
|
|
790
|
-
intervals['FreqCode'] = row[1]['FreqCode']
|
|
791
|
-
intervals['presence'] = row[1]['presence']
|
|
792
|
-
newRowArr = np.array([row[1]['FreqCode'],
|
|
793
|
-
row[1]['startState'],
|
|
794
|
-
row[1]['endState'],
|
|
795
|
-
row[1]['timeStamp'],
|
|
796
|
-
row[1]['flowPeriod'],
|
|
797
|
-
row[1]['t0'],
|
|
798
|
-
row[1]['t1'],
|
|
799
|
-
row[1]['presence']])
|
|
800
|
-
newRow = pd.DataFrame(np.array([newRowArr]),columns = ['FreqCode',
|
|
801
|
-
'startState',
|
|
802
|
-
'endState',
|
|
803
|
-
'timeStamp',
|
|
804
|
-
'flowPeriod',
|
|
805
|
-
't0',
|
|
806
|
-
't1',
|
|
807
|
-
'presence']) # add first, non expanded row to new state table
|
|
808
|
-
newRow = newRow.append(intervals) # add filled and expanded data
|
|
809
|
-
newRow['nextFlowPeriod'] = newRow['flowPeriod'].shift(-1) # identify the next flow period
|
|
810
|
-
newRow['idx'] = np.arange(0,len(newRow),1) # add a count index field, but don't index it yet
|
|
811
|
-
newRow.reset_index(inplace = True, drop = True) # remove the index
|
|
812
|
-
idxL = newRow.idx.values # generate list of indexes
|
|
813
|
-
newRow.loc[idxL[1]:,'t0'] = newRow.loc[idxL[1]:,'flowPeriod'].astype(str) # after the initial t0, re-write the current t0 as the current row's flow period
|
|
814
|
-
newRow.ix[:idxL[-2],'t1'] = newRow.loc[:idxL[-2],'nextFlowPeriod'].astype(str) # other than the last t1, re-write the current t1 as the current row's next flow period - see what we did there?
|
|
815
|
-
newRow.ix[:idxL[-2]:,'endState'] = row[1]['startState']# other than the last row in the series, re-write the end state as the start state - there will be a lot of to-from same site here. it's ok, these are censored observations.
|
|
816
|
-
newRow['t0'] = pd.to_datetime(newRow['t0']) # convert time text to datetime - so we can do stuff with it
|
|
817
|
-
newRow['t1'] = pd.to_datetime(newRow['t1'])
|
|
818
|
-
exp_state_table = exp_state_table.append(newRow) # now add all that stuff to the state table dataframe
|
|
819
|
-
del newRow, intervals, newRowArr, expand
|
|
820
|
-
else:
|
|
821
|
-
newRowArr = np.array([row[1]['FreqCode'],
|
|
822
|
-
row[1]['startState'],
|
|
823
|
-
row[1]['endState'],
|
|
824
|
-
row[1]['timeStamp'],
|
|
825
|
-
row[1]['flowPeriod'],
|
|
826
|
-
row[1]['t0'],
|
|
827
|
-
row[1]['t1'],
|
|
828
|
-
row[1]['presence']])
|
|
829
|
-
newRow = pd.DataFrame(np.array([newRowArr]),columns = ['FreqCode',
|
|
830
|
-
'startState',
|
|
831
|
-
'endState',
|
|
832
|
-
'timeStamp',
|
|
833
|
-
'flowPeriod',
|
|
834
|
-
't0',
|
|
835
|
-
't1',
|
|
836
|
-
'presence']) # add first, non expanded row to new state table
|
|
837
|
-
exp_state_table = exp_state_table.append(newRow)
|
|
838
|
-
del newRow, newRowArr
|
|
839
|
-
# exp_state_table.sort_values(by = 't0', ascending = True, inplace = True) # sort by exposure time
|
|
840
|
-
# exp_state_table['time0'] = pd.to_datetime(exp_state_table['t0']) # create new time columns
|
|
841
|
-
# exp_state_table['time1'] = pd.to_datetime(exp_state_table['t1'])
|
|
842
|
-
# exp_state_table['t0'] = (pd.to_datetime(exp_state_table['t0']) - initialTime)/np.timedelta64(1,'s')
|
|
843
|
-
# exp_state_table['t1'] = (pd.to_datetime(exp_state_table['t1']) - initialTime)/np.timedelta64(1,'s')
|
|
844
|
-
# # calculate minimum t0 by presence
|
|
845
|
-
# min_t0 = exp_stateTable.groupby(['presence'])['t0'].min()#.to_frame().rename({'t0':'min_t0'},inplace = True)
|
|
846
|
-
# min_t0 = pd.Series(min_t0, name = 'min_t0')
|
|
847
|
-
# min_t0 = pd.DataFrame(min_t0).reset_index()
|
|
848
|
-
# # join to exp_stateTable as presence_time_0
|
|
849
|
-
# exp_stateTable = pd.merge(left = exp_stateTable, right = min_t0, how = u'left',left_on = 'presence', right_on = 'presence')
|
|
850
|
-
# # subtract presence_time_0 from t0 and t1
|
|
851
|
-
# exp_stateTable['t0'] = exp_stateTable['t0'] - exp_stateTable['min_t0']
|
|
852
|
-
# exp_stateTable['t1'] = exp_stateTable['t1'] - exp_stateTable['min_t0']
|
|
853
|
-
# # drop presence_time_0 from exp_stateTable
|
|
854
|
-
|
|
855
|
-
# exp_stateTable['hour'] = pd.DatetimeIndex(exp_stateTable['time0']).hour # get the hour of the day from the current time stamp
|
|
856
|
-
# exp_stateTable['qDay'] = exp_stateTable.hour//6 # integer division by 6 to put the day into a quarter
|
|
857
|
-
# exp_stateTable['test'] = exp_stateTable.t1 - exp_stateTable.t0 # this is no longer needed, but if t1 is smaller than t0 things are screwed up
|
|
858
|
-
# stateTable = exp_stateTable
|
|
859
|
-
# del exp_stateTable
|
|
860
|
-
# stateTable['transition'] = tuple(zip(stateTable.startState.values.astype(int),stateTable.endState.values.astype(int))) # create transition variable, this is helpful in R
|
|
861
|
-
# self.master_stateTable = self.master_stateTable.append(stateTable)
|
|
862
|
-
# export
|
|
863
|
-
self.master_stateTable.drop(labels = ['nextFlowPeriod'],axis = 1, inplace = True)
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
-        if adjacency_filter is not None:
-            '''When the truth value of a detection is assessed, a detection
-            may be valid for a fish that is not present.
-
-            In some instances, especially when using Yagi antennas, back-lobes
-            may develop where a fish in the tailrace of a powerhouse is
-            detected in the forebay antenna. In these instances, a downstream
-            migrating fish would not have migrated up through the powerhouse.
-
-            From a false positive perspective, these records are valid detections.
-            However, from a movement perspective, these series of detections
-            could not occur and should be removed.
-
-            This function repeatedly removes rows with 'illegal' movements
-            until there are none left. Rows with 'illegal' transitions are
-            identified with a list that is passed to the function.
-
-            input = list of illegal transitions stored as (from, to) tuples
-            '''
-            fish = self.master_stateTable.FreqCode.unique()
-
-            for i in fish:
-                fishDat = self.master_stateTable[self.master_stateTable.FreqCode == i]
-                self.master_stateTable = self.master_stateTable[self.master_stateTable.FreqCode != i]
-
-                # create a condition, we're running this filter because we know illogical movements are present
-                bad_moves_present = True
-
-                # while there are illogical movements, keep filtering
-                while bad_moves_present == True:
-                    # let's keep count of the number of rows we are filtering
-                    filtered_rows = 0.0
-
-                    # for every known bad movement
-                    for j in adjacency_filter:
-                        print ("Starting %s filter"%(i))
-                        # find those rows where this movement exists
-                        fishDat['transition_filter'] = np.where(fishDat.transition == j,1,0)
-                        fishDat.set_index(['time0'], inplace = True)
-
-                        if fishDat.transition_filter.sum() > 0:
-                            # add up those rows
-                            filtered_rows = filtered_rows + fishDat.transition_filter.sum()
-                            print ('%s rows found with %s movements'%(fishDat.transition_filter.sum(),j))
-
-                            # do some data management, we need to take the start state and t0 of the affected rows and place them on the subsequent row
-                            idx = fishDat.index[fishDat['transition_filter']==1]
-
-                            for k in idx:
-                                idx_int = fishDat.index.get_loc(k)
-                                t0_col = fishDat.columns.get_loc('t0')
-                                start_col = fishDat.columns.get_loc('startState')
-
-                                # get start time and start state
-                                start = fishDat.iloc[idx_int]['startState']
-                                t0 = fishDat.iloc[idx_int]['t0']
-
-                                # write it to next row
-                                try:
-                                    idx1 = idx_int + 1
-                                except:
-                                    start = fishDat.iloc[idx_int].index[0]
-                                    idx1 = start + 1
-                                try:
-                                    fishDat.iloc[idx1, start_col] = start
-                                    fishDat.iloc[idx1, t0_col] = t0
-                                except IndexError:
-                                    # when this occurs, there is no extra row - this last row will be deleted
-                                    continue
-
-                            # remove those rows
-                            fishDat = fishDat[fishDat.transition_filter != 1]
-
-                            # create a new transition field
-                            fishDat['transition'] = tuple(zip(fishDat.startState.values.astype(int),
-                                                              fishDat.endState.values.astype(int)))
-
-                            fishDat.reset_index(inplace = True)
-                        else:
-                            print ("No illegal movements identified")
-                            fishDat.reset_index(inplace = True)
-
-                    if filtered_rows == 0.0:
-                        print ("All illegal movements for fish %s removed"%(i))
-                        # stop that loop
-                        bad_moves_present = False
-
-                    else:
-                        # i feel bad for you son
-                        print ("%s illegal movements present in iteration, go again"%(filtered_rows))
-
-                fishDat.drop(labels = ['transition_filter'], axis = 1, inplace = True)
-                self.master_stateTable = self.master_stateTable.append(fishDat)
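The block deleted above carried each dropped row's start state and t0 onto the following row before removing it, looping until no illegal (from, to) pairs remained. The same carry-forward idea can be written as a few lines of standalone pandas; this is a minimal sketch, where `drop_illegal_transitions` and the toy column names are illustrative and not part of the pymast API:

```python
import pandas as pd

def drop_illegal_transitions(df, illegal):
    """Iteratively drop rows whose (start_state, end_state) pair is in
    `illegal`, carrying each dropped row's start_state and t0 onto the
    next row so the remaining rows still chain together."""
    df = df.sort_values('t0').reset_index(drop=True)
    while True:
        bad = df.apply(lambda r: (r.start_state, r.end_state) in illegal, axis=1)
        if not bad.any():
            return df
        for i in df.index[bad]:
            if i + 1 in df.index:
                # the next row inherits the removed row's origin
                df.loc[i + 1, ['start_state', 't0']] = df.loc[i, ['start_state', 't0']].values
        df = df.loc[~bad].reset_index(drop=True)

# toy data: a fish cannot move 9 -> 1, so that row is folded into its successor
toy = pd.DataFrame({'start_state': [1, 9, 1],
                    'end_state':   [9, 1, 2],
                    't0':          [0, 100, 250]})
print(drop_illegal_transitions(toy, {(9, 1)}))   # remaining rows: (1, 9) then (9, 2)
```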
+        print("The number of movements a fish is expected to make is best described with min, median, and maximum statistics.\n")
+        print("Minimum number of times each transition was made:")
+        print(summary_stats['min_transition_count'], "\n")

-
-
+        print("Median number of times each transition was made:")
+        print(summary_stats['median_transition_count'], "\n")

-
-
-
-        print
-        print
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
print("Maximum number of times each transition was made by each fish:")
|
|
1136
|
+
print(summary_stats['max_transition_count'], "\n")
|
|
1137
|
+
|
|
1138
|
+
print("Movement summaries - Duration between states in seconds:")
|
|
1139
|
+
print(summary_stats['movement_duration_summary'], "\n")
|
|
1140
|
+
self.msm_state_table.to_csv(os.path.join(self.project.output_dir,'state_table.csv'))
|
|
1141
|
+
self.count_table.to_csv(os.path.join(self.project.output_dir,'count_table.csv'))
|
|
1142
|
+
self.master_state_table.to_csv(os.path.join(self.project.output_dir,'tte.csv'))
|
|
1143
|
+
self.move_summ.to_csv(os.path.join(self.project.output_dir,'movement_summary.csv'))
|
|
1144
|
+
return summary_stats
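A minimal usage sketch of the reworked `summary()`, assuming `tte` is an already-prepared `time_to_event` instance; the dictionary keys and CSV names are the ones printed and written by the added lines above:

```python
# assumes `tte` is a prepared time_to_event instance
stats = tte.summary()                    # prints the tables above and returns a dict

print(stats['median_transition_count'])  # e.g. median visits per (from, to) pair

# summary() also writes four CSVs into the project's output directory:
#   state_table.csv, count_table.csv, tte.csv, movement_summary.csv
```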
+
+    # Helper methods for initial_state_release handling (refactored)
+    def _convert_release_rows(self):
+        """Convert transitions ending at release (0) into stay-in-place rows
+        and remove trivial 0->0 rows. Operates per-fish to avoid cross-fish
+        contamination and prints summaries when changes occur."""
+        if 'end_state' not in self.master_state_table.columns:
+            return
+        per_fish = []
+        total_converted = 0
+        total_removed = 0
+        for fish in self.master_state_table['freq_code'].unique():
+            df = self.master_state_table[self.master_state_table['freq_code'] == fish].copy()
+            mask_end0 = df['end_state'] == 0
+            n_end0 = int(mask_end0.sum())
+            if n_end0 > 0:
+                total_converted += n_end0
+                for idx in df.index[mask_end0].tolist():
+                    ss = int(df.at[idx, 'start_state']) if not pd.isna(df.at[idx, 'start_state']) else int(df.loc[idx, 'start_state'])
+                    df.at[idx, 'end_state'] = ss
+                    df.at[idx, 'time_delta'] = 0
+                    df.at[idx, 'time_1'] = df.at[idx, 'time_0']
+                    df.at[idx, 'transition'] = (ss, ss)
+
+            zero_zero_mask = (df['start_state'] == 0) & (df['end_state'] == 0)
+            n_zero_zero = int(zero_zero_mask.sum())
+            if n_zero_zero > 0:
+                total_removed += n_zero_zero
+                df = df.loc[~zero_zero_mask].copy()
+
+            per_fish.append(df)
+
+        if total_converted > 0:
+            print(f"[TTE] Converting {total_converted} transitions ending at release state (0) into stay-in-place rows")
+        if total_removed > 0:
+            print(f"[TTE] Removing {total_removed} trivial release->release (0->0) rows")
+
+        if per_fish:
+            self.master_state_table = pd.concat(per_fish, axis=0, ignore_index=True)
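In isolation, the conversion rule above amounts to the following; a toy sketch using the `master_state_table` column names shown in the method, with illustrative values only:

```python
import pandas as pd

# toy rows for one fish: a 2 -> 0 transition becomes a 2 -> 2 "stay" row,
# and a trivial 0 -> 0 row is dropped entirely
toy = pd.DataFrame({
    'freq_code':   ['164.480 25', '164.480 25'],   # illustrative tag code
    'start_state': [2, 0],
    'end_state':   [0, 0],
    'time_0':      [1000, 0],
    'time_1':      [1400, 0],
})

stay = toy['end_state'] == 0
toy.loc[stay, 'end_state'] = toy.loc[stay, 'start_state']   # 0 -> start_state
toy.loc[stay, 'time_1'] = toy.loc[stay, 'time_0']           # zero-length stay
toy = toy[~((toy['start_state'] == 0) & (toy['end_state'] == 0))]
print(toy)   # one row remains: 2 -> 2 with time_1 == time_0 == 1000
```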
+
+    def _ensure_diagonals(self):
+        """Ensure each fish's last-seen state has a diagonal (S->S) row
+        whose entry time is the start of the contiguous final block of S."""
+        diag_rows = []
+        for fish in self.master_state_table['freq_code'].unique():
+            rows = self.master_state_table[self.master_state_table['freq_code'] == fish].copy()
+            if rows.empty:
+                continue
+            try:
+                rows['time_1'] = rows['time_1'].astype('int64')
+                rows['time_0'] = rows['time_0'].astype('int64')
+            except (ValueError, TypeError) as exc:
+                raise ValueError(
+                    f"[TTE] Invalid time_0/time_1 values for fish {fish}."
+                ) from exc
+            last_idx = rows['time_1'].idxmax()
+            last_row = rows.loc[last_idx]
+            last_state = int(last_row['end_state'])
+            last_time = int(last_row['time_1'])
+            diag_exists = ((rows['start_state'] == last_state) & (rows['end_state'] == last_state) & (rows['time_1'] == last_time)).any()
+            if not diag_exists and last_state > 0:
+                fr = rows.sort_values(['time_0','time_1']).reset_index(drop=True)
+                positions = fr.index[fr['end_state'] == last_state].tolist()
+                entry_time = last_time
+                if positions:
+                    candidate_idxs = [i for i in positions if int(fr.at[i, 'time_1']) == last_time]
+                    last_pos = candidate_idxs[-1] if candidate_idxs else positions[-1]
+                    start_pos = last_pos
+                    while start_pos > 0 and int(fr.at[start_pos - 1, 'end_state']) == last_state:
+                        start_pos -= 1
+                    try:
+                        entry_time = int(fr.at[start_pos, 'time_0'])
+                    except (KeyError, ValueError, TypeError) as exc:
+                        raise ValueError(
+                            f"[TTE] Invalid entry time for fish {fish} at state {last_state}."
+                        ) from exc
+
+                diag = {
+                    'freq_code': fish,
+                    'species': last_row.get('species', np.nan),
+                    'start_state': last_state,
+                    'end_state': last_state,
+                    'presence': int(last_row.get('presence', 0)),
+                    'time_stamp': last_row.get('time_stamp', pd.to_datetime(last_time, unit='s')),
+                    'time_delta': int(last_time - entry_time),
+                    'first_obs': 0,
+                    'time_0': int(entry_time),
+                    'time_1': int(last_time),
+                    'transition': (last_state, last_state)
+                }
+                diag_rows.append(diag)
+
+        if diag_rows:
+            diag_df = pd.DataFrame(diag_rows)
+            for c in diag_df.columns:
+                if c in self.master_state_table.columns:
+                    try:
+                        diag_df[c] = diag_df[c].astype(self.master_state_table[c].dtype)
+                    except (ValueError, TypeError) as exc:
+                        raise ValueError(
+                            f"[TTE] Failed to cast diagonal column '{c}' for fish rows."
+                        ) from exc
+            self.master_state_table = pd.concat([self.master_state_table, diag_df], axis=0, ignore_index=True)
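The heart of the method is the backward scan that finds when the fish entered the contiguous final block of its last state. Reduced to plain Python on toy values (illustrative data, not project output):

```python
# per-row end states and entry times, sorted by time, for one toy fish
end_states = [1, 2, 2, 2]      # fish ends its record in state 2
time_0     = [0, 50, 90, 120]

last_state = end_states[-1]    # 2
pos = len(end_states) - 1
while pos > 0 and end_states[pos - 1] == last_state:
    pos -= 1                   # walk back over the run of 2s
entry_time = time_0[pos]       # 50: first row of the final block of state 2
print(last_state, entry_time)  # the (2, 2) diagonal row gets time_0 = 50
```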
+
+    def _insert_missing_releases(self, project=None):
+        """Insert a release (start_state==0) row for fish missing one using
+        release times from project's tag table (must have `freq_code`)."""
+        tag_table = None
+        if hasattr(self, 'project') and hasattr(self.project, 'tags'):
+            tag_table = self.project.tags
+        elif project is not None and hasattr(project, 'tags'):
+            tag_table = project.tags
+        if tag_table is None:
+            raise ValueError("[TTE] Tag table missing; cannot insert missing releases.")
+
+        if 'freq_code' in tag_table.columns:
+            tag_df = tag_table.copy()
+        else:
+            tag_df = tag_table.reset_index()
+            if 'freq_code' not in tag_df.columns:
+                raise ValueError("[TTE] Tag table missing required 'freq_code' column.")
+
+        missing = []
+        for fish in self.master_state_table['freq_code'].unique():
+            fish_df = self.master_state_table[self.master_state_table['freq_code'] == fish]
+            if (fish_df['start_state'] == 0).any():
+                continue
+            earliest = fish_df.sort_values(['time_0', 'time_1']).iloc[0]
+            try:
+                candidate_end = int(earliest.get('start_state', earliest.get('end_state', 0)))
+            except (ValueError, TypeError) as exc:
+                raise ValueError(f"[TTE] Invalid start_state for fish {fish}.") from exc
+            if candidate_end == 0:
+                candidate_end = int(earliest.get('end_state', 0))
+
+            rows = tag_df[tag_df['freq_code'].astype(str) == str(fish)]
+            if rows.empty:
+                raise ValueError(f"[TTE] Missing release data for fish {fish}.")
+
+            if 'rel_date' in rows.columns:
+                try:
+                    rel_epoch = int(
+                        (pd.to_datetime(rows['rel_date'].iloc[0]) - pd.Timestamp('1970-01-01'))
+                        / pd.Timedelta('1s')
+                    )
+                except (ValueError, TypeError) as exc:
+                    raise ValueError(f"[TTE] Invalid rel_date for fish {fish}.") from exc
+            elif 'epoch' in rows.columns:
+                try:
+                    rel_epoch = int(rows['epoch'].iloc[0])
+                except (ValueError, TypeError) as exc:
+                    raise ValueError(f"[TTE] Invalid epoch for fish {fish}.") from exc
+            else:
+                raise ValueError(f"[TTE] Tag table missing rel_date/epoch for fish {fish}.")
+
+            species_val = rows['species'].iloc[0] if 'species' in rows.columns else np.nan
+            release_row = {
+                'freq_code': fish,
+                'species': species_val,
+                'start_state': 0,
+                'end_state': candidate_end,
+                'presence': 0,
+                'time_stamp': pd.to_datetime(rel_epoch, unit='s'),
+                'time_delta': int(earliest['time_0'] - rel_epoch) if 'time_0' in earliest else 0,
+                'first_obs': 0,
+                'time_0': int(rel_epoch),
+                'time_1': int(earliest['time_0']) if 'time_0' in earliest else int(rel_epoch),
+                'transition': (0, candidate_end)
+            }
+            missing.append(release_row)
+
+        if missing:
+            mr = pd.DataFrame(missing)
+            for c in mr.columns:
+                if c in self.master_state_table.columns:
+                    try:
+                        mr[c] = mr[c].astype(self.master_state_table[c].dtype)
+                    except (ValueError, TypeError) as exc:
+                        raise ValueError(
+                            f"[TTE] Failed to cast release column '{c}' for missing releases."
+                        ) from exc
+            self.master_state_table = pd.concat([self.master_state_table, mr], axis=0, ignore_index=True)
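The `rel_date` branch converts a release timestamp to epoch seconds via timedelta division; the same expression in isolation, with a toy timestamp:

```python
import pandas as pd

rel_date = '2024-04-01 06:30:00'   # toy release timestamp
rel_epoch = int((pd.to_datetime(rel_date) - pd.Timestamp('1970-01-01'))
                / pd.Timedelta('1s'))
print(rel_epoch)                            # 1711953000
print(pd.to_datetime(rel_epoch, unit='s'))  # round-trips to 2024-04-01 06:30:00
```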
+
+    def _normalize_master(self):
+        """Normalize master_state_table: coerce time dtypes, drop duplicates,
+        sort deterministically, and recompute presence/first_obs per fish."""
+        try:
+            self.master_state_table['time_0'] = self.master_state_table['time_0'].astype('int64')
+            self.master_state_table['time_1'] = self.master_state_table['time_1'].astype('int64')
+        except (ValueError, TypeError) as exc:
+            raise ValueError("[TTE] time_0/time_1 must be numeric epoch seconds.") from exc
+
+        dup_subset = ['freq_code', 'start_state', 'end_state', 'time_0', 'time_1']
+        self.master_state_table = self.master_state_table.drop_duplicates(subset=dup_subset, keep='first').copy()
+        self.master_state_table.sort_values(by=['freq_code', 'time_0', 'time_1'], inplace=True)
+
+        self.master_state_table['presence'] = (
+            self.master_state_table.groupby('freq_code').cumcount().astype('int32')
+        )
+        self.master_state_table['first_obs'] = 0
+
+# Example usage
+# tte = time_to_event(receiver_to_state, project)
+# tte.data_prep(project)
+# summary_stats = tte.summary()  # prints the summary and writes the output CSVs
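Taken together, `_normalize_master`'s dedupe, sort, and presence recount amount to the following, shown on toy rows (illustrative values only):

```python
import pandas as pd

t = pd.DataFrame({
    'freq_code':   ['A', 'A', 'A', 'B'],
    'start_state': [1, 1, 2, 1],
    'end_state':   [2, 2, 3, 2],
    'time_0':      [10, 10, 40, 5],
    'time_1':      [40, 40, 90, 30],
})
t = t.drop_duplicates(subset=['freq_code', 'start_state', 'end_state',
                              'time_0', 'time_1'], keep='first')
t = t.sort_values(['freq_code', 'time_0', 'time_1'])
t['presence'] = t.groupby('freq_code').cumcount()
print(t)   # fish A keeps two rows (presence 0, 1); fish B keeps one (presence 0)
```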