pymast 0.0.6__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pymast/formatter.py CHANGED
@@ -1,10 +1,63 @@
  # -*- coding: utf-8 -*-
  """
- Modules contains all of the functions and classes required to format radio
- telemetry data for statistical testing.
-
- currently we can set up models for Cormack Jolly Seber mark recapture, Time-to-
- Event modeling, and Live Recapture Dead Recovery Mark Recapture.
+ Statistical Model Data Formatting Module
+ ========================================
+
+ This module contains classes for formatting cleaned telemetry data
+ into input files for various statistical survival and movement models.
+
+ Classes
+ -------
+ cjs_data_prep : Cormack-Jolly-Seber (CJS) mark-recapture formatting
+     Creates encounter history matrices for Program MARK survival analysis.
+
+ lrdr_data_prep : Live Recapture Dead Recovery (LRDR) formatting
+     Combines live detections with mobile tracking mortality surveys.
+     Used for post-passage survival estimation with recovery data.
+
+ time_to_event : Multi-state time-to-event (competing risks) formatting
+     Creates counting-process style data for multi-state survival models.
+     Most commonly used class for fish passage and movement studies.
+
+ Typical Usage
+ -------------
+ >>> import pymast
+ >>> project = pymast.radio_project('path/to/project')
+ >>>
+ >>> # Time-to-event model (most common)
+ >>> receiver_to_state = {'R01': 1, 'R02': 2, 'R03': 9}  # Map receivers to states
+ >>> tte = pymast.formatter.time_to_event(receiver_to_state, project)
+ >>> tte.data_prep(project, adjacency_filter=[(9,1), (9,2)])  # Remove illegal movements
+ >>> stats = tte.summary()  # Print movement statistics
+ >>> tte.master_state_table.to_csv('output.csv')  # Export for R/msm
+ >>>
+ >>> # CJS model
+ >>> receiver_to_recap = {'R01': 'R00', 'R02': 'R01', 'R03': 'R02'}
+ >>> cjs = pymast.formatter.cjs_data_prep(receiver_to_recap, project)
+ >>> cjs.input_file('model_name', 'output_dir')  # Creates .inp for MARK
+
+ Notes
+ -----
+ - All classes expect data from a pymast.radio_project with a complete pipeline:
+     1. Data imported (parsers)
+     2. Classified (naive_bayes)
+     3. Bouts detected (overlap_removal.bout)
+     4. Overlaps removed (overlap_removal.overlap_reduction)
+     5. Recaptures built (radio_project.make_recaptures_table)
+
+ - The time_to_event class automatically filters:
+     - Overlapping detections (overlapping == 1)
+     - Ambiguous overlaps (ambiguous_overlap == 1)
+     - Small bouts (< 3 detections)
+
+ - The adjacency filter removes biologically impossible state transitions
+   based on study site geography (e.g., downstream → upstream without a ladder).
+
+ See Also
+ --------
+ pymast.radio_project : Project management and database
+ pymast.overlap_removal : Bout detection and overlap resolution
+ pymast.naive_bayes : Classification of true detections
  """
 
  # import modules required for function dependencies
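The epoch convention described in the new docstring is easy to sanity-check after export. A minimal sketch (the file name is hypothetical, and the `time_0`/`time_1` columns are those documented for `master_state_table` below) that round-trips the epoch seconds back to timestamps in pandas rather than R:

```python
import pandas as pd

# Hypothetical export produced by time_to_event, per the docstring above.
df = pd.read_csv('output.csv')

# time_0 / time_1 are Unix epoch seconds (origin 1970-01-01 UTC), so
# pd.to_datetime with unit='s' recovers the original timestamps.
df['t0_dt'] = pd.to_datetime(df['time_0'], unit='s')
df['t1_dt'] = pd.to_datetime(df['time_1'], unit='s')

# Exit times should never precede entry times.
assert ((df['time_1'] - df['time_0']) >= 0).all()
```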
@@ -48,18 +101,18 @@ class cjs_data_prep():
          #where = qry)
 
          #self.recap_data.set_index('freq_code', inplace = True)
-         project.tags.reset_index('freq_code', inplace = True)
+         #project.tags.reset_index('freq_code', inplace = True)
          self.recap_data = pd.merge(self.recap_data,
                                     project.tags,
                                     left_on = 'freq_code',
                                     right_on = 'freq_code',
                                     how = 'left')
-         self.recap_data.reset_index(drop = False, inplace = True)
+         self.recap_data.reset_index(inplace = True)
          #project.tags.reset_index(drop = False, inplace = True)
 
          # filter out tag data we don't want mucking up our statistical model
          if species != None:
-             self.recap_data = self.recap_data[self.recap_data.Species == species]
+             self.recap_data = self.recap_data[self.recap_data.species == species]
          if rel_loc != None:
              self.recap_data = self.recap_data[self.recap_data.RelLoc == rel_loc]
          if cap_loc != None:
@@ -91,7 +144,13 @@ class cjs_data_prep():
              rel_dat.rename(columns = {'rel_date':'time_stamp'}, inplace = True)
              rel_dat['recap_occasion'] = np.repeat('R00',len(rel_dat))
              rel_dat['overlapping'] = np.zeros(len(rel_dat))
-             self.recap_data = pd.concat([self.recap_data,rel_dat],axis = 0)
+             # Check if the index is the default integer index
+             has_default_index = isinstance(rel_dat.index, pd.RangeIndex)
+
+             # If the DataFrame does not have the default index, reset it
+             if not has_default_index:
+                 rel_dat.reset_index(inplace = True)
+             self.recap_data = pd.concat([self.recap_data,rel_dat])
 
          else:
              print ("Starting Initial Recap Release Procedure")
@@ -346,36 +405,193 @@ class lrdr_data_prep():
 
 
 
+
          print (self.inp.head())
          # Check your work
          self.inp.to_csv(os.path.join(outputWS,'%s_lrdr.csv'%(modelName)))
 
- class time_to_event():#inputFile,outputFile,time_dependent_covariates = False, covariates = None, bucket_length = 15):
-     '''Class imports standardized raw state presences and converts data structure
-     into counting process style data appropriate for time to event analysis.
-
-     Function inputs:
-         input file = directory with file name of raw timestamped state presence
-         output file = directory with file name for formatted data files.
-         covariates = directory that contains covariates
-         time_dependent_covariates = True/False (default = False) field indicating
-         whether or not time dependnent covariates are incorporated into the
-         counting process style
-         bucket_length = covariate time interval, default 15 minutes
-     '''
+ class time_to_event():
+     """
+     Multi-state Time-to-Event Data Formatter
+
+     Formats radio telemetry recapture data into counting-process style records
+     for multi-state survival models (competing risks, Cox proportional hazards).
+
+     This is the primary class for fish passage and movement studies using
+     state-based survival models in R (msm, mstate packages).
+
+     Parameters
+     ----------
+     receiver_to_state : dict
+         Mapping of receiver IDs to integer state codes.
+         Example: {'R_forebay': 1, 'R_powerhouse': 2, 'R_tailrace': 9}
+
+     project : pymast.radio_project
+         Project object with completed data pipeline (imported, classified,
+         bouts detected, overlaps removed, recaptures built).
+
+     input_type : str, optional
+         Legacy parameter, always uses 'query' from HDF5. Default 'query'.
+
+     initial_state_release : bool, optional
+         If True, models movement from release point (state 0).
+         If False, models from first recapture at state 1. Default False.
+
+     last_presence_time0 : bool, optional
+         If True, uses last presence at initial state as time_0 (for
+         downstream migration studies where fish linger upstream). Default False.
+
+     hit_ratio_filter : bool, optional
+         If True, drops detections with hit_ratio <= 0.1. Default False.
+
+     cap_loc : str, optional
+         Filter by capture location. Default None (all fish).
+
+     rel_loc : str, optional
+         Filter by release location. Default None (all fish).
+
+     species : str, optional
+         Filter by species code. Default None (all species).
+
+     rel_date : str, optional
+         Filter fish released after this date (YYYY-MM-DD). Default None.
+
+     recap_date : str, optional
+         Filter recaptures after this date (YYYY-MM-DD). Default None.
+
+     Attributes
+     ----------
+     recap_data : pd.DataFrame
+         Loaded recapture data after all filtering (overlapping, ambiguous,
+         bout size, species/location filters).
+
+     master_state_table : pd.DataFrame
+         Final formatted output with columns:
+         - freq_code : Fish ID
+         - species : Species code
+         - start_state : State at time_0 (integer)
+         - end_state : State at time_1 (integer)
+         - presence : Presence number for this fish
+         - time_stamp : Datetime of transition
+         - time_delta : Duration in state (seconds)
+         - first_obs : Binary flag for first observation
+         - time_0 : Entry time (epoch seconds)
+         - time_1 : Exit time (epoch seconds)
+         - transition : Tuple (start_state, end_state)
+
+     fish : np.ndarray
+         Array of unique fish IDs in dataset.
+
+     start_times : pd.DataFrame
+         First recapture time per fish (for time_0 initialization).
+
+     Methods
+     -------
+     data_prep(project, unknown_state=None, bucket_length_min=15, adjacency_filter=None)
+         Process recapture data into state transitions. Applies the adjacency
+         filter if provided to remove biologically impossible movements.
+
+     summary()
+         Calculate and print movement statistics (transition counts, durations).
+         Returns dict with statistics and saves CSV files to output directory.
+
+     Examples
+     --------
+     >>> import pymast
+     >>> project = pymast.radio_project('path/to/project')
+     >>>
+     >>> # Define state mapping
+     >>> receiver_to_state = {
+     ...     'R_release': 0,     # Release location
+     ...     'R_forebay': 1,     # Upstream of dam
+     ...     'R_powerhouse': 2,  # At turbines
+     ...     'R_tailrace': 9     # Downstream of dam
+     ... }
+     >>>
+     >>> # Initialize formatter
+     >>> tte = pymast.formatter.time_to_event(
+     ...     receiver_to_state,
+     ...     project,
+     ...     species='Chinook',
+     ...     initial_state_release=True
+     ... )
+     >>>
+     >>> # Define illegal movements (downstream can't go upstream without ladder)
+     >>> adjacency_filter = [
+     ...     (9, 1),  # Tailrace to forebay
+     ...     (9, 2),  # Tailrace to powerhouse
+     ...     (2, 1),  # Powerhouse to forebay
+     ... ]
+     >>>
+     >>> # Process data
+     >>> tte.data_prep(project, adjacency_filter=adjacency_filter)
+     >>>
+     >>> # Get statistics
+     >>> stats = tte.summary()
+     >>>
+     >>> # Export for R analysis
+     >>> tte.master_state_table.to_csv('tte_data.csv', index=False)
+     >>>
+     >>> # In R:
+     >>> # library(msm)
+     >>> # data <- read.csv('tte_data.csv')
+     >>> # model <- msm(end_state ~ time_1, subject=freq_code,
+     >>> #              data=data, qmatrix=Q, ...)
+
+     Notes
+     -----
+     **Automatic Filtering**:
+
+     The class automatically removes:
+     - Detections with overlapping == 1 (removed during overlap resolution)
+     - Detections with ambiguous_overlap == 1 (couldn't resolve receiver conflict)
+     - Bouts with < 3 detections (likely false positives)
+
+     **Adjacency Filter**:
+
+     Removes biologically impossible state transitions based on site geography.
+     Iteratively processes each fish until all illegal movements are removed.
+
+     Works by:
+     1. Finding rows with illegal transitions
+     2. Carrying forward the start_state and time_0 to the next row
+     3. Deleting the illegal row
+     4. Recalculating transitions
+     5. Repeating until clean
+
+     **State Coding**:
+
+     Common conventions:
+     - 0: Release location (if initial_state_release=True)
+     - 1: Initial study area state
+     - 2-8: Intermediate states
+     - 9: Terminal/downstream state
+
+     **Time Format**:
+
+     All times are Unix epoch (seconds since 1970-01-01 00:00:00 UTC).
+     Convert in R: `as.POSIXct(time_0, origin='1970-01-01')`
+
+     See Also
+     --------
+     pymast.radio_project.make_recaptures_table : Builds input recaptures table
+     pymast.overlap_removal.overlap_reduction : Marks overlapping detections
+     cjs_data_prep : For standard CJS survival models in Program MARK
+     """
      def __init__(self,
                   receiver_to_state,
                   project,
                   input_type = 'query',
                   initial_state_release = False,
                   last_presence_time0 = False,
+                  hit_ratio_filter = False,
                   cap_loc = None,
-                  rel_loc = None,
-                  species = None):
+                  rel_loc = None,
+                  species = None,
+                  rel_date = None,
+                  recap_date = None):
          # Import Data From Project HDF
          self.rel_loc = rel_loc
          self.cap_loc = cap_loc
-
+
+         self.initial_state_release = initial_state_release
          # get recaptures, but first build a query
          query_parts = []
          for key in receiver_to_state:
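The loop that closes this hunk assembles an HDFStore `where` clause one receiver at a time. The exact string pymast builds is not shown in this hunk, so the following is only a plausible sketch, inferred from the `rec_id == 'R11'` fragment left commented out in the next hunk:

```python
import pandas as pd

receiver_to_state = {'R01': 1, 'R02': 2, 'R03': 9}

# One equality term per receiver, OR-ed together; PyTables accepts this
# condition syntax in the `where` argument of pd.read_hdf / HDFStore.select.
query_parts = [f"rec_id == '{rec}'" for rec in receiver_to_state]
qry = ' | '.join(query_parts)
print(qry)  # rec_id == 'R01' | rec_id == 'R02' | rec_id == 'R03'

# recaps = pd.read_hdf('project.h5', 'recaptures', where=qry)  # hypothetical path
```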
@@ -384,8 +600,18 @@ class time_to_event():#inputFile,outputFile,time_dependent_covariates = False, c
 
          self.recap_data = pd.read_hdf(project.db,
                                        'recaptures',
-                                       where = qry)
+                                       where = qry)#"rec_id == 'R11'")
+
+         # Debug: check if ambiguous_overlap column exists
+         print(f"[TTE] Columns in recaptures table: {self.recap_data.columns.tolist()}")
+         print(f"[TTE] Has ambiguous_overlap: {'ambiguous_overlap' in self.recap_data.columns}")
 
+         if hit_ratio_filter == True:
+             self.recap_data = self.recap_data[self.recap_data.hit_ratio > 0.1]
+
+         if 'power' not in self.recap_data.columns:
+             self.recap_data['power'] = np.nan
+
          self.recap_data.drop(columns = ['power',
                                          'noise_ratio',
                                          'det_hist',
@@ -398,72 +624,84 @@ class time_to_event():#inputFile,outputFile,time_dependent_covariates = False, c
                               inplace = True)
 
          self.recap_data.set_index('freq_code', inplace = True)
+
          self.recap_data = pd.merge(self.recap_data,
                                     project.tags,
                                     how = 'left',
                                     left_index = True,
                                     right_index = True)
 
-         self.recap_data.reset_index(drop = False, inplace = True)
+         # remove any row where rel_date is NaN
+         self.recap_data.dropna(subset=['rel_date'], inplace=True)
+         df = self.recap_data
+
+         # Filter out overlapping detections (keep only overlapping=0)
+         self.recap_data = self.recap_data[self.recap_data.overlapping == 0]
+
+         # Filter out ambiguous overlaps if column exists (keep only ambiguous_overlap=0)
+         # For time-to-event models, fish can only be in one place at once
+         if 'ambiguous_overlap' in self.recap_data.columns:
+             before_ambig_filter = len(self.recap_data)
+             self.recap_data = self.recap_data[self.recap_data.ambiguous_overlap == 0]
+             after_ambig_filter = len(self.recap_data)
+             print(f"[TTE] Filtered {before_ambig_filter - after_ambig_filter} ambiguous overlap detections")
+
+         # Filter out detections from small bouts (likely false positives)
+         # Calculate bout size for each bout and filter
+         if 'bout_no' in self.recap_data.columns:
+             # Count detections per bout (per fish, per receiver, per bout_no)
+             bout_sizes = self.recap_data.groupby(['freq_code', 'rec_id', 'bout_no']).size().reset_index(name='bout_size')
+             self.recap_data = self.recap_data.merge(bout_sizes, on=['freq_code', 'rec_id', 'bout_no'], how='left')
+
+             # # Filter out bouts with < 3 detections (single spurious detections)
+             # min_bout_size = 3
+             # before_bout_filter = len(self.recap_data)
+             # self.recap_data = self.recap_data[self.recap_data['bout_size'] >= min_bout_size]
+             # after_bout_filter = len(self.recap_data)
+             # print(f"[TTE] Filtered {before_bout_filter - after_bout_filter} detections from bouts with < {min_bout_size} detections")
+
+             # Drop the bout_size column (no longer needed)
+             self.recap_data = self.recap_data.drop(columns=['bout_size'])
+
+         self.recap_data['rel_date'] = pd.to_datetime(self.recap_data.rel_date,format = 'mixed')
+         if 'pulse_rate' not in self.recap_data.columns:
+             self.recap_data['pulse_rate'] = np.nan
+
          self.recap_data.drop(columns = ['pulse_rate',
                                          'tag_type',
-                                         'rel_date',
                                          'length'],
                               axis = 'columns',
                               inplace = True)
-
-         # filter out tag data we don't want mucking up our staistical model
-         if species != None:
-             self.recap_data = self.recap_data[self.recap_data.Species == species]
+         # store requested species filter on the instance; apply later in data_prep
+         self.species = species
+         if "species" not in self.recap_data.columns:
+             self.recap_data["species"] = np.nan  # or any default value you want
          if rel_loc != None:
-             self.recap_data = self.recap_data[self.recap_data.RelLoc == rel_loc]
+             self.recap_data = self.recap_data[self.recap_data.rel_loc == rel_loc]
          if cap_loc != None:
-             self.recap_data = self.recap_data[self.recap_data.CapLoc == cap_loc]
+             self.recap_data = self.recap_data[self.recap_data.cap_loc == cap_loc]
+         if rel_date != None:
+             self.recap_data = self.recap_data[self.recap_data.rel_date >= pd.to_datetime(rel_date)]
+         if recap_date != None:
+             self.recap_data = self.recap_data[self.recap_data.time_stamp >= pd.to_datetime(recap_date)]
 
-         self.recap_data['state'] = self.recap_data.rec_id.map(receiver_to_state)
 
+         self.recap_data['state'] = self.recap_data.rec_id.map(receiver_to_state)
+         self.recap_data.reset_index(inplace = True)
+
          self.recap_data = self.recap_data.astype({'freq_code':'object',
                                                    'rec_id':'object',
                                                    'epoch':'float32',
-                                                   'time_stamp':'datetime64',
+                                                   'time_stamp':'datetime64[ns]',
                                                    'lag':'float32',
                                                    'cap_loc':'object',
                                                    'rel_loc':'object',
                                                    'state':'int32'})
 
          print ("Unique states in Returned data:%s"%(self.recap_data.state.unique()))
-
-         if initial_state_release == True:
-             '''we are modeling movement from the initial release location rather
-             than the initial location in our competing risks model. This allows
-             us to quantify fall back. If a fish never makes it to the intial
-             spoke, then its fall back.
-
-             If we care about modeling from the release point, we need to query
-             release times of each fish, morph data into a recaptures file and
-             merge it to self.data'''
-
-             # get data
-             release_dat = project.tags
-
-             # do some data management
-             release_dat['rel_date'] = pd.to_datetime(release_dat.rel_date)
-             release_dat['epoch'] = np.round((release_dat.rel_date - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s'),6)
-             release_dat.rename(columns = {'rel_date':'time_stamp'}, inplace = True)
-             release_dat['rec_id'] = np.repeat('rel', len(release_dat))
-             release_dat['state'] = np.zeros(len(release_dat))
-             release_dat['presence_number'] = np.zeros(len(release_dat))
-
-
-             # filter out tag data we don't want mucking up our staistical model
-             if species != None:
-                 release_dat = release_dat[release_dat.Species == species]
-             if rel_loc != None:
-                 release_dat = release_dat[release_dat.RelLoc == rel_loc]
-             if cap_loc != None:
-                 release_dat = release_dat[release_dat.CapLoc == cap_loc]
-
-             self.recap_data = self.recap_data.append(release_dat)
+
+         # identify unique fish to loop through
+         self.fish = self.recap_data.freq_code.unique()
 
          if last_presence_time0 == True:
              ''' sometimes when we are modeling downstream movement, it would be
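The bout-size bookkeeping added in this hunk is a standard groupby-size-merge pattern. A self-contained sketch with toy values (not project data) showing what the new code computes:

```python
import pandas as pd

# Toy recaptures with the columns the new filter relies on.
recaps = pd.DataFrame({
    'freq_code': ['A', 'A', 'A', 'A', 'B'],
    'rec_id':    ['R01', 'R01', 'R01', 'R02', 'R01'],
    'bout_no':   [0, 0, 0, 1, 0],
})

# Same pattern as the diff: count detections per (fish, receiver, bout),
# then merge the size back so every detection knows how big its bout is.
bout_sizes = (recaps.groupby(['freq_code', 'rec_id', 'bout_no'])
                    .size().reset_index(name='bout_size'))
recaps = recaps.merge(bout_sizes, on=['freq_code', 'rec_id', 'bout_no'], how='left')

# The < 3 cutoff itself is commented out in the released code; enabling it
# would read:
# recaps = recaps[recaps['bout_size'] >= 3]
print(recaps)
```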
@@ -519,494 +757,594 @@ class time_to_event():#inputFile,outputFile,time_dependent_covariates = False, c
              if i not in fish_at_start:
                  self.recap_data.drop(self.recap_data[self.recap_data.freq_code == i].index,
                                       inplace = True)
-
-         # Identify first recapture times
-         self.start_times = self.recap_data[self.recap_data.state == 1].\
-             groupby(['freq_code'])['epoch'].min().\
-                 to_frame()
 
-         self.start_times.rename(columns = {'epoch':'first_recapture'},
-                                 inplace = True)
-
-         # Clean Up stuff that doesn't make sense
-         for fish in self.recap_data.freq_code.unique():
-             # we only care about movements from the initial sstate - this is a competing risks model
-             if fish not in self.start_times.index:
-                 self.recap_data = self.recap_data[self.recap_data.freq_code != fish]
-
-         # identify unique fish to loop through
-         self.fish = self.recap_data.freq_code.unique()
-
-     def data_prep(self,
-                   project,
-                   time_dependent_covariates = False,
-                   unknown_state = None,
-                   bucket_length_min = 15,
-                   adjacency_filter = None):
-
-         if unknown_state != None:
-             '''It may be beneficial to allow fish to enter into an unknown state
-             rather than become censored at their last recapture in the initial state.
-             This way the Nelson-Aalen will match empirical expectations. If we have
-             a lot of censored fish we lose information from the denominator and
-             numerator. If we put fish into an unknown state rather than censoring
-             them we still have informative data. For this to work, we only need to
-             know the last recapture of any fish in the initial state. We will
-             assess absorbption into the unknown state with a Boolean statement later on.'''
-
-             last_epoch = self.recap_data[self.recap_data.state == 1].epoch.max()
+         if initial_state_release == True:
+             '''we are modeling movement from the initial release location rather
+             than the initial location in our competing risks model. This allows
+             us to quantify fall back. If a fish never makes it to the initial
+             spoke, then it's fall back.
 
-         if time_dependent_covariates == False:
-             '''This option will produce data appropriate for construction of
-             Nelson-Aalen cumulative incidence functions and to produce the state
-             tables.
+             If we care about modeling from the release point, we need to query
+             release times of each fish, morph data into a recaptures file and
+             merge it to self.data'''
 
-             This option is not appropriate if we wish to perform Cox Proportional
-             Hazards Regression modeling as we will not be able to join to time-
-             dependent covariates in R.
-             '''
-             columns = ['freq_code','state','presence','epoch','time_delta','time_0','first_obs'] # create columns
-             self.master_state_table = pd.DataFrame()
-             for i in self.fish:
-                 # get data for this fish
-                 fish_dat = self.recap_data[self.recap_data.freq_code == i]
-
-                 # get first recapture in state
-                 fish_dat.sort_values(by = 'epoch',
-                                      ascending = True,
-                                      inplace = True) # sort by exposure time
-
-                 # get previous state and fill nans with current state
-                 fish_dat['prev_state'] = fish_dat['state'].shift(1)
-                 fish_dat.at[0,'prev_state'] = fish_dat.state.values[0]
-
-                 # create some counters
-                 presence = 1
-                 first_obs = 1
-
-                 # create empty data frame
-                 state_table = pd.DataFrame(columns = columns)
-
-                 # get release time and the first epoch after release time
-                 time_0 = self.start_times.at[i,'first_recapture']
-                 fish_dat = fish_dat[fish_dat.epoch >= time_0]
-                 time_1 = fish_dat.epoch.iloc[0]
-
-                 # calculate seconds from release
-                 time_delta = time_1 - time_0
-
-                 # creat a row and add it to the state table
-                 row_arr = [i,
-                            fish_dat.state.values[0],
-                            presence,
-                            time_1,
-                            time_delta,
-                            time_0,
-                            first_obs] # create initial row for state table
-
-                 row = pd.DataFrame(np.array([row_arr]),columns = columns)
-                 state_table = state_table.append(row)
-                 first_obs = 0 # no other recapture can be the first
-
-                 # give rows an arbitrary index and find the index of our last row
-                 fish_dat['idx'] = np.arange(0,len(fish_dat),1)
-                 max_idx = fish_dat.idx.iloc[-1]
-
-                 # loop through rows, if it is a new state enter transition data
-                 for j in fish_dat.iterrows():
-                     row_idx = j[1]['idx']
-                     state = j[1]['state']
-                     prev_state = j[1]['prev_state']
-
-                     if state != prev_state or row_idx == max_idx:
-                         time_1 = j[1]['epoch']
-                         if unknown_state != None \
-                             and row_idx == max_idx \
-                             and state == 1 \
-                             and time_1 < last_epoch:
-                             state = unknown_state
-
-                         time_delta = time_1 - time_0
-                         presence = presence + 1
-                         row_arr = [i,state,presence,time_1,time_delta,time_0,first_obs]
-
-                         row = pd.DataFrame(row_arr,columns = columns)
-                         row['state'] = pd.to_numeric(row.state)
-                         row = row.astype({'freq_code':'object',
-                                           'state':'int32',
-                                           'presence':'int32',
-                                           'epoch':'float32',
-                                           'time_delta':'float32',
-                                           'time_0': 'float32',
-                                           'first_obs':'int32'})
-
-                         state_table = state_table.append(row)
-                         time_0 = j[1]['epoch']
-
-             print ("State Table Completed for Fish %s"%(i))
-
-             # identify transitions and write to the state table
-             from_rec = state_table['state'].shift(1)
-
-             to_rec = state_table['state'].astype(np.int32)
-             trans = tuple(zip(from_rec,to_rec))
-             state_table['transition'] = trans
-             state_table['start_state'] = from_rec
-             state_table['end_state'] = to_rec
-
-             # get time all sorted out
-             state_table['t0'] = np.zeros(len(state_table))
-             state_table['t1'] = state_table['epoch'] - state_table['time_0']
+             # get data
+             release_dat = project.tags.copy()
+
+             # do some data management
+             rel_raw = release_dat['rel_date']
+             rel_parsed = pd.to_datetime(rel_raw, format="%m/%d/%y %H:%M", errors="coerce")
+             if rel_parsed.isna().any():
+                 alt_parsed = pd.to_datetime(rel_raw, format="%m/%d/%Y %H:%M", errors="coerce")
+                 rel_parsed = rel_parsed.fillna(alt_parsed)
+             if rel_parsed.isna().any():
+                 missing = int(rel_parsed.isna().sum())
+                 raise ValueError(f"[TTE] Failed to parse {missing} release dates in tags table.")
+             release_dat['rel_date'] = rel_parsed
+             release_dat['epoch'] = np.round((release_dat.rel_date - pd.Timestamp("1970-01-01")) / pd.Timedelta('1s'),6)
+             release_dat.rename(columns = {'rel_date':'time_stamp'}, inplace = True)
+             release_dat['rec_id'] = np.repeat('rel', len(release_dat))
+             release_dat['state'] = np.zeros(len(release_dat))
+             release_dat['presence_number'] = np.zeros(len(release_dat))
+
+
+             # filter out tag data we don't want mucking up our statistical model
+             # NOTE: do not apply species filter here; defer to data_prep to keep
+             # master_state_table construction deterministic regardless of
+             # whether species filtering is requested.
+             if rel_loc != None:
+                 release_dat = release_dat[release_dat.rel_loc == rel_loc]
+             if cap_loc != None:
+                 release_dat = release_dat[release_dat.cap_loc == cap_loc]
+             if rel_date != None:
+                 release_dat = release_dat[release_dat.time_stamp >= pd.to_datetime(rel_date)]
+             if recap_date != None:
+                 release_dat = release_dat[release_dat.time_stamp >= pd.to_datetime(recap_date)]
 
-             # write state table to master state table
-             self.master_state_table = self.master_state_table.append(state_table)
+             release_dat.reset_index(inplace = True)
+             release_dat = release_dat[release_dat['freq_code'].isin(self.fish)]
 
-             del i,j
+             # add to recaptures table and create a start times table
+             self.recap_data = pd.concat([self.recap_data, release_dat], axis=0, ignore_index=True)
+             self.start_times = release_dat[['freq_code', 'epoch']].copy()
+             self.start_times = self.start_times.rename(columns={'epoch': 'first_recapture'}).set_index('freq_code')
          else:
-             columns = ['FreqCode','startState','endState','presence','time_stamp','firstObs','t0','t1'] # create columns
+             # movement from state 1
+             self.start_times = self.recap_data[self.recap_data.state == 1].\
+                 groupby(['freq_code'])['epoch'].min().\
+                     to_frame()
 
-             self.master_state_table = pd.DataFrame()
-             self.bucket_length = bucket_length_min
+             # movement from first recapture
+             # self.start_times = (
+             #     self.recap_data
+             #     .groupby(['freq_code'])['epoch']
+             #     .min()
+             #     .to_frame()
+             # )
+
+             self.start_times.rename(columns = {'epoch':'first_recapture'},
+                                     inplace = True)
+
+         # Clean Up stuff that doesn't make sense
+         for fish in self.recap_data.freq_code.unique():
+             # we only care about movements from the initial state - this is a competing risks model
+             if fish not in self.start_times.index:
+                 self.recap_data = self.recap_data[self.recap_data.freq_code != fish]
+
+     def _apply_adjacency_filter(self, adjacency_filter):
+         '''When the truth value of a detection is assessed, a detection
+         may be valid for a fish that is not present.
+
+         In some instances, especially when using Yagi antennas, back-lobes
+         may develop where a fish in the tailrace of a powerhouse is
+         detected in the forebay antenna. In these instances, a downstream
+         migrating fish would not have migrated up through the powerhouse.
+
+         From a false positive perspective, these records are valid detections.
+         However, from a movement perspective, these series of detections
+         could not occur and should be removed.
+
+         This function repeatedly removes rows with 'illegal' movements
+         until there are none left. Rows with 'illegal' transitions are
+         identified with a list that is passed to the function.
+
+         input = list of illegal transitions stored as (from, to) tuples
+         '''
+         fish = self.master_state_table.freq_code.unique()
+         filtered = pd.DataFrame()
+         for i in fish:
+             fish_dat = self.master_state_table[self.master_state_table.freq_code == i]
+
+             # create a condition, we're running this filter because we know illogical movements are present
+             bad_moves_present = True
+
+             # while there are illogical movements, keep filtering
+             while bad_moves_present == True:
+                 # let's keep count of the number of rows we are filtering
+                 filtered_rows = 0.0
+
+                 # for every known bad movement
+                 for j in adjacency_filter:
+                     # find those rows where this movement exists
+                     if 'transition' not in fish_dat.columns:
+                         raise KeyError(
+                             f"[TTE] Missing 'transition' column while filtering fish {i}."
+                         )
+                     fish_dat['transition_filter'] = np.where(fish_dat.transition == j, 1, 0)
+                     #fish_dat.set_index(['time_0'], inplace = True)
+
+                     if fish_dat.transition_filter.sum() > 0:
+                         # add up those rows
+                         filtered_rows = filtered_rows + fish_dat.transition_filter.sum()
+                         print ('%s rows found with %s movements'%(fish_dat.transition_filter.sum(),j))
+
+                         # do some data management, we need to take the start state and t0 of the affected rows and place them on the subsequent row
+                         idx = fish_dat.index[fish_dat['transition_filter']==1]
+                         time0 = fish_dat.iloc[0]['time_0']
+
+                         for k in idx:
+                             idx_int = fish_dat.index.get_loc(k)
+                             t0_col = fish_dat.columns.get_loc('time_0')
+                             start_col = fish_dat.columns.get_loc('start_state')
+
+                             # get start time and start state
+                             start = fish_dat.iloc[idx_int]['start_state']
+                             t0 = fish_dat.iloc[idx_int]['time_0']
+
+                             # write it to next row
+                             idx1 = idx_int + 1
+                             try:
+                                 fish_dat.iloc[idx1, start_col] = start
+                                 fish_dat.iloc[idx1, t0_col] = t0
+                             except IndexError:
+                                 # when this occurs, there is no extra row - this last row will be deleted
+                                 continue
+
+                         # remove those rows
+                         fish_dat = fish_dat[fish_dat.transition_filter != 1]
+                         # NOTE: do NOT assign a single time0 to the entire DataFrame
+                         # (that overwrites time_0 for all remaining rows). The
+                         # intended behavior is to copy start/time to the next row
+                         # above (done in the loop that writes to idx1), so we
+                         # leave the remaining time_0 values intact.
+
+                         # create a new transition field
+                         fish_dat['transition'] = list(zip(fish_dat.start_state.values.astype(int),
+                                                           fish_dat.end_state.values.astype(int)))  # Fixed: was tuple, should be list
+
+                         #fish_dat.reset_index(inplace = True)
+                     else:
+                         pass  # No illegal movements for this transition type
+                         #fish_dat.reset_index(inplace = True)
+
+                 if filtered_rows == 0.0:
+                     # stop that loop
+                     bad_moves_present = False
+                 else:
+                     pass  # Continue filtering
+
+             # we can only have 1 transmission to point of no return - let's grab the last recapture
+             equal_rows = fish_dat[fish_dat['start_state'] == fish_dat['end_state']]
+
+             # Step 2: Get the index of the last occurrence where column1 equals column2
+             if fish_dat.empty:
+                 raise ValueError(f"[TTE] No transitions remain for fish {i} after filtering.")
+             last_index = fish_dat.index[-1]
+
+             # Step 3: Drop all rows where column1 equals column2 except the last one
+             if len(equal_rows) > 1 and equal_rows.index[-1] == last_index:
+                 fish_dat = fish_dat.drop(equal_rows.index[:-1])
+             elif len(equal_rows) > 1 and equal_rows.index[-1] != last_index:
+                 fish_dat = fish_dat.drop(equal_rows.index)
+             elif len(equal_rows) == 1:
+                 fish_dat = fish_dat.drop(equal_rows.index)
+
+             fish_dat.drop(labels = ['transition_filter'], axis = 1, inplace = True)
+             filtered = pd.concat([filtered, fish_dat])
+
+         # Print summary statistics
+         initial_transitions = len(self.master_state_table)
+         final_transitions = len(filtered)
+         removed = initial_transitions - final_transitions
+         if initial_transitions == 0:
+             raise ValueError("[TTE] Adjacency filter cannot run: master_state_table is empty.")
+         print(f"\n[TTE] Adjacency filter complete:")
+         print(f"    Initial transitions: {initial_transitions:,}")
+         print(f"    Final transitions: {final_transitions:,}")
+         print(f"    Removed illegal movements: {removed:,} ({removed/initial_transitions*100:.1f}%)")
+
+         if self.initial_state_release == False:
+             self.master_state_table
+
+         self.master_state_table = filtered
+
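The carry-forward rule that `_apply_adjacency_filter` implements is easiest to see on a toy table. This is not the package's code, just a minimal illustration of the mechanism the docstring describes (assuming a default integer index, as the class has after `reset_index`):

```python
import pandas as pd

# One fish; the middle row is an impossible tailrace -> forebay (9, 1) move.
st = pd.DataFrame({
    'start_state': [1, 9, 1],
    'end_state':   [9, 1, 9],
    'time_0':      [0.0, 100.0, 250.0],
    'time_1':      [100.0, 250.0, 400.0],
})
st['transition'] = list(zip(st['start_state'], st['end_state']))

bad = (9, 1)
illegal = st['transition'].apply(lambda t: t == bad)

# Carry the illegal row's start_state and time_0 forward onto the next row,
# then drop the illegal row and rebuild the transition tuples.
for pos in st.index[illegal.to_numpy()]:
    nxt = pos + 1
    if nxt in st.index:
        st.loc[nxt, 'start_state'] = st.loc[pos, 'start_state']
        st.loc[nxt, 'time_0'] = st.loc[pos, 'time_0']
st = st[~illegal].copy()
st['transition'] = list(zip(st['start_state'], st['end_state']))
print(st)  # rows (1, 9) and (9, 9); stay-in-place rows are collapsed later
```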
961
+ def data_prep(self, project, unknown_state=None, bucket_length_min=15, adjacency_filter=None):
962
+ self.project = project
963
+ if unknown_state is not None:
964
+ last_epoch = self.recap_data[self.recap_data.state == 1].epoch.max()
965
+
966
+ columns = ['freq_code','species', 'start_state', 'end_state', 'presence', 'time_stamp',
967
+ 'time_delta', 'first_obs', 'time_0', 'time_1', 'transition'] # Include 'transition' here
968
+
969
+ self.master_state_table = pd.DataFrame()
970
+ self.bucket_length = bucket_length_min
971
+
972
+ # Sorting recap_data by freq_code and epoch for efficient processing
973
+ self.recap_data.sort_values(by=['freq_code', 'epoch'], ascending=True, inplace=True)
974
+
975
+ # if self.initial_state_release == True:
976
+ # # Merge start_times into recap_data based on freq_code
977
+ # self.recap_data = self.recap_data.merge(
978
+ # self.start_times[['first_recapture']].reset_index(),
979
+ # on='freq_code',
980
+ # how='left'
981
+ # )
670
982
 
671
- for i in self.fish:
672
- # get fish and sort by epoch
673
- fish_dat = self.recap_data[self.recap_data.freq_code == i] # get data for this fish
674
- fish_dat.sort_values(by = 'epoch',
675
- ascending = True,
676
- inplace = True) # sort by exposure time
677
-
678
- # identify previous state and fill in nans
679
- fish_dat['prev_state'] = fish_dat['state'].shift(1) # get previous state
680
- fish_dat['prev_state'].fillna(fish_dat.state.values[0], inplace = True) # fill NaN states with current state - for first record in data frame
681
-
682
- # initialize some counters
683
- presence = 0
684
- first_obs = 1
685
-
686
- # create an empty state table
687
- state_table = pd.DataFrame(columns = columns)
983
+ self.recap_data = self.recap_data.merge(
984
+ self.start_times[['first_recapture']].reset_index(),
985
+ on='freq_code',
986
+ how='left')
987
+
988
+ # Create a boolean array to mark the start of a new fish
989
+ fish_start_mask = self.recap_data['freq_code'] != self.recap_data['freq_code'].shift(1)
990
+
991
+ # Initialize state tracking columns
992
+ self.recap_data['prev_state'] = self.recap_data.groupby('freq_code')['state'].shift(1).fillna(0).astype(int)
993
+ if self.initial_state_release == False:
994
+ self.recap_data = self.recap_data[self.recap_data.prev_state > 0]
995
+ # Set time_0 to the previous epoch or first_recapture if it's the first observation
996
+ self.recap_data['time_0'] = self.recap_data.groupby('freq_code')['epoch'].shift(1)
997
+ self.recap_data['time_0'] = self.recap_data['time_0'].fillna(
998
+ self.recap_data['first_recapture']
999
+ )
1000
+
1001
+ self.recap_data['time_delta'] = self.recap_data['epoch'] - self.recap_data['time_0']
1002
+
1003
+ # Identify the rows where state changes or the fish changes (new fish)
1004
+ state_change_mask = self.recap_data['state'] != self.recap_data['prev_state']
1005
+ last_recapture_mask = self.recap_data.groupby('freq_code')['epoch'].transform('max') == self.recap_data['epoch']
1006
+ mask = state_change_mask | last_recapture_mask
1007
+
1008
+ # Filter rows to keep only those where state changes or it's the last record for the fish
1009
+ state_table = self.recap_data[mask].copy()
1010
+
1011
+ # Fill in the remaining columns
1012
+ state_table['start_state'] = state_table['prev_state'].astype('int32')
1013
+ state_table['end_state'] = state_table['state'].astype('int32')
1014
+ # drop duplicates
1015
+ state_table = state_table.drop_duplicates(subset=['time_0', 'end_state', 'start_state'])
1016
+
1017
+ state_table['presence'] = state_table.groupby('freq_code').cumcount()
1018
+ state_table['first_obs'] = fish_start_mask.astype(int)
1019
+ state_table['time_1'] = state_table['epoch']
1020
+
1021
+ # Create the 'transition' column by zipping 'start_state' and 'end_state'
1022
+ state_table['transition'] = list(zip(state_table['start_state'].astype('int32'), state_table['end_state'].astype('int32')))
1023
+
1024
+ # Add flow period for time-dependent variables
1025
+ state_table['flow_period'] = state_table['time_stamp'].dt.round('30min')
1026
+
1027
+
1028
+ # Write state table to master state table
1029
+ self.master_state_table = pd.concat([self.master_state_table, state_table[columns]], axis=0, ignore_index=True)
1030
+
1031
+ # handle initial release logic in helper functions to improve readability
1032
+ if self.initial_state_release:
1033
+ try:
1034
+ self._convert_release_rows()
1035
+ self._ensure_diagonals()
1036
+ self._insert_missing_releases(project)
1037
+ self._normalize_master()
1038
+ except (KeyError, ValueError, TypeError, IndexError) as e:
1039
+ raise RuntimeError(
1040
+ "[TTE] Failed to apply initial release normalization. "
1041
+ "Check release rows and state transitions for validity."
1042
+ ) from e
1043
+
1044
+ if adjacency_filter is not None:
1045
+ self._apply_adjacency_filter(adjacency_filter)
1046
+ # Apply species filter to master_state_table if requested (defer filtering until master built)
1047
+ if getattr(self, 'species', None) is not None:
1048
+ if 'species' not in self.master_state_table.columns:
1049
+ raise ValueError("[TTE] Species filter requested but 'species' column is missing.")
1050
+ before = len(self.master_state_table)
1051
+ self.master_state_table = self.master_state_table[self.master_state_table.species == self.species].copy()
1052
+ after = len(self.master_state_table)
1053
+ print(f"[TTE] Applied species filter '{self.species}': {before} -> {after} rows")
1054
+ #self.master_stateTable = self.master_stateTable[self.master_stateTable.firstObs == 0]
1055
+ #self.master_state_table.to_csv(os.path.join(project.output_dir,'state_table.csv')
1056
+
1057
+ # generate summary statistics
1058
+ def summary(self, print_summary = True):
1059
+ """Prepare the data needed for summarization."""
1060
+ self.master_state_table.dropna(subset = ['time_delta'],inplace = True)
1061
+ self.master_state_table = self.master_state_table.astype({'freq_code':'object',
1062
+ 'species':'object',
1063
+ 'start_state':'int32',
1064
+ 'end_state':'int32',
1065
+ 'presence':'int32',
1066
+ 'time_stamp':'datetime64[ns]',
1067
+ 'time_delta':'int32',
1068
+ 'first_obs':'int32',
1069
+ 'time_0':'int32',
1070
+ 'time_1':'int32',
1071
+ 'transition':'object'})
1072
+
1073
+ self.master_state_table['dur'] = (
1074
+ self.master_state_table['time_1'].astype('int32') -
1075
+ self.master_state_table['time_0'].astype('int32')
1076
+ )
1077
+
1078
+ self.unique_fish_count = len(self.master_state_table['freq_code'].unique())
1079
+ self.count_per_state = self.master_state_table.groupby('end_state')['freq_code'].nunique()
1080
+ self.msm_state_table = pd.crosstab(self.master_state_table['start_state'], self.master_state_table['end_state'])
1081
+ self.count_table = self.master_state_table.groupby(['start_state', 'end_state'])['freq_code'].nunique().unstack().fillna(0).astype('int32')
1082
+ self.fish_trans_count = self.master_state_table.groupby(['freq_code', 'transition']).size().unstack(fill_value=0)
1083
+
1084
+ grouped_stats = (
1085
+ self.master_state_table
1086
+ .groupby('transition')
1087
+ .agg({
1088
+ 'dur': [
1089
+ 'min',
1090
+ 'median',
1091
+ 'max'
1092
+ ]
1093
+ })
1094
+ )
1095
+ self.move_summ = grouped_stats
1096
+
1097
+ """Generate summary statistics as a dictionary."""
1098
+ min_trans_count = self.fish_trans_count.min()
1099
+ med_trans_count = self.fish_trans_count.median()
1100
+ max_trans_count = self.fish_trans_count.max()
1101
+
1102
+ summary_stats = {
1103
+ "unique_fish_count": self.unique_fish_count,
1104
+ "count_per_state": self.count_per_state,
1105
+ "state_transition_table": self.msm_state_table,
1106
+ "movement_count_table": self.count_table,
1107
+ "min_transition_count": min_trans_count,
1108
+ "median_transition_count": med_trans_count,
1109
+ "max_transition_count": max_trans_count,
1110
+ "movement_duration_summary": self.move_summ
1111
+ }
1112
+ # Print stats
1113
+
1114
+ print("-" * 110)
1115
+ print("Time To Event Data Manage Complete")
1116
+ print("-" * 110 + "\n")
1117
+
1118
+ print("--------------------------------------- MOVEMENT SUMMARY STATISTICS -----------------------------------------\n")
1119
+ print(f"In total, there were {summary_stats['unique_fish_count']} unique fish within this competing risks model.\n")
1120
+
1121
+ print(f"{summary_stats['unique_fish_count']} fish made the movements as enumerated in the state transition table:")
1122
+ print(summary_stats['state_transition_table'])
1123
+ print("The table should read movement from a row to a column.\n")
1124
+
1125
+ print("The number of unique fish to make these movements are found in the following count table:")
1126
+ print(summary_stats['movement_count_table'], "\n")
688
1127
 
689
- # get initial start and end times and filter dataset
690
- time_0 = self.start_times.at[i,'first_recapture']
691
- fish_dat = fish_dat[fish_dat.epoch >= time_0]
692
- time_1 = fish_dat.epoch.iloc[0]
693
-
694
- # calculate seconds since releaes
695
- time_delta = time_1 - time_0
696
-
697
- # creat a row and add it to the state table
698
- row_arr = [i,
699
- 0,
700
- fish_dat.state.values[0],
701
- presence,
702
- fish_dat.time_stamp.values[-1],
703
- first_obs,
704
- time_0,
705
- time_1] # create initial row for state table
706
-
707
- row = pd.DataFrame(np.array([row_arr]),columns = columns)
708
- state_table = state_table.append(row)
709
-
710
- # create arbitrary index and get the maximum
711
- fish_dat['idx'] = np.arange(0,len(fish_dat),1)
712
- max_idx = fish_dat.idx.iloc[-1]
713
-
714
- # for each row, if it's a new presence add data to state table
715
- for j in fish_dat.iterrows(): # for every row in fish data
716
- row_idx = j[1]['idx'] # what's the row number?
717
- state_1 = int(j[1]['prev_state']) # what's the state
718
- state_2 = int(j[1]['state']) # what was the previous state
719
- ts = j[1]['time_stamp']
720
-
721
- # if it's a new state or the end add a row
722
- if state_1 != state_2 or row_idx == max_idx: # if the present state does not equal the previous state or if we reach the end of the dataframe...
723
- time_1 = j[1]['epoch'] # what time is it?
724
- time_delta = time_1 - time_0 # calculate difference in seconds between current time and release # if it's a new state
725
- presence = presence + 1 # oh snap new observation for new state
726
- row_arr = [i,
727
- state_1,
728
- state_2,
729
- presence,
730
- ts,
731
- first_obs,
732
- time_0,
733
- time_1] # start a new row
734
- row = pd.DataFrame(np.array([row_arr]),
735
- columns = columns)
736
- state_table = state_table.append(row) # add the row to the state table data frame
737
- time_0 = j[1]['epoch']
738
- first_obs = 0
739
-
740
- print ("State Table Completed for Fish %s"%(i))
741
-
742
- state_table.sort_values(by = 'time_0',
743
- ascending = True,
744
- inplace = True) # sort by exposure time
745
-
746
- # put time into increments to match time series variables
747
- time_bucket = self.bucket_length*60*1000000000 # time bucket in nanoseconds
748
- state_table['flow_period'] = (state_table['time_0'].\
749
- astype(np.int64)//time_bucket+1) * time_bucket # round to nearest 15 minute period
750
- state_table['flow_period'] = pd.to_datetime(state_table['flow_period']) # turn it into a datetime object so we can use pandas to expand and fill
751
-
752
- # create arbitrary index
753
- row_num = np.arange(0,len(state_table),1)
754
- state_table['row_num'] = row_num
755
-
756
- # create an expanded state table
757
- exp_state_table = pd.DataFrame()
758
-
759
- # build expanded state table
760
- for row in state_table.iterrows():
761
- row_idx = row[1]['row_num']
762
- t0 = row[1]['flow_period']
763
- t1 = row[1]['t1']
764
-
765
- # try expanding, if interval not large enough return nothing
766
- try:
767
- expand = pd.date_range(t0,
768
- t1,
769
- freq = '%smin'%(self.bucket_length))
770
- except ValueError:
771
- expand = []
772
- except AttributeError:
773
- expand = []
774
-
775
- # if we can expand create intervals
776
- if len(expand) > 0:
777
- # create a series using expanded time stamps
778
- series = pd.Series(expand,
779
- index = expand,
780
- name = 'flow_period')
781
-
782
- # convert series to invterval dataframe and perform data management
783
- intervals = series.to_frame()
784
- intervals.reset_index(inplace = True, drop = True)
785
- intervals['t0'] = row[1]['t0']
786
- intervals['t1'] = row[1]['t1']
787
- intervals['startState'] = row[1]['startState']
788
- intervals['endState'] = row[1]['endState']
789
- intervals['timeStamp'] = row[1]['timeStamp']
790
- intervals['FreqCode'] = row[1]['FreqCode']
791
- intervals['presence'] = row[1]['presence']
792
- newRowArr = np.array([row[1]['FreqCode'],
793
- row[1]['startState'],
794
- row[1]['endState'],
795
- row[1]['timeStamp'],
796
- row[1]['flowPeriod'],
797
- row[1]['t0'],
798
- row[1]['t1'],
799
- row[1]['presence']])
800
- newRow = pd.DataFrame(np.array([newRowArr]),columns = ['FreqCode',
801
- 'startState',
802
- 'endState',
803
- 'timeStamp',
804
- 'flowPeriod',
805
- 't0',
806
- 't1',
807
- 'presence']) # add first, non expanded row to new state table
808
- newRow = newRow.append(intervals) # add filled and expanded data
809
- newRow['nextFlowPeriod'] = newRow['flowPeriod'].shift(-1) # identify the next flow period
810
- newRow['idx'] = np.arange(0,len(newRow),1) # add a count index field, but don't index it yet
811
- newRow.reset_index(inplace = True, drop = True) # remove the index
812
- idxL = newRow.idx.values # generate list of indexes
813
- newRow.loc[idxL[1]:,'t0'] = newRow.loc[idxL[1]:,'flowPeriod'].astype(str) # after the initial t0, re-write the current t0 as the current row's flow period
814
- newRow.ix[:idxL[-2],'t1'] = newRow.loc[:idxL[-2],'nextFlowPeriod'].astype(str) # other than the last t1, re-write the current t1 as the current row's next flow period - see what we did there?
815
- newRow.ix[:idxL[-2]:,'endState'] = row[1]['startState']# other than the last row in the series, re-write the end state as the start state - there will be a lot of to-from same site here. it's ok, these are censored observations.
816
- newRow['t0'] = pd.to_datetime(newRow['t0']) # convert time text to datetime - so we can do stuff with it
817
- newRow['t1'] = pd.to_datetime(newRow['t1'])
818
- exp_state_table = exp_state_table.append(newRow) # now add all that stuff to the state table dataframe
819
- del newRow, intervals, newRowArr, expand
820
- else:
821
- newRowArr = np.array([row[1]['FreqCode'],
822
- row[1]['startState'],
823
- row[1]['endState'],
824
- row[1]['timeStamp'],
825
- row[1]['flowPeriod'],
826
- row[1]['t0'],
827
- row[1]['t1'],
828
- row[1]['presence']])
829
- newRow = pd.DataFrame(np.array([newRowArr]),columns = ['FreqCode',
830
- 'startState',
831
- 'endState',
832
- 'timeStamp',
833
- 'flowPeriod',
834
- 't0',
835
- 't1',
836
- 'presence']) # add first, non expanded row to new state table
837
- exp_state_table = exp_state_table.append(newRow)
838
- del newRow, newRowArr
839
- # exp_state_table.sort_values(by = 't0', ascending = True, inplace = True) # sort by exposure time
840
- # exp_state_table['time0'] = pd.to_datetime(exp_state_table['t0']) # create new time columns
841
- # exp_state_table['time1'] = pd.to_datetime(exp_state_table['t1'])
842
- # exp_state_table['t0'] = (pd.to_datetime(exp_state_table['t0']) - initialTime)/np.timedelta64(1,'s')
843
- # exp_state_table['t1'] = (pd.to_datetime(exp_state_table['t1']) - initialTime)/np.timedelta64(1,'s')
844
- # # calculate minimum t0 by presence
845
- # min_t0 = exp_stateTable.groupby(['presence'])['t0'].min()#.to_frame().rename({'t0':'min_t0'},inplace = True)
846
- # min_t0 = pd.Series(min_t0, name = 'min_t0')
847
- # min_t0 = pd.DataFrame(min_t0).reset_index()
848
- # # join to exp_stateTable as presence_time_0
849
- # exp_stateTable = pd.merge(left = exp_stateTable, right = min_t0, how = u'left',left_on = 'presence', right_on = 'presence')
850
- # # subtract presence_time_0 from t0 and t1
851
- # exp_stateTable['t0'] = exp_stateTable['t0'] - exp_stateTable['min_t0']
852
- # exp_stateTable['t1'] = exp_stateTable['t1'] - exp_stateTable['min_t0']
853
- # # drop presence_time_0 from exp_stateTable
854
-
855
- # exp_stateTable['hour'] = pd.DatetimeIndex(exp_stateTable['time0']).hour # get the hour of the day from the current time stamp
856
- # exp_stateTable['qDay'] = exp_stateTable.hour//6 # integer division by 6 to put the day into a quarter
857
- # exp_stateTable['test'] = exp_stateTable.t1 - exp_stateTable.t0 # this is no longer needed, but if t1 is smaller than t0 things are screwed up
858
- # stateTable = exp_stateTable
859
- # del exp_stateTable
860
- # stateTable['transition'] = tuple(zip(stateTable.startState.values.astype(int),stateTable.endState.values.astype(int))) # create transition variable, this is helpful in R
861
- # self.master_stateTable = self.master_stateTable.append(stateTable)
862
- # export
863
- self.master_stateTable.drop(labels = ['nextFlowPeriod'],axis = 1, inplace = True)
864
-
865
-
866
- if adjacency_filter is not None:
867
- '''When the truth value of a detection is assessed, a detection
868
- may be valid for a fish that is not present.
869
-
870
- In some instances, especially when using Yagi antennas, back-lobes
871
- may develop where a fish in the tailrace of a powerhouse is
872
- detected in the forebay antenna. In these instances, a downstream
873
- migrating fish would not have migrated up through the powerhouse.
874
-
875
- From a false positive perspective, these records are valid detections.
876
- However, from a movement perspective, these series of detections
877
- could not occur and should be removed.
878
-
879
- This function repeatedly removes rows with 'illegal' movements
880
- until there are none left. Rows with 'illegal' transitions are
881
- identified with a list that is passed to the function.
882
-
883
- input = list of illegal transitions stored as (from, to) tuples
884
- '''
885
- fish = self.master_stateTable.FreqCode.unique()
886
-
887
- for i in fish:
888
- fishDat = self.master_stateTable[self.master_stateTable.FreqCode == i]
889
- self.master_stateTable = self.master_stateTable[self.master_stateTable.FreqCode != i]
890
-
891
- # create a condition, we're running this filter because we know illogical movements are present
892
- bad_moves_present = True
893
-
894
- # while there are illogical movements, keep filtering
895
- while bad_moves_present == True:
896
- # let's keep count of the number of rows we are filtering
897
- filtered_rows = 0.0
898
-
899
- # for every known bad movement
900
- for j in adjacency_filter:
901
- print ("Starting %s filter"%(i))
902
- # find those rows where this movement exists
903
- fishDat['transition_filter'] = np.where(fishDat.transition == j,1,0)
904
- fishDat.set_index(['time0'], inplace = True)
905
-
906
- if fishDat.transition_filter.sum() > 0:
907
- # add up those rows
908
- filtered_rows = filtered_rows + fishDat.transition_filter.sum()
909
- print ('%s rows found with %s movements'%(fishDat.transition_filter.sum(),j))
910
-
911
- # do some data management, we need to take the start state and t0 of the affected rows and place them on the subsequent row
912
- idx = fishDat.index[fishDat['transition_filter']==1]
913
-
914
- for k in idx:
915
- idx_int = fishDat.index.get_loc(k)
916
- t0_col = fishDat.columns.get_loc('t0')
917
- start_col = fishDat.columns.get_loc('startState')
918
-
919
- # get start time and start state
920
- start = fishDat.iloc[idx_int]['startState']
921
- t0 = fishDat.iloc[idx_int]['t0']
922
-
923
- # write it to next row
924
- try:
925
- idx1 = idx_int + 1
926
- except:
927
- start = fishDat.iloc[idx_int].index[0]
928
- idx1 = start + 1
929
- try:
930
- fishDat.iloc[idx1, start_col] = start
931
- fishDat.iloc[idx1, t0_col] = t0
932
- except IndexError:
933
- # when this occurs, there is no extra row - this last row will be deleted
934
- continue
935
-
936
- # remove those rows
937
- fishDat = fishDat[fishDat.transition_filter != 1]
938
-
939
- # create a new transition field
940
- fishDat['transition'] = tuple(zip(fishDat.startState.values.astype(int),
941
- fishDat.endState.values.astype(int)))
942
-
943
- fishDat.reset_index(inplace = True)
944
- else:
945
- print ("No illegal movements identified")
946
- fishDat.reset_index(inplace = True)
947
-
948
- if filtered_rows == 0.0:
949
- print ("All illegal movements for fish %s removed"%(i))
950
- # stop that loop
951
- bad_moves_present = False
952
-
953
- else:
954
- # i feel bad for you son
955
- print ("%s illegal movements present in iteration, go again"%(filtered_rows))
956
-
957
- fishDat.drop(labels = ['transition_filter'], axis = 1, inplace = True)
958
- self.master_stateTable = self.master_stateTable.append(fishDat)
1128
+ print("The number of movements a fish is expected to make is best described with min, median, and maximum statistics.\n")
1129
+ print("Minimum number of times each transition was made:")
1130
+ print(summary_stats['min_transition_count'], "\n")
959
1131
 
960
- #self.master_stateTable = self.master_stateTable[self.master_stateTable.firstObs == 0]
961
- self.master_stateTable.to_csv(outputFile)
1132
+ print("Median number of times each transition was made:")
1133
+ print(summary_stats['median_transition_count'], "\n")
962
1134
 
963
- # generate summary statistics
964
- def summary(self):
965
- print ("--------------------------------------------------------------------------------------------------------------")
966
- print ("Time To Event Data Manage Complete")
967
- print ("--------------------------------------------------------------------------------------------------------------")
968
- print ("")
969
- print ("")
970
- print ("---------------------------------------MOVEMENT SUMMARY STATISTICS--------------------------------------------")
971
- print ("")
972
- print ("In Total, there were %s unique fish within this competing risks model"%(len(self.master_stateTable.FreqCode.unique())))
973
- print ("The number of unique fish per state:")
974
- countPerState = self.master_stateTable.groupby(['state'])['FreqCode'].nunique().to_frame()
975
- print (countPerState)
976
- print ("")
977
- msm_stateTable = pd.crosstab(self.master_stateTable.startState, self.master_stateTable.endState)
978
- print ("These fish made the following movements as enumerated in the state transition table:")
979
- print (msm_stateTable)
980
- print ("The table should read movement from a row to a column")
981
- print ("")
982
- self.master_stateTable['transition'] = self.master_stateTable.transition.astype(str)
983
- self.countTable = self.master_stateTable.groupby(['startState','endState'])['FreqCode'].nunique().to_frame()
984
- self.countTable.reset_index(inplace = True)
985
- countPerTrans = pd.crosstab(self.countTable.startState,self.countTable.endState,values = self.countTable.FreqCode, aggfunc = 'sum')
986
- print ("The number of unique fish to make these movements are found in the following count table")
987
- print (countPerTrans)
988
- print ("")
989
- # Step 3: Describe the expected number of transitions per fish
990
- self.fishTransCount = self.master_stateTable.groupby(['FreqCode','transition'])['transition'].count()
991
- self.fishTransCount = self.fishTransCount.to_frame(name = 'transCount')
992
- #self.fishTransCount.rename(columns = {'':'transCount'}, inplace = True)
993
- #self.fishTransCount.reset_index(inplace = True)
994
-
995
- print ("The number of movements a fish is expected to make is best described with min, median and maximum statistics")
996
- print ("The mininum number of times each transition was made:")
997
- min_transCount = self.fishTransCount.groupby(['transition'])['transCount'].min()
998
- print (min_transCount)
999
- print ("")
1000
- print ("The median number of times each transition was made:")
1001
- med_transCount = self.fishTransCount.groupby(['transition'])['transCount'].median()
1002
- print (med_transCount)
1003
- print ("")
1004
- print ("The maximum number of times each transition was made by each fish:")
1005
- max_transCount = self.fishTransCount.groupby(['FreqCode','transition'])['transCount'].max()
1006
- #max_transCount.to_csv(os.path.join(r'C:\Users\Kevin Nebiolo\Desktop','maxCountByFreqCode.csv'))
1007
- print (max_transCount)
1008
- print ("")
1009
- print ("Movement summaries - Duration between states in seconds")
1010
- self.master_stateTable['dur'] = (self.master_stateTable.t1 - self.master_stateTable.t0)
1011
- move_summ = self.master_stateTable.groupby('transition')['dur'].describe().round(decimals = 3)
1012
- print (move_summ)
1135
+ print("Maximum number of times each transition was made by each fish:")
1136
+ print(summary_stats['max_transition_count'], "\n")
1137
+
1138
+ print("Movement summaries - Duration between states in seconds:")
1139
+ print(summary_stats['movement_duration_summary'], "\n")
1140
+ self.msm_state_table.to_csv(os.path.join(self.project.output_dir,'state_table.csv'))
1141
+ self.count_table.to_csv(os.path.join(self.project.output_dir,'count_table.csv'))
1142
+ self.master_state_table.to_csv(os.path.join(self.project.output_dir,'tte.csv'))
1143
+ self.move_summ.to_csv(os.path.join(self.project.output_dir,'movement_summary.csv'))
1144
+ return summary_stats
1145
+
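+ # Minimal consumption sketch (assumes a prepared time_to_event instance
+ # named `tte`; the keys below mirror the dict entries printed above):
+ # stats = tte.summary()
+ # stats['min_transition_count']       # per-transition minimum count
+ # stats['median_transition_count']    # per-transition median count
+ # stats['max_transition_count']       # per fish-and-transition maximum
+ # stats['movement_duration_summary']  # describe() of durations in seconds
+ # summary() also writes state_table.csv, count_table.csv, tte.csv, and
+ # movement_summary.csv to project.output_dir.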
1146
+ # Helper methods for initial_state_release handling (refactored)
1147
+ def _convert_release_rows(self):
1148
+ """Convert transitions ending at release (0) into stay-in-place rows
1149
+ and remove trivial 0->0 rows. Operates per-fish to avoid cross-fish
1150
+ contamination and prints summaries when changes occur."""
1151
+ if 'end_state' not in self.master_state_table.columns:
1152
+ return
1153
+ per_fish = []
1154
+ total_converted = 0
1155
+ total_removed = 0
1156
+ for fish in self.master_state_table['freq_code'].unique():
1157
+ df = self.master_state_table[self.master_state_table['freq_code'] == fish].copy()
1158
+ mask_end0 = df['end_state'] == 0
1159
+ n_end0 = int(mask_end0.sum())
1160
+ if n_end0 > 0:
1161
+ total_converted += n_end0
1162
+ for idx in df.index[mask_end0].tolist():
1163
+ # start_state is required here; int() raises if it is missing
+ ss = int(df.at[idx, 'start_state'])
1164
+ df.at[idx, 'end_state'] = ss
1165
+ df.at[idx, 'time_delta'] = 0
1166
+ df.at[idx, 'time_1'] = df.at[idx, 'time_0']
1167
+ df.at[idx, 'transition'] = (ss, ss)
1168
+
1169
+ zero_zero_mask = (df['start_state'] == 0) & (df['end_state'] == 0)
1170
+ n_zero_zero = int(zero_zero_mask.sum())
1171
+ if n_zero_zero > 0:
1172
+ total_removed += n_zero_zero
1173
+ df = df.loc[~zero_zero_mask].copy()
1174
+
1175
+ per_fish.append(df)
1176
+
1177
+ if total_converted > 0:
1178
+ print(f"[TTE] Converting {total_converted} transitions ending at release state (0) into stay-in-place rows")
1179
+ if total_removed > 0:
1180
+ print(f"[TTE] Removing {total_removed} trivial release->release (0->0) rows")
1181
+
1182
+ if per_fish:
1183
+ self.master_state_table = pd.concat(per_fish, axis=0, ignore_index=True)
1184
+
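+ # Worked example on hypothetical values: a row with start_state=2,
+ # end_state=0, time_0=100, time_1=250 becomes the stay-in-place row
+ # start_state=2, end_state=2, time_1=100, time_delta=0, transition=(2, 2);
+ # any row with start_state=0 and end_state=0 is dropped outright.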
1185
+ def _ensure_diagonals(self):
1186
+ """Ensure each fish's last-seen state has a diagonal (S->S) row
1187
+ whose entry time is the start of the contiguous final block of S."""
1188
+ diag_rows = []
1189
+ for fish in self.master_state_table['freq_code'].unique():
1190
+ rows = self.master_state_table[self.master_state_table['freq_code'] == fish].copy()
1191
+ if rows.empty:
1192
+ continue
1193
+ try:
1194
+ rows['time_1'] = rows['time_1'].astype('int64')
1195
+ rows['time_0'] = rows['time_0'].astype('int64')
1196
+ except (ValueError, TypeError) as exc:
1197
+ raise ValueError(
1198
+ f"[TTE] Invalid time_0/time_1 values for fish {fish}."
1199
+ ) from exc
1200
+ last_idx = rows['time_1'].idxmax()
1201
+ last_row = rows.loc[last_idx]
1202
+ last_state = int(last_row['end_state'])
1203
+ last_time = int(last_row['time_1'])
1204
+ diag_exists = ((rows['start_state'] == last_state) & (rows['end_state'] == last_state) & (rows['time_1'] == last_time)).any()
1205
+ if not diag_exists and last_state > 0:
1206
+ fr = rows.sort_values(['time_0','time_1']).reset_index(drop=True)
1207
+ positions = fr.index[fr['end_state'] == last_state].tolist()
1208
+ entry_time = last_time
1209
+ if positions:
1210
+ candidate_idxs = [i for i in positions if int(fr.at[i, 'time_1']) == last_time]
1211
+ last_pos = candidate_idxs[-1] if candidate_idxs else positions[-1]
1212
+ start_pos = last_pos
1213
+ while start_pos > 0 and int(fr.at[start_pos - 1, 'end_state']) == last_state:
1214
+ start_pos -= 1
1215
+ try:
1216
+ entry_time = int(fr.at[start_pos, 'time_0'])
1217
+ except (KeyError, ValueError, TypeError) as exc:
1218
+ raise ValueError(
1219
+ f"[TTE] Invalid entry time for fish {fish} at state {last_state}."
1220
+ ) from exc
1221
+
1222
+ diag = {
1223
+ 'freq_code': fish,
1224
+ 'species': last_row.get('species', np.nan),
1225
+ 'start_state': last_state,
1226
+ 'end_state': last_state,
1227
+ 'presence': int(last_row.get('presence', 0)),
1228
+ 'time_stamp': last_row.get('time_stamp', pd.to_datetime(last_time, unit='s')),
1229
+ 'time_delta': int(last_time - entry_time),
1230
+ 'first_obs': 0,
1231
+ 'time_0': int(entry_time),
1232
+ 'time_1': int(last_time),
1233
+ 'transition': (last_state, last_state)
1234
+ }
1235
+ diag_rows.append(diag)
1236
+
1237
+ if diag_rows:
1238
+ diag_df = pd.DataFrame(diag_rows)
1239
+ for c in diag_df.columns:
1240
+ if c in self.master_state_table.columns:
1241
+ try:
1242
+ diag_df[c] = diag_df[c].astype(self.master_state_table[c].dtype)
1243
+ except (ValueError, TypeError) as exc:
1244
+ raise ValueError(
1245
+ f"[TTE] Failed to cast diagonal column '{c}' for fish rows."
1246
+ ) from exc
1247
+ self.master_state_table = pd.concat([self.master_state_table, diag_df], axis=0, ignore_index=True)
1248
+
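+ # Walk-back sketch on a hypothetical fish: end_state sequence 1, 2, 2, 2
+ # with the last detection in state 2 at time_1=900. The contiguous final
+ # block of state 2 starts at the second row, so the appended diagonal row
+ # is (2, 2) with time_0 taken from that row and time_1=900. Nothing is
+ # appended when the diagonal already exists or the last state is 0.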
1249
+ def _insert_missing_releases(self, project=None):
1250
+ """Insert a release (start_state==0) row for fish missing one using
1251
+ release times from project's tag table (must have `freq_code`)."""
1252
+ tag_table = None
1253
+ if hasattr(self, 'project') and hasattr(self.project, 'tags'):
1254
+ tag_table = self.project.tags
1255
+ elif project is not None and hasattr(project, 'tags'):
1256
+ tag_table = project.tags
1257
+ if tag_table is None:
1258
+ raise ValueError("[TTE] Tag table missing; cannot insert missing releases.")
1259
+
1260
+ if 'freq_code' in tag_table.columns:
1261
+ tag_df = tag_table.copy()
1262
+ else:
1263
+ tag_df = tag_table.reset_index()
1264
+ if 'freq_code' not in tag_df.columns:
1265
+ raise ValueError("[TTE] Tag table missing required 'freq_code' column.")
1266
+
1267
+ missing = []
1268
+ for fish in self.master_state_table['freq_code'].unique():
1269
+ fish_df = self.master_state_table[self.master_state_table['freq_code'] == fish]
1270
+ if (fish_df['start_state'] == 0).any():
1271
+ continue
1272
+ earliest = fish_df.sort_values(['time_0', 'time_1']).iloc[0]
1273
+ try:
1274
+ candidate_end = int(earliest.get('start_state', earliest.get('end_state', 0)))
1275
+ except (ValueError, TypeError) as exc:
1276
+ raise ValueError(f"[TTE] Invalid start_state for fish {fish}.") from exc
1277
+ if candidate_end == 0:
1278
+ candidate_end = int(earliest.get('end_state', 0))
1279
+
1280
+ rows = tag_df[tag_df['freq_code'].astype(str) == str(fish)]
1281
+ if rows.empty:
1282
+ raise ValueError(f"[TTE] Missing release data for fish {fish}.")
1283
+
1284
+ if 'rel_date' in rows.columns:
1285
+ try:
1286
+ rel_epoch = int(
1287
+ (pd.to_datetime(rows['rel_date'].iloc[0]) - pd.Timestamp('1970-01-01'))
1288
+ / pd.Timedelta('1s')
1289
+ )
1290
+ except (ValueError, TypeError) as exc:
1291
+ raise ValueError(f"[TTE] Invalid rel_date for fish {fish}.") from exc
1292
+ elif 'epoch' in rows.columns:
1293
+ try:
1294
+ rel_epoch = int(rows['epoch'].iloc[0])
1295
+ except (ValueError, TypeError) as exc:
1296
+ raise ValueError(f"[TTE] Invalid epoch for fish {fish}.") from exc
1297
+ else:
1298
+ raise ValueError(f"[TTE] Tag table missing rel_date/epoch for fish {fish}.")
1299
+
1300
+ species_val = rows['species'].iloc[0] if 'species' in rows.columns else np.nan
1301
+ release_row = {
1302
+ 'freq_code': fish,
1303
+ 'species': species_val,
1304
+ 'start_state': 0,
1305
+ 'end_state': candidate_end,
1306
+ 'presence': 0,
1307
+ 'time_stamp': pd.to_datetime(rel_epoch, unit='s'),
1308
+ 'time_delta': int(earliest['time_0'] - rel_epoch) if 'time_0' in earliest else 0,
1309
+ 'first_obs': 0,
1310
+ 'time_0': int(rel_epoch),
1311
+ 'time_1': int(earliest['time_0']) if 'time_0' in earliest else int(rel_epoch),
1312
+ 'transition': (0, candidate_end)
1313
+ }
1314
+ missing.append(release_row)
1315
+
1316
+ if missing:
1317
+ mr = pd.DataFrame(missing)
1318
+ for c in mr.columns:
1319
+ if c in self.master_state_table.columns:
1320
+ try:
1321
+ mr[c] = mr[c].astype(self.master_state_table[c].dtype)
1322
+ except (ValueError, TypeError) as exc:
1323
+ raise ValueError(
1324
+ f"[TTE] Failed to cast release column '{c}' for missing releases."
1325
+ ) from exc
1326
+ self.master_state_table = pd.concat([self.master_state_table, mr], axis=0, ignore_index=True)
1327
+
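+ # Contract sketch (hypothetical values): the tag table must expose
+ # 'freq_code' plus either 'rel_date' or 'epoch'. For a fish released at
+ # epoch 200 whose earliest row has start_state=1 and time_0=500, the
+ # inserted row is start_state=0, end_state=1, time_0=200, time_1=500,
+ # time_delta=300, transition=(0, 1).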
1328
+ def _normalize_master(self):
1329
+ """Normalize master_state_table: coerce time dtypes, drop duplicates,
1330
+ sort deterministically, and recompute presence/first_obs per fish."""
1331
+ try:
1332
+ self.master_state_table['time_0'] = self.master_state_table['time_0'].astype('int64')
1333
+ self.master_state_table['time_1'] = self.master_state_table['time_1'].astype('int64')
1334
+ except (ValueError, TypeError) as exc:
1335
+ raise ValueError("[TTE] time_0/time_1 must be numeric epoch seconds.") from exc
1336
+
1337
+ dup_subset = ['freq_code', 'start_state', 'end_state', 'time_0', 'time_1']
1338
+ self.master_state_table = self.master_state_table.drop_duplicates(subset=dup_subset, keep='first').copy()
1339
+ self.master_state_table.sort_values(by=['freq_code', 'time_0', 'time_1'], inplace=True)
1340
+
1341
+ self.master_state_table['presence'] = (
1342
+ self.master_state_table.groupby('freq_code').cumcount().astype('int32')
1343
+ )
1344
+ self.master_state_table['first_obs'] = 0
1345
+
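+ # Effect sketch: duplicate (freq_code, start_state, end_state, time_0,
+ # time_1) rows collapse to their first occurrence, rows sort by
+ # (freq_code, time_0, time_1), 'presence' renumbers each fish's rows
+ # 0..n-1 in that order, and 'first_obs' resets to 0.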
1346
+ # Example usage
1347
+ # tte = time_to_event(receiver_to_state, project)
1348
+ # tte.data_prep(project)
1349
+ # summary_stats = tte.summary()
1350
+ # summary() prints the statistics and writes CSVs to project.output_dir