circaPy 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
circaPy/episodes.py ADDED
@@ -0,0 +1,505 @@
1
+ # Scripts for finding episodes
2
+ # can be sleep or activity episodes!
3
+
4
+ import pdb
5
+ import pandas as pd
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+ from IPython.core.debugger import set_trace
9
+ import circaPy.preprocessing as prep
10
+
11
+ # function to create episode dataframe
12
+ # starting off by working on just a single
13
+ # column.
14
+ # take in dataframe and column number as args
15
+ # and then return a series <- later problem
16
+ # to loop over and do for everything in the
17
+ # df
18
+
19
+
20
+ @prep.validate_input
21
def find_episodes(data,
                  subject_no=0,
                  min_length="1s",
                  max_interruption="0s",
                  *args,
                  **kwargs):
    """
    Find activity episodes for one subject and return their durations.

    An episode is a run of samples that are not equal to zero and that is
    enclosed by zero samples on both sides. Activity at the very start or
    end of the recording that is not enclosed by zeros is not reported.
    Episodes separated by a short enough interruption can be merged.

    Parameters
    ----------
    data : pd.DataFrame
        Activity data, one column per subject, indexed by time.
    subject_no : int, optional
        Positional index of the column to analyse. Default is 0.
    min_length : str or pandas.Timedelta, optional
        Episodes shorter than this are dropped (the comparison is ``>=``).
        Default is "1s".
    max_interruption : str or pandas.Timedelta, optional
        Two consecutive episodes are merged when the gap between the end of
        the first and the start of the second is at most this long. The
        default "0s" disables merging.
    *args : tuple
        Accepted for call compatibility; not used by this function.
    **kwargs : dict
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    pandas.Series
        Index is the start time of each episode, values are the episode
        durations in seconds.

    Examples
    --------
    >>> import pandas as pd
    >>> index = pd.date_range("2024-01-01", periods=10, freq="1s")
    >>> data = pd.DataFrame(
    ...     {"Subject 1": [0, 1, 1, 0, 0, 1, 0, 1, 1, 1]}, index=index)
    >>> find_episodes(data).tolist()
    [2.0, 1.0]
    >>> find_episodes(data, max_interruption="2s").tolist()
    [5.0]
    """
    # select single column
    curr_data = data.iloc[:, subject_no]
    zero_data = (curr_data == 0)

    # First sample of every run of zeros: the preceding activity has ended.
    episode_ends = zero_data & ~zero_data.shift(1, fill_value=False)
    # Last sample of every run of zeros: activity resumes one sample later.
    episode_starts = zero_data & ~zero_data.shift(-1, fill_value=False)

    data_freq = _sampling_interval(curr_data.index)
    episode_start_times = curr_data.index[episode_starts] + data_freq
    episode_end_times = curr_data.index[episode_ends]

    # Episode i runs from the end of zero-run i to the start of zero-run
    # i + 1, hence the offset-by-one pairing of the two indexes.
    episode_df = pd.Series(
        (episode_end_times[1:] - episode_start_times[:-1]).total_seconds(),
        index=episode_start_times[:-1])

    # Merge episodes based on max_interruption
    max_interruption_td = pd.Timedelta(max_interruption)
    if max_interruption_td > pd.Timedelta(0):
        episode_df = _merge_close_episodes(
            episode_df, max_interruption_td.total_seconds())

    # Finally, filter episodes by min_length
    min_length_td = pd.Timedelta(min_length)
    return episode_df[episode_df >= min_length_td.total_seconds()]


def _sampling_interval(index):
    """Return the spacing between consecutive samples as a Timedelta."""
    try:
        freq = pd.infer_freq(index)
    except (TypeError, ValueError):
        # e.g. fewer than three timestamps
        freq = None
    if freq is not None:
        try:
            # infer_freq returns bare aliases such as "s" or "min" for unit
            # steps, which pd.Timedelta cannot parse on its own; going via
            # an offset object handles both "s" and "10s".
            return pd.Timedelta(pd.tseries.frequencies.to_offset(freq))
        except (TypeError, ValueError):
            pass
    if len(index) > 1:
        return index[1] - index[0]
    return pd.Timedelta(0)


def _merge_close_episodes(episodes, max_gap_seconds):
    """Join consecutive episodes whose gap is at most max_gap_seconds."""
    merged = []
    current_start = None
    current_duration = None

    for start_time, duration in episodes.items():
        if current_start is None:
            current_start, current_duration = start_time, duration
            continue
        current_end = current_start + pd.Timedelta(seconds=current_duration)
        interruption = (start_time - current_end).total_seconds()
        if interruption <= max_gap_seconds:
            # the merged episode spans the interruption as well
            current_duration += interruption + duration
        else:
            merged.append((current_start, current_duration))
            current_start, current_duration = start_time, duration

    if current_start is None:
        # nothing to merge; keep the (empty) input as it is
        return episodes
    merged.append((current_start, current_duration))

    starts, durations = zip(*merged)
    return pd.Series(durations, index=pd.DatetimeIndex(starts), dtype=float)
144
+
145
+
146
def _episode_finder(data,
                    inactive_episodes=False,
                    allow_interruptions=False,
                    *args,
                    **kwargs):
    """
    Return the episodes found in a single activity series.

    Boundary samples are located (zeros by default), and every gap between
    two consecutive boundary samples that is long enough to contain other
    samples is reported as an episode.

    Parameters
    ----------
    data : pd.Series
        Raw activity data with a time-based index.
    inactive_episodes : bool, optional
        If False (default) the boundaries are the samples equal to zero, so
        the gaps are stretches of activity. If True the boundaries are the
        samples greater than zero, so the gaps are stretches of inactivity.
    allow_interruptions : bool, optional
        If True the result is passed through ``filter_episodes`` together
        with ``**kwargs``. Default is False.
    *args : tuple
        Accepted for call compatibility; not used by this function.
    **kwargs : dict
        ``min_length`` (timedelta string), when given, replaces the default
        gap threshold. All keyword arguments are forwarded to
        ``filter_episodes`` when ``allow_interruptions`` is True.

    Returns
    -------
    pd.Series
        Index is the timestamp of the boundary sample that opens the gap,
        values are the gap lengths in seconds. The series carries the name
        of ``data``.
    """
    # Fewer than two samples cannot hold two boundaries, so there is no
    # gap to report (and no sampling step to measure).
    if len(data.index) < 2:
        return pd.Series([], index=data.index[:0], dtype=float,
                         name=data.name)

    boundary_mask = (data > 0) if inactive_episodes else (data == 0)
    boundary_times = data[boundary_mask].index

    # time from each boundary sample to the next one
    start_times = boundary_times[:-1]
    gap_lengths = (boundary_times[1:] - start_times).total_seconds()
    episode_series = pd.Series(gap_lengths, index=start_times)

    if "min_length" in kwargs:
        threshold = pd.Timedelta(kwargs["min_length"]).total_seconds()
    else:
        # Adjacent boundary samples are exactly one step apart; anything
        # longer than "two steps minus a second" has samples in between.
        # Assumes the first step is representative of the whole index.
        basic_time_unit = data.index[1] - data.index[0]
        threshold = ((2 * basic_time_unit)
                     - pd.Timedelta("1s")).total_seconds()

    episode_lengths_filtered = episode_series[episode_series > threshold]
    episode_lengths_filtered.name = data.name

    if allow_interruptions:
        episode_lengths_filtered = filter_episodes(
            data, episode_lengths_filtered, **kwargs)

    return episode_lengths_filtered
216
+
217
+
218
def filter_episodes(
        raw_data,
        episode_data,
        length_val: str = "10s",
        intensity_val: int = 30,
        **kwargs):
    """
    Select episodes followed by a short interruption and extend them.

    Every episode except the last is a candidate (the last one has no
    following episode to measure an interruption against). A candidate is
    kept when both conditions hold:

    * the time from the end of the episode to the start of the next one is
      shorter than ``length_val``;
    * the maximum of ``raw_data`` between the start of the episode and the
      start of the next one (inclusive) is greater than ``intensity_val``.

    Each kept episode then receives the summed duration of all episodes
    in ``episode_data`` that start before the next kept episode.

    Parameters
    ----------
    raw_data : pd.Series
        Original activity data the episodes were derived from.
    episode_data : pd.Series
        Episode durations in seconds, indexed by episode start time.
    length_val : str, optional
        Interruption length to allow, as a timedelta string. Default "10s".
    intensity_val : int, optional
        Threshold compared against the raw data maximum. Default 30.
    **kwargs : dict
        Ignored; accepted so callers can forward shared keyword arguments.

    Returns
    -------
    pd.Series
        Index is the start of each kept episode, values are durations in
        seconds. Empty when no episode qualifies.
    """
    candidates = episode_data.iloc[:-1]
    if candidates.empty:
        # zero or one episode: there is no interruption to evaluate
        return candidates.copy()

    # find start of each episode and of the episode that follows it
    start_index = episode_data.index[:-1]
    start_index_next = episode_data.index[1:]

    # interruption is shorter than length_val when
    # duration + length_val exceeds the start-to-start distance
    time_between_episodes = np.asarray(
        (start_index_next - start_index).total_seconds())
    filter_length = pd.Timedelta(length_val).total_seconds()
    locations = (candidates.values + filter_length) > time_between_episodes

    # NOTE(review): the original description talks about interruptions
    # *below* the intensity value, but the comparison keeps windows whose
    # maximum is *above* it. Behaviour is left as found; confirm intent.
    max_mask = np.array(
        [raw_data.loc[x:y].max() > intensity_val
         for x, y in zip(start_index, start_index_next)],
        dtype=bool)

    # copy so the durations can be rewritten without touching episode_data
    episodes_filtered = candidates[locations & max_mask].copy()
    if episodes_filtered.empty:
        return episodes_filtered

    # Add the duration of skipped episodes to the kept episode before them
    start_list = episodes_filtered.index[:-1]
    end_list = episodes_filtered.index[1:] - pd.Timedelta("1s")
    new_durations = [episode_data.loc[x:y].sum()
                     for x, y in zip(start_list, end_list)]
    new_durations.append(episodes_filtered.iloc[-1])
    episodes_filtered.iloc[:] = new_durations

    return episodes_filtered
278
+
279
+
280
def episode_find_df(data,
                    LDR=-1,
                    remove_lights=True,
                    check_max=True,
                    *args,
                    **kwargs):
    """
    Find episodes in every column and collect them in one dataframe.

    ``_episode_finder`` is applied to each column in turn and the
    resulting series are concatenated side by side.

    Parameters
    ----------
    data : pd.DataFrame
        Activity data to find episodes in, one column per animal.
    LDR : int, optional
        Position of the light (LDR) column. Default is -1.
    remove_lights : bool, optional
        If True (default) the LDR column is left out of the episode search
        and its raw values are added back as the last column of the
        result, aligned on the episode start times.
    check_max : bool, optional
        If True (default) the result is passed to ``check_episode_max``.
        Should that fail, the last row is dropped and the check repeated;
        a second failure propagates as ValueError.
    *args : tuple
        Forwarded positionally to ``_episode_finder``.
    **kwargs : dict
        Forwarded to ``_episode_finder`` and to ``check_episode_max``.

    Returns
    -------
    pd.DataFrame
        Index holds episode start times, values hold durations in seconds.

    Notes
    -----
    ``check_episode_max`` skips the last column by default. When
    ``remove_lights`` is False that column is an ordinary data column.
    """
    animal_data = data
    if remove_lights:
        ldr_data = data.iloc[:, LDR].copy()
        ldr_label = data.columns[LDR]
        # Drop by position so the light trace is not searched for episodes
        # (its episode start times would otherwise add all-NaN rows).
        n_columns = data.shape[1]
        ldr_position = range(n_columns)[LDR]
        keep = [pos for pos in range(n_columns) if pos != ldr_position]
        animal_data = data.iloc[:, keep]

    # find episodes for each animal
    episode_series_list = [
        _episode_finder(animal_data.loc[:, col], *args, **kwargs)
        for col in animal_data
    ]
    episode_df = pd.concat(episode_series_list, axis=1)

    # put light column back in
    if remove_lights:
        episode_df[ldr_label] = ldr_data

    # check that we are getting reasonable episode lengths
    if check_max:
        try:
            check_episode_max(episode_df, **kwargs)
        except ValueError:
            # retry without the final row before giving up
            episode_df = episode_df.iloc[:-1, :]
            check_episode_max(episode_df, **kwargs)

    return episode_df
343
+
344
+
345
def check_episode_max(data,
                      max_time="6h",
                      LDR=-1,
                      **kwargs):
    """
    Raise ValueError when any episode is longer than ``max_time``.

    Parameters
    ----------
    data : pd.DataFrame
        Episode durations in seconds, one column per animal.
    max_time : str, optional
        Longest acceptable episode as a timedelta string. Default "6h".
    LDR : int, optional
        Position of the column to leave out of the check. Default -1.
    **kwargs : dict
        Ignored.

    Raises
    ------
    ValueError
        If the maximum of any checked column exceeds ``max_time``.
    """
    limit_seconds = pd.Timedelta(max_time).total_seconds()

    # per-column maxima, without the light column
    column_maxima = data.max().drop(data.columns[LDR])

    too_long = column_maxima > limit_seconds
    if too_long.any():
        raise ValueError("Max episode longer than %s" % max_time)
361
+
362
+ # Functions to plot histogram of data
363
+
364
+
365
def _deprec_episode_histogram(data,
                              fig=None,
                              ax=None,
                              LDR=-1,
                              convert=False,
                              log=True,
                              **kwargs):
    """
    Plot one histogram of episode durations per column (deprecated).

    Parameters
    ----------
    data : pd.DataFrame
        Episode durations; rows containing NaN are left out of the plot.
        The caller's dataframe is not modified.
    fig : matplotlib Figure, optional
        Figure to draw into. Created when neither ``fig`` nor ``ax`` is
        given.
    ax : Axes or sequence of Axes, optional
        Axes to draw on, one per column. Created when not given.
    LDR : int, optional
        Unused.
    convert : bool, optional
        If True the values are passed through ``convert_data_to_unit``
        before plotting. Default False.
    log : bool, optional
        If True (default) the y axis uses a log scale.
    **kwargs : dict
        Ignored.

    Returns
    -------
    None
    """
    # work on a cleaned copy instead of dropping rows in the caller's frame
    data = data.dropna()
    if convert:
        data = convert_data_to_unit(data)

    # create axes if not given
    if ax is None:
        no_animals = len(data.columns)
        if fig is None:
            fig, ax = plt.subplots(nrows=1,
                                   ncols=no_animals,
                                   sharex=True,
                                   sharey=True)
        else:
            ax = fig.subplots(nrows=1,
                              ncols=no_animals,
                              sharex=True,
                              sharey=True)

    # a single column yields a lone Axes object; make it iterable
    axes = np.atleast_1d(ax).ravel()

    # plot a histogram on each axis
    for axis, col in zip(axes, data.columns):
        axis.hist(data.loc[:, col])
        if log:
            axis.set_yscale('log')
        axis.set_title(col)
405
+
406
+
407
def episode_histogram(data_list,
                      LDR: int = -1,
                      logx: bool = True,
                      clip: bool = True,
                      **kwargs):
    """
    Plot a grid of episode-duration histograms.

    Each dataframe in ``data_list`` becomes one row of subplots and each
    of its columns (without the LDR column) one column of subplots.

    Parameters
    ----------
    data_list : list of pd.DataFrame
        Episode dataframes, one per condition. Every dataframe must have a
        ``name`` attribute, which is used as the row label.
    LDR : int, optional
        Position of the light column to drop from each dataframe.
    logx : bool, optional
        If True (default) the x axis uses a log scale.
    clip : bool, optional
        If True (default) and ``bins`` is given as a sequence of edges,
        values are clipped to ``[0, bins[-1]]`` before plotting.
    **kwargs : dict
        ``bins`` (default 10) and ``logy`` (default False) are passed to
        ``Axes.hist``.

    Returns
    -------
    tuple
        ``(fig, curr_axis, params_dict)`` where ``curr_axis`` is the last
        axis drawn on (None if nothing was drawn) and ``params_dict``
        holds default label and title strings.
    """
    # remove LDR from all dfs in the list and put in label
    tidied_data_list = [x.drop(x.columns[LDR], axis=1) for x in data_list]
    label_list = [x.name for x in data_list]
    no_animals = len(tidied_data_list[0].columns)
    no_conditions = len(tidied_data_list)

    # set some function constants
    bins = kwargs.get("bins", 10)
    logy = kwargs.get("logy", False)

    # Clipping needs explicit bin edges; an integer or string ``bins``
    # has no last edge to clip to.
    clip_max = None
    if clip and "bins" in kwargs and not np.isscalar(bins):
        clip_max = bins[-1]

    # squeeze=False keeps ``ax`` two-dimensional even for a single row or
    # column, so ax[row, col] is always valid.
    fig, ax = plt.subplots(nrows=no_conditions,
                           ncols=no_animals,
                           sharex=True,
                           sharey=True,
                           squeeze=False)

    curr_axis = None
    # each condition on a separate row, each animal on a column
    for row, (condition, plotting_df) in enumerate(
            zip(label_list, tidied_data_list)):
        for col_plot, col_label in enumerate(plotting_df):
            plotting_col = plotting_df.loc[:, col_label].dropna()
            if clip_max is not None:
                plotting_col = np.clip(plotting_col, 0, clip_max)
            curr_axis = ax[row, col_plot]
            curr_axis.hist(plotting_col,
                           bins=bins,
                           log=logy,
                           density=True)
            if row == 0:
                curr_axis.set_title(col_label)
            if col_plot == 0:
                curr_axis.set_ylabel(condition)
            if logx:
                curr_axis.set_xscale('log')

    # tidy up the subplots
    fig.subplots_adjust(hspace=0,
                        wspace=0)

    # set the default values
    params_dict = {
        "xlabel": "Episode Duration, seconds",
        "ylabel": "Normalised Density",
        "title": "Episode histogram"
    }

    return fig, curr_axis, params_dict
474
+
475
+
476
def convert_data_to_unit(data,
                         unit_time="1M"):
    """
    Convert values expressed in seconds to multiples of ``unit_time``.

    Parameters
    ----------
    data : pd.DataFrame or pd.Series
        Values in seconds. Not modified.
    unit_time : str or pandas.Timedelta, optional
        Length of the target unit. A string of the form "<number>M" is
        read as minutes, which is how older pandas releases parsed it;
        newer releases reject a bare "M" unit as ambiguous. Default "1M".

    Returns
    -------
    Same type as ``data``
        A new object holding ``data`` divided by the number of seconds in
        ``unit_time``.
    """
    if isinstance(unit_time, str):
        spec = unit_time.strip()
        magnitude = spec[:-1].strip()
        # keep the historical meaning of the "M" suffix (minutes)
        if spec.endswith("M") and magnitude.replace(".", "", 1).isdigit():
            unit_time = magnitude + "min"

    conversion_amount = pd.Timedelta(unit_time).total_seconds()
    return data / conversion_amount
489
+
490
+
491
def _stack_all_values(data):
    """
    Collapse all columns of ``data`` into one long single-column frame.

    The columns are stacked into the index and the third index level
    (position 2) is then dropped, so the input is expected to carry a
    two-level row index.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe to stack; must have a ``name`` attribute.

    Returns
    -------
    pd.DataFrame
        One column called "Sum of all animals", carrying the same
        ``name`` attribute as ``data``.
    """
    # long format: one row per (original row, original column) pair
    stacked = data.stack()
    # discard the level holding the original column labels
    flattened = stacked.droplevel(2)

    long_frame = pd.DataFrame(flattened)
    long_frame.columns = ["Sum of all animals"]
    long_frame.name = data.name

    return long_frame
circaPy/periodogram.py ADDED
@@ -0,0 +1,101 @@
1
+ import re
2
+ import pdb
3
+ import pandas as pd
4
+ import numpy as np
5
+ from astropy.timeseries import LombScargle
6
+ import circaPy.activity as act
7
+ import circaPy.preprocessing as prep
8
+
9
+
10
+ @prep.validate_input
11
def lomb_scargle_period(data, subject_no=0, low_period=20, high_period=30,
                        **kwargs):
    """
    Calculates the Lomb-Scargle periodogram for a single column in a DataFrame.

    Parameters
    ----------
    data : pd.DataFrame
        Input DataFrame with time-series data. The index represents time, and
        the columns contain observations.
    subject_no : int, optional
        The positional index of the column to analyze. Default is 0.
    low_period : float, optional
        The shortest period to search for, in hours. Default is 20.
    high_period : float, optional
        The longest period to search for, in hours. Default is 30.
    **kwargs : dict
        Ignored.

    Returns
    -------
    dict
        A dictionary with the following keys:
        - "Pmax" : float
            Maximum power from the Lomb-Scargle periodogram.
        - "Period" : float
            Period corresponding to the maximum power, in hours.
        - "Power_values" : pd.Series
            Power values for all test periods, indexed by period in hours.
        When the column is empty or all NaN the result is
        ``{"Pmax": 0, "Period": nan, "Power_values": <empty>}``; when the
        power calculation yields NaN, "Period" is 0 instead.

    Raises
    ------
    IndexError
        If `subject_no` is out of the valid range for the DataFrame columns.
    ValueError
        If `low_period` is greater than or equal to `high_period`.

    Notes
    -----
    - Observation times are taken from the index itself (seconds elapsed
      since the first timestamp), so no sampling frequency has to be
      inferred.
    - The power calculation may return NaN if the data is insufficient or
      contains NaNs.
    """
    # Ensure the positional index is valid
    n_columns = len(data.columns)
    if subject_no < 0 or subject_no >= n_columns:
        raise IndexError(
            f"Invalid subject_no {subject_no}. Must be between 0 and "
            f"{n_columns - 1}.")

    # Validate periods
    if low_period >= high_period:
        raise ValueError(f"low_period ({low_period}) must be less than "
                         f"high_period ({high_period}).")

    # Prepare observations; bail out before any time handling so that an
    # empty frame returns the documented result instead of raising
    observations = data.iloc[:, subject_no].values
    if observations.size == 0 or np.all(np.isnan(observations)):
        return {"Pmax": 0,
                "Period": np.nan,
                "Power_values": pd.Series(dtype=float)}

    # seconds elapsed since the first sample
    observation_times = np.asarray(
        (data.index - data.index[0]).total_seconds(), dtype=float)

    # Define the range of frequencies to search, in cycles per second
    low_freq = 1 / (high_period * 3600)
    high_freq = 1 / (low_period * 3600)
    freq = np.linspace(low_freq, high_freq, 10000)
    freq_hours = 1 / (freq * 3600)  # matching periods in hours

    # Calculate Lomb-Scargle periodogram
    power = LombScargle(
        observation_times,
        observations).power(
        freq,
        method='auto')

    # Handle cases where the power calculation fails
    if pd.isnull(power[0]):
        return {"Pmax": 0, "Period": 0, "Power_values": pd.Series(dtype=float)}

    # Maximum power and its corresponding period in hours
    pmax = power.max()
    best_period = freq_hours[np.argmax(power)]

    # Create a power series for the output
    power_values = pd.Series(
        power, index=freq_hours).sort_index()

    return {"Pmax": pmax, "Period": best_period, "Power_values": power_values}