pyCLINE-0.1.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
+ """
+ recovery_methods: Methods for recovering nullclines.
+
+ Modules:
+     data_preparation: Functions for preparing data.
+     nn_training: Functions for configuring and training neural network models.
+ """
+
+ from .data_preparation import prepare_data
+ from .nn_training import configure_FFNN_model, train_FFNN_model
+
+ from . import nn_training
+ from . import data_preparation
+
+ __all__ = ['nn_training', 'data_preparation', 'prepare_data', 'configure_FFNN_model', 'train_FFNN_model']
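The `__init__.py` above re-exports the main entry points of the subpackage. A minimal import sketch, assuming the subpackage ships inside the pyCLINE wheel and is importable as `pyCLINE.recovery_methods` (the parent package path is not shown in this diff):

    # Hypothetical import path; adjust to the actual package layout.
    from pyCLINE.recovery_methods import (
        prepare_data,           # data preparation for the ML model
        configure_FFNN_model,   # build the feed-forward neural network
        train_FFNN_model,       # train the configured model
    )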
@@ -0,0 +1,394 @@
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import pandas as pd
+ from scipy.signal import find_peaks
+
+ def normalize_variables(df, vars, time, norm_method='minmax', value_max=1.0, value_min=0.0):
+     """
+     The function normalizes the variables from the list 'vars' in the dataframe 'df',
+     either by min-max normalization into the range [value_min, value_max] (default [0, 1])
+     or by z-score normalization.
+
+     It returns the dataframe containing the normalized variables 'df_norm'
+     and the dataframe containing the normalization coefficients per variable 'df_coef'.
+
+     Args:
+         df (pandas DataFrame): dataframe containing variables
+         vars (list): list of the variables to normalize
+         time (str): name of the time column
+         norm_method (str, optional): normalization method, 'minmax', 'zscore', or 'None'. Defaults to 'minmax'.
+         value_max (float, optional): Maximal value for min-max normalization. Defaults to 1.0.
+         value_min (float, optional): Minimal value for min-max normalization. Defaults to 0.0.
+
+     Returns:
+         df_norm (pandas DataFrame): dataframe containing normalized variables
+         df_coef (pandas DataFrame): dataframe containing minimal and maximal values per variable
+     """
+     # creating a new dataframe for normalized data and copying the time column
+     df_norm = pd.DataFrame()
+     df_norm[time] = df[time]
+     # creating a dataframe to store the normalization coefficients (min and max of the raw variables)
+     df_coef = pd.DataFrame(index=['min', 'max'])
+     if norm_method != 'None':
+         # normalizing variables
+         for var in vars:
+             if norm_method == 'minmax':
+                 df_coef[var] = [df[var].min(), df[var].max()]
+
+                 df_norm['norm {:}'.format(var)] = (df[var] - df_coef[var]['min'])*(value_max-value_min)/(df_coef[var]['max'] - df_coef[var]['min']) + value_min
+             if norm_method == 'zscore':
+                 # the coefficient dataframe uses the ['min', 'max'] index, so the mean is
+                 # stored under 'min' and the standard deviation under 'max'
+                 df_coef[var] = [df[var].mean(), df[var].std()]
+                 df_norm['norm {:}'.format(var)] = (df[var] - df[var].mean())/df[var].std()
+
+     return df_norm, df_coef
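A minimal usage sketch for `normalize_variables`, assuming the function is importable from this module (e.g. via `pyCLINE.recovery_methods.data_preparation`; the exact path is not shown in the diff). The column names and data below are illustrative:

    import numpy as np
    import pandas as pd

    # Illustrative data: a damped oscillation on a uniform time grid.
    t = np.linspace(0, 10, 200)
    df = pd.DataFrame({'t': t, 'u': np.exp(-0.1 * t) * np.sin(2 * np.pi * t)})

    df_norm, df_coef = normalize_variables(df, vars=['u'], time='t', norm_method='minmax')
    print(df_norm.columns.tolist())   # ['t', 'norm u']
    print(df_coef['u'])               # raw minimum and maximum of 'u'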
+
+ def plot_optimal_thresholding(thresholds, nsamples, varsamples, optimal_threshold, idx, histogram):
+     """
+     When a system exhibits a non-uniform distribution of data points in the phase space,
+     the optimal thresholding can be used to sample the data uniformly.
+     This function plots the normalized sample size, normalized sampling standard deviation,
+     and the ratio of the sample size to the sampling standard deviation as a function of the threshold.
+
+     Args:
+         thresholds (numpy array): array of threshold values
+         nsamples (numpy array): array of normalized sample sizes
+         varsamples (numpy array): array of normalized sampling standard deviations
+         optimal_threshold (int): optimal threshold value
+         idx (int): index of the optimal threshold value
+         histogram (numpy histogram object): 2D histogram of the phase space
+     """
+     fig, axes = plt.subplots(1,2, figsize=(11,5))
+     axes[0].plot(thresholds, nsamples, label='norm. sample size')
+     axes[0].plot(thresholds, varsamples, label='norm. sampling SD', c='C1')
+     axtwinx = axes[0].twinx()
+     axtwinx.plot(thresholds, nsamples/varsamples, label='sample size/SD', c='C2')
+     axtwinx.scatter(optimal_threshold, (nsamples/varsamples)[idx], c='r', s=50, label='optimal threshold, {:d}'.format(int(optimal_threshold)))
+
+     axes[0].legend()
+     axtwinx.legend()
+     axes[0].set_xlabel('sampling threshold')
+     axes[0].set_ylabel('norm sample size, sampling SD')
+     axtwinx.set_ylabel('sample size/SD')
+
+     hthresh = np.copy(histogram)
+     hthresh[hthresh>optimal_threshold] = optimal_threshold
+
+     axes[1].imshow(hthresh)
+
+     plt.show()
+
+ def compute_optimal_threshold(df, vars, binx, biny, plot_thresholding=True):
+     """
+     When a system exhibits a non-uniform distribution of data points in the phase space,
+     the optimal thresholding can be used to sample the data uniformly.
+     This function computes the optimal thresholding value based on the normalized sample size,
+     normalized sampling standard deviation, and the ratio of the sample size to the sampling standard deviation.
+
+     Args:
+         df (pandas dataframe): dataframe containing variables
+         vars (list): list of the variables to generate the optimal threshold
+         binx (list): list of bins for the first variable
+         biny (list): list of bins for the second variable
+         plot_thresholding (bool, optional): If the thresholded phase space between the first and second
+             variable should be plotted. Defaults to True.
+
+     Returns:
+         optimal_threshold (int): optimal threshold value
+     """
+     h, _, _ = np.histogram2d(df[vars[0]], df[vars[1]], bins=(binx, biny))
+
+     nthresh = np.arange(np.min(h[h>0]),np.max(h[h>0]),50)
+     nsamples = np.zeros(nthresh.shape[0])
+     varsamples = np.zeros(nthresh.shape[0])
+
+     for i in range(nthresh.shape[0]):
+
+         hthresh = np.copy(h)
+         hthresh[hthresh>nthresh[i]] = nthresh[i]
+         nsamples[i] = np.sum(hthresh)
+         varsamples[i] = np.std(hthresh[hthresh>0])
+
+     if np.max(nsamples) != np.min(nsamples):
+         nsamples = (nsamples - np.min(nsamples))/(np.max(nsamples) - np.min(nsamples)) + 1
+         varsamples = (varsamples - np.min(varsamples))/(np.max(varsamples) - np.min(varsamples)) + 1
+
+         idx = np.argmax(nsamples/varsamples)
+         optimal_threshold = nthresh[idx]
+     else:
+         # all candidate thresholds give the same sample size; fall back to the largest one
+         idx = nthresh.shape[0] - 1
+         optimal_threshold = np.max(nthresh)
+
+     if plot_thresholding:
+         plot_optimal_thresholding(nthresh, nsamples, varsamples, optimal_threshold, idx, h)
+
+     return int(optimal_threshold)
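A sketch of how `compute_optimal_threshold` might be called. The skewed random data, bin edges, and column names below are illustrative, chosen only to produce a non-uniform phase-space density:

    import numpy as np
    import pandas as pd

    # Illustrative phase-space samples with many points near the origin.
    rng = np.random.default_rng(0)
    df_phase = pd.DataFrame({'norm u': rng.beta(0.5, 2.0, 20000),
                             'norm v': rng.beta(0.5, 2.0, 20000)})

    binx = np.linspace(0.0, 1.0, 11)
    biny = np.linspace(0.0, 1.0, 11)
    threshold = compute_optimal_threshold(df_phase, ['norm u', 'norm v'],
                                          binx, biny, plot_thresholding=False)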
+
+ def uniform_sampling(df, threshold, input_vars, binx, biny):
+     """
+     The function samples the data uniformly in the phase space defined by the input variables
+     based on the provided threshold value.
+
+     Args:
+         df (pandas dataframe): dataframe containing variables
+         threshold (int): threshold value for the uniform sampling
+         input_vars (list): list of the input variables
+         binx (list): list of bins for the first variable
+         biny (list): list of bins for the second variable
+
+     Returns:
+         df_uniform (pandas dataframe): dataframe containing uniformly sampled data
+     """
+     df_uniform = pd.DataFrame()
+
+     # start at 1 so that (binx[i-1], binx[i]) and (biny[j-1], biny[j]) always form valid bin intervals
+     for i in range(1,binx.shape[0]):
+         for j in range(1,biny.shape[0]):
+
+             df_subsample = df[(df[input_vars[0]]>binx[i-1]) & (df[input_vars[0]]<binx[i]) &
+                               (df[input_vars[1]]>biny[j-1]) & (df[input_vars[1]]<biny[j])].copy()
+             if df_subsample.shape[0]>0:
+                 if df_subsample.shape[0]<=threshold:
+                     df_uniform = pd.concat((df_uniform,df_subsample), ignore_index=True)
+                 else:
+                     df_uniform = pd.concat((df_uniform,df_subsample[:threshold]), ignore_index=True)
+
+     return df_uniform
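Continuing the sketch above, the returned threshold can then be used to cap the number of points per bin:

    # Uses df_phase, threshold, binx, biny from the compute_optimal_threshold sketch.
    df_uniform = uniform_sampling(df_phase, threshold, ['norm u', 'norm v'], binx, biny)
    print(df_phase.shape[0], '->', df_uniform.shape[0])   # fewer rows once dense bins are capped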
+
+ # data preparation
+ def prepare_data(df, vars, time, tmin=None, tmax=None, scheme='newton_difference', norm_method='minmax',value_min=0.0, value_max=1.0, normalize=True):
+     """
+     The function prepares the raw time series of the system variables for feeding into the ML model.
+     Preparation includes the following steps:
+     (i) slicing the data to the indicated range [tmin:tmax], [:tmax], or [tmin:] (optional);
+         if tmin and tmax are not provided, the full data set is processed,
+     (ii) min-max normalization of the variables from the list 'vars',
+     (iii) computing a delayed [t-1] variable (or the variant required by the chosen scheme) for each variable from the list.
+     The function returns the prepared dataframe 'df_prepared' and the dataframe containing the normalization coefficients per variable 'df_coef'.
+
+     Args:
+         df (pandas dataframe): dataframe containing variables
+         vars (list): list of the variables to normalize
+         time (str): name of the time column
+         tmin (float, optional): Minimal time of the time slice of data. Defaults to None.
+         tmax (float, optional): Maximal time of the time slice of data. Defaults to None.
+         scheme (str, optional): Scheme for computing the second input variable. Defaults to 'newton_difference'.
+         norm_method (str, optional): Normalization method. Defaults to 'minmax'.
+         value_min (float, optional): Minimal value for minmax normalization. Defaults to 0.0.
+         value_max (float, optional): Maximal value for minmax normalization. Defaults to 1.0.
+         normalize (bool, optional): If data should be normalized. Defaults to True.
+
+     Returns:
+         df_prepared (pandas dataframe): dataframe containing prepared data for feeding into the ML model
+         df_coef (pandas dataframe): dataframe containing normalization coefficients (min and max values) per variable
+     """
+
+     if not isinstance(df, pd.DataFrame):
+         raise ValueError("Input data is not a pandas DataFrame")
+     if not isinstance(vars, list):
+         raise ValueError("Variables should be provided as a list")
+     if not isinstance(time, str):
+         raise ValueError("Time column name of pandas DataFrame should be provided as a string")
+     if not isinstance(tmin, (int, float)) and tmin is not None:
+         raise ValueError("tmin should be a float or an integer, or None")
+     if not isinstance(tmax, (int, float)) and tmax is not None:
+         raise ValueError("tmax should be a float or an integer, or None")
+     if not isinstance(scheme, str):
+         raise ValueError("Scheme should be provided as a string")
+     if not isinstance(norm_method, str):
+         raise ValueError("Normalization method should be provided as a string")
+     if not isinstance(value_min, (int, float)):
+         raise ValueError("Minimal value for minmax normalization should be a float or an integer")
+     if not isinstance(value_max, (int, float)):
+         raise ValueError("Maximal value for minmax normalization should be a float or an integer")
+     if not isinstance(normalize, bool):
+         raise ValueError("Normalize variable should be a boolean value")
+     if not all([var in df.columns for var in vars]):
+         raise ValueError("Variables should be present in the provided dataframe")
+     if not time in df.columns:
+         raise ValueError("Time column should be present in the provided dataframe")
+     if scheme not in ['newton_difference', 'two_point', 'five_point', 'derivative']:
+         raise ValueError("Unknown scheme: {:}".format(scheme))
+
+     # slice the data in the range [tmin; tmax] if needed
+     if ((tmin is None) or (tmax is None)):
+         if tmin is None:
+             tmin = df[time].min()
+         if tmax is None:
+             tmax = df[time].max()
+     df_slice = df[(df[time]>=tmin) & (df[time]<=tmax)].copy()
+
+     # min-max normalization of each variable in the range [value_min; value_max]
+     if normalize:
+         df_norm, df_coef = normalize_variables(df_slice, vars=vars, time=time, norm_method=norm_method,value_max=value_max,value_min=value_min)
+     else:
+         df_norm = df_slice.copy()
+         df_coef = pd.DataFrame(index=['min', 'max'])
+         for var in vars:
+             df_norm['norm {:}'.format(var)] = df_norm[var]
+             df_coef[var] = [df[var].min(), df[var].max()]
+     # computing delayed variables
+     df_prepared = pd.DataFrame()
+
+     if scheme == 'newton_difference':
+         first_point = 1
+     if scheme == 'two_point':
+         first_point = 2
+     if scheme == 'five_point':
+         first_point = 4
+     if scheme == 'derivative':
+         first_point = 0
+     df_prepared[time] = df_norm[time].iloc[first_point:]
+     for var in vars:
+         # df_prepared['norm {:}'.format(var)] = df_norm['norm {:}'.format(var)].to_numpy()[first_point:]
+         if scheme=='newton_difference':
+             df_prepared['norm {:}'.format(var)] = df_norm['norm {:}'.format(var)].to_numpy()[first_point:]
+             df_prepared['norm {:}'.format(var)+'[t-1]'] = df_norm['norm {:}'.format(var)].to_numpy()[:-first_point]
+         if scheme=='two_point':
+             df_prepared['norm {:}'.format(var)] = df_norm['norm {:}'.format(var)].to_numpy()[first_point-1:-first_point+1]
+             df_prepared['norm {:}'.format(var)+'[t+1]'] = df_norm['norm {:}'.format(var)].to_numpy()[first_point:]
+             df_prepared['norm {:}'.format(var)+'[t-1]'] = df_norm['norm {:}'.format(var)].to_numpy()[first_point-2:-first_point]
+         if scheme=='five_point':
+             df_prepared['norm {:}'.format(var)] = df_norm['norm {:}'.format(var)].to_numpy()[first_point-2:-first_point+2]
+             df_prepared['norm {:}'.format(var)+'[t+2]'] = df_norm['norm {:}'.format(var)].to_numpy()[first_point:]
+             df_prepared['norm {:}'.format(var)+'[t+1]'] = df_norm['norm {:}'.format(var)].to_numpy()[first_point-1:-first_point+3]
+             df_prepared['norm {:}'.format(var)+'[t-1]'] = df_norm['norm {:}'.format(var)].to_numpy()[first_point-3:-first_point+1]
+             df_prepared['norm {:}'.format(var)+'[t-2]'] = df_norm['norm {:}'.format(var)].to_numpy()[:-first_point]
+         if scheme=='derivative':
+             dt = df[time].iloc[1] - df[time].iloc[0]
+             df_prepared['norm {:}'.format(var)] = df_norm['norm {:}'.format(var)].to_numpy()[first_point:]
+
+             x = df_norm['norm {:}'.format(var)].to_numpy()[first_point:]
+             x_dot = np.ones(x.shape[0])*np.nan
+             # one-sided forward and backward differences
+             x_dot_forward = np.array([(x[i+1]-x[i])/dt for i in range(x.shape[0]-1)])
+             x_dot_backward = np.array([(x[i]-x[i-1])/dt for i in range(1, x.shape[0])])
+             # central difference as the average of the one-sided differences;
+             # the first and last points have no central difference and remain NaN
+             x_dot[1:-1] = (x_dot_forward[1:] + x_dot_backward[:-1])/2
+
+             df_prepared['d norm{:}'.format(var)+'/dt'] = x_dot
+
+     return df_prepared.dropna(), df_coef
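A minimal end-to-end sketch of `prepare_data` on synthetic two-variable time-series data (the column names and the sine/cosine signals are illustrative; the default 'newton_difference' scheme is used):

    import numpy as np
    import pandas as pd

    # Illustrative two-variable time series on a uniform time grid.
    t = np.linspace(0, 50, 5000)
    df_raw = pd.DataFrame({'time': t, 'u': np.sin(t), 'v': np.cos(t)})

    df_prepared, df_coef = prepare_data(df_raw, vars=['u', 'v'], time='time',
                                        scheme='newton_difference',
                                        norm_method='minmax', normalize=True)
    # With 'newton_difference', each variable gets a delayed copy:
    print(df_prepared.columns.tolist())
    # ['time', 'norm u', 'norm u[t-1]', 'norm v', 'norm v[t-1]']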
+
+ def shuffle_and_split(df, input_vars, target_var, train_frac=0.7, test_frac=0.15, optimal_thresholding=True, plot_thresholding=True):
+     """
+     The function prepares training, testing, and validation sets from the prepared dataframe
+     by randomly shuffling the data and splitting it according to the provided proportions.
+
+     The function returns the respective training, testing, and validation datasets.
+
+     Args:
+         df (pandas dataframe): dataframe containing prepared data for feeding into the ML model
+         input_vars (list): list of the input variables
+         target_var (list): list of the target variable(s)
+         train_frac (float, optional): fraction of training data. Defaults to 0.7.
+         test_frac (float, optional): fraction of testing data. Defaults to 0.15.
+         optimal_thresholding (bool, optional): If the optimal thresholding should be used. Defaults to True.
+         plot_thresholding (bool, optional): If the thresholded phase space should be plotted. Defaults to True.
+
+     Returns:
+         input_train (pandas dataframe): dataframe containing input variables for training
+         target_train (pandas dataframe): dataframe containing target variables for training
+         input_test (pandas dataframe): dataframe containing input variables for testing
+         target_test (pandas dataframe): dataframe containing target variables for testing
+         input_val (pandas dataframe): dataframe containing input variables for validation
+         target_val (pandas dataframe): dataframe containing target variables for validation
+     """
+
+     if not isinstance(df, pd.DataFrame):
+         raise ValueError("Input data is not a pandas DataFrame")
+     if not isinstance(input_vars, list):
+         raise ValueError("Input variables should be provided as a list")
+     if not isinstance(target_var, list):
+         raise ValueError("Target variables should be provided as a list")
+     if not isinstance(train_frac, float):
+         raise ValueError("Training fraction should be a float")
+     if not isinstance(test_frac, float):
+         raise ValueError("Testing fraction should be a float")
+     if train_frac + test_frac > 1:
+         raise ValueError("Training and testing fractions should sum to at most 1")
+     if train_frac < test_frac:
+         raise ValueError("Training fraction should be greater than testing fraction")
+     if not isinstance(optimal_thresholding, bool):
+         raise ValueError("Optimal thresholding should be a boolean value")
+     if not isinstance(plot_thresholding, bool):
+         raise ValueError("Plot thresholding should be a boolean value")
+     if not all([var in df.columns for var in input_vars]):
+         raise ValueError("Input variables should be present in the provided dataframe")
+     if not all([var in df.columns for var in target_var]):
+         raise ValueError("Target variables should be present in the provided dataframe")
+
+     # data shuffling
+     df_shuffled = df.sample(frac = 1)
+
+     # uniform data sampling in the phase space
+     if optimal_thresholding:
+         binx = np.linspace(df_shuffled[input_vars[0]].min(), df_shuffled[input_vars[0]].max(), 11)
+         biny = np.linspace(df_shuffled[target_var[0]].min(), df_shuffled[target_var[0]].max(), 11)
+
+         optimal_threshold = compute_optimal_threshold(df_shuffled, [input_vars[0], target_var[0]], binx, biny, plot_thresholding)
+
+         df_shuffled = uniform_sampling(df_shuffled, optimal_threshold, [input_vars[0], target_var[0]], binx, biny)
+         df_shuffled = df_shuffled.sample(frac = 1)
+
+     # computing the sizes of the training and testing datasets based on the provided fractions
+     Ntrain = int(df_shuffled.shape[0]*train_frac)
+     Ntest = int(df_shuffled.shape[0]*test_frac)
+
+     # splitting the data
+     input_train = df_shuffled[input_vars].iloc[:Ntrain]
+     target_train = df_shuffled[target_var].iloc[:Ntrain]
+
+     input_test = df_shuffled[input_vars].iloc[Ntrain:Ntrain+Ntest]
+     target_test = df_shuffled[target_var].iloc[Ntrain:Ntrain+Ntest]
+
+     input_val = df_shuffled[input_vars].iloc[Ntrain+Ntest:]
+     target_val = df_shuffled[target_var].iloc[Ntrain+Ntest:]
+
+     return input_train, target_train, input_test, target_test, input_val, target_val
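Continuing from the `prepare_data` sketch above, a possible split into training, test, and validation sets (the choice of input and target columns is illustrative; `plot_thresholding=False` suppresses the diagnostic figure):

    input_vars = ['norm u', 'norm u[t-1]']   # illustrative model inputs
    target_vars = ['norm v']                 # illustrative target

    (input_train, target_train,
     input_test, target_test,
     input_val, target_val) = shuffle_and_split(df_prepared, input_vars, target_vars,
                                                train_frac=0.7, test_frac=0.15,
                                                optimal_thresholding=True,
                                                plot_thresholding=False)
    print(len(input_train), len(input_test), len(input_val))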
+
+ def normalize_adjusted(x, df_coef, var, min=-1, max=1):
+     return (x - df_coef[var].min())*(max-min) / (df_coef[var].max() - df_coef[var].min())+min
+
+
+ #### General functions
+ def calculate_period(x_train, t_train):
+     """
+     Calculate the period of an oscillation.
+
+     Parameters
+     ----------
+     x_train : np.array
+         Time series of data.
+     t_train : np.array
+         Time of the time series.
+
+     Returns
+     -------
+     peaks_u : np.array
+         Size of peaks detected in the time series.
+     peaks_t : np.array
+         Time points where peaks occur in the data set.
+     period : float
+         Period of the time series.
+     peaks[0] : np.array
+         Indices of all local maxima.
+
+     """
+     if len(x_train.shape)<2:
+         peaks=find_peaks(x_train)
+         peaks_t=t_train[peaks[0]]
+         peaks_u=x_train[peaks[0]]
+     elif x_train.shape[1]==2 or x_train.shape[1]==3:
+         peaks=find_peaks(x_train[:,0])
+         peaks_t=t_train[peaks[0]]
+         peaks_u=x_train[peaks[0],0]
+     elif x_train.shape[1]==5:
+         peaks=find_peaks(x_train[:,1])
+         peaks_t=t_train[peaks[0]]
+         peaks_u=x_train[peaks[0],1]
+
+     period_temp=0
+     for i in range(len(peaks_t)-1):
+         period_temp=period_temp+(peaks_t[i+1]-peaks_t[i])
+     if len(peaks_t)>1:
+         subtract=1
+     else:
+         subtract=0
+     period=period_temp/(len(peaks_t)-subtract)
+     return peaks_u, peaks_t, period, peaks[0]
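A short sketch of `calculate_period` on a single synthetic oscillatory trace (1-D input, so the first branch of the shape check applies; the signal and its period are illustrative):

    import numpy as np

    t = np.linspace(0, 100, 10000)
    x = np.sin(2 * np.pi * t / 5.0)           # oscillation with period 5

    peaks_u, peaks_t, period, peak_idx = calculate_period(x, t)
    print(round(period, 2))                   # approximately 5.0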