PyEvoMotion 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,406 @@
1
from typing import Any

import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
from scipy.stats import f as snedecor_f
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
7
+
8
+
9
+ class PyEvoMotionBase():
10
+ """
11
+ Base class for the ``PyEvoMotion`` project.
12
+
13
+ This class contains no data and is meant to be used as a mixin (provides utility methods for the project). It is inherited by :class:`PyEvoMotion`.
14
+ """
15
+
16
+ @staticmethod
17
+ def count_prefixes(prefix: str, mutations: list[str]) -> int:
18
+ """
19
+ Count the number of mutations that start with a specific prefix.
20
+
21
+ :param prefix: The prefix to count. It must be a single character.
22
+ :type prefix: str
23
+ :param mutations: The list of mutations where to count the prefix.
24
+ :type mutations: list[str]
25
+ :return: The number of mutations that start with the prefix.
26
+ :rtype: ``int``
27
+ """
28
+ return len(list(filter(
29
+ lambda x: x.startswith(prefix),
30
+ mutations
31
+ )))
32
+
33
+ @staticmethod
34
+ def mutation_length_modification(mutation:str) -> int:
35
+ """
36
+ Get the length modification induced by a mutation.
37
+
38
+ :param mutation: The mutation whose length modification to get.
39
+ :type mutation: str
40
+ :return: The length modification induced by the mutation.
41
+ :rtype: ``int``
42
+ :raises ValueError: If the mutation is not one of ``s``, ``i`` or ``d``.
43
+ """
44
+
45
+ if mutation.startswith("s"): return 0
46
+ else: _len = len(mutation.split("_")[-1])
47
+
48
+ if mutation.startswith("i"): return _len
49
+ elif mutation.startswith("d"): return -_len
50
+
51
+ raise ValueError(f"Mutation not recognized: {mutation}")
52
+
53
+ @staticmethod
54
+ def date_grouper(df: pd.DataFrame, DT: str, origin: str) -> pd.core.groupby.generic.DataFrameGroupBy:
55
+ """
56
+ Create grouped dataframe based on a ``datetime`` frequency.
57
+
58
+ :param df: The dataframe to group. It must have a ``date`` column.
59
+ :type df: pd.DataFrame
60
+ :param DT: The string datetime that will govern the grouping.
61
+ :type DT: str
62
+ :param origin: The string datetime that will be the origin of the grouping frequency.
63
+ :type origin: str
64
+ :return grouped: The dataset's corresponding pandas groupby object.
65
+ :rtype: ``pd.core.groupby.generic.DataFrameGroupBy``
66
+ """
67
+
68
+ return df.groupby(
69
+ pd.Grouper(
70
+ key="date",
71
+ axis=0,
72
+ freq=DT,
73
+ origin=origin
74
+ )
75
+ )
76
+
77
+ @staticmethod
78
+ def _invoke_method(
79
+ instance: object,
80
+ method: str,
81
+ *args: any,
82
+ **kwargs: dict[str, any]
83
+ ) -> any:
84
+ """
85
+ General method to invoke another method from a class instance.
86
+
87
+ :param instance: the instance to invoke the method from.
88
+ :type instance: object
89
+ :param method: the method to invoke.
90
+ :type method: str
91
+ :param args: the arguments to pass to the method.
92
+ :type args: any
93
+ :param kwargs: the keyword arguments to pass to the method.
94
+ :type kwargs: dict[str, any]
95
+ :return: the result of the method.
96
+ :rtype: any
97
+ """
98
+
99
+ try:
100
+ return getattr(instance, method)(*args, **kwargs)
101
+ except AttributeError:
102
+ print(f"Method {method} not found in {instance}")
103
+
104
+ @staticmethod
105
+ def _remove_nan(x: pd.Series, y: pd.Series) -> tuple[np.ndarray, np.ndarray]:
106
+ """
107
+ Remove NaN values from two pandas Series and return them as numpy arrays.
108
+
109
+ :param x: the first pandas Series.
110
+ :type x: pd.Series
111
+ :param y: the second pandas Series.
112
+ :type y: pd.Series
113
+ :return: a tuple with the two pandas Series without NaN values.
114
+ :rtype: tuple[np.ndarray,np.ndarray]
115
+ """
116
+
117
+ data = pd.DataFrame({"x": x, "y": y}).dropna()
118
+
119
+ x = data["x"].to_numpy().reshape(-1, 1)
120
+ y = data["y"].to_numpy().reshape(-1, 1)
121
+
122
+ return x, y
123
+
124
+ @classmethod
125
+ def linear_regression(cls,
126
+ x: np.ndarray,
127
+ y: np.ndarray,
128
+ fit_intercept=True
129
+ ) -> dict[str, any]:
130
+ """
131
+ Perform a linear regression on a set of data.
132
+
133
+ :param x: A numpy array of the features.
134
+ :type x: np.ndarray
135
+ :param y: A numpy array of the target.
136
+ :type y: np.ndarray
137
+ :param fit_intercept: Whether to fit the intercept. Default is ``True``.
138
+ :type fit_intercept: bool
139
+ :return: A dictionary containing:
140
+
141
+ * ``model``: A ``lambda`` function that computes predictions based on the fitted model.
142
+ * ``parameters``: A dictionary with the slope of the regression line.
143
+ * ``expression``: A string representation of the regression equation.
144
+ * ``r2``: The :math:`R^2` score of the regression.
145
+ :rtype: ``dict[str, any]``
146
+ """
147
+
148
+ reg = LinearRegression(fit_intercept=fit_intercept).fit(x,y)
149
+
150
+ if fit_intercept:
151
+ model = {
152
+ "model": lambda x: reg.coef_[0][0]*x + reg.intercept_[0],
153
+ "parameters": {
154
+ "m": reg.coef_[0][0],
155
+ "b": reg.intercept_[0]
156
+ },
157
+ "expression": "mx + b"
158
+ }
159
+
160
+ else:
161
+ model = {
162
+ "model": lambda x: reg.coef_[0][0]*x,
163
+ "parameters": {
164
+ "m": reg.coef_[0][0],
165
+ },
166
+ "expression": "mx"
167
+ }
168
+
169
+ model["r2"] = r2_score(y, reg.predict(x))
170
+
171
+ return model
172
+
173
+ @staticmethod
174
+ def _power_law(
175
+ x: np.ndarray | int | float,
176
+ a: int | float,
177
+ b: int | float
178
+ ) -> np.ndarray | int | float:
179
+ """
180
+ Power law function.
181
+
182
+ :param x: the input.
183
+ :type x: np.ndarray | int | float
184
+ :param a: the coefficient.
185
+ :type a: int | float
186
+ :param b: the exponent.
187
+ :type b: int | float
188
+ :return: the result of the power law.
189
+ :rtype: np.ndarray | int | float
190
+ """
191
+
192
+ return a*np.power(x, b)
193
+
194
+ @classmethod
195
+ def power_law_fit(cls, x: np.ndarray, y: np.ndarray) -> dict[str, any]:
196
+ """
197
+ Perform a power law fit on a set of data.
198
+
199
+ :param x: A numpy array of the features.
200
+ :type x: np.ndarray
201
+ :param y: A numpy array of the target.
202
+ :type y: np.ndarray
203
+ :return: A dictionary containing:
204
+
205
+ * ``model``: A ``lambda`` function that computes predictions based on the fitted model.
206
+ * ``parameters``: A dictionary with the parameters of the fitted power law.
207
+ * ``expression``: A string representation of the regression equation.
208
+ * ``r2``: The :math:`R^2` score of the regression.
209
+ :rtype: ``dict[str, any]``
210
+ """
211
+
212
+ try:
213
+ _popt, _, _, _msg, _ier = curve_fit(
214
+ cls._power_law,
215
+ x.T.tolist()[0], y.T.tolist()[0],
216
+ full_output=True
217
+ )
218
+ except RuntimeError as e:
219
+ _ier = 0
220
+ _msg = str(e)
221
+
222
+ if _ier not in range(1, 5):
223
+ print(f"{_msg}")
224
+ _popt = [0, 0]
225
+
226
+ model = {
227
+ "model": lambda x: _popt[0]*np.power(x, _popt[1]),
228
+ "parameters": {
229
+ "d": _popt[0],
230
+ "alpha": _popt[1]
231
+ },
232
+ "expression": "d*x^alpha",
233
+ "r2": r2_score(y, cls._power_law(x, *_popt))
234
+ }
235
+
236
+ return model
237
+
238
+ @staticmethod
239
+ def F_test(
240
+ model1: dict[str,any],
241
+ model2: dict[str,any],
242
+ data: np.ndarray
243
+ ) -> tuple[float, float]:
244
+ """
245
+ Perform an F-test between two models.
246
+
247
+ See https://en.wikipedia.org/wiki/F-test#Regression_problems for more details.
248
+
249
+ :param model1: The first model.
250
+ :type model1: dict[str, any]
251
+ :param model2: The second model.
252
+ :type model2: dict[str, any]
253
+ :param data: The data to test the models.
254
+ :type data: np.ndarray
255
+ :return: A tuple with the F-value and the p-value.
256
+ :rtype: ``tuple[float, float]``
257
+ """
258
+
259
+ data = data.flatten()
260
+
261
+ # Note that p1 < p2 always. Won't do an assertion because I'm making sure elsewhere that the linear model does not have an intercept, i.e. it only has the slope
262
+ p1 = len(model1["parameters"])
263
+ p2 = len(model2["parameters"])
264
+ n = len(data)
265
+
266
+ model1 = np.vectorize(model1["model"])
267
+ model2 = np.vectorize(model2["model"])
268
+
269
+ RS1 = (data - model1(range(n)))**2
270
+ RS2 = (data - model2(range(n)))**2
271
+
272
+ # Mask the infinite and nan values
273
+ mask = (
274
+ np.isinf(RS1)
275
+ | np.isinf(RS2)
276
+ | np.isnan(RS1)
277
+ | np.isnan(RS2)
278
+ )
279
+
280
+ # Sum the residuals without the infinite values
281
+ RSS1 = RS1.sum(where=~mask)
282
+ RSS2 = RS2.sum(where=~mask)
283
+
284
+ F = ((RSS1 - RSS2)/(p2 - p1))/(RSS2/(n - p2))
285
+
286
+ return F, 1 - snedecor_f.cdf(F, p2 - p1, n - p2)
287
+
288
+ @classmethod
289
+ def adjust_model(cls,
290
+ x: pd.Series,
291
+ y: pd.Series,
292
+ name: str = None
293
+ ) -> dict[str, any]:
294
+ """Adjust a model to the data.
295
+
296
+ :param x: The features. It is a single pandas Series.
297
+ :type x: pd.Series
298
+ :param y: The target. It is a single pandas Series.
299
+ :type y: pd.Series
300
+ :param name: The name of the data. Default is ``None``.
301
+ :type name: str
302
+ :return: A dictionary with the model.
303
+ :rtype: ``dict[str, any]``
304
+ :raises ValueError: If the dataset is empty or full of NaN values. This may occur if the grouped data contains only one entry per group, indicating that the variance cannot be computed.
305
+ """
306
+
307
+ x,y = cls._remove_nan(x, y)
308
+
309
+ # Raises an error if the dataset is (almost) empty at this point
310
+ if (x.size <= 1) or (y.size <= 1):
311
+ cls._check_dataset_is_not_empty(
312
+ pd.DataFrame(),
313
+ f"Dataset length after filtering is: x: {x.size} elements; y: {y.size} elements. In particular:\n\nx: {x}\ny: {y}\n\nPerhaps NaN appeared for certain entries. Check if the grouped data contains only one entry per group, as this may cause NaN values when computing the variance. Also, consider widening the time window."
314
+ )
315
+
316
+ model1 = cls.linear_regression(x, y, fit_intercept=False) # Not fitting the intercept because data is passed scaled to the minimum
317
+ model2 = cls.power_law_fit(x, y)
318
+
319
+ _, p = cls.F_test(model1, model2, y)
320
+
321
+ if p < 0.05:
322
+ model = model2
323
+ else:
324
+ model = model1
325
+
326
+ if name:
327
+ return {name: model}
328
+ else:
329
+ return model
330
+
331
+ @staticmethod
332
+ def plot_single_data_and_model(
333
+ data_x: pd.core.indexes.range.RangeIndex,
334
+ data_y: pd.Series,
335
+ data_ylabel: str,
336
+ model: callable,
337
+ model_label: str,
338
+ data_xlabel_units: str,
339
+ ax: any,
340
+ **kwargs: dict[str, any]
341
+ ) -> None:
342
+ """
343
+ Low level utility function to plot the data and a model.
344
+
345
+ :param data_x: The x-axis data.
346
+ :type data: pd.Series.index
347
+ :param data_y: The y-axis data.
348
+ :type data: pd.Series
349
+ :param data_ylabel: The ``ylabel`` of the data.
350
+ :type data_ylabel: str
351
+ :param model: The model to plot.
352
+ :type model: dict[str, any]
353
+ :param model_label: The label of the model.
354
+ :type model_label: str
355
+ :param data_xlabel_units: The units of the x-axis data.
356
+ :type data_xlabel_units: str
357
+ :param ax: The axis to plot.
358
+ :type ax: any
359
+ :param kwargs: Additional arguments to pass to the plot
360
+ :type kwargs: dict[str, any]
361
+ """
362
+
363
+ line_kwargs = {
364
+ "linestyle": None,
365
+ "color": "#1f77b4"
366
+ }
367
+ point_kwargs = {
368
+ "color": "#1f77b4"
369
+ }
370
+
371
+ for k in kwargs.keys():
372
+ _flag, _k = k.split("_")
373
+ if (_k in line_kwargs) and (_flag == "line"):
374
+ line_kwargs[_k] = kwargs[k]
375
+ if (_k in point_kwargs) and (_flag == "point"):
376
+ point_kwargs[_k] = kwargs[k]
377
+
378
+ ax.scatter(
379
+ data_x,
380
+ data_y,
381
+ **point_kwargs
382
+ )
383
+ ax.plot(
384
+ data_x,
385
+ model(data_x),
386
+ label=model_label,
387
+ **line_kwargs
388
+ )
389
+ ax.set_ylabel(data_ylabel)
390
+ ax.set_xlabel(f"time ({data_xlabel_units})")
391
+ ax.legend()
392
+
393
+ @staticmethod
394
+ def _check_dataset_is_not_empty(df: pd.DataFrame, msg: str) -> None:
395
+ """Check if the dataset is not empty.
396
+
397
+ :param df: the dataset to check.
398
+ :type df: pd.DataFrame
399
+ :param msg: The message to raise if the dataset is empty.
400
+ :type msg: str
401
+ """
402
+
403
+ if df.empty:
404
+ raise ValueError(
405
+ f"The dataset is (almost) empty at this point of the analysis.\n{msg}"
406
+ )