PyEvoMotion 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyEvoMotion/__init__.py +11 -0
- PyEvoMotion/cli.py +440 -0
- PyEvoMotion/core/__init__.py +7 -0
- PyEvoMotion/core/base.py +406 -0
- PyEvoMotion/core/core.py +520 -0
- PyEvoMotion/core/parser.py +467 -0
- PyEvoMotion/utils.py +87 -0
- pyevomotion-0.1.0.dist-info/METADATA +117 -0
- pyevomotion-0.1.0.dist-info/RECORD +13 -0
- pyevomotion-0.1.0.dist-info/WHEEL +4 -0
- pyevomotion-0.1.0.dist-info/entry_points.txt +3 -0
- share/mafft_install.sh +44 -0
- share/manuscript_figure.py +316 -0
PyEvoMotion/core/base.py
ADDED
@@ -0,0 +1,406 @@
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from scipy.optimize import curve_fit
from scipy.stats import f as snedecor_f
from sklearn.linear_model import LinearRegression


class PyEvoMotionBase():
    """
    Base class for the ``PyEvoMotion`` project.

    This class contains no data and is meant to be used as a mixin (provides utility methods for the project). It is inherited by :class:`PyEvoMotion`.
    """

    @staticmethod
    def count_prefixes(prefix: str, mutations: list[str]) -> int:
        """
        Count the number of mutations that start with a specific prefix.

        :param prefix: The prefix to count. It must be a single character.
        :type prefix: str
        :param mutations: The list of mutations where to count the prefix.
        :type mutations: list[str]
        :return: The number of mutations that start with the prefix.
        :rtype: ``int``
        """
        return len(list(filter(
            lambda x: x.startswith(prefix),
            mutations
        )))

    @staticmethod
    def mutation_length_modification(mutation: str) -> int:
        """
        Get the length modification induced by a mutation.

        :param mutation: The mutation whose length modification to get.
        :type mutation: str
        :return: The length modification induced by the mutation.
        :rtype: ``int``
        :raises ValueError: If the mutation is not one of ``s``, ``i`` or ``d``.
        """

        if mutation.startswith("s"): return 0
        else: _len = len(mutation.split("_")[-1])

        if mutation.startswith("i"): return _len
        elif mutation.startswith("d"): return -_len

        raise ValueError(f"Mutation not recognized: {mutation}")

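An illustrative sketch (editor's example, not part of the packaged file): the length change is read from the one-letter prefix and from whatever follows the last underscore, so an encoding such as "i_100_ACG" (assumed here for illustration only) counts as a three-base insertion.

# Hypothetical mutation strings of the form "<kind>_<position>_<sequence>"
from PyEvoMotion.core.base import PyEvoMotionBase

print(PyEvoMotionBase.mutation_length_modification("s_100_A"))    # 0, substitutions keep the length
print(PyEvoMotionBase.mutation_length_modification("i_100_ACG"))  # 3, insertion of three bases
print(PyEvoMotionBase.mutation_length_modification("d_100_AC"))   # -2, deletion of two bases
print(PyEvoMotionBase.count_prefixes("i", ["i_100_ACG", "d_7_A", "i_3_T"]))  # 2 insertions
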
    @staticmethod
    def date_grouper(df: pd.DataFrame, DT: str, origin: str) -> pd.core.groupby.generic.DataFrameGroupBy:
        """
        Create grouped dataframe based on a ``datetime`` frequency.

        :param df: The dataframe to group. It must have a ``date`` column.
        :type df: pd.DataFrame
        :param DT: The string datetime frequency that will govern the grouping.
        :type DT: str
        :param origin: The string datetime that will be the origin of the grouping frequency.
        :type origin: str
        :return: The dataset's corresponding pandas groupby object.
        :rtype: ``pd.core.groupby.generic.DataFrameGroupBy``
        """

        return df.groupby(
            pd.Grouper(
                key="date",
                axis=0,
                freq=DT,
                origin=origin
            )
        )

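A minimal usage sketch (assumed toy data, for illustration): the dataframe needs a datetime "date" column, and the frequency and origin strings are passed straight through to pandas.

import pandas as pd
from PyEvoMotion.core.base import PyEvoMotionBase

df = pd.DataFrame({
    "date": pd.to_datetime(["2021-01-01", "2021-01-05", "2021-01-12"]),
    "n_mutations": [2, 3, 5],
})

# One group per 7-day window, anchored at the origin date
grouped = PyEvoMotionBase.date_grouper(df, "7D", "2021-01-01")
print(grouped["n_mutations"].mean())
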
    @staticmethod
    def _invoke_method(
        instance: object,
        method: str,
        *args: any,
        **kwargs: dict[str, any]
    ) -> any:
        """
        General method to invoke another method from a class instance.

        :param instance: the instance to invoke the method from.
        :type instance: object
        :param method: the method to invoke.
        :type method: str
        :param args: the arguments to pass to the method.
        :type args: any
        :param kwargs: the keyword arguments to pass to the method.
        :type kwargs: dict[str, any]
        :return: the result of the method.
        :rtype: any
        """

        try:
            return getattr(instance, method)(*args, **kwargs)
        except AttributeError:
            print(f"Method {method} not found in {instance}")

    @staticmethod
    def _remove_nan(x: pd.Series, y: pd.Series) -> tuple[np.ndarray, np.ndarray]:
        """
        Remove NaN values from two pandas Series and return them as numpy arrays.

        :param x: the first pandas Series.
        :type x: pd.Series
        :param y: the second pandas Series.
        :type y: pd.Series
        :return: a tuple with the two pandas Series without NaN values.
        :rtype: tuple[np.ndarray, np.ndarray]
        """

        data = pd.DataFrame({"x": x, "y": y}).dropna()

        x = data["x"].to_numpy().reshape(-1, 1)
        y = data["y"].to_numpy().reshape(-1, 1)

        return x, y

    @classmethod
    def linear_regression(cls,
        x: np.ndarray,
        y: np.ndarray,
        fit_intercept=True
    ) -> dict[str, any]:
        """
        Perform a linear regression on a set of data.

        :param x: A numpy array of the features.
        :type x: np.ndarray
        :param y: A numpy array of the target.
        :type y: np.ndarray
        :param fit_intercept: Whether to fit the intercept. Default is ``True``.
        :type fit_intercept: bool
        :return: A dictionary containing:

            * ``model``: A ``lambda`` function that computes predictions based on the fitted model.
            * ``parameters``: A dictionary with the slope of the regression line.
            * ``expression``: A string representation of the regression equation.
            * ``r2``: The :math:`R^2` score of the regression.
        :rtype: ``dict[str, any]``
        """

        reg = LinearRegression(fit_intercept=fit_intercept).fit(x, y)

        if fit_intercept:
            model = {
                "model": lambda x: reg.coef_[0][0]*x + reg.intercept_[0],
                "parameters": {
                    "m": reg.coef_[0][0],
                    "b": reg.intercept_[0]
                },
                "expression": "mx + b"
            }
        else:
            model = {
                "model": lambda x: reg.coef_[0][0]*x,
                "parameters": {
                    "m": reg.coef_[0][0],
                },
                "expression": "mx"
            }

        model["r2"] = r2_score(y, reg.predict(x))

        return model

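A minimal sketch of the linear fit on toy data, with the column-vector shapes produced by _remove_nan:

import numpy as np
from PyEvoMotion.core.base import PyEvoMotionBase

x = np.arange(10, dtype=float).reshape(-1, 1)
y = 2.0*x + 1.0

fit = PyEvoMotionBase.linear_regression(x, y)
print(fit["expression"])   # "mx + b"
print(fit["parameters"])   # m close to 2.0, b close to 1.0
print(fit["r2"])           # close to 1.0
print(fit["model"](5))     # prediction at x = 5
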
    @staticmethod
    def _power_law(
        x: np.ndarray | int | float,
        a: int | float,
        b: int | float
    ) -> np.ndarray | int | float:
        """
        Power law function.

        :param x: the input.
        :type x: np.ndarray | int | float
        :param a: the coefficient.
        :type a: int | float
        :param b: the exponent.
        :type b: int | float
        :return: the result of the power law.
        :rtype: np.ndarray | int | float
        """

        return a*np.power(x, b)

    @classmethod
    def power_law_fit(cls, x: np.ndarray, y: np.ndarray) -> dict[str, any]:
        """
        Perform a power law fit on a set of data.

        :param x: A numpy array of the features.
        :type x: np.ndarray
        :param y: A numpy array of the target.
        :type y: np.ndarray
        :return: A dictionary containing:

            * ``model``: A ``lambda`` function that computes predictions based on the fitted model.
            * ``parameters``: A dictionary with the parameters of the fitted power law.
            * ``expression``: A string representation of the regression equation.
            * ``r2``: The :math:`R^2` score of the regression.
        :rtype: ``dict[str, any]``
        """

        try:
            _popt, _, _, _msg, _ier = curve_fit(
                cls._power_law,
                x.T.tolist()[0], y.T.tolist()[0],
                full_output=True
            )
        except RuntimeError as e:
            _ier = 0
            _msg = str(e)

        if _ier not in range(1, 5):
            print(f"{_msg}")
            _popt = [0, 0]

        model = {
            "model": lambda x: _popt[0]*np.power(x, _popt[1]),
            "parameters": {
                "d": _popt[0],
                "alpha": _popt[1]
            },
            "expression": "d*x^alpha",
            "r2": r2_score(y, cls._power_law(x, *_popt))
        }

        return model

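The corresponding sketch for the power-law fit, again on toy column vectors:

import numpy as np
from PyEvoMotion.core.base import PyEvoMotionBase

x = np.arange(1, 11, dtype=float).reshape(-1, 1)
y = 0.5*np.power(x, 1.7)

fit = PyEvoMotionBase.power_law_fit(x, y)
print(fit["expression"])   # "d*x^alpha"
print(fit["parameters"])   # d close to 0.5, alpha close to 1.7
print(fit["r2"])           # close to 1.0
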
    @staticmethod
    def F_test(
        model1: dict[str, any],
        model2: dict[str, any],
        data: np.ndarray
    ) -> tuple[float, float]:
        """
        Perform an F-test between two models.

        See https://en.wikipedia.org/wiki/F-test#Regression_problems for more details.

        :param model1: The first model.
        :type model1: dict[str, any]
        :param model2: The second model.
        :type model2: dict[str, any]
        :param data: The data to test the models.
        :type data: np.ndarray
        :return: A tuple with the F-value and the p-value.
        :rtype: ``tuple[float, float]``
        """

        data = data.flatten()

        # Note that p1 < p2 always. Won't do an assertion because I'm making sure elsewhere that the linear model does not have an intercept, i.e. it only has the slope
        p1 = len(model1["parameters"])
        p2 = len(model2["parameters"])
        n = len(data)

        model1 = np.vectorize(model1["model"])
        model2 = np.vectorize(model2["model"])

        RS1 = (data - model1(range(n)))**2
        RS2 = (data - model2(range(n)))**2

        # Mask the infinite and nan values
        mask = (
            np.isinf(RS1)
            | np.isinf(RS2)
            | np.isnan(RS1)
            | np.isnan(RS2)
        )

        # Sum the residuals without the infinite values
        RSS1 = RS1.sum(where=~mask)
        RSS2 = RS2.sum(where=~mask)

        F = ((RSS1 - RSS2)/(p2 - p1))/(RSS2/(n - p2))

        return F, 1 - snedecor_f.cdf(F, p2 - p1, n - p2)

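A sketch of the comparison itself. The statistic is the nested-model form F = ((RSS1 - RSS2)/(p2 - p1)) / (RSS2/(n - p2)), and both models are evaluated at the indices 0..n-1, so the target values are assumed to be ordered along that index (as adjust_model below arranges):

import numpy as np
from PyEvoMotion.core.base import PyEvoMotionBase

x = np.arange(20, dtype=float).reshape(-1, 1)
y = 0.3*np.power(x, 1.8)

linear = PyEvoMotionBase.linear_regression(x, y, fit_intercept=False)  # 1 parameter
power = PyEvoMotionBase.power_law_fit(x, y)                            # 2 parameters

F, p = PyEvoMotionBase.F_test(linear, power, y)
print(F, p)  # a small p-value favours the extra parameter of the power law
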
    @classmethod
    def adjust_model(cls,
        x: pd.Series,
        y: pd.Series,
        name: str = None
    ) -> dict[str, any]:
        """Adjust a model to the data.

        :param x: The features. It is a single pandas Series.
        :type x: pd.Series
        :param y: The target. It is a single pandas Series.
        :type y: pd.Series
        :param name: The name of the data. Default is ``None``.
        :type name: str
        :return: A dictionary with the model.
        :rtype: ``dict[str, any]``
        :raises ValueError: If the dataset is empty or full of NaN values. This may occur if the grouped data contains only one entry per group, indicating that the variance cannot be computed.
        """

        x, y = cls._remove_nan(x, y)

        # Raises an error if the dataset is (almost) empty at this point
        if (x.size <= 1) or (y.size <= 1):
            cls._check_dataset_is_not_empty(
                pd.DataFrame(),
                f"Dataset length after filtering is: x: {x.size} elements; y: {y.size} elements. In particular:\n\nx: {x}\ny: {y}\n\nPerhaps NaN appeared for certain entries. Check if the grouped data contains only one entry per group, as this may cause NaN values when computing the variance. Also, consider widening the time window."
            )

        model1 = cls.linear_regression(x, y, fit_intercept=False) # Not fitting the intercept because data is passed scaled to the minimum
        model2 = cls.power_law_fit(x, y)

        _, p = cls.F_test(model1, model2, y)

        if p < 0.05:
            model = model2
        else:
            model = model1

        if name:
            return {name: model}
        else:
            return model

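A sketch of the high-level entry point: adjust_model keeps the one-parameter line through the origin unless the F-test prefers the power law at p < 0.05. The series and the "variance" label below are illustrative only.

import numpy as np
import pandas as pd
from PyEvoMotion.core.base import PyEvoMotionBase

t = pd.Series(np.arange(15, dtype=float))  # e.g. time windows since the origin
var = 0.8*t**1.9                           # e.g. a per-window statistic

result = PyEvoMotionBase.adjust_model(t, var, name="variance")
model = result["variance"]
print(model["expression"], model["parameters"], model["r2"])
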
    @staticmethod
    def plot_single_data_and_model(
        data_x: pd.core.indexes.range.RangeIndex,
        data_y: pd.Series,
        data_ylabel: str,
        model: callable,
        model_label: str,
        data_xlabel_units: str,
        ax: any,
        **kwargs: dict[str, any]
    ) -> None:
        """
        Low level utility function to plot the data and a model.

        :param data_x: The x-axis data.
        :type data_x: pd.core.indexes.range.RangeIndex
        :param data_y: The y-axis data.
        :type data_y: pd.Series
        :param data_ylabel: The ``ylabel`` of the data.
        :type data_ylabel: str
        :param model: The model to plot.
        :type model: callable
        :param model_label: The label of the model.
        :type model_label: str
        :param data_xlabel_units: The units of the x-axis data.
        :type data_xlabel_units: str
        :param ax: The axis to plot.
        :type ax: any
        :param kwargs: Additional arguments to pass to the plot.
        :type kwargs: dict[str, any]
        """

        line_kwargs = {
            "linestyle": None,
            "color": "#1f77b4"
        }
        point_kwargs = {
            "color": "#1f77b4"
        }

        for k in kwargs.keys():
            _flag, _k = k.split("_")
            if (_k in line_kwargs) and (_flag == "line"):
                line_kwargs[_k] = kwargs[k]
            if (_k in point_kwargs) and (_flag == "point"):
                point_kwargs[_k] = kwargs[k]

        ax.scatter(
            data_x,
            data_y,
            **point_kwargs
        )
        ax.plot(
            data_x,
            model(data_x),
            label=model_label,
            **line_kwargs
        )
        ax.set_ylabel(data_ylabel)
        ax.set_xlabel(f"time ({data_xlabel_units})")
        ax.legend()

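A plotting sketch on a matplotlib axis (colors and labels below are illustrative); the "line_"/"point_" prefixes on keyword arguments route options to the fitted-model line or to the scatter points.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PyEvoMotion.core.base import PyEvoMotionBase

series = pd.Series(2.0*np.arange(12, dtype=float))  # observed values per time window

fig, ax = plt.subplots()
PyEvoMotionBase.plot_single_data_and_model(
    series.index,           # x axis: the RangeIndex of the series
    series,                 # y axis: the observed values
    "number of mutations",  # y-axis label
    lambda t: 2.0*t,        # any callable, typically the "model" entry from adjust_model
    "mx",                   # legend label for the model
    "weeks",                # x-axis units
    ax,
    line_color="#d62728",   # routed to the model line
    point_color="#2ca02c",  # routed to the scatter points
)
plt.show()
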
    @staticmethod
    def _check_dataset_is_not_empty(df: pd.DataFrame, msg: str) -> None:
        """Check if the dataset is not empty.

        :param df: the dataset to check.
        :type df: pd.DataFrame
        :param msg: The message to raise if the dataset is empty.
        :type msg: str
        """

        if df.empty:
            raise ValueError(
                f"The dataset is (almost) empty at this point of the analysis.\n{msg}"
            )