PyEvoMotion 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyEvoMotion/cli.py +1 -8
- PyEvoMotion/core/base.py +83 -20
- PyEvoMotion/core/core.py +82 -38
- PyEvoMotion/core/parser.py +4 -1
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.1.dist-info}/METADATA +72 -4
- pyevomotion-0.1.1.dist-info/RECORD +31 -0
- share/anomalous_diffusion.pdf +0 -0
- share/figUK.tsv +9949 -0
- share/figUK_plots.pdf +0 -0
- share/figUK_regression_results.json +18 -0
- share/figUK_run_args.json +13 -0
- share/figUK_stats.tsv +41 -0
- share/figUSA.tsv +9470 -0
- share/figUSA_plots.pdf +0 -0
- share/figUSA_regression_results.json +18 -0
- share/figUSA_run_args.json +13 -0
- share/figUSA_stats.tsv +34 -0
- share/figdataUK.tsv +10001 -0
- share/figdataUSA.tsv +10001 -0
- share/figure.pdf +0 -0
- share/generate_sequences_from_synthdata.py +85 -0
- share/manuscript_figure.py +457 -12
- share/synth_figure.pdf +0 -0
- share/uk_time_windows.pdf +0 -0
- share/weekly_size.pdf +0 -0
- pyevomotion-0.1.0.dist-info/RECORD +0 -13
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.1.dist-info}/WHEEL +0 -0
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.1.dist-info}/entry_points.txt +0 -0
PyEvoMotion/cli.py
CHANGED
@@ -255,13 +255,6 @@ def _parse_arguments() -> argparse.Namespace:
|
|
255
255
|
default=0,
|
256
256
|
help="Length filter for the sequences (removes sequences with length less than the specified value). Default is 0."
|
257
257
|
)
|
258
|
-
parser.add_argument(
|
259
|
-
"-n",
|
260
|
-
"--n_threshold",
|
261
|
-
type=int,
|
262
|
-
default=2,
|
263
|
-
help="Minimum number of sequences required in a time interval to compute statistics. Default is 2."
|
264
|
-
)
|
265
258
|
parser.add_argument(
|
266
259
|
"-xj",
|
267
260
|
"--export_json",
|
@@ -407,7 +400,6 @@ def _main():
|
|
407
400
|
# Runs the analysis
|
408
401
|
stats, reg = instance.analysis(
|
409
402
|
length=args.length_filter,
|
410
|
-
n_threshold=args.n_threshold,
|
411
403
|
show=args.show,
|
412
404
|
mutation_kind=args.kind,
|
413
405
|
export_plots_filename=(
|
@@ -432,6 +424,7 @@ def _main():
|
|
432
424
|
# Exports the regression models to a JSON file
|
433
425
|
with open(f"{args.out}_regression_results.json", "w") as file:
|
434
426
|
json.dump(_reg, file, indent=4)
|
427
|
+
print(f"Regression results saved to {args.out}_regression_results.json")
|
435
428
|
|
436
429
|
# Exits the program with code 0 (success)
|
437
430
|
exit(0)
|
PyEvoMotion/core/base.py
CHANGED
@@ -102,7 +102,7 @@ class PyEvoMotionBase():
|
|
102
102
|
print(f"Method {method} not found in {instance}")
|
103
103
|
|
104
104
|
@staticmethod
|
105
|
-
def _remove_nan(x: pd.Series, y: pd.Series) -> tuple[np.ndarray, np.ndarray]:
|
105
|
+
def _remove_nan(x: pd.Series, y: pd.Series, z: pd.Series) -> tuple[np.ndarray, np.ndarray]:
|
106
106
|
"""
|
107
107
|
Remove NaN values from two pandas Series and return them as numpy arrays.
|
108
108
|
|
@@ -110,22 +110,40 @@ class PyEvoMotionBase():
|
|
110
110
|
:type x: pd.Series
|
111
111
|
:param y: the second pandas Series.
|
112
112
|
:type y: pd.Series
|
113
|
+
:param z: the third pandas Series.
|
114
|
+
:type z: pd.Series
|
113
115
|
:return: a tuple with the two pandas Series without NaN values.
|
114
116
|
:rtype: tuple[np.ndarray,np.ndarray]
|
115
117
|
"""
|
116
118
|
|
117
|
-
data = pd.DataFrame({"x": x, "y": y}).dropna()
|
119
|
+
data = pd.DataFrame({"x": x, "y": y, "z": z}).dropna()
|
118
120
|
|
119
121
|
x = data["x"].to_numpy().reshape(-1, 1)
|
120
122
|
y = data["y"].to_numpy().reshape(-1, 1)
|
123
|
+
z = data["z"].to_numpy().reshape(-1, 1)
|
124
|
+
return x, y, z
|
121
125
|
|
122
|
-
|
126
|
+
@staticmethod
|
127
|
+
def _weighting_function(n: int, n_0: int = 30) -> np.ndarray:
|
128
|
+
"""
|
129
|
+
Weighting function for the data points.
|
130
|
+
|
131
|
+
:param n: The number of data points.
|
132
|
+
:type n: int
|
133
|
+
:param n_0: The number of data points at which the weighting function approximates the constant 1. Default is 30.
|
134
|
+
:type n_0: int
|
135
|
+
:return: The weighting function.
|
136
|
+
:rtype: np.ndarray
|
137
|
+
"""
|
138
|
+
|
139
|
+
return np.tanh(2*n/n_0)
|
123
140
|
|
124
141
|
@classmethod
|
125
142
|
def linear_regression(cls,
|
126
143
|
x: np.ndarray,
|
127
144
|
y: np.ndarray,
|
128
|
-
|
145
|
+
weights: np.ndarray | None = None,
|
146
|
+
fit_intercept: bool = True
|
129
147
|
) -> dict[str, any]:
|
130
148
|
"""
|
131
149
|
Perform a linear regression on a set of data.
|
@@ -136,6 +154,8 @@ class PyEvoMotionBase():
|
|
136
154
|
:type y: np.ndarray
|
137
155
|
:param fit_intercept: Whether to fit the intercept. Default is ``True``.
|
138
156
|
:type fit_intercept: bool
|
157
|
+
:param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
|
158
|
+
:type weights: np.ndarray | None
|
139
159
|
:return: A dictionary containing:
|
140
160
|
|
141
161
|
* ``model``: A ``lambda`` function that computes predictions based on the fitted model.
|
@@ -145,7 +165,9 @@ class PyEvoMotionBase():
|
|
145
165
|
:rtype: ``dict[str, any]``
|
146
166
|
"""
|
147
167
|
|
148
|
-
|
168
|
+
_weights = cls._weighting_function(weights).flatten() if weights is not None else None
|
169
|
+
|
170
|
+
reg = LinearRegression(fit_intercept=fit_intercept).fit(x, y, sample_weight=_weights)
|
149
171
|
|
150
172
|
if fit_intercept:
|
151
173
|
model = {
|
@@ -166,7 +188,7 @@ class PyEvoMotionBase():
|
|
166
188
|
"expression": "mx"
|
167
189
|
}
|
168
190
|
|
169
|
-
model["r2"] = r2_score(y, reg.predict(x))
|
191
|
+
model["r2"] = r2_score(y, reg.predict(x), sample_weight=_weights)
|
170
192
|
|
171
193
|
return model
|
172
194
|
|
@@ -192,7 +214,7 @@ class PyEvoMotionBase():
|
|
192
214
|
return a*np.power(x, b)
|
193
215
|
|
194
216
|
@classmethod
|
195
|
-
def power_law_fit(cls, x: np.ndarray, y: np.ndarray) -> dict[str, any]:
|
217
|
+
def power_law_fit(cls, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None) -> dict[str, any]:
|
196
218
|
"""
|
197
219
|
Perform a power law fit on a set of data.
|
198
220
|
|
@@ -200,6 +222,8 @@ class PyEvoMotionBase():
|
|
200
222
|
:type x: np.ndarray
|
201
223
|
:param y: A numpy array of the target.
|
202
224
|
:type y: np.ndarray
|
225
|
+
:param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
|
226
|
+
:type weights: np.ndarray | None
|
203
227
|
:return: A dictionary containing:
|
204
228
|
|
205
229
|
* ``model``: A ``lambda`` function that computes predictions based on the fitted model.
|
@@ -209,10 +233,13 @@ class PyEvoMotionBase():
|
|
209
233
|
:rtype: ``dict[str, any]``
|
210
234
|
"""
|
211
235
|
|
236
|
+
_weights = cls._weighting_function(weights).flatten() if weights is not None else None
|
237
|
+
|
212
238
|
try:
|
213
239
|
_popt, _, _, _msg, _ier = curve_fit(
|
214
240
|
cls._power_law,
|
215
241
|
x.T.tolist()[0], y.T.tolist()[0],
|
242
|
+
sigma=1/np.sqrt(_weights) if _weights is not None else None,
|
216
243
|
full_output=True
|
217
244
|
)
|
218
245
|
except RuntimeError as e:
|
@@ -230,16 +257,18 @@ class PyEvoMotionBase():
|
|
230
257
|
"alpha": _popt[1]
|
231
258
|
},
|
232
259
|
"expression": "d*x^alpha",
|
233
|
-
"r2": r2_score(y, cls._power_law(x, *_popt))
|
260
|
+
"r2": r2_score(y, cls._power_law(x, *_popt), sample_weight=_weights)
|
234
261
|
}
|
235
262
|
|
236
263
|
return model
|
237
264
|
|
238
|
-
@
|
265
|
+
@classmethod
|
239
266
|
def F_test(
|
267
|
+
cls,
|
240
268
|
model1: dict[str,any],
|
241
269
|
model2: dict[str,any],
|
242
|
-
data: np.ndarray
|
270
|
+
data: np.ndarray,
|
271
|
+
weights: np.ndarray | None = None
|
243
272
|
) -> tuple[float, float]:
|
244
273
|
"""
|
245
274
|
Perform an F-test between two models.
|
@@ -257,6 +286,11 @@ class PyEvoMotionBase():
|
|
257
286
|
"""
|
258
287
|
|
259
288
|
data = data.flatten()
|
289
|
+
|
290
|
+
if weights is not None:
|
291
|
+
_weights = cls._weighting_function(weights.flatten())
|
292
|
+
else:
|
293
|
+
_weights = np.ones(len(data))
|
260
294
|
|
261
295
|
# Note that p1 < p2 always. Won't do an assertion because I'm making sure elsewhere that the linear model does not have an intercept, i.e. it only has the slope
|
262
296
|
p1 = len(model1["parameters"])
|
@@ -278,8 +312,8 @@ class PyEvoMotionBase():
|
|
278
312
|
)
|
279
313
|
|
280
314
|
# Sum the residuals without the infinite values
|
281
|
-
RSS1 =
|
282
|
-
RSS2 =
|
315
|
+
RSS1 = np.sum(_weights*RS1, where=~mask)
|
316
|
+
RSS2 = np.sum(_weights*RS2, where=~mask)
|
283
317
|
|
284
318
|
F = ((RSS1 - RSS2)/(p2 - p1))/(RSS2/(n - p2))
|
285
319
|
|
@@ -289,7 +323,8 @@ class PyEvoMotionBase():
|
|
289
323
|
def adjust_model(cls,
|
290
324
|
x: pd.Series,
|
291
325
|
y: pd.Series,
|
292
|
-
name: str = None
|
326
|
+
name: str = None,
|
327
|
+
weights: pd.Series | None = None
|
293
328
|
) -> dict[str, any]:
|
294
329
|
"""Adjust a model to the data.
|
295
330
|
|
@@ -299,12 +334,14 @@ class PyEvoMotionBase():
|
|
299
334
|
:type y: pd.Series
|
300
335
|
:param name: The name of the data. Default is ``None``.
|
301
336
|
:type name: str
|
337
|
+
:param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
|
338
|
+
:type weights: np.ndarray | None
|
302
339
|
:return: A dictionary with the model.
|
303
340
|
:rtype: ``dict[str, any]``
|
304
341
|
:raises ValueError: If the dataset is empty or full of NaN values. This may occur if the grouped data contains only one entry per group, indicating that the variance cannot be computed.
|
305
342
|
"""
|
306
343
|
|
307
|
-
x,y = cls._remove_nan(x, y)
|
344
|
+
x,y,w = cls._remove_nan(x, y, weights)
|
308
345
|
|
309
346
|
# Raises an error if the dataset is (almost) empty at this point
|
310
347
|
if (x.size <= 1) or (y.size <= 1):
|
@@ -313,10 +350,10 @@ class PyEvoMotionBase():
|
|
313
350
|
f"Dataset length after filtering is: x: {x.size} elements; y: {y.size} elements. In particular:\n\nx: {x}\ny: {y}\n\nPerhaps NaN appeared for certain entries. Check if the grouped data contains only one entry per group, as this may cause NaN values when computing the variance. Also, consider widening the time window."
|
314
351
|
)
|
315
352
|
|
316
|
-
model1 = cls.linear_regression(x, y, fit_intercept=False) # Not fitting the intercept because data is passed scaled to the minimum
|
317
|
-
model2 = cls.power_law_fit(x, y)
|
353
|
+
model1 = cls.linear_regression(x, y, weights=w, fit_intercept=False) # Not fitting the intercept because data is passed scaled to the minimum
|
354
|
+
model2 = cls.power_law_fit(x, y, weights=w)
|
318
355
|
|
319
|
-
_, p = cls.F_test(model1, model2, y)
|
356
|
+
_, p = cls.F_test(model1, model2, y, weights=w)
|
320
357
|
|
321
358
|
if p < 0.05:
|
322
359
|
model = model2
|
@@ -337,6 +374,7 @@ class PyEvoMotionBase():
|
|
337
374
|
model_label: str,
|
338
375
|
data_xlabel_units: str,
|
339
376
|
ax: any,
|
377
|
+
dt_ratio: float,
|
340
378
|
**kwargs: dict[str, any]
|
341
379
|
) -> None:
|
342
380
|
"""
|
@@ -376,13 +414,13 @@ class PyEvoMotionBase():
|
|
376
414
|
point_kwargs[_k] = kwargs[k]
|
377
415
|
|
378
416
|
ax.scatter(
|
379
|
-
data_x,
|
417
|
+
data_x.to_numpy()*dt_ratio,
|
380
418
|
data_y,
|
381
419
|
**point_kwargs
|
382
420
|
)
|
383
421
|
ax.plot(
|
384
|
-
data_x,
|
385
|
-
model(data_x),
|
422
|
+
data_x.to_numpy()*dt_ratio,
|
423
|
+
model(data_x.to_numpy()*dt_ratio),
|
386
424
|
label=model_label,
|
387
425
|
**line_kwargs
|
388
426
|
)
|
@@ -404,3 +442,28 @@ class PyEvoMotionBase():
|
|
404
442
|
raise ValueError(
|
405
443
|
f"The dataset is (almost) empty at this point of the analysis.\n{msg}"
|
406
444
|
)
|
445
|
+
|
446
|
+
@staticmethod
|
447
|
+
def _get_time_ratio(dt: str, reference: str = "7D") -> float:
|
448
|
+
"""Get the ratio of a time interval with respect to a reference interval.
|
449
|
+
|
450
|
+
:param dt: Time interval string (e.g. "5D", "7D", "10D", "14D", "12H")
|
451
|
+
:type dt: str
|
452
|
+
:param reference: Reference time interval string. Default is "7D".
|
453
|
+
:type reference: str
|
454
|
+
:return: The ratio of dt to reference
|
455
|
+
:rtype: float
|
456
|
+
"""
|
457
|
+
|
458
|
+
return pd.Timedelta(dt) / pd.Timedelta(reference)
|
459
|
+
|
460
|
+
@classmethod
|
461
|
+
def _verify_dt(cls, dt: str) -> None:
|
462
|
+
"""Verify that the time window string is greater than 1 day.
|
463
|
+
|
464
|
+
:param dt: Time window string (e.g. "5D", "7D", "10D", "14D")
|
465
|
+
:type dt: str
|
466
|
+
:raises ValueError: If the time window is not greater than 1 day
|
467
|
+
"""
|
468
|
+
if cls._get_time_ratio(dt, "1D") <= 1:
|
469
|
+
raise ValueError(f"Time window must be greater than 1 day. Got {dt}")
|
PyEvoMotion/core/core.py
CHANGED
@@ -62,7 +62,9 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
62
62
|
:type date_range: tuple[str] | None
|
63
63
|
"""
|
64
64
|
|
65
|
+
self._verify_dt(dt)
|
65
66
|
self.dt = dt
|
67
|
+
self.dt_ratio = self._get_time_ratio(dt)
|
66
68
|
|
67
69
|
# Parse the input fasta and metadata files
|
68
70
|
super().__init__(
|
@@ -89,7 +91,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
89
91
|
def plot_results(cls,
|
90
92
|
stats: pd.DataFrame,
|
91
93
|
regs: dict[str, dict[str, any]],
|
92
|
-
data_xlabel_units: str
|
94
|
+
data_xlabel_units: str,
|
95
|
+
dt_ratio: float
|
93
96
|
) -> None:
|
94
97
|
"""
|
95
98
|
Plot the results of the analysis.
|
@@ -110,7 +113,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
110
113
|
for k,v in regs.items()
|
111
114
|
if k.startswith("mean")
|
112
115
|
)
|
113
|
-
_mean_data = stats[stats.columns[
|
116
|
+
_mean_data = stats[stats.columns[2]]
|
114
117
|
cls.plot_single_data_and_model(
|
115
118
|
stats.index,
|
116
119
|
_mean_data,
|
@@ -118,7 +121,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
118
121
|
_model["model"],
|
119
122
|
r"$r^2$: " + f"{_model['r2']:.2f}",
|
120
123
|
data_xlabel_units,
|
121
|
-
ax[0]
|
124
|
+
ax[0],
|
125
|
+
dt_ratio=dt_ratio
|
122
126
|
)
|
123
127
|
|
124
128
|
# Variance
|
@@ -127,7 +131,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
127
131
|
for k,v in regs.items()
|
128
132
|
if k.startswith("scaled var")
|
129
133
|
)
|
130
|
-
_variance_data = stats[stats.columns[
|
134
|
+
_variance_data = stats[stats.columns[3]]
|
131
135
|
cls.plot_single_data_and_model(
|
132
136
|
stats.index,
|
133
137
|
_variance_data,
|
@@ -135,7 +139,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
135
139
|
_model["model"],
|
136
140
|
r"$r^2$: " + f"{_model['r2']:.2f}",
|
137
141
|
data_xlabel_units,
|
138
|
-
ax[1]
|
142
|
+
ax[1],
|
143
|
+
dt_ratio=dt_ratio
|
139
144
|
)
|
140
145
|
|
141
146
|
# Dispersion index
|
@@ -147,6 +152,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
147
152
|
"Poissonian regime",
|
148
153
|
data_xlabel_units,
|
149
154
|
ax[2],
|
155
|
+
dt_ratio=dt_ratio,
|
150
156
|
line_linestyle="--",
|
151
157
|
line_color="black"
|
152
158
|
)
|
@@ -159,6 +165,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
159
165
|
stats: pd.DataFrame,
|
160
166
|
regs: dict[str, dict[str, any]],
|
161
167
|
data_xlabel_units: str,
|
168
|
+
dt_ratio: float,
|
162
169
|
output_ptr: str | None = None
|
163
170
|
) -> None:
|
164
171
|
"""
|
@@ -183,7 +190,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
183
190
|
for k,v in regs.items()
|
184
191
|
if k.startswith("mean")
|
185
192
|
)
|
186
|
-
_mean_data = stats[stats.columns[
|
193
|
+
_mean_data = stats[stats.columns[2]]
|
187
194
|
cls.plot_single_data_and_model(
|
188
195
|
stats.index,
|
189
196
|
_mean_data,
|
@@ -191,7 +198,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
191
198
|
_model["model"],
|
192
199
|
r"$r^2$: " + f"{_model['r2']:.2f}",
|
193
200
|
data_xlabel_units,
|
194
|
-
plt.gca()
|
201
|
+
plt.gca(),
|
202
|
+
dt_ratio=dt_ratio
|
195
203
|
)
|
196
204
|
|
197
205
|
plt.title(_mean_data.name)
|
@@ -205,7 +213,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
205
213
|
for k,v in regs.items()
|
206
214
|
if k.startswith("scaled var")
|
207
215
|
)
|
208
|
-
_variance_data = stats[stats.columns[
|
216
|
+
_variance_data = stats[stats.columns[3]]
|
209
217
|
cls.plot_single_data_and_model(
|
210
218
|
stats.index,
|
211
219
|
_variance_data,
|
@@ -213,7 +221,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
213
221
|
lambda x: _model["model"](x) + _variance_data.min(), # Adjust the model to the original variance
|
214
222
|
r"$r^2$: " + f"{_model['r2']:.2f}",
|
215
223
|
data_xlabel_units,
|
216
|
-
plt.gca()
|
224
|
+
plt.gca(),
|
225
|
+
dt_ratio=dt_ratio
|
217
226
|
)
|
218
227
|
|
219
228
|
plt.title(_variance_data.name)
|
@@ -232,6 +241,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
232
241
|
"Poissonian regime",
|
233
242
|
data_xlabel_units,
|
234
243
|
plt.gca(),
|
244
|
+
dt_ratio=dt_ratio,
|
235
245
|
line_linestyle="--",
|
236
246
|
line_color="black"
|
237
247
|
)
|
@@ -360,7 +370,6 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
360
370
|
def compute_stats(self,
|
361
371
|
DT: str,
|
362
372
|
origin: str,
|
363
|
-
n_threshold: int | None = None,
|
364
373
|
mutation_kind: str = "all"
|
365
374
|
) -> pd.DataFrame:
|
366
375
|
"""
|
@@ -372,31 +381,37 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
372
381
|
:type DT: str
|
373
382
|
:param origin: The string datetime that will be the origin of the grouping.
|
374
383
|
:type origin: str
|
375
|
-
:param n_threshold: Minimum number of sequences required in a time interval to compute statistics.
|
376
|
-
:type n_threshold: int | None
|
377
384
|
:param mutation_kind: The kind of mutation to compute the statistics for. Has to be one of ``all``, ``total``, ``substitutions``, ``insertions``, ``deletions`` or ``indels``. Default is ``all``.
|
378
385
|
:return: The statistics of the data.
|
379
386
|
:rtype: ``pd.DataFrame``
|
380
387
|
"""
|
381
388
|
|
382
|
-
|
389
|
+
# Create a local copy of the data
|
390
|
+
_data = self.data.copy()
|
383
391
|
|
384
|
-
#
|
385
|
-
if
|
392
|
+
# If the very first row's date is the same as the origin, and there happens to be only one entry for that date, duplicate that row; this way the stats for the first week can be computed (with variance = 0 of course)
|
393
|
+
if _data.iloc[0]["date"] == origin and len(_data[_data["date"] == origin]) == 1:
|
394
|
+
_data = pd.concat([_data, pd.DataFrame([_data.iloc[0]])], ignore_index=True)
|
395
|
+
_data.sort_values(by="date", inplace=True)
|
396
|
+
_data.reset_index(drop=True, inplace=True)
|
386
397
|
|
387
|
-
|
398
|
+
# Group the data by the datetime interval
|
399
|
+
grouped = self.date_grouper(_data, DT, origin)
|
388
400
|
|
389
|
-
|
390
|
-
|
391
|
-
f"No groups with at least {n_threshold} observations. Consider lowering the threshold."
|
392
|
-
)
|
401
|
+
# Only keep weeks where the number of observations is greater than 1
|
402
|
+
_filtered = grouped.filter(lambda x: len(x) >= 2)
|
393
403
|
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
origin
|
404
|
+
if len(_filtered) == 0:
|
405
|
+
raise ValueError(
|
406
|
+
f"No groups with at least 2 observations. Consider widening the time interval."
|
398
407
|
)
|
399
408
|
|
409
|
+
grouped = self.date_grouper(
|
410
|
+
_filtered,
|
411
|
+
DT,
|
412
|
+
origin
|
413
|
+
)
|
414
|
+
|
400
415
|
levels = [
|
401
416
|
f"number of {x}"
|
402
417
|
for x in self._mutation_type_switch(mutation_kind)
|
@@ -416,7 +431,6 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
416
431
|
|
417
432
|
def analysis(self,
|
418
433
|
length: int,
|
419
|
-
n_threshold: int | None = None,
|
420
434
|
show: bool = False,
|
421
435
|
mutation_kind: str = "all",
|
422
436
|
export_plots_filename: str | None = None
|
@@ -428,7 +442,6 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
428
442
|
|
429
443
|
:param length: The length to filter by.
|
430
444
|
:type length: int
|
431
|
-
:param n_threshold: Minimum number of sequences required in a time interval to compute statistics.
|
432
445
|
:param show: Whether to show the plots or not. Default is False.
|
433
446
|
:type show: bool
|
434
447
|
:param mutation_kind: The kind of mutation to compute the statistics for. Has to be one of ``all``, ``total``, ``substitutions`` or ``indels``. Default is ``all``.
|
@@ -447,20 +460,22 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
447
460
|
stats = self.compute_stats(
|
448
461
|
self.dt,
|
449
462
|
self.origin,
|
450
|
-
n_threshold,
|
451
463
|
mutation_kind
|
452
464
|
)
|
453
465
|
|
466
|
+
# Get weights for weighted fitting
|
467
|
+
weights = stats["size"]
|
454
468
|
|
455
469
|
regs = {}
|
456
470
|
# For each column in the statistics (except the date and the size), compute the corresponding regression model
|
457
471
|
for col in stats.columns[1:-1]:
|
458
472
|
if col.startswith("mean"):
|
459
473
|
_single_regression = {
|
460
|
-
f"{col}
|
474
|
+
f"{col} model": self.linear_regression(
|
461
475
|
*self._remove_nan(
|
462
476
|
stats.index, # Regression is given by the index, so in time, it is the same as multiplying by dt days
|
463
|
-
stats[col]
|
477
|
+
stats[col],
|
478
|
+
weights
|
464
479
|
)
|
465
480
|
)
|
466
481
|
}
|
@@ -468,33 +483,59 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
468
483
|
_single_regression = self.adjust_model(
|
469
484
|
stats.index,
|
470
485
|
stats[col] - stats[col].min(),
|
471
|
-
name=f"scaled {col}
|
486
|
+
name=f"scaled {col} model",
|
487
|
+
weights=weights.to_numpy().flatten()
|
472
488
|
)
|
473
489
|
# Save the regression model
|
474
490
|
regs.update(_single_regression)
|
475
491
|
|
492
|
+
# Add scaling correction to the regression models
|
493
|
+
for k, v in regs.items():
|
494
|
+
if v["expression"] == "mx + b":
|
495
|
+
m = v["parameters"]["m"]
|
496
|
+
b = v["parameters"]["b"]
|
497
|
+
regs[k]["parameters"]["m"] = m/self.dt_ratio
|
498
|
+
m = regs[k]["parameters"]["m"]
|
499
|
+
regs[k]["model"] = lambda x: m*x + b
|
500
|
+
elif v["expression"] == "mx":
|
501
|
+
m = v["parameters"]["m"]
|
502
|
+
regs[k]["parameters"]["m"] = m/self.dt_ratio
|
503
|
+
m = regs[k]["parameters"]["m"]
|
504
|
+
regs[k]["model"] = lambda x: m*x
|
505
|
+
|
506
|
+
elif v["expression"] == "d*x^alpha":
|
507
|
+
d = v["parameters"]["d"]
|
508
|
+
alpha = v["parameters"]["alpha"]
|
509
|
+
regs[k]["parameters"]["d"] = d/(self.dt_ratio**alpha)
|
510
|
+
d = regs[k]["parameters"]["d"]
|
511
|
+
regs[k]["model"] = lambda x: d*(x**alpha)
|
512
|
+
|
476
513
|
# Sets of mutation types used in the analysis
|
477
514
|
_sets = sorted({
|
478
515
|
" ".join(x.split()[1:])
|
479
516
|
for x in stats.columns[1:-1]
|
480
517
|
})
|
481
518
|
|
519
|
+
stats["dt_idx"] = (stats["date"] - stats["date"].min()) / pd.Timedelta("7D")
|
520
|
+
|
482
521
|
# Plot the results
|
483
522
|
if show:
|
484
523
|
# For each set of mutation types
|
485
524
|
for _type in _sets:
|
486
525
|
self.plot_results(
|
487
|
-
stats[["date", f"mean {_type}", f"var {_type}"]],
|
526
|
+
stats[["date", "dt_idx", f"mean {_type}", f"var {_type}"]],
|
488
527
|
{
|
489
528
|
k: v
|
490
529
|
for k, v in regs.items()
|
491
530
|
if k in (
|
492
|
-
f"mean {_type}
|
493
|
-
f"scaled var {_type}
|
531
|
+
f"mean {_type} model",
|
532
|
+
f"scaled var {_type} model"
|
494
533
|
)
|
495
534
|
},
|
496
|
-
|
535
|
+
"wk",
|
536
|
+
self.dt_ratio
|
497
537
|
)
|
538
|
+
|
498
539
|
# Export the plots
|
499
540
|
if export_plots_filename:
|
500
541
|
# Open pdf file pointer
|
@@ -502,19 +543,22 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
|
|
502
543
|
# For each set of mutation types save the plots
|
503
544
|
for _type in _sets:
|
504
545
|
self.export_plot_results(
|
505
|
-
stats[["date", f"mean {_type}", f"var {_type}"]],
|
546
|
+
stats[["date", "dt_idx", f"mean {_type}", f"var {_type}"]],
|
506
547
|
{
|
507
548
|
k: v
|
508
549
|
for k, v in regs.items()
|
509
550
|
if k in (
|
510
|
-
f"mean {_type}
|
511
|
-
f"scaled var {_type}
|
551
|
+
f"mean {_type} model",
|
552
|
+
f"scaled var {_type} model"
|
512
553
|
)
|
513
554
|
},
|
514
|
-
|
555
|
+
"wk",
|
556
|
+
self.dt_ratio,
|
515
557
|
pdf
|
516
558
|
)
|
517
559
|
# Close pdf file pointer
|
518
560
|
pdf.close()
|
519
561
|
|
520
562
|
return stats, regs
|
563
|
+
|
564
|
+
|
PyEvoMotion/core/parser.py
CHANGED
@@ -153,11 +153,14 @@ class PyEvoMotionParser():
|
|
153
153
|
mod
|
154
154
|
for mod in x
|
155
155
|
if start - 1 < int(mod.split("_")[1]) < end
|
156
|
-
]
|
156
|
+
] if x else ["NO_MUTATION"]
|
157
157
|
)
|
158
158
|
self.data = self.data[
|
159
159
|
self.data["mutation instructions"].apply(len) > 0
|
160
160
|
]
|
161
|
+
self.data["mutation instructions"] = self.data["mutation instructions"].apply(
|
162
|
+
lambda x: [] if x == ["NO_MUTATION"] else x
|
163
|
+
)
|
161
164
|
|
162
165
|
def filter_columns(self, filters: dict[str, list[str] | str]) -> None:
|
163
166
|
"""
|