PyEvoMotion 0.1.0-py3-none-any.whl → 0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyEvoMotion/cli.py +88 -11
- PyEvoMotion/core/base.py +373 -34
- PyEvoMotion/core/core.py +136 -43
- PyEvoMotion/core/parser.py +4 -1
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/METADATA +72 -4
- pyevomotion-0.1.2.dist-info/RECORD +35 -0
- share/analyze_model_selection_accuracy.py +316 -0
- share/analyze_test_runs.py +436 -0
- share/anomalous_diffusion.pdf +0 -0
- share/confusion_matrix_heatmap.pdf +0 -0
- share/figUK.tsv +9949 -0
- share/figUK_plots.pdf +0 -0
- share/figUK_regression_results.json +65 -0
- share/figUK_run_args.json +14 -0
- share/figUK_stats.tsv +41 -0
- share/figUSA.tsv +9470 -0
- share/figUSA_plots.pdf +0 -0
- share/figUSA_regression_results.json +65 -0
- share/figUSA_run_args.json +14 -0
- share/figUSA_stats.tsv +34 -0
- share/figdataUK.tsv +10001 -0
- share/figdataUSA.tsv +10001 -0
- share/generate_sequences_from_synthdata.py +85 -0
- share/generate_sequences_from_test5_data.py +107 -0
- share/manuscript_figure.py +858 -43
- share/run_parallel_analysis.py +196 -0
- share/synth_figure.pdf +0 -0
- share/uk_time_windows.pdf +0 -0
- share/weekly_size.pdf +0 -0
- pyevomotion-0.1.0.dist-info/RECORD +0 -13
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/WHEEL +0 -0
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/entry_points.txt +0 -0
PyEvoMotion/core/base.py
CHANGED
@@ -2,7 +2,7 @@ import numpy as np
 import pandas as pd
 from sklearn.metrics import r2_score
 from scipy.optimize import curve_fit
-from scipy.stats import f as snedecor_f
+from scipy.stats import f as snedecor_f, t as t_dist
 from sklearn.linear_model import LinearRegression
 
 
@@ -102,7 +102,7 @@ class PyEvoMotionBase():
         print(f"Method {method} not found in {instance}")
 
     @staticmethod
-    def _remove_nan(x: pd.Series, y: pd.Series) -> tuple[np.ndarray, np.ndarray]:
+    def _remove_nan(x: pd.Series, y: pd.Series, z: pd.Series) -> tuple[np.ndarray, np.ndarray]:
         """
         Remove NaN values from two pandas Series and return them as numpy arrays.
 
@@ -110,22 +110,77 @@ class PyEvoMotionBase():
         :type x: pd.Series
         :param y: the second pandas Series.
         :type y: pd.Series
+        :param z: the third pandas Series.
+        :type z: pd.Series
         :return: a tuple with the two pandas Series without NaN values.
         :rtype: tuple[np.ndarray,np.ndarray]
         """
 
-        data = pd.DataFrame({"x": x, "y": y}).dropna()
+        data = pd.DataFrame({"x": x, "y": y, "z": z}).dropna()
 
         x = data["x"].to_numpy().reshape(-1, 1)
         y = data["y"].to_numpy().reshape(-1, 1)
+        z = data["z"].to_numpy().reshape(-1, 1)
+        return x, y, z
 
-
+    @staticmethod
+    def _weighting_function(n: int, n_0: int = 30) -> np.ndarray:
+        """
+        Weighting function for the data points.
+
+        :param n: The number of data points.
+        :type n: int
+        :param n_0: The number of data points at which the weighting function approximates the constant 1. Default is 30.
+        :type n_0: int
+        :return: The weighting function.
+        :rtype: np.ndarray
+        """
+
+        return np.tanh(2*n/n_0)
+
+    @staticmethod
+    def _compute_confidence_intervals(
+        parameters: dict[str, float],
+        standard_errors: dict[str, float],
+        degrees_of_freedom: int,
+        confidence_level: float = 0.95
+    ) -> dict[str, tuple[float, float]]:
+        """
+        Compute confidence intervals for parameters using t-distribution.
+
+        :param parameters: Dictionary of parameter names and their estimated values.
+        :type parameters: dict[str, float]
+        :param standard_errors: Dictionary of parameter names and their standard errors.
+        :type standard_errors: dict[str, float]
+        :param degrees_of_freedom: Degrees of freedom for the t-distribution.
+        :type degrees_of_freedom: int
+        :param confidence_level: Confidence level for the intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
+        :return: Dictionary with parameter names as keys and (lower_bound, upper_bound) tuples as values.
+        :rtype: dict[str, tuple[float, float]]
+        """
+        alpha = 1 - confidence_level
+        t_val = t_dist.ppf(1 - alpha/2, degrees_of_freedom)
+
+        confidence_intervals = {}
+        for param_name in parameters.keys():
+            param_value = parameters[param_name]
+            param_se = standard_errors[param_name]
+            margin_of_error = t_val * param_se
+            confidence_intervals[param_name] = (
+                param_value - margin_of_error,
+                param_value + margin_of_error
+            )
+
+        return confidence_intervals
 
     @classmethod
     def linear_regression(cls,
         x: np.ndarray,
         y: np.ndarray,
-        fit_intercept: bool = True
+        weights: np.ndarray | None = None,
+        fit_intercept: bool = True,
+        confidence_level: float = 0.95
     ) -> dict[str, any]:
         """
         Perform a linear regression on a set of data.
@@ -136,16 +191,58 @@ class PyEvoMotionBase():
         :type y: np.ndarray
         :param fit_intercept: Whether to fit the intercept. Default is ``True``.
         :type fit_intercept: bool
+        :param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
+        :type weights: np.ndarray | None
+        :param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
         :return: A dictionary containing:
 
             * ``model``: A ``lambda`` function that computes predictions based on the fitted model.
             * ``parameters``: A dictionary with the slope of the regression line.
+            * ``confidence_intervals``: A dictionary with confidence intervals for each parameter.
             * ``expression``: A string representation of the regression equation.
            * ``r2``: The :math:`R^2` score of the regression.
         :rtype: ``dict[str, any]``
         """
 
-        reg = LinearRegression(fit_intercept=fit_intercept).fit(x, y)
+        _weights = cls._weighting_function(weights).flatten() if weights is not None else None
+
+        reg = LinearRegression(fit_intercept=fit_intercept).fit(x, y, sample_weight=_weights)
+
+        # Calculate confidence intervals
+        n = len(x)
+        _df = n - (2 if fit_intercept else 1) # degrees of freedom
+
+        # Calculate residuals and MSE
+        y_pred = reg.predict(x)
+        residuals = y.flatten() - y_pred.flatten()
+
+        if _weights is not None:
+            # Weighted MSE
+            mse = np.sum(_weights * residuals**2) / (np.sum(_weights) - (2 if fit_intercept else 1))
+        else:
+            mse = np.sum(residuals**2) / _df
+
+        # Calculate standard errors
+        x_flat = x.flatten()
+        x_mean = np.mean(x_flat)
+        sxx = np.sum((x_flat - x_mean)**2)
+
+        # Standard error for slope
+        se_slope = np.sqrt(mse / sxx)
+
+        parameters = {"m": reg.coef_[0][0]}
+        standard_errors = {"m": se_slope}
+
+        if fit_intercept:
+            se_intercept = np.sqrt(mse * (1/n + x_mean**2/sxx))
+            parameters["b"] = reg.intercept_[0]
+            standard_errors["b"] = se_intercept
+
+        # Compute confidence intervals using the abstracted method
+        confidence_intervals = cls._compute_confidence_intervals(
+            parameters, standard_errors, _df, confidence_level
+        )
 
         if fit_intercept:
             model = {
@@ -154,7 +251,9 @@ class PyEvoMotionBase():
                     "m": reg.coef_[0][0],
                     "b": reg.intercept_[0]
                 },
-                "expression": "mx + b"
+                "confidence_intervals": confidence_intervals,
+                "expression": "mx + b",
+                "confidence_level": confidence_level
             }
 
         else:
@@ -163,10 +262,12 @@ class PyEvoMotionBase():
                 "parameters": {
                     "m": reg.coef_[0][0],
                 },
-                "expression": "mx"
+                "confidence_intervals": confidence_intervals,
+                "expression": "mx",
+                "confidence_level": confidence_level
             }
 
-        model["r2"] = r2_score(y, reg.predict(x))
+        model["r2"] = r2_score(y, reg.predict(x), sample_weight=_weights)
 
         return model
 
@@ -192,36 +293,97 @@ class PyEvoMotionBase():
         return a*np.power(x, b)
 
     @classmethod
-    def power_law_fit(cls, x: np.ndarray, y: np.ndarray) -> dict[str, any]:
+    def power_law_fit(cls, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, confidence_level: float = 0.95) -> dict[str, any]:
         """
         Perform a power law fit on a set of data.
+
+        This method fits a power law model of the form :math:`y = d \\cdot x^{\\alpha}` to the data.
+        Initial parameter estimates are obtained via linear regression on log-transformed data,
+        which provides better convergence than default initialization.
 
         :param x: A numpy array of the features.
         :type x: np.ndarray
         :param y: A numpy array of the target.
         :type y: np.ndarray
+        :param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
+        :type weights: np.ndarray | None
+        :param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
         :return: A dictionary containing:
 
             * ``model``: A ``lambda`` function that computes predictions based on the fitted model.
-            * ``parameters``: A dictionary with the parameters of the fitted power law.
+            * ``parameters``: A dictionary with the parameters of the fitted power law (``d`` and ``alpha``).
+            * ``confidence_intervals``: A dictionary with confidence intervals for each parameter.
             * ``expression``: A string representation of the regression equation.
             * ``r2``: The :math:`R^2` score of the regression.
+            * ``confidence_level``: The confidence level used for the confidence intervals.
         :rtype: ``dict[str, any]``
         """
 
+        _weights = cls._weighting_function(weights).flatten() if weights is not None else None
+
+        # Provide good initial parameter guesses for power law
+        # Use linear regression on log-transformed data to get initial estimates
+        x_flat = x.T.tolist()[0]
+        y_flat = y.T.tolist()[0]
+        mask = (np.array(x_flat) > 0) & (np.array(y_flat) > 0)
+        x_log = np.log(np.array(x_flat)[mask])
+        y_log = np.log(np.array(y_flat)[mask])
+
+        # Linear regression on log-transformed data: log(y) = log(d) + alpha*log(x)
+        # This gives us initial estimates for d and alpha
+        if len(x_log) > 1:
+            reg = LinearRegression(fit_intercept=True).fit(x_log.reshape(-1, 1), y_log.reshape(-1, 1))
+
+            p0 = [np.exp(reg.intercept_[0]), reg.coef_[0][0]] # [d, alpha]
+        else:
+            p0 = [1.0, 1.0] # Default fallback
+
+        # Set reasonable bounds for power law parameters
+        # d > 0 (coefficient must be positive)
+        # alpha can be any real number, but constrain to reasonable range
+        bounds = ([1e-10, -10], [np.inf, 10]) # [d_min, alpha_min], [d_max, alpha_max]
+
         try:
-            _popt, _, _, _msg, _ier = curve_fit(
+            _popt, _pcov, _, _msg, _ier = curve_fit(
                 cls._power_law,
-                x.T.tolist()[0], y.T.tolist()[0],
+                x_flat, y_flat,
+                p0=p0,
+                bounds=bounds,
+                sigma=1/np.sqrt(_weights) if _weights is not None else None,
                 full_output=True
             )
         except RuntimeError as e:
             _ier = 0
             _msg = str(e)
+            _pcov = np.array([[np.inf, 0], [0, np.inf]])
 
         if _ier not in range(1, 5):
             print(f"{_msg}")
             _popt = [0, 0]
+            _pcov = np.array([[np.inf, 0], [0, np.inf]])
+
+        # Calculate confidence intervals from covariance matrix
+        n = len(x)
+        df = n - 2 # degrees of freedom for 2 parameters
+
+        # Standard errors from covariance matrix diagonal
+        param_errors = np.sqrt(np.diag(_pcov))
+
+        # Prepare parameters and standard errors for confidence interval computation
+        parameters = {
+            "d": _popt[0],
+            "alpha": _popt[1]
+        }
+        standard_errors = {
+            "d": param_errors[0],
+            "alpha": param_errors[1]
+        }
+
+        # Compute confidence intervals using the abstracted method
+        confidence_intervals = cls._compute_confidence_intervals(
+            parameters, standard_errors, df, confidence_level
+        )
 
         model = {
             "model": lambda x: _popt[0]*np.power(x, _popt[1]),
@@ -229,17 +391,21 @@ class PyEvoMotionBase():
                 "d": _popt[0],
                 "alpha": _popt[1]
             },
+            "confidence_intervals": confidence_intervals,
             "expression": "d*x^alpha",
-            "r2": r2_score(y, cls._power_law(x, *_popt))
+            "confidence_level": confidence_level,
+            "r2": r2_score(y, cls._power_law(x, *_popt), sample_weight=_weights)
         }
 
         return model
 
-    @staticmethod
+    @classmethod
     def F_test(
+        cls,
         model1: dict[str,any],
         model2: dict[str,any],
-        data: np.ndarray
+        data: np.ndarray,
+        weights: np.ndarray | None = None
     ) -> tuple[float, float]:
         """
         Perform an F-test between two models.
@@ -257,6 +423,11 @@ class PyEvoMotionBase():
         """
 
         data = data.flatten()
+
+        if weights is not None:
+            _weights = cls._weighting_function(weights.flatten())
+        else:
+            _weights = np.ones(len(data))
 
         # Note that p1 < p2 always. Won't do an assertion because I'm making sure elsewhere that the linear model does not have an intercept, i.e. it only has the slope
         p1 = len(model1["parameters"])
@@ -278,20 +449,112 @@ class PyEvoMotionBase():
         )
 
         # Sum the residuals without the infinite values
-        RSS1 = np.sum(RS1, where=~mask)
-        RSS2 = np.sum(RS2, where=~mask)
+        RSS1 = np.sum(_weights*RS1, where=~mask)
+        RSS2 = np.sum(_weights*RS2, where=~mask)
 
         F = ((RSS1 - RSS2)/(p2 - p1))/(RSS2/(n - p2))
 
-        return F, 1 - snedecor_f.cdf(F, p2 - p1, n - p2)
+        return F, 1 - snedecor_f.cdf(F, p2 - p1, n - p2)
 
+    @classmethod
+    def AIC(
+        cls,
+        model1: dict[str,any],
+        model2: dict[str,any],
+        data: np.ndarray,
+        weights: np.ndarray | None = None
+    ) -> tuple[float, float]:
+        """
+        Perform an AIC test between two models.
+
+        Uses the small-sample corrected AIC with full constant terms:
+        AICc = n*ln(2*pi) + n*ln(RSS/n) + n + 2k + [2k(k+1)]/(n-k-1)
+
+        See https://en.wikipedia.org/wiki/Akaike_information_criterion for more details.
+
+        :param model1: The first model.
+        :type model1: dict[str, any]
+        :param model2: The second model.
+        :type model2: dict[str, any]
+        :param data: The data to test the models.
+        :type data: np.ndarray
+        :return: A tuple with the F-value and the p-value.
+        :rtype: ``tuple[float, float]``
+        """
+
+        data = data.flatten()
+
+        if weights is not None:
+            _weights = cls._weighting_function(weights.flatten())
+        else:
+            _weights = np.ones(len(data))
+
+        k1 = len(model1["parameters"])
+        k2 = len(model2["parameters"])
+        n = len(data)
+
+        model1 = np.vectorize(model1["model"])
+        model2 = np.vectorize(model2["model"])
+
+        RS1 = (data - model1(range(n)))**2
+        RS2 = (data - model2(range(n)))**2
+
+        # Mask the infinite and nan values
+        mask = (
+            np.isinf(RS1)
+            | np.isinf(RS2)
+            | np.isnan(RS1)
+            | np.isnan(RS2)
+        )
+
+        # Sum the residuals without the infinite values
+        RSS1 = np.sum(_weights*RS1, where=~mask)
+        RSS2 = np.sum(_weights*RS2, where=~mask)
+
+        # Handle edge case where RSS is 0 (perfect fit) to avoid log(0)
+        if RSS1 == 0:
+            RSS1 = 1e-10 # Small positive value to avoid log(0)
+        if RSS2 == 0:
+            RSS2 = 1e-10 # Small positive value to avoid log(0)
+
+        const_term = n * (np.log(2*np.pi) + 1.0)
+        denom1 = n - k1 - 1
+        denom2 = n - k2 - 1
+
+        # If denom <= 0, AICc is undefined; treat as +inf (no support)
+        if denom1 <= 0:
+            AICc1 = np.inf
+        else:
+            AICc1 = const_term + n * np.log(RSS1 / n) + 2 * k1 + (2 * k1 * (k1 + 1)) / denom1
+
+        if denom2 <= 0:
+            AICc2 = np.inf
+        else:
+            AICc2 = const_term + n * np.log(RSS2 / n) + 2 * k2 + (2 * k2 * (k2 + 1)) / denom2
+
+        # ΔAIC: relative to best (lowest AIC)
+        min_aicc = min(AICc1, AICc2)
+        dAICc1 = AICc1 - min_aicc
+        dAICc2 = AICc2 - min_aicc
+
+        # Akaike weights
+        rel1 = np.exp(-0.5 * dAICc1) if np.isfinite(dAICc1) else 0
+        rel2 = np.exp(-0.5 * dAICc2) if np.isfinite(dAICc2) else 0
+        denom = rel1 + rel2 if (rel1 + rel2) > 0 else 1.0
+        w1 = rel1 / denom
+        w2 = rel2 / denom
+
+        return AICc1, AICc2, dAICc1, dAICc2, w1, w2
+
     @classmethod
     def adjust_model(cls,
         x: pd.Series,
         y: pd.Series,
-        name: str = None
+        name: str = None,
+        weights: pd.Series | None = None,
+        confidence_level: float = 0.95
     ) -> dict[str, any]:
-        """Adjust a model to the data.
+        """Adjust a model to the data using AIC for model selection.
 
         :param x: The features. It is a single pandas Series.
         :type x: pd.Series
@@ -299,12 +562,25 @@ class PyEvoMotionBase():
         :type y: pd.Series
         :param name: The name of the data. Default is ``None``.
         :type name: str
-        :return:
+        :param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
+        :type weights: np.ndarray | None
+        :param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
+        :return: A dictionary containing:
+
+            * If name is provided: A dictionary with the name as key and the result dictionary as value
+            * If name is None: A dictionary containing:
+
+                * ``selected_model``: The selected model based on lowest AIC
+                * ``linear_model``: The linear regression model with AIC statistics
+                * ``power_law_model``: The power law model with AIC statistics
+                * ``model_selection``: Dictionary with AIC comparison results
+
         :rtype: ``dict[str, any]``
         :raises ValueError: If the dataset is empty or full of NaN values. This may occur if the grouped data contains only one entry per group, indicating that the variance cannot be computed.
         """
 
-        x,y = cls._remove_nan(x, y)
+        x,y,w = cls._remove_nan(x, y, weights)
 
         # Raises an error if the dataset is (almost) empty at this point
         if (x.size <= 1) or (y.size <= 1):
@@ -313,20 +589,57 @@ class PyEvoMotionBase():
                 f"Dataset length after filtering is: x: {x.size} elements; y: {y.size} elements. In particular:\n\nx: {x}\ny: {y}\n\nPerhaps NaN appeared for certain entries. Check if the grouped data contains only one entry per group, as this may cause NaN values when computing the variance. Also, consider widening the time window."
             )
 
-        model1 = cls.linear_regression(x, y, fit_intercept=False) # Not fitting the intercept because data is passed scaled to the minimum
-        model2 = cls.power_law_fit(x, y)
+        model1 = cls.linear_regression(x, y, weights=w, fit_intercept=False, confidence_level=confidence_level) # Not fitting the intercept because data is passed scaled to the minimum
+        model2 = cls.power_law_fit(x, y, weights=w, confidence_level=confidence_level)
 
-
+        # Compute AIC statistics for both models
+        AIC1, AIC2, dAIC1, dAIC2, w1, w2 = cls.AIC(model1, model2, y, weights=w)
 
-
-
+        # Select model with lowest AIC (highest Akaike weight)
+        if AIC1 <= AIC2:
+            selected_model = model1
+            selected_model_name = "linear"
         else:
-
+            selected_model = model2
+            selected_model_name = "power_law"
+
+        # Add AIC statistics to each model
+        model1_with_aic = model1.copy()
+        model1_with_aic.update({
+            "AIC": AIC1,
+            "delta_AIC": dAIC1,
+            "akaike_weight": w1,
+            "confidence_level": confidence_level
+        })
+
+        model2_with_aic = model2.copy()
+        model2_with_aic.update({
+            "AIC": AIC2,
+            "delta_AIC": dAIC2,
+            "akaike_weight": w2,
+            "confidence_level": confidence_level
+        })
+
+        # Create comprehensive result dictionary
+        result = {
+            "selected_model": selected_model,
+            "linear_model": model1_with_aic,
+            "power_law_model": model2_with_aic,
+            "model_selection": {
+                "selected": selected_model_name,
+                "linear_AIC": AIC1,
+                "power_law_AIC": AIC2,
+                "delta_AIC_linear": dAIC1,
+                "delta_AIC_power_law": dAIC2,
+                "akaike_weight_linear": w1,
+                "akaike_weight_power_law": w2
+            }
+        }
 
         if name:
-            return {name:
+            return {name: result}
         else:
-            return
+            return result
 
     @staticmethod
     def plot_single_data_and_model(
@@ -337,6 +650,7 @@ class PyEvoMotionBase():
         model_label: str,
         data_xlabel_units: str,
         ax: any,
+        dt_ratio: float,
         **kwargs: dict[str, any]
     ) -> None:
         """
@@ -376,13 +690,13 @@ class PyEvoMotionBase():
                 point_kwargs[_k] = kwargs[k]
 
         ax.scatter(
-            data_x,
+            data_x.to_numpy()*dt_ratio,
             data_y,
             **point_kwargs
         )
         ax.plot(
-            data_x,
-            model(data_x),
+            data_x.to_numpy()*dt_ratio,
+            model(data_x.to_numpy()*dt_ratio),
             label=model_label,
             **line_kwargs
         )
@@ -404,3 +718,28 @@ class PyEvoMotionBase():
             raise ValueError(
                 f"The dataset is (almost) empty at this point of the analysis.\n{msg}"
             )
+
+    @staticmethod
+    def _get_time_ratio(dt: str, reference: str = "7D") -> float:
+        """Get the ratio of a time interval with respect to a reference interval.
+
+        :param dt: Time interval string (e.g. "5D", "7D", "10D", "14D", "12H")
+        :type dt: str
+        :param reference: Reference time interval string. Default is "7D".
+        :type reference: str
+        :return: The ratio of dt to reference
+        :rtype: float
+        """
+
+        return pd.Timedelta(dt) / pd.Timedelta(reference)
+
+    @classmethod
+    def _verify_dt(cls, dt: str) -> None:
+        """Verify that the time window string is greater than 1 day.
+
+        :param dt: Time window string (e.g. "5D", "7D", "10D", "14D")
+        :type dt: str
+        :raises ValueError: If the time window is not greater than 1 day
+        """
+        if cls._get_time_ratio(dt, "1D") <= 1:
+            raise ValueError(f"Time window must be greater than 1 day. Got {dt}")