PyEvoMotion 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PKG-INFO +1 -1
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/cli.py +87 -3
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/core/base.py +296 -20
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/core/core.py +73 -24
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/pyproject.toml +1 -1
- pyevomotion-0.1.2/share/analyze_model_selection_accuracy.py +316 -0
- pyevomotion-0.1.2/share/analyze_test_runs.py +436 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/anomalous_diffusion.pdf +0 -0
- pyevomotion-0.1.2/share/confusion_matrix_heatmap.pdf +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figUK_plots.pdf +0 -0
- pyevomotion-0.1.2/share/figUK_regression_results.json +65 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figUK_run_args.json +1 -0
- pyevomotion-0.1.2/share/figUK_stats.tsv +41 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figUSA_plots.pdf +0 -0
- pyevomotion-0.1.2/share/figUSA_regression_results.json +65 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figUSA_run_args.json +1 -0
- pyevomotion-0.1.2/share/figUSA_stats.tsv +34 -0
- pyevomotion-0.1.2/share/generate_sequences_from_test5_data.py +107 -0
- pyevomotion-0.1.2/share/manuscript_figure.py +1131 -0
- pyevomotion-0.1.2/share/run_parallel_analysis.py +196 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/synth_figure.pdf +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/uk_time_windows.pdf +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/weekly_size.pdf +0 -0
- pyevomotion-0.1.2/tests/test_synthetic_datasets.py +79 -0
- pyevomotion-0.1.1/share/figUK_regression_results.json +0 -18
- pyevomotion-0.1.1/share/figUK_stats.tsv +0 -41
- pyevomotion-0.1.1/share/figUSA_regression_results.json +0 -18
- pyevomotion-0.1.1/share/figUSA_stats.tsv +0 -34
- pyevomotion-0.1.1/share/figure.pdf +0 -0
- pyevomotion-0.1.1/share/manuscript_figure.py +0 -761
- pyevomotion-0.1.1/tests/test_synthetic_datasets.py +0 -55
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/__init__.py +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/core/__init__.py +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/core/parser.py +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/utils.py +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/README.md +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figUK.tsv +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figUSA.tsv +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figdataUK.tsv +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figdataUSA.tsv +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/generate_sequences_from_synthdata.py +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/mafft_install.sh +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/__init__.py +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test1/out_run_args.json +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test1/output/test1.data.tsv +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test1/test1.metadata.tsv +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test1/test1.sequences.fasta +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test2/out_run_args.json +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test2/test2.metadata.parquet.gz +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test3/ids_sampled_for_figure.json +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/helpers/test_UK_USA_dataset_helpers.py +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/helpers/test_parser_helpers.py +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/test_UK_USA_dataset.py +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/test_core.py +0 -0
- {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/test_parser.py +0 -0
|
@@ -248,6 +248,13 @@ def _parse_arguments() -> argparse.Namespace:
|
|
|
248
248
|
action="store_true",
|
|
249
249
|
help="Export the plots of the analysis."
|
|
250
250
|
)
|
|
251
|
+
parser.add_argument(
|
|
252
|
+
"-cl",
|
|
253
|
+
"--confidence_level",
|
|
254
|
+
type=float,
|
|
255
|
+
default=0.95,
|
|
256
|
+
help="Confidence level for parameter confidence intervals (default 0.95 for 95%% CI). Must be between 0 and 1."
|
|
257
|
+
)
|
|
251
258
|
parser.add_argument(
|
|
252
259
|
"-l",
|
|
253
260
|
"--length_filter",
|
|
@@ -357,6 +364,73 @@ def _simple_serializer(k: str, v: any) -> any:
|
|
|
357
364
|
return "..".join(map(lambda x: x.strftime("%Y-%m-%d") if x else "", v))
|
|
358
365
|
return v
|
|
359
366
|
|
|
367
|
+
def _remove_model_functions(obj):
|
|
368
|
+
"""Recursively remove 'model' keys containing lambda functions from nested dictionaries.
|
|
369
|
+
|
|
370
|
+
:param obj: Dictionary or other object to clean
|
|
371
|
+
:type obj: any
|
|
372
|
+
:return: Cleaned object with model functions removed
|
|
373
|
+
:rtype: any
|
|
374
|
+
"""
|
|
375
|
+
if isinstance(obj, dict):
|
|
376
|
+
# Create a copy to avoid modifying during iteration
|
|
377
|
+
cleaned_obj = {}
|
|
378
|
+
for key, value in obj.items():
|
|
379
|
+
if key == "model":
|
|
380
|
+
# Skip lambda model functions - they can't be serialized to JSON
|
|
381
|
+
continue
|
|
382
|
+
elif isinstance(value, dict):
|
|
383
|
+
# Recursively clean nested dictionaries
|
|
384
|
+
cleaned_obj[key] = _remove_model_functions(value)
|
|
385
|
+
else:
|
|
386
|
+
# Keep all other values
|
|
387
|
+
cleaned_obj[key] = value
|
|
388
|
+
return cleaned_obj
|
|
389
|
+
else:
|
|
390
|
+
return obj
|
|
391
|
+
|
|
392
|
+
def _restructure_regression_results(reg_results):
|
|
393
|
+
"""Restructure regression results for cleaner JSON export format.
|
|
394
|
+
|
|
395
|
+
:param reg_results: Raw regression results from analysis
|
|
396
|
+
:type reg_results: dict
|
|
397
|
+
:return: Restructured results with cleaner format
|
|
398
|
+
:rtype: dict
|
|
399
|
+
"""
|
|
400
|
+
restructured = {}
|
|
401
|
+
|
|
402
|
+
for key, value in reg_results.items():
|
|
403
|
+
if key.endswith("_full_results"):
|
|
404
|
+
# Extract the base name (remove _full_results suffix)
|
|
405
|
+
base_name = key.replace("_full_results", "")
|
|
406
|
+
|
|
407
|
+
# Create the new structure with only essential fields
|
|
408
|
+
restructured[base_name] = {
|
|
409
|
+
"linear_model": {
|
|
410
|
+
"parameters": value["linear_model"]["parameters"],
|
|
411
|
+
"confidence_intervals": value["linear_model"]["confidence_intervals"],
|
|
412
|
+
"expression": value["linear_model"]["expression"],
|
|
413
|
+
"r2": value["linear_model"]["r2"],
|
|
414
|
+
"confidence_level": value["linear_model"]["confidence_level"]
|
|
415
|
+
},
|
|
416
|
+
"power_law_model": {
|
|
417
|
+
"parameters": value["power_law_model"]["parameters"],
|
|
418
|
+
"confidence_intervals": value["power_law_model"]["confidence_intervals"],
|
|
419
|
+
"expression": value["power_law_model"]["expression"],
|
|
420
|
+
"r2": value["power_law_model"]["r2"],
|
|
421
|
+
"confidence_level": value["power_law_model"]["confidence_level"]
|
|
422
|
+
},
|
|
423
|
+
"model_selection": value["model_selection"]
|
|
424
|
+
}
|
|
425
|
+
else:
|
|
426
|
+
# Keep non-full-results entries as-is (backward compatibility models)
|
|
427
|
+
# But skip them if there's a corresponding _full_results entry
|
|
428
|
+
full_results_key = f"{key}_full_results"
|
|
429
|
+
if full_results_key not in reg_results:
|
|
430
|
+
restructured[key] = value
|
|
431
|
+
|
|
432
|
+
return restructured
|
|
433
|
+
|
|
360
434
|
def _main():
|
|
361
435
|
check_and_install_mafft()
|
|
362
436
|
"""
|
|
@@ -367,6 +441,11 @@ def _main():
|
|
|
367
441
|
print(BANNER)
|
|
368
442
|
args = _parse_arguments()
|
|
369
443
|
|
|
444
|
+
# Validate confidence level
|
|
445
|
+
if not (0 < args.confidence_level < 1):
|
|
446
|
+
parser = _ArgumentParserWithHelpOnError(description=PACKAGE_DESCRIPTION)
|
|
447
|
+
parser.error("Confidence level must be between 0 and 1 (exclusive)")
|
|
448
|
+
|
|
370
449
|
# If the -xj argument is passed, the arguments are exported to a JSON file before running the analysis altogether
|
|
371
450
|
if args.export_json:
|
|
372
451
|
with open(f"{args.out}_run_args.json", "w") as file:
|
|
@@ -406,13 +485,18 @@ def _main():
|
|
|
406
485
|
f"{args.out}_plots"
|
|
407
486
|
if args.export_plots
|
|
408
487
|
else None
|
|
409
|
-
)
|
|
488
|
+
),
|
|
489
|
+
confidence_level=args.confidence_level
|
|
410
490
|
)
|
|
411
491
|
|
|
412
492
|
_reg = reg.copy()
|
|
413
493
|
|
|
414
|
-
|
|
415
|
-
|
|
494
|
+
# First restructure the results to the desired export format
|
|
495
|
+
_reg = _restructure_regression_results(_reg)
|
|
496
|
+
|
|
497
|
+
# Then apply the cleaning function to remove lambda functions
|
|
498
|
+
for k in list(_reg.keys()):
|
|
499
|
+
_reg[k] = _remove_model_functions(_reg[k])
|
|
416
500
|
|
|
417
501
|
# Exports the statistic results to TSV file
|
|
418
502
|
stats.to_csv(
|
|
@@ -2,7 +2,7 @@ import numpy as np
|
|
|
2
2
|
import pandas as pd
|
|
3
3
|
from sklearn.metrics import r2_score
|
|
4
4
|
from scipy.optimize import curve_fit
|
|
5
|
-
from scipy.stats import f as snedecor_f
|
|
5
|
+
from scipy.stats import f as snedecor_f, t as t_dist
|
|
6
6
|
from sklearn.linear_model import LinearRegression
|
|
7
7
|
|
|
8
8
|
|
|
@@ -138,12 +138,49 @@ class PyEvoMotionBase():
|
|
|
138
138
|
|
|
139
139
|
return np.tanh(2*n/n_0)
|
|
140
140
|
|
|
141
|
+
@staticmethod
|
|
142
|
+
def _compute_confidence_intervals(
|
|
143
|
+
parameters: dict[str, float],
|
|
144
|
+
standard_errors: dict[str, float],
|
|
145
|
+
degrees_of_freedom: int,
|
|
146
|
+
confidence_level: float = 0.95
|
|
147
|
+
) -> dict[str, tuple[float, float]]:
|
|
148
|
+
"""
|
|
149
|
+
Compute confidence intervals for parameters using t-distribution.
|
|
150
|
+
|
|
151
|
+
:param parameters: Dictionary of parameter names and their estimated values.
|
|
152
|
+
:type parameters: dict[str, float]
|
|
153
|
+
:param standard_errors: Dictionary of parameter names and their standard errors.
|
|
154
|
+
:type standard_errors: dict[str, float]
|
|
155
|
+
:param degrees_of_freedom: Degrees of freedom for the t-distribution.
|
|
156
|
+
:type degrees_of_freedom: int
|
|
157
|
+
:param confidence_level: Confidence level for the intervals (default 0.95 for 95% CI).
|
|
158
|
+
:type confidence_level: float
|
|
159
|
+
:return: Dictionary with parameter names as keys and (lower_bound, upper_bound) tuples as values.
|
|
160
|
+
:rtype: dict[str, tuple[float, float]]
|
|
161
|
+
"""
|
|
162
|
+
alpha = 1 - confidence_level
|
|
163
|
+
t_val = t_dist.ppf(1 - alpha/2, degrees_of_freedom)
|
|
164
|
+
|
|
165
|
+
confidence_intervals = {}
|
|
166
|
+
for param_name in parameters.keys():
|
|
167
|
+
param_value = parameters[param_name]
|
|
168
|
+
param_se = standard_errors[param_name]
|
|
169
|
+
margin_of_error = t_val * param_se
|
|
170
|
+
confidence_intervals[param_name] = (
|
|
171
|
+
param_value - margin_of_error,
|
|
172
|
+
param_value + margin_of_error
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
return confidence_intervals
|
|
176
|
+
|
|
141
177
|
@classmethod
|
|
142
178
|
def linear_regression(cls,
|
|
143
179
|
x: np.ndarray,
|
|
144
180
|
y: np.ndarray,
|
|
145
181
|
weights: np.ndarray | None = None,
|
|
146
|
-
fit_intercept: bool = True
|
|
182
|
+
fit_intercept: bool = True,
|
|
183
|
+
confidence_level: float = 0.95
|
|
147
184
|
) -> dict[str, any]:
|
|
148
185
|
"""
|
|
149
186
|
Perform a linear regression on a set of data.
|
|
@@ -156,10 +193,13 @@ class PyEvoMotionBase():
|
|
|
156
193
|
:type fit_intercept: bool
|
|
157
194
|
:param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
|
|
158
195
|
:type weights: np.ndarray | None
|
|
196
|
+
:param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
|
|
197
|
+
:type confidence_level: float
|
|
159
198
|
:return: A dictionary containing:
|
|
160
199
|
|
|
161
200
|
* ``model``: A ``lambda`` function that computes predictions based on the fitted model.
|
|
162
201
|
* ``parameters``: A dictionary with the slope of the regression line.
|
|
202
|
+
* ``confidence_intervals``: A dictionary with confidence intervals for each parameter.
|
|
163
203
|
* ``expression``: A string representation of the regression equation.
|
|
164
204
|
* ``r2``: The :math:`R^2` score of the regression.
|
|
165
205
|
:rtype: ``dict[str, any]``
|
|
@@ -169,6 +209,41 @@ class PyEvoMotionBase():
|
|
|
169
209
|
|
|
170
210
|
reg = LinearRegression(fit_intercept=fit_intercept).fit(x, y, sample_weight=_weights)
|
|
171
211
|
|
|
212
|
+
# Calculate confidence intervals
|
|
213
|
+
n = len(x)
|
|
214
|
+
_df = n - (2 if fit_intercept else 1) # degrees of freedom
|
|
215
|
+
|
|
216
|
+
# Calculate residuals and MSE
|
|
217
|
+
y_pred = reg.predict(x)
|
|
218
|
+
residuals = y.flatten() - y_pred.flatten()
|
|
219
|
+
|
|
220
|
+
if _weights is not None:
|
|
221
|
+
# Weighted MSE
|
|
222
|
+
mse = np.sum(_weights * residuals**2) / (np.sum(_weights) - (2 if fit_intercept else 1))
|
|
223
|
+
else:
|
|
224
|
+
mse = np.sum(residuals**2) / _df
|
|
225
|
+
|
|
226
|
+
# Calculate standard errors
|
|
227
|
+
x_flat = x.flatten()
|
|
228
|
+
x_mean = np.mean(x_flat)
|
|
229
|
+
sxx = np.sum((x_flat - x_mean)**2)
|
|
230
|
+
|
|
231
|
+
# Standard error for slope
|
|
232
|
+
se_slope = np.sqrt(mse / sxx)
|
|
233
|
+
|
|
234
|
+
parameters = {"m": reg.coef_[0][0]}
|
|
235
|
+
standard_errors = {"m": se_slope}
|
|
236
|
+
|
|
237
|
+
if fit_intercept:
|
|
238
|
+
se_intercept = np.sqrt(mse * (1/n + x_mean**2/sxx))
|
|
239
|
+
parameters["b"] = reg.intercept_[0]
|
|
240
|
+
standard_errors["b"] = se_intercept
|
|
241
|
+
|
|
242
|
+
# Compute confidence intervals using the abstracted method
|
|
243
|
+
confidence_intervals = cls._compute_confidence_intervals(
|
|
244
|
+
parameters, standard_errors, _df, confidence_level
|
|
245
|
+
)
|
|
246
|
+
|
|
172
247
|
if fit_intercept:
|
|
173
248
|
model = {
|
|
174
249
|
"model": lambda x: reg.coef_[0][0]*x + reg.intercept_[0],
|
|
@@ -176,7 +251,9 @@ class PyEvoMotionBase():
|
|
|
176
251
|
"m": reg.coef_[0][0],
|
|
177
252
|
"b": reg.intercept_[0]
|
|
178
253
|
},
|
|
179
|
-
"
|
|
254
|
+
"confidence_intervals": confidence_intervals,
|
|
255
|
+
"expression": "mx + b",
|
|
256
|
+
"confidence_level": confidence_level
|
|
180
257
|
}
|
|
181
258
|
|
|
182
259
|
else:
|
|
@@ -185,7 +262,9 @@ class PyEvoMotionBase():
|
|
|
185
262
|
"parameters": {
|
|
186
263
|
"m": reg.coef_[0][0],
|
|
187
264
|
},
|
|
188
|
-
"
|
|
265
|
+
"confidence_intervals": confidence_intervals,
|
|
266
|
+
"expression": "mx",
|
|
267
|
+
"confidence_level": confidence_level
|
|
189
268
|
}
|
|
190
269
|
|
|
191
270
|
model["r2"] = r2_score(y, reg.predict(x), sample_weight=_weights)
|
|
@@ -214,9 +293,13 @@ class PyEvoMotionBase():
|
|
|
214
293
|
return a*np.power(x, b)
|
|
215
294
|
|
|
216
295
|
@classmethod
|
|
217
|
-
def power_law_fit(cls, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None) -> dict[str, any]:
|
|
296
|
+
def power_law_fit(cls, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, confidence_level: float = 0.95) -> dict[str, any]:
|
|
218
297
|
"""
|
|
219
298
|
Perform a power law fit on a set of data.
|
|
299
|
+
|
|
300
|
+
This method fits a power law model of the form :math:`y = d \\cdot x^{\\alpha}` to the data.
|
|
301
|
+
Initial parameter estimates are obtained via linear regression on log-transformed data,
|
|
302
|
+
which provides better convergence than default initialization.
|
|
220
303
|
|
|
221
304
|
:param x: A numpy array of the features.
|
|
222
305
|
:type x: np.ndarray
|
|
@@ -224,31 +307,83 @@ class PyEvoMotionBase():
|
|
|
224
307
|
:type y: np.ndarray
|
|
225
308
|
:param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
|
|
226
309
|
:type weights: np.ndarray | None
|
|
310
|
+
:param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
|
|
311
|
+
:type confidence_level: float
|
|
227
312
|
:return: A dictionary containing:
|
|
228
313
|
|
|
229
314
|
* ``model``: A ``lambda`` function that computes predictions based on the fitted model.
|
|
230
|
-
* ``parameters``: A dictionary with the parameters of the fitted power law.
|
|
315
|
+
* ``parameters``: A dictionary with the parameters of the fitted power law (``d`` and ``alpha``).
|
|
316
|
+
* ``confidence_intervals``: A dictionary with confidence intervals for each parameter.
|
|
231
317
|
* ``expression``: A string representation of the regression equation.
|
|
232
318
|
* ``r2``: The :math:`R^2` score of the regression.
|
|
319
|
+
* ``confidence_level``: The confidence level used for the confidence intervals.
|
|
233
320
|
:rtype: ``dict[str, any]``
|
|
234
321
|
"""
|
|
235
322
|
|
|
236
323
|
_weights = cls._weighting_function(weights).flatten() if weights is not None else None
|
|
237
324
|
|
|
325
|
+
# Provide good initial parameter guesses for power law
|
|
326
|
+
# Use linear regression on log-transformed data to get initial estimates
|
|
327
|
+
x_flat = x.T.tolist()[0]
|
|
328
|
+
y_flat = y.T.tolist()[0]
|
|
329
|
+
mask = (np.array(x_flat) > 0) & (np.array(y_flat) > 0)
|
|
330
|
+
x_log = np.log(np.array(x_flat)[mask])
|
|
331
|
+
y_log = np.log(np.array(y_flat)[mask])
|
|
332
|
+
|
|
333
|
+
# Linear regression on log-transformed data: log(y) = log(d) + alpha*log(x)
|
|
334
|
+
# This gives us initial estimates for d and alpha
|
|
335
|
+
if len(x_log) > 1:
|
|
336
|
+
reg = LinearRegression(fit_intercept=True).fit(x_log.reshape(-1, 1), y_log.reshape(-1, 1))
|
|
337
|
+
|
|
338
|
+
p0 = [np.exp(reg.intercept_[0]), reg.coef_[0][0]] # [d, alpha]
|
|
339
|
+
else:
|
|
340
|
+
p0 = [1.0, 1.0] # Default fallback
|
|
341
|
+
|
|
342
|
+
# Set reasonable bounds for power law parameters
|
|
343
|
+
# d > 0 (coefficient must be positive)
|
|
344
|
+
# alpha can be any real number, but constrain to reasonable range
|
|
345
|
+
bounds = ([1e-10, -10], [np.inf, 10]) # [d_min, alpha_min], [d_max, alpha_max]
|
|
346
|
+
|
|
238
347
|
try:
|
|
239
|
-
_popt,
|
|
348
|
+
_popt, _pcov, _, _msg, _ier = curve_fit(
|
|
240
349
|
cls._power_law,
|
|
241
|
-
|
|
350
|
+
x_flat, y_flat,
|
|
351
|
+
p0=p0,
|
|
352
|
+
bounds=bounds,
|
|
242
353
|
sigma=1/np.sqrt(_weights) if _weights is not None else None,
|
|
243
354
|
full_output=True
|
|
244
355
|
)
|
|
245
356
|
except RuntimeError as e:
|
|
246
357
|
_ier = 0
|
|
247
358
|
_msg = str(e)
|
|
359
|
+
_pcov = np.array([[np.inf, 0], [0, np.inf]])
|
|
248
360
|
|
|
249
361
|
if _ier not in range(1, 5):
|
|
250
362
|
print(f"{_msg}")
|
|
251
363
|
_popt = [0, 0]
|
|
364
|
+
_pcov = np.array([[np.inf, 0], [0, np.inf]])
|
|
365
|
+
|
|
366
|
+
# Calculate confidence intervals from covariance matrix
|
|
367
|
+
n = len(x)
|
|
368
|
+
df = n - 2 # degrees of freedom for 2 parameters
|
|
369
|
+
|
|
370
|
+
# Standard errors from covariance matrix diagonal
|
|
371
|
+
param_errors = np.sqrt(np.diag(_pcov))
|
|
372
|
+
|
|
373
|
+
# Prepare parameters and standard errors for confidence interval computation
|
|
374
|
+
parameters = {
|
|
375
|
+
"d": _popt[0],
|
|
376
|
+
"alpha": _popt[1]
|
|
377
|
+
}
|
|
378
|
+
standard_errors = {
|
|
379
|
+
"d": param_errors[0],
|
|
380
|
+
"alpha": param_errors[1]
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
# Compute confidence intervals using the abstracted method
|
|
384
|
+
confidence_intervals = cls._compute_confidence_intervals(
|
|
385
|
+
parameters, standard_errors, df, confidence_level
|
|
386
|
+
)
|
|
252
387
|
|
|
253
388
|
model = {
|
|
254
389
|
"model": lambda x: _popt[0]*np.power(x, _popt[1]),
|
|
@@ -256,7 +391,9 @@ class PyEvoMotionBase():
|
|
|
256
391
|
"d": _popt[0],
|
|
257
392
|
"alpha": _popt[1]
|
|
258
393
|
},
|
|
394
|
+
"confidence_intervals": confidence_intervals,
|
|
259
395
|
"expression": "d*x^alpha",
|
|
396
|
+
"confidence_level": confidence_level,
|
|
260
397
|
"r2": r2_score(y, cls._power_law(x, *_popt), sample_weight=_weights)
|
|
261
398
|
}
|
|
262
399
|
|
|
@@ -317,16 +454,107 @@ class PyEvoMotionBase():
|
|
|
317
454
|
|
|
318
455
|
F = ((RSS1 - RSS2)/(p2 - p1))/(RSS2/(n - p2))
|
|
319
456
|
|
|
320
|
-
return F, 1 - snedecor_f.cdf(F, p2 - p1, n - p2)
|
|
457
|
+
return F, 1 - snedecor_f.cdf(F, p2 - p1, n - p2)
|
|
321
458
|
|
|
459
|
+
@classmethod
|
|
460
|
+
def AIC(
|
|
461
|
+
cls,
|
|
462
|
+
model1: dict[str,any],
|
|
463
|
+
model2: dict[str,any],
|
|
464
|
+
data: np.ndarray,
|
|
465
|
+
weights: np.ndarray | None = None
|
|
466
|
+
) -> tuple[float, float]:
|
|
467
|
+
"""
|
|
468
|
+
Perform an AIC test between two models.
|
|
469
|
+
|
|
470
|
+
Uses the small-sample corrected AIC with full constant terms:
|
|
471
|
+
AICc = n*ln(2*pi) + n*ln(RSS/n) + n + 2k + [2k(k+1)]/(n-k-1)
|
|
472
|
+
|
|
473
|
+
See https://en.wikipedia.org/wiki/Akaike_information_criterion for more details.
|
|
474
|
+
|
|
475
|
+
:param model1: The first model.
|
|
476
|
+
:type model1: dict[str, any]
|
|
477
|
+
:param model2: The second model.
|
|
478
|
+
:type model2: dict[str, any]
|
|
479
|
+
:param data: The data to test the models.
|
|
480
|
+
:type data: np.ndarray
|
|
481
|
+
:return: A tuple with the F-value and the p-value.
|
|
482
|
+
:rtype: ``tuple[float, float]``
|
|
483
|
+
"""
|
|
484
|
+
|
|
485
|
+
data = data.flatten()
|
|
486
|
+
|
|
487
|
+
if weights is not None:
|
|
488
|
+
_weights = cls._weighting_function(weights.flatten())
|
|
489
|
+
else:
|
|
490
|
+
_weights = np.ones(len(data))
|
|
491
|
+
|
|
492
|
+
k1 = len(model1["parameters"])
|
|
493
|
+
k2 = len(model2["parameters"])
|
|
494
|
+
n = len(data)
|
|
495
|
+
|
|
496
|
+
model1 = np.vectorize(model1["model"])
|
|
497
|
+
model2 = np.vectorize(model2["model"])
|
|
498
|
+
|
|
499
|
+
RS1 = (data - model1(range(n)))**2
|
|
500
|
+
RS2 = (data - model2(range(n)))**2
|
|
501
|
+
|
|
502
|
+
# Mask the infinite and nan values
|
|
503
|
+
mask = (
|
|
504
|
+
np.isinf(RS1)
|
|
505
|
+
| np.isinf(RS2)
|
|
506
|
+
| np.isnan(RS1)
|
|
507
|
+
| np.isnan(RS2)
|
|
508
|
+
)
|
|
509
|
+
|
|
510
|
+
# Sum the residuals without the infinite values
|
|
511
|
+
RSS1 = np.sum(_weights*RS1, where=~mask)
|
|
512
|
+
RSS2 = np.sum(_weights*RS2, where=~mask)
|
|
513
|
+
|
|
514
|
+
# Handle edge case where RSS is 0 (perfect fit) to avoid log(0)
|
|
515
|
+
if RSS1 == 0:
|
|
516
|
+
RSS1 = 1e-10 # Small positive value to avoid log(0)
|
|
517
|
+
if RSS2 == 0:
|
|
518
|
+
RSS2 = 1e-10 # Small positive value to avoid log(0)
|
|
519
|
+
|
|
520
|
+
const_term = n * (np.log(2*np.pi) + 1.0)
|
|
521
|
+
denom1 = n - k1 - 1
|
|
522
|
+
denom2 = n - k2 - 1
|
|
523
|
+
|
|
524
|
+
# If denom <= 0, AICc is undefined; treat as +inf (no support)
|
|
525
|
+
if denom1 <= 0:
|
|
526
|
+
AICc1 = np.inf
|
|
527
|
+
else:
|
|
528
|
+
AICc1 = const_term + n * np.log(RSS1 / n) + 2 * k1 + (2 * k1 * (k1 + 1)) / denom1
|
|
529
|
+
|
|
530
|
+
if denom2 <= 0:
|
|
531
|
+
AICc2 = np.inf
|
|
532
|
+
else:
|
|
533
|
+
AICc2 = const_term + n * np.log(RSS2 / n) + 2 * k2 + (2 * k2 * (k2 + 1)) / denom2
|
|
534
|
+
|
|
535
|
+
# ΔAIC: relative to best (lowest AIC)
|
|
536
|
+
min_aicc = min(AICc1, AICc2)
|
|
537
|
+
dAICc1 = AICc1 - min_aicc
|
|
538
|
+
dAICc2 = AICc2 - min_aicc
|
|
539
|
+
|
|
540
|
+
# Akaike weights
|
|
541
|
+
rel1 = np.exp(-0.5 * dAICc1) if np.isfinite(dAICc1) else 0
|
|
542
|
+
rel2 = np.exp(-0.5 * dAICc2) if np.isfinite(dAICc2) else 0
|
|
543
|
+
denom = rel1 + rel2 if (rel1 + rel2) > 0 else 1.0
|
|
544
|
+
w1 = rel1 / denom
|
|
545
|
+
w2 = rel2 / denom
|
|
546
|
+
|
|
547
|
+
return AICc1, AICc2, dAICc1, dAICc2, w1, w2
|
|
548
|
+
|
|
322
549
|
@classmethod
|
|
323
550
|
def adjust_model(cls,
|
|
324
551
|
x: pd.Series,
|
|
325
552
|
y: pd.Series,
|
|
326
553
|
name: str = None,
|
|
327
|
-
weights: pd.Series | None = None
|
|
554
|
+
weights: pd.Series | None = None,
|
|
555
|
+
confidence_level: float = 0.95
|
|
328
556
|
) -> dict[str, any]:
|
|
329
|
-
"""Adjust a model to the data.
|
|
557
|
+
"""Adjust a model to the data using AIC for model selection.
|
|
330
558
|
|
|
331
559
|
:param x: The features. It is a single pandas Series.
|
|
332
560
|
:type x: pd.Series
|
|
@@ -336,7 +564,18 @@ class PyEvoMotionBase():
|
|
|
336
564
|
:type name: str
|
|
337
565
|
:param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
|
|
338
566
|
:type weights: np.ndarray | None
|
|
339
|
-
:
|
|
567
|
+
:param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
|
|
568
|
+
:type confidence_level: float
|
|
569
|
+
:return: A dictionary containing:
|
|
570
|
+
|
|
571
|
+
* If name is provided: A dictionary with the name as key and the result dictionary as value
|
|
572
|
+
* If name is None: A dictionary containing:
|
|
573
|
+
|
|
574
|
+
* ``selected_model``: The selected model based on lowest AIC
|
|
575
|
+
* ``linear_model``: The linear regression model with AIC statistics
|
|
576
|
+
* ``power_law_model``: The power law model with AIC statistics
|
|
577
|
+
* ``model_selection``: Dictionary with AIC comparison results
|
|
578
|
+
|
|
340
579
|
:rtype: ``dict[str, any]``
|
|
341
580
|
:raises ValueError: If the dataset is empty or full of NaN values. This may occur if the grouped data contains only one entry per group, indicating that the variance cannot be computed.
|
|
342
581
|
"""
|
|
@@ -350,20 +589,57 @@ class PyEvoMotionBase():
|
|
|
350
589
|
f"Dataset length after filtering is: x: {x.size} elements; y: {y.size} elements. In particular:\n\nx: {x}\ny: {y}\n\nPerhaps NaN appeared for certain entries. Check if the grouped data contains only one entry per group, as this may cause NaN values when computing the variance. Also, consider widening the time window."
|
|
351
590
|
)
|
|
352
591
|
|
|
353
|
-
model1 = cls.linear_regression(x, y, weights=w, fit_intercept=False) # Not fitting the intercept because data is passed scaled to the minimum
|
|
354
|
-
model2 = cls.power_law_fit(x, y, weights=w)
|
|
592
|
+
model1 = cls.linear_regression(x, y, weights=w, fit_intercept=False, confidence_level=confidence_level) # Not fitting the intercept because data is passed scaled to the minimum
|
|
593
|
+
model2 = cls.power_law_fit(x, y, weights=w, confidence_level=confidence_level)
|
|
355
594
|
|
|
356
|
-
|
|
595
|
+
# Compute AIC statistics for both models
|
|
596
|
+
AIC1, AIC2, dAIC1, dAIC2, w1, w2 = cls.AIC(model1, model2, y, weights=w)
|
|
357
597
|
|
|
358
|
-
|
|
359
|
-
|
|
598
|
+
# Select model with lowest AIC (highest Akaike weight)
|
|
599
|
+
if AIC1 <= AIC2:
|
|
600
|
+
selected_model = model1
|
|
601
|
+
selected_model_name = "linear"
|
|
360
602
|
else:
|
|
361
|
-
|
|
603
|
+
selected_model = model2
|
|
604
|
+
selected_model_name = "power_law"
|
|
605
|
+
|
|
606
|
+
# Add AIC statistics to each model
|
|
607
|
+
model1_with_aic = model1.copy()
|
|
608
|
+
model1_with_aic.update({
|
|
609
|
+
"AIC": AIC1,
|
|
610
|
+
"delta_AIC": dAIC1,
|
|
611
|
+
"akaike_weight": w1,
|
|
612
|
+
"confidence_level": confidence_level
|
|
613
|
+
})
|
|
614
|
+
|
|
615
|
+
model2_with_aic = model2.copy()
|
|
616
|
+
model2_with_aic.update({
|
|
617
|
+
"AIC": AIC2,
|
|
618
|
+
"delta_AIC": dAIC2,
|
|
619
|
+
"akaike_weight": w2,
|
|
620
|
+
"confidence_level": confidence_level
|
|
621
|
+
})
|
|
622
|
+
|
|
623
|
+
# Create comprehensive result dictionary
|
|
624
|
+
result = {
|
|
625
|
+
"selected_model": selected_model,
|
|
626
|
+
"linear_model": model1_with_aic,
|
|
627
|
+
"power_law_model": model2_with_aic,
|
|
628
|
+
"model_selection": {
|
|
629
|
+
"selected": selected_model_name,
|
|
630
|
+
"linear_AIC": AIC1,
|
|
631
|
+
"power_law_AIC": AIC2,
|
|
632
|
+
"delta_AIC_linear": dAIC1,
|
|
633
|
+
"delta_AIC_power_law": dAIC2,
|
|
634
|
+
"akaike_weight_linear": w1,
|
|
635
|
+
"akaike_weight_power_law": w2
|
|
636
|
+
}
|
|
637
|
+
}
|
|
362
638
|
|
|
363
639
|
if name:
|
|
364
|
-
return {name:
|
|
640
|
+
return {name: result}
|
|
365
641
|
else:
|
|
366
|
-
return
|
|
642
|
+
return result
|
|
367
643
|
|
|
368
644
|
@staticmethod
|
|
369
645
|
def plot_single_data_and_model(
|