PyEvoMotion 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PKG-INFO +1 -1
  2. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/cli.py +87 -3
  3. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/core/base.py +296 -20
  4. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/core/core.py +73 -24
  5. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/pyproject.toml +1 -1
  6. pyevomotion-0.1.2/share/analyze_model_selection_accuracy.py +316 -0
  7. pyevomotion-0.1.2/share/analyze_test_runs.py +436 -0
  8. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/anomalous_diffusion.pdf +0 -0
  9. pyevomotion-0.1.2/share/confusion_matrix_heatmap.pdf +0 -0
  10. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figUK_plots.pdf +0 -0
  11. pyevomotion-0.1.2/share/figUK_regression_results.json +65 -0
  12. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figUK_run_args.json +1 -0
  13. pyevomotion-0.1.2/share/figUK_stats.tsv +41 -0
  14. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figUSA_plots.pdf +0 -0
  15. pyevomotion-0.1.2/share/figUSA_regression_results.json +65 -0
  16. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figUSA_run_args.json +1 -0
  17. pyevomotion-0.1.2/share/figUSA_stats.tsv +34 -0
  18. pyevomotion-0.1.2/share/generate_sequences_from_test5_data.py +107 -0
  19. pyevomotion-0.1.2/share/manuscript_figure.py +1131 -0
  20. pyevomotion-0.1.2/share/run_parallel_analysis.py +196 -0
  21. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/synth_figure.pdf +0 -0
  22. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/uk_time_windows.pdf +0 -0
  23. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/weekly_size.pdf +0 -0
  24. pyevomotion-0.1.2/tests/test_synthetic_datasets.py +79 -0
  25. pyevomotion-0.1.1/share/figUK_regression_results.json +0 -18
  26. pyevomotion-0.1.1/share/figUK_stats.tsv +0 -41
  27. pyevomotion-0.1.1/share/figUSA_regression_results.json +0 -18
  28. pyevomotion-0.1.1/share/figUSA_stats.tsv +0 -34
  29. pyevomotion-0.1.1/share/figure.pdf +0 -0
  30. pyevomotion-0.1.1/share/manuscript_figure.py +0 -761
  31. pyevomotion-0.1.1/tests/test_synthetic_datasets.py +0 -55
  32. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/__init__.py +0 -0
  33. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/core/__init__.py +0 -0
  34. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/core/parser.py +0 -0
  35. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/utils.py +0 -0
  36. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/README.md +0 -0
  37. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figUK.tsv +0 -0
  38. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figUSA.tsv +0 -0
  39. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figdataUK.tsv +0 -0
  40. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/figdataUSA.tsv +0 -0
  41. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/generate_sequences_from_synthdata.py +0 -0
  42. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/share/mafft_install.sh +0 -0
  43. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/__init__.py +0 -0
  44. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test1/out_run_args.json +0 -0
  45. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test1/output/test1.data.tsv +0 -0
  46. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test1/test1.metadata.tsv +0 -0
  47. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test1/test1.sequences.fasta +0 -0
  48. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test2/out_run_args.json +0 -0
  49. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test2/test2.metadata.parquet.gz +0 -0
  50. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/data/test3/ids_sampled_for_figure.json +0 -0
  51. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/helpers/test_UK_USA_dataset_helpers.py +0 -0
  52. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/helpers/test_parser_helpers.py +0 -0
  53. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/test_UK_USA_dataset.py +0 -0
  54. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/test_core.py +0 -0
  55. {pyevomotion-0.1.1 → pyevomotion-0.1.2}/tests/test_parser.py +0 -0
{pyevomotion-0.1.1 → pyevomotion-0.1.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: PyEvoMotion
-Version: 0.1.1
+Version: 0.1.2
 Summary: Evolutionary motion analysis tool
 Keywords: evolution,anomalous diffusion,bioinformatics
 Author: Lucas Goiriz
{pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/cli.py
@@ -248,6 +248,13 @@ def _parse_arguments() -> argparse.Namespace:
         action="store_true",
         help="Export the plots of the analysis."
     )
+    parser.add_argument(
+        "-cl",
+        "--confidence_level",
+        type=float,
+        default=0.95,
+        help="Confidence level for parameter confidence intervals (default 0.95 for 95%% CI). Must be between 0 and 1."
+    )
     parser.add_argument(
         "-l",
         "--length_filter",
@@ -357,6 +364,73 @@ def _simple_serializer(k: str, v: any) -> any:
         return "..".join(map(lambda x: x.strftime("%Y-%m-%d") if x else "", v))
     return v
 
+def _remove_model_functions(obj):
+    """Recursively remove 'model' keys containing lambda functions from nested dictionaries.
+
+    :param obj: Dictionary or other object to clean
+    :type obj: any
+    :return: Cleaned object with model functions removed
+    :rtype: any
+    """
+    if isinstance(obj, dict):
+        # Create a copy to avoid modifying during iteration
+        cleaned_obj = {}
+        for key, value in obj.items():
+            if key == "model":
+                # Skip lambda model functions - they can't be serialized to JSON
+                continue
+            elif isinstance(value, dict):
+                # Recursively clean nested dictionaries
+                cleaned_obj[key] = _remove_model_functions(value)
+            else:
+                # Keep all other values
+                cleaned_obj[key] = value
+        return cleaned_obj
+    else:
+        return obj
+
+def _restructure_regression_results(reg_results):
+    """Restructure regression results for cleaner JSON export format.
+
+    :param reg_results: Raw regression results from analysis
+    :type reg_results: dict
+    :return: Restructured results with cleaner format
+    :rtype: dict
+    """
+    restructured = {}
+
+    for key, value in reg_results.items():
+        if key.endswith("_full_results"):
+            # Extract the base name (remove _full_results suffix)
+            base_name = key.replace("_full_results", "")
+
+            # Create the new structure with only essential fields
+            restructured[base_name] = {
+                "linear_model": {
+                    "parameters": value["linear_model"]["parameters"],
+                    "confidence_intervals": value["linear_model"]["confidence_intervals"],
+                    "expression": value["linear_model"]["expression"],
+                    "r2": value["linear_model"]["r2"],
+                    "confidence_level": value["linear_model"]["confidence_level"]
+                },
+                "power_law_model": {
+                    "parameters": value["power_law_model"]["parameters"],
+                    "confidence_intervals": value["power_law_model"]["confidence_intervals"],
+                    "expression": value["power_law_model"]["expression"],
+                    "r2": value["power_law_model"]["r2"],
+                    "confidence_level": value["power_law_model"]["confidence_level"]
+                },
+                "model_selection": value["model_selection"]
+            }
+        else:
+            # Keep non-full-results entries as-is (backward compatibility models)
+            # But skip them if there's a corresponding _full_results entry
+            full_results_key = f"{key}_full_results"
+            if full_results_key not in reg_results:
+                restructured[key] = value
+
+    return restructured
+
 def _main():
     check_and_install_mafft()
     """
@@ -367,6 +441,11 @@ def _main():
     print(BANNER)
     args = _parse_arguments()
 
+    # Validate confidence level
+    if not (0 < args.confidence_level < 1):
+        parser = _ArgumentParserWithHelpOnError(description=PACKAGE_DESCRIPTION)
+        parser.error("Confidence level must be between 0 and 1 (exclusive)")
+
     # If the -xj argument is passed, the arguments are exported to a JSON file before running the analysis altogether
     if args.export_json:
         with open(f"{args.out}_run_args.json", "w") as file:
@@ -406,13 +485,18 @@ def _main():
             f"{args.out}_plots"
             if args.export_plots
             else None
-        )
+        ),
+        confidence_level=args.confidence_level
     )
 
     _reg = reg.copy()
 
-    for k in _reg.keys():
-        del _reg[k]["model"]
+    # First restructure the results to the desired export format
+    _reg = _restructure_regression_results(_reg)
+
+    # Then apply the cleaning function to remove lambda functions
+    for k in list(_reg.keys()):
+        _reg[k] = _remove_model_functions(_reg[k])
 
     # Exports the statistic results to TSV file
     stats.to_csv(
{pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/core/base.py
@@ -2,7 +2,7 @@ import numpy as np
 import pandas as pd
 from sklearn.metrics import r2_score
 from scipy.optimize import curve_fit
-from scipy.stats import f as snedecor_f
+from scipy.stats import f as snedecor_f, t as t_dist
 from sklearn.linear_model import LinearRegression
 
 
@@ -138,12 +138,49 @@ class PyEvoMotionBase():
 
         return np.tanh(2*n/n_0)
 
+    @staticmethod
+    def _compute_confidence_intervals(
+        parameters: dict[str, float],
+        standard_errors: dict[str, float],
+        degrees_of_freedom: int,
+        confidence_level: float = 0.95
+    ) -> dict[str, tuple[float, float]]:
+        """
+        Compute confidence intervals for parameters using t-distribution.
+
+        :param parameters: Dictionary of parameter names and their estimated values.
+        :type parameters: dict[str, float]
+        :param standard_errors: Dictionary of parameter names and their standard errors.
+        :type standard_errors: dict[str, float]
+        :param degrees_of_freedom: Degrees of freedom for the t-distribution.
+        :type degrees_of_freedom: int
+        :param confidence_level: Confidence level for the intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
+        :return: Dictionary with parameter names as keys and (lower_bound, upper_bound) tuples as values.
+        :rtype: dict[str, tuple[float, float]]
+        """
+        alpha = 1 - confidence_level
+        t_val = t_dist.ppf(1 - alpha/2, degrees_of_freedom)
+
+        confidence_intervals = {}
+        for param_name in parameters.keys():
+            param_value = parameters[param_name]
+            param_se = standard_errors[param_name]
+            margin_of_error = t_val * param_se
+            confidence_intervals[param_name] = (
+                param_value - margin_of_error,
+                param_value + margin_of_error
+            )
+
+        return confidence_intervals
+
     @classmethod
     def linear_regression(cls,
         x: np.ndarray,
         y: np.ndarray,
         weights: np.ndarray | None = None,
-        fit_intercept: bool = True
+        fit_intercept: bool = True,
+        confidence_level: float = 0.95
     ) -> dict[str, any]:
         """
         Perform a linear regression on a set of data.
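
Note: `_compute_confidence_intervals` builds standard Wald-type intervals, estimate ± t(1 - α/2, df) · SE. A quick standalone check of the arithmetic (numbers illustrative):

    from scipy.stats import t as t_dist

    # 95% CI for a slope m = 1.50 with standard error 0.10 on 18 degrees of freedom
    t_val = t_dist.ppf(1 - 0.05 / 2, 18)  # ≈ 2.101
    lo, hi = 1.50 - t_val * 0.10, 1.50 + t_val * 0.10
    print(round(lo, 2), round(hi, 2))  # 1.29 1.71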
@@ -156,10 +193,13 @@ class PyEvoMotionBase():
         :type fit_intercept: bool
         :param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
         :type weights: np.ndarray | None
+        :param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
         :return: A dictionary containing:
 
            * ``model``: A ``lambda`` function that computes predictions based on the fitted model.
            * ``parameters``: A dictionary with the slope of the regression line.
+           * ``confidence_intervals``: A dictionary with confidence intervals for each parameter.
            * ``expression``: A string representation of the regression equation.
            * ``r2``: The :math:`R^2` score of the regression.
         :rtype: ``dict[str, any]``
@@ -169,6 +209,41 @@ class PyEvoMotionBase():
 
         reg = LinearRegression(fit_intercept=fit_intercept).fit(x, y, sample_weight=_weights)
 
+        # Calculate confidence intervals
+        n = len(x)
+        _df = n - (2 if fit_intercept else 1)  # degrees of freedom
+
+        # Calculate residuals and MSE
+        y_pred = reg.predict(x)
+        residuals = y.flatten() - y_pred.flatten()
+
+        if _weights is not None:
+            # Weighted MSE
+            mse = np.sum(_weights * residuals**2) / (np.sum(_weights) - (2 if fit_intercept else 1))
+        else:
+            mse = np.sum(residuals**2) / _df
+
+        # Calculate standard errors
+        x_flat = x.flatten()
+        x_mean = np.mean(x_flat)
+        sxx = np.sum((x_flat - x_mean)**2)
+
+        # Standard error for slope
+        se_slope = np.sqrt(mse / sxx)
+
+        parameters = {"m": reg.coef_[0][0]}
+        standard_errors = {"m": se_slope}
+
+        if fit_intercept:
+            se_intercept = np.sqrt(mse * (1/n + x_mean**2/sxx))
+            parameters["b"] = reg.intercept_[0]
+            standard_errors["b"] = se_intercept
+
+        # Compute confidence intervals using the abstracted method
+        confidence_intervals = cls._compute_confidence_intervals(
+            parameters, standard_errors, _df, confidence_level
+        )
+
         if fit_intercept:
             model = {
                 "model": lambda x: reg.coef_[0][0]*x + reg.intercept_[0],
@@ -176,7 +251,9 @@ class PyEvoMotionBase():
                     "m": reg.coef_[0][0],
                     "b": reg.intercept_[0]
                 },
-                "expression": "mx + b"
+                "confidence_intervals": confidence_intervals,
+                "expression": "mx + b",
+                "confidence_level": confidence_level
             }
 
         else:
@@ -185,7 +262,9 @@ class PyEvoMotionBase():
                 "parameters": {
                     "m": reg.coef_[0][0],
                 },
-                "expression": "mx"
+                "confidence_intervals": confidence_intervals,
+                "expression": "mx",
+                "confidence_level": confidence_level
             }
 
         model["r2"] = r2_score(y, reg.predict(x), sample_weight=_weights)
@@ -214,9 +293,13 @@ class PyEvoMotionBase():
         return a*np.power(x, b)
 
     @classmethod
-    def power_law_fit(cls, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None) -> dict[str, any]:
+    def power_law_fit(cls, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, confidence_level: float = 0.95) -> dict[str, any]:
         """
         Perform a power law fit on a set of data.
+
+        This method fits a power law model of the form :math:`y = d \\cdot x^{\\alpha}` to the data.
+        Initial parameter estimates are obtained via linear regression on log-transformed data,
+        which provides better convergence than default initialization.
 
         :param x: A numpy array of the features.
         :type x: np.ndarray
@@ -224,31 +307,83 @@ class PyEvoMotionBase():
         :type y: np.ndarray
         :param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
         :type weights: np.ndarray | None
+        :param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
         :return: A dictionary containing:
 
            * ``model``: A ``lambda`` function that computes predictions based on the fitted model.
-           * ``parameters``: A dictionary with the parameters of the fitted power law.
+           * ``parameters``: A dictionary with the parameters of the fitted power law (``d`` and ``alpha``).
+           * ``confidence_intervals``: A dictionary with confidence intervals for each parameter.
            * ``expression``: A string representation of the regression equation.
            * ``r2``: The :math:`R^2` score of the regression.
+           * ``confidence_level``: The confidence level used for the confidence intervals.
         :rtype: ``dict[str, any]``
         """
 
         _weights = cls._weighting_function(weights).flatten() if weights is not None else None
 
+        # Provide good initial parameter guesses for power law
+        # Use linear regression on log-transformed data to get initial estimates
+        x_flat = x.T.tolist()[0]
+        y_flat = y.T.tolist()[0]
+        mask = (np.array(x_flat) > 0) & (np.array(y_flat) > 0)
+        x_log = np.log(np.array(x_flat)[mask])
+        y_log = np.log(np.array(y_flat)[mask])
+
+        # Linear regression on log-transformed data: log(y) = log(d) + alpha*log(x)
+        # This gives us initial estimates for d and alpha
+        if len(x_log) > 1:
+            reg = LinearRegression(fit_intercept=True).fit(x_log.reshape(-1, 1), y_log.reshape(-1, 1))
+
+            p0 = [np.exp(reg.intercept_[0]), reg.coef_[0][0]]  # [d, alpha]
+        else:
+            p0 = [1.0, 1.0]  # Default fallback
+
+        # Set reasonable bounds for power law parameters
+        # d > 0 (coefficient must be positive)
+        # alpha can be any real number, but constrain to reasonable range
+        bounds = ([1e-10, -10], [np.inf, 10])  # [d_min, alpha_min], [d_max, alpha_max]
+
         try:
-            _popt, _, _, _msg, _ier = curve_fit(
+            _popt, _pcov, _, _msg, _ier = curve_fit(
                 cls._power_law,
-                x.T.tolist()[0], y.T.tolist()[0],
+                x_flat, y_flat,
+                p0=p0,
+                bounds=bounds,
                 sigma=1/np.sqrt(_weights) if _weights is not None else None,
                 full_output=True
             )
         except RuntimeError as e:
             _ier = 0
             _msg = str(e)
+            _pcov = np.array([[np.inf, 0], [0, np.inf]])
 
         if _ier not in range(1, 5):
             print(f"{_msg}")
             _popt = [0, 0]
+            _pcov = np.array([[np.inf, 0], [0, np.inf]])
+
+        # Calculate confidence intervals from covariance matrix
+        n = len(x)
+        df = n - 2  # degrees of freedom for 2 parameters
+
+        # Standard errors from covariance matrix diagonal
+        param_errors = np.sqrt(np.diag(_pcov))
+
+        # Prepare parameters and standard errors for confidence interval computation
+        parameters = {
+            "d": _popt[0],
+            "alpha": _popt[1]
+        }
+        standard_errors = {
+            "d": param_errors[0],
+            "alpha": param_errors[1]
+        }
+
+        # Compute confidence intervals using the abstracted method
+        confidence_intervals = cls._compute_confidence_intervals(
+            parameters, standard_errors, df, confidence_level
+        )
 
         model = {
             "model": lambda x: _popt[0]*np.power(x, _popt[1]),
@@ -256,7 +391,9 @@ class PyEvoMotionBase():
                 "d": _popt[0],
                 "alpha": _popt[1]
             },
+            "confidence_intervals": confidence_intervals,
             "expression": "d*x^alpha",
+            "confidence_level": confidence_level,
             "r2": r2_score(y, cls._power_law(x, *_popt), sample_weight=_weights)
         }
 
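
Note: a self-contained illustration of the log-log seeding strategy on toy data (`np.polyfit` stands in for the `LinearRegression` call above; same `p0` and bounds idea):

    import numpy as np
    from scipy.optimize import curve_fit

    x = np.linspace(1.0, 50.0, 30)
    y = 2.0 * np.power(x, 0.5)

    # log y = log d + alpha * log x, so a straight-line fit in log space seeds [d, alpha]
    alpha0, log_d0 = np.polyfit(np.log(x), np.log(y), 1)
    p0 = [np.exp(log_d0), alpha0]

    popt, pcov = curve_fit(
        lambda x, d, a: d * np.power(x, a),
        x, y,
        p0=p0,
        bounds=([1e-10, -10], [np.inf, 10]),
    )
    print(popt)  # ≈ [2.0, 0.5]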
@@ -317,16 +454,107 @@ class PyEvoMotionBase():
 
         F = ((RSS1 - RSS2)/(p2 - p1))/(RSS2/(n - p2))
 
-        return F, 1 - snedecor_f.cdf(F, p2 - p1, n - p2)
+        return F, 1 - snedecor_f.cdf(F, p2 - p1, n - p2)
 
+    @classmethod
+    def AIC(
+        cls,
+        model1: dict[str,any],
+        model2: dict[str,any],
+        data: np.ndarray,
+        weights: np.ndarray | None = None
+    ) -> tuple[float, float, float, float, float, float]:
+        """
+        Perform an AIC test between two models.
+
+        Uses the small-sample corrected AIC with full constant terms:
+        AICc = n*ln(2*pi) + n*ln(RSS/n) + n + 2k + [2k(k+1)]/(n-k-1)
+
+        See https://en.wikipedia.org/wiki/Akaike_information_criterion for more details.
+
+        :param model1: The first model.
+        :type model1: dict[str, any]
+        :param model2: The second model.
+        :type model2: dict[str, any]
+        :param data: The data to test the models.
+        :type data: np.ndarray
+        :return: A tuple ``(AICc1, AICc2, delta_AICc1, delta_AICc2, akaike_weight1, akaike_weight2)``.
+        :rtype: ``tuple[float, float, float, float, float, float]``
+        """
+
+        data = data.flatten()
+
+        if weights is not None:
+            _weights = cls._weighting_function(weights.flatten())
+        else:
+            _weights = np.ones(len(data))
+
+        k1 = len(model1["parameters"])
+        k2 = len(model2["parameters"])
+        n = len(data)
+
+        model1 = np.vectorize(model1["model"])
+        model2 = np.vectorize(model2["model"])
+
+        RS1 = (data - model1(range(n)))**2
+        RS2 = (data - model2(range(n)))**2
+
+        # Mask the infinite and nan values
+        mask = (
+            np.isinf(RS1)
+            | np.isinf(RS2)
+            | np.isnan(RS1)
+            | np.isnan(RS2)
+        )
+
+        # Sum the residuals without the infinite values
+        RSS1 = np.sum(_weights*RS1, where=~mask)
+        RSS2 = np.sum(_weights*RS2, where=~mask)
+
+        # Handle edge case where RSS is 0 (perfect fit) to avoid log(0)
+        if RSS1 == 0:
+            RSS1 = 1e-10  # Small positive value to avoid log(0)
+        if RSS2 == 0:
+            RSS2 = 1e-10  # Small positive value to avoid log(0)
+
+        const_term = n * (np.log(2*np.pi) + 1.0)
+        denom1 = n - k1 - 1
+        denom2 = n - k2 - 1
+
+        # If denom <= 0, AICc is undefined; treat as +inf (no support)
+        if denom1 <= 0:
+            AICc1 = np.inf
+        else:
+            AICc1 = const_term + n * np.log(RSS1 / n) + 2 * k1 + (2 * k1 * (k1 + 1)) / denom1
+
+        if denom2 <= 0:
+            AICc2 = np.inf
+        else:
+            AICc2 = const_term + n * np.log(RSS2 / n) + 2 * k2 + (2 * k2 * (k2 + 1)) / denom2
+
+        # ΔAIC: relative to best (lowest AIC)
+        min_aicc = min(AICc1, AICc2)
+        dAICc1 = AICc1 - min_aicc
+        dAICc2 = AICc2 - min_aicc
+
+        # Akaike weights
+        rel1 = np.exp(-0.5 * dAICc1) if np.isfinite(dAICc1) else 0
+        rel2 = np.exp(-0.5 * dAICc2) if np.isfinite(dAICc2) else 0
+        denom = rel1 + rel2 if (rel1 + rel2) > 0 else 1.0
+        w1 = rel1 / denom
+        w2 = rel2 / denom
+
+        return AICc1, AICc2, dAICc1, dAICc2, w1, w2
+
     @classmethod
     def adjust_model(cls,
         x: pd.Series,
         y: pd.Series,
         name: str = None,
-        weights: pd.Series | None = None
+        weights: pd.Series | None = None,
+        confidence_level: float = 0.95
     ) -> dict[str, any]:
-        """Adjust a model to the data.
+        """Adjust a model to the data using AIC for model selection.
 
         :param x: The features. It is a single pandas Series.
         :type x: pd.Series
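
Note: a worked AICc comparison on toy numbers, following the docstring's formula (n and RSS values illustrative; the linear model here has k = 1 parameter, the power law k = 2):

    import numpy as np

    n = 20
    rss = {"linear": 4.2, "power_law": 3.1}  # toy residual sums of squares
    k = {"linear": 1, "power_law": 2}

    aicc = {
        m: n * (np.log(2 * np.pi) + 1.0) + n * np.log(rss[m] / n)
           + 2 * k[m] + (2 * k[m] * (k[m] + 1)) / (n - k[m] - 1)
        for m in rss
    }
    best = min(aicc.values())
    rel = {m: np.exp(-0.5 * (a - best)) for m, a in aicc.items()}
    weights = {m: r / sum(rel.values()) for m, r in rel.items()}
    print(min(aicc, key=aicc.get), weights)  # lowest AICc wins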
@@ -336,7 +564,18 @@ class PyEvoMotionBase():
         :type name: str
         :param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
         :type weights: np.ndarray | None
-        :return: A dictionary with the model.
+        :param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
+        :return: A dictionary containing:
+
+            * If name is provided: A dictionary with the name as key and the result dictionary as value
+            * If name is None: A dictionary containing:
+
+                * ``selected_model``: The selected model based on lowest AIC
+                * ``linear_model``: The linear regression model with AIC statistics
+                * ``power_law_model``: The power law model with AIC statistics
+                * ``model_selection``: Dictionary with AIC comparison results
+
         :rtype: ``dict[str, any]``
         :raises ValueError: If the dataset is empty or full of NaN values. This may occur if the grouped data contains only one entry per group, indicating that the variance cannot be computed.
         """
@@ -350,20 +589,57 @@ class PyEvoMotionBase():
                 f"Dataset length after filtering is: x: {x.size} elements; y: {y.size} elements. In particular:\n\nx: {x}\ny: {y}\n\nPerhaps NaN appeared for certain entries. Check if the grouped data contains only one entry per group, as this may cause NaN values when computing the variance. Also, consider widening the time window."
             )
 
-        model1 = cls.linear_regression(x, y, weights=w, fit_intercept=False)  # Not fitting the intercept because data is passed scaled to the minimum
-        model2 = cls.power_law_fit(x, y, weights=w)
+        model1 = cls.linear_regression(x, y, weights=w, fit_intercept=False, confidence_level=confidence_level)  # Not fitting the intercept because data is passed scaled to the minimum
+        model2 = cls.power_law_fit(x, y, weights=w, confidence_level=confidence_level)
 
-        _, p = cls.F_test(model1, model2, y, weights=w)
+        # Compute AIC statistics for both models
+        AIC1, AIC2, dAIC1, dAIC2, w1, w2 = cls.AIC(model1, model2, y, weights=w)
 
-        if p < 0.05:
-            model = model2
+        # Select model with lowest AIC (highest Akaike weight)
+        if AIC1 <= AIC2:
+            selected_model = model1
+            selected_model_name = "linear"
         else:
-            model = model1
+            selected_model = model2
+            selected_model_name = "power_law"
+
+        # Add AIC statistics to each model
+        model1_with_aic = model1.copy()
+        model1_with_aic.update({
+            "AIC": AIC1,
+            "delta_AIC": dAIC1,
+            "akaike_weight": w1,
+            "confidence_level": confidence_level
+        })
+
+        model2_with_aic = model2.copy()
+        model2_with_aic.update({
+            "AIC": AIC2,
+            "delta_AIC": dAIC2,
+            "akaike_weight": w2,
+            "confidence_level": confidence_level
+        })
+
+        # Create comprehensive result dictionary
+        result = {
+            "selected_model": selected_model,
+            "linear_model": model1_with_aic,
+            "power_law_model": model2_with_aic,
+            "model_selection": {
+                "selected": selected_model_name,
+                "linear_AIC": AIC1,
+                "power_law_AIC": AIC2,
+                "delta_AIC_linear": dAIC1,
+                "delta_AIC_power_law": dAIC2,
+                "akaike_weight_linear": w1,
+                "akaike_weight_power_law": w2
+            }
+        }
 
         if name:
-            return {name: model}
+            return {name: result}
         else:
-            return model
+            return result
 
     @staticmethod
     def plot_single_data_and_model(
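
Note: a sketch of consuming the richer `adjust_model` return value on synthetic data (the import path follows the file list above; the exact call pattern is an assumption, not taken from the package docs):

    import numpy as np
    import pandas as pd
    from PyEvoMotion.core.base import PyEvoMotionBase  # path per the file list above

    x = pd.Series(np.arange(1.0, 21.0))
    y = pd.Series(0.8 * np.power(x, 1.4))  # synthetic power-law data

    result = PyEvoMotionBase.adjust_model(x, y, confidence_level=0.95)
    print(result["model_selection"]["selected"])              # e.g. "power_law"
    print(result["power_law_model"]["parameters"])            # {"d": ..., "alpha": ...}
    print(result["power_law_model"]["confidence_intervals"])  # {"d": (lo, hi), "alpha": (lo, hi)}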