PyPI - PyEvoMotion - Versions diffs - 0.1.1__tar.gz → 0.1.2__tar.gz - Mend

PyEvoMotion 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

{pyevomotion-0.1.1 → pyevomotion-0.1.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: PyEvoMotion
-Version: 0.1.1
+Version: 0.1.2
 Summary: Evolutionary motion analysis tool
 Keywords: evolution,anomalous diffusion,bioinformatics
 Author: Lucas Goiriz

{pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/cli.py RENAMED Viewed

@@ -248,6 +248,13 @@ def _parse_arguments() -> argparse.Namespace:
         action="store_true",
         help="Export the plots of the analysis."
     )
+    parser.add_argument(
+        "-cl",
+        "--confidence_level",
+        type=float,
+        default=0.95,
+        help="Confidence level for parameter confidence intervals (default 0.95 for 95%% CI). Must be between 0 and 1."
+    )
     parser.add_argument(
         "-l",
         "--length_filter",
@@ -357,6 +364,73 @@ def _simple_serializer(k: str, v: any) -> any:
         return "..".join(map(lambda x: x.strftime("%Y-%m-%d") if x else "", v))
     return v
+def _remove_model_functions(obj):
+    """Return a copy of ``obj`` with every ``"model"`` key removed, at any dictionary depth.
+    The ``"model"`` entries of the regression results hold ``lambda`` functions, which cannot be serialized to JSON. The key is dropped by name alone: its value is never inspected, so a ``"model"`` entry holding something else is removed as well.
+    Only dictionaries are traversed. Dictionaries nested inside lists, tuples or other containers are left untouched, and non-dictionary values are carried over by reference rather than copied.
+    :param obj: Dictionary or other object to clean. The input itself is not modified.
+    :type obj: any
+    :return: A new dictionary without ``"model"`` keys if ``obj`` is a dictionary; otherwise ``obj`` itself, unchanged.
+    :rtype: any
+    """
+    if isinstance(obj, dict):
+        # Build a fresh dictionary instead of deleting in place, so the caller's data stays intact
+        cleaned_obj = {}
+        for key, value in obj.items():
+            if key == "model":
+                # Drop the entry by key name only - model lambdas can't be serialized to JSON
+                continue
+            elif isinstance(value, dict):
+                # Nested dictionaries may carry their own "model" key: clean them recursively
+                cleaned_obj[key] = _remove_model_functions(value)
+            else:
+                # Any other value (including lists and tuples) is kept as-is, without descending into it
+                cleaned_obj[key] = value
+        return cleaned_obj
+    else:
+        # Non-dictionary input: nothing to clean, hand back the same object
+        return obj
+def _restructure_regression_results(reg_results):
+    """Restructure regression results for cleaner JSON export format.
+    Entries whose key ends in ``_full_results`` are re-keyed under their base name and reduced to three sections: ``linear_model`` and ``power_law_model`` (each trimmed to ``parameters``, ``confidence_intervals``, ``expression``, ``r2`` and ``confidence_level``) plus ``model_selection``, which is copied by reference. Any other field of the two model dictionaries is dropped.
+    Entries without that suffix are copied unchanged, unless a sibling ``<key>_full_results`` entry exists, in which case the full-results version replaces them.
+    :param reg_results: Raw regression results from analysis
+    :type reg_results: dict
+    :return: A new dictionary with the restructured results; ``reg_results`` itself is not modified.
+    :rtype: dict
+    :raises KeyError: If a ``*_full_results`` entry lacks ``linear_model``, ``power_law_model``, ``model_selection`` or any of the five per-model fields listed above.
+    """
+    restructured = {}
+    for key, value in reg_results.items():
+        if key.endswith("_full_results"):
+            # Extract the base name (remove _full_results suffix)
+            # NOTE(review): str.replace strips every occurrence of "_full_results" in the key, not only the trailing one
+            base_name = key.replace("_full_results", "")
+            # Create the new structure with only essential fields
+            restructured[base_name] = {
+                "linear_model": {
+                    "parameters": value["linear_model"]["parameters"],
+                    "confidence_intervals": value["linear_model"]["confidence_intervals"],
+                    "expression": value["linear_model"]["expression"],
+                    "r2": value["linear_model"]["r2"],
+                    "confidence_level": value["linear_model"]["confidence_level"]
+                },
+                "power_law_model": {
+                    "parameters": value["power_law_model"]["parameters"],
+                    "confidence_intervals": value["power_law_model"]["confidence_intervals"],
+                    "expression": value["power_law_model"]["expression"],
+                    "r2": value["power_law_model"]["r2"],
+                    "confidence_level": value["power_law_model"]["confidence_level"]
+                },
+                "model_selection": value["model_selection"]
+            }
+        else:
+            # Keep non-full-results entries as-is (backward compatibility models)
+            # But skip them if there's a corresponding _full_results entry
+            full_results_key = f"{key}_full_results"
+            if full_results_key not in reg_results:
+                restructured[key] = value
+    return restructured
 def _main():
     check_and_install_mafft()
     """
@@ -367,6 +441,11 @@ def _main():
     print(BANNER)
     args = _parse_arguments()
+    # Validate confidence level
+    if not (0 < args.confidence_level < 1):
+        parser = _ArgumentParserWithHelpOnError(description=PACKAGE_DESCRIPTION)
+        parser.error("Confidence level must be between 0 and 1 (exclusive)")
     # If the -xj argument is passed, the arguments are exported to a JSON file before running the analysis altogether
     if args.export_json:
         with open(f"{args.out}_run_args.json", "w") as file:
@@ -406,13 +485,18 @@ def _main():
             f"{args.out}_plots"
             if args.export_plots
             else None
-        )
+        ),
+        confidence_level=args.confidence_level
     )
     _reg = reg.copy()
-    for k in _reg.keys():
-        del _reg[k]["model"]
+    # First restructure the results to the desired export format
+    _reg = _restructure_regression_results(_reg)
+    # Then apply the cleaning function to remove lambda functions
+    for k in list(_reg.keys()):
+        _reg[k] = _remove_model_functions(_reg[k])
     # Exports the statistic results to TSV file
     stats.to_csv(

{pyevomotion-0.1.1 → pyevomotion-0.1.2}/PyEvoMotion/core/base.py RENAMED Viewed

@@ -2,7 +2,7 @@ import numpy as np
 import pandas as pd
 from sklearn.metrics import r2_score
 from scipy.optimize import curve_fit
-from scipy.stats import f as snedecor_f
+from scipy.stats import f as snedecor_f, t as t_dist
 from sklearn.linear_model import LinearRegression
@@ -138,12 +138,49 @@ class PyEvoMotionBase():
         return np.tanh(2*n/n_0)
+    @staticmethod
+    def _compute_confidence_intervals(
+        parameters: dict[str, float],
+        standard_errors: dict[str, float],
+        degrees_of_freedom: int,
+        confidence_level: float = 0.95
+    ) -> dict[str, tuple[float, float]]:
+        """
+        Compute two-sided confidence intervals for parameters using the t-distribution.
+        Each interval is symmetric around the estimate: ``value ± t * standard_error``, where ``t`` is the ``1 - alpha/2`` quantile of the t-distribution with ``degrees_of_freedom`` degrees of freedom and ``alpha = 1 - confidence_level``.
+        No range check is performed on ``confidence_level`` or ``degrees_of_freedom`` in this method.
+        :param parameters: Dictionary of parameter names and their estimated values.
+        :type parameters: dict[str, float]
+        :param standard_errors: Dictionary of parameter names and their standard errors. Must contain an entry for every key of ``parameters``.
+        :type standard_errors: dict[str, float]
+        :param degrees_of_freedom: Degrees of freedom for the t-distribution.
+        :type degrees_of_freedom: int
+        :param confidence_level: Confidence level for the intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
+        :return: Dictionary with parameter names as keys and (lower_bound, upper_bound) tuples as values.
+        :rtype: dict[str, tuple[float, float]]
+        :raises KeyError: If a parameter name has no entry in ``standard_errors``.
+        """
+        # alpha is the total tail probability; it is split evenly between both tails below
+        alpha = 1 - confidence_level
+        # Critical t value, computed once because it is the same for every parameter
+        t_val = t_dist.ppf(1 - alpha/2, degrees_of_freedom)
+        confidence_intervals = {}
+        for param_name in parameters.keys():
+            param_value = parameters[param_name]
+            param_se = standard_errors[param_name]
+            # Half-width of the interval
+            margin_of_error = t_val * param_se
+            confidence_intervals[param_name] = (
+                param_value - margin_of_error,
+                param_value + margin_of_error
+            )
+        return confidence_intervals
     @classmethod
     def linear_regression(cls,
         x: np.ndarray,
         y: np.ndarray,
         weights: np.ndarray | None = None,
-        fit_intercept: bool = True
+        fit_intercept: bool = True,
+        confidence_level: float = 0.95
     ) -> dict[str, any]:
         """
         Perform a linear regression on a set of data.
@@ -156,10 +193,13 @@ class PyEvoMotionBase():
         :type fit_intercept: bool
         :param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
         :type weights: np.ndarray | None
+        :param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
         :return: A dictionary containing:
             * ``model``: A ``lambda`` function that computes predictions based on the fitted model.
             * ``parameters``: A dictionary with the slope of the regression line.
+            * ``confidence_intervals``: A dictionary with confidence intervals for each parameter.
             * ``expression``: A string representation of the regression equation.
             * ``r2``: The :math:`R^2` score of the regression.
         :rtype: ``dict[str, any]``
@@ -169,6 +209,41 @@ class PyEvoMotionBase():
         reg = LinearRegression(fit_intercept=fit_intercept).fit(x, y, sample_weight=_weights)
+        # Calculate confidence intervals
+        n = len(x)
+        _df = n - (2 if fit_intercept else 1)  # degrees of freedom
+        # Calculate residuals and MSE
+        y_pred = reg.predict(x)
+        residuals = y.flatten() - y_pred.flatten()
+        if _weights is not None:
+            # Weighted MSE
+            mse = np.sum(_weights * residuals**2) / (np.sum(_weights) - (2 if fit_intercept else 1))
+        else:
+            mse = np.sum(residuals**2) / _df
+        # Calculate standard errors
+        x_flat = x.flatten()
+        x_mean = np.mean(x_flat)
+        sxx = np.sum((x_flat - x_mean)**2)
+        # Standard error for slope
+        se_slope = np.sqrt(mse / sxx)
+        parameters = {"m": reg.coef_[0][0]}
+        standard_errors = {"m": se_slope}
+        if fit_intercept:
+            se_intercept = np.sqrt(mse * (1/n + x_mean**2/sxx))
+            parameters["b"] = reg.intercept_[0]
+            standard_errors["b"] = se_intercept
+        # Compute confidence intervals using the abstracted method
+        confidence_intervals = cls._compute_confidence_intervals(
+            parameters, standard_errors, _df, confidence_level
+        )
         if fit_intercept:
             model = {
                 "model": lambda x: reg.coef_[0][0]*x + reg.intercept_[0],
@@ -176,7 +251,9 @@ class PyEvoMotionBase():
                     "m": reg.coef_[0][0],
                     "b": reg.intercept_[0]
                 },
-                "expression": "mx + b"
+                "confidence_intervals": confidence_intervals,
+                "expression": "mx + b",
+                "confidence_level": confidence_level
             }
         else:
@@ -185,7 +262,9 @@ class PyEvoMotionBase():
                 "parameters": {
                     "m": reg.coef_[0][0],
                 },
-                "expression": "mx"
+                "confidence_intervals": confidence_intervals,
+                "expression": "mx",
+                "confidence_level": confidence_level
             }
         model["r2"] = r2_score(y, reg.predict(x), sample_weight=_weights)
@@ -214,9 +293,13 @@ class PyEvoMotionBase():
         return a*np.power(x, b)
     @classmethod
-    def power_law_fit(cls, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None) -> dict[str, any]:
+    def power_law_fit(cls, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, confidence_level: float = 0.95) -> dict[str, any]:
         """
         Perform a power law fit on a set of data.
+        This method fits a power law model of the form :math:`y = d \\cdot x^{\\alpha}` to the data.
+        Initial parameter estimates are obtained via linear regression on log-transformed data,
+        which provides better convergence than default initialization.
         :param x: A numpy array of the features.
         :type x: np.ndarray
@@ -224,31 +307,83 @@ class PyEvoMotionBase():
         :type y: np.ndarray
         :param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
         :type weights: np.ndarray | None
+        :param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
         :return: A dictionary containing:
             * ``model``: A ``lambda`` function that computes predictions based on the fitted model.
-            * ``parameters``: A dictionary with the parameters of the fitted power law.
+            * ``parameters``: A dictionary with the parameters of the fitted power law (``d`` and ``alpha``).
+            * ``confidence_intervals``: A dictionary with confidence intervals for each parameter.
             * ``expression``: A string representation of the regression equation.
             * ``r2``: The :math:`R^2` score of the regression.
+            * ``confidence_level``: The confidence level used for the confidence intervals.
         :rtype: ``dict[str, any]``
         """
         _weights = cls._weighting_function(weights).flatten() if weights is not None else None
+        # Provide good initial parameter guesses for power law
+        # Use linear regression on log-transformed data to get initial estimates
+        x_flat = x.T.tolist()[0]
+        y_flat = y.T.tolist()[0]
+        mask = (np.array(x_flat) > 0) & (np.array(y_flat) > 0)
+        x_log = np.log(np.array(x_flat)[mask])
+        y_log = np.log(np.array(y_flat)[mask])
+        # Linear regression on log-transformed data: log(y) = log(d) + alpha*log(x)
+        # This gives us initial estimates for d and alpha
+        if len(x_log) > 1:
+            reg = LinearRegression(fit_intercept=True).fit(x_log.reshape(-1, 1), y_log.reshape(-1, 1))
+            p0 = [np.exp(reg.intercept_[0]), reg.coef_[0][0]]  # [d, alpha]
+        else:
+            p0 = [1.0, 1.0]  # Default fallback
+        # Set reasonable bounds for power law parameters
+        # d > 0 (coefficient must be positive)
+        # alpha can be any real number, but constrain to reasonable range
+        bounds = ([1e-10, -10], [np.inf, 10])  # [d_min, alpha_min], [d_max, alpha_max]
         try:
-            _popt, _, _, _msg, _ier = curve_fit(
+            _popt, _pcov, _, _msg, _ier = curve_fit(
                 cls._power_law,
-                x.T.tolist()[0], y.T.tolist()[0],
+                x_flat, y_flat,
+                p0=p0,
+                bounds=bounds,
                 sigma=1/np.sqrt(_weights) if _weights is not None else None,
                 full_output=True
             )
         except RuntimeError as e:
             _ier = 0
             _msg = str(e)
+            _pcov = np.array([[np.inf, 0], [0, np.inf]])
         if _ier not in range(1, 5):
             print(f"{_msg}")
             _popt = [0, 0]
+            _pcov = np.array([[np.inf, 0], [0, np.inf]])
+        # Calculate confidence intervals from covariance matrix
+        n = len(x)
+        df = n - 2  # degrees of freedom for 2 parameters
+        # Standard errors from covariance matrix diagonal
+        param_errors = np.sqrt(np.diag(_pcov))
+        # Prepare parameters and standard errors for confidence interval computation
+        parameters = {
+            "d": _popt[0],
+            "alpha": _popt[1]
+        }
+        standard_errors = {
+            "d": param_errors[0],
+            "alpha": param_errors[1]
+        }
+        # Compute confidence intervals using the abstracted method
+        confidence_intervals = cls._compute_confidence_intervals(
+            parameters, standard_errors, df, confidence_level
+        )
         model = {
             "model": lambda x: _popt[0]*np.power(x, _popt[1]),
@@ -256,7 +391,9 @@ class PyEvoMotionBase():
                 "d": _popt[0],
                 "alpha": _popt[1]
             },
+            "confidence_intervals": confidence_intervals,
             "expression": "d*x^alpha",
+            "confidence_level": confidence_level,
             "r2": r2_score(y, cls._power_law(x, *_popt), sample_weight=_weights)
         }
@@ -317,16 +454,107 @@ class PyEvoMotionBase():
         F = ((RSS1 - RSS2)/(p2 - p1))/(RSS2/(n - p2))
-        return F, 1 - snedecor_f.cdf(F, p2 - p1, n - p2)
+        return F, 1 - snedecor_f.cdf(F, p2 - p1, n - p2)
+    @classmethod
+    def AIC(
+        cls,
+        model1: dict[str,any],
+        model2: dict[str,any],
+        data: np.ndarray,
+        weights: np.ndarray | None = None
+    ) -> tuple[float, float]:
+        """
+        Compare two fitted models with the small-sample corrected Akaike information criterion (AICc).
+        Uses the small-sample corrected AIC with full constant terms:
+            AICc = n*ln(2*pi) + n*ln(RSS/n) + n + 2k + [2k(k+1)]/(n-k-1)
+        where ``n`` is the number of data points, ``k`` the number of entries in the model's ``parameters`` dictionary and ``RSS`` the weighted residual sum of squares.
+        See https://en.wikipedia.org/wiki/Akaike_information_criterion for more details.
+        Both models are evaluated at the integer positions ``0, 1, ..., n-1`` rather than at explicit x values, so ``data`` is presumably expected to be sampled on that grid (TODO confirm against callers).
+        Positions where either model yields an infinite or NaN squared residual are excluded from both sums, but ``n`` remains the full length of ``data``. An ``RSS`` of exactly 0 is replaced by ``1e-10`` to avoid ``log(0)``. When ``n - k - 1 <= 0`` the AICc of that model is set to ``+inf``; if this happens for both models, both deltas are NaN and both Akaike weights are 0.
+        :param model1: The first model. Must provide a ``model`` callable and a ``parameters`` dictionary.
+        :type model1: dict[str, any]
+        :param model2: The second model. Must provide a ``model`` callable and a ``parameters`` dictionary.
+        :type model2: dict[str, any]
+        :param data: The data to test the models. It is flattened before use.
+        :type data: np.ndarray
+        :param weights: Optional weights for the squared residuals. If provided, they are flattened and passed through ``_weighting_function``; otherwise every point has weight 1. Default is ``None``.
+        :type weights: np.ndarray | None
+        :return: A 6-tuple ``(AICc1, AICc2, dAICc1, dAICc2, w1, w2)``: the AICc of each model, the difference of each AICc to the lower of the two, and the Akaike weight of each model. NOTE(review): the return annotation still declares a 2-tuple although six values are returned.
+        :rtype: ``tuple[float, float, float, float, float, float]``
+        """
+        data = data.flatten()
+        if weights is not None:
+            _weights = cls._weighting_function(weights.flatten())
+        else:
+            # Unweighted case: every residual counts equally
+            _weights = np.ones(len(data))
+        # Number of fitted parameters of each model
+        k1 = len(model1["parameters"])
+        k2 = len(model2["parameters"])
+        n = len(data)
+        # From here on model1/model2 are the vectorized prediction callables, not the dictionaries
+        model1 = np.vectorize(model1["model"])
+        model2 = np.vectorize(model2["model"])
+        # Squared residuals, with predictions taken at x = 0, 1, ..., n-1
+        RS1 = (data - model1(range(n)))**2
+        RS2 = (data - model2(range(n)))**2
+        # Mask the infinite and nan values
+        # The same mask is applied to both models so their sums cover the same points
+        mask = (
+            np.isinf(RS1)
+            | np.isinf(RS2)
+            | np.isnan(RS1)
+            | np.isnan(RS2)
+        )
+        # Sum the residuals without the infinite values
+        RSS1 = np.sum(_weights*RS1, where=~mask)
+        RSS2 = np.sum(_weights*RS2, where=~mask)
+        # Handle edge case where RSS is 0 (perfect fit) to avoid log(0)
+        if RSS1 == 0:
+            RSS1 = 1e-10  # Small positive value to avoid log(0)
+        if RSS2 == 0:
+            RSS2 = 1e-10  # Small positive value to avoid log(0)
+        # n*ln(2*pi) + n: identical for both models, so it cancels out in the deltas and weights
+        const_term = n * (np.log(2*np.pi) + 1.0)
+        # Denominators of the small-sample correction term
+        denom1 = n - k1 - 1
+        denom2 = n - k2 - 1
+        # If denom <= 0, AICc is undefined; treat as +inf (no support)
+        if denom1 <= 0:
+            AICc1 = np.inf
+        else:
+            AICc1 = const_term + n * np.log(RSS1 / n) + 2 * k1 + (2 * k1 * (k1 + 1)) / denom1
+        if denom2 <= 0:
+            AICc2 = np.inf
+        else:
+            AICc2 = const_term + n * np.log(RSS2 / n) + 2 * k2 + (2 * k2 * (k2 + 1)) / denom2
+        # ΔAIC: relative to best (lowest AIC)
+        min_aicc = min(AICc1, AICc2)
+        dAICc1 = AICc1 - min_aicc
+        dAICc2 = AICc2 - min_aicc
+        # Akaike weights
+        # A non-finite delta (inf, or NaN from inf - inf) contributes no relative likelihood
+        rel1 = np.exp(-0.5 * dAICc1) if np.isfinite(dAICc1) else 0
+        rel2 = np.exp(-0.5 * dAICc2) if np.isfinite(dAICc2) else 0
+        # Fall back to 1.0 so that both weights come out as 0 instead of dividing by zero
+        denom = rel1 + rel2 if (rel1 + rel2) > 0 else 1.0
+        w1 = rel1 / denom
+        w2 = rel2 / denom
+        return AICc1, AICc2, dAICc1, dAICc2, w1, w2
     @classmethod
     def adjust_model(cls,
         x: pd.Series,
         y: pd.Series,
         name: str = None,
-        weights: pd.Series | None = None
+        weights: pd.Series | None = None,
+        confidence_level: float = 0.95
     ) -> dict[str, any]:
-        """Adjust a model to the data.
+        """Adjust a model to the data using AIC for model selection.
         :param x: The features. It is a single pandas Series.
         :type x: pd.Series
@@ -336,7 +564,18 @@ class PyEvoMotionBase():
         :type name: str
         :param weights: Optional weights for the data points. If provided, points with higher weights will have more influence on the fit. These weights are scaled by the weighting function tanh(2*n/n_0), where n is the number of data points and n_0 is the number of data points at which the weighting function approximates the constant 1. Default is ``None``.
         :type weights: np.ndarray | None
-        :return: A dictionary with the model.
+        :param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
+        :return: A dictionary containing:
+            * If name is provided: A dictionary with the name as key and the result dictionary as value
+            * If name is None: A dictionary containing:
+                * ``selected_model``: The selected model based on lowest AIC
+                * ``linear_model``: The linear regression model with AIC statistics
+                * ``power_law_model``: The power law model with AIC statistics
+                * ``model_selection``: Dictionary with AIC comparison results
         :rtype: ``dict[str, any]``
         :raises ValueError: If the dataset is empty or full of NaN values. This may occur if the grouped data contains only one entry per group, indicating that the variance cannot be computed.
         """
@@ -350,20 +589,57 @@ class PyEvoMotionBase():
                 f"Dataset length after filtering is: x: {x.size} elements; y: {y.size} elements. In particular:\n\nx: {x}\ny: {y}\n\nPerhaps NaN appeared for certain entries. Check if the grouped data contains only one entry per group, as this may cause NaN values when computing the variance. Also, consider widening the time window."
             )
-        model1 = cls.linear_regression(x, y, weights=w, fit_intercept=False) # Not fitting the intercept because data is passed scaled to the minimum
-        model2 = cls.power_law_fit(x, y, weights=w)
+        model1 = cls.linear_regression(x, y, weights=w, fit_intercept=False, confidence_level=confidence_level) # Not fitting the intercept because data is passed scaled to the minimum
+        model2 = cls.power_law_fit(x, y, weights=w, confidence_level=confidence_level)
-        _, p = cls.F_test(model1, model2, y, weights=w)
+        # Compute AIC statistics for both models
+        AIC1, AIC2, dAIC1, dAIC2, w1, w2 = cls.AIC(model1, model2, y, weights=w)
-        if p < 0.05:
-            model = model2
+        # Select model with lowest AIC (highest Akaike weight)
+        if AIC1 <= AIC2:
+            selected_model = model1
+            selected_model_name = "linear"
         else:
-            model = model1
+            selected_model = model2
+            selected_model_name = "power_law"
+        # Add AIC statistics to each model
+        model1_with_aic = model1.copy()
+        model1_with_aic.update({
+            "AIC": AIC1,
+            "delta_AIC": dAIC1,
+            "akaike_weight": w1,
+            "confidence_level": confidence_level
+        })
+        model2_with_aic = model2.copy()
+        model2_with_aic.update({
+            "AIC": AIC2,
+            "delta_AIC": dAIC2,
+            "akaike_weight": w2,
+            "confidence_level": confidence_level
+        })
+        # Create comprehensive result dictionary
+        result = {
+            "selected_model": selected_model,
+            "linear_model": model1_with_aic,
+            "power_law_model": model2_with_aic,
+            "model_selection": {
+                "selected": selected_model_name,
+                "linear_AIC": AIC1,
+                "power_law_AIC": AIC2,
+                "delta_AIC_linear": dAIC1,
+                "delta_AIC_power_law": dAIC2,
+                "akaike_weight_linear": w1,
+                "akaike_weight_power_law": w2
+            }
+        }
         if name:
-            return {name: model}
+            return {name: result}
         else:
-            return model
+            return result
     @staticmethod
     def plot_single_data_and_model(

PyEvoMotion 0.1.1__tar.gz → 0.1.2__tar.gz

PyEvoMotion 0.1.1__tar.gz → 0.1.2__tar.gz