PyPI - PyEvoMotion - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl - Mend

PyEvoMotion 0.1.0py3-none-any.whl → 0.1.2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

PyEvoMotion/cli.py +88 -11
PyEvoMotion/core/base.py +373 -34
PyEvoMotion/core/core.py +136 -43
PyEvoMotion/core/parser.py +4 -1
{pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/METADATA +72 -4
pyevomotion-0.1.2.dist-info/RECORD +35 -0
share/analyze_model_selection_accuracy.py +316 -0
share/analyze_test_runs.py +436 -0
share/anomalous_diffusion.pdf +0 -0
share/confusion_matrix_heatmap.pdf +0 -0
share/figUK.tsv +9949 -0
share/figUK_plots.pdf +0 -0
share/figUK_regression_results.json +65 -0
share/figUK_run_args.json +14 -0
share/figUK_stats.tsv +41 -0
share/figUSA.tsv +9470 -0
share/figUSA_plots.pdf +0 -0
share/figUSA_regression_results.json +65 -0
share/figUSA_run_args.json +14 -0
share/figUSA_stats.tsv +34 -0
share/figdataUK.tsv +10001 -0
share/figdataUSA.tsv +10001 -0
share/generate_sequences_from_synthdata.py +85 -0
share/generate_sequences_from_test5_data.py +107 -0
share/manuscript_figure.py +858 -43
share/run_parallel_analysis.py +196 -0
share/synth_figure.pdf +0 -0
share/uk_time_windows.pdf +0 -0
share/weekly_size.pdf +0 -0
pyevomotion-0.1.0.dist-info/RECORD +0 -13
{pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/WHEEL +0 -0
{pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/entry_points.txt +0 -0

share/analyze_test_runs.py ADDED Viewed

@@ -0,0 +1,436 @@
+#!/usr/bin/env python3
+"""
+Analyze parameter variability across multiple PyEvoMotion runs.
+This script loads regression results from multiple runs and creates
+violin plots to visualize parameter distributions and assess
+reproducibility of the nonlinear fitting process.
+"""
+import json
+import os
+from pathlib import Path
+from typing import Dict, List
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+import numpy as np
+import pandas as pd
def set_matplotlib_params():
    """Apply the shared figure style to matplotlib's global ``rcParams``.

    Each entry of the style table below is written into ``mpl.rcParams``,
    so every figure created afterwards in this process uses it.
    """
    # Typography
    style = {
        "font.sans-serif": "Helvetica",
        "font.size": 12,
    }
    # Axes: thick frame, no top/right spines
    style["axes.linewidth"] = 2
    style["axes.labelsize"] = 14
    style["axes.spines.top"] = False
    style["axes.spines.right"] = False
    # Tick marks, identical on both axes
    for axis_name in ("xtick", "ytick"):
        style[f"{axis_name}.major.width"] = 2
        style[f"{axis_name}.major.size"] = 6
    # Legend without a surrounding box
    style["legend.frameon"] = False
    for name in style:
        mpl.rcParams[name] = style[name]
def load_regression_results(base_dir: Path, country: str, num_runs: int = 5) -> List[Dict]:
    """
    Load regression results from multiple runs.

    For every run number ``N`` in ``1..num_runs`` the file
    ``<base_dir>/<country>_run<N>/fig<country>_regression_results.json``
    is read. A missing file is reported with a warning on stdout and
    skipped, so the returned list may be shorter than ``num_runs``.

    :param base_dir: Base directory containing run subdirectories
        (a ``Path`` or a plain string path)
    :type base_dir: Path
    :param country: Either "UK" or "USA"
    :type country: str
    :param num_runs: Number of runs to load (default 5)
    :type num_runs: int
    :return: List of dictionaries with the keys ``run``, ``country`` and
        ``data`` (the parsed JSON document)
    :rtype: List[Dict]
    """
    # Accept plain strings as well; the "/" joins below need a Path.
    base_dir = Path(base_dir)
    results = []
    for run_num in range(1, num_runs + 1):
        results_file = (
            base_dir
            / f"{country}_run{run_num}"
            / f"fig{country}_regression_results.json"
        )
        if not results_file.exists():
            print(f"Warning: {results_file} not found")
            continue
        # Explicit UTF-8 keeps decoding independent of the platform locale.
        with open(results_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        results.append({
            'run': run_num,
            'country': country,
            'data': data
        })
    return results
# Candidate JSON keys under which the mean model may be stored.
_MEAN_MODEL_KEYS = (
    "mean number of mutations model",
    "mean number of mutations per 7D model",
    "mean number of substitutions model",
)
# Candidate JSON keys under which the (scaled) variance model may be stored.
_VAR_MODEL_KEYS = (
    "scaled var number of mutations model",
    "scaled var number of mutations per 7D model",
    "scaled var number of substitutions model",
)
# Columns of the returned DataFrame, always present and in this order.
_PARAMETER_COLUMNS = (
    'run', 'country',
    'mean_m', 'mean_b', 'mean_r2',
    'var_model_selected', 'var_m', 'var_d', 'var_alpha', 'var_r2',
)
def _first_present_key(data, candidates):
    """Return the first entry of ``candidates`` that is a key of ``data``, else None."""
    return next((key for key in candidates if key in data), None)
def extract_parameters(results: List[Dict]) -> pd.DataFrame:
    """
    Extract parameters from regression results into a DataFrame.

    Every row carries the same columns (``run``, ``country``, ``mean_m``,
    ``mean_b``, ``mean_r2``, ``var_model_selected``, ``var_m``, ``var_d``,
    ``var_alpha``, ``var_r2``). Parameters that a run does not provide are
    left as ``None``; ``var_model_selected`` is ``'unknown'`` when the run
    contains no usable variance model. Keeping the schema fixed lets
    downstream code index these columns without checking for their
    existence first.

    :param results: List of regression result dictionaries, each with the
        keys ``run``, ``country`` and ``data``
    :type results: List[Dict]
    :return: DataFrame with parameters from all runs, one row per run
    :rtype: pd.DataFrame
    """
    records = []
    for result in results:
        data = result['data']
        record = dict.fromkeys(_PARAMETER_COLUMNS)
        record['run'] = result['run']
        record['country'] = result['country']
        record['var_model_selected'] = 'unknown'
        # Extract mean model parameters
        mean_key = _first_present_key(data, _MEAN_MODEL_KEYS)
        if mean_key is not None:
            mean_model = data[mean_key]
            record['mean_m'] = mean_model['parameters']['m']
            record['mean_b'] = mean_model['parameters']['b']
            record['mean_r2'] = mean_model['r2']
        # Extract variance model parameters
        var_key = _first_present_key(data, _VAR_MODEL_KEYS)
        if var_key is not None:
            var_model = data[var_key]
            if "model_selection" in var_model:
                # New format: both candidate fits are stored, one is selected.
                selected = var_model["model_selection"]["selected"]
                record['var_model_selected'] = selected
                if selected == "linear" and "linear_model" in var_model:
                    linear = var_model["linear_model"]
                    record['var_m'] = linear['parameters']['m']
                    record['var_r2'] = linear['r2']
                elif selected == "power_law" and "power_law_model" in var_model:
                    power_law = var_model["power_law_model"]
                    record['var_d'] = power_law['parameters']['d']
                    record['var_alpha'] = power_law['parameters']['alpha']
                    record['var_r2'] = power_law['r2']
            else:
                # Old format without model selection: the model type is
                # inferred from which parameter names are present.
                params = var_model['parameters']
                record['var_r2'] = var_model['r2']
                if 'm' in params:
                    record['var_m'] = params['m']
                    record['var_model_selected'] = 'linear'
                elif 'd' in params and 'alpha' in params:
                    record['var_d'] = params['d']
                    record['var_alpha'] = params['alpha']
                    record['var_model_selected'] = 'power_law'
        records.append(record)
    return pd.DataFrame(records, columns=list(_PARAMETER_COLUMNS))
def create_violin_plots(df: pd.DataFrame, export: bool = False, show: bool = True, output_filename: str = "share/test_runs_violin_plot.pdf"):
    """
    Create violin plots for parameter distributions.

    One panel is drawn per parameter (mean-model slope, intercept and R²,
    followed by the variance-model parameters). Within a panel the UK
    violin sits at x=0 and the USA violin at x=1; individual runs are
    overlaid as jittered points and a per-country summary (mean, standard
    deviation, coefficient of variation) is written in the top-right
    corner. A country without values for a parameter is simply omitted
    from that panel, and a panel without any values shows "No data".

    :param df: DataFrame with extracted parameters
    :type df: pd.DataFrame
    :param export: Whether to save the figure (default False)
    :type export: bool
    :param show: Whether to display the figure (default True)
    :type show: bool
    :param output_filename: Path to save the figure
    :type output_filename: str
    """
    set_matplotlib_params()
    # Define colors
    colors = {
        "UK": "#76d6ff",
        "USA": "#FF6346",
    }
    # The list index of a country is its x position in every panel.
    countries = ['UK', 'USA']
    # Parameters to plot
    mean_params = [
        ('mean_m', 'Mean: Slope (m)', 'mutations/week'),
        ('mean_b', 'Mean: Intercept (b)', 'mutations'),
        ('mean_r2', 'Mean: R²', '')
    ]
    # Check which variance model is predominantly used
    if 'var_model_selected' in df.columns:
        var_model_counts = df['var_model_selected'].value_counts()
    else:
        var_model_counts = pd.Series(dtype=int)
    print("\nVariance model selection:")
    print(var_model_counts)
    # Determine which variance parameters to plot
    if var_model_counts.get('power_law', 0) > 0:
        var_params = [
            ('var_d', 'Variance: Coefficient (d)', ''),
            ('var_alpha', 'Variance: Exponent (α)', ''),
            ('var_r2', 'Variance: R²', '')
        ]
    else:
        var_params = [
            ('var_m', 'Variance: Slope (m)', 'mutations²/week'),
            ('var_r2', 'Variance: R²', '')
        ]
    all_params = mean_params + var_params
    # Create subplots
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.flatten()
    for ax, (param, title, unit) in zip(axes, all_params):
        # Collect the non-missing values of this parameter per country.
        # Countries without any value are left out: violinplot cannot
        # draw an empty dataset.
        per_country = []
        if param in df.columns:
            valid = df[df[param].notna()]
            for pos, country in enumerate(countries):
                values = valid.loc[valid['country'] == country, param]
                if len(values) > 0:
                    per_country.append((pos, country, values))
        if not per_country:
            ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
            ax.set_title(title)
            continue
        # Create violin plot
        parts = ax.violinplot(
            [values.to_numpy(dtype=float) for _, _, values in per_country],
            positions=[pos for pos, _, _ in per_country],
            showmeans=True,
            showextrema=True,
            widths=0.7
        )
        # Color the violins; bodies come back in the order of the datasets,
        # so pair them with the countries that were actually plotted.
        for pc, (_, country, _) in zip(parts['bodies'], per_country):
            pc.set_facecolor(colors[country])
            pc.set_alpha(0.7)
            pc.set_edgecolor('black')
            pc.set_linewidth(1.5)
        # Style the other elements
        for partname in ['cmeans', 'cmaxes', 'cmins', 'cbars']:
            if partname in parts:
                parts[partname].set_edgecolor('black')
                parts[partname].set_linewidth(2)
        # Add scatter points for individual runs (horizontal jitter only)
        for pos, _, values in per_country:
            x_pos = np.random.normal(pos, 0.04, size=len(values))
            ax.scatter(x_pos, values.values,
                      alpha=0.6, s=50, c='black', zorder=3, edgecolors='white', linewidth=1)
        # Styling
        ax.set_xticks([0, 1])
        ax.set_xticklabels(countries)
        label = title.split(": ")[1]
        ax.set_ylabel(f"{label} ({unit})" if unit else label)
        ax.set_title(title, fontweight='bold')
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        # Add statistics text. Positions are axes fractions so the boxes
        # stay in the top-right corner whatever the data range is
        # (including all-negative values).
        for pos, country, values in per_country:
            mean_val = values.mean()
            std_val = values.std()
            cv = (std_val / mean_val * 100) if mean_val != 0 else 0
            ax.text(0.98, 0.95 - pos * 0.08,
                   f'{country}: μ={mean_val:.4f}, σ={std_val:.4f}, CV={cv:.2f}%',
                   transform=ax.transAxes, ha='right', va='top',
                   fontsize=9, bbox=dict(boxstyle='round', facecolor=colors[country], alpha=0.3))
    # Hide unused subplots
    for idx in range(len(all_params), len(axes)):
        axes[idx].set_visible(False)
    fig.suptitle('Parameter Variability Across Multiple Runs\n(Assessing Nonlinear Fitting Reproducibility)',
                 fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()
    if export:
        # Make sure the target directory exists before writing the file.
        Path(output_filename).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_filename, dpi=400, bbox_inches='tight')
        print(f"\nViolin plot saved as {output_filename}")
    if show:
        plt.show()
    else:
        # Nothing will display this figure; release it so repeated calls
        # do not accumulate open figures.
        plt.close(fig)
def _print_parameter_stats(country_df, params):
    """Print mean, standard deviation and CV% for each listed column that has data."""
    for param in params:
        if param not in country_df.columns:
            continue
        values = country_df[param].dropna()
        if len(values) == 0:
            continue
        mean = values.mean()
        std = values.std()
        # CV is undefined for a zero mean; report 0 instead of dividing.
        cv = (std / mean * 100) if mean != 0 else 0
        print(f"  {param:12s}: μ={mean:10.6f}, σ={std:10.6f}, CV={cv:6.2f}%")
def print_summary_statistics(df: pd.DataFrame):
    """
    Print summary statistics for all parameters.
    Displays mean, standard deviation, and coefficient of variation (CV%)
    for each parameter grouped by country.

    The variance model reported for a country is the most frequent value
    of ``var_model_selected`` among its runs; it is shown as ``unknown``
    when that column is missing or holds no value for the country (for
    example when no run of that country was loaded).

    :param df: DataFrame with extracted parameters
    :type df: pd.DataFrame
    """
    print("\n" + "="*80)
    print("PARAMETER VARIABILITY SUMMARY")
    print("="*80)
    for country in ['UK', 'USA']:
        print(f"\n{country} Dataset:")
        print("-" * 40)
        country_df = df[df['country'] == country]
        # Mean model parameters
        print("\nMean Model:")
        _print_parameter_stats(country_df, ['mean_m', 'mean_b', 'mean_r2'])
        # Variance model parameters
        print("\nVariance Model:")
        var_model = 'unknown'
        if 'var_model_selected' in country_df.columns:
            # mode() is empty when there are no rows or only missing values.
            modes = country_df['var_model_selected'].mode()
            if len(modes) > 0:
                var_model = modes.iloc[0]
        print(f"  Selected model: {var_model}")
        if var_model == 'power_law':
            _print_parameter_stats(country_df, ['var_d', 'var_alpha', 'var_r2'])
        else:
            _print_parameter_stats(country_df, ['var_m', 'var_r2'])
    print("\n" + "="*80)
def main():
    """
    Main execution function for analyzing test run parameter variability.
    Loads regression results from batch directories, extracts parameters,
    computes statistics, and generates violin plots to visualize parameter
    distributions across multiple runs.

    Usage: ``python analyze_test_runs.py [batch_name]``. With a batch name
    the runs are read from ``share/test-runs/<batch_name>``. Without one,
    a single ``batch*`` subdirectory of ``share/test-runs`` is picked
    automatically, several of them make the script list the candidates
    and exit, and none falls back to ``share/test-runs`` itself.
    All paths are relative to the current working directory.
    """
    import sys
    test_runs_dir = Path("share/test-runs")
    # Parse command line arguments
    if len(sys.argv) > 1:
        batch_name = sys.argv[1]
        BASE_DIR = Path(f"share/test-runs/{batch_name}")
        output_suffix = f"_{batch_name}"
    else:
        # Try to auto-detect batch directories. A missing test-runs
        # directory yields no batches and is reported by the existence
        # check below instead of raising from iterdir().
        if test_runs_dir.is_dir():
            batch_dirs = sorted(
                d for d in test_runs_dir.iterdir()
                if d.is_dir() and d.name.startswith("batch")
            )
        else:
            batch_dirs = []
        if len(batch_dirs) == 0:
            # Fall back to old structure (no batch subdirectories)
            BASE_DIR = test_runs_dir
            output_suffix = ""
        elif len(batch_dirs) == 1:
            # Use the only batch found
            BASE_DIR = batch_dirs[0]
            output_suffix = f"_{batch_dirs[0].name}"
            print(f"Auto-detected batch: {batch_dirs[0].name}")
        else:
            # Multiple batches - the user has to choose one explicitly
            print(f"Found {len(batch_dirs)} batches: {[d.name for d in batch_dirs]}")
            print("Please specify which batch to analyze:")
            print("  python analyze_test_runs.py batch1")
            print("Or analyze all batches separately by running for each.")
            return
    if not BASE_DIR.exists():
        print(f"Error: Directory {BASE_DIR} does not exist!")
        return
    COUNTRIES = ["UK", "USA"]
    # Auto-detect number of runs. The loader walks run numbers 1..NUM_RUNS,
    # so use the highest run index found rather than the directory count;
    # otherwise a gap in the numbering would hide the last runs.
    run_numbers = []
    for country in COUNTRIES:
        prefix = f"{country}_run"
        for path in BASE_DIR.glob(f"{prefix}*"):
            suffix = path.name[len(prefix):]
            if suffix.isdecimal():
                run_numbers.append(int(suffix))
    NUM_RUNS = max(run_numbers, default=0)
    print(f"Loading regression results from {BASE_DIR}...")
    print(f"Detected {NUM_RUNS} runs per country")
    # Load all results
    all_results = []
    for country in COUNTRIES:
        results = load_regression_results(BASE_DIR, country, NUM_RUNS)
        all_results.extend(results)
        print(f"Loaded {len(results)} runs for {country}")
    if not all_results:
        print("Error: No results found!")
        return
    # Extract parameters into DataFrame
    print("\nExtracting parameters...")
    df = extract_parameters(all_results)
    # Save to CSV for further analysis
    output_csv = f"share/test_runs_parameters{output_suffix}.csv"
    df.to_csv(output_csv, index=False)
    print(f"Parameters saved to {output_csv}")
    # Print summary statistics
    print_summary_statistics(df)
    # Create violin plots
    print("\nCreating violin plots...")
    output_plot = f"share/test_runs_violin_plot{output_suffix}.pdf"
    create_violin_plots(df, export=True, show=True, output_filename=output_plot)
if __name__ == "__main__":
    main()

share/anomalous_diffusion.pdf ADDED Viewed

Binary file

share/confusion_matrix_heatmap.pdf ADDED Viewed

Binary file

PyEvoMotion 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

PyEvoMotion 0.1.0py3-none-any.whl → 0.1.2py3-none-any.whl