PyEvoMotion 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,436 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Analyze parameter variability across multiple PyEvoMotion runs.
4
+
5
+ This script loads regression results from multiple runs and creates
6
+ violin plots to visualize parameter distributions and assess
7
+ reproducibility of the nonlinear fitting process.
8
+ """
9
+
10
+ import json
11
+ import os
12
+ from pathlib import Path
13
+ from typing import Dict, List
14
+
15
+ import matplotlib.pyplot as plt
16
+ import matplotlib as mpl
17
+ import numpy as np
18
+ import pandas as pd
19
+
20
+
21
def set_matplotlib_params():
    """Apply the shared figure style to matplotlib's global ``rcParams``.

    The settings cover the sans-serif font, axis line width and label size,
    hidden top/right spines, base font size, major tick width/length and a
    frameless legend. They are written into ``mpl.rcParams`` and therefore
    affect figures created afterwards.
    """
    mpl.rcParams.update({
        "font.sans-serif": "Helvetica",
        "axes.linewidth": 2,
        "axes.labelsize": 14,
        "axes.spines.top": False,
        "axes.spines.right": False,
        "font.size": 12,
        "xtick.major.width": 2,
        "ytick.major.width": 2,
        "xtick.major.size": 6,
        "ytick.major.size": 6,
        "legend.frameon": False,
    })
38
+
39
+
40
def load_regression_results(base_dir: Path, country: str, num_runs: int = 5) -> List[Dict]:
    """
    Load regression results from multiple runs.

    For every run number from 1 to ``num_runs`` (inclusive) the file
    ``<base_dir>/<country>_run<N>/fig<country>_regression_results.json``
    is parsed. A missing file is reported with a warning on stdout and the
    run is skipped, so the returned list may be shorter than ``num_runs``.

    :param base_dir: Base directory containing run subdirectories
    :type base_dir: Path
    :param country: Either "UK" or "USA"
    :type country: str
    :param num_runs: Number of runs to load (default 5)
    :type num_runs: int
    :return: One dictionary per loaded run with the keys ``run`` (run
        number), ``country`` and ``data`` (the parsed JSON content)
    :rtype: List[Dict]
    """
    base_dir = Path(base_dir)  # also accept a plain string path
    results = []

    for run_num in range(1, num_runs + 1):
        results_file = (
            base_dir
            / f"{country}_run{run_num}"
            / f"fig{country}_regression_results.json"
        )

        if not results_file.exists():
            print(f"Warning: {results_file} not found")
            continue

        # JSON text is UTF-8; do not depend on the platform's default
        # encoding, which may differ (e.g. on Windows).
        with open(results_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        results.append({
            'run': run_num,
            'country': country,
            'data': data,
        })

    return results
71
+
72
+
73
def extract_parameters(results: List[Dict]) -> pd.DataFrame:
    """
    Extract parameters from regression results into a DataFrame.

    Each entry of ``results`` must provide the keys ``run``, ``country`` and
    ``data``. From ``data`` the first matching mean-model entry and the first
    matching scaled-variance-model entry are read. The variance model is
    understood in two layouts: with a ``model_selection`` section (parameters
    are taken from the selected ``linear_model`` / ``power_law_model``) or
    the older flat layout, where the model type is deduced from the parameter
    names (``m`` means linear, ``d`` and ``alpha`` mean power law).

    Every row carries the same columns; values that a run does not provide
    are left as ``None``/NaN. This keeps column access safe for consumers
    even when a model is absent from some or all runs.

    :param results: List of regression result dictionaries
    :type results: List[Dict]
    :return: DataFrame with parameters from all runs (one row per run)
    :rtype: pd.DataFrame
    """
    mean_keys = (
        "mean number of mutations model",
        "mean number of mutations per 7D model",
        "mean number of substitutions model",
    )
    var_keys = (
        "scaled var number of mutations model",
        "scaled var number of mutations per 7D model",
        "scaled var number of substitutions model",
    )
    columns = [
        'run', 'country',
        'mean_m', 'mean_b', 'mean_r2',
        'var_model_selected', 'var_m', 'var_d', 'var_alpha', 'var_r2',
    ]

    records = []

    for result in results:
        data = result['data']

        # Start from a full row of None so that no column is ever missing.
        record = dict.fromkeys(columns)
        record['run'] = result['run']
        record['country'] = result['country']

        # Mean model: first known key present in the data wins.
        mean_key = next((key for key in mean_keys if key in data), None)
        if mean_key is not None:
            mean_model = data[mean_key]
            record['mean_m'] = mean_model['parameters']['m']
            record['mean_b'] = mean_model['parameters']['b']
            record['mean_r2'] = mean_model['r2']

        # Variance model: first known key present in the data wins.
        var_key = next((key for key in var_keys if key in data), None)
        if var_key is not None:
            var_model = data[var_key]

            if "model_selection" in var_model:
                selected = var_model["model_selection"]["selected"]
                record['var_model_selected'] = selected

                if selected == "linear" and "linear_model" in var_model:
                    linear = var_model["linear_model"]
                    record['var_m'] = linear['parameters']['m']
                    record['var_r2'] = linear['r2']
                elif selected == "power_law" and "power_law_model" in var_model:
                    power_law = var_model["power_law_model"]
                    record['var_d'] = power_law['parameters']['d']
                    record['var_alpha'] = power_law['parameters']['alpha']
                    record['var_r2'] = power_law['r2']
            else:
                # Old format without model selection
                params = var_model['parameters']
                record['var_r2'] = var_model['r2']

                if 'm' in params:
                    record['var_m'] = params['m']
                    record['var_model_selected'] = 'linear'
                elif 'd' in params and 'alpha' in params:
                    record['var_d'] = params['d']
                    record['var_alpha'] = params['alpha']
                    record['var_model_selected'] = 'power_law'

        records.append(record)

    return pd.DataFrame(records, columns=columns)
158
+
159
+
160
def create_violin_plots(df: pd.DataFrame, export: bool = False, show: bool = True, output_filename: str = "share/test_runs_violin_plot.pdf"):
    """
    Create violin plots for parameter distributions.

    One subplot is drawn per parameter on a 2x3 grid: the three mean-model
    parameters followed by the variance-model parameters. Power-law
    parameters (``var_d``, ``var_alpha``) are shown as soon as at least one
    run selected the power-law model; otherwise the linear slope (``var_m``)
    is shown. Each subplot contains one violin per country ("UK", "USA"),
    the individual runs as horizontally jittered points (the jitter is
    random) and a text box with mean, standard deviation and CV% per country.

    A country without values for a parameter is left out of that subplot
    instead of being passed to ``violinplot`` as an empty dataset.

    :param df: DataFrame with extracted parameters; needs a ``country``
        column, parameter columns may be missing or partly empty
    :type df: pd.DataFrame
    :param export: Whether to save the figure (default False)
    :type export: bool
    :param show: Whether to display the figure (default True)
    :type show: bool
    :param output_filename: Path to save the figure
    :type output_filename: str
    """
    set_matplotlib_params()

    # Define colors
    colors = {
        "UK": "#76d6ff",
        "USA": "#FF6346",
    }
    countries = ['UK', 'USA']

    # Parameters to plot: (column, subplot title, unit)
    mean_params = [
        ('mean_m', 'Mean: Slope (m)', 'mutations/week'),
        ('mean_b', 'Mean: Intercept (b)', 'mutations'),
        ('mean_r2', 'Mean: R²', '')
    ]

    # Check which variance model is predominantly used. The column may be
    # absent when no run provided a variance model.
    if 'var_model_selected' in df.columns:
        var_model_counts = df['var_model_selected'].value_counts()
    else:
        var_model_counts = pd.Series(dtype="int64")
    print("\nVariance model selection:")
    print(var_model_counts)

    # Determine which variance parameters to plot
    if var_model_counts.get('power_law', 0) > 0:
        var_params = [
            ('var_d', 'Variance: Coefficient (d)', ''),
            ('var_alpha', 'Variance: Exponent (α)', ''),
            ('var_r2', 'Variance: R²', '')
        ]
    else:
        var_params = [
            ('var_m', 'Variance: Slope (m)', 'mutations²/week'),
            ('var_r2', 'Variance: R²', '')
        ]

    all_params = mean_params + var_params

    # Create subplots
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.flatten()

    # zip() stops at the shorter sequence, so at most len(axes) panels are used.
    for ax, (param, title, unit) in zip(axes, all_params):
        # Collect the non-missing values of this parameter per country.
        per_country = {}
        if param in df.columns:
            valid = df[df[param].notna()]
            for country in countries:
                values = valid.loc[valid['country'] == country, param]
                per_country[country] = values.astype(float)
        present = [c for c in countries if len(per_country.get(c, ())) > 0]

        if not present:
            ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
            ax.set_title(title)
            continue

        # Keep UK at x=0 and USA at x=1 even when one of them is missing.
        positions = [countries.index(c) for c in present]

        # Create violin plot (only for countries that have data; an empty
        # dataset cannot be summarised by violinplot)
        parts = ax.violinplot(
            [per_country[c].values for c in present],
            positions=positions,
            showmeans=True,
            showextrema=True,
            widths=0.7
        )

        # Color the violins
        for country, pc in zip(present, parts['bodies']):
            pc.set_facecolor(colors[country])
            pc.set_alpha(0.7)
            pc.set_edgecolor('black')
            pc.set_linewidth(1.5)

        # Style the other elements
        for partname in ['cmeans', 'cmaxes', 'cmins', 'cbars']:
            if partname in parts:
                parts[partname].set_edgecolor('black')
                parts[partname].set_linewidth(2)

        # Add scatter points for individual runs
        for country, pos in zip(present, positions):
            values = per_country[country]
            x_pos = np.random.normal(pos, 0.04, size=len(values))
            ax.scatter(x_pos, values.values,
                       alpha=0.6, s=50, c='black', zorder=3, edgecolors='white', linewidth=1)

        # Styling
        ax.set_xticks([0, 1])
        ax.set_xticklabels(['UK', 'USA'])
        label = title.split(": ")[1]
        ax.set_ylabel(f"{label} ({unit})" if unit else label)
        ax.set_title(title, fontweight='bold')
        ax.grid(axis='y', alpha=0.3, linestyle='--')

        # Add statistics text. Axes-fraction coordinates keep the boxes in
        # the top-right corner of the panel whatever the data range is
        # (a position derived from the y-limit values falls outside the
        # panel when the range is negative or far away from zero).
        for country, pos in zip(present, positions):
            values = per_country[country]
            mean_val = values.mean()
            std_val = values.std()
            cv = (std_val / mean_val * 100) if mean_val != 0 else 0

            ax.text(0.98, 0.95 - pos * 0.08,
                    f'{country}: μ={mean_val:.4f}, σ={std_val:.4f}, CV={cv:.2f}%',
                    transform=ax.transAxes, ha='right', va='top',
                    fontsize=9, bbox=dict(boxstyle='round', facecolor=colors[country], alpha=0.3))

    # Hide unused subplots
    for ax in axes[len(all_params):]:
        ax.set_visible(False)

    fig.suptitle('Parameter Variability Across Multiple Runs\n(Assessing Nonlinear Fitting Reproducibility)',
                 fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()

    if export:
        fig.savefig(output_filename, dpi=400, bbox_inches='tight')
        print(f"\nViolin plot saved as {output_filename}")

    if show:
        plt.show()
293
+
294
+
295
def _print_param_stats(country_df: pd.DataFrame, params) -> None:
    """Print mean, standard deviation and CV% for each listed column that has data."""
    for param in params:
        if param not in country_df.columns:
            continue
        values = country_df[param].dropna()
        if len(values) == 0:
            continue
        mean = values.mean()
        std = values.std()
        # CV is undefined for a zero mean; report 0 in that case.
        cv = (std / mean * 100) if mean != 0 else 0
        print(f" {param:12s}: μ={mean:10.6f}, σ={std:10.6f}, CV={cv:6.2f}%")


def print_summary_statistics(df: pd.DataFrame):
    """
    Print summary statistics for all parameters.

    Displays mean, standard deviation, and coefficient of variation (CV%)
    for each parameter grouped by country ("UK" and "USA"). The variance
    model reported for a country is the most frequent entry of
    ``var_model_selected``; when the column is missing or the country has
    no entry, ``unknown`` is reported and the linear parameters are tried.

    :param df: DataFrame with extracted parameters
    :type df: pd.DataFrame
    """
    print("\n" + "="*80)
    print("PARAMETER VARIABILITY SUMMARY")
    print("="*80)

    for country in ['UK', 'USA']:
        print(f"\n{country} Dataset:")
        print("-" * 40)

        country_df = df[df['country'] == country]

        # Mean model parameters
        print("\nMean Model:")
        _print_param_stats(country_df, ['mean_m', 'mean_b', 'mean_r2'])

        # Variance model parameters
        print("\nVariance Model:")
        var_model = 'unknown'
        if 'var_model_selected' in country_df.columns:
            # mode() is empty when the country has no rows or only missing
            # values, so its first element must not be taken blindly.
            modes = country_df['var_model_selected'].mode()
            if not modes.empty:
                var_model = modes.iloc[0]
        print(f" Selected model: {var_model}")

        if var_model == 'power_law':
            _print_param_stats(country_df, ['var_d', 'var_alpha', 'var_r2'])
        else:
            _print_param_stats(country_df, ['var_m', 'var_r2'])

    print("\n" + "="*80)
351
+
352
+
353
def main():
    """
    Main execution function for analyzing test run parameter variability.

    Loads regression results from batch directories, extracts parameters,
    computes statistics, and generates violin plots to visualize parameter
    distributions across multiple runs.

    The batch is taken from the first command line argument
    (``share/test-runs/<batch>``). Without an argument the ``batch*``
    subdirectories of ``share/test-runs`` are inspected: none means the
    directory itself is used, exactly one is used automatically, and with
    several the user is asked to choose and nothing is analyzed. The
    extracted parameters are written to a CSV file and the violin plot to a
    PDF file, both under ``share/``.
    """

    import sys

    # Parse command line arguments
    if len(sys.argv) > 1:
        batch_name = sys.argv[1]
        base_dir = Path(f"share/test-runs/{batch_name}")
        output_suffix = f"_{batch_name}"
    else:
        # Try to auto-detect batch directories
        test_runs_dir = Path("share/test-runs")

        # iterdir() raises on a missing directory; treat that like "no
        # batches" so the existence check below reports the problem.
        if test_runs_dir.is_dir():
            batch_dirs = sorted(
                d for d in test_runs_dir.iterdir()
                if d.is_dir() and d.name.startswith("batch")
            )
        else:
            batch_dirs = []

        if not batch_dirs:
            # Fall back to old structure (no batch subdirectories)
            base_dir = Path("share/test-runs")
            output_suffix = ""
        elif len(batch_dirs) == 1:
            # Use the only batch found
            base_dir = batch_dirs[0]
            output_suffix = f"_{batch_dirs[0].name}"
            print(f"Auto-detected batch: {batch_dirs[0].name}")
        else:
            # Multiple batches - the user has to pick one
            print(f"Found {len(batch_dirs)} batches: {[d.name for d in batch_dirs]}")
            print("Please specify which batch to analyze:")
            print(" python analyze_test_runs.py batch1")
            print("Or analyze all batches separately by running for each.")
            return

    if not base_dir.exists():
        print(f"Error: Directory {base_dir} does not exist!")
        return

    # Auto-detect number of runs from the run directories of both countries
    uk_runs = list(base_dir.glob("UK_run*"))
    usa_runs = list(base_dir.glob("USA_run*"))
    num_runs = max(len(uk_runs), len(usa_runs))

    countries = ["UK", "USA"]

    print(f"Loading regression results from {base_dir}...")
    print(f"Detected {num_runs} runs per country")

    # Load all results
    all_results = []
    for country in countries:
        results = load_regression_results(base_dir, country, num_runs)
        all_results.extend(results)
        print(f"Loaded {len(results)} runs for {country}")

    if not all_results:
        print("Error: No results found!")
        return

    # Extract parameters into DataFrame
    print("\nExtracting parameters...")
    df = extract_parameters(all_results)

    # Save to CSV for further analysis
    output_csv = f"share/test_runs_parameters{output_suffix}.csv"
    df.to_csv(output_csv, index=False)
    print(f"Parameters saved to {output_csv}")

    # Print summary statistics
    print_summary_statistics(df)

    # Create violin plots
    print("\nCreating violin plots...")
    output_plot = f"share/test_runs_violin_plot{output_suffix}.pdf"
    create_violin_plots(df, export=True, show=True, output_filename=output_plot)
432
+
433
+
434
# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
436
+
Binary file
Binary file