PyEvoMotion 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyEvoMotion/core/core.py CHANGED
@@ -433,7 +433,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
          length: int,
          show: bool = False,
          mutation_kind: str = "all",
-         export_plots_filename: str | None = None
+         export_plots_filename: str | None = None,
+         confidence_level: float = 0.95
      ) -> tuple[pd.DataFrame, dict[str,dict[str,any]]]:
          """
          Perform the global analysis of the data.
@@ -446,8 +447,10 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
          :type show: bool
          :param mutation_kind: The kind of mutation to compute the statistics for. Has to be one of ``all``, ``total``, ``substitutions`` or ``indels``. Default is ``all``.
          :type mutation_kind: str
-         :param export_plots: Filename to export the plots. Default is None and does not export the plots.
-         :type export_plots: str | None
+         :param export_plots_filename: Filename to export the plots. Default is None and does not export the plots.
+         :type export_plots_filename: str | None
+         :param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
+         :type confidence_level: float
          :return: The statistics and the regression models.
          :rtype: ``tuple[pd.DataFrame, dict[str, dict[str, any]]]``
          """
@@ -476,39 +479,49 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
                              stats.index, # Regression is given by the index, so in time, it is the same as multiplying by dt days
                              stats[col],
                              weights
-                         )
+                         ),
+                         confidence_level=confidence_level
                      )
                  }
              elif col.startswith("var"):
-                 _single_regression = self.adjust_model(
+                 _adjust_result = self.adjust_model(
                      stats.index,
                      stats[col] - stats[col].min(),
                      name=f"scaled {col} model",
-                     weights=weights.to_numpy().flatten()
+                     weights=weights.to_numpy().flatten(),
+                     confidence_level=confidence_level
                  )
+                 # Extract the selected model for backward compatibility while preserving all model info
+                 model_name = f"scaled {col} model"
+                 full_result = _adjust_result[model_name]
+                 selected_model = full_result["selected_model"]
+
+                 # Store both the selected model (for backward compatibility) and full results
+                 _single_regression = {
+                     model_name: selected_model,
+                     f"{model_name}_full_results": full_result
+                 }
              # Save the regression model
              regs.update(_single_regression)

          # Add scaling correction to the regression models
          for k, v in regs.items():
-             if v["expression"] == "mx + b":
-                 m = v["parameters"]["m"]
-                 b = v["parameters"]["b"]
-                 regs[k]["parameters"]["m"] = m/self.dt_ratio
-                 m = regs[k]["parameters"]["m"]
-                 regs[k]["model"] = lambda x: m*x + b
-             elif v["expression"] == "mx":
-                 m = v["parameters"]["m"]
-                 regs[k]["parameters"]["m"] = m/self.dt_ratio
-                 m = regs[k]["parameters"]["m"]
-                 regs[k]["model"] = lambda x: m*x
-
-             elif v["expression"] == "d*x^alpha":
-                 d = v["parameters"]["d"]
-                 alpha = v["parameters"]["alpha"]
-                 regs[k]["parameters"]["d"] = d/(self.dt_ratio**alpha)
-                 d = regs[k]["parameters"]["d"]
-                 regs[k]["model"] = lambda x: d*(x**alpha)
+             # Skip full results entries - we'll handle them separately
+             if k.endswith("_full_results"):
+                 continue
+
+             # Use the helper method for scaling correction
+             self._apply_scaling_correction_to_model(v)
+
+         # Apply scaling correction to all models in full results
+         for k, v in regs.items():
+             if k.endswith("_full_results"):
+                 # Apply scaling to selected model
+                 self._apply_scaling_correction_to_model(v["selected_model"])
+                 # Apply scaling to linear model
+                 self._apply_scaling_correction_to_model(v["linear_model"])
+                 # Apply scaling to power law model
+                 self._apply_scaling_correction_to_model(v["power_law_model"])

          # Sets of mutation types used in the analysis
          _sets = sorted({
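After this hunk, `regs` carries two entries per variance column: the selected model under the old key (backward compatible) and a `*_full_results` record keeping both candidate fits. A runnable sketch of the resulting shape, built from the keys used here and read by the bundled share/ script; the numeric values and the full contents of each model dict are invented:

    # Illustrative shape of `regs` after this change; values are made up.
    model_name = "scaled var number of substitutions model"
    selected = {"expression": "d*x^alpha", "parameters": {"d": 0.8, "alpha": 1.3}}
    regs = {
        model_name: selected,  # backward-compatible entry
        f"{model_name}_full_results": {
            "selected_model": selected,
            "linear_model": {"expression": "mx", "parameters": {"m": 1.1}},
            "power_law_model": selected,
            "model_selection": {"selected": "power_law"},  # read by analyze_model_selection_accuracy.py
        },
    }
    print(regs[model_name]["expression"])  # -> d*x^alpha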
@@ -561,4 +574,40 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):

          return stats, regs

+     def _apply_scaling_correction_to_model(self, model: dict[str, any]) -> None:
+         """Apply scaling correction to a single model dictionary.
+
+         :param model: The model dictionary to apply scaling correction to
+         :type model: dict[str, any]
+         """
+         if model["expression"] == "mx + b":
+             m = model["parameters"]["m"]
+             b = model["parameters"]["b"]
+             model["parameters"]["m"] = m/self.dt_ratio
+             m = model["parameters"]["m"]
+             model["model"] = lambda x: m*x + b
+             # Update confidence intervals to match scaled parameters
+             if "confidence_intervals" in model:
+                 m_ci_lower, m_ci_upper = model["confidence_intervals"]["m"]
+                 model["confidence_intervals"]["m"] = (m_ci_lower/self.dt_ratio, m_ci_upper/self.dt_ratio)
+         elif model["expression"] == "mx":
+             m = model["parameters"]["m"]
+             model["parameters"]["m"] = m/self.dt_ratio
+             m = model["parameters"]["m"]
+             model["model"] = lambda x: m*x
+             # Update confidence intervals to match scaled parameters
+             if "confidence_intervals" in model:
+                 m_ci_lower, m_ci_upper = model["confidence_intervals"]["m"]
+                 model["confidence_intervals"]["m"] = (m_ci_lower/self.dt_ratio, m_ci_upper/self.dt_ratio)
+         elif model["expression"] == "d*x^alpha":
+             d = model["parameters"]["d"]
+             alpha = model["parameters"]["alpha"]
+             model["parameters"]["d"] = d/(self.dt_ratio**alpha)
+             d = model["parameters"]["d"]
+             model["model"] = lambda x: d*(x**alpha)
+             # Update confidence intervals to match scaled parameters
+             if "confidence_intervals" in model:
+                 d_ci_lower, d_ci_upper = model["confidence_intervals"]["d"]
+                 model["confidence_intervals"]["d"] = (d_ci_lower/(self.dt_ratio**alpha), d_ci_upper/(self.dt_ratio**alpha))
+
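To make the helper's arithmetic concrete, here is a standalone sketch of the `d*x^alpha` branch, with `self.dt_ratio` replaced by a local toy value: dividing `d` by `dt_ratio**alpha` converts the fit from index units to days, and the interval bounds are divided by the same factor so they keep bracketing the rescaled estimate. All numbers are invented:

    # Standalone sketch of the power-law branch; dt_ratio stands in for
    # self.dt_ratio (days per index step in the fitted series).
    dt_ratio = 7.0  # e.g. one index step per week
    model = {
        "expression": "d*x^alpha",
        "parameters": {"d": 2.0, "alpha": 1.5},
        "confidence_intervals": {"d": (1.5, 2.5)},
    }

    alpha = model["parameters"]["alpha"]
    scale = dt_ratio ** alpha
    model["parameters"]["d"] /= scale
    lo, hi = model["confidence_intervals"]["d"]
    model["confidence_intervals"]["d"] = (lo / scale, hi / scale)
    d = model["parameters"]["d"]
    model["model"] = lambda x: d * (x ** alpha)

    print(model["parameters"]["d"])            # 2.0 / 7**1.5 ≈ 0.108
    print(model["confidence_intervals"]["d"])  # ≈ (0.081, 0.135), still brackets d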
pyevomotion-0.1.1.dist-info/METADATA → pyevomotion-0.1.2.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: PyEvoMotion
- Version: 0.1.1
+ Version: 0.1.2
  Summary: Evolutionary motion analysis tool
  Keywords: evolution,anomalous diffusion,bioinformatics
  Author: Lucas Goiriz
pyevomotion-0.1.2.dist-info/RECORD ADDED
@@ -0,0 +1,35 @@
+ PyEvoMotion/__init__.py,sha256=NqFDD-EZBzouzTwXozZqhPC9sLr7GQaElRKtP0tkHoE,568
+ PyEvoMotion/cli.py,sha256=GnTBJDlKjXEtvqhT9bZxEHl-tq4e0QZYqWticCXklo0,18885
+ PyEvoMotion/core/__init__.py,sha256=1I-NkFFh6ljLgB_mqQVFLNvCrVKEHLVxa_5dsv3ihWQ,450
+ PyEvoMotion/core/base.py,sha256=L_uabRqGgAQy3mXs4QfzE05RuCz-my8ZJcTglsMAg7E,27931
+ PyEvoMotion/core/core.py,sha256=RHkIoIYIfteA_zrKrLF9-XemPcenl_BSbHcCwz6Sg-M,22737
+ PyEvoMotion/core/parser.py,sha256=w23KzX0jl3NLS0WYjAY1s_2VFEqfn6EoTrQXmGRRXfg,17323
+ PyEvoMotion/utils.py,sha256=Ye3eL1RXZOZzzs2KZy0R45u06DOtLYo-zqE45tN2t7g,2859
+ share/analyze_model_selection_accuracy.py,sha256=OnGKbmI515bIRdpYMNHGQ9SlZGmVQZi_tFnAX4g2Iyw,12846
+ share/analyze_test_runs.py,sha256=AXdz-TdyK7DO1iT_FWrYsONYMs-2HYst_7fofcZ8wxQ,15534
+ share/anomalous_diffusion.pdf,sha256=fWUvoxB2J9JRCRRjYEXtPNfJpR3ajbGfmCBiA5_-nzs,19384
+ share/confusion_matrix_heatmap.pdf,sha256=GHfupvVgHF4msQjVohc_5KWXmVsPZDueml5zb4sL4Zo,23108
+ share/figUK.tsv,sha256=DnPkVfbMGfsOq9x8IAkbPzIWsmQBB1hOXChSNkNlHGo,6711825
+ share/figUK_plots.pdf,sha256=6PQw3ujMN0bfj8s8DMstadyoIYna3a2Upig-W1yHWOc,22664
+ share/figUK_regression_results.json,sha256=Q-WpaqxNv7O7ZNKtnt9qmqEAifpmmRnarpT0zLCKAR4,1923
+ share/figUK_run_args.json,sha256=k5NFbR0YNFBQ7M6TNpLT6G2vLml27iu8DDwGmwVWKqM,344
+ share/figUK_stats.tsv,sha256=_gRc1-GYlGp4P7iUWtuiTeDy6cvYVbTX6SrXYUaNF-g,2252
+ share/figUSA.tsv,sha256=hSPmKjCFk0a5B0XN75JhQOuhZgfcz9ZCxNrfqMrYcF0,6577552
+ share/figUSA_plots.pdf,sha256=OsAseOotQGmFljxvgyqr4NGa4PXDPWo9UFAT47cjc-k,22066
+ share/figUSA_regression_results.json,sha256=xgXn3paIYmZ2mEAunx-VKqAaMKotIz2CT8zDiHWlJt4,1918
+ share/figUSA_run_args.json,sha256=N5ZBL9W0OcJEyMqcBq2dxSPr9vQUJsnQfflQMdTzmRw,347
+ share/figUSA_stats.tsv,sha256=-lC1Gk_t4nB5vQXTjvhAfFS34ILjtkVEILarcDGWHZA,1913
+ share/figdataUK.tsv,sha256=HMF07FNT7d3Tb2OMHuFYkRzc6vb5EQ6vj2nJBpXlXJ8,939837
+ share/figdataUSA.tsv,sha256=z5yaIwcyfLo7Wr5ioE-x6_qXg9IhT_CmAJxcLTfP4jA,827811
+ share/generate_sequences_from_synthdata.py,sha256=_2IdMgcOB7SxAq30iypA-VypSmZyZmMhA8otKQnkfAw,3443
+ share/generate_sequences_from_test5_data.py,sha256=H1J4FQgndTSrRbXqEzaHMFI2JGX9oWDhwNnU7uwu534,4127
+ share/mafft_install.sh,sha256=pCw70UsKkkNXUsZMwQlQ2b4zSXFrBA7jAj9iOfGLzUw,1007
+ share/manuscript_figure.py,sha256=JVwFc236-KtZnmkve0PrPcmMPIeg94jUOaZGq7Z6_QM,41109
+ share/run_parallel_analysis.py,sha256=D67RG0ze8xikOkOVeWm716ZNXUwaHR3O7flBioubgRg,6750
+ share/synth_figure.pdf,sha256=cqjXjnd0Q14p6NQHLr58vprTot8edKvaWDoq6ZNTkis,26350
+ share/uk_time_windows.pdf,sha256=JioGh4bHv8VROvGnySuJ0r7VAtr9ykWeH2M9eAinpSk,35022
+ share/weekly_size.pdf,sha256=_YKsCKln8wpgNNwuu9_mFRQfkv4r0g3gLVaVK76MIj8,14602
+ pyevomotion-0.1.2.dist-info/METADATA,sha256=Da84PseCS2zeTtQHMJArt584_k_jZLbNh7cCSISh7PI,7833
+ pyevomotion-0.1.2.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+ pyevomotion-0.1.2.dist-info/entry_points.txt,sha256=UMzoojYwQi-713hRggkQXUIfGNygUARhTdGs77Usp7s,53
+ pyevomotion-0.1.2.dist-info/RECORD,,
share/analyze_model_selection_accuracy.py ADDED
@@ -0,0 +1,316 @@
+ #!/usr/bin/env python3
+ """
+ Script to analyze model selection accuracy from test5 regression results.
+
+ This script analyzes the out_regression_results.json files from both linear and powerlaw
+ test datasets to compute accuracy metrics and create visualizations.
+
+ Success criteria:
+ - Linear datasets: success when "selected" field is "linear"
+ - Powerlaw datasets: success when "selected" field is "power_law"
+ """
+
+ import json
+ import os
+ import glob
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from pathlib import Path
+ from typing import Dict, List
+
+
+ def load_regression_results(directory: str) -> List[Dict]:
+     """Load all regression results from a directory."""
+     results = []
+     pattern = os.path.join(directory, "**", "*out_regression_results.json")
+
+     for file_path in glob.glob(pattern, recursive=True):
+         try:
+             with open(file_path, 'r') as f:
+                 data = json.load(f)
+                 # Extract the model selection info
+                 model_selection = data.get("scaled var number of substitutions model", {}).get("model_selection", {})
+                 results.append({
+                     'file': file_path,
+                     'selected_model': model_selection.get("selected", "unknown"),
+                     'linear_AIC': model_selection.get("linear_AIC", None),
+                     'power_law_AIC': model_selection.get("power_law_AIC", None),
+                     'delta_AIC_linear': model_selection.get("delta_AIC_linear", None),
+                     'delta_AIC_power_law': model_selection.get("delta_AIC_power_law", None),
+                     'akaike_weight_linear': model_selection.get("akaike_weight_linear", None),
+                     'akaike_weight_power_law': model_selection.get("akaike_weight_power_law", None)
+                 })
+         except Exception as e:
+             print(f"Error loading {file_path}: {e}")
+
+     return results
+
+
+ def analyze_model_selection_accuracy():
+     """Analyze model selection accuracy and create visualizations."""
+
+     # Define paths
+     base_path = Path(__file__).parent.parent / "tests" / "data" / "test5"
+     linear_dir = base_path / "linear" / "output"
+     powerlaw_dir = base_path / "powerlaw" / "output"
+
+     print("Loading regression results...")
+
+     # Load results from both directories
+     linear_results = load_regression_results(str(linear_dir))
+     powerlaw_results = load_regression_results(str(powerlaw_dir))
+
+     print(f"Loaded {len(linear_results)} linear results")
+     print(f"Loaded {len(powerlaw_results)} powerlaw results")
+
+     # Analyze linear dataset results
+     linear_success = sum(1 for r in linear_results if r['selected_model'] == 'linear')
+     linear_failure = len(linear_results) - linear_success
+
+     # Analyze powerlaw dataset results
+     powerlaw_success = sum(1 for r in powerlaw_results if r['selected_model'] == 'power_law')
+     powerlaw_failure = len(powerlaw_results) - powerlaw_success
+
+     # Create summary table
+     summary_data = {
+         'Dataset Type': ['Linear', 'Powerlaw'],
+         'Total Tests': [len(linear_results), len(powerlaw_results)],
+         'Successes': [linear_success, powerlaw_success],
+         'Failures': [linear_failure, powerlaw_failure],
+         'Success Rate': [linear_success/len(linear_results) if linear_results else 0,
+                          powerlaw_success/len(powerlaw_results) if powerlaw_results else 0]
+     }
+
+     df = pd.DataFrame(summary_data)
+     print("\nModel Selection Accuracy Summary:")
+     print("=" * 50)
+     print(df.to_string(index=False, float_format='%.3f'))
+
+     # Calculate overall accuracy metrics
+     total_tests = len(linear_results) + len(powerlaw_results)
+     total_successes = linear_success + powerlaw_success
+     overall_accuracy = total_successes / total_tests if total_tests > 0 else 0
+
+     # Calculate precision and recall for each model type
+     # For linear: TP = linear_success, FP = powerlaw_failure, FN = linear_failure, TN = powerlaw_success
+     linear_tp = linear_success
+     linear_fp = powerlaw_failure  # Powerlaw datasets incorrectly classified as linear
+     linear_fn = linear_failure  # Linear datasets incorrectly classified as powerlaw
+     linear_tn = powerlaw_success  # Powerlaw datasets correctly classified as powerlaw
+
+     # For powerlaw: TP = powerlaw_success, FP = linear_failure, FN = powerlaw_failure, TN = linear_success
+     powerlaw_tp = powerlaw_success
+     powerlaw_fp = linear_failure  # Linear datasets incorrectly classified as powerlaw
+     powerlaw_fn = powerlaw_failure  # Powerlaw datasets incorrectly classified as linear
+     powerlaw_tn = linear_success  # Linear datasets correctly classified as linear
+
+     # Calculate metrics
+     linear_precision = linear_tp / (linear_tp + linear_fp) if (linear_tp + linear_fp) > 0 else 0
+     linear_recall = linear_tp / (linear_tp + linear_fn) if (linear_tp + linear_fn) > 0 else 0
+     linear_specificity = linear_tn / (linear_tn + linear_fp) if (linear_tn + linear_fp) > 0 else 0
+
+     powerlaw_precision = powerlaw_tp / (powerlaw_tp + powerlaw_fp) if (powerlaw_tp + powerlaw_fp) > 0 else 0
+     powerlaw_recall = powerlaw_tp / (powerlaw_tp + powerlaw_fn) if (powerlaw_tp + powerlaw_fn) > 0 else 0
+     powerlaw_specificity = powerlaw_tn / (powerlaw_tn + powerlaw_fp) if (powerlaw_tn + powerlaw_fp) > 0 else 0
+
+     # F1 scores
+     linear_f1 = 2 * (linear_precision * linear_recall) / (linear_precision + linear_recall) if (linear_precision + linear_recall) > 0 else 0
+     powerlaw_f1 = 2 * (powerlaw_precision * powerlaw_recall) / (powerlaw_precision + powerlaw_recall) if (powerlaw_precision + powerlaw_recall) > 0 else 0
+
+     print(f"\nOverall Accuracy: {overall_accuracy:.3f} ({total_successes}/{total_tests})")
+     print("\nDetailed Metrics:")
+     print("=" * 50)
+
+     metrics_data = {
+         'Model Type': ['Linear', 'Powerlaw'],
+         'Precision': [linear_precision, powerlaw_precision],
+         'Recall (Sensitivity)': [linear_recall, powerlaw_recall],
+         'Specificity': [linear_specificity, powerlaw_specificity],
+         'F1-Score': [linear_f1, powerlaw_f1]
+     }
+
+     metrics_df = pd.DataFrame(metrics_data)
+     print(metrics_df.to_string(index=False, float_format='%.3f'))
+
+     # Create confusion matrix data
+     confusion_matrix = np.array([
+         [linear_tp, linear_fp],  # True Linear, False Linear
+         [linear_fn, linear_tn]   # False Powerlaw, True Powerlaw
+     ])
+
+     print(f"\nConfusion Matrix:")
+     print("=" * 30)
+     print("                 Predicted")
+     print("               Linear  Powerlaw")
+     print(f"Actual Linear    {linear_tp:3d}      {linear_fp:3d}")
+     print(f"      Powerlaw   {linear_fn:3d}      {linear_tn:3d}")
+
+     # Create visualizations
+     create_bar_chart(summary_data, overall_accuracy)
+     create_confusion_matrix_heatmap(confusion_matrix)
+     create_metrics_comparison(metrics_data)
+
+     # Save detailed results
+     save_detailed_results(linear_results, powerlaw_results, summary_data, metrics_data, overall_accuracy)
+
+     return df, metrics_df, overall_accuracy
+
+
+ def create_bar_chart(summary_data: Dict, overall_accuracy: float):
+     """Create a bar chart showing success rates."""
+
+     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
+
+     # Bar chart for success/failure counts
+     x = np.arange(len(summary_data['Dataset Type']))
+     width = 0.35
+
+     bars1 = ax1.bar(x - width/2, summary_data['Successes'], width, label='Successes', color='green', alpha=0.7)
+     bars2 = ax1.bar(x + width/2, summary_data['Failures'], width, label='Failures', color='red', alpha=0.7)
+
+     ax1.set_xlabel('Dataset Type')
+     ax1.set_ylabel('Number of Tests')
+     ax1.set_title('Model Selection Results by Dataset Type')
+     ax1.set_xticks(x)
+     ax1.set_xticklabels(summary_data['Dataset Type'])
+     ax1.legend()
+     ax1.grid(True, alpha=0.3)
+
+     # Add value labels on bars
+     for bar in bars1:
+         height = bar.get_height()
+         ax1.text(bar.get_x() + bar.get_width()/2., height + 0.1,
+                  f'{int(height)}', ha='center', va='bottom')
+
+     for bar in bars2:
+         height = bar.get_height()
+         ax1.text(bar.get_x() + bar.get_width()/2., height + 0.1,
+                  f'{int(height)}', ha='center', va='bottom')
+
+     # Success rate bar chart
+     bars3 = ax2.bar(summary_data['Dataset Type'], summary_data['Success Rate'],
+                     color=['blue', 'orange'], alpha=0.7)
+
+     # Add overall accuracy line
+     ax2.axhline(y=overall_accuracy, color='red', linestyle='--', linewidth=2,
+                 label=f'Overall Accuracy: {overall_accuracy:.3f}')
+
+     ax2.set_xlabel('Dataset Type')
+     ax2.set_ylabel('Success Rate')
+     ax2.set_title('Model Selection Success Rates')
+     ax2.set_ylim(0, 1)
+     ax2.legend()
+     ax2.grid(True, alpha=0.3)
+
+     # Add value labels on bars
+     for bar in bars3:
+         height = bar.get_height()
+         ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
+                  f'{height:.3f}', ha='center', va='bottom')
+
+     plt.tight_layout()
+     plt.savefig('model_selection_accuracy_chart.pdf', dpi=300, bbox_inches='tight')
+
+
+ def create_confusion_matrix_heatmap(confusion_matrix: np.ndarray):
+     """Create a heatmap of the confusion matrix."""
+
+     fig, ax = plt.subplots(figsize=(8, 6))
+
+     im = ax.imshow(confusion_matrix, interpolation='nearest', cmap='Blues')
+     ax.figure.colorbar(im, ax=ax)
+
+     # Set ticks and labels
+     ax.set_xticks([0, 1])
+     ax.set_yticks([0, 1])
+     ax.set_xticklabels(['Linear', 'Powerlaw'])
+     ax.set_yticklabels(['Linear', 'Powerlaw'])
+
+     # Add text annotations
+     thresh = confusion_matrix.max() / 2.
+     for i in range(confusion_matrix.shape[0]):
+         for j in range(confusion_matrix.shape[1]):
+             ax.text(j, i, format(confusion_matrix[i, j], 'd'),
+                     ha="center", va="center",
+                     color="white" if confusion_matrix[i, j] > thresh else "black")
+
+     ax.set_xlabel('Predicted Label')
+     ax.set_ylabel('True Label')
+     ax.set_title('Confusion Matrix: Model Selection Results')
+
+     plt.tight_layout()
+     plt.savefig('share/confusion_matrix_heatmap.pdf', dpi=300, bbox_inches='tight')
+
+
+ def create_metrics_comparison(metrics_data: Dict):
+     """Create a comparison chart of different metrics."""
+
+     fig, ax = plt.subplots(figsize=(12, 8))
+
+     x = np.arange(len(metrics_data['Model Type']))
+     width = 0.2
+
+     metrics = ['Precision', 'Recall (Sensitivity)', 'Specificity', 'F1-Score']
+     colors = ['blue', 'green', 'orange', 'red']
+
+     for i, (metric, color) in enumerate(zip(metrics, colors)):
+         values = metrics_data[metric]
+         ax.bar(x + i * width, values, width, label=metric, color=color, alpha=0.7)
+
+     ax.set_xlabel('Model Type')
+     ax.set_ylabel('Score')
+     ax.set_title('Model Selection Performance Metrics Comparison')
+     ax.set_xticks(x + width * 1.5)
+     ax.set_xticklabels(metrics_data['Model Type'])
+     ax.legend()
+     ax.set_ylim(0, 1)
+     ax.grid(True, alpha=0.3)
+
+     # Add value labels on bars
+     for i, metric in enumerate(metrics):
+         values = metrics_data[metric]
+         for j, value in enumerate(values):
+             ax.text(j + i * width, value + 0.01, f'{value:.3f}',
+                     ha='center', va='bottom', fontsize=9)
+
+     plt.tight_layout()
+     plt.savefig('share/metrics_comparison_chart.pdf', dpi=300, bbox_inches='tight')
+
+
+ def save_detailed_results(linear_results: List[Dict], powerlaw_results: List[Dict],
+                           summary_data: Dict, metrics_data: Dict, overall_accuracy: float):
+     """Save detailed results to JSON file."""
+
+     results = {
+         'overall_accuracy': overall_accuracy,
+         'summary': summary_data,
+         'metrics': metrics_data,
+         'linear_results': linear_results,
+         'powerlaw_results': powerlaw_results,
+         'analysis_timestamp': pd.Timestamp.now().isoformat()
+     }
+
+     with open('model_selection_analysis_results.json', 'w') as f:
+         json.dump(results, f, indent=2, default=str)
+
+     print(f"Detailed results saved as 'model_selection_analysis_results.json'")
+
+
+ if __name__ == "__main__":
+     print("Model Selection Accuracy Analysis")
+     print("=" * 40)
+     print("Analyzing regression results from test5 datasets...")
+     print("Success criteria:")
+     print("- Linear datasets: success when 'selected' = 'linear'")
+     print("- Powerlaw datasets: success when 'selected' = 'power_law'")
+     print()
+
+     try:
+         summary_df, metrics_df, accuracy = analyze_model_selection_accuracy()
+         print(f"\nAnalysis complete! Overall accuracy: {accuracy:.3f}")
+
+     except Exception as e:
+         print(f"Error during analysis: {e}")
+         import traceback
+         traceback.print_exc()
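For reference, `load_regression_results` only inspects the nested `model_selection` block of each `out_regression_results.json`. A minimal fixture it can parse, reconstructed from the keys the loader reads; every numeric value here is invented:

    import json

    # Minimal *out_regression_results.json fixture for load_regression_results;
    # only the keys below are read by the script, and the numbers are made up.
    fixture = {
        "scaled var number of substitutions model": {
            "model_selection": {
                "selected": "power_law",
                "linear_AIC": 120.4,
                "power_law_AIC": 112.9,
                "delta_AIC_linear": 7.5,
                "delta_AIC_power_law": 0.0,
                "akaike_weight_linear": 0.023,
                "akaike_weight_power_law": 0.977,
            }
        }
    }

    with open("demo_out_regression_results.json", "w") as f:
        json.dump(fixture, f, indent=2)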