cccpm 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,463 @@
1
+ import os
2
+ import pandas as pd
3
+
4
+ from typing import Union
5
+
6
+ import numpy as np
7
+
8
+ from glob import glob
9
+
10
+ from cccpm.models import NetworkDict, ModelDict
11
+ from cccpm.utils import vector_to_upper_triangular_matrix
12
+ from cccpm.scoring import regression_metrics
13
+
14
+
15
class ResultsManager:
    """
    Aggregate, format and save cross-validation results.

    Parameters
    ----------
    output_dir : str or None
        Base directory where results will be saved. With ``None`` no directory
        is created and ``results_directory`` is ``None``.
    perm_run : int
        Index of the permutation run. Values greater than 0 (outside of inner
        cross-validation) are stored in ``<output_dir>/permutation/<perm_run>``.
    n_folds : int
        Number of cross-validation folds (first axis of the edge arrays).
    n_features : int
        Number of features (second axis of the edge arrays).
    n_params : int, optional
        Number of hyperparameter configurations. When given, the edge arrays
        get a third axis of this size.
    is_inner_cv : bool
        Whether this manager collects results of an inner cross-validation.
    """
    def __init__(self, output_dir: Union[str, None], perm_run: int, n_folds: int, n_features: int, n_params: int = None,
                 is_inner_cv: bool = False):
        # perm_run and is_inner_cv must be set first: update_results_directory reads both.
        self.perm_run = perm_run
        self.is_inner_cv = is_inner_cv
        self.results_directory = self.update_results_directory(output_dir=output_dir)
        self.n_folds = n_folds
        self.n_features = n_features
        self.n_params = n_params

        self.cv_results = pd.DataFrame()
        self.cv_predictions = pd.DataFrame()
        self.cv_edges = self.initialize_edges(n_folds=self.n_folds, n_features=self.n_features,
                                              n_params=self.n_params)
        self.cv_network_strengths = pd.DataFrame()
        self.agg_results = None

    def update_results_directory(self, output_dir: Union[str, None]):
        """
        Determine (and create) the directory to save results.

        :param output_dir: Base output directory, or None to skip directory handling.
        :return: Results directory path, or None if ``output_dir`` is None.
        """
        if output_dir is None:
            # None is an allowed value of the signature; os.path functions would raise on it.
            return None

        if not self.is_inner_cv and self.perm_run > 0:
            output_dir = os.path.join(output_dir, 'permutation', f'{self.perm_run}')

        # exist_ok avoids the check-then-create race when several runs start at once.
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    @staticmethod
    def initialize_edges(n_folds, n_features, n_params=None):
        """
        Initialize a dictionary to store edges for cross-validation.

        :param n_folds: Number of folds.
        :param n_features: Number of features in the data.
        :param n_params: Optional number of hyperparameter configurations (adds a third axis).
        :return: Dictionary with zero-filled 'positive' and 'negative' arrays.
        """
        shape = (n_folds, n_features) if n_params is None else (n_folds, n_features, n_params)
        return {'positive': np.zeros(shape), 'negative': np.zeros(shape)}

    def store_edges(self, edges: dict, fold: int, param_id: int = None):
        """
        Mark the selected edges of one fold (and optionally one parameter set) with 1.

        :param edges: Dictionary with 'positive' and 'negative' entries used to index the feature axis.
        :param fold: Fold index.
        :param param_id: Index along the parameter axis; None for 2D edge arrays.
        """
        for sign in ('positive', 'negative'):
            if param_id is None:
                self.cv_edges[sign][fold, edges[sign]] = 1
            else:
                self.cv_edges[sign][fold, edges[sign], param_id] = 1

    def calculate_edge_stability(self, write: bool = True, best_param_id: int = None):
        """
        Calculate edge stability (fraction of folds in which an edge was stored) and optionally save it.

        :param write: If True, save the edges of fold 0 and the stability of each sign as .npy files,
                      both passed through ``vector_to_upper_triangular_matrix``.
        :param best_param_id: If given, only this slice of the parameter axis is used.
        :return: Dictionary mapping 'positive'/'negative' to the stability array.
        """
        edge_stability = {}
        for sign, edges in self.cv_edges.items():
            if best_param_id is not None:
                # Select the parameter slice once so that the written fold-0 edges are a
                # feature vector as well, not a (features x params) matrix.
                edges = edges[:, :, best_param_id]
            edge_stability[sign] = np.sum(edges, axis=0) / edges.shape[0]

            if write:
                np.save(os.path.join(self.results_directory, f'{sign}_edges.npy'),
                        vector_to_upper_triangular_matrix(edges[0]))
                np.save(os.path.join(self.results_directory, f'stability_{sign}_edges.npy'),
                        vector_to_upper_triangular_matrix(edge_stability[sign]))
        return edge_stability

    def calculate_model_increments(self):
        """
        Append 'increment' rows (metrics of the 'full' model minus the 'covariates' model) to ``cv_results``.

        Expects ``cv_results`` to carry a 'model' index level; the result is sorted by index.
        """
        metric_values = self.cv_results[regression_metrics]
        increments = (metric_values.xs(key='full', level='model')
                      - metric_values.xs(key='covariates', level='model'))
        increments['params'] = self.cv_results.xs(key='full', level='model')['params']
        increments['model'] = 'increment'
        increments = increments.set_index('model', append=True)
        self.cv_results = pd.concat([self.cv_results, increments])
        self.cv_results.sort_index(inplace=True)
        return

    def store_metrics(self, metrics, params, fold, param_id):
        """
        Append the metrics of one fold to ``cv_results``.

        :param metrics: Nested dictionary ``metrics[model][network]`` with computed metrics.
        :param params: Hyperparameters stored alongside the metrics.
        :param fold: Current fold number.
        :param param_id: Identifier of the hyperparameter configuration.
        """
        n_networks = NetworkDict.n_networks()
        frames = []
        for model in ModelDict().keys():
            d = pd.DataFrame.from_dict(metrics[model], orient='index')
            d['model'] = [model] * n_networks
            # Lists keep `params` as one object per row instead of letting pandas align it.
            d['params'] = [params] * n_networks
            d['param_id'] = [param_id] * n_networks
            d['fold'] = [fold] * n_networks
            frames.append(d)
        df = pd.concat(frames, axis=0) if frames else pd.DataFrame()
        df.reset_index(inplace=True)
        df.rename(columns={'index': 'network'}, inplace=True)

        self.cv_results = pd.concat([self.cv_results, df], axis=0)
        return

    def store_predictions(self, y_pred, y_true, params, fold, param_id, test_indices):
        """
        Append the predictions of one fold to ``cv_predictions``.

        :param y_pred: Nested dictionary ``y_pred[model][network]`` of predicted values.
        :param y_true: True values of the test samples.
        :param params: Hyperparameters stored alongside the predictions.
        :param fold: Current fold number.
        :param param_id: Identifier of the hyperparameter configuration.
        :param test_indices: Indices of the test samples, stored as 'sample_index'.
        """
        preds = (
            pd.DataFrame.from_dict(y_pred)
            .stack()
            .explode()
            .reset_index()
            .rename({'level_0': 'network', 'level_1': 'model', 0: 'y_pred'}, axis=1)
            .set_index(['network', 'model'])
        )
        n_network_model = ModelDict.n_models() * NetworkDict.n_networks()
        n_rows = y_true.shape[0] * n_network_model
        preds['y_true'] = np.tile(y_true, n_network_model)
        preds['params'] = [params] * n_rows
        preds['fold'] = [fold] * n_rows
        preds['param_id'] = [param_id] * n_rows
        preds['sample_index'] = np.tile(test_indices, n_network_model)
        self.cv_predictions = pd.concat([self.cv_predictions, preds], axis=0)
        return

    def store_network_strengths(self, network_strengths, y_true, fold):
        """
        Append the network strengths of one fold to ``cv_network_strengths``.

        :param network_strengths: Nested dictionary ``network_strengths[model][network]`` of arrays
                                  for the models 'connectome'/'residuals' and networks 'positive'/'negative'.
        :param y_true: True values of the samples.
        :param fold: Current fold number.
        """
        dfs = []
        for model in ['connectome', 'residuals']:
            for network in ['positive', 'negative']:
                strengths = network_strengths[model][network]
                n_samples = strengths.shape[0]
                df = pd.DataFrame()
                df['y_true'] = y_true
                df['network_strength'] = np.squeeze(strengths)
                df['model'] = [model] * n_samples
                df['fold'] = [fold] * n_samples
                df['network'] = [network] * n_samples
                dfs.append(df)

        self.cv_network_strengths = pd.concat([self.cv_network_strengths, *dfs], axis=0)
        return

    @staticmethod
    def load_cv_results(folder):
        """
        Load the mean cross-validation results from 'cv_results_mean_std.csv'.

        :param folder: Directory containing the results file.
        :return: DataFrame with the 'mean' columns only (second column level dropped).
        """
        results = pd.read_csv(os.path.join(folder, 'cv_results_mean_std.csv'), header=[0, 1], index_col=[0, 1])
        results = results.loc[:, results.columns.get_level_values(1) == 'mean']
        results.columns = results.columns.droplevel(1)
        return results

    def save_predictions(self):
        """
        Save predictions, sorted by 'sample_index', to 'cv_predictions.csv'.
        """
        sorted_predictions = self.cv_predictions.sort_values(by='sample_index')
        sorted_predictions.to_csv(os.path.join(self.results_directory, 'cv_predictions.csv'))

    def save_network_strengths(self):
        """
        Save network strengths to 'cv_network_strengths.csv'.
        """
        self.cv_network_strengths.to_csv(os.path.join(self.results_directory, 'cv_network_strengths.csv'))

    def calculate_final_cv_results(self):
        """
        Calculate mean and standard deviation of cross-validation results and save both tables to CSV.

        Indexes ``cv_results`` by fold/network/model in place, adds the model increments and stores
        the aggregated table in ``agg_results``.
        """
        self.cv_results.set_index(['fold', 'network', 'model'], inplace=True)
        self.calculate_model_increments()
        self.agg_results = self.cv_results.groupby(['network', 'model'])[regression_metrics].agg(['mean', 'std'])

        # Save results to CSV
        self.cv_results.to_csv(os.path.join(self.results_directory, 'cv_results.csv'))
        self.agg_results.to_csv(os.path.join(self.results_directory, 'cv_results_mean_std.csv'), float_format='%.4f')
        return

    def aggregate_inner_folds(self):
        """
        Aggregate inner cross-validation results per network, param_id and model.

        The tables are written to CSV only when this is not a permutation run (``perm_run == 0``)
        and a results directory is available.
        """
        self.cv_results.set_index(['fold', 'param_id', 'network', 'model'], inplace=True)
        self.cv_results.sort_index(inplace=True)
        self.calculate_model_increments()
        self.agg_results = self.cv_results.groupby(['network', 'param_id', 'model'])[regression_metrics].agg(['mean', 'std'])

        # save inner cv results to csv in case this is not a permutation run
        if self.perm_run == 0 and self.results_directory is not None:
            self.cv_results.to_csv(os.path.join(self.results_directory, 'inner_cv_results.csv'))
            self.agg_results.to_csv(os.path.join(self.results_directory, 'inner_cv_results_mean_std.csv'))
        return

    def find_best_params(self):
        """
        Find the best hyperparameter configuration.

        The configuration with the highest mean 'spearman_score' of the 'full' model on the
        'both' network is selected; its parameters are read from fold 0 of ``cv_results``.

        :return: Tuple ``(best_params, best_param_id)``.
        """
        best_params_ids = self.agg_results['spearman_score'].groupby(['network', 'model'])['mean'].idxmax()
        # idxmax yields the full index tuple (network, param_id, model); position 1 is the param_id.
        best_param_id = best_params_ids.loc[('both', 'full')][1]
        best_params = self.cv_results.loc[(0, best_param_id, 'both', 'full'), 'params']
        return best_params, best_param_id

    @staticmethod
    def collect_results(fold_id, param_id, param, metrics):
        """
        Flatten nested metrics into a DataFrame with one row per model/network combination.

        :param fold_id: Fold identifier stored in the 'fold' column.
        :param param_id: Identifier stored in the 'param_id' column.
        :param param: Hyperparameters stored in the 'params' column.
        :param metrics: Nested dictionary ``metrics[model][network]``; it is not modified.
        :return: DataFrame with the collected rows.
        """
        rows = []
        for model_type in ModelDict().keys():
            for network in NetworkDict().keys():
                # Copy so that the bookkeeping keys do not leak into the caller's dictionary.
                results_dict = dict(metrics[model_type][network])
                results_dict['model'] = model_type
                results_dict['network'] = network
                results_dict['fold'] = fold_id
                results_dict['param_id'] = param_id
                # Wrapped in a list so that a dict-valued `param` ends up in a single cell.
                results_dict['params'] = [param]
                rows.append(pd.DataFrame(results_dict, index=[0]))
        if not rows:
            return pd.DataFrame()
        return pd.concat(rows, ignore_index=True)
266
+
267
+
268
class PermutationManager:
    """Static helpers that turn permutation runs into p-values for metrics and edge stability."""

    @staticmethod
    def calculate_p_values(true_results, perms):
        """
        Calculate p-values based on true results and permutation results.

        Groups are matched by their (network, model) key rather than by position, so differing
        group sets cannot silently pair the wrong distributions.

        :param true_results: DataFrame with the true results, grouped by 'network' and 'model'.
        :param perms: DataFrame with the permutation results, grouped by 'network' and 'model'.
        :return: DataFrame with the calculated p-values, indexed by network and model.
        :raises ValueError: If a (network, model) group of the true results has no permutation results.
        """
        perm_groups = dict(iter(perms.groupby(['network', 'model'])))

        p_values = []
        for name, true_group in true_results.groupby(['network', 'model']):
            if name not in perm_groups:
                raise ValueError(f'No permutation results found for network/model group {name}')
            p_value_series = PermutationManager._calculate_group_p_value(true_group, perm_groups[name])
            p_values.append(pd.DataFrame(p_value_series).T.assign(network=name[0], model=name[1]))

        p_values_df = pd.concat(p_values).reset_index(drop=True)
        p_values_df = p_values_df.set_index(['network', 'model'])
        return p_values_df

    @staticmethod
    def _calculate_group_p_value(true_group, perms_group):
        """
        Calculate empirical p-values for every metric column of one group.

        The p-value is ``(b + 1) / (n + 1)`` where ``b`` counts the permutations that are at
        least as good as the true result and ``n`` is the number of permutations. The ``+ 1``
        in the numerator keeps the p-value strictly positive and matches the formula used for
        the edge-wise p-values of this class. Columns ending in 'error' count lower-or-equal
        values as "at least as good", columns ending in 'score' higher-or-equal values.
        Columns with any other name cannot be tested and yield NaN.

        :param true_group: DataFrame with the true results; the first row is used.
        :param perms_group: DataFrame with the permutation results.
        :return: Series with calculated p-values.
        """
        result_dict = {}
        for column in true_group.columns:
            true_value = true_group[column].values[0]
            perm_values = perms_group[column].astype(float)
            if column.endswith('error'):
                condition_count = (perm_values <= true_value).sum()
            elif column.endswith('score'):
                condition_count = (perm_values >= true_value).sum()
            else:
                # No direction of "better" is known for this column.
                result_dict[column] = np.nan
                continue

            result_dict[column] = (condition_count + 1) / (len(perm_values) + 1)

        return pd.Series(result_dict)

    @staticmethod
    def calculate_permutation_results(results_directory, logger, use_fdr: bool = True):
        """
        Calculate and save the permutation test results.

        Writes 'permutation_results.csv', 'p_values.csv' and the two
        'sig_stability_<sign>_edges.npy' files into ``results_directory``.

        :param results_directory: Directory where the results are saved; permutation runs are
                                  expected in its 'permutation' sub-directory.
        :param logger: Logger used for progress and result messages.
        :param use_fdr: If True (default), use ``calculate_p_values_edges_fdr`` for the edge
                        p-values, otherwise ``calculate_p_values_edges_max_value``.
        :raises ValueError: If no permutation run could be loaded.
        """
        true_results = ResultsManager.load_cv_results(results_directory)

        perm_dir = os.path.join(results_directory, 'permutation')
        perm_results = []
        stability_positive = []
        stability_negative = []
        for perm_run_folder in sorted(glob(os.path.join(perm_dir, '*'))):
            try:
                # Load everything first: a run is only used when all of its files exist, which
                # keeps the metric and stability null distributions based on the same runs.
                perm_res = ResultsManager.load_cv_results(perm_run_folder)
                run_positive = np.load(os.path.join(perm_run_folder, 'stability_positive_edges.npy'))
                run_negative = np.load(os.path.join(perm_run_folder, 'stability_negative_edges.npy'))
            except (FileNotFoundError, NotADirectoryError):
                logger.warning('No permutation results found for %s', perm_run_folder)
                continue

            perm_res['permutation'] = os.path.basename(perm_run_folder)
            perm_results.append(perm_res.set_index('permutation', append=True))
            stability_positive.append(run_positive)
            stability_negative.append(run_negative)

        if not perm_results:
            raise ValueError(f'No valid permutation results found in {perm_dir}')

        concatenated_df = pd.concat(perm_results)
        concatenated_df.to_csv(os.path.join(results_directory, 'permutation_results.csv'))
        p_values = PermutationManager.calculate_p_values(true_results, concatenated_df)
        p_values.to_csv(os.path.join(results_directory, 'p_values.csv'))

        # permutation stability
        stability_positive = np.stack(stability_positive)
        stability_negative = np.stack(stability_negative)

        # actual stability
        true_stability_positive = np.load(os.path.join(results_directory, 'stability_positive_edges.npy'))
        true_stability_negative = np.load(os.path.join(results_directory, 'stability_negative_edges.npy'))

        if use_fdr:
            calculate_p_values_edges = PermutationManager.calculate_p_values_edges_fdr
        else:
            calculate_p_values_edges = PermutationManager.calculate_p_values_edges_max_value

        sig_stability_positive = calculate_p_values_edges(true_stability_positive, stability_positive)
        sig_stability_negative = calculate_p_values_edges(true_stability_negative, stability_negative)

        np.save(os.path.join(results_directory, 'sig_stability_positive_edges.npy'), sig_stability_positive)
        np.save(os.path.join(results_directory, 'sig_stability_negative_edges.npy'), sig_stability_negative)

        logger.debug("Saving significance of edge stability.")
        logger.info("Permutation test results")
        logger.info(p_values.round(4).to_string())
        return

    @staticmethod
    def calculate_p_values_edges_max_value(true_stability, permutation_stability):
        """
        Calculate empirical p-values for each edge in a connectivity matrix using the
        max-value method from permutation testing.

        For each permutation, the maximum value across all upper-triangle edges is taken to
        construct a max-null distribution. Each true edge value is then compared to this
        distribution to compute a p-value, which controls the family-wise error rate (FWER).

        Parameters
        ----------
        true_stability : ndarray of shape (n_regions, n_regions)
            Matrix containing the observed stability scores for each edge. Only the upper
            triangle (without diagonal) is read.

        permutation_stability : ndarray of shape (n_permutations, n_regions, n_regions)
            Array containing stability scores from each permutation run.

        Returns
        -------
        sig_stability : ndarray of shape (n_regions, n_regions)
            Symmetric float matrix of empirical p-values ``(b + 1) / (n_permutations + 1)``,
            where ``b`` counts the max-null values greater than or equal to the true value.
            The diagonal is 1.
        """
        n_permutations = permutation_stability.shape[0]
        rows, cols = np.triu_indices_from(true_stability, k=1)

        # Upper triangle only: ignores symmetric redundancy and the diagonal.
        max_null = np.max(permutation_stability[:, rows, cols], axis=1)
        true_values = true_stability[rows, cols]

        exceed_counts = np.sum(max_null[:, None] >= true_values[None, :], axis=0)
        p_vals = (exceed_counts + 1) / (n_permutations + 1)

        # Explicit float dtype: with an integer input, ones_like would truncate p-values to 0.
        sig_stability = np.ones(true_stability.shape, dtype=float)
        sig_stability[rows, cols] = p_vals
        sig_stability[cols, rows] = p_vals  # symmetric
        return sig_stability

    @staticmethod
    def calculate_p_values_edges_fdr(true_stability, permutation_stability, apply_correction: bool = False):
        """
        Calculate edge-wise empirical p-values, optionally Benjamini–Yekutieli corrected.

        For each edge, an empirical p-value is calculated by comparing the true stability
        score to the distribution of permuted scores at the same edge. By default the
        p-values are returned UNCORRECTED (no multiple-comparison correction is applied);
        pass ``apply_correction=True`` to apply the Benjamini–Yekutieli (BY) procedure,
        which controls the false discovery rate (FDR) under arbitrary dependence.

        Parameters
        ----------
        true_stability : ndarray of shape (n_regions, n_regions)
            Matrix containing the observed stability scores for each edge. Only the upper
            triangle (without diagonal) is read.

        permutation_stability : ndarray of shape (n_permutations, n_regions, n_regions)
            Array containing stability scores from each permutation run.

        apply_correction : bool, default False
            Whether to apply the Benjamini–Yekutieli correction across all edges.

        Returns
        -------
        sig_stability : ndarray of shape (n_regions, n_regions)
            Symmetric float matrix of (optionally corrected) p-values. The diagonal is 1.
        """
        n_permutations = permutation_stability.shape[0]
        rows, cols = np.triu_indices_from(true_stability, k=1)

        # Permutation values at upper triangle positions, shape (n_permutations, n_edges)
        perm_values = permutation_stability[:, rows, cols]
        true_values = true_stability[rows, cols]

        p_vals = (np.sum(perm_values >= true_values[None, :], axis=0) + 1) / (n_permutations + 1)
        if apply_correction:
            p_vals = PermutationManager._benjamini_yekutieli(p_vals)

        # Explicit float dtype: with an integer input, ones_like would truncate p-values to 0.
        sig_stability = np.ones(true_stability.shape, dtype=float)
        sig_stability[rows, cols] = p_vals
        sig_stability[cols, rows] = p_vals
        return sig_stability

    @staticmethod
    def _benjamini_yekutieli(p_vals):
        """
        Return Benjamini–Yekutieli adjusted p-values for a 1D array of p-values.

        The i-th smallest p-value is scaled by ``m * c(m) / i`` with ``c(m) = sum(1 / k)`` for
        ``k = 1..m``, made monotone from the largest rank downwards and capped at 1.
        """
        p_vals = np.asarray(p_vals, dtype=float)
        n_tests = p_vals.size
        if n_tests == 0:
            return p_vals.copy()

        order = np.argsort(p_vals)
        ranks = np.arange(1, n_tests + 1)
        harmonic_sum = np.sum(1.0 / ranks)
        scaled = p_vals[order] * n_tests * harmonic_sum / ranks
        monotone = np.minimum.accumulate(scaled[::-1])[::-1]

        adjusted = np.empty(n_tests, dtype=float)
        adjusted[order] = np.minimum(monotone, 1.0)
        return adjusted
cccpm/scoring.py ADDED
@@ -0,0 +1,40 @@
1
+ from sklearn.metrics import (mean_squared_error, mean_absolute_error, explained_variance_score)
2
+ from scipy.stats import pearsonr, spearmanr
3
+
4
+
5
def _pearson_score(y_true, y_pred):
    """Return the first element of ``pearsonr(y_true, y_pred)`` (the correlation statistic)."""
    return pearsonr(y_true, y_pred)[0]


def _spearman_score(y_true, y_pred):
    """Return the first element of ``spearmanr(y_true, y_pred)`` (the correlation statistic)."""
    return spearmanr(y_true, y_pred)[0]


# Metric name -> callable taking (y_true, y_pred). The insertion order defines the
# order of `regression_metrics` below.
regression_metrics_functions = {
    'mean_squared_error': mean_squared_error,
    'mean_absolute_error': mean_absolute_error,
    'explained_variance_score': explained_variance_score,
    'pearson_score': _pearson_score,
    'spearman_score': _spearman_score,
}

# Names of all available regression metrics.
regression_metrics = list(regression_metrics_functions)
13
+
14
+
15
def score_regression(y_true, y_pred):
    """
    Evaluate every metric of ``regression_metrics_functions`` on one set of predictions.

    :param y_true: True target values.
    :param y_pred: Predicted values.
    :return: Dictionary mapping each metric name to its value.
    """
    return {
        name: compute(y_true, y_pred)
        for name, compute in regression_metrics_functions.items()
    }
20
+
21
+
22
def apply_metrics(y_true, y_pred, primary_metric_only: bool = False):
    """
    Compute the primary metric ('spearman_score') and, optionally, all other regression metrics.

    :param y_true: True target values.
    :param y_pred: Predicted values.
    :param primary_metric_only: If True, only 'spearman_score' is computed.
    :return: Dictionary mapping metric names to values; 'spearman_score' is always the first key.
    """
    result = {'spearman_score': regression_metrics_functions['spearman_score'](y_true, y_pred)}
    if primary_metric_only:
        return result

    for metric_name, metric_func in regression_metrics_functions.items():
        if metric_name == 'spearman_score':
            # Already computed above; skip it instead of evaluating it a second time.
            continue
        result[metric_name] = metric_func(y_true, y_pred)
    return result
31
+
32
+
33
def score_regression_models(y_true, y_pred, primary_metric_only: bool = False):
    """
    Score the predictions of every model/network combination.

    :param y_true: True target values.
    :param y_pred: Nested dictionary ``y_pred[model][network]`` of predicted values for the models
                   'full', 'covariates', 'connectome', 'residuals' and the networks
                   'positive', 'negative', 'both'.
    :param primary_metric_only: Forwarded to ``apply_metrics``.
    :return: Nested dictionary ``scores[model][network]`` with the metric dictionaries.
    """
    model_names = ['full', 'covariates', 'connectome', 'residuals']
    network_names = ['positive', 'negative', 'both']
    return {
        model_name: {
            network_name: apply_metrics(y_true, y_pred[model_name][network_name],
                                        primary_metric_only=primary_metric_only)
            for network_name in network_names
        }
        for model_name in model_names
    }
40
+
File without changes