phenosign 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
phenosign/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ from .analysis import HPOCorrelationAnalyzer, SynergyAnalyzer
2
+ from .core import has_disease, has_gene, has_sex, has_variant_effect, has_exon_and_variant_effect, PhenotypeDatasetBuilder
3
+
4
+
5
+
6
+ __version__ = "0.1.1"
7
+
8
+
9
+ __all__ = [
10
+ "PhenotypeDatasetBuilder",
11
+ "HPOCorrelationAnalyzer",
12
+ "SynergyAnalyzer",
13
+ "has_disease",
14
+ "has_gene",
15
+ "has_sex",
16
+ "has_variant_effect",
17
+ "has_exon_and_variant_effect"
18
+
19
+ ]
@@ -0,0 +1,7 @@
1
+ from .hpo_correlation_analyzer import HPOCorrelationAnalyzer
2
+ from .synergy_analyzer import SynergyAnalyzer
3
+
4
+ __all__ = [
5
+ "HPOCorrelationAnalyzer",
6
+ "SynergyAnalyzer",
7
+ ]
@@ -0,0 +1,658 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from os import path
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import plotly.graph_objs as go
10
+ import scipy.stats
11
+ from joblib import Parallel, delayed
12
+ from scipy.sparse import coo_matrix, triu
13
+ from statsmodels.stats.multitest import multipletests
14
+ from tqdm import tqdm
15
+
16
+ from ..core import PhenotypeDataset
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class CorrelationResult:
    """
    Store, filter, export, and visualize HPO pairwise correlation results.

    The object bundles a long-format results table (one row per HPO pair)
    with the matching square coefficient and p-value matrices, and offers
    helpers to filter them, write them to CSV, and render an interactive
    lower-triangle heatmap.
    """

    def __init__(
        self,
        correlation_results: pd.DataFrame,
        coef_matrix: pd.DataFrame,
        pval_matrix: pd.DataFrame,
        label_mapping: dict[str, str],
    ) -> None:
        """
        Parameters
        ----------
        correlation_results : pd.DataFrame
            Long-format table with one row per HPO pair. The methods of this
            class read the columns ``HPO_A``, ``HPO_B``, ``correlation``,
            ``p_value``, ``adj_p_value`` and the ``n(...)`` count columns.

        coef_matrix : pd.DataFrame
            Square matrix of correlation coefficients indexed by HPO id.

        pval_matrix : pd.DataFrame
            Square matrix of p-values with the same labels as ``coef_matrix``.

        label_mapping : dict[str, str]
            Mapping from HPO id to a human-readable label.
        """
        self.correlation_results = correlation_results
        self.coef_matrix = coef_matrix
        self.pval_matrix = pval_matrix
        self.label_mapping = label_mapping
        # Populated by `plot_correlation_heatmap_with_significance()`.
        self.fig: go.Figure | None = None

    @property
    def results_table(self) -> pd.DataFrame:
        """Get a safe copy of the correlation results table."""
        return self.correlation_results.copy()

    @staticmethod
    def _validate_thresholds(corr_threshold: float, adj_pval_threshold: float) -> None:
        """Raise ``ValueError`` unless both thresholds lie in ``[0.0, 1.0]``."""
        if not 0.0 <= corr_threshold <= 1.0:
            raise ValueError("corr_threshold must be between 0.0 and 1.0")
        if not 0.0 <= adj_pval_threshold <= 1.0:
            raise ValueError("adj_pval_threshold must be between 0.0 and 1.0")

    def save_correlation_results(
        self,
        corr_threshold: float = 0.1,
        adj_pval_threshold: float = 0.05,
        output_file: str = "correlation_results.csv",
    ) -> None:
        """
        Save the filtered correlation results to a CSV file.

        Parameters
        ----------
        corr_threshold : float, default=0.1
            Minimum absolute correlation coefficient to retain.

        adj_pval_threshold : float, default=0.05
            Rows are kept only when their adjusted p-value is strictly
            below this value.

        output_file : str, default="correlation_results.csv"
            Output file path. The table is always written as CSV.

        Raises
        ------
        ValueError
            If either threshold is outside ``[0.0, 1.0]``.
        """
        # Validate before touching the data so that an invalid call never
        # writes a file, whether or not the table is empty.
        self._validate_thresholds(corr_threshold, adj_pval_threshold)

        df = self.correlation_results.copy()
        if df.empty:
            logger.warning("Correlation results table is empty. Saving empty file.")
        else:
            strong = df["correlation"].abs() >= corr_threshold
            significant = df["adj_p_value"] < adj_pval_threshold
            df = df[strong & significant]

        df.to_csv(output_file, index=False)

    def filter_weak_correlations(
        self,
        corr_threshold: float = 0.1,
        adj_pval_threshold: float = 0.05,
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Filter the correlation and p-value matrices by effect size and significance.

        Cells whose absolute coefficient is below ``corr_threshold``, and
        cells of pairs whose adjusted p-value is at or above
        ``adj_pval_threshold``, are set to NaN in copies of both matrices.
        Rows and columns that end up entirely NaN in the coefficient matrix
        are then dropped from both. The stored matrices are not modified.

        Parameters
        ----------
        corr_threshold : float, default=0.1
            Minimum absolute correlation coefficient to retain.

        adj_pval_threshold : float, default=0.05
            Pairs with an adjusted p-value at or above this value are removed.

        Returns
        -------
        tuple[pd.DataFrame, pd.DataFrame]
            Filtered correlation matrix and filtered p-value matrix.

        Raises
        ------
        ValueError
            If either threshold is outside ``[0.0, 1.0]``.
        """
        self._validate_thresholds(corr_threshold, adj_pval_threshold)

        coef_matrix = self.coef_matrix.copy()
        p_value = self.pval_matrix.copy()

        weak = coef_matrix.abs() < corr_threshold
        coef_matrix[weak] = np.nan
        p_value[weak] = np.nan

        if not self.correlation_results.empty:
            results = self.correlation_results
            non_signif = results.loc[
                results["adj_p_value"] >= adj_pval_threshold,
                ["HPO_A", "HPO_B"],
            ]
            for hpo_a, hpo_b in zip(non_signif["HPO_A"], non_signif["HPO_B"]):
                if hpo_a in coef_matrix.index and hpo_b in coef_matrix.columns:
                    # Blank both mirror cells of the pair.
                    coef_matrix.loc[hpo_a, hpo_b] = np.nan
                    coef_matrix.loc[hpo_b, hpo_a] = np.nan
                    p_value.loc[hpo_a, hpo_b] = np.nan
                    p_value.loc[hpo_b, hpo_a] = np.nan

        empty_rows = coef_matrix.isna().all(axis=1)
        empty_cols = coef_matrix.isna().all(axis=0)
        coef_matrix_cleaned = coef_matrix.loc[~empty_rows, ~empty_cols]
        p_value_cleaned = p_value.loc[~empty_rows, ~empty_cols]

        return coef_matrix_cleaned, p_value_cleaned

    @staticmethod
    def _format_hpo_pair(
        hpo_id: str,
        label: str | None,
    ) -> str:
        """Format an HPO term for display as ``"label (id)"``, or the bare id without a label."""
        if label:
            return f"{label} ({hpo_id})"
        return hpo_id

    @staticmethod
    def _format_pmids_for_tooltip(
        pmids: str | list[str] | None,
        max_pmids: int = 5,
    ) -> str:
        """
        Format PMID values for hover text.

        Parameters
        ----------
        pmids : str | list[str] | None
            Either a ``;``-separated string or an iterable of PMIDs.
            ``None``, a NaN float (a missing table cell) and empty input
            all produce ``"None"``.

        max_pmids : int, default=5
            Maximum number of PMIDs listed before the rest is summarised
            as ``"... (+k more)"``.

        Returns
        -------
        str
            Comma-separated PMIDs, possibly truncated, or ``"None"``.
        """
        if pmids is None:
            return "None"
        # A missing cell read back from a table arrives as float NaN, which
        # is neither a string nor iterable.
        if isinstance(pmids, float) and np.isnan(pmids):
            return "None"

        if isinstance(pmids, str):
            pmid_list = [p.strip() for p in pmids.split(";") if p.strip()]
        else:
            pmid_list = [str(p).strip() for p in pmids if str(p).strip()]

        if not pmid_list:
            return "None"

        if len(pmid_list) <= max_pmids:
            return ", ".join(pmid_list)

        shown = ", ".join(pmid_list[:max_pmids])
        remaining = len(pmid_list) - max_pmids
        return f"{shown} ... (+{remaining} more)"

    def _build_counts_lookup(self) -> dict[tuple[Any, Any], dict[str, Any]]:
        """
        Index per-pair statistics by ``(term_A, term_B)`` in both orientations.

        The entry stored under ``(X, Y)`` describes the pair with ``X`` in
        the "A" role and ``Y`` in the "B" role, so the two off-diagonal
        contingency counts are swapped between the two orientations.
        """
        lookup: dict[tuple[Any, Any], dict[str, Any]] = {}
        for _, row in self.correlation_results.iterrows():
            shared: dict[str, Any] = {
                "Coefficient": row["correlation"],
                "P_value": row["p_value"],
                "P_value_corrected": row.get("adj_p_value", None),
                "Count_00": row["n(A:E/B:E)"],
                "Count_11": row["n(A:O/B:O)"],
                "n_individuals": row["n_individuals"],
            }
            if "n_pmids" in row.index:
                shared["n_pmids"] = row["n_pmids"]
                shared["pmids"] = row.get("pmids", "")

            lookup[(row["HPO_A"], row["HPO_B"])] = {
                **shared,
                "Count_01": row["n(A:E/B:O)"],
                "Count_10": row["n(A:O/B:E)"],
            }
            lookup[(row["HPO_B"], row["HPO_A"])] = {
                **shared,
                "Count_01": row["n(A:O/B:E)"],  # swapped
                "Count_10": row["n(A:E/B:O)"],  # swapped
            }
        return lookup

    def _build_hover_text(
        self,
        coef_matrix: pd.DataFrame,
        pval_matrix: pd.DataFrame,
        triangle_mask: pd.DataFrame,
    ) -> list[list[str]]:
        """
        Build the per-cell HTML tooltip strings for the heatmap.

        Cells outside ``triangle_mask`` or with a NaN coefficient get an
        empty string. The tooltip names the column term as ``HPO_A`` and
        the row term as ``HPO_B``, so the counts are looked up in that
        same orientation.
        """
        counts_lookup = self._build_counts_lookup()

        hover_text: list[list[str]] = []
        for i, row_id in enumerate(coef_matrix.index):
            display_row = self._format_hpo_pair(row_id, self.label_mapping.get(row_id))
            hover_row: list[str] = []
            for j, col_id in enumerate(coef_matrix.columns):
                coef = coef_matrix.iloc[i, j]
                if not triangle_mask.iloc[i, j] or np.isnan(coef):
                    hover_row.append("")
                    continue

                pval = pval_matrix.iloc[i, j]
                display_col = self._format_hpo_pair(col_id, self.label_mapping.get(col_id))

                # Key order must match the labels below (A = column term,
                # B = row term); otherwise E/O and O/E are shown swapped.
                counts = counts_lookup.get((col_id, row_id), {})

                adj_pval = counts.get("P_value_corrected")
                if adj_pval is None:
                    # `None` cannot be rendered with the float format spec.
                    adj_pval = np.nan

                pmid_block = ""
                if "n_pmids" in counts:
                    pmid_text = self._format_pmids_for_tooltip(
                        counts.get("pmids", ""),
                        max_pmids=4,
                    )
                    pmid_block = (
                        f"<b>N_PMIDs</b>: {int(counts.get('n_pmids', 0))}<br>"
                        f"<b>PMIDs</b>: {pmid_text}"
                    )

                hover_row.append(
                    f"<b>HPO_A</b>: {display_col}<br><b>HPO_B</b>: {display_row}<br>"
                    f"<b>Corr</b>: {coef:.2f}<br><b>p-val</b>: {pval:.6f}<br>"
                    f"<b>adj_p_val</b>: {adj_pval:.6f}<br>"
                    f"<b>Counts(A/B): E/E</b>: {counts.get('Count_00', 0)}, "
                    f"<b>E/O</b>: {counts.get('Count_01', 0)}, "
                    f"<b>O/E</b>: {counts.get('Count_10', 0)}, "
                    f"<b>O/O</b>: {counts.get('Count_11', 0)}<br>"
                    f"<b>Total_individuals</b>: {counts.get('n_individuals', 0)}<br>"
                    f"{pmid_block}"
                )
            hover_text.append(hover_row)
        return hover_text

    def plot_correlation_heatmap_with_significance(
        self,
        corr_threshold: float = 0.1,
        adj_pval_threshold: float = 0.05,
        title_name: str | None = None,
    ) -> go.Figure:
        """
        Plot an interactive correlation heatmap with statistical filtering.

        The matrices are first passed through `filter_weak_correlations`;
        only the lower triangle (including the diagonal) is drawn. The
        figure is stored on ``self.fig`` and returned.

        Parameters
        ----------
        corr_threshold : float, default=0.1
            Minimum absolute correlation coefficient to display.

        adj_pval_threshold : float, default=0.05
            Pairs with an adjusted p-value at or above this value are hidden.

        title_name : str | None, default=None
            Optional subtitle rendered below the main title.

        Returns
        -------
        go.Figure
            The heatmap figure.

        Raises
        ------
        ValueError
            If a threshold is invalid or nothing is left after filtering.
        """
        raw_coef, pval_matrix = self.filter_weak_correlations(
            corr_threshold=corr_threshold,
            adj_pval_threshold=adj_pval_threshold,
        )

        if raw_coef.empty or np.isnan(raw_coef.values).all():
            raise ValueError(
                "The coefficient matrix is empty after filtering. "
                "Try adjusting `corr_threshold` or `adj_pval_threshold`."
            )

        coef_matrix = raw_coef.copy()

        n_rows, n_cols = coef_matrix.shape
        cell_size = 60  # Base pixel size per cell
        max_dim = max(n_rows, n_cols)
        fig_size = min(1200, max_dim * cell_size)  # Cap total figure size to avoid excessive width

        # NOTE(review): with max(..., 28) this value is never below 28, so the
        # title font passed to the layout is always clamped to 24 by the
        # min() further down; confirm whether a cap was intended here.
        title_fontsize = max(14 + max_dim // 2, 28)
        label_fontsize = max(8, 12 - max_dim // 8)
        annot_fontsize = max(6, 12 - max_dim // 8)

        # Keep the lower triangle (diagonal included); everything above it is blanked.
        triangle_mask = pd.DataFrame(
            np.tril(np.ones(coef_matrix.shape, dtype=bool), k=0),
            index=coef_matrix.index,
            columns=coef_matrix.columns,
        )
        coef_matrix = coef_matrix.where(triangle_mask)
        pval_matrix = pval_matrix.where(triangle_mask)
        display_matrix = coef_matrix.copy()

        # Background layer: lower-triangle cells without a value get a flat tint.
        nan_bg = pd.DataFrame(np.nan, index=coef_matrix.index, columns=coef_matrix.columns)
        nan_bg[triangle_mask & coef_matrix.isna()] = 2

        text_matrix = np.where(
            np.isnan(coef_matrix.values),
            "",
            coef_matrix.round(2).astype(str),
        )

        hover_text = self._build_hover_text(coef_matrix, pval_matrix, triangle_mask)

        # Axis tick labels use human-readable names where a mapping exists.
        labelled = coef_matrix.rename(index=self.label_mapping, columns=self.label_mapping)

        fig = go.Figure()
        fig.add_trace(go.Heatmap(
            z=nan_bg.values,
            x=labelled.columns,
            y=labelled.index,
            colorscale=[[0, "#eef4fb"], [1, "#eef4fb"]],
            showscale=False,
            hoverinfo="skip",
            xgap=1,
            ygap=1,
        ))
        fig.add_trace(go.Heatmap(
            z=display_matrix.values,
            x=labelled.columns,
            y=labelled.index,
            colorscale=[
                [0.00, "#203864"],  # navy
                [0.50, "#F7F4ED"],  # ivory
                [1.00, "#7A1F3D"],  # wine
            ],
            zmin=-1,
            zmax=1,
            zmid=0,
            text=text_matrix,
            texttemplate=f"<span style='font-size:{annot_fontsize}px'>%{{text}}</span>",
            hovertext=hover_text,
            hoverinfo="text",
            colorbar=dict(title="Corr.", len=0.8, thickness=title_fontsize),
            xgap=1,
            ygap=1,
        ))

        max_ylabel_len = max(len(str(lbl)) for lbl in labelled.index) if not labelled.empty else 10
        left_margin = 60 + max_ylabel_len * label_fontsize

        clean_subtitle = title_name.strip() if title_name and title_name.strip() else ""

        main_title = "<b>Phi Coefficient Matrix for HPO Pairwise Associations</b>"

        full_title = (
            f"{main_title}<br><span style='font-size:0.8em'>{clean_subtitle}</span>"
            if clean_subtitle
            else main_title
        )

        fig.update_layout(
            title=dict(
                text=full_title,
                x=0.5,
                xanchor="center",
                yanchor="top",
                font=dict(
                    size=min(title_fontsize, 24),
                    family="Arial",
                ),
            ),
            xaxis=dict(
                tickangle=90,
                tickfont=dict(size=label_fontsize),
            ),
            yaxis=dict(
                tickfont=dict(size=label_fontsize),
                scaleanchor="x",
                scaleratio=1,
            ),
            width=fig_size + left_margin,
            height=fig_size + left_margin,
            plot_bgcolor="white",
            paper_bgcolor="white",
        )
        fig.update_yaxes(autorange="reversed")
        self.fig = fig
        return fig

    def save_correlation_heatmap(self, output_file: str = "correlation_heatmap.html") -> None:
        """
        Save a correlation heatmap as an HTML file.

        Parameters
        ----------
        output_file : str
            Output HTML file path; must end with ``.html``.

        Raises
        ------
        RuntimeError
            If no figure has been created yet.
        ValueError
            If ``output_file`` does not end with ``.html``.
        """
        if self.fig is None:
            raise RuntimeError("No heatmap figure found. Please run `plot_correlation_heatmap_with_significance()` first.")
        if not output_file.endswith(".html"):
            raise ValueError("output_file must have a '.html' extension")
        self.fig.write_html(output_file)
377
+
378
+
379
class HPOCorrelationAnalyzer:
    """
    Analyze pairwise correlations between HPO terms using the Phi coefficient and Fisher's exact test.
    """

    def __init__(
        self,
        dataset: PhenotypeDataset,
        min_individuals_for_correlation_test: int = 20,
    ) -> None:
        """
        Parameters
        ----------
        dataset : PhenotypeDataset
            Dataset containing HPO feature data and metadata.

        min_individuals_for_correlation_test : int, default=20
            Minimum number of valid individuals required to evaluate a
            pairwise correlation.

        Raises
        ------
        TypeError
            If ``dataset`` is not a ``PhenotypeDataset``.
        """
        if not isinstance(dataset, PhenotypeDataset):
            raise TypeError("`dataset` must be a `PhenotypeDataset` instance.")
        self.dataset = dataset
        self.hpo_matrix = self.dataset.hpo_data.matrix
        self.hpo_terms = self.hpo_matrix.columns
        self.n_features = self.hpo_matrix.shape[1]
        self.label_mapping = self.dataset.hpo_data.label_mapping
        self.individual_ids = self.hpo_matrix.index

        relationship_mask_df = self.dataset.hpo_data.relationship_mask
        if relationship_mask_df is not None:
            self.relationship_mask = relationship_mask_df.to_numpy(copy=True)
        else:
            logger.warning("No relationship_mask provided. All feature pairs will be evaluated for correlation.")
            # NaN marks a pair to skip; without a mask only self-pairs are excluded.
            self.relationship_mask = np.zeros((self.n_features, self.n_features))
            np.fill_diagonal(self.relationship_mask, np.nan)

        self.min_individuals_for_correlation_test = min_individuals_for_correlation_test

    @staticmethod
    def _calculate_stats(observed_status_A: np.ndarray, observed_status_B: np.ndarray) -> tuple[float, float]:
        """
        Compute the Phi correlation coefficient and Fisher's Exact test p-value.

        Parameters
        ----------
        observed_status_A, observed_status_B : np.ndarray
            Equal-length status vectors for the two terms.

        Returns
        -------
        tuple[float, float]
            ``(phi, p_value)``. ``phi`` is NaN when the contingency table is
            not 2x2 or a marginal total is zero; ``p_value`` is NaN when
            Fisher's test rejects the table.
        """
        confusion_matrix = pd.crosstab(observed_status_A, observed_status_B, dropna=False)

        phi = np.nan
        if confusion_matrix.shape == (2, 2):
            a, b, c, d = (int(v) for v in confusion_matrix.to_numpy().ravel())
            # Multiply as floats: the integer product of the four marginals
            # can exceed the 64-bit range for large cohorts, which np.sqrt
            # cannot handle when given a Python int.
            denominator = np.sqrt(float(a + b) * float(c + d) * float(a + c) * float(b + d))
            if denominator != 0:
                phi = (a * d - b * c) / denominator

        try:
            _, pval = scipy.stats.fisher_exact(confusion_matrix)
        except (ValueError, TypeError):
            # Table shape not accepted by fisher_exact; report "no p-value"
            # instead of hiding unrelated errors behind a bare except.
            pval = np.nan

        return phi, pval

    def _calculate_pairwise_correlation(
        self,
        col_a: int,
        col_b: int,
        include_pmids: bool = True,
    ) -> tuple[int, int, float, float, dict[str, Any]]:
        """
        Compute the correlation between two specific HPO term columns.

        Only rows where both columns are non-NaN are used. If no such rows
        exist, or either column is constant over them, the coefficient and
        p-value are NaN and zeroed counts are returned.

        Parameters
        ----------
        col_a, col_b : int
            Positional column indices into ``self.hpo_matrix``.

        include_pmids : bool, default=True
            If ``True``, collect the distinct PMIDs returned by
            ``self.dataset.get_pmids()`` for the contributing individuals.

        Returns
        -------
        tuple[int, int, float, float, dict[str, Any]]
            ``(col_a, col_b, coefficient, p_value, counts)`` where ``counts``
            has the keys ``"00"``, ``"01"``, ``"10"``, ``"11"`` (first digit
            is the value in column A, second in column B), ``"N"``,
            ``"n_pmid"`` and ``"pmids"``.
        """
        matrix = self.hpo_matrix.values
        both_known = (~np.isnan(matrix[:, col_a])) & (~np.isnan(matrix[:, col_b]))
        col_a_values = matrix[both_known, col_a]
        col_b_values = matrix[both_known, col_b]
        total = len(col_a_values)

        empty_counts: dict[str, Any] = {
            "00": 0,
            "01": 0,
            "10": 0,
            "11": 0,
            "N": 0,
            "n_pmid": np.nan,
            "pmids": [],
        }

        # A constant column has no variance, so no correlation is defined.
        if total == 0 or np.all(col_a_values == col_a_values[0]) or np.all(col_b_values == col_b_values[0]):
            return (col_a, col_b, np.nan, np.nan, empty_counts)

        count_11 = np.sum((col_a_values == 1) & (col_b_values == 1))
        count_10 = np.sum((col_a_values == 1) & (col_b_values == 0))
        count_01 = np.sum((col_a_values == 0) & (col_b_values == 1))
        count_00 = np.sum((col_a_values == 0) & (col_b_values == 0))

        try:
            coef, p_val = self._calculate_stats(col_a_values, col_b_values)
            if include_pmids:
                individual_ids = self.individual_ids[both_known]
                all_pmids_series = self.dataset.get_pmids()
                pmids_list = all_pmids_series.loc[individual_ids].to_numpy()

                all_pmids = sorted(
                    {
                        str(pmid)
                        for pmids in pmids_list
                        if pmids is not None
                        for pmid in pmids
                        if pd.notna(pmid)
                    }
                )
                n_pmids = len(all_pmids)
            else:
                all_pmids = []
                n_pmids = np.nan

            return (col_a, col_b, coef, p_val, {
                "00": count_00,
                "01": count_01,
                "10": count_10,
                "11": count_11,
                "N": total,
                "n_pmid": n_pmids,
                "pmids": all_pmids,
            })
        except Exception as e:
            # Best effort: one failing pair must not abort the whole run.
            logger.error(
                "Error calculating correlation for columns %d and %d: %s",
                col_a,
                col_b,
                e,
            )
            return col_a, col_b, np.nan, np.nan, empty_counts

    def compute_correlation_matrix(
        self,
        n_jobs: int = -1,
        include_pmids: bool = True,
    ) -> CorrelationResult:
        """
        Compute pairwise correlations between HPO terms.

        Candidate pairs are the strict upper-triangle pairs whose
        ``relationship_mask`` entry is not NaN and that share at least
        ``min_individuals_for_correlation_test`` individuals with a non-NaN
        value in both columns. Each candidate is scored with
        `_calculate_pairwise_correlation`, and the p-values are adjusted
        with the Benjamini-Hochberg procedure.

        The coefficient matrix, p-value matrix and results table are also
        stored on ``self.coef_df``, ``self.pval_df`` and
        ``self.correlation_results``.

        Parameters
        ----------
        n_jobs : int, default=-1
            Number of parallel jobs. ``-1`` uses all available CPUs.

        include_pmids : bool, default=True
            If ``True``, aggregate PMIDs from contributing individuals.

        Returns
        -------
        CorrelationResult
            An object encapsulating the long-format correlation statistics,
            symmetric score/p-value matrices, and helper plotting methods.

        Raises
        ------
        ValueError
            If the HPO matrix contains no ``1`` or no ``0`` at all.
        """
        x = self.hpo_matrix.to_numpy()

        has_one = np.any(x == 1)
        has_zero = np.any(x == 0)

        if not has_one or not has_zero:
            raise ValueError(
                "HPO matrix lacks sufficient variation for correlation analysis.\n"
                f"Detected values: "
                f"{'1 present, ' if has_one else 'no 1, '}"
                f"{'0 present' if has_zero else 'no 0'}.\n"
                "At least one observed (1) and one excluded (0) value are required.\n"
                "Please check your preprocessing (e.g., missing exclusion annotations)."
            )

        # valid_counts[i, j] = number of individuals with a value in both columns.
        observed = ~np.isnan(x)
        valid_counts = observed.T.astype(int) @ observed.astype(int)
        upper = triu(coo_matrix(valid_counts), k=1)
        pair_rows, pair_cols, pair_counts = upper.row, upper.col, upper.data

        ontology_values = self.relationship_mask[pair_rows, pair_cols]
        ontology_candidate = ~np.isnan(ontology_values)

        n_pairs_after_ontology = np.sum(ontology_candidate)

        candidate_idx = np.where(
            ontology_candidate & (pair_counts >= self.min_individuals_for_correlation_test)
        )[0]

        pairs = list(zip(pair_rows[candidate_idx], pair_cols[candidate_idx]))

        if len(pairs) == 0:
            logger.warning(
                "[Correlation Analysis Blocked]: No HPO term pairs passed the candidate pre-filtering selection.\n"
                "--------------------------------------------------------------------------------------------------\n"
                "DIAGNOSIS SUMMARY:\n"
                f" - Pairs remaining after HPO Hierarchy Masking (excluding ancestors/descendants): {n_pairs_after_ontology}\n"
                f" - Pairs dropped due to low sample size (min_individuals_for_correlation_test={self.min_individuals_for_correlation_test}): {n_pairs_after_ontology}\n"
                "SUGGESTION:\n"
                " Try lowering `min_individuals_for_correlation_test` (e.g., to 10 or 5) when instantiating HPOCorrelationAnalyzer,\n"
                " or check the sample size and missing value distribution in your Phenopackets queue.\n"
                "--------------------------------------------------------------------------------------------------"
            )
            empty_df = pd.DataFrame(columns=["HPO_A", "HPO_B", "correlation", "p_value", "adj_p_value"])
            # Separate frames so the two matrices never alias one object.
            empty_coef = pd.DataFrame(index=self.hpo_terms, columns=self.hpo_terms, dtype=float)
            empty_pval = pd.DataFrame(index=self.hpo_terms, columns=self.hpo_terms, dtype=float)
            return CorrelationResult(empty_df, empty_coef, empty_pval, self.label_mapping)

        results = Parallel(n_jobs=n_jobs)(
            delayed(self._calculate_pairwise_correlation)(i, j, include_pmids=include_pmids)
            for i, j in tqdm(pairs, desc="Calculating pairwise correlation")
        )

        coef_matrix = np.full((self.n_features, self.n_features), np.nan)
        pvalue_matrix = np.full((self.n_features, self.n_features), np.nan)

        table_rows: list[dict[str, Any]] = []
        for i, j, coef, pval, counts in results:
            coef_matrix[i, j] = coef
            coef_matrix[j, i] = coef
            pvalue_matrix[i, j] = pval
            pvalue_matrix[j, i] = pval

            # Only pairs with a defined coefficient go into the long table.
            if j <= i or np.isnan(coef):
                continue

            hpo_a, hpo_b = self.hpo_terms[i], self.hpo_terms[j]
            label_a = self.label_mapping.get(hpo_a)
            label_b = self.label_mapping.get(hpo_b)

            row_data: dict[str, Any] = {"HPO_A": hpo_a}
            if label_a:
                row_data["HPO_A_label"] = label_a
            row_data["HPO_B"] = hpo_b
            if label_b:
                row_data["HPO_B_label"] = label_b
            row_data.update({
                "correlation": coef,
                "p_value": pval,
                "n(A:E/B:E)": counts["00"],
                "n(A:E/B:O)": counts["01"],
                "n(A:O/B:E)": counts["10"],
                "n(A:O/B:O)": counts["11"],
                "n_individuals": counts["N"],
            })
            if include_pmids:
                row_data["n_pmids"] = counts["n_pmid"]
                row_data["pmids"] = ";".join(counts.get("pmids", []))
            table_rows.append(row_data)

        valid_mask = ~(np.isnan(coef_matrix).all(axis=0))

        if not np.any(valid_mask):
            logger.warning(
                "[Correlation Analysis Empty]: Pairwise calculations finished, but NO valid statistical correlations were found.\n"
                "Possible reasons include:\n"
                " - All calculated correlation coefficients returned NaN due to zero variance (constant terms).\n"
                " - Perfect separation or overlapping annotations skewed the contingency tables.\n"
                "The resulting CorrelationResult matrices will contain entirely NaN values."
            )

        filtered_columns = self.hpo_terms[valid_mask]
        keep = np.ix_(valid_mask, valid_mask)

        self.coef_df = pd.DataFrame(coef_matrix[keep], index=filtered_columns, columns=filtered_columns)
        self.pval_df = pd.DataFrame(pvalue_matrix[keep], index=filtered_columns, columns=filtered_columns)

        if table_rows:
            self.correlation_results = pd.DataFrame(table_rows)
            pvals = self.correlation_results["p_value"].to_numpy(dtype=float)

            # Correct finite p-values only: a single NaN would otherwise
            # propagate through the step-up procedure to every adjusted value.
            pvals_corrected = np.full(pvals.shape, np.nan)
            finite = np.isfinite(pvals)
            if finite.any():
                _, adjusted, _, _ = multipletests(pvals[finite], method="fdr_bh")
                pvals_corrected[finite] = adjusted

            loc = int(self.correlation_results.columns.get_loc("p_value"))
            self.correlation_results.insert(loc + 1, "adj_p_value", pvals_corrected)
            self.correlation_results.sort_values(by="adj_p_value", ascending=True, inplace=True)
        else:
            # Same schema as the "no candidate pairs" path above.
            self.correlation_results = pd.DataFrame(
                columns=["HPO_A", "HPO_B", "correlation", "p_value", "adj_p_value"]
            )

        return CorrelationResult(
            correlation_results=self.correlation_results,
            coef_matrix=self.coef_df,
            pval_matrix=self.pval_df,
            label_mapping=self.label_mapping,
        )