phenosign 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phenosign/__init__.py +19 -0
- phenosign/analysis/__init__.py +7 -0
- phenosign/analysis/hpo_correlation_analyzer.py +658 -0
- phenosign/analysis/synergy_analyzer.py +751 -0
- phenosign/core/__init__.py +14 -0
- phenosign/core/builder.py +356 -0
- phenosign/core/dataset.py +544 -0
- phenosign/core/features_data.py +121 -0
- phenosign/core/predicates.py +271 -0
- phenosign/ontology/__init__.py +5 -0
- phenosign/ontology/_hpo_hierarchy.py +242 -0
- phenosign/ontology/_term_manager.py +188 -0
- phenosign/synergy_tree/__init__.py +18 -0
- phenosign/synergy_tree/builder.py +234 -0
- phenosign/synergy_tree/mi_calculator.py +104 -0
- phenosign/synergy_tree/partition.py +172 -0
- phenosign/synergy_tree/tree_node.py +80 -0
- phenosign/synergy_tree/visualizer.py +124 -0
- phenosign-0.1.1.dist-info/METADATA +127 -0
- phenosign-0.1.1.dist-info/RECORD +23 -0
- phenosign-0.1.1.dist-info/WHEEL +5 -0
- phenosign-0.1.1.dist-info/licenses/LICENSE +28 -0
- phenosign-0.1.1.dist-info/top_level.txt +1 -0
phenosign/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from .analysis import HPOCorrelationAnalyzer, SynergyAnalyzer
|
|
2
|
+
from .core import has_disease, has_gene, has_sex, has_variant_effect, has_exon_and_variant_effect, PhenotypeDatasetBuilder
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
__version__ = "0.1.1"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"PhenotypeDatasetBuilder",
|
|
11
|
+
"HPOCorrelationAnalyzer",
|
|
12
|
+
"SynergyAnalyzer",
|
|
13
|
+
"has_disease",
|
|
14
|
+
"has_gene",
|
|
15
|
+
"has_sex",
|
|
16
|
+
"has_variant_effect",
|
|
17
|
+
"has_exon_and_variant_effect"
|
|
18
|
+
|
|
19
|
+
]
|
|
@@ -0,0 +1,658 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from os import path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import plotly.graph_objs as go
|
|
10
|
+
import scipy.stats
|
|
11
|
+
from joblib import Parallel, delayed
|
|
12
|
+
from scipy.sparse import coo_matrix, triu
|
|
13
|
+
from statsmodels.stats.multitest import multipletests
|
|
14
|
+
from tqdm import tqdm
|
|
15
|
+
|
|
16
|
+
from ..core import PhenotypeDataset
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class CorrelationResult:
|
|
22
|
+
"""
|
|
23
|
+
A class to store, manage, and visualize HPO pairwise correlation results.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
def __init__(
|
|
27
|
+
self,
|
|
28
|
+
correlation_results:pd.DataFrame,
|
|
29
|
+
coef_matrix: pd.DataFrame,
|
|
30
|
+
pval_matrix: pd.DataFrame,
|
|
31
|
+
label_mapping: dict[str, str]
|
|
32
|
+
) -> None:
|
|
33
|
+
self.correlation_results = correlation_results
|
|
34
|
+
self.coef_matrix = coef_matrix
|
|
35
|
+
self.pval_matrix = pval_matrix
|
|
36
|
+
self.label_mapping = label_mapping
|
|
37
|
+
self.fig: go.Figure | None = None
|
|
38
|
+
|
|
39
|
+
@property
|
|
40
|
+
def results_table(self) -> pd.DataFrame:
|
|
41
|
+
"""Get a safe copy of the correlation results table."""
|
|
42
|
+
return self.correlation_results.copy()
|
|
43
|
+
|
|
44
|
+
def save_correlation_results(
|
|
45
|
+
self,
|
|
46
|
+
corr_threshold: float = 0.1,
|
|
47
|
+
adj_pval_threshold: float = 0.05,
|
|
48
|
+
output_file: str="correlation_results.csv"
|
|
49
|
+
) -> None:
|
|
50
|
+
"""
|
|
51
|
+
Save correlation results to a CSV or Excel file.
|
|
52
|
+
|
|
53
|
+
Parameters
|
|
54
|
+
----------
|
|
55
|
+
corr_threshold : float, default=0.0
|
|
56
|
+
Minimum correlation coefficient to retain.
|
|
57
|
+
|
|
58
|
+
adj_pval_threshold : float, default=0.05
|
|
59
|
+
Maximum adjusted p-value to retain.
|
|
60
|
+
|
|
61
|
+
output_file : str, default="correlation_results.csv"
|
|
62
|
+
Output file path. Supported formats are ``.csv``.
|
|
63
|
+
|
|
64
|
+
Raises
|
|
65
|
+
------
|
|
66
|
+
ValueError
|
|
67
|
+
If correlation results have not been computed or if thresholds
|
|
68
|
+
are invalid.
|
|
69
|
+
"""
|
|
70
|
+
if self.correlation_results.empty:
|
|
71
|
+
logger.warning("Correlation results table is empty. Saving empty file.")
|
|
72
|
+
df = self.correlation_results.copy()
|
|
73
|
+
else:
|
|
74
|
+
df = self.correlation_results.copy()
|
|
75
|
+
if not 0.0 <= corr_threshold <= 1.0:
|
|
76
|
+
raise ValueError("corr_threshold must be between 0.0 and 1.0")
|
|
77
|
+
df = df[df["correlation"].abs() >= corr_threshold]
|
|
78
|
+
|
|
79
|
+
if not 0.0 <= adj_pval_threshold <= 1.0:
|
|
80
|
+
raise ValueError("adj_pval_threshold must be between 0.0 and 1.0")
|
|
81
|
+
df = df[df["adj_p_value"] < adj_pval_threshold]
|
|
82
|
+
|
|
83
|
+
df.to_csv(output_file, index=False)
|
|
84
|
+
|
|
85
|
+
def filter_weak_correlations(
|
|
86
|
+
self,
|
|
87
|
+
corr_threshold: float = 0.1,
|
|
88
|
+
adj_pval_threshold: float = 0.05
|
|
89
|
+
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
|
90
|
+
"""
|
|
91
|
+
Filter the correlation and p-value matrices by effect size and significance.
|
|
92
|
+
|
|
93
|
+
Parameters
|
|
94
|
+
----------
|
|
95
|
+
corr_threshold : float, default=0.1
|
|
96
|
+
Minimum correlation coefficient to retain.
|
|
97
|
+
|
|
98
|
+
adj_pval_threshold : float, default=0.05
|
|
99
|
+
Maximum adjusted p-value to retain.
|
|
100
|
+
|
|
101
|
+
Returns
|
|
102
|
+
-------
|
|
103
|
+
tuple[pd.DataFrame, pd.DataFrame]
|
|
104
|
+
Filtered correlation matrix and filtered p-value matrix.
|
|
105
|
+
"""
|
|
106
|
+
coef_matrix = self.coef_matrix.copy()
|
|
107
|
+
p_value = self.pval_matrix.copy()
|
|
108
|
+
|
|
109
|
+
if not 0.0 <= corr_threshold <= 1.0:
|
|
110
|
+
raise ValueError("corr_threshold must be between 0.0 and 1.0")
|
|
111
|
+
mask = coef_matrix.abs() < corr_threshold
|
|
112
|
+
coef_matrix[mask] = np.nan
|
|
113
|
+
p_value[mask] = np.nan
|
|
114
|
+
|
|
115
|
+
if not 0.0 <= adj_pval_threshold <= 1.0:
|
|
116
|
+
raise ValueError("adj_pval_threshold must be between 0.0 and 1.0")
|
|
117
|
+
|
|
118
|
+
if not self.correlation_results.empty:
|
|
119
|
+
non_signif = self.correlation_results.loc[
|
|
120
|
+
(self.correlation_results["adj_p_value"] >= adj_pval_threshold),
|
|
121
|
+
["HPO_A", "HPO_B"]
|
|
122
|
+
]
|
|
123
|
+
for _, row in non_signif.iterrows():
|
|
124
|
+
hpo_a, hpo_b = row["HPO_A"], row["HPO_B"]
|
|
125
|
+
if hpo_a in coef_matrix.index and hpo_b in coef_matrix.columns:
|
|
126
|
+
coef_matrix.loc[hpo_a, hpo_b] = np.nan
|
|
127
|
+
coef_matrix.loc[hpo_b, hpo_a] = np.nan
|
|
128
|
+
p_value.loc[hpo_a, hpo_b] = np.nan
|
|
129
|
+
p_value.loc[hpo_b, hpo_a] = np.nan
|
|
130
|
+
|
|
131
|
+
mask_rows = coef_matrix.isna().all(axis=1)
|
|
132
|
+
mask_cols = coef_matrix.isna().all(axis=0)
|
|
133
|
+
coef_matrix_cleaned = coef_matrix.loc[~mask_rows, ~mask_cols]
|
|
134
|
+
p_value_cleaned = p_value.loc[~mask_rows, ~mask_cols]
|
|
135
|
+
|
|
136
|
+
return coef_matrix_cleaned, p_value_cleaned
|
|
137
|
+
|
|
138
|
+
@staticmethod
|
|
139
|
+
def _format_hpo_pair(
|
|
140
|
+
hpo_id: str,
|
|
141
|
+
label: str | None
|
|
142
|
+
) -> str:
|
|
143
|
+
"""Format an HPO term for display."""
|
|
144
|
+
if label:
|
|
145
|
+
return f"{label} ({hpo_id})"
|
|
146
|
+
return hpo_id
|
|
147
|
+
|
|
148
|
+
@staticmethod
|
|
149
|
+
def _format_pmids_for_tooltip(
|
|
150
|
+
pmids: str | list[str] | None,
|
|
151
|
+
max_pmids: int = 5,
|
|
152
|
+
) -> str:
|
|
153
|
+
"""Format PMID values for hover text."""
|
|
154
|
+
if pmids is None or pmids == "":
|
|
155
|
+
return "None"
|
|
156
|
+
|
|
157
|
+
if isinstance(pmids, str):
|
|
158
|
+
pmid_list = [p.strip() for p in pmids.split(";") if p.strip()]
|
|
159
|
+
else:
|
|
160
|
+
pmid_list = [str(p).strip() for p in pmids if str(p).strip()]
|
|
161
|
+
|
|
162
|
+
if not pmid_list:
|
|
163
|
+
return "None"
|
|
164
|
+
|
|
165
|
+
if len(pmid_list) <= max_pmids:
|
|
166
|
+
return ", ".join(pmid_list)
|
|
167
|
+
|
|
168
|
+
shown = ", ".join(pmid_list[:max_pmids])
|
|
169
|
+
remaining = len(pmid_list) - max_pmids
|
|
170
|
+
return f"{shown} ... (+{remaining} more)"
|
|
171
|
+
|
|
172
|
+
def plot_correlation_heatmap_with_significance(
    self,
    corr_threshold: float = 0.1,
    adj_pval_threshold: float = 0.05,
    title_name: str | None = None,
) -> go.Figure:
    """
    Plot an interactive correlation heatmap with statistical filtering.

    The matrices are first filtered with ``filter_weak_correlations``; only
    the lower triangle (including the diagonal) is drawn. The figure is
    stored on ``self.fig`` and also returned.

    Parameters
    ----------
    corr_threshold : float, default=0.1
        Minimum absolute correlation coefficient to display.

    adj_pval_threshold : float, default=0.05
        Adjusted p-value threshold forwarded to ``filter_weak_correlations``.

    title_name : str | None, default=None
        Optional subtitle shown under the main title; ignored when blank.

    Returns
    -------
    go.Figure
        The generated heatmap figure.

    Raises
    ------
    ValueError
        If nothing is left to plot after filtering, or if a threshold is
        rejected by ``filter_weak_correlations``.
    """
    raw_coef, pval_matrix = self.filter_weak_correlations(
        corr_threshold=corr_threshold,
        adj_pval_threshold=adj_pval_threshold
    )

    if raw_coef.empty or np.isnan(raw_coef.values).all():
        raise ValueError(
            "The coefficient matrix is empty after filtering. "
            "Try adjusting `corr_threshold` or `adj_pval_threshold`."
        )

    coef_matrix = raw_coef.copy()

    n_rows, n_cols = coef_matrix.shape
    cell_size = 60  # Base pixel size per cell
    max_dim = max(n_rows, n_cols)
    fig_size = min(1200, max_dim * cell_size)  # Cap total figure size to avoid excessive width

    # NOTE(review): max() makes this value at least 28, so the title font
    # below is always capped to 24 -- confirm whether min() was intended.
    title_fontsize = max(14 + max_dim // 2, 28)
    label_fontsize = max(8, 12 - max_dim // 8)
    annot_fontsize = max(6, 12 - max_dim // 8)

    # Keep only the lower triangle (diagonal included); the rest becomes NaN.
    triangle_mask = pd.DataFrame(
        np.tril(np.ones(coef_matrix.shape, dtype=bool), k=0),
        index=coef_matrix.index,
        columns=coef_matrix.columns
    )
    coef_matrix = coef_matrix.where(triangle_mask)
    pval_matrix = pval_matrix.where(triangle_mask)
    display_matrix = coef_matrix.where(triangle_mask)

    # Background layer: marks lower-triangle cells that have no coefficient.
    nan_bg = pd.DataFrame(np.nan, index=coef_matrix.index, columns=coef_matrix.columns)
    nan_bg[triangle_mask & coef_matrix.isna()] = 2

    text_matrix = np.where(
        np.isnan(coef_matrix.values),
        "",
        coef_matrix.round(2).astype(str)
    )

    # Per-pair statistics keyed by (term treated as A, term treated as B).
    # Both orientations are stored; the reversed one swaps the two
    # discordant counts so they stay correct from that point of view.
    counts_lookup = {}
    for record in self.correlation_results.to_dict("records"):
        forward = {
            "Coefficient": record["correlation"],
            "P_value": record["p_value"],
            "P_value_corrected": record.get("adj_p_value", None),
            "Count_00": record["n(A:E/B:E)"],
            "Count_01": record["n(A:E/B:O)"],
            "Count_10": record["n(A:O/B:E)"],
            "Count_11": record["n(A:O/B:O)"],
            "n_individuals": record["n_individuals"],
        }

        backward = {
            "Coefficient": record["correlation"],
            "P_value": record["p_value"],
            "P_value_corrected": record.get("adj_p_value", None),
            "Count_00": record["n(A:E/B:E)"],
            "Count_01": record["n(A:O/B:E)"],  # swapped
            "Count_10": record["n(A:E/B:O)"],  # swapped
            "Count_11": record["n(A:O/B:O)"],
            "n_individuals": record["n_individuals"],
        }

        if "n_pmids" in record:
            forward["n_pmids"] = record["n_pmids"]
            forward["pmids"] = record.get("pmids", "")
            backward["n_pmids"] = record["n_pmids"]
            backward["pmids"] = record.get("pmids", "")

        counts_lookup[(record["HPO_A"], record["HPO_B"])] = forward
        counts_lookup[(record["HPO_B"], record["HPO_A"])] = backward

    hover_text = []
    for i, row in enumerate(coef_matrix.index):
        hover_row = []
        display_row = self._format_hpo_pair(row, self.label_mapping.get(row))
        for j, col in enumerate(coef_matrix.columns):
            coef = coef_matrix.iloc[i, j]
            pval = pval_matrix.iloc[i, j]

            display_col = self._format_hpo_pair(col, self.label_mapping.get(col))

            if not triangle_mask.iloc[i, j] or np.isnan(coef):
                hover_row.append("")
            else:
                # The tooltip labels the column term as HPO_A and the row
                # term as HPO_B, so the counts must be looked up in that
                # same orientation; otherwise E/O and O/E are shown swapped.
                counts = counts_lookup.get((col, row), {})
                adj_pval = counts.get("P_value_corrected")
                if adj_pval is None:
                    # A missing value cannot be formatted with ":.6f".
                    adj_pval = np.nan
                pmid_block = ""
                if "n_pmids" in counts:
                    pmid_text = self._format_pmids_for_tooltip(
                        counts.get("pmids", ""),
                        max_pmids=4,
                    )
                    pmid_block = (
                        f"<b>N_PMIDs</b>: {int(counts.get('n_pmids', 0))}<br>"
                        f"<b>PMIDs</b>: {pmid_text}"
                    )
                hover_row.append(
                    f"<b>HPO_A</b>: {display_col}<br><b>HPO_B</b>: {display_row}<br>"
                    f"<b>Corr</b>: {coef:.2f}<br><b>p-val</b>: {pval:.6f}<br>"
                    f"<b>adj_p_val</b>: {adj_pval:.6f}<br>"
                    f"<b>Counts(A/B): E/E</b>: {counts.get('Count_00', 0)}, "
                    f"<b>E/O</b>: {counts.get('Count_01', 0)}, "
                    f"<b>O/E</b>: {counts.get('Count_10', 0)}, "
                    f"<b>O/O</b>: {counts.get('Count_11', 0)}<br>"
                    f"<b>Total_individuals</b>: {counts.get('n_individuals', 0)}<br>"
                    f"{pmid_block}"
                )
        hover_text.append(hover_row)

    # From here on the axes show labels instead of identifiers where the
    # mapping has an entry.
    coef_matrix.rename(index=self.label_mapping, columns=self.label_mapping, inplace=True)

    fig = go.Figure()
    fig.add_trace(go.Heatmap(
        z=nan_bg.values,
        x=coef_matrix.columns,
        y=coef_matrix.index,
        colorscale=[[0, "#eef4fb"], [1, "#eef4fb"]],
        showscale=False,
        hoverinfo="skip",
        xgap=1,
        ygap=1,
    ))
    fig.add_trace(go.Heatmap(
        z=display_matrix.values,
        x=coef_matrix.columns,
        y=coef_matrix.index,
        colorscale=[
            [0.00, "#203864"],  # navy
            [0.50, "#F7F4ED"],  # ivory
            [1.00, "#7A1F3D"]   # wine
        ],
        zmin=-1,
        zmax=1,
        zmid=0,
        text=text_matrix,
        texttemplate=f"<span style='font-size:{annot_fontsize}px'>%{{text}}</span>",
        hovertext=hover_text,
        hoverinfo="text",
        colorbar=dict(title="Corr.", len=0.8, thickness=title_fontsize),
        xgap=1,
        ygap=1,
    ))

    # Widen the margin in proportion to the longest y-axis label.
    max_ylabel_len = max(len(str(lbl)) for lbl in coef_matrix.index) if not coef_matrix.empty else 10
    left_margin = 60 + max_ylabel_len * label_fontsize

    clean_subtitle = title_name.strip() if title_name and title_name.strip() else ""

    main_title = "<b>Phi Coefficient Matrix for HPO Pairwise Associations</b>"

    full_title = f"{main_title}<br><span style='font-size:0.8em'>{clean_subtitle}</span>" if clean_subtitle else main_title

    fig.update_layout(
        title=dict(
            text=full_title,
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(
                size=min(title_fontsize, 24),
                family="Arial"
            )
        ),
        xaxis=dict(
            tickangle=90,
            tickfont=dict(size=label_fontsize),
        ),
        yaxis=dict(
            tickfont=dict(size=label_fontsize),
            scaleanchor="x",
            scaleratio=1
        ),
        width=fig_size + left_margin,
        height=fig_size + left_margin,
        plot_bgcolor="white",
        paper_bgcolor="white"
    )
    fig.update_yaxes(autorange="reversed")
    self.fig = fig
    return fig
|
|
362
|
+
|
|
363
|
+
def save_correlation_heatmap(self, output_file: str = "correlation_heatmap.html") -> None:
|
|
364
|
+
"""
|
|
365
|
+
Save a correlation heatmap as an HTML file.
|
|
366
|
+
|
|
367
|
+
Parameters
|
|
368
|
+
----------
|
|
369
|
+
output_file : str
|
|
370
|
+
Output HTML file path.
|
|
371
|
+
"""
|
|
372
|
+
if self.fig is None:
|
|
373
|
+
raise RuntimeError("No heatmap figure found. Please run `plot_correlation_heatmap_with_significance()` first.")
|
|
374
|
+
if not output_file.endswith(".html"):
|
|
375
|
+
raise ValueError("output_file must have a '.html' extension")
|
|
376
|
+
self.fig.write_html(output_file)
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
class HPOCorrelationAnalyzer:
|
|
380
|
+
"""
|
|
381
|
+
Analyze pairwise correlations between HPO terms using the Phi coefficient and Fisher's exact test.
|
|
382
|
+
"""
|
|
383
|
+
|
|
384
|
+
def __init__(
    self,
    dataset: PhenotypeDataset,
    min_individuals_for_correlation_test: int = 20,
) -> None:
    """
    Cache the HPO matrix and the pair-exclusion mask of a dataset.

    Parameters
    ----------
    dataset : PhenotypeDataset
        Dataset containing HPO feature data and metadata.

    min_individuals_for_correlation_test : int, default=20
        Minimum number of valid individuals required to evaluate a
        pairwise correlation.

    Raises
    ------
    TypeError
        If ``dataset`` is not a ``PhenotypeDataset`` instance.
    """
    if not isinstance(dataset, PhenotypeDataset):
        raise TypeError("`dataset` must be a `PhenotypeDataset` instance.")

    self.dataset = dataset

    hpo_matrix = self.dataset.hpo_data.matrix
    self.hpo_matrix = hpo_matrix
    self.hpo_terms = hpo_matrix.columns
    self.n_features = hpo_matrix.shape[1]
    self.label_mapping = self.dataset.hpo_data.label_mapping
    self.individual_ids = hpo_matrix.index

    relationship_mask_df = self.dataset.hpo_data.relationship_mask
    if relationship_mask_df is None:
        logger.warning("No relationship_mask provided. All feature pairs will be evaluated for correlation.")
        # NaN marks a pair that must be skipped; without a mask only the
        # diagonal (a term paired with itself) is excluded.
        fallback_mask = np.zeros((self.n_features, self.n_features))
        np.fill_diagonal(fallback_mask, np.nan)
        self.relationship_mask = fallback_mask
    else:
        self.relationship_mask = relationship_mask_df.to_numpy(copy=True)

    self.min_individuals_for_correlation_test = min_individuals_for_correlation_test
|
|
418
|
+
|
|
419
|
+
@staticmethod
|
|
420
|
+
def _calculate_stats( observed_status_A: np.ndarray, observed_status_B: np.ndarray) -> tuple[float, float]:
|
|
421
|
+
"""Compute the Phi correlation coefficient and Fisher's Exact test p-value."""
|
|
422
|
+
confusion_matrix = pd.crosstab(observed_status_A, observed_status_B, dropna=False)
|
|
423
|
+
if confusion_matrix.shape == (2, 2):
|
|
424
|
+
a = confusion_matrix.iloc[0, 0]
|
|
425
|
+
b = confusion_matrix.iloc[0, 1]
|
|
426
|
+
c = confusion_matrix.iloc[1, 0]
|
|
427
|
+
d = confusion_matrix.iloc[1, 1]
|
|
428
|
+
|
|
429
|
+
numerator = (a * d) - (b * c)
|
|
430
|
+
denominator = np.sqrt(int(a + b) * int(c + d) * int(a + c) * int(b + d))
|
|
431
|
+
phi = numerator / denominator if denominator != 0 else np.nan
|
|
432
|
+
else:
|
|
433
|
+
phi = np.nan
|
|
434
|
+
|
|
435
|
+
try:
|
|
436
|
+
_, pval = scipy.stats.fisher_exact(confusion_matrix)
|
|
437
|
+
except:
|
|
438
|
+
pval = np.nan
|
|
439
|
+
|
|
440
|
+
return phi, pval
|
|
441
|
+
|
|
442
|
+
def _calculate_pairwise_correlation(
|
|
443
|
+
self,
|
|
444
|
+
col_a: int,
|
|
445
|
+
col_b: int,
|
|
446
|
+
include_pmids: bool = True
|
|
447
|
+
) -> tuple[int, int, float, float, dict[str,Any]]:
|
|
448
|
+
"""Compute the correlation between two specific HPO term columns."""
|
|
449
|
+
matrix = self.hpo_matrix.values
|
|
450
|
+
mask = (~np.isnan(matrix[:, col_a])) & (~np.isnan(matrix[:, col_b]))
|
|
451
|
+
col_a_values = matrix[mask, col_a]
|
|
452
|
+
col_b_values = matrix[mask, col_b]
|
|
453
|
+
|
|
454
|
+
count_11 = np.sum((col_a_values == 1) & (col_b_values == 1))
|
|
455
|
+
count_10 = np.sum((col_a_values == 1) & (col_b_values == 0))
|
|
456
|
+
count_01 = np.sum((col_a_values == 0) & (col_b_values == 1))
|
|
457
|
+
count_00 = np.sum((col_a_values == 0) & (col_b_values == 0))
|
|
458
|
+
total = len(col_a_values)
|
|
459
|
+
|
|
460
|
+
empty_counts: dict[str, Any] = {
|
|
461
|
+
"00": 0,
|
|
462
|
+
"01": 0,
|
|
463
|
+
"10": 0,
|
|
464
|
+
"11": 0,
|
|
465
|
+
"N": 0,
|
|
466
|
+
"n_pmid": np.nan,
|
|
467
|
+
"pmids": [],
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
if total == 0 or np.all(col_a_values == col_a_values[0]) or np.all(col_b_values == col_b_values[0]):
|
|
471
|
+
return (col_a, col_b, np.nan, np.nan, empty_counts)
|
|
472
|
+
|
|
473
|
+
try:
|
|
474
|
+
coef, p_val = self._calculate_stats(col_a_values, col_b_values)
|
|
475
|
+
if include_pmids:
|
|
476
|
+
individual_ids = self.individual_ids[mask]
|
|
477
|
+
all_pmids_series = self.dataset.get_pmids()
|
|
478
|
+
pmids_list = all_pmids_series.loc[individual_ids].to_numpy()
|
|
479
|
+
|
|
480
|
+
all_pmids = sorted(
|
|
481
|
+
{
|
|
482
|
+
str(pmid)
|
|
483
|
+
for pmids in pmids_list
|
|
484
|
+
if pmids is not None
|
|
485
|
+
for pmid in pmids
|
|
486
|
+
if pd.notna(pmid)
|
|
487
|
+
}
|
|
488
|
+
)
|
|
489
|
+
n_pmids = len(all_pmids)
|
|
490
|
+
else:
|
|
491
|
+
all_pmids = []
|
|
492
|
+
n_pmids = np.nan
|
|
493
|
+
|
|
494
|
+
return (col_a, col_b, coef, p_val, {
|
|
495
|
+
"00": count_00,
|
|
496
|
+
"01": count_01,
|
|
497
|
+
"10": count_10,
|
|
498
|
+
"11": count_11,
|
|
499
|
+
"N": total,
|
|
500
|
+
"n_pmid": n_pmids,
|
|
501
|
+
"pmids": all_pmids,
|
|
502
|
+
})
|
|
503
|
+
except Exception as e:
|
|
504
|
+
logger.error(
|
|
505
|
+
"Error calculating correlation for columns %d and %d: %s",
|
|
506
|
+
col_a,
|
|
507
|
+
col_b,
|
|
508
|
+
e,
|
|
509
|
+
)
|
|
510
|
+
return col_a, col_b, np.nan, np.nan, empty_counts
|
|
511
|
+
|
|
512
|
+
def compute_correlation_matrix(
    self,
    n_jobs: int = -1,
    include_pmids: bool = True
) -> CorrelationResult:
    """
    Compute pairwise correlations between HPO terms.

    Candidate pairs are the term pairs that are not excluded by the
    relationship mask and that share at least
    ``min_individuals_for_correlation_test`` individuals with a
    non-missing value for both terms. Each candidate is evaluated with
    ``_calculate_pairwise_correlation`` and the p-values are adjusted
    with the Benjamini-Hochberg procedure (``fdr_bh``).

    The resulting matrices and table are also stored on the analyzer as
    ``coef_df``, ``pval_df`` and ``correlation_results``, except when no
    candidate pair exists.

    Parameters
    ----------
    n_jobs : int, default=-1
        Number of parallel jobs. ``-1`` uses all available CPUs.

    include_pmids : bool, default=True
        If ``True``, aggregate PMIDs from contributing individuals.

    Returns
    -------
    CorrelationResult
        An object encapsulating the long-format correlation statistics,
        symmetric score/p-value matrices, and helper plotting methods.

    Raises
    ------
    ValueError
        If the HPO matrix contains no ``1`` or no ``0`` value at all.
    """
    x = self.hpo_matrix.to_numpy()

    has_one = np.any(x == 1)
    has_zero = np.any(x == 0)

    if not has_one or not has_zero:
        raise ValueError(
            "HPO matrix lacks sufficient variation for correlation analysis.\n"
            f"Detected values: "
            f"{'1 present, ' if has_one else 'no 1, '}"
            f"{'0 present' if has_zero else 'no 0'}.\n"
            "At least one observed (1) and one excluded (0) value are required.\n"
            "Please check your preprocessing (e.g., missing exclusion annotations)."
        )

    # valid_counts[i, j] = number of individuals with a value for both terms.
    mask = ~np.isnan(x)
    valid_counts = mask.T.astype(int) @ mask.astype(int)
    # Strict upper triangle: every unordered pair once, no self-pairs.
    valid_counts_sparse = triu(coo_matrix(valid_counts), k=1)
    pair_rows, pair_cols, counts = (
        valid_counts_sparse.row,
        valid_counts_sparse.col,
        valid_counts_sparse.data
    )

    # NaN in the relationship mask marks a pair that must not be tested.
    ontology_values = self.relationship_mask[pair_rows, pair_cols]
    ontology_candidate = ~np.isnan(ontology_values)

    n_pairs_after_ontology = np.sum(ontology_candidate)

    candidate_idx = np.where(ontology_candidate & (counts >= self.min_individuals_for_correlation_test))[0]

    rows_cand, cols_cand = pair_rows[candidate_idx], pair_cols[candidate_idx]
    pairs = list(zip(rows_cand, cols_cand))

    # Columns every (possibly empty) result table is guaranteed to have.
    base_columns = ["HPO_A", "HPO_B", "correlation", "p_value", "adj_p_value"]

    if len(pairs) == 0:
        logger.warning(
            "[Correlation Analysis Blocked]: No HPO term pairs passed the candidate pre-filtering selection.\n"
            "--------------------------------------------------------------------------------------------------\n"
            "DIAGNOSIS SUMMARY:\n"
            f"  - Pairs remaining after HPO Hierarchy Masking (excluding ancestors/descendants): {n_pairs_after_ontology}\n"
            f"  - Pairs dropped due to low sample size (min_individuals_for_correlation_test={self.min_individuals_for_correlation_test}): {n_pairs_after_ontology}\n"
            "SUGGESTION:\n"
            "  Try lowering `min_individuals_for_correlation_test` (e.g., to 10 or 5) when instantiating HPOCorrelationAnalyzer,\n"
            "  or check the sample size and missing value distribution in your Phenopackets queue.\n"
            "--------------------------------------------------------------------------------------------------"
        )
        empty_df = pd.DataFrame(columns=base_columns)
        empty_matrix = pd.DataFrame(index=self.hpo_terms, columns=self.hpo_terms, dtype=float)
        # Separate objects so the two matrices of the result are not aliases.
        return CorrelationResult(empty_df, empty_matrix, empty_matrix.copy(), self.label_mapping)

    results = Parallel(n_jobs=n_jobs)(
        delayed(self._calculate_pairwise_correlation)(i, j, include_pmids=include_pmids)
        for i, j in tqdm(pairs, desc="Calculating pairwise correlation")
    )

    coef_matrix = np.full((self.n_features, self.n_features), np.nan)
    pvalue_matrix = np.full((self.n_features, self.n_features), np.nan)

    table_rows = []
    for i, j, coef, pval, pair_counts in results:
        # Mirror each value so that both matrices are symmetric.
        coef_matrix[i, j] = coef
        coef_matrix[j, i] = coef
        pvalue_matrix[i, j] = pval
        pvalue_matrix[j, i] = pval

        hpo_a, hpo_b = self.hpo_terms[i], self.hpo_terms[j]
        if j > i and not np.isnan(coef):
            row_data = {
                "HPO_A": hpo_a,
                **({"HPO_A_label": self.label_mapping.get(hpo_a)} if self.label_mapping.get(hpo_a) else {}),
                "HPO_B": hpo_b,
                **({"HPO_B_label": self.label_mapping.get(hpo_b)} if self.label_mapping.get(hpo_b) else {}),
                "correlation": coef,
                "p_value": pval,
                "n(A:E/B:E)": pair_counts["00"],
                "n(A:E/B:O)": pair_counts["01"],
                "n(A:O/B:E)": pair_counts["10"],
                "n(A:O/B:O)": pair_counts["11"],
                "n_individuals": pair_counts["N"],
            }
            if include_pmids:
                row_data["n_pmids"] = pair_counts["n_pmid"]
                row_data["pmids"] = ";".join(pair_counts.get("pmids", []))
            table_rows.append(row_data)

    # Terms without a single computed coefficient are dropped from the matrices.
    valid_mask = ~(np.isnan(coef_matrix).all(axis=0))

    if not np.any(valid_mask):
        logger.warning(
            "[Correlation Analysis Empty]: Pairwise calculations finished, but NO valid statistical correlations were found.\n"
            "Possible reasons include:\n"
            "  - All calculated correlation coefficients returned NaN due to zero variance (constant terms).\n"
            "  - Perfect separation or overlapping annotations skewed the contingency tables.\n"
            "The resulting CorrelationResult matrices will contain entirely NaN values."
        )

    filtered_columns = self.hpo_terms[valid_mask]

    self.coef_df = pd.DataFrame(coef_matrix[np.ix_(valid_mask, valid_mask)], index=filtered_columns, columns=filtered_columns)
    self.pval_df = pd.DataFrame(pvalue_matrix[np.ix_(valid_mask, valid_mask)], index=filtered_columns, columns=filtered_columns)

    if table_rows:
        self.correlation_results = pd.DataFrame(table_rows)
        pvals = self.correlation_results["p_value"].to_numpy(dtype=float)
        # Adjust only the finite p-values: a single NaN would otherwise
        # propagate through the correction and wipe out every adjusted value.
        finite = np.isfinite(pvals)
        pvals_corrected = np.full(pvals.shape, np.nan)
        if finite.any():
            pvals_corrected[finite] = multipletests(pvals[finite], method="fdr_bh")[1]
        loc = int(self.correlation_results.columns.get_loc("p_value"))
        self.correlation_results.insert(loc + 1, "adj_p_value", pvals_corrected)
        self.correlation_results.sort_values(by="adj_p_value", ascending=True, inplace=True)
    else:
        # Same schema as the early-return path, so code that selects the
        # "correlation" or "adj_p_value" column also works on an empty result.
        self.correlation_results = pd.DataFrame(columns=base_columns)

    return CorrelationResult(
        correlation_results=self.correlation_results,
        coef_matrix=self.coef_df,
        pval_matrix=self.pval_df,
        label_mapping=self.label_mapping
    )
|