pylocuszoom 0.2.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pylocuszoom/__init__.py +52 -1
- pylocuszoom/backends/base.py +47 -0
- pylocuszoom/backends/bokeh_backend.py +323 -61
- pylocuszoom/backends/matplotlib_backend.py +133 -7
- pylocuszoom/backends/plotly_backend.py +423 -33
- pylocuszoom/colors.py +3 -1
- pylocuszoom/finemapping.py +0 -1
- pylocuszoom/gene_track.py +232 -23
- pylocuszoom/loaders.py +862 -0
- pylocuszoom/plotter.py +354 -245
- pylocuszoom/py.typed +0 -0
- pylocuszoom/recombination.py +4 -4
- pylocuszoom/schemas.py +395 -0
- {pylocuszoom-0.2.0.dist-info → pylocuszoom-0.5.0.dist-info}/METADATA +125 -31
- pylocuszoom-0.5.0.dist-info/RECORD +24 -0
- pylocuszoom-0.2.0.dist-info/RECORD +0 -21
- {pylocuszoom-0.2.0.dist-info → pylocuszoom-0.5.0.dist-info}/WHEEL +0 -0
- {pylocuszoom-0.2.0.dist-info → pylocuszoom-0.5.0.dist-info}/licenses/LICENSE.md +0 -0
pylocuszoom/loaders.py
ADDED
|
@@ -0,0 +1,862 @@
|
|
|
1
|
+
"""File format loaders for common GWAS, eQTL, and fine-mapping outputs.
|
|
2
|
+
|
|
3
|
+
Convenience functions to load data from standard file formats into
|
|
4
|
+
DataFrames ready for use with LocusZoomPlotter.
|
|
5
|
+
|
|
6
|
+
GWAS formats:
|
|
7
|
+
- PLINK (.assoc, .assoc.linear, .assoc.logistic, .qassoc)
|
|
8
|
+
- REGENIE (.regenie)
|
|
9
|
+
- BOLT-LMM (.stats)
|
|
10
|
+
- GEMMA (.assoc.txt)
|
|
11
|
+
- SAIGE (.txt)
|
|
12
|
+
- Generic TSV/CSV
|
|
13
|
+
|
|
14
|
+
eQTL formats:
|
|
15
|
+
- GTEx significant pairs format
|
|
16
|
+
- eQTL Catalogue format
|
|
17
|
+
- MatrixEQTL output
|
|
18
|
+
|
|
19
|
+
Fine-mapping formats:
|
|
20
|
+
- SuSiE (susieR output)
|
|
21
|
+
- FINEMAP (.snp output)
|
|
22
|
+
- CAVIAR (.set output)
|
|
23
|
+
|
|
24
|
+
Gene annotation formats:
|
|
25
|
+
- GTF/GFF3
|
|
26
|
+
- BED (4-column: chr, start, end, name)
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import Optional, Union
|
|
31
|
+
|
|
32
|
+
import pandas as pd
|
|
33
|
+
|
|
34
|
+
from .logging import logger
|
|
35
|
+
from .schemas import (
|
|
36
|
+
LoaderValidationError,
|
|
37
|
+
validate_eqtl_dataframe,
|
|
38
|
+
validate_finemapping_dataframe,
|
|
39
|
+
validate_genes_dataframe,
|
|
40
|
+
validate_gwas_dataframe,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
# =============================================================================
|
|
44
|
+
# GWAS Loaders
|
|
45
|
+
# =============================================================================
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def load_plink_assoc(
    filepath: Union[str, Path],
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
) -> pd.DataFrame:
    """Load PLINK association results (.assoc, .assoc.linear, .assoc.logistic, .qassoc).

    PLINK variants use several column-naming conventions; this detects the
    columns present and renames them to the standard output names.

    Args:
        filepath: Path to PLINK association file.
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".

    Returns:
        DataFrame with standardized column names.

    Example:
        >>> gwas_df = load_plink_assoc("results.assoc.linear")
        >>> fig = plotter.plot(gwas_df, chrom=1, start=1e6, end=2e6)
    """
    df = pd.read_csv(filepath, sep=r"\s+", comment="#")

    # For each standard output name, rename the first matching source column.
    # The chromosome column is kept as "chr" for reference.
    aliases = {
        pos_col: ["BP", "POS", "bp", "pos"],
        p_col: ["P", "P_BOLT_LMM", "p", "PVAL", "pval", "P_LINREG"],
        rs_col: ["SNP", "ID", "rsid", "RSID", "MarkerName", "variant_id"],
        "chr": ["CHR", "chr", "CHROM", "chrom", "#CHROM"],
    }
    renames = {}
    for target, sources in aliases.items():
        found = next((c for c in sources if c in df.columns), None)
        if found is not None:
            renames[found] = target

    df = df.rename(columns=renames)
    logger.debug(f"Loaded PLINK file with {len(df)} variants")

    # Raises if the required standardized columns are missing or invalid.
    validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)

    return df
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def load_regenie(
    filepath: Union[str, Path],
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
) -> pd.DataFrame:
    """Load REGENIE association results (.regenie).

    Args:
        filepath: Path to REGENIE results file.
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".

    Returns:
        DataFrame with standardized column names.

    Example:
        >>> gwas_df = load_regenie("results.regenie")
    """
    df = pd.read_csv(filepath, sep=r"\s+", comment="#")

    renames = {"GENPOS": pos_col, "ID": rs_col, "CHROM": "chr"}

    # REGENIE reports -log10(p) in LOG10P; convert back to a raw p-value.
    # Older/alternative outputs may carry a plain "P" column instead.
    if "LOG10P" in df.columns:
        df[p_col] = 10 ** (-df["LOG10P"])
    elif "P" in df.columns:
        renames["P"] = p_col

    df = df.rename(columns=renames)
    logger.debug(f"Loaded REGENIE file with {len(df)} variants")
    validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
    return df
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def load_bolt_lmm(
    filepath: Union[str, Path],
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
) -> pd.DataFrame:
    """Load BOLT-LMM association results (.stats).

    Args:
        filepath: Path to BOLT-LMM stats file.
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".

    Returns:
        DataFrame with standardized column names.

    Example:
        >>> gwas_df = load_bolt_lmm("results.stats")
    """
    df = pd.read_csv(filepath, sep="\t")

    renames = {"BP": pos_col, "SNP": rs_col, "CHR": "chr"}

    # Prefer the full-model p-value (P_BOLT_LMM) when present, otherwise
    # fall back to the infinitesimal-model column.
    if "P_BOLT_LMM" in df.columns:
        renames["P_BOLT_LMM"] = p_col
    else:
        renames["P_BOLT_LMM_INF"] = p_col

    df = df.rename(columns=renames)
    logger.debug(f"Loaded BOLT-LMM file with {len(df)} variants")
    validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
    return df
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def load_gemma(
    filepath: Union[str, Path],
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
) -> pd.DataFrame:
    """Load GEMMA association results (.assoc.txt).

    Args:
        filepath: Path to GEMMA association file.
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".

    Returns:
        DataFrame with standardized column names.

    Example:
        >>> gwas_df = load_gemma("output.assoc.txt")
    """
    df = pd.read_csv(filepath, sep="\t")

    renames = {"ps": pos_col, "rs": rs_col, "chr": "chr"}

    # GEMMA can emit Wald, LRT, and/or score-test p-values; map only the
    # first one that is present so the rename stays unambiguous.
    p_source = next(
        (c for c in ("p_wald", "p_lrt", "p_score") if c in df.columns), None
    )
    if p_source is not None:
        renames[p_source] = p_col

    df = df.rename(columns=renames)
    logger.debug(f"Loaded GEMMA file with {len(df)} variants")
    validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
    return df
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def load_saige(
    filepath: Union[str, Path],
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
) -> pd.DataFrame:
    """Load SAIGE association results.

    Args:
        filepath: Path to SAIGE results file.
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".

    Returns:
        DataFrame with standardized column names.

    Example:
        >>> gwas_df = load_saige("results.txt")
    """
    df = pd.read_csv(filepath, sep="\t")

    col_map = {
        "POS": pos_col,
        "MarkerID": rs_col,
        "CHR": "chr",
    }

    # SAIGE can output both "p.value" and "p.value.NA" (without SPA
    # adjustment). Mapping both to the same target would create duplicate
    # columns after rename, so map only the first one present, preferring
    # the primary "p.value" column.
    for candidate in ("p.value", "p.value.NA"):
        if candidate in df.columns:
            col_map[candidate] = p_col
            break

    df = df.rename(columns=col_map)
    logger.debug(f"Loaded SAIGE file with {len(df)} variants")
    validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
    return df
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def load_gwas_catalog(
    filepath: Union[str, Path],
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
) -> pd.DataFrame:
    """Load GWAS Catalog summary statistics format.

    Args:
        filepath: Path to GWAS Catalog file.
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".

    Returns:
        DataFrame with standardized column names.
    """
    df = pd.read_csv(filepath, sep="\t")

    # GWAS Catalog uses fixed lower-case column names; map them directly.
    df = df.rename(
        columns={
            "base_pair_location": pos_col,
            "variant_id": rs_col,
            "chromosome": "chr",
            "p_value": p_col,
        }
    )
    logger.debug(f"Loaded GWAS Catalog file with {len(df)} variants")
    validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
    return df
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
# =============================================================================
|
|
306
|
+
# eQTL Loaders
|
|
307
|
+
# =============================================================================
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def load_gtex_eqtl(
    filepath: Union[str, Path],
    gene: Optional[str] = None,
) -> pd.DataFrame:
    """Load GTEx eQTL significant pairs format.

    Args:
        filepath: Path to GTEx eQTL file (e.g., signif_variant_gene_pairs.txt.gz).
        gene: Optional gene to filter to (ENSG ID or gene symbol).

    Returns:
        DataFrame with columns: pos, p_value, gene, effect.

    Example:
        >>> eqtl_df = load_gtex_eqtl("GTEx_Analysis.signif_pairs.txt.gz", gene="BRCA1")
    """
    # GTEx files are often gzipped; pandas infers compression from the suffix.
    df = pd.read_csv(filepath, sep="\t")

    col_map = {}

    # Variant position (GTEx uses variant_id like chr1_12345_A_G_b38).
    if "variant_id" in df.columns:
        # Extract the absolute position from the variant_id.
        df["pos"] = df["variant_id"].str.split("_").str[1].astype(int)
    elif "pos" not in df.columns:
        if "POS" in df.columns:
            col_map["POS"] = "pos"
        elif "tss_distance" in df.columns:
            # tss_distance is a signed offset from the gene's TSS, not an
            # absolute genomic coordinate, so prefer POS above and warn
            # loudly when this is the only option left.
            logger.warning(
                "Using 'tss_distance' as position; values are offsets from "
                "the TSS, not absolute genomic coordinates."
            )
            col_map["tss_distance"] = "pos"

    # P-value: first matching candidate wins.
    for col in ["pval_nominal", "p_value", "pvalue", "P"]:
        if col in df.columns:
            col_map[col] = "p_value"
            break

    # Gene identifier.
    for col in ["gene_id", "gene_name", "phenotype_id"]:
        if col in df.columns:
            col_map[col] = "gene"
            break

    # Effect size (slope).
    for col in ["slope", "beta", "effect_size"]:
        if col in df.columns:
            col_map[col] = "effect"
            break

    df = df.rename(columns=col_map)

    # Filter to gene if specified; substring match covers both ENSG IDs
    # (with or without version suffix) and gene symbols.
    if gene is not None and "gene" in df.columns:
        mask = df["gene"].str.contains(gene, case=False, na=False)
        df = df[mask]

    logger.debug(f"Loaded GTEx eQTL file with {len(df)} associations")

    # Validate only when all required columns could be mapped.
    if "pos" in df.columns and "p_value" in df.columns and "gene" in df.columns:
        validate_eqtl_dataframe(df)

    return df
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def load_eqtl_catalogue(
    filepath: Union[str, Path],
    gene: Optional[str] = None,
) -> pd.DataFrame:
    """Load eQTL Catalogue format.

    Args:
        filepath: Path to eQTL Catalogue file.
        gene: Optional gene to filter to.

    Returns:
        DataFrame with columns: pos, p_value, gene, effect.
    """
    df = pd.read_csv(filepath, sep="\t")

    # eQTL Catalogue uses fixed column names; rename them to the standard set.
    df = df.rename(
        columns={
            "position": "pos",
            "pvalue": "p_value",
            "gene_id": "gene",
            "beta": "effect",
            "chromosome": "chr",
        }
    )

    # Optional case-insensitive substring filter on the gene column.
    if gene is not None and "gene" in df.columns:
        keep = df["gene"].str.contains(gene, case=False, na=False)
        df = df[keep]

    logger.debug(f"Loaded eQTL Catalogue file with {len(df)} associations")

    # Validate only when all required columns are present.
    if "pos" in df.columns and "p_value" in df.columns and "gene" in df.columns:
        validate_eqtl_dataframe(df)

    return df
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def load_matrixeqtl(
    filepath: Union[str, Path],
    gene: Optional[str] = None,
) -> pd.DataFrame:
    """Load MatrixEQTL output format.

    Args:
        filepath: Path to MatrixEQTL output file.
        gene: Optional gene to filter to.

    Returns:
        DataFrame with columns: pos, p_value, gene, effect.

    Note:
        MatrixEQTL output doesn't include position by default.
        You may need to merge with a SNP annotation file.
    """
    df = pd.read_csv(filepath, sep="\t")

    # MatrixEQTL column names vary slightly across versions ("p-value" vs
    # "pvalue"); both map to the standard "p_value".
    renames = {
        "SNP": "rs",
        "gene": "gene",
        "p-value": "p_value",
        "pvalue": "p_value",
        "beta": "effect",
        "t-stat": "t_stat",
    }
    df = df.rename(columns=renames)

    # Exact-match gene filter (MatrixEQTL stores one gene ID per row).
    if gene is not None and "gene" in df.columns:
        df = df[df["gene"] == gene]

    logger.debug(f"Loaded MatrixEQTL file with {len(df)} associations")
    return df
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
# =============================================================================
|
|
452
|
+
# Fine-mapping Loaders
|
|
453
|
+
# =============================================================================
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def load_susie(
    filepath: Union[str, Path],
    cs_col: str = "cs",
) -> pd.DataFrame:
    """Load SuSiE fine-mapping results.

    Supports both R susieR output (saved as TSV) and SuSiE-inf output.

    Args:
        filepath: Path to SuSiE results file.
        cs_col: Output column name for credible set. Default "cs".

    Returns:
        DataFrame with columns: pos, pip, cs.

    Example:
        >>> fm_df = load_susie("susie_results.tsv")
        >>> fig = plotter.plot_stacked([gwas_df], ..., finemapping_df=fm_df)
    """
    df = pd.read_csv(filepath, sep="\t")

    # For each standard output column, rename the first matching source column.
    aliases = {
        "pos": ["pos", "position", "BP", "bp", "POS"],
        "pip": ["pip", "PIP", "posterior_prob", "prob"],
        cs_col: ["cs", "CS", "credible_set", "cs_index", "L"],
        "rs": ["snp", "SNP", "variant_id", "rsid"],
    }
    renames = {}
    for target, sources in aliases.items():
        found = next((c for c in sources if c in df.columns), None)
        if found is not None:
            renames[found] = target
    df = df.rename(columns=renames)

    # SuSiE marks variants outside any credible set with NA or -1;
    # standardize both to 0.
    if cs_col in df.columns:
        df[cs_col] = df[cs_col].fillna(0).astype(int).clip(lower=0)

    logger.debug(f"Loaded SuSiE file with {len(df)} variants")

    if "pos" in df.columns and "pip" in df.columns:
        validate_finemapping_dataframe(df, cs_col=cs_col)

    return df
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
def load_finemap(
    filepath: Union[str, Path],
    cs_col: str = "cs",
) -> pd.DataFrame:
    """Load FINEMAP results (.snp output file).

    Args:
        filepath: Path to FINEMAP .snp output file.
        cs_col: Output column name for credible set. Default "cs".

    Returns:
        DataFrame with columns: pos, pip, cs.

    Example:
        >>> fm_df = load_finemap("results.snp")
    """
    df = pd.read_csv(filepath, sep=r"\s+")

    col_map = {
        "position": "pos",
        "prob": "pip",
        "rsid": "rs",
        "chromosome": "chr",
    }

    df = df.rename(columns=col_map)

    # FINEMAP doesn't directly output credible sets, so derive a 95% set:
    # rank variants by PIP and keep the smallest prefix whose cumulative
    # PIP reaches 0.95. A variant belongs to the set when the cumulative
    # PIP *before* it is still below 0.95 — this includes the variant that
    # crosses the threshold (a plain `cumsum <= 0.95` would leave the set
    # summing to less than 95%).
    if cs_col not in df.columns and "pip" in df.columns:
        df = df.sort_values("pip", ascending=False)
        cum_before = df["pip"].cumsum() - df["pip"]
        df[cs_col] = (cum_before < 0.95).astype(int)

    logger.debug(f"Loaded FINEMAP file with {len(df)} variants")

    if "pos" in df.columns and "pip" in df.columns:
        validate_finemapping_dataframe(df, cs_col=cs_col)

    return df
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
def load_caviar(
    filepath: Union[str, Path],
    cs_col: str = "cs",
) -> pd.DataFrame:
    """Load CAVIAR results (.set output file).

    Args:
        filepath: Path to CAVIAR output file.
        cs_col: Output column name for credible set. Default "cs".

    Returns:
        DataFrame with columns: pos, pip, cs.
    """
    # Assumes two whitespace-separated columns: SNP ID and posterior
    # probability — TODO confirm against the CAVIAR version in use (some
    # releases write probabilities to a separate *_post file).
    df = pd.read_csv(filepath, sep=r"\s+", header=None, names=["rs", "pip"])

    # CAVIAR doesn't include position - user needs to merge
    logger.warning(
        "CAVIAR output doesn't include positions. "
        "Merge with SNP annotation file to add 'pos' column."
    )

    # Derive a 95% credible set: rank variants by PIP and keep the smallest
    # prefix whose cumulative PIP reaches 0.95. A variant is in the set when
    # the cumulative PIP *before* it is below 0.95, so the variant that
    # crosses the threshold is included (matching load_finemap).
    df = df.sort_values("pip", ascending=False)
    cum_before = df["pip"].cumsum() - df["pip"]
    df[cs_col] = (cum_before < 0.95).astype(int)

    logger.debug(f"Loaded CAVIAR file with {len(df)} variants")

    # CAVIAR lacks a position column, so full validation isn't possible;
    # at least check the probabilities are well-formed.
    if "pip" in df.columns:
        if ((df["pip"] < 0) | (df["pip"] > 1)).any():
            raise LoaderValidationError("PIP values must be in range [0, 1]")

    return df
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
def load_polyfun(
    filepath: Union[str, Path],
    cs_col: str = "cs",
) -> pd.DataFrame:
    """Load PolyFun/SuSiE fine-mapping results.

    Args:
        filepath: Path to PolyFun output file.
        cs_col: Output column name for credible set. Default "cs".

    Returns:
        DataFrame with columns: pos, pip, cs.
    """
    df = pd.read_csv(filepath, sep=r"\s+")

    # PolyFun uses upper-case column names; rename to the standard set.
    df = df.rename(
        columns={
            "BP": "pos",
            "PIP": "pip",
            "SNP": "rs",
            "CHR": "chr",
            "CREDIBLE_SET": cs_col,
        }
    )

    # Variants outside any credible set come through as NA; standardize to 0.
    if cs_col in df.columns:
        df[cs_col] = df[cs_col].fillna(0).astype(int)

    logger.debug(f"Loaded PolyFun file with {len(df)} variants")

    if "pos" in df.columns and "pip" in df.columns:
        validate_finemapping_dataframe(df, cs_col=cs_col)

    return df
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
# =============================================================================
|
|
636
|
+
# Gene Annotation Loaders
|
|
637
|
+
# =============================================================================
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
def load_gtf(
    filepath: Union[str, Path],
    feature_type: str = "gene",
) -> pd.DataFrame:
    """Load gene annotations from GTF/GFF3 file.

    Args:
        filepath: Path to GTF or GFF3 file (can be gzipped).
        feature_type: Feature type to extract ("gene", "exon", "transcript").
            Default "gene".

    Returns:
        DataFrame with columns: chr, start, end, gene_name, strand.

    Example:
        >>> genes_df = load_gtf("genes.gtf", feature_type="gene")
        >>> exons_df = load_gtf("genes.gtf", feature_type="exon")
    """
    # GTF columns: seqname, source, feature, start, end, score, strand, frame, attributes
    df = pd.read_csv(
        filepath,
        sep="\t",
        comment="#",
        header=None,
        names=[
            "chr",
            "source",
            "feature",
            "start",
            "end",
            "score",
            "strand",
            "frame",
            "attributes",
        ],
    )

    # Filter to requested feature type
    df = df[df["feature"] == feature_type].copy()

    def _attr_value(attr: str) -> str:
        """Return the value of one attribute entry (GTF quoted or GFF3 key=value)."""
        if '"' in attr:
            return attr.split('"')[1]
        if "=" in attr:
            return attr.split("=", 1)[1]
        return ""  # malformed attribute (bare key) — don't raise

    def extract_gene_name(attrs: str) -> str:
        """Extract gene_name (preferred) or gene_id from GTF attributes.

        GTF conventionally lists gene_id before gene_name, so all
        attributes must be scanned before falling back to gene_id —
        returning on the first gene_id would hide the gene symbol.
        """
        gene_id = ""
        for attr in attrs.split(";"):
            attr = attr.strip()
            if attr.startswith("gene_name"):
                return _attr_value(attr)
            if attr.startswith("gene_id") and not gene_id:
                gene_id = _attr_value(attr)
        return gene_id

    df["gene_name"] = df["attributes"].apply(extract_gene_name)

    # Clean chromosome names ("chr1" -> "1")
    df["chr"] = df["chr"].astype(str).str.replace("chr", "", regex=False)

    # Select and return relevant columns
    result = df[["chr", "start", "end", "gene_name", "strand"]].copy()
    logger.debug(f"Loaded {len(result)} {feature_type} features from GTF")
    validate_genes_dataframe(result)
    return result
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
def load_bed(
    filepath: Union[str, Path],
    has_header: bool = False,
) -> pd.DataFrame:
    """Load gene annotations from BED file.

    Supports BED4+ format (chr, start, end, name, ...).

    Args:
        filepath: Path to BED file.
        has_header: Whether file has header row. Default False.

    Returns:
        DataFrame with columns: chr, start, end, gene_name.

    Example:
        >>> genes_df = load_bed("genes.bed")
    """
    header = 0 if has_header else None
    df = pd.read_csv(filepath, sep="\t", header=header)

    # Assign column names if no header. The first six columns follow BED
    # convention; any BED7+ extras get generic names instead of failing
    # the assignment with a length mismatch.
    if not has_header:
        n_cols = len(df.columns)
        bed_names = ["chr", "start", "end", "gene_name", "score", "strand"]
        df.columns = bed_names[:n_cols] + [
            f"col{i}" for i in range(len(bed_names), n_cols)
        ]

    # Standardize column names if a (UCSC-style) header was present
    col_map = {
        "chrom": "chr",
        "chromStart": "start",
        "chromEnd": "end",
        "name": "gene_name",
    }
    df = df.rename(columns=col_map)

    # Clean chromosome names ("chr1" -> "1")
    if "chr" in df.columns:
        df["chr"] = df["chr"].astype(str).str.replace("chr", "", regex=False)

    logger.debug(f"Loaded {len(df)} features from BED")

    # Validate only when the full gene schema is present (BED3 lacks a name).
    if all(col in df.columns for col in ["chr", "start", "end", "gene_name"]):
        validate_genes_dataframe(df)

    return df
|
|
756
|
+
|
|
757
|
+
|
|
758
|
+
def load_ensembl_genes(
    filepath: Union[str, Path],
) -> pd.DataFrame:
    """Load Ensembl BioMart gene export.

    Args:
        filepath: Path to BioMart export file (TSV).

    Returns:
        DataFrame with columns: chr, start, end, gene_name, strand.
    """
    df = pd.read_csv(filepath, sep="\t")

    # BioMart exports use either display labels or attribute names,
    # depending on how the export was configured; map both.
    renames = {
        "Chromosome/scaffold name": "chr",
        "Gene start (bp)": "start",
        "Gene end (bp)": "end",
        "Gene name": "gene_name",
        "Strand": "strand",
        "chromosome_name": "chr",
        "start_position": "start",
        "end_position": "end",
        "external_gene_name": "gene_name",
    }
    df = df.rename(columns=renames)

    # Ensembl encodes strand as 1/-1; normalize to "+"/"-" (already-normalized
    # values pass through unchanged).
    if "strand" in df.columns:
        strand_codes = {1: "+", -1: "-", "+": "+", "-": "-"}
        df["strand"] = df["strand"].map(strand_codes)

    logger.debug(f"Loaded {len(df)} genes from Ensembl export")

    if all(col in df.columns for col in ["chr", "start", "end", "gene_name"]):
        validate_genes_dataframe(df)

    return df
|
|
796
|
+
|
|
797
|
+
|
|
798
|
+
# =============================================================================
|
|
799
|
+
# Generic Loader
|
|
800
|
+
# =============================================================================
|
|
801
|
+
|
|
802
|
+
|
|
803
|
+
def load_gwas(
    filepath: Union[str, Path],
    format: Optional[str] = None,
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
    **kwargs,
) -> pd.DataFrame:
    """Load GWAS results with automatic format detection.

    Args:
        filepath: Path to GWAS results file.
        format: File format. If None, auto-detects from extension.
            Options: "plink", "regenie", "bolt", "gemma", "saige", "catalog".
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".
        **kwargs: Additional arguments passed to format-specific loader.

    Returns:
        DataFrame with standardized column names.

    Raises:
        ValueError: If `format` is not one of the supported options.

    Example:
        >>> # Auto-detect format
        >>> gwas_df = load_gwas("results.assoc.linear")
        >>>
        >>> # Explicit format
        >>> gwas_df = load_gwas("results.txt", format="regenie")
    """
    filepath = Path(filepath)
    name = filepath.name.lower()

    # Auto-detect format from filename. GEMMA's ".assoc.txt" must be
    # checked before the generic ".assoc" test, which would otherwise
    # misroute GEMMA files to the PLINK loader.
    if format is None:
        if name.endswith(".assoc.txt") or "gemma" in name:
            format = "gemma"
        elif ".assoc" in name or ".qassoc" in name:
            format = "plink"
        elif ".regenie" in name:
            format = "regenie"
        elif ".stats" in name:
            format = "bolt"
        elif "saige" in name:
            format = "saige"
        else:
            format = "plink"  # Default fallback

    loaders = {
        "plink": load_plink_assoc,
        "regenie": load_regenie,
        "bolt": load_bolt_lmm,
        "gemma": load_gemma,
        "saige": load_saige,
        "catalog": load_gwas_catalog,
    }

    if format not in loaders:
        raise ValueError(f"Unknown format '{format}'. Options: {list(loaders.keys())}")

    # Forward **kwargs so extra loader-specific options actually reach the
    # loader (they were previously documented but silently dropped).
    return loaders[format](
        filepath, pos_col=pos_col, p_col=p_col, rs_col=rs_col, **kwargs
    )
|