pylocuszoom 0.2.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pylocuszoom/loaders.py ADDED
@@ -0,0 +1,862 @@
"""File format loaders for common GWAS, eQTL, and fine-mapping outputs.

Convenience functions to load data from standard file formats into
DataFrames ready for use with LocusZoomPlotter.

GWAS formats:
- PLINK (.assoc, .assoc.linear, .assoc.logistic, .qassoc)
- REGENIE (.regenie)
- BOLT-LMM (.stats)
- GEMMA (.assoc.txt)
- SAIGE (.txt)
- Generic TSV/CSV

eQTL formats:
- GTEx significant pairs format
- eQTL Catalogue format
- MatrixEQTL output

Fine-mapping formats:
- SuSiE (susieR output)
- FINEMAP (.snp output)
- CAVIAR (.set output)

Gene annotation formats:
- GTF/GFF3
- BED (BED4+: chr, start, end, name, ...)
"""

from pathlib import Path
from typing import Optional, Union

import pandas as pd

from .logging import logger
from .schemas import (
    LoaderValidationError,
    validate_eqtl_dataframe,
    validate_finemapping_dataframe,
    validate_genes_dataframe,
    validate_gwas_dataframe,
)

# =============================================================================
# GWAS Loaders
# =============================================================================


def load_plink_assoc(
    filepath: Union[str, Path],
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
) -> pd.DataFrame:
    """Load PLINK association results (.assoc, .assoc.linear, .assoc.logistic, .qassoc).

    Automatically detects the PLINK format variant and maps columns to standard names.

    Args:
        filepath: Path to PLINK association file.
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".

    Returns:
        DataFrame with standardized column names.

    Example:
        >>> gwas_df = load_plink_assoc("results.assoc.linear")
        >>> fig = plotter.plot(gwas_df, chrom=1, start=1e6, end=2e6)
    """
    df = pd.read_csv(filepath, sep=r"\s+", comment="#")

    # Standardize column names (PLINK uses various conventions)
    col_map = {}

    # Position columns
    for col in ["BP", "POS", "bp", "pos"]:
        if col in df.columns:
            col_map[col] = pos_col
            break

    # P-value columns
    for col in ["P", "P_BOLT_LMM", "p", "PVAL", "pval", "P_LINREG"]:
        if col in df.columns:
            col_map[col] = p_col
            break

    # SNP ID columns
    for col in ["SNP", "ID", "rsid", "RSID", "MarkerName", "variant_id"]:
        if col in df.columns:
            col_map[col] = rs_col
            break

    # Chromosome column (keep as "chr" for reference)
    for col in ["CHR", "chr", "CHROM", "chrom", "#CHROM"]:
        if col in df.columns:
            col_map[col] = "chr"
            break

    df = df.rename(columns=col_map)
    logger.debug(f"Loaded PLINK file with {len(df)} variants")

    # Validate output
    validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)

    return df


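# Editorial sketch (illustration only, not part of loaders.py): what the
# column mapping above does to a hypothetical PLINK .assoc header, expressed
# as the equivalent plain-pandas rename.
import pandas as pd

plink_demo = pd.DataFrame(
    {"CHR": [1], "SNP": ["rs123"], "BP": [1_500_000], "P": [2.5e-3]}
)
# BP -> "ps", P -> "p_wald", SNP -> "rs", CHR -> "chr"
standardized = plink_demo.rename(
    columns={"BP": "ps", "P": "p_wald", "SNP": "rs", "CHR": "chr"}
)
assert {"ps", "p_wald", "rs", "chr"} <= set(standardized.columns)

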
def load_regenie(
    filepath: Union[str, Path],
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
) -> pd.DataFrame:
    """Load REGENIE association results (.regenie).

    Args:
        filepath: Path to REGENIE results file.
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".

    Returns:
        DataFrame with standardized column names.

    Example:
        >>> gwas_df = load_regenie("results.regenie")
    """
    df = pd.read_csv(filepath, sep=r"\s+", comment="#")

    col_map = {
        "GENPOS": pos_col,
        "ID": rs_col,
        "CHROM": "chr",
    }

    # REGENIE reports LOG10P (-log10 of the p-value); convert back to p
    if "LOG10P" in df.columns:
        df[p_col] = 10 ** (-df["LOG10P"])
    elif "P" in df.columns:
        col_map["P"] = p_col

    df = df.rename(columns=col_map)
    logger.debug(f"Loaded REGENIE file with {len(df)} variants")
    validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
    return df


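# Editorial sketch (illustration only, not part of loaders.py): the LOG10P
# conversion above is plain arithmetic. For example, LOG10P = 7.3 recovers
# p = 10 ** -7.3 ≈ 5.01e-8, i.e. genome-wide significance.
import pandas as pd

regenie_demo = pd.DataFrame({"LOG10P": [7.3, 1.0]})
p_values = 10 ** (-regenie_demo["LOG10P"])
assert abs(p_values.iloc[0] - 5.01e-8) < 1e-10
assert abs(p_values.iloc[1] - 0.1) < 1e-12

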
def load_bolt_lmm(
    filepath: Union[str, Path],
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
) -> pd.DataFrame:
    """Load BOLT-LMM association results (.stats).

    Args:
        filepath: Path to BOLT-LMM stats file.
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".

    Returns:
        DataFrame with standardized column names.

    Example:
        >>> gwas_df = load_bolt_lmm("results.stats")
    """
    df = pd.read_csv(filepath, sep="\t")

    col_map = {
        "BP": pos_col,
        "SNP": rs_col,
        "CHR": "chr",
    }

    # Prefer the full-model p-value (P_BOLT_LMM) when present; otherwise fall
    # back to the infinitesimal-model p-value (P_BOLT_LMM_INF).
    if "P_BOLT_LMM" in df.columns:
        col_map["P_BOLT_LMM"] = p_col
    else:
        col_map["P_BOLT_LMM_INF"] = p_col

    df = df.rename(columns=col_map)
    logger.debug(f"Loaded BOLT-LMM file with {len(df)} variants")
    validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
    return df


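# Editorial sketch (illustration only, not part of loaders.py): why the
# BOLT-LMM mapping is order-dependent. When both p-value columns exist, only
# the full-model column is renamed to the target, so no duplicate column name
# is produced.
import pandas as pd

bolt_demo = pd.DataFrame({"P_BOLT_LMM_INF": [1e-6], "P_BOLT_LMM": [2e-6]})
col_map = {"P_BOLT_LMM_INF": "p_wald"}
if "P_BOLT_LMM" in bolt_demo.columns:
    col_map = {"P_BOLT_LMM": "p_wald"}  # full model takes precedence
assert bolt_demo.rename(columns=col_map).columns.tolist() == [
    "P_BOLT_LMM_INF",
    "p_wald",
]

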
def load_gemma(
    filepath: Union[str, Path],
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
) -> pd.DataFrame:
    """Load GEMMA association results (.assoc.txt).

    Args:
        filepath: Path to GEMMA association file.
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".

    Returns:
        DataFrame with standardized column names.

    Example:
        >>> gwas_df = load_gemma("output.assoc.txt")
    """
    df = pd.read_csv(filepath, sep="\t")

    col_map = {
        "ps": pos_col,
        "rs": rs_col,
    }

    # GEMMA emits one of several p-value columns depending on the test
    # requested; map only the first one present, in order of preference.
    for p in ["p_wald", "p_lrt", "p_score"]:
        if p in df.columns:
            col_map[p] = p_col
            break

    df = df.rename(columns=col_map)
    logger.debug(f"Loaded GEMMA file with {len(df)} variants")
    validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
    return df


def load_saige(
    filepath: Union[str, Path],
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
) -> pd.DataFrame:
    """Load SAIGE association results.

    Args:
        filepath: Path to SAIGE results file.
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".

    Returns:
        DataFrame with standardized column names.

    Example:
        >>> gwas_df = load_saige("results.txt")
    """
    df = pd.read_csv(filepath, sep="\t")

    col_map = {
        "POS": pos_col,
        "MarkerID": rs_col,
        "CHR": "chr",
    }

    # SAIGE reports the SPA-adjusted p-value as "p.value"; "p.value.NA" is the
    # unadjusted value. Map only one of them to p_col, otherwise the rename
    # would create duplicate columns.
    if "p.value" in df.columns:
        col_map["p.value"] = p_col
    elif "p.value.NA" in df.columns:
        col_map["p.value.NA"] = p_col

    df = df.rename(columns=col_map)
    logger.debug(f"Loaded SAIGE file with {len(df)} variants")
    validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
    return df


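# Editorial sketch (illustration only, not part of loaders.py): renaming both
# "p.value" and "p.value.NA" to the same target would yield two columns with
# identical names, which is why the loader above maps only one of them.
import pandas as pd

saige_demo = pd.DataFrame({"p.value": [1e-5], "p.value.NA": [2e-5]})
target = "p.value" if "p.value" in saige_demo.columns else "p.value.NA"
renamed = saige_demo.rename(columns={target: "p_wald"})
assert renamed.columns.tolist() == ["p_wald", "p.value.NA"]

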
def load_gwas_catalog(
    filepath: Union[str, Path],
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
) -> pd.DataFrame:
    """Load GWAS Catalog summary statistics format.

    Args:
        filepath: Path to GWAS Catalog file.
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".

    Returns:
        DataFrame with standardized column names.
    """
    df = pd.read_csv(filepath, sep="\t")

    col_map = {
        "base_pair_location": pos_col,
        "variant_id": rs_col,
        "chromosome": "chr",
        "p_value": p_col,
    }

    df = df.rename(columns=col_map)
    logger.debug(f"Loaded GWAS Catalog file with {len(df)} variants")
    validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
    return df


# =============================================================================
# eQTL Loaders
# =============================================================================


def load_gtex_eqtl(
    filepath: Union[str, Path],
    gene: Optional[str] = None,
) -> pd.DataFrame:
    """Load GTEx eQTL significant pairs format.

    Args:
        filepath: Path to GTEx eQTL file (e.g., signif_variant_gene_pairs.txt.gz).
        gene: Optional gene to filter to (ENSG ID or gene symbol).

    Returns:
        DataFrame with columns: pos, p_value, gene, effect.

    Example:
        >>> eqtl_df = load_gtex_eqtl("GTEx_Analysis.signif_pairs.txt.gz", gene="BRCA1")
    """
    # GTEx files are often gzipped; pandas decompresses transparently
    df = pd.read_csv(filepath, sep="\t")

    # Map GTEx columns to standard format
    col_map = {}

    # Variant position (GTEx uses variant_id like chr1_12345_A_G_b38)
    if "variant_id" in df.columns:
        # Extract position from the second "_"-separated field of variant_id
        df["pos"] = df["variant_id"].str.split("_").str[1].astype(int)
    elif "pos" not in df.columns and "POS" in df.columns:
        # Note: tss_distance is an offset from the gene's TSS, not a genomic
        # coordinate, so it is deliberately not used as a position fallback.
        col_map["POS"] = "pos"

    # P-value
    for col in ["pval_nominal", "p_value", "pvalue", "P"]:
        if col in df.columns:
            col_map[col] = "p_value"
            break

    # Gene
    for col in ["gene_id", "gene_name", "phenotype_id"]:
        if col in df.columns:
            col_map[col] = "gene"
            break

    # Effect size (slope)
    for col in ["slope", "beta", "effect_size"]:
        if col in df.columns:
            col_map[col] = "effect"
            break

    df = df.rename(columns=col_map)

    # Filter to gene if specified; matches either ENSG ID or gene symbol as a
    # literal substring, case-insensitively
    if gene is not None and "gene" in df.columns:
        mask = df["gene"].str.contains(gene, case=False, na=False, regex=False)
        df = df[mask]

    logger.debug(f"Loaded GTEx eQTL file with {len(df)} associations")

    # Validate if required columns present
    if "pos" in df.columns and "p_value" in df.columns and "gene" in df.columns:
        validate_eqtl_dataframe(df)

    return df


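# Editorial sketch (illustration only, not part of loaders.py): GTEx encodes
# the locus in variant_id as chrom_pos_ref_alt_build, so the genomic position
# is the second "_"-separated field.
import pandas as pd

gtex_demo = pd.DataFrame({"variant_id": ["chr1_1500000_A_G_b38"]})
pos = gtex_demo["variant_id"].str.split("_").str[1].astype(int)
assert pos.iloc[0] == 1_500_000

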
def load_eqtl_catalogue(
    filepath: Union[str, Path],
    gene: Optional[str] = None,
) -> pd.DataFrame:
    """Load eQTL Catalogue format.

    Args:
        filepath: Path to eQTL Catalogue file.
        gene: Optional gene to filter to.

    Returns:
        DataFrame with columns: pos, p_value, gene, effect.
    """
    df = pd.read_csv(filepath, sep="\t")

    col_map = {
        "position": "pos",
        "pvalue": "p_value",
        "gene_id": "gene",
        "beta": "effect",
        "chromosome": "chr",
    }

    df = df.rename(columns=col_map)

    # Literal substring match, consistent with load_gtex_eqtl
    if gene is not None and "gene" in df.columns:
        mask = df["gene"].str.contains(gene, case=False, na=False, regex=False)
        df = df[mask]

    logger.debug(f"Loaded eQTL Catalogue file with {len(df)} associations")

    if "pos" in df.columns and "p_value" in df.columns and "gene" in df.columns:
        validate_eqtl_dataframe(df)

    return df


def load_matrixeqtl(
    filepath: Union[str, Path],
    gene: Optional[str] = None,
) -> pd.DataFrame:
    """Load MatrixEQTL output format.

    Args:
        filepath: Path to MatrixEQTL output file.
        gene: Optional gene to filter to.

    Returns:
        DataFrame with columns: rs, gene, p_value, effect (no genomic
        position; see Note).

    Note:
        MatrixEQTL output doesn't include position by default.
        You may need to merge with a SNP annotation file.
    """
    df = pd.read_csv(filepath, sep="\t")

    col_map = {
        "SNP": "rs",
        "gene": "gene",
        "p-value": "p_value",
        "pvalue": "p_value",
        "beta": "effect",
        "t-stat": "t_stat",
    }

    df = df.rename(columns=col_map)

    if gene is not None and "gene" in df.columns:
        df = df[df["gene"] == gene]

    logger.debug(f"Loaded MatrixEQTL file with {len(df)} associations")
    return df


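# Editorial sketch (illustration only, not part of loaders.py): MatrixEQTL
# output carries no genomic coordinates, so join on the SNP ID against any
# annotation table that has positions (column names here are hypothetical).
import pandas as pd

mx_demo = pd.DataFrame({"rs": ["rs123"], "gene": ["GENE1"], "p_value": [1e-4]})
snp_annot = pd.DataFrame({"rs": ["rs123"], "pos": [1_500_000]})
merged = mx_demo.merge(snp_annot, on="rs", how="left")
assert "pos" in merged.columns

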
# =============================================================================
# Fine-mapping Loaders
# =============================================================================


def load_susie(
    filepath: Union[str, Path],
    cs_col: str = "cs",
) -> pd.DataFrame:
    """Load SuSiE fine-mapping results.

    Supports both R susieR output (saved as TSV) and SuSiE-inf output.

    Args:
        filepath: Path to SuSiE results file.
        cs_col: Output column name for credible set. Default "cs".

    Returns:
        DataFrame with columns: pos, pip, cs.

    Example:
        >>> fm_df = load_susie("susie_results.tsv")
        >>> fig = plotter.plot_stacked([gwas_df], ..., finemapping_df=fm_df)
    """
    df = pd.read_csv(filepath, sep="\t")

    col_map = {}

    # Position
    for col in ["pos", "position", "BP", "bp", "POS"]:
        if col in df.columns:
            col_map[col] = "pos"
            break

    # PIP (posterior inclusion probability)
    for col in ["pip", "PIP", "posterior_prob", "prob"]:
        if col in df.columns:
            col_map[col] = "pip"
            break

    # Credible set
    for col in ["cs", "CS", "credible_set", "cs_index", "L"]:
        if col in df.columns:
            col_map[col] = cs_col
            break

    # SNP ID
    for col in ["snp", "SNP", "variant_id", "rsid"]:
        if col in df.columns:
            col_map[col] = "rs"
            break

    df = df.rename(columns=col_map)

    # SuSiE uses -1 or NA for variants not in a credible set; standardize to 0
    if cs_col in df.columns:
        df[cs_col] = df[cs_col].fillna(0).astype(int)
        df.loc[df[cs_col] < 0, cs_col] = 0

    logger.debug(f"Loaded SuSiE file with {len(df)} variants")

    if "pos" in df.columns and "pip" in df.columns:
        validate_finemapping_dataframe(df, cs_col=cs_col)

    return df


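# Editorial sketch (illustration only, not part of loaders.py): the
# credible-set cleanup above normalizes susieR's two "not in a set" markers
# (NA and -1) to 0, so downstream code can treat cs == 0 uniformly.
import pandas as pd

susie_demo = pd.DataFrame({"cs": [1.0, None, -1.0, 2.0]})
susie_demo["cs"] = susie_demo["cs"].fillna(0).astype(int)
susie_demo.loc[susie_demo["cs"] < 0, "cs"] = 0
assert susie_demo["cs"].tolist() == [1, 0, 0, 2]

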
def load_finemap(
    filepath: Union[str, Path],
    cs_col: str = "cs",
) -> pd.DataFrame:
    """Load FINEMAP results (.snp output file).

    Args:
        filepath: Path to FINEMAP .snp output file.
        cs_col: Output column name for credible set. Default "cs".

    Returns:
        DataFrame with columns: pos, pip, cs.

    Example:
        >>> fm_df = load_finemap("results.snp")
    """
    df = pd.read_csv(filepath, sep=r"\s+")

    col_map = {
        "position": "pos",
        "prob": "pip",
        "rsid": "rs",
        "chromosome": "chr",
    }

    df = df.rename(columns=col_map)

    # FINEMAP doesn't directly output credible sets, so approximate a 95%
    # credible set: take variants in decreasing-PIP order while the running
    # sum of PIP stays at or below 0.95. (The variant that crosses the
    # threshold is excluded, so the set can sum to slightly under 95%.)
    # The returned frame stays sorted by PIP rather than position.
    if cs_col not in df.columns and "pip" in df.columns:
        df = df.sort_values("pip", ascending=False)
        df["cumsum_pip"] = df["pip"].cumsum()
        df[cs_col] = (df["cumsum_pip"] <= 0.95).astype(int)
        df = df.drop(columns=["cumsum_pip"])

    logger.debug(f"Loaded FINEMAP file with {len(df)} variants")

    if "pos" in df.columns and "pip" in df.columns:
        validate_finemapping_dataframe(df, cs_col=cs_col)

    return df


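# Editorial sketch (illustration only, not part of loaders.py): the
# cumulative-PIP rule above on toy numbers. Sorted PIPs 0.6, 0.3, 0.08, 0.02
# give running sums 0.6, 0.9, 0.98, 1.0, so cumsum <= 0.95 keeps the first
# two variants in the approximate 95% credible set.
import pandas as pd

fm_demo = pd.DataFrame({"pip": [0.6, 0.3, 0.08, 0.02]})
fm_demo["cs"] = (fm_demo["pip"].cumsum() <= 0.95).astype(int)
assert fm_demo["cs"].tolist() == [1, 1, 0, 0]

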
def load_caviar(
    filepath: Union[str, Path],
    cs_col: str = "cs",
) -> pd.DataFrame:
    """Load CAVIAR results (.set output file).

    Args:
        filepath: Path to CAVIAR output file.
        cs_col: Output column name for credible set. Default "cs".

    Returns:
        DataFrame with columns: rs, pip, cs (no position; see warning below).
    """
    # Expected layout: two whitespace-separated columns with no header row,
    # SNP ID and causal posterior probability
    df = pd.read_csv(filepath, sep=r"\s+", header=None, names=["rs", "pip"])

    # CAVIAR output doesn't include positions - user needs to merge
    logger.warning(
        "CAVIAR output doesn't include positions. "
        "Merge with a SNP annotation file to add a 'pos' column."
    )

    # Approximate a 95% credible set from the cumulative PIP, as in load_finemap
    df = df.sort_values("pip", ascending=False)
    df["cumsum_pip"] = df["pip"].cumsum()
    df[cs_col] = (df["cumsum_pip"] <= 0.95).astype(int)
    df = df.drop(columns=["cumsum_pip"])

    logger.debug(f"Loaded CAVIAR file with {len(df)} variants")

    # Without a pos column the full fine-mapping validation can't run;
    # check the PIP range directly instead
    if ((df["pip"] < 0) | (df["pip"] > 1)).any():
        raise LoaderValidationError("PIP values must be in range [0, 1]")

    return df


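# Editorial sketch (illustration only, not part of loaders.py): attaching
# positions to CAVIAR output from a PLINK .bim-style table (column names
# hypothetical but conventional).
import pandas as pd

caviar_demo = pd.DataFrame({"rs": ["rs123", "rs456"], "pip": [0.7, 0.2]})
bim = pd.DataFrame({"rs": ["rs123", "rs456"], "pos": [1_500_000, 1_501_000]})
caviar_demo = caviar_demo.merge(bim[["rs", "pos"]], on="rs", how="left")
assert caviar_demo["pos"].notna().all()

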
def load_polyfun(
    filepath: Union[str, Path],
    cs_col: str = "cs",
) -> pd.DataFrame:
    """Load PolyFun/SuSiE fine-mapping results.

    Args:
        filepath: Path to PolyFun output file.
        cs_col: Output column name for credible set. Default "cs".

    Returns:
        DataFrame with columns: pos, pip, cs.
    """
    df = pd.read_csv(filepath, sep=r"\s+")

    col_map = {
        "BP": "pos",
        "PIP": "pip",
        "SNP": "rs",
        "CHR": "chr",
        "CREDIBLE_SET": cs_col,
    }

    df = df.rename(columns=col_map)

    if cs_col in df.columns:
        df[cs_col] = df[cs_col].fillna(0).astype(int)

    logger.debug(f"Loaded PolyFun file with {len(df)} variants")

    if "pos" in df.columns and "pip" in df.columns:
        validate_finemapping_dataframe(df, cs_col=cs_col)

    return df


# =============================================================================
# Gene Annotation Loaders
# =============================================================================


def load_gtf(
    filepath: Union[str, Path],
    feature_type: str = "gene",
) -> pd.DataFrame:
    """Load gene annotations from GTF/GFF3 file.

    Args:
        filepath: Path to GTF or GFF3 file (can be gzipped).
        feature_type: Feature type to extract ("gene", "exon", "transcript").
            Default "gene".

    Returns:
        DataFrame with columns: chr, start, end, gene_name, strand.

    Example:
        >>> genes_df = load_gtf("genes.gtf", feature_type="gene")
        >>> exons_df = load_gtf("genes.gtf", feature_type="exon")
    """
    # GTF columns: seqname, source, feature, start, end, score, strand, frame, attributes
    df = pd.read_csv(
        filepath,
        sep="\t",
        comment="#",
        header=None,
        names=[
            "chr",
            "source",
            "feature",
            "start",
            "end",
            "score",
            "strand",
            "frame",
            "attributes",
        ],
    )

    # Filter to requested feature type
    df = df[df["feature"] == feature_type].copy()

    # Parse gene_name from attributes
    def extract_gene_name(attrs: str) -> str:
        """Extract gene_name (preferred), Name, or gene_id from attributes."""
        gene_id = ""
        for attr in attrs.split(";"):
            attr = attr.strip()
            if attr.startswith("gene_name"):
                # GTF style: gene_name "BRCA1"; GFF3 style: gene_name=BRCA1
                return attr.split('"')[1] if '"' in attr else attr.split("=", 1)[1]
            if attr.startswith("Name="):
                # Ensembl GFF3 records carry the symbol as Name=BRCA1
                return attr.split("=", 1)[1]
            if attr.startswith("gene_id") and not gene_id:
                gene_id = attr.split('"')[1] if '"' in attr else attr.split("=", 1)[1]
        # Fall back to gene_id only when no symbol attribute was found;
        # gene_id usually precedes gene_name in GTF, so returning it eagerly
        # would shadow the human-readable symbol.
        return gene_id

    df["gene_name"] = df["attributes"].apply(extract_gene_name)

    # Clean chromosome names
    df["chr"] = df["chr"].astype(str).str.replace("chr", "", regex=False)

    # Select and return relevant columns
    result = df[["chr", "start", "end", "gene_name", "strand"]].copy()
    logger.debug(f"Loaded {len(result)} {feature_type} features from GTF")
    validate_genes_dataframe(result)
    return result


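# Editorial sketch (illustration only, not part of loaders.py): the two
# attribute styles the parser above accepts, and why the symbol attributes
# take precedence over gene_id.
def _demo_extract(attrs: str) -> str:
    for attr in attrs.split(";"):
        attr = attr.strip()
        if attr.startswith("gene_name"):
            return attr.split('"')[1] if '"' in attr else attr.split("=", 1)[1]
        if attr.startswith("Name="):
            return attr.split("=", 1)[1]
    return ""

assert _demo_extract('gene_id "ENSG00000012048"; gene_name "BRCA1";') == "BRCA1"
assert _demo_extract("ID=gene:ENSG00000012048;Name=BRCA1") == "BRCA1"

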
def load_bed(
    filepath: Union[str, Path],
    has_header: bool = False,
) -> pd.DataFrame:
    """Load gene annotations from BED file.

    Supports BED4+ format (chr, start, end, name, ...).

    Args:
        filepath: Path to BED file.
        has_header: Whether file has header row. Default False.

    Returns:
        DataFrame with columns: chr, start, end, gene_name.

    Example:
        >>> genes_df = load_bed("genes.bed")
    """
    header = 0 if has_header else None
    df = pd.read_csv(filepath, sep="\t", header=header)

    # Assign column names if no header; fields beyond the standard six get
    # generated positional names so BED6+ files load without a length-mismatch
    # error
    if not has_header:
        n_cols = len(df.columns)
        col_names = ["chr", "start", "end", "gene_name", "score", "strand"]
        col_names += [f"col{i}" for i in range(len(col_names), n_cols)]
        df.columns = col_names[:n_cols]

    # Standardize column names if header was present
    col_map = {
        "chrom": "chr",
        "chromStart": "start",
        "chromEnd": "end",
        "name": "gene_name",
    }
    df = df.rename(columns=col_map)

    # Clean chromosome names
    if "chr" in df.columns:
        df["chr"] = df["chr"].astype(str).str.replace("chr", "", regex=False)

    logger.debug(f"Loaded {len(df)} features from BED")

    if all(col in df.columns for col in ["chr", "start", "end", "gene_name"]):
        validate_genes_dataframe(df)

    return df


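# Editorial sketch (illustration only, not part of loaders.py): labeling a
# headerless BED4 row and stripping the "chr" prefix, as the loader does.
import pandas as pd

bed_demo = pd.DataFrame([["chr1", 1_400_000, 1_600_000, "GENE1"]])
bed_demo.columns = ["chr", "start", "end", "gene_name"][: len(bed_demo.columns)]
bed_demo["chr"] = bed_demo["chr"].astype(str).str.replace("chr", "", regex=False)
assert bed_demo.loc[0, "chr"] == "1"

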
def load_ensembl_genes(
    filepath: Union[str, Path],
) -> pd.DataFrame:
    """Load Ensembl BioMart gene export.

    Args:
        filepath: Path to BioMart export file (TSV).

    Returns:
        DataFrame with columns: chr, start, end, gene_name, strand.
    """
    df = pd.read_csv(filepath, sep="\t")

    col_map = {
        "Chromosome/scaffold name": "chr",
        "Gene start (bp)": "start",
        "Gene end (bp)": "end",
        "Gene name": "gene_name",
        "Strand": "strand",
        # Alternative (attribute-name) column headers
        "chromosome_name": "chr",
        "start_position": "start",
        "end_position": "end",
        "external_gene_name": "gene_name",
    }

    df = df.rename(columns=col_map)

    # Convert strand encoding (Ensembl uses 1/-1) to +/-
    if "strand" in df.columns:
        df["strand"] = df["strand"].map({1: "+", -1: "-", "+": "+", "-": "-"})

    logger.debug(f"Loaded {len(df)} genes from Ensembl export")

    if all(col in df.columns for col in ["chr", "start", "end", "gene_name"]):
        validate_genes_dataframe(df)

    return df


# =============================================================================
# Generic Loader
# =============================================================================


def load_gwas(
    filepath: Union[str, Path],
    format: Optional[str] = None,
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
    **kwargs,
) -> pd.DataFrame:
    """Load GWAS results with automatic format detection.

    Args:
        filepath: Path to GWAS results file.
        format: File format. If None, auto-detects from the filename.
            Options: "plink", "regenie", "bolt", "gemma", "saige", "catalog".
            ("catalog" is never auto-detected; request it explicitly.)
        pos_col: Output column name for position. Default "ps".
        p_col: Output column name for p-value. Default "p_wald".
        rs_col: Output column name for SNP ID. Default "rs".
        **kwargs: Additional arguments passed to the format-specific loader.

    Returns:
        DataFrame with standardized column names.

    Example:
        >>> # Auto-detect format
        >>> gwas_df = load_gwas("results.assoc.linear")
        >>>
        >>> # Explicit format
        >>> gwas_df = load_gwas("results.txt", format="regenie")
    """
    filepath = Path(filepath)
    name = filepath.name.lower()

    # Auto-detect format from the filename. Specific patterns are tested
    # before generic ones: a GEMMA ".assoc.txt" name also contains ".assoc",
    # so it must be checked ahead of the PLINK pattern.
    if format is None:
        if ".regenie" in name:
            format = "regenie"
        elif ".stats" in name:
            format = "bolt"
        elif "gemma" in name or name.endswith(".assoc.txt"):
            format = "gemma"
        elif ".assoc" in name or ".qassoc" in name:
            format = "plink"
        elif "saige" in name:
            format = "saige"
        else:
            format = "plink"  # Default fallback

    loaders = {
        "plink": load_plink_assoc,
        "regenie": load_regenie,
        "bolt": load_bolt_lmm,
        "gemma": load_gemma,
        "saige": load_saige,
        "catalog": load_gwas_catalog,
    }

    if format not in loaders:
        raise ValueError(f"Unknown format '{format}'. Options: {list(loaders.keys())}")

    return loaders[format](filepath, pos_col=pos_col, p_col=p_col, rs_col=rs_col, **kwargs)
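

# Editorial sketch (illustration only, not part of loaders.py): the detection
# order matters because a GEMMA ".assoc.txt" name also contains ".assoc";
# testing the specific pattern first keeps GEMMA files out of the PLINK branch.
def _demo_detect(name: str) -> str:
    name = name.lower()
    if ".regenie" in name:
        return "regenie"
    if ".stats" in name:
        return "bolt"
    if "gemma" in name or name.endswith(".assoc.txt"):
        return "gemma"
    if ".assoc" in name or ".qassoc" in name:
        return "plink"
    return "plink"

assert _demo_detect("output.assoc.txt") == "gemma"
assert _demo_detect("height.assoc.linear") == "plink"
assert _demo_detect("step2_bt.regenie") == "regenie"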