gsMap: gsmap-1.73.2-py3-none-any.whl → gsmap-1.73.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/__init__.py +1 -1
- gsMap/config.py +2 -9
- gsMap/diagnosis.py +4 -3
- gsMap/generate_ldscore.py +115 -453
- gsMap/utils/generate_r2_matrix.py +455 -352
- gsMap/utils/regression_read.py +131 -157
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/METADATA +1 -1
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/RECORD +11 -11
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/WHEEL +0 -0
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/entry_points.txt +0 -0
- {gsmap-1.73.2.dist-info → gsmap-1.73.4.dist-info}/licenses/LICENSE +0 -0
gsMap/generate_ldscore.py
CHANGED
@@ -13,12 +13,11 @@ from pathlib import Path
 import numpy as np
 import pandas as pd
 import pyranges as pr
-import zarr
 from scipy.sparse import csr_matrix
 from tqdm import trange

 from gsMap.config import GenerateLDScoreConfig
-from gsMap.utils.generate_r2_matrix import
+from gsMap.utils.generate_r2_matrix import PlinkBEDFile

 # Configure warning behavior more precisely
 warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")
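Taken together, the two import changes above summarize the refactor in this file: the `zarr` output path is dropped, and LD-score computation is routed through a single `PlinkBEDFile` class instead of the module-level helpers deleted below. A minimal sketch of the new call pattern, pieced together from calls that appear later in this diff (the PLINK bfile prefix is a placeholder, and the exact signatures and return types of `PlinkBEDFile`, `get_snps_by_maf`, and `get_ldscore` are assumptions, not verified against the 1.73.4 API):

```python
import numpy as np

from gsMap.utils.generate_r2_matrix import PlinkBEDFile

# One PlinkBEDFile per chromosome; the bfile prefix below is hypothetical
plink_bed = PlinkBEDFile("path/to/bfile.22")

# SNPs passing a 5% MAF filter (method name as used in this diff)
snps_maf_5 = plink_bed.get_snps_by_maf(0.05)

# LD scores for a unit annotation over the SNPs in the BIM table,
# mirroring the calls made in process_chromosome below
unit_annotation = np.ones((len(plink_bed.bim_df), 1))
w_ld = plink_bed.get_ldscore(
    annot_matrix=unit_annotation,
    ld_wind=1,
    ld_unit="CM",
)
```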
@@ -57,7 +56,8 @@ def load_gtf(
     gtf = gtf[gtf["Feature"] == "gene"]

     # Find common genes between GTF and marker scores
-    common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
+    # common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
+    common_gene = list(set(mk_score.index) & set(gtf.gene_name))
     logger.info(f"Found {len(common_gene)} common genes between GTF and marker scores")

     # Filter GTF and marker scores to common genes
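The behavioural note on this change: `np.intersect1d` returned a sorted, de-duplicated NumPy array, while the set intersection returns an unordered Python list. A self-contained comparison with toy gene names (independent of gsMap):

```python
import numpy as np
import pandas as pd

mk_index = pd.Index(["GeneC", "GeneA", "GeneB"])
gtf_genes = pd.Series(["GeneB", "GeneD", "GeneA"])

# Old behaviour: sorted, de-duplicated NumPy array
print(np.intersect1d(mk_index, gtf_genes))   # ['GeneA' 'GeneB']

# New behaviour: plain Python list, arbitrary order
print(list(set(mk_index) & set(gtf_genes)))  # e.g. ['GeneA', 'GeneB'] or ['GeneB', 'GeneA']
```

Since the surrounding code only logs the count and uses the result for membership filtering, the loss of ordering should not matter.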
@@ -69,6 +69,9 @@ def load_gtf(

     # Process the GTF (open window around gene coordinates)
     gtf_bed = gtf[["Chromosome", "Start", "End", "gene_name", "Strand"]].copy()
+    gtf_bed["Chromosome"] = gtf_bed["Chromosome"].apply(
+        lambda x: f"chr{x}" if not str(x).startswith("chr") else x
+    )
     gtf_bed.loc[:, "TSS"] = gtf_bed["Start"]
     gtf_bed.loc[:, "TED"] = gtf_bed["End"]

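The three added lines normalize chromosome names in the GTF so they carry a `chr` prefix, matching the `chr`-prefixed names used for the BIM ranges elsewhere in this module. The same transformation in isolation:

```python
import pandas as pd

chroms = pd.Series(["1", "chr2", 3, "chrX"])
normalized = chroms.apply(lambda x: f"chr{x}" if not str(x).startswith("chr") else x)
print(normalized.tolist())  # ['chr1', 'chr2', 'chr3', 'chrX']
```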
@@ -109,44 +112,6 @@ def load_marker_score(mk_score_file: str) -> pd.DataFrame:
     return mk_score


-def load_bim(bfile_root: str, chrom: int) -> tuple[pd.DataFrame, pr.PyRanges]:
-    """
-    Load PLINK BIM file and convert to a PyRanges object.
-
-    Parameters
-    ----------
-    bfile_root : str
-        Root path for PLINK bfiles
-    chrom : int
-        Chromosome number
-
-    Returns
-    -------
-    tuple
-        A tuple containing (bim_df, bim_pr) where:
-        - bim_df is a pandas DataFrame with BIM data
-        - bim_pr is a PyRanges object with BIM data
-    """
-    bim_file = f"{bfile_root}.{chrom}.bim"
-    logger.debug(f"Loading BIM file: {bim_file}")
-
-    bim = pd.read_csv(bim_file, sep="\t", header=None)
-    bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
-
-    # Convert to PyRanges
-    bim_pr = bim.copy()
-    bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
-
-    # Adjust coordinates (BIM is 1-based, PyRanges uses 0-based)
-    bim_pr["End"] = bim_pr["Start"].copy()
-    bim_pr["Start"] = bim_pr["Start"] - 1
-
-    bim_pr = pr.PyRanges(bim_pr)
-    bim_pr.Chromosome = f"chr{chrom}"
-
-    return bim, bim_pr
-
-
 def overlaps_gtf_bim(gtf_pr: pr.PyRanges, bim_pr: pr.PyRanges) -> pd.DataFrame:
     """
     Find overlaps between GTF and BIM data, and select nearest gene for each SNP.
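Note that the BIM-to-PyRanges conversion done by the deleted `load_bim` does not disappear: the final hunk of this diff shows it coming from `plink_bed.convert_bim_to_pyrange` instead. The coordinate detail it handled (BIM positions are 1-based, PyRanges intervals are 0-based, half-open) can be reproduced with a standalone sketch that mirrors the deleted helper on toy data:

```python
import pandas as pd
import pyranges as pr

bim = pd.DataFrame(
    {
        "CHR": [22, 22],
        "SNP": ["rs1", "rs2"],
        "CM": [0.0, 0.1],
        "BP": [100, 250],
        "A1": ["A", "C"],
        "A2": ["G", "T"],
    }
)

bim_pr = bim.copy()
bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
bim_pr["End"] = bim_pr["Start"].copy()   # 1-based position becomes the interval end
bim_pr["Start"] = bim_pr["Start"] - 1    # shift to a 0-based start
bim_pr = pr.PyRanges(bim_pr)
bim_pr.Chromosome = "chr22"              # prefix to match the GTF ranges
print(bim_pr)
```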
@@ -176,282 +141,13 @@ def overlaps_gtf_bim(gtf_pr: pr.PyRanges, bim_pr: pr.PyRanges) -> pd.DataFrame:
     return nearest_genes


-def filter_snps_by_keep_snp(bim_df: pd.DataFrame, keep_snp_file: str) -> pd.DataFrame:
-    """
-    Filter BIM DataFrame to keep only SNPs in a provided list.
-
-    Parameters
-    ----------
-    bim_df : pd.DataFrame
-        DataFrame with BIM data
-    keep_snp_file : str
-        Path to a file with SNP IDs to keep
-
-    Returns
-    -------
-    pd.DataFrame
-        Filtered BIM DataFrame
-    """
-    # Read SNPs to keep
-    keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
-
-    # Filter the BIM DataFrame
-    filtered_bim_df = bim_df[bim_df["SNP"].isin(keep_snp)]
-
-    logger.info(f"Kept {len(filtered_bim_df)} SNPs out of {len(bim_df)} after filtering")
-
-    return filtered_bim_df
-
-
-def get_snp_counts(config: GenerateLDScoreConfig) -> dict:
-    """
-    Count SNPs per chromosome and calculate start positions for zarr arrays.
-
-    Parameters
-    ----------
-    config : GenerateLDScoreConfig
-        Configuration object
-
-    Returns
-    -------
-    dict
-        Dictionary with SNP counts and start positions
-    """
-    snp_counts = {}
-    total_snp = 0
-
-    for chrom in range(1, 23):
-        bim_df, _ = load_bim(config.bfile_root, chrom)
-
-        if config.keep_snp_root:
-            keep_snp_file = f"{config.keep_snp_root}.{chrom}.snp"
-            filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
-        else:
-            filtered_bim_df = bim_df
-
-        snp_counts[chrom] = filtered_bim_df.shape[0]
-        total_snp += snp_counts[chrom]
-
-    snp_counts["total"] = total_snp
-
-    # Calculate cumulative SNP counts for zarr array indexing
-    chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
-    snp_counts["chrom_snp_start_point"] = [0] + chrom_snp_length_array.tolist()
-
-    return snp_counts
-
-
-def get_snp_pass_maf(bfile_root: str, chrom: int, maf_min: float = 0.05) -> list[str]:
-    """
-    Get SNPs that pass the minimum minor allele frequency (MAF) threshold.
-
-    Parameters
-    ----------
-    bfile_root : str
-        Root path for PLINK bfiles
-    chrom : int
-        Chromosome number
-    maf_min : float, optional
-        Minimum MAF threshold, by default 0.05
-
-    Returns
-    -------
-    list
-        List of SNP IDs that pass the MAF threshold
-    """
-    array_snps, array_indivs, geno_array = load_bfile(
-        bfile_chr_prefix=f"{bfile_root}.{chrom}", mafMin=maf_min
-    )
-
-    m = len(array_snps.IDList)
-    n = len(array_indivs.IDList)
-    logger.info(
-        f"Loading genotype data for {m} SNPs and {n} individuals from {bfile_root}.{chrom}"
-    )
-
-    # Filter SNPs by MAF
-    snp_pass_maf = array_snps.IDList.iloc[geno_array.kept_snps]
-    logger.info(f"After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain")
-
-    return snp_pass_maf.SNP.to_list()
-
-
-def get_ldscore(
-    bfile_root: str,
-    chrom: int,
-    annot_matrix: np.ndarray,
-    ld_wind: float,
-    ld_unit: str = "CM",
-    keep_snps_index: list[int] = None,
-) -> pd.DataFrame:
-    """
-    Calculate LD scores using PLINK data and an annotation matrix.
-
-    Parameters
-    ----------
-    bfile_root : str
-        Root path for PLINK bfiles
-    chrom : int
-        Chromosome number
-    annot_matrix : np.ndarray
-        Annotation matrix
-    ld_wind : float
-        LD window size
-    ld_unit : str, optional
-        Unit for the LD window, by default "CM"
-    keep_snps_index : list[int], optional
-        Indices of SNPs to keep, by default None
-
-    Returns
-    -------
-    pd.DataFrame
-        DataFrame with calculated LD scores
-    """
-    array_snps, array_indivs, geno_array = load_bfile(
-        bfile_chr_prefix=f"{bfile_root}.{chrom}", keep_snps=keep_snps_index
-    )
-
-    # Configure LD window based on specified unit
-    if ld_unit == "SNP":
-        max_dist = ld_wind
-        coords = np.array(range(geno_array.m))
-    elif ld_unit == "KB":
-        max_dist = ld_wind * 1000
-        coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
-    elif ld_unit == "CM":
-        max_dist = ld_wind
-        coords = np.array(array_snps.df["CM"])[geno_array.kept_snps]
-        # Check if the CM is all 0
-        if np.all(coords == 0):
-            logger.warning(
-                "All CM values are 0 in the BIM file. Using 1MB window size for LD score calculation."
-            )
-            max_dist = 1_000_000
-            coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
-    else:
-        raise ValueError(f"Invalid ld_wind_unit: {ld_unit}. Must be one of: SNP, KB, CM")
-
-    # Calculate blocks for LD computation
-    block_left = getBlockLefts(coords, max_dist)
-    assert block_left.sum() > 0, "Invalid window size, please check the ld_wind parameter."
-
-    # Calculate LD scores
-    ld_scores = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
-
-    return ld_scores
-
-
-def calculate_ldscore_from_annotation(
-    snp_annotation_df: pd.DataFrame,
-    chrom: int,
-    bfile_root: str,
-    ld_wind: float = 1,
-    ld_unit: str = "CM",
-) -> pd.DataFrame:
-    """
-    Calculate LD scores from SNP annotation DataFrame.
-
-    Parameters
-    ----------
-    snp_annotation_df : pd.DataFrame
-        DataFrame with SNP annotations
-    chrom : int
-        Chromosome number
-    bfile_root : str
-        Root path for PLINK bfiles
-    ld_wind : float, optional
-        LD window size, by default 1
-    ld_unit : str, optional
-        Unit for the LD window, by default "CM"
-
-    Returns
-    -------
-    pd.DataFrame
-        DataFrame with calculated LD scores
-    """
-    # Calculate LD scores
-    snp_gene_weight_matrix = get_ldscore(
-        bfile_root, chrom, snp_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
-    )
-
-    # Set proper data types and indices
-    snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
-    snp_gene_weight_matrix.index = snp_annotation_df.index
-    snp_gene_weight_matrix.columns = snp_annotation_df.columns
-
-    return snp_gene_weight_matrix
-
-
-def calculate_ldscore_from_multiple_annotation(
-    snp_annotation_df_list: list[pd.DataFrame],
-    chrom: int,
-    bfile_root: str,
-    ld_wind: float = 1,
-    ld_unit: str = "CM",
-) -> list[pd.DataFrame]:
-    """
-    Calculate LD scores from multiple SNP annotation DataFrames.
-
-    Parameters
-    ----------
-    snp_annotation_df_list : list
-        List of DataFrames with SNP annotations
-    chrom : int
-        Chromosome number
-    bfile_root : str
-        Root path for PLINK bfiles
-    ld_wind : float, optional
-        LD window size, by default 1
-    ld_unit : str, optional
-        Unit for the LD window, by default "CM"
-
-    Returns
-    -------
-    list
-        List of DataFrames with calculated LD scores
-    """
-    # Combine annotations
-    combined_annotations = pd.concat(snp_annotation_df_list, axis=1).astype(np.float32, copy=False)
-
-    # Calculate LD scores
-    combined_ld_scores = get_ldscore(
-        bfile_root, chrom, combined_annotations.values, ld_wind=ld_wind, ld_unit=ld_unit
-    )
-
-    # Apply proper indices and columns
-    combined_ld_scores.index = combined_annotations.index
-    combined_ld_scores.columns = combined_annotations.columns
-
-    # Split back into separate DataFrames
-    annotation_lengths = [len(df.columns) for df in snp_annotation_df_list]
-    result_dataframes = []
-    start_col = 0
-
-    for length in annotation_lengths:
-        end_col = start_col + length
-        result_dataframes.append(combined_ld_scores.iloc[:, start_col:end_col])
-        start_col = end_col
-
-    return result_dataframes
-
-
 class LDScoreCalculator:
     """
     Class for calculating LD scores from gene specificity scores.
-
-    This class handles the assignment of gene specificity scores to SNPs
-    and the calculation of LD scores.
     """

     def __init__(self, config: GenerateLDScoreConfig):
-        """
-        Initialize LDScoreCalculator.
-
-        Parameters
-        ----------
-        config : GenerateLDScoreConfig
-            Configuration object
-        """
+        """Initialize LDScoreCalculator."""
         self.config = config
         self.validate_config()

@@ -466,9 +162,6 @@ class LDScoreCalculator:
         # Initialize enhancer data if provided
         self.enhancer_pr = self._initialize_enhancer() if config.enhancer_annotation_file else None

-        # Initialize zarr file if needed
-        self._initialize_zarr_if_needed()
-
     def validate_config(self):
         """Validate configuration parameters."""
         if not Path(self.config.mkscore_feather_path).exists():
@@ -519,33 +212,6 @@ class LDScoreCalculator:
         # Convert to PyRanges
         return pr.PyRanges(enhancer_df.reset_index())

-    def _initialize_zarr_if_needed(self):
-        """Initialize zarr file if zarr format is specified."""
-        if self.config.ldscore_save_format == "zarr":
-            chrom_snp_length_dict = get_snp_counts(self.config)
-            self.chrom_snp_start_point = chrom_snp_length_dict["chrom_snp_start_point"]
-
-            zarr_path = (
-                Path(self.config.ldscore_save_dir) / f"{self.config.sample_name}.ldscore.zarr"
-            )
-
-            if not zarr_path.exists():
-                self.zarr_file = zarr.open(
-                    zarr_path.as_posix(),
-                    mode="a",
-                    dtype=np.float16,
-                    chunks=self.config.zarr_chunk_size,
-                    shape=(chrom_snp_length_dict["total"], self.mk_score_common.shape[1]),
-                )
-                zarr_path.parent.mkdir(parents=True, exist_ok=True)
-
-                # Save metadata
-                self.zarr_file.attrs["spot_names"] = self.mk_score_common.columns.to_list()
-                self.zarr_file.attrs["chrom_snp_start_point"] = self.chrom_snp_start_point
-
-            else:
-                self.zarr_file = zarr.open(zarr_path.as_posix(), mode="a")
-
     def process_chromosome(self, chrom: int):
         """
         Process a single chromosome to calculate LD scores.
@@ -557,35 +223,42 @@ class LDScoreCalculator:
         """
         logger.info(f"Processing chromosome {chrom}")

-        #
-
+        # Initialize PlinkBEDFile once for this chromosome
+        plink_bed = PlinkBEDFile(f"{self.config.bfile_root}.{chrom}")
+
+        # Get SNPs passing MAF filter using built-in method
+        self.snp_pass_maf = plink_bed.get_snps_by_maf(0.05)

         # Get SNP-gene dummy pairs
-        self.snp_gene_pair_dummy = self._get_snp_gene_dummy(chrom)
+        self.snp_gene_pair_dummy = self._get_snp_gene_dummy(chrom, plink_bed)

         # Apply SNP filter if provided
         self._apply_snp_filter(chrom)

         # Process additional baseline annotations if provided
         if self.config.additional_baseline_annotation:
-            self._process_additional_baseline(chrom)
+            self._process_additional_baseline(chrom, plink_bed)
         else:
-            # Calculate SNP-gene weight matrix
-
-                self.snp_gene_pair_dummy,
-                chrom,
-                self.config.bfile_root,
+            # Calculate SNP-gene weight matrix using built-in methods
+            ld_scores = plink_bed.get_ldscore(
+                annot_matrix=self.snp_gene_pair_dummy.values,
                 ld_wind=self.config.ld_wind,
                 ld_unit=self.config.ld_unit,
             )

+            self.snp_gene_weight_matrix = pd.DataFrame(
+                ld_scores,
+                index=self.snp_gene_pair_dummy.index,
+                columns=self.snp_gene_pair_dummy.columns,
+            )
+
         # Apply SNP filter if needed
         if self.keep_snp_mask is not None:
             self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]

         # Generate w_ld file if keep_snp_root is provided
         if self.config.keep_snp_root:
-            self._generate_w_ld(chrom)
+            self._generate_w_ld(chrom, plink_bed)

         # Save pre-calculated SNP-gene weight matrix if requested
         self._save_snp_gene_weight_matrix_if_needed(chrom)
@@ -596,16 +269,16 @@ class LDScoreCalculator:

         # Calculate baseline LD scores
         logger.info(f"Calculating baseline LD scores for chr{chrom}")
-        self._calculate_baseline_ldscores(chrom)
+        self._calculate_baseline_ldscores(chrom, plink_bed)

         # Calculate LD scores for annotation
         logger.info(f"Calculating annotation LD scores for chr{chrom}")
-        self._calculate_annotation_ldscores(chrom)
+        self._calculate_annotation_ldscores(chrom, plink_bed)

         # Clear memory
         self._clear_memory()

-    def _generate_w_ld(self, chrom: int):
+    def _generate_w_ld(self, chrom: int, plink_bed):
         """
         Generate w_ld file for the chromosome using filtered SNPs.

@@ -613,6 +286,8 @@ class LDScoreCalculator:
         ----------
         chrom : int
             Chromosome number
+        plink_bed : PlinkBEDFile
+            Initialized PlinkBEDFile object
         """
         if not self.config.keep_snp_root:
             logger.info(
@@ -622,48 +297,38 @@ class LDScoreCalculator:

         logger.info(f"Generating w_ld for chr{chrom}")

-        # Get the indices of SNPs to keep based on the
-
+        # Get the indices of SNPs to keep based on the keep_snp
+        keep_snps_indices = plink_bed.bim_df[
+            plink_bed.bim_df.SNP.isin(self.snp_name)
+        ].index.tolist()

         # Create a simple unit annotation (all ones) for the filtered SNPs
-        unit_annotation = np.ones((len(
+        unit_annotation = np.ones((len(keep_snps_indices), 1))

-        # Calculate LD scores
-        w_ld_scores = get_ldscore(
-
-            chrom,
-            unit_annotation,
+        # Calculate LD scores
+        w_ld_scores = plink_bed.get_ldscore(
+            annot_matrix=unit_annotation,
             ld_wind=self.config.ld_wind,
             ld_unit=self.config.ld_unit,
-            keep_snps_index=
-        )
-
-        # Load the BIM file to get SNP information
-        bim_data = pd.read_csv(
-            f"{self.config.bfile_root}.{chrom}.bim",
-            sep="\t",
-            header=None,
-            names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
+            keep_snps_index=keep_snps_indices,
         )

-        # Get SNP names for the kept indices
-        kept_snp_names = bim_data.iloc[keep_snps_index].SNP.tolist()
-
         # Create the w_ld DataFrame
+        bim_subset = plink_bed.bim_df.loc[keep_snps_indices]
         w_ld_df = pd.DataFrame(
             {
-                "SNP":
-                "L2": w_ld_scores.
-                "CHR":
-                "BP":
-                "CM":
+                "SNP": bim_subset.SNP,
+                "L2": w_ld_scores.flatten(),
+                "CHR": bim_subset.CHR,
+                "BP": bim_subset.BP,
+                "CM": bim_subset.CM,
             }
         )

         # Reorder columns
         w_ld_df = w_ld_df[["CHR", "SNP", "BP", "CM", "L2"]]

-        # Save to
+        # Save to file
         w_ld_dir = Path(self.config.ldscore_save_dir) / "w_ld"
         w_ld_dir.mkdir(parents=True, exist_ok=True)
         w_ld_file = w_ld_dir / f"weights.{chrom}.l2.ldscore.gz"
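The rewritten `_generate_w_ld` produces the same five-column weights table as before, but sources SNP metadata from `plink_bed.bim_df` instead of re-reading the `.bim` file from disk. A toy sketch of the expected table layout and file name; the gzip, tab-separated write is an assumption based on the `.l2.ldscore.gz` extension (the save call itself sits outside this hunk):

```python
import pandas as pd

w_ld_df = pd.DataFrame(
    {
        "CHR": [22, 22, 22],
        "SNP": ["rs1", "rs2", "rs3"],
        "BP": [100, 250, 400],
        "CM": [0.0, 0.1, 0.2],
        "L2": [1.7, 2.3, 1.1],  # one LD score per kept SNP
    }
)

# Same naming scheme as the diff: <ldscore_save_dir>/w_ld/weights.<chrom>.l2.ldscore.gz
w_ld_df.to_csv("weights.22.l2.ldscore.gz", sep="\t", index=False, compression="gzip")
```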
@@ -693,7 +358,7 @@ class LDScoreCalculator:
             logger.info(f"Using all {len(self.snp_name)} SNPs (no filter applied)")
             logger.warning("No keep_snp_root provided, all SNPs will be used to calculate w_ld.")

-    def _process_additional_baseline(self, chrom: int):
+    def _process_additional_baseline(self, chrom: int, plink_bed):
         """
         Process additional baseline annotations.

@@ -701,6 +366,8 @@ class LDScoreCalculator:
         ----------
         chrom : int
             Chromosome number
+        plink_bed : PlinkBEDFile
+            Initialized PlinkBEDFile object
         """
         # Load additional baseline annotations
         additional_baseline_path = Path(self.config.additional_baseline_annotation)
@@ -730,25 +397,44 @@ class LDScoreCalculator:
                 f"{missing_count} SNPs not found in additional baseline annotations. "
                 "Setting their values to 0."
             )
-
-
-
-
-
-
-
-        self.snp_gene_weight_matrix, additional_ldscore = (
-            calculate_ldscore_from_multiple_annotation(
-                [self.snp_gene_pair_dummy, additional_baseline_df],
-                chrom,
-                self.config.bfile_root,
-                ld_wind=self.config.ld_wind,
-                ld_unit=self.config.ld_unit,
-            )
+        additional_baseline_df = additional_baseline_df.reindex(
+            self.snp_gene_pair_dummy.index, fill_value=0
+        )
+
+        # Combine annotations into a single matrix
+        combined_annotations = pd.concat(
+            [self.snp_gene_pair_dummy, additional_baseline_df], axis=1
         )

-        #
-
+        # Calculate LD scores
+        ld_scores = plink_bed.get_ldscore(
+            annot_matrix=combined_annotations.values.astype(np.float32, copy=False),
+            ld_wind=self.config.ld_wind,
+            ld_unit=self.config.ld_unit,
+        )
+
+        # Split results
+        # total_cols = combined_annotations.shape[1]
+        gene_cols = self.snp_gene_pair_dummy.shape[1]
+        # baseline_cols = additional_baseline_df.shape[1]
+
+        # Create DataFrames with proper indices and columns
+        self.snp_gene_weight_matrix = pd.DataFrame(
+            ld_scores[:, :gene_cols],
+            index=combined_annotations.index,
+            columns=self.snp_gene_pair_dummy.columns,
+        )
+
+        additional_ldscore = pd.DataFrame(
+            ld_scores[:, gene_cols:],
+            index=combined_annotations.index,
+            columns=additional_baseline_df.columns,
+        )
+
+        # Filter by keep_snp_mask if specified
+        if self.keep_snp_mask is not None:
+            additional_ldscore = additional_ldscore[self.keep_snp_mask]
+            self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]

         # Save additional baseline LD scores
         ld_score_file = f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather"
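The `_process_additional_baseline` rewrite runs a single LD-score pass over the concatenated gene and baseline annotations and then splits the result by column count. The splitting logic in isolation, with a random matrix standing in for the `plink_bed.get_ldscore` output (the real call needs genotype data):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
snp_index = [f"rs{i}" for i in range(5)]

gene_dummy = pd.DataFrame(
    rng.integers(0, 2, size=(5, 3)).astype(np.float32),
    index=snp_index, columns=["GeneA", "GeneB", "Dummy"],
)
baseline = pd.DataFrame(
    rng.random((5, 2)).astype(np.float32),
    index=snp_index, columns=["base1", "base2"],
)

combined = pd.concat([gene_dummy, baseline], axis=1)

# Stand-in for: plink_bed.get_ldscore(annot_matrix=combined.values, ...)
ld_scores = rng.random(combined.shape)

gene_cols = gene_dummy.shape[1]
gene_ld = pd.DataFrame(ld_scores[:, :gene_cols], index=combined.index, columns=gene_dummy.columns)
baseline_ld = pd.DataFrame(ld_scores[:, gene_cols:], index=combined.index, columns=baseline.columns)

print(gene_ld.shape, baseline_ld.shape)  # (5, 3) (5, 2)
```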
@@ -793,7 +479,7 @@ class LDScoreCalculator:
         save_path = save_dir / f"{chrom}.snp_gene_weight_matrix.feather"
         self.snp_gene_weight_matrix.reset_index().to_feather(save_path)

-    def _calculate_baseline_ldscores(self, chrom: int):
+    def _calculate_baseline_ldscores(self, chrom: int, plink_bed):
         """
         Calculate and save baseline LD scores.

@@ -801,6 +487,8 @@ class LDScoreCalculator:
         ----------
         chrom : int
             Chromosome number
+        plink_bed : PlinkBEDFile
+            Initialized PlinkBEDFile object
         """
         # Create baseline scores
         baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
@@ -818,7 +506,9 @@ class LDScoreCalculator:
         m_5_file = f"{self.config.ldscore_save_dir}/baseline/baseline.{chrom}.l2.M_5_50"

         # Calculate LD scores
-        ldscore_chunk = self._calculate_ldscore_from_weights(
+        ldscore_chunk = self._calculate_ldscore_from_weights(
+            baseline_df, plink_bed, drop_dummy_na=False
+        )

         # Save LD scores and M values
         self._save_ldscore_to_feather(
@@ -836,9 +526,9 @@ class LDScoreCalculator:

         # If keep_snp_root is not provided, use the first column of baseline ldscore as w_ld
         if not self.config.keep_snp_root:
-            self._save_baseline_as_w_ld(chrom, ldscore_chunk)
+            self._save_baseline_as_w_ld(chrom, ldscore_chunk, plink_bed)

-    def _save_baseline_as_w_ld(self, chrom: int, ldscore_chunk: np.ndarray):
+    def _save_baseline_as_w_ld(self, chrom: int, ldscore_chunk: np.ndarray, plink_bed):
         """
         Save the first column of baseline ldscore as w_ld.

@@ -848,6 +538,8 @@ class LDScoreCalculator:
             Chromosome number
         ldscore_chunk : np.ndarray
             Array with baseline LD scores
+        plink_bed : PlinkBEDFile
+            Initialized PlinkBEDFile object
         """
         logger.info(f"Using first column of baseline ldscore as w_ld for chr{chrom}")

@@ -861,23 +553,24 @@ class LDScoreCalculator:
         # Extract the first column
         w_ld_values = ldscore_chunk[:, 0]

-        # Create a DataFrame
-
-
-
-
-            names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
+        # Create a DataFrame with SNP information from the BIM file
+        snp_indices = (
+            plink_bed.kept_snps
+            if hasattr(plink_bed, "kept_snps")
+            else np.arange(len(self.snp_name))
         )
+        bim_subset = plink_bed.bim_df.iloc[snp_indices]
+
         w_ld_df = pd.DataFrame(
             {
                 "SNP": self.snp_name,
                 "L2": w_ld_values,
+                "CHR": bim_subset.CHR.values[: len(self.snp_name)],  # Ensure length matches
+                "BP": bim_subset.BP.values[: len(self.snp_name)],
+                "CM": bim_subset.CM.values[: len(self.snp_name)],
             }
         )

-        # Add CHR, BP, and CM information
-        w_ld_df = w_ld_df.merge(bim_data[["SNP", "CHR", "BP", "CM"]], on="SNP", how="left")
-
         # Reorder columns
         w_ld_df = w_ld_df[["CHR", "SNP", "BP", "CM", "L2"]]

@@ -885,7 +578,7 @@ class LDScoreCalculator:

         logger.info(f"Saved w_ld for chr{chrom} to {w_ld_file}")

-    def _calculate_annotation_ldscores(self, chrom: int):
+    def _calculate_annotation_ldscores(self, chrom: int, plink_bed):
         """
         Calculate and save LD scores for spatial annotations.

@@ -893,6 +586,8 @@ class LDScoreCalculator:
         ----------
         chrom : int
             Chromosome number
+        plink_bed : PlinkBEDFile
+            Initialized PlinkBEDFile object
         """
         # Get marker scores for gene columns (excluding dummy NA column)
         mk_scores = self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]]
@@ -915,7 +610,7 @@ class LDScoreCalculator:
             m_5_file = f"{self.config.ldscore_save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50"

             # Calculate LD scores
-            ldscore_chunk = self._calculate_ldscore_from_weights(mk_score_chunk)
+            ldscore_chunk = self._calculate_ldscore_from_weights(mk_score_chunk, plink_bed)

             # Save LD scores based on format
             if self.config.ldscore_save_format == "feather":
@@ -924,12 +619,6 @@ class LDScoreCalculator:
                     column_names=mk_score_chunk.columns,
                     save_file_name=ld_score_file,
                 )
-            elif self.config.ldscore_save_format == "zarr":
-                self._save_ldscore_chunk_to_zarr(
-                    ldscore_chunk,
-                    chrom=chrom,
-                    start_col_index=i,
-                )
             else:
                 raise ValueError(f"Invalid ldscore_save_format: {self.config.ldscore_save_format}")

@@ -948,7 +637,7 @@ class LDScoreCalculator:
         gc.collect()

     def _calculate_ldscore_from_weights(
-        self, marker_scores: pd.DataFrame, drop_dummy_na: bool = True
+        self, marker_scores: pd.DataFrame, plink_bed, drop_dummy_na: bool = True
     ) -> np.ndarray:
         """
         Calculate LD scores using SNP-gene weight matrix.
@@ -957,6 +646,8 @@ class LDScoreCalculator:
         ----------
         marker_scores : pd.DataFrame
             DataFrame with marker scores
+        plink_bed : PlinkBEDFile
+            Initialized PlinkBEDFile object
         drop_dummy_na : bool, optional
             Whether to drop the dummy NA column, by default True

@@ -1009,37 +700,6 @@ class LDScoreCalculator:
         df.index.name = "SNP"
         df.reset_index().to_feather(save_file_name)

-    def _save_ldscore_chunk_to_zarr(
-        self, ldscore_data: np.ndarray, chrom: int, start_col_index: int
-    ):
-        """
-        Save LD scores to a zarr array.
-
-        Parameters
-        ----------
-        ldscore_data : np.ndarray
-            Array with LD scores
-        chrom : int
-            Chromosome number
-        start_col_index : int
-            Starting column index in the zarr array
-        """
-        # Convert to float16 for storage efficiency
-        ldscore_data = ldscore_data.astype(np.float16, copy=False)
-
-        # Handle numerical overflow
-        ldscore_data[np.isinf(ldscore_data)] = np.finfo(np.float16).max
-
-        # Get start and end indices for this chromosome
-        chrom_start = self.chrom_snp_start_point[chrom - 1]
-        chrom_end = self.chrom_snp_start_point[chrom]
-
-        # Save to zarr array
-        self.zarr_file[
-            chrom_start:chrom_end,
-            start_col_index : start_col_index + ldscore_data.shape[1],
-        ] = ldscore_data
-
     def _calculate_and_save_m_values(
         self,
         marker_scores: pd.DataFrame,
@@ -1084,7 +744,7 @@ class LDScoreCalculator:
         np.savetxt(m_file_path, m_values, delimiter="\t")
         np.savetxt(m_5_file_path, m_5_values, delimiter="\t")

-    def _get_snp_gene_dummy(self, chrom: int) -> pd.DataFrame:
+    def _get_snp_gene_dummy(self, chrom: int, plink_bed) -> pd.DataFrame:
         """
         Get dummy matrix for SNP-gene pairs.

@@ -1092,6 +752,7 @@ class LDScoreCalculator:
         ----------
         chrom : int
             Chromosome number
+        plink_bed : PlinkBEDFile

         Returns
         -------
@@ -1101,7 +762,8 @@ class LDScoreCalculator:
         logger.info(f"Creating SNP-gene mappings for chromosome {chrom}")

         # Load BIM file
-        bim
+        bim = plink_bed.bim_df
+        bim_pr = plink_bed.convert_bim_to_pyrange(bim)

         # Determine mapping strategy
         if self.config.gene_window_enhancer_priority in ["gene_window_first", "enhancer_first"]: