gsMap 1.73.3__py3-none-any.whl → 1.73.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/__init__.py +2 -2
- gsMap/config.py +2 -9
- gsMap/diagnosis.py +4 -3
- gsMap/generate_ldscore.py +110 -454
- gsMap/utils/generate_r2_matrix.py +453 -352
- gsMap/utils/regression_read.py +131 -157
- {gsmap-1.73.3.dist-info → gsmap-1.73.5.dist-info}/METADATA +2 -2
- {gsmap-1.73.3.dist-info → gsmap-1.73.5.dist-info}/RECORD +11 -11
- {gsmap-1.73.3.dist-info → gsmap-1.73.5.dist-info}/WHEEL +0 -0
- {gsmap-1.73.3.dist-info → gsmap-1.73.5.dist-info}/entry_points.txt +0 -0
- {gsmap-1.73.3.dist-info → gsmap-1.73.5.dist-info}/licenses/LICENSE +0 -0
gsMap/generate_ldscore.py
CHANGED
@@ -13,12 +13,11 @@ from pathlib import Path
|
|
13
13
|
import numpy as np
|
14
14
|
import pandas as pd
|
15
15
|
import pyranges as pr
|
16
|
-
import zarr
|
17
16
|
from scipy.sparse import csr_matrix
|
18
17
|
from tqdm import trange
|
19
18
|
|
20
19
|
from gsMap.config import GenerateLDScoreConfig
|
21
|
-
from gsMap.utils.generate_r2_matrix import getBlockLefts, load_bfile
|
20
|
+
from gsMap.utils.generate_r2_matrix import PlinkBEDFile
|
22
21
|
|
23
22
|
# Configure warning behavior more precisely
|
24
23
|
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")
|
@@ -113,44 +112,6 @@ def load_marker_score(mk_score_file: str) -> pd.DataFrame:
|
|
113
112
|
return mk_score
|
114
113
|
|
115
114
|
|
116
|
-
def load_bim(bfile_root: str, chrom: int) -> tuple[pd.DataFrame, pr.PyRanges]:
|
117
|
-
"""
|
118
|
-
Load PLINK BIM file and convert to a PyRanges object.
|
119
|
-
|
120
|
-
Parameters
|
121
|
-
----------
|
122
|
-
bfile_root : str
|
123
|
-
Root path for PLINK bfiles
|
124
|
-
chrom : int
|
125
|
-
Chromosome number
|
126
|
-
|
127
|
-
Returns
|
128
|
-
-------
|
129
|
-
tuple
|
130
|
-
A tuple containing (bim_df, bim_pr) where:
|
131
|
-
- bim_df is a pandas DataFrame with BIM data
|
132
|
-
- bim_pr is a PyRanges object with BIM data
|
133
|
-
"""
|
134
|
-
bim_file = f"{bfile_root}.{chrom}.bim"
|
135
|
-
logger.info(f"Loading BIM file: {bim_file}")
|
136
|
-
|
137
|
-
bim = pd.read_csv(bim_file, sep="\t", header=None)
|
138
|
-
bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
|
139
|
-
|
140
|
-
# Convert to PyRanges
|
141
|
-
bim_pr = bim.copy()
|
142
|
-
bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
|
143
|
-
|
144
|
-
# Adjust coordinates (BIM is 1-based, PyRanges uses 0-based)
|
145
|
-
bim_pr["End"] = bim_pr["Start"].copy()
|
146
|
-
bim_pr["Start"] = bim_pr["Start"] - 1
|
147
|
-
|
148
|
-
bim_pr = pr.PyRanges(bim_pr)
|
149
|
-
bim_pr.Chromosome = f"chr{chrom}"
|
150
|
-
|
151
|
-
return bim, bim_pr
|
152
|
-
|
153
|
-
|
154
115
|
def overlaps_gtf_bim(gtf_pr: pr.PyRanges, bim_pr: pr.PyRanges) -> pd.DataFrame:
|
155
116
|
"""
|
156
117
|
Find overlaps between GTF and BIM data, and select nearest gene for each SNP.
|
@@ -180,284 +141,13 @@ def overlaps_gtf_bim(gtf_pr: pr.PyRanges, bim_pr: pr.PyRanges) -> pd.DataFrame:
|
|
180
141
|
return nearest_genes
|
181
142
|
|
182
143
|
|
183
|
-
def filter_snps_by_keep_snp(bim_df: pd.DataFrame, keep_snp_file: str) -> pd.DataFrame:
|
184
|
-
"""
|
185
|
-
Filter BIM DataFrame to keep only SNPs in a provided list.
|
186
|
-
|
187
|
-
Parameters
|
188
|
-
----------
|
189
|
-
bim_df : pd.DataFrame
|
190
|
-
DataFrame with BIM data
|
191
|
-
keep_snp_file : str
|
192
|
-
Path to a file with SNP IDs to keep
|
193
|
-
|
194
|
-
Returns
|
195
|
-
-------
|
196
|
-
pd.DataFrame
|
197
|
-
Filtered BIM DataFrame
|
198
|
-
"""
|
199
|
-
# Read SNPs to keep
|
200
|
-
keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
|
201
|
-
|
202
|
-
# Filter the BIM DataFrame
|
203
|
-
filtered_bim_df = bim_df[bim_df["SNP"].isin(keep_snp)]
|
204
|
-
|
205
|
-
logger.info(f"Kept {len(filtered_bim_df)} SNPs out of {len(bim_df)} after filtering")
|
206
|
-
|
207
|
-
return filtered_bim_df
|
208
|
-
|
209
|
-
|
210
|
-
def get_snp_counts(config: GenerateLDScoreConfig) -> dict:
|
211
|
-
"""
|
212
|
-
Count SNPs per chromosome and calculate start positions for zarr arrays.
|
213
|
-
|
214
|
-
Parameters
|
215
|
-
----------
|
216
|
-
config : GenerateLDScoreConfig
|
217
|
-
Configuration object
|
218
|
-
|
219
|
-
Returns
|
220
|
-
-------
|
221
|
-
dict
|
222
|
-
Dictionary with SNP counts and start positions
|
223
|
-
"""
|
224
|
-
snp_counts = {}
|
225
|
-
total_snp = 0
|
226
|
-
|
227
|
-
for chrom in range(1, 23):
|
228
|
-
bim_df, _ = load_bim(config.bfile_root, chrom)
|
229
|
-
|
230
|
-
if config.keep_snp_root:
|
231
|
-
keep_snp_file = f"{config.keep_snp_root}.{chrom}.snp"
|
232
|
-
filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
|
233
|
-
else:
|
234
|
-
filtered_bim_df = bim_df
|
235
|
-
|
236
|
-
snp_counts[chrom] = filtered_bim_df.shape[0]
|
237
|
-
total_snp += snp_counts[chrom]
|
238
|
-
|
239
|
-
snp_counts["total"] = total_snp
|
240
|
-
|
241
|
-
# Calculate cumulative SNP counts for zarr array indexing
|
242
|
-
chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
|
243
|
-
snp_counts["chrom_snp_start_point"] = [0] + chrom_snp_length_array.tolist()
|
244
|
-
|
245
|
-
return snp_counts
|
246
|
-
|
247
|
-
|
248
|
-
def get_snp_pass_maf(bfile_root: str, chrom: int, maf_min: float = 0.05) -> list[str]:
|
249
|
-
"""
|
250
|
-
Get SNPs that pass the minimum minor allele frequency (MAF) threshold.
|
251
|
-
|
252
|
-
Parameters
|
253
|
-
----------
|
254
|
-
bfile_root : str
|
255
|
-
Root path for PLINK bfiles
|
256
|
-
chrom : int
|
257
|
-
Chromosome number
|
258
|
-
maf_min : float, optional
|
259
|
-
Minimum MAF threshold, by default 0.05
|
260
|
-
|
261
|
-
Returns
|
262
|
-
-------
|
263
|
-
list
|
264
|
-
List of SNP IDs that pass the MAF threshold
|
265
|
-
"""
|
266
|
-
array_snps, array_indivs, geno_array = load_bfile(
|
267
|
-
bfile_chr_prefix=f"{bfile_root}.{chrom}", mafMin=maf_min
|
268
|
-
)
|
269
|
-
|
270
|
-
m = len(array_snps.IDList)
|
271
|
-
n = len(array_indivs.IDList)
|
272
|
-
logger.info(
|
273
|
-
f"Loading genotype data for {m} SNPs and {n} individuals from {bfile_root}.{chrom}"
|
274
|
-
)
|
275
|
-
|
276
|
-
# Filter SNPs by MAF
|
277
|
-
snp_pass_maf = array_snps.IDList.iloc[geno_array.kept_snps]
|
278
|
-
logger.info(f"After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain")
|
279
|
-
|
280
|
-
return snp_pass_maf.SNP.to_list()
|
281
|
-
|
282
|
-
|
283
|
-
def get_ldscore(
|
284
|
-
bfile_root: str,
|
285
|
-
chrom: int,
|
286
|
-
annot_matrix: np.ndarray,
|
287
|
-
ld_wind: float,
|
288
|
-
ld_unit: str = "CM",
|
289
|
-
keep_snps_index: list[int] = None,
|
290
|
-
) -> pd.DataFrame:
|
291
|
-
"""
|
292
|
-
Calculate LD scores using PLINK data and an annotation matrix.
|
293
|
-
|
294
|
-
Parameters
|
295
|
-
----------
|
296
|
-
bfile_root : str
|
297
|
-
Root path for PLINK bfiles
|
298
|
-
chrom : int
|
299
|
-
Chromosome number
|
300
|
-
annot_matrix : np.ndarray
|
301
|
-
Annotation matrix
|
302
|
-
ld_wind : float
|
303
|
-
LD window size
|
304
|
-
ld_unit : str, optional
|
305
|
-
Unit for the LD window, by default "CM"
|
306
|
-
keep_snps_index : list[int], optional
|
307
|
-
Indices of SNPs to keep, by default None
|
308
|
-
|
309
|
-
Returns
|
310
|
-
-------
|
311
|
-
pd.DataFrame
|
312
|
-
DataFrame with calculated LD scores
|
313
|
-
"""
|
314
|
-
array_snps, array_indivs, geno_array = load_bfile(
|
315
|
-
bfile_chr_prefix=f"{bfile_root}.{chrom}", keep_snps=keep_snps_index
|
316
|
-
)
|
317
|
-
|
318
|
-
annot_matrix = annot_matrix[geno_array.kept_snps, :]
|
319
|
-
|
320
|
-
# Configure LD window based on specified unit
|
321
|
-
if ld_unit == "SNP":
|
322
|
-
max_dist = ld_wind
|
323
|
-
coords = np.array(range(geno_array.m))
|
324
|
-
elif ld_unit == "KB":
|
325
|
-
max_dist = ld_wind * 1000
|
326
|
-
coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
|
327
|
-
elif ld_unit == "CM":
|
328
|
-
max_dist = ld_wind
|
329
|
-
coords = np.array(array_snps.df["CM"])[geno_array.kept_snps]
|
330
|
-
# Check if the CM is all 0
|
331
|
-
if np.all(coords == 0):
|
332
|
-
logger.warning(
|
333
|
-
"All CM values are 0 in the BIM file. Using 1MB window size for LD score calculation."
|
334
|
-
)
|
335
|
-
max_dist = 1_000_000
|
336
|
-
coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
|
337
|
-
else:
|
338
|
-
raise ValueError(f"Invalid ld_wind_unit: {ld_unit}. Must be one of: SNP, KB, CM")
|
339
|
-
|
340
|
-
# Calculate blocks for LD computation
|
341
|
-
block_left = getBlockLefts(coords, max_dist)
|
342
|
-
assert block_left.sum() > 0, "Invalid window size, please check the ld_wind parameter."
|
343
|
-
|
344
|
-
# Calculate LD scores
|
345
|
-
ld_scores = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
|
346
|
-
|
347
|
-
return ld_scores
|
348
|
-
|
349
|
-
|
350
|
-
def calculate_ldscore_from_annotation(
|
351
|
-
snp_annotation_df: pd.DataFrame,
|
352
|
-
chrom: int,
|
353
|
-
bfile_root: str,
|
354
|
-
ld_wind: float = 1,
|
355
|
-
ld_unit: str = "CM",
|
356
|
-
) -> pd.DataFrame:
|
357
|
-
"""
|
358
|
-
Calculate LD scores from SNP annotation DataFrame.
|
359
|
-
|
360
|
-
Parameters
|
361
|
-
----------
|
362
|
-
snp_annotation_df : pd.DataFrame
|
363
|
-
DataFrame with SNP annotations
|
364
|
-
chrom : int
|
365
|
-
Chromosome number
|
366
|
-
bfile_root : str
|
367
|
-
Root path for PLINK bfiles
|
368
|
-
ld_wind : float, optional
|
369
|
-
LD window size, by default 1
|
370
|
-
ld_unit : str, optional
|
371
|
-
Unit for the LD window, by default "CM"
|
372
|
-
|
373
|
-
Returns
|
374
|
-
-------
|
375
|
-
pd.DataFrame
|
376
|
-
DataFrame with calculated LD scores
|
377
|
-
"""
|
378
|
-
# Calculate LD scores
|
379
|
-
snp_gene_weight_matrix = get_ldscore(
|
380
|
-
bfile_root, chrom, snp_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
|
381
|
-
)
|
382
|
-
|
383
|
-
# Set proper data types and indices
|
384
|
-
snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
|
385
|
-
snp_gene_weight_matrix.index = snp_annotation_df.index
|
386
|
-
snp_gene_weight_matrix.columns = snp_annotation_df.columns
|
387
|
-
|
388
|
-
return snp_gene_weight_matrix
|
389
|
-
|
390
|
-
|
391
|
-
def calculate_ldscore_from_multiple_annotation(
|
392
|
-
snp_annotation_df_list: list[pd.DataFrame],
|
393
|
-
chrom: int,
|
394
|
-
bfile_root: str,
|
395
|
-
ld_wind: float = 1,
|
396
|
-
ld_unit: str = "CM",
|
397
|
-
) -> list[pd.DataFrame]:
|
398
|
-
"""
|
399
|
-
Calculate LD scores from multiple SNP annotation DataFrames.
|
400
|
-
|
401
|
-
Parameters
|
402
|
-
----------
|
403
|
-
snp_annotation_df_list : list
|
404
|
-
List of DataFrames with SNP annotations
|
405
|
-
chrom : int
|
406
|
-
Chromosome number
|
407
|
-
bfile_root : str
|
408
|
-
Root path for PLINK bfiles
|
409
|
-
ld_wind : float, optional
|
410
|
-
LD window size, by default 1
|
411
|
-
ld_unit : str, optional
|
412
|
-
Unit for the LD window, by default "CM"
|
413
|
-
|
414
|
-
Returns
|
415
|
-
-------
|
416
|
-
list
|
417
|
-
List of DataFrames with calculated LD scores
|
418
|
-
"""
|
419
|
-
# Combine annotations
|
420
|
-
combined_annotations = pd.concat(snp_annotation_df_list, axis=1).astype(np.float32, copy=False)
|
421
|
-
|
422
|
-
# Calculate LD scores
|
423
|
-
combined_ld_scores = get_ldscore(
|
424
|
-
bfile_root, chrom, combined_annotations.values, ld_wind=ld_wind, ld_unit=ld_unit
|
425
|
-
)
|
426
|
-
|
427
|
-
# Apply proper indices and columns
|
428
|
-
combined_ld_scores.index = combined_annotations.index
|
429
|
-
combined_ld_scores.columns = combined_annotations.columns
|
430
|
-
|
431
|
-
# Split back into separate DataFrames
|
432
|
-
annotation_lengths = [len(df.columns) for df in snp_annotation_df_list]
|
433
|
-
result_dataframes = []
|
434
|
-
start_col = 0
|
435
|
-
|
436
|
-
for length in annotation_lengths:
|
437
|
-
end_col = start_col + length
|
438
|
-
result_dataframes.append(combined_ld_scores.iloc[:, start_col:end_col])
|
439
|
-
start_col = end_col
|
440
|
-
|
441
|
-
return result_dataframes
|
442
|
-
|
443
|
-
|
444
144
|
class LDScoreCalculator:
|
445
145
|
"""
|
446
146
|
Class for calculating LD scores from gene specificity scores.
|
447
|
-
|
448
|
-
This class handles the assignment of gene specificity scores to SNPs
|
449
|
-
and the calculation of LD scores.
|
450
147
|
"""
|
451
148
|
|
452
149
|
def __init__(self, config: GenerateLDScoreConfig):
|
453
|
-
"""
|
454
|
-
Initialize LDScoreCalculator.
|
455
|
-
|
456
|
-
Parameters
|
457
|
-
----------
|
458
|
-
config : GenerateLDScoreConfig
|
459
|
-
Configuration object
|
460
|
-
"""
|
150
|
+
"""Initialize LDScoreCalculator."""
|
461
151
|
self.config = config
|
462
152
|
self.validate_config()
|
463
153
|
|
@@ -472,9 +162,6 @@ class LDScoreCalculator:
|
|
472
162
|
# Initialize enhancer data if provided
|
473
163
|
self.enhancer_pr = self._initialize_enhancer() if config.enhancer_annotation_file else None
|
474
164
|
|
475
|
-
# Initialize zarr file if needed
|
476
|
-
self._initialize_zarr_if_needed()
|
477
|
-
|
478
165
|
def validate_config(self):
|
479
166
|
"""Validate configuration parameters."""
|
480
167
|
if not Path(self.config.mkscore_feather_path).exists():
|
@@ -525,33 +212,6 @@ class LDScoreCalculator:
|
|
525
212
|
# Convert to PyRanges
|
526
213
|
return pr.PyRanges(enhancer_df.reset_index())
|
527
214
|
|
528
|
-
def _initialize_zarr_if_needed(self):
|
529
|
-
"""Initialize zarr file if zarr format is specified."""
|
530
|
-
if self.config.ldscore_save_format == "zarr":
|
531
|
-
chrom_snp_length_dict = get_snp_counts(self.config)
|
532
|
-
self.chrom_snp_start_point = chrom_snp_length_dict["chrom_snp_start_point"]
|
533
|
-
|
534
|
-
zarr_path = (
|
535
|
-
Path(self.config.ldscore_save_dir) / f"{self.config.sample_name}.ldscore.zarr"
|
536
|
-
)
|
537
|
-
|
538
|
-
if not zarr_path.exists():
|
539
|
-
self.zarr_file = zarr.open(
|
540
|
-
zarr_path.as_posix(),
|
541
|
-
mode="a",
|
542
|
-
dtype=np.float16,
|
543
|
-
chunks=self.config.zarr_chunk_size,
|
544
|
-
shape=(chrom_snp_length_dict["total"], self.mk_score_common.shape[1]),
|
545
|
-
)
|
546
|
-
zarr_path.parent.mkdir(parents=True, exist_ok=True)
|
547
|
-
|
548
|
-
# Save metadata
|
549
|
-
self.zarr_file.attrs["spot_names"] = self.mk_score_common.columns.to_list()
|
550
|
-
self.zarr_file.attrs["chrom_snp_start_point"] = self.chrom_snp_start_point
|
551
|
-
|
552
|
-
else:
|
553
|
-
self.zarr_file = zarr.open(zarr_path.as_posix(), mode="a")
|
554
|
-
|
555
215
|
def process_chromosome(self, chrom: int):
|
556
216
|
"""
|
557
217
|
Process a single chromosome to calculate LD scores.
|
@@ -563,35 +223,42 @@ class LDScoreCalculator:
|
|
563
223
|
"""
|
564
224
|
logger.info(f"Processing chromosome {chrom}")
|
565
225
|
|
566
|
-
#
|
567
|
-
|
226
|
+
# Initialize PlinkBEDFile once for this chromosome
|
227
|
+
plink_bed = PlinkBEDFile(f"{self.config.bfile_root}.{chrom}")
|
228
|
+
|
229
|
+
# Get SNPs passing MAF filter using built-in method
|
230
|
+
self.snp_pass_maf = plink_bed.get_snps_by_maf(0.05)
|
568
231
|
|
569
232
|
# Get SNP-gene dummy pairs
|
570
|
-
self.snp_gene_pair_dummy = self._get_snp_gene_dummy(chrom)
|
233
|
+
self.snp_gene_pair_dummy = self._get_snp_gene_dummy(chrom, plink_bed)
|
571
234
|
|
572
235
|
# Apply SNP filter if provided
|
573
236
|
self._apply_snp_filter(chrom)
|
574
237
|
|
575
238
|
# Process additional baseline annotations if provided
|
576
239
|
if self.config.additional_baseline_annotation:
|
577
|
-
self._process_additional_baseline(chrom)
|
240
|
+
self._process_additional_baseline(chrom, plink_bed)
|
578
241
|
else:
|
579
|
-
# Calculate SNP-gene weight matrix
|
580
|
-
self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(
|
581
|
-
self.snp_gene_pair_dummy,
|
582
|
-
chrom,
|
583
|
-
self.config.bfile_root,
|
242
|
+
# Calculate SNP-gene weight matrix using built-in methods
|
243
|
+
ld_scores = plink_bed.get_ldscore(
|
244
|
+
annot_matrix=self.snp_gene_pair_dummy.values,
|
584
245
|
ld_wind=self.config.ld_wind,
|
585
246
|
ld_unit=self.config.ld_unit,
|
586
247
|
)
|
587
248
|
|
249
|
+
self.snp_gene_weight_matrix = pd.DataFrame(
|
250
|
+
ld_scores,
|
251
|
+
index=self.snp_gene_pair_dummy.index,
|
252
|
+
columns=self.snp_gene_pair_dummy.columns,
|
253
|
+
)
|
254
|
+
|
588
255
|
# Apply SNP filter if needed
|
589
256
|
if self.keep_snp_mask is not None:
|
590
257
|
self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
|
591
258
|
|
592
259
|
# Generate w_ld file if keep_snp_root is provided
|
593
260
|
if self.config.keep_snp_root:
|
594
|
-
self._generate_w_ld(chrom)
|
261
|
+
self._generate_w_ld(chrom, plink_bed)
|
595
262
|
|
596
263
|
# Save pre-calculated SNP-gene weight matrix if requested
|
597
264
|
self._save_snp_gene_weight_matrix_if_needed(chrom)
|
@@ -602,16 +269,16 @@ class LDScoreCalculator:
|
|
602
269
|
|
603
270
|
# Calculate baseline LD scores
|
604
271
|
logger.info(f"Calculating baseline LD scores for chr{chrom}")
|
605
|
-
self._calculate_baseline_ldscores(chrom)
|
272
|
+
self._calculate_baseline_ldscores(chrom, plink_bed)
|
606
273
|
|
607
274
|
# Calculate LD scores for annotation
|
608
275
|
logger.info(f"Calculating annotation LD scores for chr{chrom}")
|
609
|
-
self._calculate_annotation_ldscores(chrom)
|
276
|
+
self._calculate_annotation_ldscores(chrom, plink_bed)
|
610
277
|
|
611
278
|
# Clear memory
|
612
279
|
self._clear_memory()
|
613
280
|
|
614
|
-
def _generate_w_ld(self, chrom: int):
|
281
|
+
def _generate_w_ld(self, chrom: int, plink_bed):
|
615
282
|
"""
|
616
283
|
Generate w_ld file for the chromosome using filtered SNPs.
|
617
284
|
|
@@ -619,6 +286,8 @@ class LDScoreCalculator:
|
|
619
286
|
----------
|
620
287
|
chrom : int
|
621
288
|
Chromosome number
|
289
|
+
plink_bed : PlinkBEDFile
|
290
|
+
Initialized PlinkBEDFile object
|
622
291
|
"""
|
623
292
|
if not self.config.keep_snp_root:
|
624
293
|
logger.info(
|
@@ -628,48 +297,38 @@ class LDScoreCalculator:
|
|
628
297
|
|
629
298
|
logger.info(f"Generating w_ld for chr{chrom}")
|
630
299
|
|
631
|
-
# Get the indices of SNPs to keep based on the
|
632
|
-
|
300
|
+
# Get the indices of SNPs to keep based on the keep_snp
|
301
|
+
keep_snps_indices = plink_bed.bim_df[
|
302
|
+
plink_bed.bim_df.SNP.isin(self.snp_name)
|
303
|
+
].index.tolist()
|
633
304
|
|
634
305
|
# Create a simple unit annotation (all ones) for the filtered SNPs
|
635
|
-
unit_annotation = np.ones((len(keep_snps_index), 1))
|
306
|
+
unit_annotation = np.ones((len(keep_snps_indices), 1))
|
636
307
|
|
637
|
-
# Calculate LD scores
|
638
|
-
w_ld_scores = get_ldscore(
|
639
|
-
self.config.bfile_root,
|
640
|
-
chrom,
|
641
|
-
unit_annotation,
|
308
|
+
# Calculate LD scores
|
309
|
+
w_ld_scores = plink_bed.get_ldscore(
|
310
|
+
annot_matrix=unit_annotation,
|
642
311
|
ld_wind=self.config.ld_wind,
|
643
312
|
ld_unit=self.config.ld_unit,
|
644
|
-
keep_snps_index=keep_snps_index,
|
645
|
-
)
|
646
|
-
|
647
|
-
# Load the BIM file to get SNP information
|
648
|
-
bim_data = pd.read_csv(
|
649
|
-
f"{self.config.bfile_root}.{chrom}.bim",
|
650
|
-
sep="\t",
|
651
|
-
header=None,
|
652
|
-
names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
|
313
|
+
keep_snps_index=keep_snps_indices,
|
653
314
|
)
|
654
315
|
|
655
|
-
# Get SNP names for the kept indices
|
656
|
-
kept_snp_names = bim_data.iloc[keep_snps_index].SNP.tolist()
|
657
|
-
|
658
316
|
# Create the w_ld DataFrame
|
317
|
+
bim_subset = plink_bed.bim_df.loc[keep_snps_indices]
|
659
318
|
w_ld_df = pd.DataFrame(
|
660
319
|
{
|
661
|
-
"SNP":
|
662
|
-
"L2": w_ld_scores.
|
663
|
-
"CHR":
|
664
|
-
"BP":
|
665
|
-
"CM":
|
320
|
+
"SNP": bim_subset.SNP,
|
321
|
+
"L2": w_ld_scores.flatten(),
|
322
|
+
"CHR": bim_subset.CHR,
|
323
|
+
"BP": bim_subset.BP,
|
324
|
+
"CM": bim_subset.CM,
|
666
325
|
}
|
667
326
|
)
|
668
327
|
|
669
328
|
# Reorder columns
|
670
329
|
w_ld_df = w_ld_df[["CHR", "SNP", "BP", "CM", "L2"]]
|
671
330
|
|
672
|
-
# Save to
|
331
|
+
# Save to file
|
673
332
|
w_ld_dir = Path(self.config.ldscore_save_dir) / "w_ld"
|
674
333
|
w_ld_dir.mkdir(parents=True, exist_ok=True)
|
675
334
|
w_ld_file = w_ld_dir / f"weights.{chrom}.l2.ldscore.gz"
|
@@ -699,7 +358,7 @@ class LDScoreCalculator:
|
|
699
358
|
logger.info(f"Using all {len(self.snp_name)} SNPs (no filter applied)")
|
700
359
|
logger.warning("No keep_snp_root provided, all SNPs will be used to calculate w_ld.")
|
701
360
|
|
702
|
-
def _process_additional_baseline(self, chrom: int):
|
361
|
+
def _process_additional_baseline(self, chrom: int, plink_bed):
|
703
362
|
"""
|
704
363
|
Process additional baseline annotations.
|
705
364
|
|
@@ -707,6 +366,8 @@ class LDScoreCalculator:
|
|
707
366
|
----------
|
708
367
|
chrom : int
|
709
368
|
Chromosome number
|
369
|
+
plink_bed : PlinkBEDFile
|
370
|
+
Initialized PlinkBEDFile object
|
710
371
|
"""
|
711
372
|
# Load additional baseline annotations
|
712
373
|
additional_baseline_path = Path(self.config.additional_baseline_annotation)
|
@@ -736,25 +397,44 @@ class LDScoreCalculator:
|
|
736
397
|
f"{missing_count} SNPs not found in additional baseline annotations. "
|
737
398
|
"Setting their values to 0."
|
738
399
|
)
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
self.snp_gene_weight_matrix, additional_ldscore = (
|
747
|
-
calculate_ldscore_from_multiple_annotation(
|
748
|
-
[self.snp_gene_pair_dummy, additional_baseline_df],
|
749
|
-
chrom,
|
750
|
-
self.config.bfile_root,
|
751
|
-
ld_wind=self.config.ld_wind,
|
752
|
-
ld_unit=self.config.ld_unit,
|
753
|
-
)
|
400
|
+
additional_baseline_df = additional_baseline_df.reindex(
|
401
|
+
self.snp_gene_pair_dummy.index, fill_value=0
|
402
|
+
)
|
403
|
+
|
404
|
+
# Combine annotations into a single matrix
|
405
|
+
combined_annotations = pd.concat(
|
406
|
+
[self.snp_gene_pair_dummy, additional_baseline_df], axis=1
|
754
407
|
)
|
755
408
|
|
756
|
-
#
|
757
|
-
|
409
|
+
# Calculate LD scores
|
410
|
+
ld_scores = plink_bed.get_ldscore(
|
411
|
+
annot_matrix=combined_annotations.values.astype(np.float32, copy=False),
|
412
|
+
ld_wind=self.config.ld_wind,
|
413
|
+
ld_unit=self.config.ld_unit,
|
414
|
+
)
|
415
|
+
|
416
|
+
# Split results
|
417
|
+
# total_cols = combined_annotations.shape[1]
|
418
|
+
gene_cols = self.snp_gene_pair_dummy.shape[1]
|
419
|
+
# baseline_cols = additional_baseline_df.shape[1]
|
420
|
+
|
421
|
+
# Create DataFrames with proper indices and columns
|
422
|
+
self.snp_gene_weight_matrix = pd.DataFrame(
|
423
|
+
ld_scores[:, :gene_cols],
|
424
|
+
index=combined_annotations.index,
|
425
|
+
columns=self.snp_gene_pair_dummy.columns,
|
426
|
+
)
|
427
|
+
|
428
|
+
additional_ldscore = pd.DataFrame(
|
429
|
+
ld_scores[:, gene_cols:],
|
430
|
+
index=combined_annotations.index,
|
431
|
+
columns=additional_baseline_df.columns,
|
432
|
+
)
|
433
|
+
|
434
|
+
# Filter by keep_snp_mask if specified
|
435
|
+
if self.keep_snp_mask is not None:
|
436
|
+
additional_ldscore = additional_ldscore[self.keep_snp_mask]
|
437
|
+
self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
|
758
438
|
|
759
439
|
# Save additional baseline LD scores
|
760
440
|
ld_score_file = f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather"
|
@@ -799,7 +479,7 @@ class LDScoreCalculator:
|
|
799
479
|
save_path = save_dir / f"{chrom}.snp_gene_weight_matrix.feather"
|
800
480
|
self.snp_gene_weight_matrix.reset_index().to_feather(save_path)
|
801
481
|
|
802
|
-
def _calculate_baseline_ldscores(self, chrom: int):
|
482
|
+
def _calculate_baseline_ldscores(self, chrom: int, plink_bed):
|
803
483
|
"""
|
804
484
|
Calculate and save baseline LD scores.
|
805
485
|
|
@@ -807,6 +487,8 @@ class LDScoreCalculator:
|
|
807
487
|
----------
|
808
488
|
chrom : int
|
809
489
|
Chromosome number
|
490
|
+
plink_bed : PlinkBEDFile
|
491
|
+
Initialized PlinkBEDFile object
|
810
492
|
"""
|
811
493
|
# Create baseline scores
|
812
494
|
baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
|
@@ -824,7 +506,9 @@ class LDScoreCalculator:
|
|
824
506
|
m_5_file = f"{self.config.ldscore_save_dir}/baseline/baseline.{chrom}.l2.M_5_50"
|
825
507
|
|
826
508
|
# Calculate LD scores
|
827
|
-
ldscore_chunk = self._calculate_ldscore_from_weights(baseline_df, drop_dummy_na=False)
|
509
|
+
ldscore_chunk = self._calculate_ldscore_from_weights(
|
510
|
+
baseline_df, plink_bed, drop_dummy_na=False
|
511
|
+
)
|
828
512
|
|
829
513
|
# Save LD scores and M values
|
830
514
|
self._save_ldscore_to_feather(
|
@@ -842,9 +526,9 @@ class LDScoreCalculator:
|
|
842
526
|
|
843
527
|
# If keep_snp_root is not provided, use the first column of baseline ldscore as w_ld
|
844
528
|
if not self.config.keep_snp_root:
|
845
|
-
self._save_baseline_as_w_ld(chrom, ldscore_chunk)
|
529
|
+
self._save_baseline_as_w_ld(chrom, ldscore_chunk, plink_bed)
|
846
530
|
|
847
|
-
def _save_baseline_as_w_ld(self, chrom: int, ldscore_chunk: np.ndarray):
|
531
|
+
def _save_baseline_as_w_ld(self, chrom: int, ldscore_chunk: np.ndarray, plink_bed):
|
848
532
|
"""
|
849
533
|
Save the first column of baseline ldscore as w_ld.
|
850
534
|
|
@@ -854,6 +538,8 @@ class LDScoreCalculator:
|
|
854
538
|
Chromosome number
|
855
539
|
ldscore_chunk : np.ndarray
|
856
540
|
Array with baseline LD scores
|
541
|
+
plink_bed : PlinkBEDFile
|
542
|
+
Initialized PlinkBEDFile object
|
857
543
|
"""
|
858
544
|
logger.info(f"Using first column of baseline ldscore as w_ld for chr{chrom}")
|
859
545
|
|
@@ -867,23 +553,24 @@ class LDScoreCalculator:
|
|
867
553
|
# Extract the first column
|
868
554
|
w_ld_values = ldscore_chunk[:, 0]
|
869
555
|
|
870
|
-
# Create a DataFrame
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
|
556
|
+
# Create a DataFrame with SNP information from the BIM file
|
557
|
+
snp_indices = (
|
558
|
+
plink_bed.kept_snps
|
559
|
+
if hasattr(plink_bed, "kept_snps")
|
560
|
+
else np.arange(len(self.snp_name))
|
876
561
|
)
|
562
|
+
bim_subset = plink_bed.bim_df.iloc[snp_indices]
|
563
|
+
|
877
564
|
w_ld_df = pd.DataFrame(
|
878
565
|
{
|
879
566
|
"SNP": self.snp_name,
|
880
567
|
"L2": w_ld_values,
|
568
|
+
"CHR": bim_subset.CHR.values[: len(self.snp_name)], # Ensure length matches
|
569
|
+
"BP": bim_subset.BP.values[: len(self.snp_name)],
|
570
|
+
"CM": bim_subset.CM.values[: len(self.snp_name)],
|
881
571
|
}
|
882
572
|
)
|
883
573
|
|
884
|
-
# Add CHR, BP, and CM information
|
885
|
-
w_ld_df = w_ld_df.merge(bim_data[["SNP", "CHR", "BP", "CM"]], on="SNP", how="left")
|
886
|
-
|
887
574
|
# Reorder columns
|
888
575
|
w_ld_df = w_ld_df[["CHR", "SNP", "BP", "CM", "L2"]]
|
889
576
|
|
@@ -891,7 +578,7 @@ class LDScoreCalculator:
|
|
891
578
|
|
892
579
|
logger.info(f"Saved w_ld for chr{chrom} to {w_ld_file}")
|
893
580
|
|
894
|
-
def _calculate_annotation_ldscores(self, chrom: int):
|
581
|
+
def _calculate_annotation_ldscores(self, chrom: int, plink_bed):
|
895
582
|
"""
|
896
583
|
Calculate and save LD scores for spatial annotations.
|
897
584
|
|
@@ -899,6 +586,8 @@ class LDScoreCalculator:
|
|
899
586
|
----------
|
900
587
|
chrom : int
|
901
588
|
Chromosome number
|
589
|
+
plink_bed : PlinkBEDFile
|
590
|
+
Initialized PlinkBEDFile object
|
902
591
|
"""
|
903
592
|
# Get marker scores for gene columns (excluding dummy NA column)
|
904
593
|
mk_scores = self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]]
|
@@ -921,7 +610,7 @@ class LDScoreCalculator:
|
|
921
610
|
m_5_file = f"{self.config.ldscore_save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50"
|
922
611
|
|
923
612
|
# Calculate LD scores
|
924
|
-
ldscore_chunk = self._calculate_ldscore_from_weights(mk_score_chunk)
|
613
|
+
ldscore_chunk = self._calculate_ldscore_from_weights(mk_score_chunk, plink_bed)
|
925
614
|
|
926
615
|
# Save LD scores based on format
|
927
616
|
if self.config.ldscore_save_format == "feather":
|
@@ -930,12 +619,6 @@ class LDScoreCalculator:
|
|
930
619
|
column_names=mk_score_chunk.columns,
|
931
620
|
save_file_name=ld_score_file,
|
932
621
|
)
|
933
|
-
elif self.config.ldscore_save_format == "zarr":
|
934
|
-
self._save_ldscore_chunk_to_zarr(
|
935
|
-
ldscore_chunk,
|
936
|
-
chrom=chrom,
|
937
|
-
start_col_index=i,
|
938
|
-
)
|
939
622
|
else:
|
940
623
|
raise ValueError(f"Invalid ldscore_save_format: {self.config.ldscore_save_format}")
|
941
624
|
|
@@ -954,7 +637,7 @@ class LDScoreCalculator:
|
|
954
637
|
gc.collect()
|
955
638
|
|
956
639
|
def _calculate_ldscore_from_weights(
|
957
|
-
self, marker_scores: pd.DataFrame, drop_dummy_na: bool = True
|
640
|
+
self, marker_scores: pd.DataFrame, plink_bed, drop_dummy_na: bool = True
|
958
641
|
) -> np.ndarray:
|
959
642
|
"""
|
960
643
|
Calculate LD scores using SNP-gene weight matrix.
|
@@ -963,6 +646,8 @@ class LDScoreCalculator:
|
|
963
646
|
----------
|
964
647
|
marker_scores : pd.DataFrame
|
965
648
|
DataFrame with marker scores
|
649
|
+
plink_bed : PlinkBEDFile
|
650
|
+
Initialized PlinkBEDFile object
|
966
651
|
drop_dummy_na : bool, optional
|
967
652
|
Whether to drop the dummy NA column, by default True
|
968
653
|
|
@@ -1015,37 +700,6 @@ class LDScoreCalculator:
|
|
1015
700
|
df.index.name = "SNP"
|
1016
701
|
df.reset_index().to_feather(save_file_name)
|
1017
702
|
|
1018
|
-
def _save_ldscore_chunk_to_zarr(
|
1019
|
-
self, ldscore_data: np.ndarray, chrom: int, start_col_index: int
|
1020
|
-
):
|
1021
|
-
"""
|
1022
|
-
Save LD scores to a zarr array.
|
1023
|
-
|
1024
|
-
Parameters
|
1025
|
-
----------
|
1026
|
-
ldscore_data : np.ndarray
|
1027
|
-
Array with LD scores
|
1028
|
-
chrom : int
|
1029
|
-
Chromosome number
|
1030
|
-
start_col_index : int
|
1031
|
-
Starting column index in the zarr array
|
1032
|
-
"""
|
1033
|
-
# Convert to float16 for storage efficiency
|
1034
|
-
ldscore_data = ldscore_data.astype(np.float16, copy=False)
|
1035
|
-
|
1036
|
-
# Handle numerical overflow
|
1037
|
-
ldscore_data[np.isinf(ldscore_data)] = np.finfo(np.float16).max
|
1038
|
-
|
1039
|
-
# Get start and end indices for this chromosome
|
1040
|
-
chrom_start = self.chrom_snp_start_point[chrom - 1]
|
1041
|
-
chrom_end = self.chrom_snp_start_point[chrom]
|
1042
|
-
|
1043
|
-
# Save to zarr array
|
1044
|
-
self.zarr_file[
|
1045
|
-
chrom_start:chrom_end,
|
1046
|
-
start_col_index : start_col_index + ldscore_data.shape[1],
|
1047
|
-
] = ldscore_data
|
1048
|
-
|
1049
703
|
def _calculate_and_save_m_values(
|
1050
704
|
self,
|
1051
705
|
marker_scores: pd.DataFrame,
|
@@ -1090,7 +744,7 @@ class LDScoreCalculator:
|
|
1090
744
|
np.savetxt(m_file_path, m_values, delimiter="\t")
|
1091
745
|
np.savetxt(m_5_file_path, m_5_values, delimiter="\t")
|
1092
746
|
|
1093
|
-
def _get_snp_gene_dummy(self, chrom: int) -> pd.DataFrame:
|
747
|
+
def _get_snp_gene_dummy(self, chrom: int, plink_bed) -> pd.DataFrame:
|
1094
748
|
"""
|
1095
749
|
Get dummy matrix for SNP-gene pairs.
|
1096
750
|
|
@@ -1098,6 +752,7 @@ class LDScoreCalculator:
|
|
1098
752
|
----------
|
1099
753
|
chrom : int
|
1100
754
|
Chromosome number
|
755
|
+
plink_bed : PlinkBEDFile
|
1101
756
|
|
1102
757
|
Returns
|
1103
758
|
-------
|
@@ -1107,7 +762,8 @@ class LDScoreCalculator:
|
|
1107
762
|
logger.info(f"Creating SNP-gene mappings for chromosome {chrom}")
|
1108
763
|
|
1109
764
|
# Load BIM file
|
1110
|
-
bim, bim_pr = load_bim(self.config.bfile_root, chrom)
|
765
|
+
bim = plink_bed.bim_df
|
766
|
+
bim_pr = plink_bed.convert_bim_to_pyrange(bim)
|
1111
767
|
|
1112
768
|
# Determine mapping strategy
|
1113
769
|
if self.config.gene_window_enhancer_priority in ["gene_window_first", "enhancer_first"]:
|