gsMap 1.73.0__py3-none-any.whl → 1.73.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN/train.py +1 -1
- gsMap/__init__.py +1 -1
- gsMap/config.py +29 -16
- gsMap/create_slice_mean.py +1 -0
- gsMap/diagnosis.py +18 -18
- gsMap/find_latent_representation.py +18 -2
- gsMap/generate_ldscore.py +1068 -441
- gsMap/latent_to_gene.py +15 -5
- gsMap/run_all_mode.py +1 -0
- gsMap/utils/generate_r2_matrix.py +2 -2
- gsMap/utils/manhattan_plot.py +15 -7
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/METADATA +9 -1
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/RECORD +16 -16
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/WHEEL +1 -1
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/entry_points.txt +0 -0
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/licenses/LICENSE +0 -0
gsMap/generate_ldscore.py
CHANGED
@@ -1,3 +1,11 @@
+"""
+Module for generating LD scores for each spot in spatial transcriptomics data.
+
+This module is responsible for assigning gene specificity scores to SNPs
+and computing stratified LD scores that will be used for spatial LDSC analysis.
+"""
+
+import gc
 import logging
 import warnings
 from pathlib import Path
@@ -12,108 +20,203 @@ from tqdm import trange
 from gsMap.config import GenerateLDScoreConfig
 from gsMap.utils.generate_r2_matrix import getBlockLefts, load_bfile

-
+# Configure warning behavior more precisely
+warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")
 logger = logging.getLogger(__name__)


-
-
-
+def load_gtf(
+    gtf_file: str, mk_score: pd.DataFrame, window_size: int
+) -> tuple[pr.PyRanges, pd.DataFrame]:
     """
-    Load the gene annotation file (
+    Load and process the gene annotation file (GTF).
+
+    Parameters
+    ----------
+    gtf_file : str
+        Path to the GTF file
+    mk_score : pd.DataFrame
+        DataFrame containing marker scores
+    window_size : int
+        Window size around gene bodies in base pairs
+
+    Returns
+    -------
+    tuple
+        A tuple containing (gtf_pr, mk_score) where:
+        - gtf_pr is a PyRanges object with gene coordinates
+        - mk_score is the filtered marker score DataFrame
     """
-
-
+    logger.info("Loading GTF data from %s", gtf_file)
+
     # Load GTF file
-    gtf = pr.read_gtf(
-        gtf_file,
-    )
+    gtf = pr.read_gtf(gtf_file)
     gtf = gtf.df
-
-    #
+
+    # Filter for gene features
     gtf = gtf[gtf["Feature"] == "gene"]
+
+    # Find common genes between GTF and marker scores
     common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
-
+    logger.info(f"Found {len(common_gene)} common genes between GTF and marker scores")
+
+    # Filter GTF and marker scores to common genes
     gtf = gtf[gtf.gene_name.isin(common_gene)]
     mk_score = mk_score[mk_score.index.isin(common_gene)]
-
-    # Remove duplicated
+
+    # Remove duplicated gene entries
     gtf = gtf.drop_duplicates(subset="gene_name", keep="first")
-
-    # Process the GTF (open
+
+    # Process the GTF (open window around gene coordinates)
     gtf_bed = gtf[["Chromosome", "Start", "End", "gene_name", "Strand"]].copy()
     gtf_bed.loc[:, "TSS"] = gtf_bed["Start"]
     gtf_bed.loc[:, "TED"] = gtf_bed["End"]

+    # Create windows around genes
     gtf_bed.loc[:, "Start"] = gtf_bed["TSS"] - window_size
     gtf_bed.loc[:, "End"] = gtf_bed["TED"] + window_size
     gtf_bed.loc[gtf_bed["Start"] < 0, "Start"] = 0
-
-    #
+
+    # Handle genes on negative strand (swap TSS and TED)
     tss_neg = gtf_bed.loc[gtf_bed["Strand"] == "-", "TSS"]
     ted_neg = gtf_bed.loc[gtf_bed["Strand"] == "-", "TED"]
     gtf_bed.loc[gtf_bed["Strand"] == "-", "TSS"] = ted_neg
     gtf_bed.loc[gtf_bed["Strand"] == "-", "TED"] = tss_neg
     gtf_bed = gtf_bed.drop("Strand", axis=1)
-
-    #
+
+    # Convert to PyRanges
     gtf_pr = pr.PyRanges(gtf_bed)
+
     return gtf_pr, mk_score


-
-def load_marker_score(mk_score_file):
+def load_marker_score(mk_score_file: str) -> pd.DataFrame:
     """
-    Load marker scores
+    Load marker scores from a feather file.
+
+    Parameters
+    ----------
+    mk_score_file : str
+        Path to the marker score feather file
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with marker scores indexed by gene names
     """
     mk_score = pd.read_feather(mk_score_file).set_index("HUMAN_GENE_SYM").rename_axis("gene_name")
     mk_score = mk_score.astype(np.float32, copy=False)
     return mk_score


-
-def load_bim(bfile_root, chrom):
+def load_bim(bfile_root: str, chrom: int) -> tuple[pd.DataFrame, pr.PyRanges]:
     """
-    Load
+    Load PLINK BIM file and convert to a PyRanges object.
+
+    Parameters
+    ----------
+    bfile_root : str
+        Root path for PLINK bfiles
+    chrom : int
+        Chromosome number
+
+    Returns
+    -------
+    tuple
+        A tuple containing (bim_df, bim_pr) where:
+        - bim_df is a pandas DataFrame with BIM data
+        - bim_pr is a PyRanges object with BIM data
     """
-
+    bim_file = f"{bfile_root}.{chrom}.bim"
+    logger.debug(f"Loading BIM file: {bim_file}")
+
+    bim = pd.read_csv(bim_file, sep="\t", header=None)
     bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
-
-    #
+
+    # Convert to PyRanges
     bim_pr = bim.copy()
     bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]

+    # Adjust coordinates (BIM is 1-based, PyRanges uses 0-based)
     bim_pr["End"] = bim_pr["Start"].copy()
-    bim_pr["Start"] = bim_pr["Start"] - 1
+    bim_pr["Start"] = bim_pr["Start"] - 1

     bim_pr = pr.PyRanges(bim_pr)
     bim_pr.Chromosome = f"chr{chrom}"
+
     return bim, bim_pr


-
-def Overlaps_gtf_bim(gtf_pr, bim_pr):
+def overlaps_gtf_bim(gtf_pr: pr.PyRanges, bim_pr: pr.PyRanges) -> pd.DataFrame:
     """
-    Find overlaps between
+    Find overlaps between GTF and BIM data, and select nearest gene for each SNP.
+
+    Parameters
+    ----------
+    gtf_pr : pr.PyRanges
+        PyRanges object with gene coordinates
+    bim_pr : pr.PyRanges
+        PyRanges object with SNP coordinates
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with SNP-gene pairs where each SNP is matched to its closest gene
     """
-    #
+    # Join the PyRanges objects to find overlaps
     overlaps = gtf_pr.join(bim_pr)
     overlaps = overlaps.df
+
+    # Calculate distance to TSS
     overlaps["Distance"] = np.abs(overlaps["Start_b"] - overlaps["TSS"])
-
-
-
+
+    # For each SNP, select the closest gene
+    nearest_genes = overlaps.loc[overlaps.groupby("SNP").Distance.idxmin()]
+
+    return nearest_genes


-
-
-
+def filter_snps_by_keep_snp(bim_df: pd.DataFrame, keep_snp_file: str) -> pd.DataFrame:
+    """
+    Filter BIM DataFrame to keep only SNPs in a provided list.
+
+    Parameters
+    ----------
+    bim_df : pd.DataFrame
+        DataFrame with BIM data
+    keep_snp_file : str
+        Path to a file with SNP IDs to keep
+
+    Returns
+    -------
+    pd.DataFrame
+        Filtered BIM DataFrame
+    """
+    # Read SNPs to keep
     keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
+
+    # Filter the BIM DataFrame
     filtered_bim_df = bim_df[bim_df["SNP"].isin(keep_snp)]
+
+    logger.info(f"Kept {len(filtered_bim_df)} SNPs out of {len(bim_df)} after filtering")
+
     return filtered_bim_df


-def get_snp_counts(config):
+def get_snp_counts(config: GenerateLDScoreConfig) -> dict:
+    """
+    Count SNPs per chromosome and calculate start positions for zarr arrays.
+
+    Parameters
+    ----------
+    config : GenerateLDScoreConfig
+        Configuration object
+
+    Returns
+    -------
+    dict
+        Dictionary with SNP counts and start positions
+    """
     snp_counts = {}
     total_snp = 0

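The rewritten `load_gtf` above expands each gene body by `window_size` on both sides, clips negative starts at zero, and swaps TSS/TED for genes on the negative strand before handing the ranges to PyRanges. A minimal pandas-only sketch of that same transformation, with made-up toy coordinates (not taken from the package), might look like:

```python
# Sketch of the gene-window expansion performed in load_gtf (toy data, pandas only).
# Column names mirror the diff; the coordinates below are invented for illustration.
import pandas as pd

genes = pd.DataFrame(
    {
        "Chromosome": ["chr1", "chr1"],
        "Start": [1_000, 5_000],
        "End": [2_000, 6_000],
        "gene_name": ["GENE_A", "GENE_B"],
        "Strand": ["+", "-"],
    }
)

window_size = 1_500
genes["TSS"] = genes["Start"]
genes["TED"] = genes["End"]

# Expand the gene body by the window on both sides, clipping at zero.
genes["Start"] = (genes["TSS"] - window_size).clip(lower=0)
genes["End"] = genes["TED"] + window_size

# For genes on the negative strand the TSS sits at the right end, so swap TSS/TED.
neg = genes["Strand"] == "-"
genes.loc[neg, ["TSS", "TED"]] = genes.loc[neg, ["TED", "TSS"]].values

print(genes[["gene_name", "Start", "End", "TSS", "TED"]])
```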
@@ -131,19 +234,34 @@ def get_snp_counts(config):

     snp_counts["total"] = total_snp

+    # Calculate cumulative SNP counts for zarr array indexing
     chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
-
     snp_counts["chrom_snp_start_point"] = [0] + chrom_snp_length_array.tolist()

     return snp_counts


-
-def get_snp_pass_maf(bfile_root, chrom, maf_min=0.05):
+def get_snp_pass_maf(bfile_root: str, chrom: int, maf_min: float = 0.05) -> list[str]:
     """
-    Get the
+    Get SNPs that pass the minimum minor allele frequency (MAF) threshold.
+
+    Parameters
+    ----------
+    bfile_root : str
+        Root path for PLINK bfiles
+    chrom : int
+        Chromosome number
+    maf_min : float, optional
+        Minimum MAF threshold, by default 0.05
+
+    Returns
+    -------
+    list
+        List of SNP IDs that pass the MAF threshold
     """
-    array_snps, array_indivs, geno_array = load_bfile(
+    array_snps, array_indivs, geno_array = load_bfile(
+        bfile_chr_prefix=f"{bfile_root}.{chrom}", mafMin=maf_min
+    )

     m = len(array_snps.IDList)
     n = len(array_indivs.IDList)
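`get_snp_counts` now also records cumulative per-chromosome SNP counts (`chrom_snp_start_point`), which the zarr writer later uses as row offsets into a single genome-wide array. A small sketch with hypothetical counts illustrates the indexing:

```python
# Sketch of how the cumulative start points from get_snp_counts serve as
# row offsets into one tall zarr array (the counts below are made up).
import numpy as np

snp_counts = {1: 120_000, 2: 95_000, 3: 80_000}  # hypothetical per-chromosome SNP counts
lengths = np.array([snp_counts[c] for c in sorted(snp_counts)]).cumsum()
chrom_snp_start_point = [0] + lengths.tolist()

chrom = 2
start = chrom_snp_start_point[chrom - 1]  # first row for chr2
end = chrom_snp_start_point[chrom]        # one past the last row for chr2
print(start, end)  # 120000 215000
```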
@@ -151,16 +269,49 @@ def get_snp_pass_maf(bfile_root, chrom, maf_min=0.05):
         f"Loading genotype data for {m} SNPs and {n} individuals from {bfile_root}.{chrom}"
     )

-
-    snp_pass_maf = array_snps.IDList[
-    logger.info(f"After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain
+    # Filter SNPs by MAF
+    snp_pass_maf = array_snps.IDList.iloc[geno_array.kept_snps]
+    logger.info(f"After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain")
+
     return snp_pass_maf.SNP.to_list()


-def get_ldscore(
-
+def get_ldscore(
+    bfile_root: str,
+    chrom: int,
+    annot_matrix: np.ndarray,
+    ld_wind: float,
+    ld_unit: str = "CM",
+    keep_snps_index: list[int] = None,
+) -> pd.DataFrame:
+    """
+    Calculate LD scores using PLINK data and an annotation matrix.
+
+    Parameters
+    ----------
+    bfile_root : str
+        Root path for PLINK bfiles
+    chrom : int
+        Chromosome number
+    annot_matrix : np.ndarray
+        Annotation matrix
+    ld_wind : float
+        LD window size
+    ld_unit : str, optional
+        Unit for the LD window, by default "CM"
+    keep_snps_index : list[int], optional
+        Indices of SNPs to keep, by default None
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with calculated LD scores
+    """
+    array_snps, array_indivs, geno_array = load_bfile(
+        bfile_chr_prefix=f"{bfile_root}.{chrom}", keep_snps=keep_snps_index
+    )

-    #
+    # Configure LD window based on specified unit
     if ld_unit == "SNP":
         max_dist = ld_wind
         coords = np.array(range(geno_array.m))
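`get_ldscore` turns the LD window (`ld_wind` in SNP, KB, or CM units, falling back to a 1 Mb BP window when all CM values are zero) into a per-SNP left boundary via `getBlockLefts` before calling `ldScoreVarBlocks`. The sketch below re-implements only the boundary semantics for illustration; the real helper lives in `gsMap.utils.generate_r2_matrix` and may differ in detail:

```python
# Illustrative re-implementation of the "block left" boundary used for LD windows:
# block_left[j] is the index of the left-most SNP within max_dist of SNP j.
# (This only mirrors the intent of getBlockLefts; it is not the package's code.)
import numpy as np

def block_lefts(coords: np.ndarray, max_dist: float) -> np.ndarray:
    m = len(coords)
    block_left = np.zeros(m, dtype=int)
    left = 0
    for j in range(m):
        # Advance the left pointer until SNP `left` is within max_dist of SNP j.
        while coords[j] - coords[left] > max_dist:
            left += 1
        block_left[j] = left
    return block_left

cm_positions = np.array([0.0, 0.4, 0.9, 1.6, 2.0])  # hypothetical CM coordinates
print(block_lefts(cm_positions, max_dist=1.0))       # [0 0 0 2 3]
```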
@@ -170,61 +321,141 @@ def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit="CM"):
|
|
170
321
|
elif ld_unit == "CM":
|
171
322
|
max_dist = ld_wind
|
172
323
|
coords = np.array(array_snps.df["CM"])[geno_array.kept_snps]
|
324
|
+
# Check if the CM is all 0
|
325
|
+
if np.all(coords == 0):
|
326
|
+
logger.warning(
|
327
|
+
"All CM values are 0 in the BIM file. Using 1MB window size for LD score calculation."
|
328
|
+
)
|
329
|
+
max_dist = 1_000_000
|
330
|
+
coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
|
173
331
|
else:
|
174
|
-
raise ValueError(f"Invalid ld_wind_unit: {ld_unit}")
|
332
|
+
raise ValueError(f"Invalid ld_wind_unit: {ld_unit}. Must be one of: SNP, KB, CM")
|
333
|
+
|
334
|
+
# Calculate blocks for LD computation
|
175
335
|
block_left = getBlockLefts(coords, max_dist)
|
176
|
-
|
177
|
-
|
178
|
-
|
336
|
+
assert block_left.sum() > 0, "Invalid window size, please check the ld_wind parameter."
|
337
|
+
|
338
|
+
# Calculate LD scores
|
339
|
+
ld_scores = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
|
340
|
+
|
341
|
+
return ld_scores
|
179
342
|
|
180
343
|
|
181
|
-
# %%
|
182
344
|
def calculate_ldscore_from_annotation(
|
183
|
-
|
184
|
-
|
345
|
+
snp_annotation_df: pd.DataFrame,
|
346
|
+
chrom: int,
|
347
|
+
bfile_root: str,
|
348
|
+
ld_wind: float = 1,
|
349
|
+
ld_unit: str = "CM",
|
350
|
+
) -> pd.DataFrame:
|
185
351
|
"""
|
186
|
-
Calculate
|
352
|
+
Calculate LD scores from SNP annotation DataFrame.
|
353
|
+
|
354
|
+
Parameters
|
355
|
+
----------
|
356
|
+
snp_annotation_df : pd.DataFrame
|
357
|
+
DataFrame with SNP annotations
|
358
|
+
chrom : int
|
359
|
+
Chromosome number
|
360
|
+
bfile_root : str
|
361
|
+
Root path for PLINK bfiles
|
362
|
+
ld_wind : float, optional
|
363
|
+
LD window size, by default 1
|
364
|
+
ld_unit : str, optional
|
365
|
+
Unit for the LD window, by default "CM"
|
366
|
+
|
367
|
+
Returns
|
368
|
+
-------
|
369
|
+
pd.DataFrame
|
370
|
+
DataFrame with calculated LD scores
|
187
371
|
"""
|
188
|
-
#
|
189
|
-
# Get the SNP-gene weight matrix
|
372
|
+
# Calculate LD scores
|
190
373
|
snp_gene_weight_matrix = get_ldscore(
|
191
|
-
bfile_root, chrom,
|
374
|
+
bfile_root, chrom, snp_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
|
192
375
|
)
|
376
|
+
|
377
|
+
# Set proper data types and indices
|
193
378
|
snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
|
194
|
-
snp_gene_weight_matrix.index =
|
195
|
-
snp_gene_weight_matrix.columns =
|
379
|
+
snp_gene_weight_matrix.index = snp_annotation_df.index
|
380
|
+
snp_gene_weight_matrix.columns = snp_annotation_df.columns
|
381
|
+
|
196
382
|
return snp_gene_weight_matrix
|
197
383
|
|
198
384
|
|
199
385
|
def calculate_ldscore_from_multiple_annotation(
|
200
|
-
|
201
|
-
|
202
|
-
|
386
|
+
snp_annotation_df_list: list[pd.DataFrame],
|
387
|
+
chrom: int,
|
388
|
+
bfile_root: str,
|
389
|
+
ld_wind: float = 1,
|
390
|
+
ld_unit: str = "CM",
|
391
|
+
) -> list[pd.DataFrame]:
|
392
|
+
"""
|
393
|
+
Calculate LD scores from multiple SNP annotation DataFrames.
|
394
|
+
|
395
|
+
Parameters
|
396
|
+
----------
|
397
|
+
snp_annotation_df_list : list
|
398
|
+
List of DataFrames with SNP annotations
|
399
|
+
chrom : int
|
400
|
+
Chromosome number
|
401
|
+
bfile_root : str
|
402
|
+
Root path for PLINK bfiles
|
403
|
+
ld_wind : float, optional
|
404
|
+
LD window size, by default 1
|
405
|
+
ld_unit : str, optional
|
406
|
+
Unit for the LD window, by default "CM"
|
407
|
+
|
408
|
+
Returns
|
409
|
+
-------
|
410
|
+
list
|
411
|
+
List of DataFrames with calculated LD scores
|
412
|
+
"""
|
413
|
+
# Combine annotations
|
414
|
+
combined_annotations = pd.concat(snp_annotation_df_list, axis=1).astype(np.float32, copy=False)
|
203
415
|
|
204
|
-
|
205
|
-
|
416
|
+
# Calculate LD scores
|
417
|
+
combined_ld_scores = get_ldscore(
|
418
|
+
bfile_root, chrom, combined_annotations.values, ld_wind=ld_wind, ld_unit=ld_unit
|
206
419
|
)
|
207
|
-
snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
|
208
|
-
snp_gene_weight_matrix.index = SNP_annotation_df.index
|
209
|
-
snp_gene_weight_matrix.columns = SNP_annotation_df.columns
|
210
|
-
|
211
|
-
# split to each annotation
|
212
|
-
snp_annotation_len_list = [len(df.columns) for df in SNP_annotation_df_list]
|
213
|
-
snp_gene_weight_matrix_list = []
|
214
|
-
start = 0
|
215
|
-
for snp_annotation_len in snp_annotation_len_list:
|
216
|
-
snp_gene_weight_matrix_list.append(
|
217
|
-
snp_gene_weight_matrix.iloc[:, start : start + snp_annotation_len]
|
218
|
-
)
|
219
|
-
start += snp_annotation_len
|
220
|
-
return snp_gene_weight_matrix_list
|
221
420
|
|
421
|
+
# Apply proper indices and columns
|
422
|
+
combined_ld_scores.index = combined_annotations.index
|
423
|
+
combined_ld_scores.columns = combined_annotations.columns
|
424
|
+
|
425
|
+
# Split back into separate DataFrames
|
426
|
+
annotation_lengths = [len(df.columns) for df in snp_annotation_df_list]
|
427
|
+
result_dataframes = []
|
428
|
+
start_col = 0
|
429
|
+
|
430
|
+
for length in annotation_lengths:
|
431
|
+
end_col = start_col + length
|
432
|
+
result_dataframes.append(combined_ld_scores.iloc[:, start_col:end_col])
|
433
|
+
start_col = end_col
|
434
|
+
|
435
|
+
return result_dataframes
|
436
|
+
|
437
|
+
|
438
|
+
class LDScoreCalculator:
|
439
|
+
"""
|
440
|
+
Class for calculating LD scores from gene specificity scores.
|
441
|
+
|
442
|
+
This class handles the assignment of gene specificity scores to SNPs
|
443
|
+
and the calculation of LD scores.
|
444
|
+
"""
|
222
445
|
|
223
|
-
# %%
|
224
|
-
class S_LDSC_Boost:
|
225
446
|
def __init__(self, config: GenerateLDScoreConfig):
|
447
|
+
"""
|
448
|
+
Initialize LDScoreCalculator.
|
449
|
+
|
450
|
+
Parameters
|
451
|
+
----------
|
452
|
+
config : GenerateLDScoreConfig
|
453
|
+
Configuration object
|
454
|
+
"""
|
226
455
|
self.config = config
|
456
|
+
self.validate_config()
|
227
457
|
|
458
|
+
# Load marker scores
|
228
459
|
self.mk_score = load_marker_score(config.mkscore_feather_path)
|
229
460
|
|
230
461
|
# Load GTF and get common markers
|
@@ -232,499 +463,895 @@ class S_LDSC_Boost:
             config.gtf_annotation_file, self.mk_score, window_size=config.gene_window_size
         )

-        #
-        if config.enhancer_annotation_file
-
-
-
-
-
-
-
-
-
-            on="gene_name",
+        # Initialize enhancer data if provided
+        self.enhancer_pr = self._initialize_enhancer() if config.enhancer_annotation_file else None
+
+        # Initialize zarr file if needed
+        self._initialize_zarr_if_needed()
+
+    def validate_config(self):
+        """Validate configuration parameters."""
+        if not Path(self.config.mkscore_feather_path).exists():
+            raise FileNotFoundError(
+                f"Marker score file not found: {self.config.mkscore_feather_path}"
             )

-
-
-            "
-
+        if not Path(self.config.gtf_annotation_file).exists():
+            raise FileNotFoundError(
+                f"GTF annotation file not found: {self.config.gtf_annotation_file}"
+            )

-
-        self.
+        if (
+            self.config.enhancer_annotation_file
+            and not Path(self.config.enhancer_annotation_file).exists()
+        ):
+            raise FileNotFoundError(
+                f"Enhancer annotation file not found: {self.config.enhancer_annotation_file}"
+            )

-
-
+    def _initialize_enhancer(self) -> pr.PyRanges:
+        """
+        Initialize enhancer data.
+
+        Returns
+        -------
+        pr.PyRanges
+            PyRanges object with enhancer data
+        """
+        # Load enhancer data
+        enhancer_df = pr.read_bed(self.config.enhancer_annotation_file, as_df=True)
+        enhancer_df.set_index("Name", inplace=True)
+        enhancer_df.index.name = "gene_name"
+
+        # Keep common genes and add marker score information
+        avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=["avg_mkscore"])
+        enhancer_df = enhancer_df.join(
+            avg_mkscore,
+            how="inner",
+            on="gene_name",
+        )
+
+        # Add TSS information
+        enhancer_df["TSS"] = self.gtf_pr.df.set_index("gene_name").reindex(enhancer_df.index)[
+            "TSS"
+        ]

-        #
-
-
+        # Convert to PyRanges
+        return pr.PyRanges(enhancer_df.reset_index())
+
+    def _initialize_zarr_if_needed(self):
+        """Initialize zarr file if zarr format is specified."""
+        if self.config.ldscore_save_format == "zarr":
+            chrom_snp_length_dict = get_snp_counts(self.config)
             self.chrom_snp_start_point = chrom_snp_length_dict["chrom_snp_start_point"]

-            zarr_path =
+            zarr_path = (
+                Path(self.config.ldscore_save_dir) / f"{self.config.sample_name}.ldscore.zarr"
+            )
+
             if not zarr_path.exists():
                 self.zarr_file = zarr.open(
                     zarr_path.as_posix(),
                     mode="a",
                     dtype=np.float16,
-                    chunks=config.zarr_chunk_size,
+                    chunks=self.config.zarr_chunk_size,
                     shape=(chrom_snp_length_dict["total"], self.mk_score_common.shape[1]),
                 )
-                zarr_path.mkdir(parents=True, exist_ok=True)
-
+                zarr_path.parent.mkdir(parents=True, exist_ok=True)
+
+                # Save metadata
                 self.zarr_file.attrs["spot_names"] = self.mk_score_common.columns.to_list()
-                # save chrom_snp_length_dict
                 self.zarr_file.attrs["chrom_snp_start_point"] = self.chrom_snp_start_point
+
             else:
                 self.zarr_file = zarr.open(zarr_path.as_posix(), mode="a")

     def process_chromosome(self, chrom: int):
+        """
+        Process a single chromosome to calculate LD scores.
+
+        Parameters
+        ----------
+        chrom : int
+            Chromosome number
+        """
+        logger.info(f"Processing chromosome {chrom}")
+
+        # Get SNPs passing MAF filter
         self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)

-        # Get SNP-
-        self.snp_gene_pair_dummy = self.
+        # Get SNP-gene dummy pairs
+        self.snp_gene_pair_dummy = self._get_snp_gene_dummy(chrom)
+
+        # Apply SNP filter if provided
+        self._apply_snp_filter(chrom)
+
+        # Process additional baseline annotations if provided
+        if self.config.additional_baseline_annotation:
+            self._process_additional_baseline(chrom)
+        else:
+            # Calculate SNP-gene weight matrix
+            self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(
+                self.snp_gene_pair_dummy,
+                chrom,
+                self.config.bfile_root,
+                ld_wind=self.config.ld_wind,
+                ld_unit=self.config.ld_unit,
+            )
+
+            # Apply SNP filter if needed
+            if self.keep_snp_mask is not None:
+                self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
+
+            # Generate w_ld file if keep_snp_root is provided
+            if self.config.keep_snp_root:
+                self._generate_w_ld(chrom)
+
+        # Save pre-calculated SNP-gene weight matrix if requested
+        self._save_snp_gene_weight_matrix_if_needed(chrom)
+
+        # Convert to sparse matrix for memory efficiency
+        self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)
+        logger.info(f"SNP-gene weight matrix shape: {self.snp_gene_weight_matrix.shape}")
+
+        # Calculate baseline LD scores
+        logger.info(f"Calculating baseline LD scores for chr{chrom}")
+        self._calculate_baseline_ldscores(chrom)
+
+        # Calculate LD scores for annotation
+        logger.info(f"Calculating annotation LD scores for chr{chrom}")
+        self._calculate_annotation_ldscores(chrom)
+
+        # Clear memory
+        self._clear_memory()
+
+    def _generate_w_ld(self, chrom: int):
+        """
+        Generate w_ld file for the chromosome using filtered SNPs.
+
+        Parameters
+        ----------
+        chrom : int
+            Chromosome number
+        """
+        if not self.config.keep_snp_root:
+            logger.info(
+                f"Skipping w_ld generation for chr{chrom} as keep_snp_root is not provided"
+            )
+            return
+
+        logger.info(f"Generating w_ld for chr{chrom}")
+
+        # Get the indices of SNPs to keep based on the keep_snp_mask
+        keep_snps_index = np.nonzero(self.keep_snp_mask)[0]
+
+        # Create a simple unit annotation (all ones) for the filtered SNPs
+        unit_annotation = np.ones((len(keep_snps_index), 1))
+
+        # Calculate LD scores using the filtered SNPs
+        w_ld_scores = get_ldscore(
+            self.config.bfile_root,
             chrom,
+            unit_annotation,
+            ld_wind=self.config.ld_wind,
+            ld_unit=self.config.ld_unit,
+            keep_snps_index=keep_snps_index.tolist(),
         )

+        # Load the BIM file to get SNP information
+        bim_data = pd.read_csv(
+            f"{self.config.bfile_root}.{chrom}.bim",
+            sep="\t",
+            header=None,
+            names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
+        )
+
+        # Get SNP names for the kept indices
+        kept_snp_names = bim_data.iloc[keep_snps_index].SNP.tolist()
+
+        # Create the w_ld DataFrame
+        w_ld_df = pd.DataFrame(
+            {
+                "SNP": kept_snp_names,
+                "L2": w_ld_scores.values.flatten(),
+                "CHR": bim_data.iloc[keep_snps_index].CHR.values,
+                "BP": bim_data.iloc[keep_snps_index].BP.values,
+                "CM": bim_data.iloc[keep_snps_index].CM.values,
+            }
+        )
+
+        # Reorder columns
+        w_ld_df = w_ld_df[["CHR", "SNP", "BP", "CM", "L2"]]
+
+        # Save to feather format
+        w_ld_dir = Path(self.config.ldscore_save_dir) / "w_ld"
+        w_ld_dir.mkdir(parents=True, exist_ok=True)
+        w_ld_file = w_ld_dir / f"weights.{chrom}.l2.ldscore.gz"
+        w_ld_df.to_csv(w_ld_file, sep="\t", index=False, compression="gzip")
+
+        logger.info(f"Saved w_ld for chr{chrom} to {w_ld_file}")
+
+    def _apply_snp_filter(self, chrom: int):
+        """
+        Apply SNP filter based on keep_snp_root.
+
+        Parameters
+        ----------
+        chrom : int
+            Chromosome number
+        """
         if self.config.keep_snp_root is not None:
-
-
-            ].to_list()
+            keep_snp_file = f"{self.config.keep_snp_root}.{chrom}.snp"
+            keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
             self.keep_snp_mask = self.snp_gene_pair_dummy.index.isin(keep_snp)
-            # the SNP name of keeped
             self.snp_name = self.snp_gene_pair_dummy.index[self.keep_snp_mask].to_list()
+            logger.info(f"Kept {len(self.snp_name)} SNPs after filtering with {keep_snp_file}")
+            logger.info("These filtered SNPs will be used to calculate w_ld")
         else:
             self.keep_snp_mask = None
             self.snp_name = self.snp_gene_pair_dummy.index.to_list()
+            logger.info(f"Using all {len(self.snp_name)} SNPs (no filter applied)")
+            logger.warning("No keep_snp_root provided, all SNPs will be used to calculate w_ld.")

-
-
-
-                additional_baseline_annotation / f"baseline.{chrom}.annot.gz"
-            )
-            assert additional_baseline_annotation_file_path.exists(), (
-                f"additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}"
-            )
-            additional_baseline_annotation_df = pd.read_csv(
-                additional_baseline_annotation_file_path, sep="\t"
-            )
-            additional_baseline_annotation_df.set_index("SNP", inplace=True)
-
-            # drop these columns if exists CHR BP CM]
-            additional_baseline_annotation_df.drop(
-                ["CHR", "BP", "CM"], axis=1, inplace=True, errors="ignore"
-            )
-
-            # reindex, for those SNPs not in additional_baseline_annotation_df, set to 0
-            num_of_not_exist_snp = (
-                ~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)
-            ).sum()
-            if num_of_not_exist_snp > 0:
-                logger.warning(
-                    f"{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0"
-                )
-                additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
-                    self.snp_gene_pair_dummy.index, fill_value=0
-                )
-            else:
-                additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
-                    self.snp_gene_pair_dummy.index
-                )
+    def _process_additional_baseline(self, chrom: int):
+        """
+        Process additional baseline annotations.

-
-
-
-
-
-
-
-
-
+        Parameters
+        ----------
+        chrom : int
+            Chromosome number
+        """
+        # Load additional baseline annotations
+        additional_baseline_path = Path(self.config.additional_baseline_annotation)
+        annot_file_path = additional_baseline_path / f"baseline.{chrom}.annot.gz"
+
+        # Verify file existence
+        if not annot_file_path.exists():
+            raise FileNotFoundError(
+                f"Additional baseline annotation file not found: {annot_file_path}"
             )

-
-
-
-            # print(additional_baseline_annotation_ldscore.index.to_list()==self.snp_name)
+        # Load annotations
+        additional_baseline_df = pd.read_csv(annot_file_path, sep="\t")
+        additional_baseline_df.set_index("SNP", inplace=True)

-
-
-
-
-            M_5_file_path = (
-                f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50"
-            )
+        # Drop unnecessary columns
+        for col in ["CHR", "BP", "CM"]:
+            if col in additional_baseline_df.columns:
+                additional_baseline_df.drop(col, axis=1, inplace=True)

-
-
-
-                column_names=additional_baseline_annotation_ldscore.columns,
-                save_file_name=ld_score_file,
-            )
+        # Check for SNPs not in the additional baseline
+        missing_snps = ~self.snp_gene_pair_dummy.index.isin(additional_baseline_df.index)
+        missing_count = missing_snps.sum()

-
-
-
-
-            M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(
-                axis=0, keepdims=True
-            )
-            np.savetxt(
-                M_file_path,
-                M_chr_chunk,
-                delimiter="\t",
+        if missing_count > 0:
+            logger.warning(
+                f"{missing_count} SNPs not found in additional baseline annotations. "
+                "Setting their values to 0."
             )
-
-
-                M_5_chr_chunk,
-                delimiter="\t",
+            additional_baseline_df = additional_baseline_df.reindex(
+                self.snp_gene_pair_dummy.index, fill_value=0
             )
-
         else:
-
-
-
+            additional_baseline_df = additional_baseline_df.reindex(self.snp_gene_pair_dummy.index)
+
+        # Calculate LD scores for both annotation sets together
+        self.snp_gene_weight_matrix, additional_ldscore = (
+            calculate_ldscore_from_multiple_annotation(
+                [self.snp_gene_pair_dummy, additional_baseline_df],
                 chrom,
                 self.config.bfile_root,
                 ld_wind=self.config.ld_wind,
                 ld_unit=self.config.ld_unit,
             )
-
-        if self.keep_snp_mask is not None:
-            self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
+        )

-
-
-            Path(self.config.ldscore_save_dir) / "snp_gene_weight_matrix"
-        )
-        snp_gene_weight_matrix_save_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Saving snp_gene_weight_matrix for chr{chrom}...")
-        self.snp_gene_weight_matrix.reset_index().to_feather(
-            snp_gene_weight_matrix_save_dir / f"{chrom}.snp_gene_weight_matrix.feather"
-        )
+        # Filter additional ldscore
+        additional_ldscore = additional_ldscore.loc[self.snp_name]

-        #
-
-
-
+        # Save additional baseline LD scores
+        ld_score_file = f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather"
+        m_file_path = f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M"
+        m_5_file_path = (
+            f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50"
        )
+        Path(m_file_path).parent.mkdir(parents=True, exist_ok=True)

-        #
-
-
-
+        # Save LD scores
+        self._save_ldscore_to_feather(
+            additional_ldscore.values,
+            column_names=additional_ldscore.columns,
+            save_file_name=ld_score_file,
         )

-        #
-
-        self.
-
-            chrom,
-            self.config.sample_name,
-            self.config.ldscore_save_dir,
+        # Calculate and save M values
+        m_chr_chunk = additional_baseline_df.values.sum(axis=0, keepdims=True)
+        m_5_chr_chunk = additional_baseline_df.loc[self.snp_pass_maf].values.sum(
+            axis=0, keepdims=True
         )

-
-
-
-        drop_dummy_na=True,
-    ):
-        if drop_dummy_na:
-            ldscore_chr_chunk = self.snp_gene_weight_matrix[:, :-1] @ mk_score_chunk
-        else:
-            ldscore_chr_chunk = self.snp_gene_weight_matrix @ mk_score_chunk
+        # Save M statistics
+        np.savetxt(m_file_path, m_chr_chunk, delimiter="\t")
+        np.savetxt(m_5_file_path, m_5_chr_chunk, delimiter="\t")

-
+    def _save_snp_gene_weight_matrix_if_needed(self, chrom: int):
+        """
+        Save pre-calculated SNP-gene weight matrix if requested.

-
-
-
+        Parameters
+        ----------
+        chrom : int
+            Chromosome number
+        """
+        if self.config.save_pre_calculate_snp_gene_weight_matrix:
+            save_dir = Path(self.config.ldscore_save_dir) / "snp_gene_weight_matrix"
+            save_dir.mkdir(parents=True, exist_ok=True)

-
-        # avoid overflow of float16, if inf, set to max of float16
-        ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
-        # ldscore_chr_chunk = ldscore_chr_chunk if self.config.keep_snp_root is None else ldscore_chr_chunk[
-        #     self.keep_snp_mask]
+            logger.info(f"Saving SNP-gene weight matrix for chr{chrom}")

-
-
-
-
-
+            save_path = save_dir / f"{chrom}.snp_gene_weight_matrix.feather"
+            self.snp_gene_weight_matrix.reset_index().to_feather(save_path)
+
+    def _calculate_baseline_ldscores(self, chrom: int):
+        """
+        Calculate and save baseline LD scores.
+
+        Parameters
+        ----------
+        chrom : int
+            Chromosome number
+        """
+        # Create baseline scores
+        baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
+        baseline_mk_score[-1, 0] = 0  # all_gene column
+
+        baseline_df = pd.DataFrame(
+            baseline_mk_score, index=self.snp_gene_pair_dummy.columns, columns=["all_gene", "base"]
         )
-        df.index.name = "SNP"
-        df.reset_index().to_feather(save_file_name)

-
-
-
-
-
-
-        ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
-        # avoid overflow of float16, if inf, set to max of float16
-        ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
+        # Define file paths
+        ld_score_file = (
+            f"{self.config.ldscore_save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather"
+        )
+        m_file = f"{self.config.ldscore_save_dir}/baseline/baseline.{chrom}.l2.M"
+        m_5_file = f"{self.config.ldscore_save_dir}/baseline/baseline.{chrom}.l2.M_5_50"

-        #
-
-        chrom_snp_end_point = self.chrom_snp_start_point[chrom]
+        # Calculate LD scores
+        ldscore_chunk = self._calculate_ldscore_from_weights(baseline_df, drop_dummy_na=False)

-
-
-
-
+        # Save LD scores and M values
+        self._save_ldscore_to_feather(
+            ldscore_chunk,
+            column_names=baseline_df.columns,
+            save_file_name=ld_score_file,
+        )

-
-
-
-
-
-
-
+        self._calculate_and_save_m_values(
+            baseline_df,
+            m_file,
+            m_5_file,
+            drop_dummy_na=False,
+        )
+
+        # If keep_snp_root is not provided, use the first column of baseline ldscore as w_ld
+        if not self.config.keep_snp_root:
+            self._save_baseline_as_w_ld(chrom, ldscore_chunk)
+
+    def _save_baseline_as_w_ld(self, chrom: int, ldscore_chunk: np.ndarray):
         """
-
+        Save the first column of baseline ldscore as w_ld.
+
+        Parameters
+        ----------
+        chrom : int
+            Chromosome number
+        ldscore_chunk : np.ndarray
+            Array with baseline LD scores
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            M_file_path,
-            M_chr_chunk,
-            delimiter="\t",
+        logger.info(f"Using first column of baseline ldscore as w_ld for chr{chrom}")
+
+        # Create w_ld directory
+        w_ld_dir = Path(self.config.ldscore_save_dir) / "w_ld"
+        w_ld_dir.mkdir(parents=True, exist_ok=True)
+
+        # Define file path
+        w_ld_file = w_ld_dir / f"weights.{chrom}.l2.ldscore.gz"
+
+        # Extract the first column
+        w_ld_values = ldscore_chunk[:, 0]
+
+        # Create a DataFrame
+        bim_data = pd.read_csv(
+            f"{self.config.bfile_root}.{chrom}.bim",
+            sep="\t",
+            header=None,
+            names=["CHR", "SNP", "CM", "BP", "A1", "A2"],
         )
-
-
-
-
+        w_ld_df = pd.DataFrame(
+            {
+                "SNP": self.snp_name,
+                "L2": w_ld_values,
+            }
         )

-
-
-
+        # Add CHR, BP, and CM information
+        w_ld_df = w_ld_df.merge(bim_data[["SNP", "CHR", "BP", "CM"]], on="SNP", how="left")
+
+        # Reorder columns
+        w_ld_df = w_ld_df[["CHR", "SNP", "BP", "CM", "L2"]]
+
+        w_ld_df.to_csv(w_ld_file, sep="\t", index=False, compression="gzip")
+
+        logger.info(f"Saved w_ld for chr{chrom} to {w_ld_file}")
+
+    def _calculate_annotation_ldscores(self, chrom: int):
         """
-        Calculate
-
+        Calculate and save LD scores for spatial annotations.
+
+        Parameters
+        ----------
+        chrom : int
+            Chromosome number
         """
-        #
+        # Get marker scores for gene columns (excluding dummy NA column)
+        mk_scores = self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]]
+
+        # Process in chunks
        chunk_index = 1
         for i in trange(
             0,
-
+            mk_scores.shape[1],
             self.config.spots_per_chunk,
-            desc=f"Calculating LD
+            desc=f"Calculating LD scores for chr{chrom}",
         ):
-
+            # Get marker scores for current chunk
+            mk_score_chunk = mk_scores.iloc[:, i : i + self.config.spots_per_chunk]

-
-
-
-
-
+            # Define file paths
+            sample_name = self.config.sample_name
+            ld_score_file = f"{self.config.ldscore_save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.ldscore.feather"
+            m_file = f"{self.config.ldscore_save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M"
+            m_5_file = f"{self.config.ldscore_save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50"

-
-
-
-
+            # Calculate LD scores
+            ldscore_chunk = self._calculate_ldscore_from_weights(mk_score_chunk)
+
+            # Save LD scores based on format
             if self.config.ldscore_save_format == "feather":
-                self.
-
+                self._save_ldscore_to_feather(
+                    ldscore_chunk,
                     column_names=mk_score_chunk.columns,
                     save_file_name=ld_score_file,
                 )
             elif self.config.ldscore_save_format == "zarr":
-                self.
-
+                self._save_ldscore_chunk_to_zarr(
+                    ldscore_chunk,
                     chrom=chrom,
                     start_col_index=i,
                 )
             else:
                 raise ValueError(f"Invalid ldscore_save_format: {self.config.ldscore_save_format}")

-
+            # Calculate and save M values
+            self._calculate_and_save_m_values(
                 mk_score_chunk,
-
-
+                m_file,
+                m_5_file,
                 drop_dummy_na=True,
             )

             chunk_index += 1

-
-
-
-        baseline_mk_score[-1, 0] = 0  # all_gene
-        baseline_mk_score_df = pd.DataFrame(
-            baseline_mk_score, index=self.snp_gene_pair_dummy.columns, columns=["all_gene", "base"]
-        )
-        ld_score_file = f"{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather"
-        M_file = f"{save_dir}/baseline/baseline.{chrom}.l2.M"
-        M_5_file = f"{save_dir}/baseline/baseline.{chrom}.l2.M_5_50"
+            # Clear memory
+            del ldscore_chunk
+            gc.collect()

-
-
-
-
+    def _calculate_ldscore_from_weights(
+        self, marker_scores: pd.DataFrame, drop_dummy_na: bool = True
+    ) -> np.ndarray:
+        """
+        Calculate LD scores using SNP-gene weight matrix.
+
+        Parameters
+        ----------
+        marker_scores : pd.DataFrame
+            DataFrame with marker scores
+        drop_dummy_na : bool, optional
+            Whether to drop the dummy NA column, by default True
+
+        Returns
+        -------
+        np.ndarray
+            Array with calculated LD scores
+        """
+        weight_matrix = self.snp_gene_weight_matrix

-
-
-
-
-
-
-
-
-
-
-
+        if drop_dummy_na:
+            # Use all columns except the last one (dummy NA)
+            ldscore = weight_matrix[:, :-1] @ marker_scores
+        else:
+            ldscore = weight_matrix @ marker_scores
+
+        return ldscore
+
+    def _save_ldscore_to_feather(
+        self, ldscore_data: np.ndarray, column_names: list[str], save_file_name: str
+    ):
+        """
+        Save LD scores to a feather file.
+
+        Parameters
+        ----------
+        ldscore_data : np.ndarray
+            Array with LD scores
+        column_names : list
+            List of column names
+        save_file_name : str
+            Path to save the feather file
+        """
+        # Create directory if needed
+        save_dir = Path(save_file_name).parent
+        save_dir.mkdir(parents=True, exist_ok=True)
+
+        # Convert to float16 for storage efficiency
+        ldscore_data = ldscore_data.astype(np.float16, copy=False)
+
+        # Handle numerical overflow
+        ldscore_data[np.isinf(ldscore_data)] = np.finfo(np.float16).max
+
+        # Create DataFrame and save
+        df = pd.DataFrame(
+            ldscore_data,
+            index=self.snp_name,
+            columns=column_names,
         )
+        df.index.name = "SNP"
+        df.reset_index().to_feather(save_file_name)

-    def
+    def _save_ldscore_chunk_to_zarr(
+        self, ldscore_data: np.ndarray, chrom: int, start_col_index: int
+    ):
+        """
+        Save LD scores to a zarr array.
+
+        Parameters
+        ----------
+        ldscore_data : np.ndarray
+            Array with LD scores
+        chrom : int
+            Chromosome number
+        start_col_index : int
+            Starting column index in the zarr array
+        """
+        # Convert to float16 for storage efficiency
+        ldscore_data = ldscore_data.astype(np.float16, copy=False)
+
+        # Handle numerical overflow
+        ldscore_data[np.isinf(ldscore_data)] = np.finfo(np.float16).max
+
+        # Get start and end indices for this chromosome
+        chrom_start = self.chrom_snp_start_point[chrom - 1]
+        chrom_end = self.chrom_snp_start_point[chrom]
+
+        # Save to zarr array
+        self.zarr_file[
+            chrom_start:chrom_end,
+            start_col_index : start_col_index + ldscore_data.shape[1],
+        ] = ldscore_data
+
+    def _calculate_and_save_m_values(
         self,
-
+        marker_scores: pd.DataFrame,
+        m_file_path: str,
+        m_5_file_path: str,
+        drop_dummy_na: bool = True,
     ):
         """
-
+        Calculate and save M statistics.
+
+        Parameters
+        ----------
+        marker_scores : pd.DataFrame
+            DataFrame with marker scores
+        m_file_path : str
+            Path to save M values
+        m_5_file_path : str
+            Path to save M_5_50 values
+        drop_dummy_na : bool, optional
+            Whether to drop the dummy NA column, by default True
+        """
+        # Create directory if needed
+        save_dir = Path(m_file_path).parent
+        save_dir.mkdir(parents=True, exist_ok=True)
+
+        # Get sum of SNP-gene pairs
+        snp_gene_sum = self.snp_gene_pair_dummy.values.sum(axis=0, keepdims=True)
+        snp_gene_sum_maf = self.snp_gene_pair_dummy.loc[self.snp_pass_maf].values.sum(
+            axis=0, keepdims=True
+        )
+
+        # Drop dummy NA column if requested
+        if drop_dummy_na:
+            snp_gene_sum = snp_gene_sum[:, :-1]
+            snp_gene_sum_maf = snp_gene_sum_maf[:, :-1]
+
+        # Calculate M values
+        m_values = snp_gene_sum @ marker_scores
+        m_5_values = snp_gene_sum_maf @ marker_scores
+
+        # Save M values
+        np.savetxt(m_file_path, m_values, delimiter="\t")
+        np.savetxt(m_5_file_path, m_5_values, delimiter="\t")
+
+    def _get_snp_gene_dummy(self, chrom: int) -> pd.DataFrame:
+        """
+        Get dummy matrix for SNP-gene pairs.
+
+        Parameters
+        ----------
+        chrom : int
+            Chromosome number
+
+        Returns
+        -------
+        pd.DataFrame
+            DataFrame with dummy variables for SNP-gene pairs
         """
-
-
+        logger.info(f"Creating SNP-gene mappings for chromosome {chrom}")
+
+        # Load BIM file
         bim, bim_pr = load_bim(self.config.bfile_root, chrom)

+        # Determine mapping strategy
         if self.config.gene_window_enhancer_priority in ["gene_window_first", "enhancer_first"]:
-
-
-
-
-
-
-
+            # Use both gene window and enhancer
+            snp_gene_pair = self._combine_gtf_and_enhancer_mappings(bim, bim_pr)
+
+        elif self.config.gene_window_enhancer_priority is None:
+            # Use only gene window
+            snp_gene_pair = self._get_snp_gene_pair_from_gtf(bim, bim_pr)
+
+        elif self.config.gene_window_enhancer_priority == "enhancer_only":
+            # Use only enhancer
+            snp_gene_pair = self._get_snp_gene_pair_from_enhancer(bim, bim_pr)
+
+        else:
+            raise ValueError(
+                f"Invalid gene_window_enhancer_priority: {self.config.gene_window_enhancer_priority}"
             )
-            # total_SNP_gene_pair = SNP_gene_pair_gtf.join(SNP_gene_pair_enhancer, how='outer', lsuffix='_gtf', )
-
-            mask_of_nan_gtf = SNP_gene_pair_gtf.gene_name.isna()
-            mask_of_nan_enhancer = SNP_gene_pair_enhancer.gene_name.isna()
-
-            if self.config.gene_window_enhancer_priority == "gene_window_first":
-                SNP_gene_pair = SNP_gene_pair_gtf
-                SNP_gene_pair.loc[mask_of_nan_gtf, "gene_name"] = SNP_gene_pair_enhancer.loc[
-                    mask_of_nan_gtf, "gene_name"
-                ]
-            elif self.config.gene_window_enhancer_priority == "enhancer_first":
-                SNP_gene_pair = SNP_gene_pair_enhancer
-                SNP_gene_pair.loc[mask_of_nan_enhancer, "gene_name"] = SNP_gene_pair_gtf.loc[
-                    mask_of_nan_enhancer, "gene_name"
-                ]
-            else:
-                raise ValueError(
-                    f"Invalid self.config.gene_window_enhancer_priority: {self.config.gene_window_enhancer_priority}"
-                )

-
-
-
-
+        # Save SNP-gene pair mapping
+        self._save_snp_gene_pair_mapping(snp_gene_pair, chrom)
+
+        # Create dummy variables
+        snp_gene_dummy = pd.get_dummies(snp_gene_pair["gene_name"], dummy_na=True)
+
+        return snp_gene_dummy
+
+    def _combine_gtf_and_enhancer_mappings(
+        self, bim: pd.DataFrame, bim_pr: pr.PyRanges
+    ) -> pd.DataFrame:
+        """
+        Combine gene window and enhancer mappings.
+
+        Parameters
+        ----------
+        bim : pd.DataFrame
+            BIM DataFrame
+        bim_pr : pr.PyRanges
+            BIM PyRanges object
+
+        Returns
+        -------
+        pd.DataFrame
+            Combined SNP-gene pair mapping
+        """
+        # Get mappings from both sources
+        gtf_mapping = self._get_snp_gene_pair_from_gtf(bim, bim_pr)
+        enhancer_mapping = self._get_snp_gene_pair_from_enhancer(bim, bim_pr)
+
+        # Find SNPs with missing mappings in each source
+        mask_of_nan_gtf = gtf_mapping.gene_name.isna()
+        mask_of_nan_enhancer = enhancer_mapping.gene_name.isna()
+
+        # Combine based on priority
+        if self.config.gene_window_enhancer_priority == "gene_window_first":
+            # Use gene window mappings first, fill missing with enhancer mappings
+            combined_mapping = gtf_mapping.copy()
+            combined_mapping.loc[mask_of_nan_gtf, "gene_name"] = enhancer_mapping.loc[
+                mask_of_nan_gtf, "gene_name"
+            ]
+            logger.info(
+                f"Filled {mask_of_nan_gtf.sum()} SNPs with no GTF mapping using enhancer mappings"
             )
-            SNP_gene_pair = SNP_gene_pair_gtf

-        elif self.config.gene_window_enhancer_priority == "
-
-
-
+        elif self.config.gene_window_enhancer_priority == "enhancer_first":
+            # Use enhancer mappings first, fill missing with gene window mappings
+            combined_mapping = enhancer_mapping.copy()
+            combined_mapping.loc[mask_of_nan_enhancer, "gene_name"] = gtf_mapping.loc[
+                mask_of_nan_enhancer, "gene_name"
+            ]
+            logger.info(
+                f"Filled {mask_of_nan_enhancer.sum()} SNPs with no enhancer mapping using GTF mappings"
             )
-            SNP_gene_pair = SNP_gene_pair_enhancer
-        else:
-            raise ValueError("gtf_pr and enhancer_pr cannot be None at the same time")

-
-
-
-
-        SNP_gene_pair_save_path.parent.mkdir(parents=True, exist_ok=True)
-        SNP_gene_pair.reset_index().to_feather(SNP_gene_pair_save_path)
+        else:
+            raise ValueError(
+                f"Invalid gene_window_enhancer_priority for combining: {self.config.gene_window_enhancer_priority}"
+            )

-
-        SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair["gene_name"], dummy_na=True)
-        return SNP_gene_pair_dummy
+        return combined_mapping

-    def
+    def _get_snp_gene_pair_from_gtf(self, bim: pd.DataFrame, bim_pr: pr.PyRanges) -> pd.DataFrame:
+        """
+        Get SNP-gene pairs based on GTF annotations.
+
+        Parameters
+        ----------
+        bim : pd.DataFrame
+            BIM DataFrame
+        bim_pr : pr.PyRanges
+            BIM PyRanges object
+
+        Returns
+        -------
+        pd.DataFrame
+            SNP-gene pairs based on GTF
+        """
         logger.info(
-            "
+            "Getting SNP-gene pairs from GTF. SNPs in multiple genes will be assigned to the nearest gene (by TSS)"
         )
-
-        #
+
+        # Find overlaps between SNPs and gene windows
+        overlaps = overlaps_gtf_bim(self.gtf_pr, bim_pr)
+
+        # Get SNP information
         annot = bim[["CHR", "BP", "SNP", "CM"]]
-
-
+
+        # Create SNP-gene pairs DataFrame
+        snp_gene_pair = (
+            overlaps[["SNP", "gene_name"]]
             .set_index("SNP")
             .join(annot.set_index("SNP"), how="right")
         )
-        return SNP_gene_pair

-
-
-
-
-
-
-
-
-
-
+        logger.info(f"Found {overlaps.shape[0]} SNP-gene pairs from GTF")
+
+        return snp_gene_pair
+
+    def _get_snp_gene_pair_from_enhancer(
+        self, bim: pd.DataFrame, bim_pr: pr.PyRanges
+    ) -> pd.DataFrame:
+        """
+        Get SNP-gene pairs based on enhancer annotations.
+
+        Parameters
+        ----------
+        bim : pd.DataFrame
+            BIM DataFrame
+        bim_pr : pr.PyRanges
+            BIM PyRanges object
+
+        Returns
+        -------
+        pd.DataFrame
+            SNP-gene pairs based on enhancer
+        """
+        if self.enhancer_pr is None:
+            raise ValueError("Enhancer annotation file is required but not provided")
+
+        # Find overlaps between SNPs and enhancers
+        overlaps = self.enhancer_pr.join(bim_pr).df
+
+        # Get SNP information
         annot = bim[["CHR", "BP", "SNP", "CM"]]
+
         if self.config.snp_multiple_enhancer_strategy == "max_mkscore":
-            logger.
-
+            logger.info(
+                "SNPs in multiple enhancers will be assigned to the gene with highest marker score"
+            )
+            overlaps = overlaps.loc[overlaps.groupby("SNP").avg_mkscore.idxmax()]

         elif self.config.snp_multiple_enhancer_strategy == "nearest_TSS":
-            logger.
-
-
+            logger.info("SNPs in multiple enhancers will be assigned to the gene with nearest TSS")
+            overlaps["Distance"] = np.abs(overlaps["Start_b"] - overlaps["TSS"])
+            overlaps = overlaps.loc[overlaps.groupby("SNP").Distance.idxmin()]

-
-
+        # Create SNP-gene pairs DataFrame
+        snp_gene_pair = (
+            overlaps[["SNP", "gene_name"]]
             .set_index("SNP")
             .join(annot.set_index("SNP"), how="right")
         )

-
+        logger.info(f"Found {overlaps.shape[0]} SNP-gene pairs from enhancers")
+
+        return snp_gene_pair
+
+    def _save_snp_gene_pair_mapping(self, snp_gene_pair: pd.DataFrame, chrom: int):
+        """
+        Save SNP-gene pair mapping to a feather file.
+
+        Parameters
+        ----------
+        snp_gene_pair : pd.DataFrame
+            SNP-gene pair mapping
+        chrom : int
+            Chromosome number
+        """
+        save_path = (
+            Path(self.config.ldscore_save_dir) / f"SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather"
+        )
+        save_path.parent.mkdir(parents=True, exist_ok=True)
+        snp_gene_pair.reset_index().to_feather(save_path)
+
+    def _clear_memory(self):
+        """Clear memory to prevent leaks."""
+        gc.collect()


 def run_generate_ldscore(config: GenerateLDScoreConfig):
+    """
+    Main function to run the LD score generation.
+
+    Parameters
+    ----------
+    config : GenerateLDScoreConfig
+        Configuration object
+    """
+    # Create output directory
+    Path(config.ldscore_save_dir).mkdir(parents=True, exist_ok=True)
+
     if config.ldscore_save_format == "quick_mode":
         logger.info(
             "Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore."
         )
-        ldscore_save_dir = config.ldscore_save_dir
+        ldscore_save_dir = Path(config.ldscore_save_dir)

-        #
-
-        (
-
-
+        # Set up symbolic links
+        baseline_dir = ldscore_save_dir / "baseline"
+        baseline_dir.parent.mkdir(parents=True, exist_ok=True)
+        if not baseline_dir.exists():
+            baseline_dir.symlink_to(config.baseline_annotation_dir, target_is_directory=True)
+
+        snp_gene_pair_dir = ldscore_save_dir / "SNP_gene_pair"
+        snp_gene_pair_dir.parent.mkdir(parents=True, exist_ok=True)
+        if not snp_gene_pair_dir.exists():
+            snp_gene_pair_dir.symlink_to(config.SNP_gene_pair_dir, target_is_directory=True)
+
+        # Create a done file to mark completion
+        done_file = ldscore_save_dir / f"{config.sample_name}_generate_ldscore.done"
+        done_file.touch()

-        # link the SNP_gene_pair
-        SNP_gene_pair_dir = Path(config.SNP_gene_pair_dir)
-        (ldscore_save_dir / "SNP_gene_pair").symlink_to(
-            SNP_gene_pair_dir, target_is_directory=True
-        )
         return
-
+
+    # Initialize calculator
+    calculator = LDScoreCalculator(config)
+
+    # Process chromosomes
     if config.chrom == "all":
+        # Process all chromosomes
         for chrom in range(1, 23):
-
+            try:
+                calculator.process_chromosome(chrom)
+            except Exception as e:
+                logger.error(f"Error processing chromosome {chrom}: {e}")
+                raise
     else:
-
+        # Process one chromosome
+        try:
+            chrom = int(config.chrom)
+        except ValueError:
+            logger.error(f"Invalid chromosome: {config.chrom}")
+            raise ValueError(
+                f"Invalid chromosome: {config.chrom}. Must be an integer between 1-22 or 'all'"
+            ) from None
+        else:
+            calculator.process_chromosome(chrom)
+
+    # Create a done file to mark completion
+    done_file = Path(config.ldscore_save_dir) / f"{config.sample_name}_generate_ldscore.done"
+    done_file.touch()
+
+    logger.info(f"LD score generation completed for {config.sample_name}")