gsMap 1.71.2__py3-none-any.whl → 1.72.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/generate_ldscore.py CHANGED
@@ -10,7 +10,7 @@ from scipy.sparse import csr_matrix
10
10
  from tqdm import trange
11
11
 
12
12
  from gsMap.config import GenerateLDScoreConfig
13
- from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
13
+ from gsMap.utils.generate_r2_matrix import ID_List_Factory, PlinkBEDFileWithR2Cache, getBlockLefts
14
14
 
15
15
  warnings.filterwarnings("ignore", category=FutureWarning)
16
16
  logger = logging.getLogger(__name__)
@@ -25,34 +25,36 @@ def load_gtf(gtf_file, mk_score, window_size):
25
25
  print("Loading gtf data")
26
26
  #
27
27
  # Load GTF file
28
- gtf = pr.read_gtf(gtf_file, )
28
+ gtf = pr.read_gtf(
29
+ gtf_file,
30
+ )
29
31
  gtf = gtf.df
30
32
  #
31
33
  # Select the common genes
32
- gtf = gtf[gtf['Feature'] == 'gene']
34
+ gtf = gtf[gtf["Feature"] == "gene"]
33
35
  common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
34
36
  #
35
37
  gtf = gtf[gtf.gene_name.isin(common_gene)]
36
38
  mk_score = mk_score[mk_score.index.isin(common_gene)]
37
39
  #
38
40
  # Remove duplicated lines
39
- gtf = gtf.drop_duplicates(subset='gene_name', keep="first")
41
+ gtf = gtf.drop_duplicates(subset="gene_name", keep="first")
40
42
  #
41
43
  # Process the GTF (open 100-KB window: Tss - Ted)
42
- gtf_bed = gtf[['Chromosome', 'Start', 'End', 'gene_name', 'Strand']].copy()
43
- gtf_bed.loc[:, 'TSS'] = gtf_bed['Start']
44
- gtf_bed.loc[:, 'TED'] = gtf_bed['End']
44
+ gtf_bed = gtf[["Chromosome", "Start", "End", "gene_name", "Strand"]].copy()
45
+ gtf_bed.loc[:, "TSS"] = gtf_bed["Start"]
46
+ gtf_bed.loc[:, "TED"] = gtf_bed["End"]
45
47
 
46
- gtf_bed.loc[:, 'Start'] = gtf_bed['TSS'] - window_size
47
- gtf_bed.loc[:, 'End'] = gtf_bed['TED'] + window_size
48
- gtf_bed.loc[gtf_bed['Start'] < 0, 'Start'] = 0
48
+ gtf_bed.loc[:, "Start"] = gtf_bed["TSS"] - window_size
49
+ gtf_bed.loc[:, "End"] = gtf_bed["TED"] + window_size
50
+ gtf_bed.loc[gtf_bed["Start"] < 0, "Start"] = 0
49
51
  #
50
52
  # Correct the negative strand
51
- tss_neg = gtf_bed.loc[gtf_bed['Strand'] == '-', 'TSS']
52
- ted_neg = gtf_bed.loc[gtf_bed['Strand'] == '-', 'TED']
53
- gtf_bed.loc[gtf_bed['Strand'] == '-', 'TSS'] = ted_neg
54
- gtf_bed.loc[gtf_bed['Strand'] == '-', 'TED'] = tss_neg
55
- gtf_bed = gtf_bed.drop('Strand', axis=1)
53
+ tss_neg = gtf_bed.loc[gtf_bed["Strand"] == "-", "TSS"]
54
+ ted_neg = gtf_bed.loc[gtf_bed["Strand"] == "-", "TED"]
55
+ gtf_bed.loc[gtf_bed["Strand"] == "-", "TSS"] = ted_neg
56
+ gtf_bed.loc[gtf_bed["Strand"] == "-", "TED"] = tss_neg
57
+ gtf_bed = gtf_bed.drop("Strand", axis=1)
56
58
  #
57
59
  # Transform the GTF to PyRanges
58
60
  gtf_pr = pr.PyRanges(gtf_bed)
@@ -64,7 +66,7 @@ def load_marker_score(mk_score_file):
64
66
  """
65
67
  Load marker scores of each cell.
66
68
  """
67
- mk_score = pd.read_feather(mk_score_file).set_index('HUMAN_GENE_SYM').rename_axis('gene_name')
69
+ mk_score = pd.read_feather(mk_score_file).set_index("HUMAN_GENE_SYM").rename_axis("gene_name")
68
70
  mk_score = mk_score.astype(np.float32, copy=False)
69
71
  return mk_score
70
72
 
@@ -77,18 +79,18 @@ def load_bim(bfile_root, chrom):
77
79
  """
78
80
  Load the bim file.
79
81
  """
80
- bim = pd.read_csv(f'{bfile_root}.{chrom}.bim', sep='\t', header=None)
82
+ bim = pd.read_csv(f"{bfile_root}.{chrom}.bim", sep="\t", header=None)
81
83
  bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
82
84
  #
83
85
  # Transform bim to PyRanges
84
86
  bim_pr = bim.copy()
85
87
  bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
86
88
 
87
- bim_pr['End'] = bim_pr['Start'].copy()
88
- bim_pr['Start'] = bim_pr['Start'] - 1 # Due to bim file is 1-based
89
+ bim_pr["End"] = bim_pr["Start"].copy()
90
+ bim_pr["Start"] = bim_pr["Start"] - 1 # Due to bim file is 1-based
89
91
 
90
92
  bim_pr = pr.PyRanges(bim_pr)
91
- bim_pr.Chromosome = f'chr{chrom}'
93
+ bim_pr.Chromosome = f"chr{chrom}"
92
94
  return bim, bim_pr
93
95
 
94
96
 
@@ -100,9 +102,9 @@ def Overlaps_gtf_bim(gtf_pr, bim_pr):
100
102
  # Select the overlapped regions (SNPs in gene windows)
101
103
  overlaps = gtf_pr.join(bim_pr)
102
104
  overlaps = overlaps.df
103
- overlaps['Distance'] = np.abs(overlaps['Start_b'] - overlaps['TSS'])
105
+ overlaps["Distance"] = np.abs(overlaps["Start_b"] - overlaps["TSS"])
104
106
  overlaps_small = overlaps.copy()
105
- overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').Distance.idxmin()]
107
+ overlaps_small = overlaps_small.loc[overlaps_small.groupby("SNP").Distance.idxmin()]
106
108
  return overlaps_small
107
109
 
108
110
 
@@ -110,7 +112,7 @@ def Overlaps_gtf_bim(gtf_pr, bim_pr):
110
112
  def filter_snps_by_keep_snp(bim_df, keep_snp_file):
111
113
  # Load the keep_snp file and filter the BIM DataFrame
112
114
  keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
113
- filtered_bim_df = bim_df[bim_df['SNP'].isin(keep_snp)]
115
+ filtered_bim_df = bim_df[bim_df["SNP"].isin(keep_snp)]
114
116
  return filtered_bim_df
115
117
 
116
118
 
@@ -122,7 +124,7 @@ def get_snp_counts(config):
122
124
  bim_df, _ = load_bim(config.bfile_root, chrom)
123
125
 
124
126
  if config.keep_snp_root:
125
- keep_snp_file = f'{config.keep_snp_root}.{chrom}.snp'
127
+ keep_snp_file = f"{config.keep_snp_root}.{chrom}.snp"
126
128
  filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
127
129
  else:
128
130
  filtered_bim_df = bim_df
@@ -130,11 +132,11 @@ def get_snp_counts(config):
130
132
  snp_counts[chrom] = filtered_bim_df.shape[0]
131
133
  total_snp += snp_counts[chrom]
132
134
 
133
- snp_counts['total'] = total_snp
135
+ snp_counts["total"] = total_snp
134
136
 
135
137
  chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
136
138
 
137
- snp_counts['chrom_snp_start_point'] = [0] + chrom_snp_length_array.tolist()
139
+ snp_counts["chrom_snp_start_point"] = [0] + chrom_snp_length_array.tolist()
138
140
 
139
141
  return snp_counts
140
142
 
@@ -145,55 +147,63 @@ def get_snp_pass_maf(bfile_root, chrom, maf_min=0.05):
145
147
  Get the dummy matrix of SNP-gene pairs.
146
148
  """
147
149
  # Load the bim file
148
- PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
149
- PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
150
+ PlinkBIMFile = ID_List_Factory(
151
+ ["CHR", "SNP", "CM", "BP", "A1", "A2"], 1, ".bim", usecols=[0, 1, 2, 3, 4, 5]
152
+ )
153
+ PlinkFAMFile = ID_List_Factory(["IID"], 0, ".fam", usecols=[1])
150
154
 
151
- bfile = f'{bfile_root}.{chrom}'
152
- snp_file, snp_obj = bfile + '.bim', PlinkBIMFile
155
+ bfile = f"{bfile_root}.{chrom}"
156
+ snp_file, snp_obj = bfile + ".bim", PlinkBIMFile
153
157
  array_snps = snp_obj(snp_file)
154
- m = len(array_snps.IDList)
158
+ # m = len(array_snps.IDList)
155
159
 
156
160
  # Load fam
157
- ind_file, ind_obj = bfile + '.fam', PlinkFAMFile
161
+ ind_file, ind_obj = bfile + ".fam", PlinkFAMFile
158
162
  array_indivs = ind_obj(ind_file)
159
163
  n = len(array_indivs.IDList)
160
- array_file, array_obj = bfile + '.bed', PlinkBEDFileWithR2Cache
161
- geno_array = array_obj(array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None)
164
+ array_file, array_obj = bfile + ".bed", PlinkBEDFileWithR2Cache
165
+ geno_array = array_obj(
166
+ array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None
167
+ )
162
168
  ii = geno_array.maf > maf_min
163
169
  snp_pass_maf = array_snps.IDList[ii]
164
- print(f'After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain.')
170
+ print(f"After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain.")
165
171
  return snp_pass_maf.SNP.to_list()
166
172
 
167
173
 
168
- def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit='CM'):
169
- PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
170
- PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
174
+ def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit="CM"):
175
+ PlinkBIMFile = ID_List_Factory(
176
+ ["CHR", "SNP", "CM", "BP", "A1", "A2"], 1, ".bim", usecols=[0, 1, 2, 3, 4, 5]
177
+ )
178
+ PlinkFAMFile = ID_List_Factory(["IID"], 0, ".fam", usecols=[1])
171
179
 
172
- bfile = f'{bfile_root}.{chrom}'
173
- snp_file, snp_obj = bfile + '.bim', PlinkBIMFile
180
+ bfile = f"{bfile_root}.{chrom}"
181
+ snp_file, snp_obj = bfile + ".bim", PlinkBIMFile
174
182
  array_snps = snp_obj(snp_file)
175
183
  m = len(array_snps.IDList)
176
- print(f'Read list of {m} SNPs from {snp_file}')
184
+ print(f"Read list of {m} SNPs from {snp_file}")
177
185
 
178
186
  # Load fam
179
- ind_file, ind_obj = bfile + '.fam', PlinkFAMFile
187
+ ind_file, ind_obj = bfile + ".fam", PlinkFAMFile
180
188
  array_indivs = ind_obj(ind_file)
181
189
  n = len(array_indivs.IDList)
182
- print(f'Read list of {n} individuals from {ind_file}')
183
- array_file, array_obj = bfile + '.bed', PlinkBEDFileWithR2Cache
184
- geno_array = array_obj(array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None)
190
+ print(f"Read list of {n} individuals from {ind_file}")
191
+ array_file, array_obj = bfile + ".bed", PlinkBEDFileWithR2Cache
192
+ geno_array = array_obj(
193
+ array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None
194
+ )
185
195
  # Load the annotations of the baseline
186
- if ld_unit == 'SNP':
196
+ if ld_unit == "SNP":
187
197
  max_dist = ld_wind
188
198
  coords = np.array(range(geno_array.m))
189
- elif ld_unit == 'KB':
199
+ elif ld_unit == "KB":
190
200
  max_dist = ld_wind * 1000
191
- coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
192
- elif ld_unit == 'CM':
201
+ coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
202
+ elif ld_unit == "CM":
193
203
  max_dist = ld_wind
194
- coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]
204
+ coords = np.array(array_snps.df["CM"])[geno_array.kept_snps]
195
205
  else:
196
- raise ValueError(f'Invalid ld_wind_unit: {ld_unit}')
206
+ raise ValueError(f"Invalid ld_wind_unit: {ld_unit}")
197
207
  block_left = getBlockLefts(coords, max_dist)
198
208
  # Calculate the LD score
199
209
  lN_df = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
@@ -201,25 +211,31 @@ def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit='CM'):
201
211
 
202
212
 
203
213
  # %%
204
- def calculate_ldscore_from_annotation(SNP_annotation_df, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
214
+ def calculate_ldscore_from_annotation(
215
+ SNP_annotation_df, chrom, bfile_root, ld_wind=1, ld_unit="CM"
216
+ ):
205
217
  """
206
218
  Calculate the SNP-gene weight matrix.
207
219
  """
208
220
  # Get the dummy matrix
209
221
  # Get the SNP-gene weight matrix
210
- snp_gene_weight_matrix = get_ldscore(bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind,
211
- ld_unit=ld_unit)
222
+ snp_gene_weight_matrix = get_ldscore(
223
+ bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
224
+ )
212
225
  snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
213
226
  snp_gene_weight_matrix.index = SNP_annotation_df.index
214
227
  snp_gene_weight_matrix.columns = SNP_annotation_df.columns
215
228
  return snp_gene_weight_matrix
216
229
 
217
230
 
218
- def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
231
+ def calculate_ldscore_from_multiple_annotation(
232
+ SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit="CM"
233
+ ):
219
234
  SNP_annotation_df = pd.concat(SNP_annotation_df_list, axis=1).astype(np.float32, copy=False)
220
235
 
221
- snp_gene_weight_matrix = get_ldscore(bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind,
222
- ld_unit=ld_unit)
236
+ snp_gene_weight_matrix = get_ldscore(
237
+ bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
238
+ )
223
239
  snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
224
240
  snp_gene_weight_matrix.index = SNP_annotation_df.index
225
241
  snp_gene_weight_matrix.columns = SNP_annotation_df.columns
@@ -229,7 +245,9 @@ def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bf
229
245
  snp_gene_weight_matrix_list = []
230
246
  start = 0
231
247
  for snp_annotation_len in snp_annotation_len_list:
232
- snp_gene_weight_matrix_list.append(snp_gene_weight_matrix.iloc[:, start:start + snp_annotation_len])
248
+ snp_gene_weight_matrix_list.append(
249
+ snp_gene_weight_matrix.iloc[:, start : start + snp_annotation_len]
250
+ )
233
251
  start += snp_annotation_len
234
252
  return snp_gene_weight_matrix_list
235
253
 
@@ -242,21 +260,28 @@ class S_LDSC_Boost:
242
260
  self.mk_score = load_marker_score(config.mkscore_feather_path)
243
261
 
244
262
  # Load GTF and get common markers
245
- self.gtf_pr, self.mk_score_common = load_gtf(config.gtf_annotation_file, self.mk_score,
246
- window_size=config.gene_window_size)
263
+ self.gtf_pr, self.mk_score_common = load_gtf(
264
+ config.gtf_annotation_file, self.mk_score, window_size=config.gene_window_size
265
+ )
247
266
 
248
267
  # Load enhancer
249
268
  if config.enhancer_annotation_file is not None:
250
269
  enhancer_df = pr.read_bed(config.enhancer_annotation_file, as_df=True)
251
- enhancer_df.set_index('Name', inplace=True)
252
- enhancer_df.index.name = 'gene_name'
270
+ enhancer_df.set_index("Name", inplace=True)
271
+ enhancer_df.index.name = "gene_name"
253
272
 
254
273
  # keep the common genes and add the enhancer score
255
- avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=['avg_mkscore'])
256
- enhancer_df = enhancer_df.join(avg_mkscore, how='inner', on='gene_name', )
274
+ avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=["avg_mkscore"])
275
+ enhancer_df = enhancer_df.join(
276
+ avg_mkscore,
277
+ how="inner",
278
+ on="gene_name",
279
+ )
257
280
 
258
281
  # add distance to TSS
259
- enhancer_df['TSS'] = self.gtf_pr.df.set_index('gene_name').reindex(enhancer_df.index)['TSS']
282
+ enhancer_df["TSS"] = self.gtf_pr.df.set_index("gene_name").reindex(enhancer_df.index)[
283
+ "TSS"
284
+ ]
260
285
 
261
286
  # convert to pyranges
262
287
  self.enhancer_pr = pr.PyRanges(enhancer_df.reset_index())
@@ -265,32 +290,39 @@ class S_LDSC_Boost:
265
290
  self.enhancer_pr = None
266
291
 
267
292
  # create tha zarr file
268
- if config.ldscore_save_format == 'zarr':
269
-
293
+ if config.ldscore_save_format == "zarr":
270
294
  chrom_snp_length_dict = get_snp_counts(config)
271
- self.chrom_snp_start_point = chrom_snp_length_dict['chrom_snp_start_point']
295
+ self.chrom_snp_start_point = chrom_snp_length_dict["chrom_snp_start_point"]
272
296
 
273
- zarr_path = Path(config.ldscore_save_dir) / f'{config.sample_name}.ldscore.zarr'
297
+ zarr_path = Path(config.ldscore_save_dir) / f"{config.sample_name}.ldscore.zarr"
274
298
  if not zarr_path.exists():
275
- self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a', dtype=np.float16,
276
- chunks=config.zarr_chunk_size,
277
- shape=(chrom_snp_length_dict['total'], self.mk_score_common.shape[1]))
299
+ self.zarr_file = zarr.open(
300
+ zarr_path.as_posix(),
301
+ mode="a",
302
+ dtype=np.float16,
303
+ chunks=config.zarr_chunk_size,
304
+ shape=(chrom_snp_length_dict["total"], self.mk_score_common.shape[1]),
305
+ )
278
306
  zarr_path.mkdir(parents=True, exist_ok=True)
279
307
  # save spot names
280
- self.zarr_file.attrs['spot_names'] = self.mk_score_common.columns.to_list()
308
+ self.zarr_file.attrs["spot_names"] = self.mk_score_common.columns.to_list()
281
309
  # save chrom_snp_length_dict
282
- self.zarr_file.attrs['chrom_snp_start_point'] = self.chrom_snp_start_point
310
+ self.zarr_file.attrs["chrom_snp_start_point"] = self.chrom_snp_start_point
283
311
  else:
284
- self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a')
312
+ self.zarr_file = zarr.open(zarr_path.as_posix(), mode="a")
285
313
 
286
314
  def process_chromosome(self, chrom: int):
287
315
  self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)
288
316
 
289
317
  # Get SNP-Gene dummy pairs
290
- self.snp_gene_pair_dummy = self.get_snp_gene_dummy(chrom, )
318
+ self.snp_gene_pair_dummy = self.get_snp_gene_dummy(
319
+ chrom,
320
+ )
291
321
 
292
322
  if self.config.keep_snp_root is not None:
293
- keep_snp = pd.read_csv(f'{self.config.keep_snp_root}.{chrom}.snp', header=None)[0].to_list()
323
+ keep_snp = pd.read_csv(f"{self.config.keep_snp_root}.{chrom}.snp", header=None)[
324
+ 0
325
+ ].to_list()
294
326
  self.keep_snp_mask = self.snp_gene_pair_dummy.index.isin(keep_snp)
295
327
  # the SNP name of keeped
296
328
  self.snp_name = self.snp_gene_pair_dummy.index[self.keep_snp_mask].to_list()
@@ -300,25 +332,37 @@ class S_LDSC_Boost:
300
332
 
301
333
  if self.config.additional_baseline_annotation is not None:
302
334
  additional_baseline_annotation = Path(self.config.additional_baseline_annotation)
303
- additional_baseline_annotation_file_path = additional_baseline_annotation / f'baseline.{chrom}.annot.gz'
304
- assert additional_baseline_annotation_file_path.exists(), f'additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}'
305
- additional_baseline_annotation_df = pd.read_csv(additional_baseline_annotation_file_path, sep='\t')
306
- additional_baseline_annotation_df.set_index('SNP', inplace=True)
335
+ additional_baseline_annotation_file_path = (
336
+ additional_baseline_annotation / f"baseline.{chrom}.annot.gz"
337
+ )
338
+ assert additional_baseline_annotation_file_path.exists(), (
339
+ f"additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}"
340
+ )
341
+ additional_baseline_annotation_df = pd.read_csv(
342
+ additional_baseline_annotation_file_path, sep="\t"
343
+ )
344
+ additional_baseline_annotation_df.set_index("SNP", inplace=True)
307
345
 
308
346
  # drop these columns if exists CHR BP CM]
309
- additional_baseline_annotation_df.drop(['CHR', 'BP', 'CM'], axis=1, inplace=True, errors='ignore')
347
+ additional_baseline_annotation_df.drop(
348
+ ["CHR", "BP", "CM"], axis=1, inplace=True, errors="ignore"
349
+ )
310
350
 
311
351
  # reindex, for those SNPs not in additional_baseline_annotation_df, set to 0
312
- num_of_not_exist_snp = (~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)).sum()
352
+ num_of_not_exist_snp = (
353
+ ~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)
354
+ ).sum()
313
355
  if num_of_not_exist_snp > 0:
314
356
  logger.warning(
315
- f'{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0')
357
+ f"{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0"
358
+ )
316
359
  additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
317
- self.snp_gene_pair_dummy.index,
318
- fill_value=0)
360
+ self.snp_gene_pair_dummy.index, fill_value=0
361
+ )
319
362
  else:
320
363
  additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
321
- self.snp_gene_pair_dummy.index)
364
+ self.snp_gene_pair_dummy.index
365
+ )
322
366
 
323
367
  # do this for saving the cpu time, only calculate r2 once
324
368
  self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
@@ -327,56 +371,85 @@ class S_LDSC_Boost:
327
371
  chrom,
328
372
  self.config.bfile_root,
329
373
  ld_wind=self.config.ld_wind,
330
- ld_unit=self.config.ld_unit))
374
+ ld_unit=self.config.ld_unit,
375
+ )
376
+ )
331
377
 
332
- additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[self.snp_name]
378
+ additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[
379
+ self.snp_name
380
+ ]
333
381
  # print(additional_baseline_annotation_ldscore.index.to_list()==self.snp_name)
334
382
 
335
- ld_score_file = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather'
336
- M_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M'
337
- M_5_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50'
383
+ ld_score_file = f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather"
384
+ M_file_path = (
385
+ f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M"
386
+ )
387
+ M_5_file_path = (
388
+ f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50"
389
+ )
338
390
 
339
391
  # save additional baseline annotation ldscore
340
- self.save_ldscore_to_feather(additional_baseline_annotation_ldscore.values,
341
- column_names=additional_baseline_annotation_ldscore.columns,
342
- save_file_name=ld_score_file,
343
- )
392
+ self.save_ldscore_to_feather(
393
+ additional_baseline_annotation_ldscore.values,
394
+ column_names=additional_baseline_annotation_ldscore.columns,
395
+ save_file_name=ld_score_file,
396
+ )
344
397
 
345
398
  # caculate the M and save
346
399
  save_dir = Path(M_file_path).parent
347
400
  save_dir.mkdir(parents=True, exist_ok=True)
348
401
  M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
349
- M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)
350
- np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
351
- np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )
402
+ M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(
403
+ axis=0, keepdims=True
404
+ )
405
+ np.savetxt(
406
+ M_file_path,
407
+ M_chr_chunk,
408
+ delimiter="\t",
409
+ )
410
+ np.savetxt(
411
+ M_5_file_path,
412
+ M_5_chr_chunk,
413
+ delimiter="\t",
414
+ )
352
415
 
353
416
  else:
354
417
  # Calculate SNP-Gene weight matrix
355
- self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(self.snp_gene_pair_dummy, chrom,
356
- self.config.bfile_root,
357
- ld_wind=self.config.ld_wind,
358
- ld_unit=self.config.ld_unit)
418
+ self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(
419
+ self.snp_gene_pair_dummy,
420
+ chrom,
421
+ self.config.bfile_root,
422
+ ld_wind=self.config.ld_wind,
423
+ ld_unit=self.config.ld_unit,
424
+ )
359
425
  # only keep the snp in keep_snp_root
360
426
  if self.keep_snp_mask is not None:
361
427
  self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
362
428
 
363
429
  if self.config.save_pre_calculate_snp_gene_weight_matrix:
364
- snp_gene_weight_matrix_save_dir = Path(self.config.ldscore_save_dir) / 'snp_gene_weight_matrix'
430
+ snp_gene_weight_matrix_save_dir = (
431
+ Path(self.config.ldscore_save_dir) / "snp_gene_weight_matrix"
432
+ )
365
433
  snp_gene_weight_matrix_save_dir.mkdir(parents=True, exist_ok=True)
366
- logger.info(f'Saving snp_gene_weight_matrix for chr{chrom}...')
434
+ logger.info(f"Saving snp_gene_weight_matrix for chr{chrom}...")
367
435
  self.snp_gene_weight_matrix.reset_index().to_feather(
368
- snp_gene_weight_matrix_save_dir / f'{chrom}.snp_gene_weight_matrix.feather')
436
+ snp_gene_weight_matrix_save_dir / f"{chrom}.snp_gene_weight_matrix.feather"
437
+ )
369
438
 
370
439
  # convert to sparse
371
440
  self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)
372
- logger.info(f'Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}')
441
+ logger.info(
442
+ f"Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}"
443
+ )
373
444
 
374
445
  # calculate baseline ld score
375
- logger.info(f'Calculating baseline ld score for chr{chrom}...')
376
- self.calculate_ldscore_for_base_line(chrom, self.config.sample_name, self.config.ldscore_save_dir)
446
+ logger.info(f"Calculating baseline ld score for chr{chrom}...")
447
+ self.calculate_ldscore_for_base_line(
448
+ chrom, self.config.sample_name, self.config.ldscore_save_dir
449
+ )
377
450
 
378
451
  # calculate ld score for annotation
379
- logger.info(f'Calculating ld score for annotation for chr{chrom}...')
452
+ logger.info(f"Calculating ld score for annotation for chr{chrom}...")
380
453
  self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
381
454
  self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
382
455
  chrom,
@@ -384,11 +457,11 @@ class S_LDSC_Boost:
384
457
  self.config.ldscore_save_dir,
385
458
  )
386
459
 
387
- def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
388
- mk_score_chunk,
389
- drop_dummy_na=True,
390
- ):
391
-
460
+ def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
461
+ self,
462
+ mk_score_chunk,
463
+ drop_dummy_na=True,
464
+ ):
392
465
  if drop_dummy_na:
393
466
  ldscore_chr_chunk = self.snp_gene_weight_matrix[:, :-1] @ mk_score_chunk
394
467
  else:
@@ -407,16 +480,20 @@ class S_LDSC_Boost:
407
480
  # self.keep_snp_mask]
408
481
 
409
482
  # save for each chunk
410
- df = pd.DataFrame(ldscore_chr_chunk,
411
- index=self.snp_name,
412
- columns=column_names,
413
- )
414
- df.index.name = 'SNP'
483
+ df = pd.DataFrame(
484
+ ldscore_chr_chunk,
485
+ index=self.snp_name,
486
+ columns=column_names,
487
+ )
488
+ df.index.name = "SNP"
415
489
  df.reset_index().to_feather(save_file_name)
416
490
 
417
- def save_ldscore_chunk_to_zarr(self, ldscore_chr_chunk: np.ndarray,
418
- chrom: int, start_col_index,
419
- ):
491
+ def save_ldscore_chunk_to_zarr(
492
+ self,
493
+ ldscore_chr_chunk: np.ndarray,
494
+ chrom: int,
495
+ start_col_index,
496
+ ):
420
497
  ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
421
498
  # avoid overflow of float16, if inf, set to max of float16
422
499
  ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
@@ -425,63 +502,90 @@ class S_LDSC_Boost:
425
502
  chrom_snp_start_point = self.chrom_snp_start_point[chrom - 1]
426
503
  chrom_snp_end_point = self.chrom_snp_start_point[chrom]
427
504
 
428
- self.zarr_file[chrom_snp_start_point:chrom_snp_end_point,
429
- start_col_index:start_col_index + ldscore_chr_chunk.shape[1]] = ldscore_chr_chunk
430
-
431
- def calculate_M_use_SNP_gene_pair_dummy_by_chunk(self,
432
- mk_score_chunk,
433
- M_file_path, M_5_file_path,
434
- drop_dummy_na=True,
435
- ):
436
- '''
437
- calculate M use SNP_gene_pair_dummy_sumed_along_snp_axis and mk_score_chunk
438
- '''
439
- SNP_gene_pair_dummy_sumed_along_snp_axis = self.snp_gene_pair_dummy.values.sum(axis=0, keepdims=True)
440
- SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = self.snp_gene_pair_dummy.loc[self.snp_pass_maf].values.sum(
441
- axis=0,
442
- keepdims=True)
505
+ self.zarr_file[
506
+ chrom_snp_start_point:chrom_snp_end_point,
507
+ start_col_index : start_col_index + ldscore_chr_chunk.shape[1],
508
+ ] = ldscore_chr_chunk
509
+
510
+ def calculate_M_use_SNP_gene_pair_dummy_by_chunk(
511
+ self,
512
+ mk_score_chunk,
513
+ M_file_path,
514
+ M_5_file_path,
515
+ drop_dummy_na=True,
516
+ ):
517
+ """
518
+ Calculate M use SNP_gene_pair_dummy_sumed_along_snp_axis and mk_score_chunk
519
+ """
520
+ SNP_gene_pair_dummy_sumed_along_snp_axis = self.snp_gene_pair_dummy.values.sum(
521
+ axis=0, keepdims=True
522
+ )
523
+ SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = self.snp_gene_pair_dummy.loc[
524
+ self.snp_pass_maf
525
+ ].values.sum(axis=0, keepdims=True)
443
526
  if drop_dummy_na:
444
- SNP_gene_pair_dummy_sumed_along_snp_axis = SNP_gene_pair_dummy_sumed_along_snp_axis[:, :-1]
445
- SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf[:,
446
- :-1]
527
+ SNP_gene_pair_dummy_sumed_along_snp_axis = SNP_gene_pair_dummy_sumed_along_snp_axis[
528
+ :, :-1
529
+ ]
530
+ SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = (
531
+ SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf[:, :-1]
532
+ )
447
533
  save_dir = Path(M_file_path).parent
448
534
  save_dir.mkdir(parents=True, exist_ok=True)
449
535
  M_chr_chunk = SNP_gene_pair_dummy_sumed_along_snp_axis @ mk_score_chunk
450
536
  M_5_chr_chunk = SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf @ mk_score_chunk
451
- np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
452
- np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )
537
+ np.savetxt(
538
+ M_file_path,
539
+ M_chr_chunk,
540
+ delimiter="\t",
541
+ )
542
+ np.savetxt(
543
+ M_5_file_path,
544
+ M_5_chr_chunk,
545
+ delimiter="\t",
546
+ )
453
547
 
454
- def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(self, mk_score_common, chrom, sample_name, save_dir):
548
+ def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
549
+ self, mk_score_common, chrom, sample_name, save_dir
550
+ ):
455
551
  """
456
552
  Calculate the LD score using the SNP-gene weight matrix.
457
553
  :param sample_name:
458
554
  """
459
555
  # Calculate the LD score
460
556
  chunk_index = 1
461
- for i in trange(0, mk_score_common.shape[1], self.config.spots_per_chunk,
462
- desc=f'Calculating LD score by chunk for chr{chrom}'):
463
- mk_score_chunk = mk_score_common.iloc[:, i:i + self.config.spots_per_chunk]
464
-
465
- ld_score_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.ldscore.feather'
466
- M_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M'
467
- M_5_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50'
557
+ for i in trange(
558
+ 0,
559
+ mk_score_common.shape[1],
560
+ self.config.spots_per_chunk,
561
+ desc=f"Calculating LD score by chunk for chr{chrom}",
562
+ ):
563
+ mk_score_chunk = mk_score_common.iloc[:, i : i + self.config.spots_per_chunk]
564
+
565
+ ld_score_file = f"{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.ldscore.feather"
566
+ M_file = f"{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M"
567
+ M_5_file = (
568
+ f"{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50"
569
+ )
468
570
 
469
571
  ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
470
572
  mk_score_chunk,
471
573
  drop_dummy_na=True,
472
574
  )
473
- if self.config.ldscore_save_format == 'feather':
474
- self.save_ldscore_to_feather(ldscore_chr_chunk,
475
- column_names=mk_score_chunk.columns,
476
- save_file_name=ld_score_file,
477
- )
478
- elif self.config.ldscore_save_format == 'zarr':
479
- self.save_ldscore_chunk_to_zarr(ldscore_chr_chunk,
480
- chrom=chrom,
481
- start_col_index=i,
482
- )
575
+ if self.config.ldscore_save_format == "feather":
576
+ self.save_ldscore_to_feather(
577
+ ldscore_chr_chunk,
578
+ column_names=mk_score_chunk.columns,
579
+ save_file_name=ld_score_file,
580
+ )
581
+ elif self.config.ldscore_save_format == "zarr":
582
+ self.save_ldscore_chunk_to_zarr(
583
+ ldscore_chr_chunk,
584
+ chrom=chrom,
585
+ start_col_index=i,
586
+ )
483
587
  else:
484
- raise ValueError(f'Invalid ldscore_save_format: {self.config.ldscore_save_format}')
588
+ raise ValueError(f"Invalid ldscore_save_format: {self.config.ldscore_save_format}")
485
589
 
486
590
  self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
487
591
  mk_score_chunk,
@@ -496,21 +600,23 @@ class S_LDSC_Boost:
496
600
  # save baseline ld score
497
601
  baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
498
602
  baseline_mk_score[-1, 0] = 0 # all_gene
499
- baseline_mk_score_df = pd.DataFrame(baseline_mk_score, index=self.snp_gene_pair_dummy.columns,
500
- columns=['all_gene', 'base'])
501
- ld_score_file = f'{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather'
502
- M_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M'
503
- M_5_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M_5_50'
603
+ baseline_mk_score_df = pd.DataFrame(
604
+ baseline_mk_score, index=self.snp_gene_pair_dummy.columns, columns=["all_gene", "base"]
605
+ )
606
+ ld_score_file = f"{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather"
607
+ M_file = f"{save_dir}/baseline/baseline.{chrom}.l2.M"
608
+ M_5_file = f"{save_dir}/baseline/baseline.{chrom}.l2.M_5_50"
504
609
 
505
610
  ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
506
611
  baseline_mk_score_df,
507
612
  drop_dummy_na=False,
508
613
  )
509
614
 
510
- self.save_ldscore_to_feather(ldscore_chr_chunk,
511
- column_names=baseline_mk_score_df.columns,
512
- save_file_name=ld_score_file,
513
- )
615
+ self.save_ldscore_to_feather(
616
+ ldscore_chr_chunk,
617
+ column_names=baseline_mk_score_df.columns,
618
+ save_file_name=ld_score_file,
619
+ )
514
620
  # save baseline M
515
621
  self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
516
622
  baseline_mk_score_df,
@@ -519,7 +625,10 @@ class S_LDSC_Boost:
519
625
  drop_dummy_na=False,
520
626
  )
521
627
 
522
- def get_snp_gene_dummy(self, chrom, ):
628
+ def get_snp_gene_dummy(
629
+ self,
630
+ chrom,
631
+ ):
523
632
  """
524
633
  Get the dummy matrix of SNP-gene pairs.
525
634
  """
@@ -527,91 +636,126 @@ class S_LDSC_Boost:
527
636
  print("Loading bim data")
528
637
  bim, bim_pr = load_bim(self.config.bfile_root, chrom)
529
638
 
530
- if self.config.gene_window_enhancer_priority in ['gene_window_first', 'enhancer_first']:
531
-
532
- SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )
533
- SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )
639
+ if self.config.gene_window_enhancer_priority in ["gene_window_first", "enhancer_first"]:
640
+ SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(
641
+ bim,
642
+ bim_pr,
643
+ )
644
+ SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(
645
+ bim,
646
+ bim_pr,
647
+ )
534
648
  # total_SNP_gene_pair = SNP_gene_pair_gtf.join(SNP_gene_pair_enhancer, how='outer', lsuffix='_gtf', )
535
649
 
536
650
  mask_of_nan_gtf = SNP_gene_pair_gtf.gene_name.isna()
537
651
  mask_of_nan_enhancer = SNP_gene_pair_enhancer.gene_name.isna()
538
652
 
539
- if self.config.gene_window_enhancer_priority == 'gene_window_first':
653
+ if self.config.gene_window_enhancer_priority == "gene_window_first":
540
654
  SNP_gene_pair = SNP_gene_pair_gtf
541
- SNP_gene_pair.loc[mask_of_nan_gtf, 'gene_name'] = SNP_gene_pair_enhancer.loc[
542
- mask_of_nan_gtf, 'gene_name']
543
- elif self.config.gene_window_enhancer_priority == 'enhancer_first':
655
+ SNP_gene_pair.loc[mask_of_nan_gtf, "gene_name"] = SNP_gene_pair_enhancer.loc[
656
+ mask_of_nan_gtf, "gene_name"
657
+ ]
658
+ elif self.config.gene_window_enhancer_priority == "enhancer_first":
544
659
  SNP_gene_pair = SNP_gene_pair_enhancer
545
- SNP_gene_pair.loc[mask_of_nan_enhancer, 'gene_name'] = SNP_gene_pair_gtf.loc[
546
- mask_of_nan_enhancer, 'gene_name']
660
+ SNP_gene_pair.loc[mask_of_nan_enhancer, "gene_name"] = SNP_gene_pair_gtf.loc[
661
+ mask_of_nan_enhancer, "gene_name"
662
+ ]
547
663
  else:
548
664
  raise ValueError(
549
- f'Invalid self.config.gene_window_enhancer_priority: {self.config.gene_window_enhancer_priority}')
665
+ f"Invalid self.config.gene_window_enhancer_priority: {self.config.gene_window_enhancer_priority}"
666
+ )
550
667
 
551
668
  elif self.config.gene_window_enhancer_priority is None: # use gtf only
552
- SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )
669
+ SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(
670
+ bim,
671
+ bim_pr,
672
+ )
553
673
  SNP_gene_pair = SNP_gene_pair_gtf
554
674
 
555
- elif self.config.gene_window_enhancer_priority == 'enhancer_only':
556
- SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )
675
+ elif self.config.gene_window_enhancer_priority == "enhancer_only":
676
+ SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(
677
+ bim,
678
+ bim_pr,
679
+ )
557
680
  SNP_gene_pair = SNP_gene_pair_enhancer
558
681
  else:
559
- raise ValueError('gtf_pr and enhancer_pr cannot be None at the same time')
682
+ raise ValueError("gtf_pr and enhancer_pr cannot be None at the same time")
560
683
 
561
684
  # save the SNP_gene_pair to feather
562
- SNP_gene_pair_save_path = Path(
563
- self.config.ldscore_save_dir) / f'SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather'
685
+ SNP_gene_pair_save_path = (
686
+ Path(self.config.ldscore_save_dir) / f"SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather"
687
+ )
564
688
  SNP_gene_pair_save_path.parent.mkdir(parents=True, exist_ok=True)
565
689
  SNP_gene_pair.reset_index().to_feather(SNP_gene_pair_save_path)
566
690
 
567
691
  # Get the dummy matrix
568
- SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair['gene_name'], dummy_na=True)
692
+ SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair["gene_name"], dummy_na=True)
569
693
  return SNP_gene_pair_dummy
570
694
 
571
695
  def get_SNP_gene_pair_from_gtf(self, bim, bim_pr):
572
696
  logger.info(
573
- "Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)")
697
+ "Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)"
698
+ )
574
699
  overlaps_small = Overlaps_gtf_bim(self.gtf_pr, bim_pr)
575
700
  # Get the SNP-gene pair
576
701
  annot = bim[["CHR", "BP", "SNP", "CM"]]
577
- SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')
702
+ SNP_gene_pair = (
703
+ overlaps_small[["SNP", "gene_name"]]
704
+ .set_index("SNP")
705
+ .join(annot.set_index("SNP"), how="right")
706
+ )
578
707
  return SNP_gene_pair
579
708
 
580
- def get_SNP_gene_pair_from_enhancer(self, bim, bim_pr, ):
709
+ def get_SNP_gene_pair_from_enhancer(
710
+ self,
711
+ bim,
712
+ bim_pr,
713
+ ):
581
714
  logger.info(
582
- "Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score")
715
+ "Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score"
716
+ )
583
717
  # Get the SNP-gene pair
584
718
  overlaps_small = self.enhancer_pr.join(bim_pr).df
585
719
  annot = bim[["CHR", "BP", "SNP", "CM"]]
586
- if self.config.snp_multiple_enhancer_strategy == 'max_mkscore':
587
- logger.debug('select the gene with highest marker score')
588
- overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').avg_mkscore.idxmax()]
589
-
590
- elif self.config.snp_multiple_enhancer_strategy == 'nearest_TSS':
591
- logger.debug('select the gene with nearest TSS')
592
- overlaps_small['Distance'] = np.abs(overlaps_small['Start_b'] - overlaps_small['TSS'])
593
- overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').Distance.idxmin()]
594
-
595
- SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')
720
+ if self.config.snp_multiple_enhancer_strategy == "max_mkscore":
721
+ logger.debug("select the gene with highest marker score")
722
+ overlaps_small = overlaps_small.loc[overlaps_small.groupby("SNP").avg_mkscore.idxmax()]
723
+
724
+ elif self.config.snp_multiple_enhancer_strategy == "nearest_TSS":
725
+ logger.debug("select the gene with nearest TSS")
726
+ overlaps_small["Distance"] = np.abs(overlaps_small["Start_b"] - overlaps_small["TSS"])
727
+ overlaps_small = overlaps_small.loc[overlaps_small.groupby("SNP").Distance.idxmin()]
728
+
729
+ SNP_gene_pair = (
730
+ overlaps_small[["SNP", "gene_name"]]
731
+ .set_index("SNP")
732
+ .join(annot.set_index("SNP"), how="right")
733
+ )
596
734
 
597
735
  return SNP_gene_pair
598
736
 
599
737
 
600
738
  def run_generate_ldscore(config: GenerateLDScoreConfig):
601
- if config.ldscore_save_format == 'quick_mode':
602
- logger.info('Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore.')
739
+ if config.ldscore_save_format == "quick_mode":
740
+ logger.info(
741
+ "Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore."
742
+ )
603
743
  ldscore_save_dir = config.ldscore_save_dir
604
744
 
605
745
  # link the baseline annotation
606
746
  baseline_annotation_dir = Path(config.baseline_annotation_dir)
607
- (ldscore_save_dir / 'baseline').symlink_to(baseline_annotation_dir, target_is_directory=True)
747
+ (ldscore_save_dir / "baseline").symlink_to(
748
+ baseline_annotation_dir, target_is_directory=True
749
+ )
608
750
 
609
751
  # link the SNP_gene_pair
610
752
  SNP_gene_pair_dir = Path(config.SNP_gene_pair_dir)
611
- (ldscore_save_dir / 'SNP_gene_pair').symlink_to(SNP_gene_pair_dir, target_is_directory=True)
753
+ (ldscore_save_dir / "SNP_gene_pair").symlink_to(
754
+ SNP_gene_pair_dir, target_is_directory=True
755
+ )
612
756
  return
613
757
  s_ldsc_boost = S_LDSC_Boost(config)
614
- if config.chrom == 'all':
758
+ if config.chrom == "all":
615
759
  for chrom in range(1, 23):
616
760
  s_ldsc_boost.process_chromosome(chrom)
617
761
  else: