gsMap 1.71.2__py3-none-any.whl → 1.73.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/generate_ldscore.py CHANGED
@@ -10,7 +10,7 @@ from scipy.sparse import csr_matrix
10
10
  from tqdm import trange
11
11
 
12
12
  from gsMap.config import GenerateLDScoreConfig
13
- from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
13
+ from gsMap.utils.generate_r2_matrix import getBlockLefts, load_bfile
14
14
 
15
15
  warnings.filterwarnings("ignore", category=FutureWarning)
16
16
  logger = logging.getLogger(__name__)
@@ -25,34 +25,36 @@ def load_gtf(gtf_file, mk_score, window_size):
25
25
  print("Loading gtf data")
26
26
  #
27
27
  # Load GTF file
28
- gtf = pr.read_gtf(gtf_file, )
28
+ gtf = pr.read_gtf(
29
+ gtf_file,
30
+ )
29
31
  gtf = gtf.df
30
32
  #
31
33
  # Select the common genes
32
- gtf = gtf[gtf['Feature'] == 'gene']
34
+ gtf = gtf[gtf["Feature"] == "gene"]
33
35
  common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
34
36
  #
35
37
  gtf = gtf[gtf.gene_name.isin(common_gene)]
36
38
  mk_score = mk_score[mk_score.index.isin(common_gene)]
37
39
  #
38
40
  # Remove duplicated lines
39
- gtf = gtf.drop_duplicates(subset='gene_name', keep="first")
41
+ gtf = gtf.drop_duplicates(subset="gene_name", keep="first")
40
42
  #
41
43
  # Process the GTF (open 100-KB window: Tss - Ted)
42
- gtf_bed = gtf[['Chromosome', 'Start', 'End', 'gene_name', 'Strand']].copy()
43
- gtf_bed.loc[:, 'TSS'] = gtf_bed['Start']
44
- gtf_bed.loc[:, 'TED'] = gtf_bed['End']
44
+ gtf_bed = gtf[["Chromosome", "Start", "End", "gene_name", "Strand"]].copy()
45
+ gtf_bed.loc[:, "TSS"] = gtf_bed["Start"]
46
+ gtf_bed.loc[:, "TED"] = gtf_bed["End"]
45
47
 
46
- gtf_bed.loc[:, 'Start'] = gtf_bed['TSS'] - window_size
47
- gtf_bed.loc[:, 'End'] = gtf_bed['TED'] + window_size
48
- gtf_bed.loc[gtf_bed['Start'] < 0, 'Start'] = 0
48
+ gtf_bed.loc[:, "Start"] = gtf_bed["TSS"] - window_size
49
+ gtf_bed.loc[:, "End"] = gtf_bed["TED"] + window_size
50
+ gtf_bed.loc[gtf_bed["Start"] < 0, "Start"] = 0
49
51
  #
50
52
  # Correct the negative strand
51
- tss_neg = gtf_bed.loc[gtf_bed['Strand'] == '-', 'TSS']
52
- ted_neg = gtf_bed.loc[gtf_bed['Strand'] == '-', 'TED']
53
- gtf_bed.loc[gtf_bed['Strand'] == '-', 'TSS'] = ted_neg
54
- gtf_bed.loc[gtf_bed['Strand'] == '-', 'TED'] = tss_neg
55
- gtf_bed = gtf_bed.drop('Strand', axis=1)
53
+ tss_neg = gtf_bed.loc[gtf_bed["Strand"] == "-", "TSS"]
54
+ ted_neg = gtf_bed.loc[gtf_bed["Strand"] == "-", "TED"]
55
+ gtf_bed.loc[gtf_bed["Strand"] == "-", "TSS"] = ted_neg
56
+ gtf_bed.loc[gtf_bed["Strand"] == "-", "TED"] = tss_neg
57
+ gtf_bed = gtf_bed.drop("Strand", axis=1)
56
58
  #
57
59
  # Transform the GTF to PyRanges
58
60
  gtf_pr = pr.PyRanges(gtf_bed)
@@ -64,31 +66,28 @@ def load_marker_score(mk_score_file):
64
66
  """
65
67
  Load marker scores of each cell.
66
68
  """
67
- mk_score = pd.read_feather(mk_score_file).set_index('HUMAN_GENE_SYM').rename_axis('gene_name')
69
+ mk_score = pd.read_feather(mk_score_file).set_index("HUMAN_GENE_SYM").rename_axis("gene_name")
68
70
  mk_score = mk_score.astype(np.float32, copy=False)
69
71
  return mk_score
70
72
 
71
73
 
72
- # %%
73
- # load mkscore get common gene
74
- # %%
75
74
  # load bim
76
75
  def load_bim(bfile_root, chrom):
77
76
  """
78
77
  Load the bim file.
79
78
  """
80
- bim = pd.read_csv(f'{bfile_root}.{chrom}.bim', sep='\t', header=None)
79
+ bim = pd.read_csv(f"{bfile_root}.{chrom}.bim", sep="\t", header=None)
81
80
  bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]
82
81
  #
83
82
  # Transform bim to PyRanges
84
83
  bim_pr = bim.copy()
85
84
  bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
86
85
 
87
- bim_pr['End'] = bim_pr['Start'].copy()
88
- bim_pr['Start'] = bim_pr['Start'] - 1 # Due to bim file is 1-based
86
+ bim_pr["End"] = bim_pr["Start"].copy()
87
+ bim_pr["Start"] = bim_pr["Start"] - 1 # Due to bim file is 1-based
89
88
 
90
89
  bim_pr = pr.PyRanges(bim_pr)
91
- bim_pr.Chromosome = f'chr{chrom}'
90
+ bim_pr.Chromosome = f"chr{chrom}"
92
91
  return bim, bim_pr
93
92
 
94
93
 
@@ -100,9 +99,9 @@ def Overlaps_gtf_bim(gtf_pr, bim_pr):
100
99
  # Select the overlapped regions (SNPs in gene windows)
101
100
  overlaps = gtf_pr.join(bim_pr)
102
101
  overlaps = overlaps.df
103
- overlaps['Distance'] = np.abs(overlaps['Start_b'] - overlaps['TSS'])
102
+ overlaps["Distance"] = np.abs(overlaps["Start_b"] - overlaps["TSS"])
104
103
  overlaps_small = overlaps.copy()
105
- overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').Distance.idxmin()]
104
+ overlaps_small = overlaps_small.loc[overlaps_small.groupby("SNP").Distance.idxmin()]
106
105
  return overlaps_small
107
106
 
108
107
 
@@ -110,7 +109,7 @@ def Overlaps_gtf_bim(gtf_pr, bim_pr):
110
109
  def filter_snps_by_keep_snp(bim_df, keep_snp_file):
111
110
  # Load the keep_snp file and filter the BIM DataFrame
112
111
  keep_snp = pd.read_csv(keep_snp_file, header=None)[0].to_list()
113
- filtered_bim_df = bim_df[bim_df['SNP'].isin(keep_snp)]
112
+ filtered_bim_df = bim_df[bim_df["SNP"].isin(keep_snp)]
114
113
  return filtered_bim_df
115
114
 
116
115
 
@@ -122,7 +121,7 @@ def get_snp_counts(config):
122
121
  bim_df, _ = load_bim(config.bfile_root, chrom)
123
122
 
124
123
  if config.keep_snp_root:
125
- keep_snp_file = f'{config.keep_snp_root}.{chrom}.snp'
124
+ keep_snp_file = f"{config.keep_snp_root}.{chrom}.snp"
126
125
  filtered_bim_df = filter_snps_by_keep_snp(bim_df, keep_snp_file)
127
126
  else:
128
127
  filtered_bim_df = bim_df
@@ -130,11 +129,11 @@ def get_snp_counts(config):
130
129
  snp_counts[chrom] = filtered_bim_df.shape[0]
131
130
  total_snp += snp_counts[chrom]
132
131
 
133
- snp_counts['total'] = total_snp
132
+ snp_counts["total"] = total_snp
134
133
 
135
134
  chrom_snp_length_array = np.array([snp_counts[chrom] for chrom in range(1, 23)]).cumsum()
136
135
 
137
- snp_counts['chrom_snp_start_point'] = [0] + chrom_snp_length_array.tolist()
136
+ snp_counts["chrom_snp_start_point"] = [0] + chrom_snp_length_array.tolist()
138
137
 
139
138
  return snp_counts
140
139
 
@@ -144,56 +143,35 @@ def get_snp_pass_maf(bfile_root, chrom, maf_min=0.05):
144
143
  """
145
144
  Get the dummy matrix of SNP-gene pairs.
146
145
  """
147
- # Load the bim file
148
- PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
149
- PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
146
+ array_snps, array_indivs, geno_array = load_bfile(bfile_chr_prefix=f"{bfile_root}.{chrom}")
150
147
 
151
- bfile = f'{bfile_root}.{chrom}'
152
- snp_file, snp_obj = bfile + '.bim', PlinkBIMFile
153
- array_snps = snp_obj(snp_file)
154
148
  m = len(array_snps.IDList)
155
-
156
- # Load fam
157
- ind_file, ind_obj = bfile + '.fam', PlinkFAMFile
158
- array_indivs = ind_obj(ind_file)
159
149
  n = len(array_indivs.IDList)
160
- array_file, array_obj = bfile + '.bed', PlinkBEDFileWithR2Cache
161
- geno_array = array_obj(array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None)
150
+ logger.info(
151
+ f"Loading genotype data for {m} SNPs and {n} individuals from {bfile_root}.{chrom}"
152
+ )
153
+
162
154
  ii = geno_array.maf > maf_min
163
155
  snp_pass_maf = array_snps.IDList[ii]
164
- print(f'After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain.')
156
+ logger.info(f"After filtering SNPs with MAF < {maf_min}, {len(snp_pass_maf)} SNPs remain.")
165
157
  return snp_pass_maf.SNP.to_list()
166
158
 
167
159
 
168
- def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit='CM'):
169
- PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
170
- PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
160
+ def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit="CM"):
161
+ array_snps, array_indivs, geno_array = load_bfile(bfile_chr_prefix=f"{bfile_root}.{chrom}")
171
162
 
172
- bfile = f'{bfile_root}.{chrom}'
173
- snp_file, snp_obj = bfile + '.bim', PlinkBIMFile
174
- array_snps = snp_obj(snp_file)
175
- m = len(array_snps.IDList)
176
- print(f'Read list of {m} SNPs from {snp_file}')
177
-
178
- # Load fam
179
- ind_file, ind_obj = bfile + '.fam', PlinkFAMFile
180
- array_indivs = ind_obj(ind_file)
181
- n = len(array_indivs.IDList)
182
- print(f'Read list of {n} individuals from {ind_file}')
183
- array_file, array_obj = bfile + '.bed', PlinkBEDFileWithR2Cache
184
- geno_array = array_obj(array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None)
185
163
  # Load the annotations of the baseline
186
- if ld_unit == 'SNP':
164
+ if ld_unit == "SNP":
187
165
  max_dist = ld_wind
188
166
  coords = np.array(range(geno_array.m))
189
- elif ld_unit == 'KB':
167
+ elif ld_unit == "KB":
190
168
  max_dist = ld_wind * 1000
191
- coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
192
- elif ld_unit == 'CM':
169
+ coords = np.array(array_snps.df["BP"])[geno_array.kept_snps]
170
+ elif ld_unit == "CM":
193
171
  max_dist = ld_wind
194
- coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]
172
+ coords = np.array(array_snps.df["CM"])[geno_array.kept_snps]
195
173
  else:
196
- raise ValueError(f'Invalid ld_wind_unit: {ld_unit}')
174
+ raise ValueError(f"Invalid ld_wind_unit: {ld_unit}")
197
175
  block_left = getBlockLefts(coords, max_dist)
198
176
  # Calculate the LD score
199
177
  lN_df = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 100, annot=annot_matrix))
@@ -201,25 +179,31 @@ def get_ldscore(bfile_root, chrom, annot_matrix, ld_wind, ld_unit='CM'):
201
179
 
202
180
 
203
181
  # %%
204
- def calculate_ldscore_from_annotation(SNP_annotation_df, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
182
+ def calculate_ldscore_from_annotation(
183
+ SNP_annotation_df, chrom, bfile_root, ld_wind=1, ld_unit="CM"
184
+ ):
205
185
  """
206
186
  Calculate the SNP-gene weight matrix.
207
187
  """
208
188
  # Get the dummy matrix
209
189
  # Get the SNP-gene weight matrix
210
- snp_gene_weight_matrix = get_ldscore(bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind,
211
- ld_unit=ld_unit)
190
+ snp_gene_weight_matrix = get_ldscore(
191
+ bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
192
+ )
212
193
  snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
213
194
  snp_gene_weight_matrix.index = SNP_annotation_df.index
214
195
  snp_gene_weight_matrix.columns = SNP_annotation_df.columns
215
196
  return snp_gene_weight_matrix
216
197
 
217
198
 
218
- def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit='CM'):
199
+ def calculate_ldscore_from_multiple_annotation(
200
+ SNP_annotation_df_list, chrom, bfile_root, ld_wind=1, ld_unit="CM"
201
+ ):
219
202
  SNP_annotation_df = pd.concat(SNP_annotation_df_list, axis=1).astype(np.float32, copy=False)
220
203
 
221
- snp_gene_weight_matrix = get_ldscore(bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind,
222
- ld_unit=ld_unit)
204
+ snp_gene_weight_matrix = get_ldscore(
205
+ bfile_root, chrom, SNP_annotation_df.values, ld_wind=ld_wind, ld_unit=ld_unit
206
+ )
223
207
  snp_gene_weight_matrix = snp_gene_weight_matrix.astype(np.float32, copy=False)
224
208
  snp_gene_weight_matrix.index = SNP_annotation_df.index
225
209
  snp_gene_weight_matrix.columns = SNP_annotation_df.columns
@@ -229,7 +213,9 @@ def calculate_ldscore_from_multiple_annotation(SNP_annotation_df_list, chrom, bf
229
213
  snp_gene_weight_matrix_list = []
230
214
  start = 0
231
215
  for snp_annotation_len in snp_annotation_len_list:
232
- snp_gene_weight_matrix_list.append(snp_gene_weight_matrix.iloc[:, start:start + snp_annotation_len])
216
+ snp_gene_weight_matrix_list.append(
217
+ snp_gene_weight_matrix.iloc[:, start : start + snp_annotation_len]
218
+ )
233
219
  start += snp_annotation_len
234
220
  return snp_gene_weight_matrix_list
235
221
 
@@ -242,21 +228,28 @@ class S_LDSC_Boost:
242
228
  self.mk_score = load_marker_score(config.mkscore_feather_path)
243
229
 
244
230
  # Load GTF and get common markers
245
- self.gtf_pr, self.mk_score_common = load_gtf(config.gtf_annotation_file, self.mk_score,
246
- window_size=config.gene_window_size)
231
+ self.gtf_pr, self.mk_score_common = load_gtf(
232
+ config.gtf_annotation_file, self.mk_score, window_size=config.gene_window_size
233
+ )
247
234
 
248
235
  # Load enhancer
249
236
  if config.enhancer_annotation_file is not None:
250
237
  enhancer_df = pr.read_bed(config.enhancer_annotation_file, as_df=True)
251
- enhancer_df.set_index('Name', inplace=True)
252
- enhancer_df.index.name = 'gene_name'
238
+ enhancer_df.set_index("Name", inplace=True)
239
+ enhancer_df.index.name = "gene_name"
253
240
 
254
241
  # keep the common genes and add the enhancer score
255
- avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=['avg_mkscore'])
256
- enhancer_df = enhancer_df.join(avg_mkscore, how='inner', on='gene_name', )
242
+ avg_mkscore = pd.DataFrame(self.mk_score_common.mean(axis=1), columns=["avg_mkscore"])
243
+ enhancer_df = enhancer_df.join(
244
+ avg_mkscore,
245
+ how="inner",
246
+ on="gene_name",
247
+ )
257
248
 
258
249
  # add distance to TSS
259
- enhancer_df['TSS'] = self.gtf_pr.df.set_index('gene_name').reindex(enhancer_df.index)['TSS']
250
+ enhancer_df["TSS"] = self.gtf_pr.df.set_index("gene_name").reindex(enhancer_df.index)[
251
+ "TSS"
252
+ ]
260
253
 
261
254
  # convert to pyranges
262
255
  self.enhancer_pr = pr.PyRanges(enhancer_df.reset_index())
@@ -265,32 +258,39 @@ class S_LDSC_Boost:
265
258
  self.enhancer_pr = None
266
259
 
267
260
  # create tha zarr file
268
- if config.ldscore_save_format == 'zarr':
269
-
261
+ if config.ldscore_save_format == "zarr":
270
262
  chrom_snp_length_dict = get_snp_counts(config)
271
- self.chrom_snp_start_point = chrom_snp_length_dict['chrom_snp_start_point']
263
+ self.chrom_snp_start_point = chrom_snp_length_dict["chrom_snp_start_point"]
272
264
 
273
- zarr_path = Path(config.ldscore_save_dir) / f'{config.sample_name}.ldscore.zarr'
265
+ zarr_path = Path(config.ldscore_save_dir) / f"{config.sample_name}.ldscore.zarr"
274
266
  if not zarr_path.exists():
275
- self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a', dtype=np.float16,
276
- chunks=config.zarr_chunk_size,
277
- shape=(chrom_snp_length_dict['total'], self.mk_score_common.shape[1]))
267
+ self.zarr_file = zarr.open(
268
+ zarr_path.as_posix(),
269
+ mode="a",
270
+ dtype=np.float16,
271
+ chunks=config.zarr_chunk_size,
272
+ shape=(chrom_snp_length_dict["total"], self.mk_score_common.shape[1]),
273
+ )
278
274
  zarr_path.mkdir(parents=True, exist_ok=True)
279
275
  # save spot names
280
- self.zarr_file.attrs['spot_names'] = self.mk_score_common.columns.to_list()
276
+ self.zarr_file.attrs["spot_names"] = self.mk_score_common.columns.to_list()
281
277
  # save chrom_snp_length_dict
282
- self.zarr_file.attrs['chrom_snp_start_point'] = self.chrom_snp_start_point
278
+ self.zarr_file.attrs["chrom_snp_start_point"] = self.chrom_snp_start_point
283
279
  else:
284
- self.zarr_file = zarr.open(zarr_path.as_posix(), mode='a')
280
+ self.zarr_file = zarr.open(zarr_path.as_posix(), mode="a")
285
281
 
286
282
  def process_chromosome(self, chrom: int):
287
283
  self.snp_pass_maf = get_snp_pass_maf(self.config.bfile_root, chrom, maf_min=0.05)
288
284
 
289
285
  # Get SNP-Gene dummy pairs
290
- self.snp_gene_pair_dummy = self.get_snp_gene_dummy(chrom, )
286
+ self.snp_gene_pair_dummy = self.get_snp_gene_dummy(
287
+ chrom,
288
+ )
291
289
 
292
290
  if self.config.keep_snp_root is not None:
293
- keep_snp = pd.read_csv(f'{self.config.keep_snp_root}.{chrom}.snp', header=None)[0].to_list()
291
+ keep_snp = pd.read_csv(f"{self.config.keep_snp_root}.{chrom}.snp", header=None)[
292
+ 0
293
+ ].to_list()
294
294
  self.keep_snp_mask = self.snp_gene_pair_dummy.index.isin(keep_snp)
295
295
  # the SNP name of keeped
296
296
  self.snp_name = self.snp_gene_pair_dummy.index[self.keep_snp_mask].to_list()
@@ -300,25 +300,37 @@ class S_LDSC_Boost:
300
300
 
301
301
  if self.config.additional_baseline_annotation is not None:
302
302
  additional_baseline_annotation = Path(self.config.additional_baseline_annotation)
303
- additional_baseline_annotation_file_path = additional_baseline_annotation / f'baseline.{chrom}.annot.gz'
304
- assert additional_baseline_annotation_file_path.exists(), f'additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}'
305
- additional_baseline_annotation_df = pd.read_csv(additional_baseline_annotation_file_path, sep='\t')
306
- additional_baseline_annotation_df.set_index('SNP', inplace=True)
303
+ additional_baseline_annotation_file_path = (
304
+ additional_baseline_annotation / f"baseline.{chrom}.annot.gz"
305
+ )
306
+ assert additional_baseline_annotation_file_path.exists(), (
307
+ f"additional_baseline_annotation_file_path not exists: {additional_baseline_annotation_file_path}"
308
+ )
309
+ additional_baseline_annotation_df = pd.read_csv(
310
+ additional_baseline_annotation_file_path, sep="\t"
311
+ )
312
+ additional_baseline_annotation_df.set_index("SNP", inplace=True)
307
313
 
308
314
  # drop these columns if exists CHR BP CM]
309
- additional_baseline_annotation_df.drop(['CHR', 'BP', 'CM'], axis=1, inplace=True, errors='ignore')
315
+ additional_baseline_annotation_df.drop(
316
+ ["CHR", "BP", "CM"], axis=1, inplace=True, errors="ignore"
317
+ )
310
318
 
311
319
  # reindex, for those SNPs not in additional_baseline_annotation_df, set to 0
312
- num_of_not_exist_snp = (~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)).sum()
320
+ num_of_not_exist_snp = (
321
+ ~self.snp_gene_pair_dummy.index.isin(additional_baseline_annotation_df.index)
322
+ ).sum()
313
323
  if num_of_not_exist_snp > 0:
314
324
  logger.warning(
315
- f'{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0')
325
+ f"{num_of_not_exist_snp} SNPs not in additional_baseline_annotation_df but in the reference panel, so the additional baseline annotation of these SNP will set to 0"
326
+ )
316
327
  additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
317
- self.snp_gene_pair_dummy.index,
318
- fill_value=0)
328
+ self.snp_gene_pair_dummy.index, fill_value=0
329
+ )
319
330
  else:
320
331
  additional_baseline_annotation_df = additional_baseline_annotation_df.reindex(
321
- self.snp_gene_pair_dummy.index)
332
+ self.snp_gene_pair_dummy.index
333
+ )
322
334
 
323
335
  # do this for saving the cpu time, only calculate r2 once
324
336
  self.snp_gene_weight_matrix, additional_baseline_annotation_ldscore = (
@@ -327,56 +339,85 @@ class S_LDSC_Boost:
327
339
  chrom,
328
340
  self.config.bfile_root,
329
341
  ld_wind=self.config.ld_wind,
330
- ld_unit=self.config.ld_unit))
342
+ ld_unit=self.config.ld_unit,
343
+ )
344
+ )
331
345
 
332
- additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[self.snp_name]
346
+ additional_baseline_annotation_ldscore = additional_baseline_annotation_ldscore.loc[
347
+ self.snp_name
348
+ ]
333
349
  # print(additional_baseline_annotation_ldscore.index.to_list()==self.snp_name)
334
350
 
335
- ld_score_file = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather'
336
- M_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M'
337
- M_5_file_path = f'{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50'
351
+ ld_score_file = f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.ldscore.feather"
352
+ M_file_path = (
353
+ f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M"
354
+ )
355
+ M_5_file_path = (
356
+ f"{self.config.ldscore_save_dir}/additional_baseline/baseline.{chrom}.l2.M_5_50"
357
+ )
338
358
 
339
359
  # save additional baseline annotation ldscore
340
- self.save_ldscore_to_feather(additional_baseline_annotation_ldscore.values,
341
- column_names=additional_baseline_annotation_ldscore.columns,
342
- save_file_name=ld_score_file,
343
- )
360
+ self.save_ldscore_to_feather(
361
+ additional_baseline_annotation_ldscore.values,
362
+ column_names=additional_baseline_annotation_ldscore.columns,
363
+ save_file_name=ld_score_file,
364
+ )
344
365
 
345
366
  # caculate the M and save
346
367
  save_dir = Path(M_file_path).parent
347
368
  save_dir.mkdir(parents=True, exist_ok=True)
348
369
  M_chr_chunk = additional_baseline_annotation_df.values.sum(axis=0, keepdims=True)
349
- M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(axis=0, keepdims=True)
350
- np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
351
- np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )
370
+ M_5_chr_chunk = additional_baseline_annotation_df.loc[self.snp_pass_maf].values.sum(
371
+ axis=0, keepdims=True
372
+ )
373
+ np.savetxt(
374
+ M_file_path,
375
+ M_chr_chunk,
376
+ delimiter="\t",
377
+ )
378
+ np.savetxt(
379
+ M_5_file_path,
380
+ M_5_chr_chunk,
381
+ delimiter="\t",
382
+ )
352
383
 
353
384
  else:
354
385
  # Calculate SNP-Gene weight matrix
355
- self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(self.snp_gene_pair_dummy, chrom,
356
- self.config.bfile_root,
357
- ld_wind=self.config.ld_wind,
358
- ld_unit=self.config.ld_unit)
386
+ self.snp_gene_weight_matrix = calculate_ldscore_from_annotation(
387
+ self.snp_gene_pair_dummy,
388
+ chrom,
389
+ self.config.bfile_root,
390
+ ld_wind=self.config.ld_wind,
391
+ ld_unit=self.config.ld_unit,
392
+ )
359
393
  # only keep the snp in keep_snp_root
360
394
  if self.keep_snp_mask is not None:
361
395
  self.snp_gene_weight_matrix = self.snp_gene_weight_matrix[self.keep_snp_mask]
362
396
 
363
397
  if self.config.save_pre_calculate_snp_gene_weight_matrix:
364
- snp_gene_weight_matrix_save_dir = Path(self.config.ldscore_save_dir) / 'snp_gene_weight_matrix'
398
+ snp_gene_weight_matrix_save_dir = (
399
+ Path(self.config.ldscore_save_dir) / "snp_gene_weight_matrix"
400
+ )
365
401
  snp_gene_weight_matrix_save_dir.mkdir(parents=True, exist_ok=True)
366
- logger.info(f'Saving snp_gene_weight_matrix for chr{chrom}...')
402
+ logger.info(f"Saving snp_gene_weight_matrix for chr{chrom}...")
367
403
  self.snp_gene_weight_matrix.reset_index().to_feather(
368
- snp_gene_weight_matrix_save_dir / f'{chrom}.snp_gene_weight_matrix.feather')
404
+ snp_gene_weight_matrix_save_dir / f"{chrom}.snp_gene_weight_matrix.feather"
405
+ )
369
406
 
370
407
  # convert to sparse
371
408
  self.snp_gene_weight_matrix = csr_matrix(self.snp_gene_weight_matrix)
372
- logger.info(f'Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}')
409
+ logger.info(
410
+ f"Compute snp_gene_weight_matrix finished. shape: {self.snp_gene_weight_matrix.shape}"
411
+ )
373
412
 
374
413
  # calculate baseline ld score
375
- logger.info(f'Calculating baseline ld score for chr{chrom}...')
376
- self.calculate_ldscore_for_base_line(chrom, self.config.sample_name, self.config.ldscore_save_dir)
414
+ logger.info(f"Calculating baseline ld score for chr{chrom}...")
415
+ self.calculate_ldscore_for_base_line(
416
+ chrom, self.config.sample_name, self.config.ldscore_save_dir
417
+ )
377
418
 
378
419
  # calculate ld score for annotation
379
- logger.info(f'Calculating ld score for annotation for chr{chrom}...')
420
+ logger.info(f"Calculating ld score for annotation for chr{chrom}...")
380
421
  self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
381
422
  self.mk_score_common.loc[self.snp_gene_pair_dummy.columns[:-1]],
382
423
  chrom,
@@ -384,11 +425,11 @@ class S_LDSC_Boost:
384
425
  self.config.ldscore_save_dir,
385
426
  )
386
427
 
387
- def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(self,
388
- mk_score_chunk,
389
- drop_dummy_na=True,
390
- ):
391
-
428
+ def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
429
+ self,
430
+ mk_score_chunk,
431
+ drop_dummy_na=True,
432
+ ):
392
433
  if drop_dummy_na:
393
434
  ldscore_chr_chunk = self.snp_gene_weight_matrix[:, :-1] @ mk_score_chunk
394
435
  else:
@@ -407,16 +448,20 @@ class S_LDSC_Boost:
407
448
  # self.keep_snp_mask]
408
449
 
409
450
  # save for each chunk
410
- df = pd.DataFrame(ldscore_chr_chunk,
411
- index=self.snp_name,
412
- columns=column_names,
413
- )
414
- df.index.name = 'SNP'
451
+ df = pd.DataFrame(
452
+ ldscore_chr_chunk,
453
+ index=self.snp_name,
454
+ columns=column_names,
455
+ )
456
+ df.index.name = "SNP"
415
457
  df.reset_index().to_feather(save_file_name)
416
458
 
417
- def save_ldscore_chunk_to_zarr(self, ldscore_chr_chunk: np.ndarray,
418
- chrom: int, start_col_index,
419
- ):
459
+ def save_ldscore_chunk_to_zarr(
460
+ self,
461
+ ldscore_chr_chunk: np.ndarray,
462
+ chrom: int,
463
+ start_col_index,
464
+ ):
420
465
  ldscore_chr_chunk = ldscore_chr_chunk.astype(np.float16, copy=False)
421
466
  # avoid overflow of float16, if inf, set to max of float16
422
467
  ldscore_chr_chunk[np.isinf(ldscore_chr_chunk)] = np.finfo(np.float16).max
@@ -425,63 +470,90 @@ class S_LDSC_Boost:
425
470
  chrom_snp_start_point = self.chrom_snp_start_point[chrom - 1]
426
471
  chrom_snp_end_point = self.chrom_snp_start_point[chrom]
427
472
 
428
- self.zarr_file[chrom_snp_start_point:chrom_snp_end_point,
429
- start_col_index:start_col_index + ldscore_chr_chunk.shape[1]] = ldscore_chr_chunk
430
-
431
- def calculate_M_use_SNP_gene_pair_dummy_by_chunk(self,
432
- mk_score_chunk,
433
- M_file_path, M_5_file_path,
434
- drop_dummy_na=True,
435
- ):
436
- '''
437
- calculate M use SNP_gene_pair_dummy_sumed_along_snp_axis and mk_score_chunk
438
- '''
439
- SNP_gene_pair_dummy_sumed_along_snp_axis = self.snp_gene_pair_dummy.values.sum(axis=0, keepdims=True)
440
- SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = self.snp_gene_pair_dummy.loc[self.snp_pass_maf].values.sum(
441
- axis=0,
442
- keepdims=True)
473
+ self.zarr_file[
474
+ chrom_snp_start_point:chrom_snp_end_point,
475
+ start_col_index : start_col_index + ldscore_chr_chunk.shape[1],
476
+ ] = ldscore_chr_chunk
477
+
478
+ def calculate_M_use_SNP_gene_pair_dummy_by_chunk(
479
+ self,
480
+ mk_score_chunk,
481
+ M_file_path,
482
+ M_5_file_path,
483
+ drop_dummy_na=True,
484
+ ):
485
+ """
486
+ Calculate M use SNP_gene_pair_dummy_sumed_along_snp_axis and mk_score_chunk
487
+ """
488
+ SNP_gene_pair_dummy_sumed_along_snp_axis = self.snp_gene_pair_dummy.values.sum(
489
+ axis=0, keepdims=True
490
+ )
491
+ SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = self.snp_gene_pair_dummy.loc[
492
+ self.snp_pass_maf
493
+ ].values.sum(axis=0, keepdims=True)
443
494
  if drop_dummy_na:
444
- SNP_gene_pair_dummy_sumed_along_snp_axis = SNP_gene_pair_dummy_sumed_along_snp_axis[:, :-1]
445
- SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf[:,
446
- :-1]
495
+ SNP_gene_pair_dummy_sumed_along_snp_axis = SNP_gene_pair_dummy_sumed_along_snp_axis[
496
+ :, :-1
497
+ ]
498
+ SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf = (
499
+ SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf[:, :-1]
500
+ )
447
501
  save_dir = Path(M_file_path).parent
448
502
  save_dir.mkdir(parents=True, exist_ok=True)
449
503
  M_chr_chunk = SNP_gene_pair_dummy_sumed_along_snp_axis @ mk_score_chunk
450
504
  M_5_chr_chunk = SNP_gene_pair_dummy_sumed_along_snp_axis_pass_maf @ mk_score_chunk
451
- np.savetxt(M_file_path, M_chr_chunk, delimiter='\t', )
452
- np.savetxt(M_5_file_path, M_5_chr_chunk, delimiter='\t', )
505
+ np.savetxt(
506
+ M_file_path,
507
+ M_chr_chunk,
508
+ delimiter="\t",
509
+ )
510
+ np.savetxt(
511
+ M_5_file_path,
512
+ M_5_chr_chunk,
513
+ delimiter="\t",
514
+ )
453
515
 
454
- def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(self, mk_score_common, chrom, sample_name, save_dir):
516
+ def calculate_ldscore_use_SNP_Gene_weight_matrix_by_chr(
517
+ self, mk_score_common, chrom, sample_name, save_dir
518
+ ):
455
519
  """
456
520
  Calculate the LD score using the SNP-gene weight matrix.
457
521
  :param sample_name:
458
522
  """
459
523
  # Calculate the LD score
460
524
  chunk_index = 1
461
- for i in trange(0, mk_score_common.shape[1], self.config.spots_per_chunk,
462
- desc=f'Calculating LD score by chunk for chr{chrom}'):
463
- mk_score_chunk = mk_score_common.iloc[:, i:i + self.config.spots_per_chunk]
464
-
465
- ld_score_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.ldscore.feather'
466
- M_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M'
467
- M_5_file = f'{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50'
525
+ for i in trange(
526
+ 0,
527
+ mk_score_common.shape[1],
528
+ self.config.spots_per_chunk,
529
+ desc=f"Calculating LD score by chunk for chr{chrom}",
530
+ ):
531
+ mk_score_chunk = mk_score_common.iloc[:, i : i + self.config.spots_per_chunk]
532
+
533
+ ld_score_file = f"{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.ldscore.feather"
534
+ M_file = f"{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M"
535
+ M_5_file = (
536
+ f"{save_dir}/{sample_name}_chunk{chunk_index}/{sample_name}.{chrom}.l2.M_5_50"
537
+ )
468
538
 
469
539
  ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
470
540
  mk_score_chunk,
471
541
  drop_dummy_na=True,
472
542
  )
473
- if self.config.ldscore_save_format == 'feather':
474
- self.save_ldscore_to_feather(ldscore_chr_chunk,
475
- column_names=mk_score_chunk.columns,
476
- save_file_name=ld_score_file,
477
- )
478
- elif self.config.ldscore_save_format == 'zarr':
479
- self.save_ldscore_chunk_to_zarr(ldscore_chr_chunk,
480
- chrom=chrom,
481
- start_col_index=i,
482
- )
543
+ if self.config.ldscore_save_format == "feather":
544
+ self.save_ldscore_to_feather(
545
+ ldscore_chr_chunk,
546
+ column_names=mk_score_chunk.columns,
547
+ save_file_name=ld_score_file,
548
+ )
549
+ elif self.config.ldscore_save_format == "zarr":
550
+ self.save_ldscore_chunk_to_zarr(
551
+ ldscore_chr_chunk,
552
+ chrom=chrom,
553
+ start_col_index=i,
554
+ )
483
555
  else:
484
- raise ValueError(f'Invalid ldscore_save_format: {self.config.ldscore_save_format}')
556
+ raise ValueError(f"Invalid ldscore_save_format: {self.config.ldscore_save_format}")
485
557
 
486
558
  self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
487
559
  mk_score_chunk,
@@ -496,21 +568,23 @@ class S_LDSC_Boost:
496
568
  # save baseline ld score
497
569
  baseline_mk_score = np.ones((self.snp_gene_pair_dummy.shape[1], 2))
498
570
  baseline_mk_score[-1, 0] = 0 # all_gene
499
- baseline_mk_score_df = pd.DataFrame(baseline_mk_score, index=self.snp_gene_pair_dummy.columns,
500
- columns=['all_gene', 'base'])
501
- ld_score_file = f'{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather'
502
- M_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M'
503
- M_5_file = f'{save_dir}/baseline/baseline.{chrom}.l2.M_5_50'
571
+ baseline_mk_score_df = pd.DataFrame(
572
+ baseline_mk_score, index=self.snp_gene_pair_dummy.columns, columns=["all_gene", "base"]
573
+ )
574
+ ld_score_file = f"{save_dir}/baseline/baseline.{chrom}.l2.ldscore.feather"
575
+ M_file = f"{save_dir}/baseline/baseline.{chrom}.l2.M"
576
+ M_5_file = f"{save_dir}/baseline/baseline.{chrom}.l2.M_5_50"
504
577
 
505
578
  ldscore_chr_chunk = self.calculate_ldscore_use_SNP_Gene_weight_matrix_by_chunk(
506
579
  baseline_mk_score_df,
507
580
  drop_dummy_na=False,
508
581
  )
509
582
 
510
- self.save_ldscore_to_feather(ldscore_chr_chunk,
511
- column_names=baseline_mk_score_df.columns,
512
- save_file_name=ld_score_file,
513
- )
583
+ self.save_ldscore_to_feather(
584
+ ldscore_chr_chunk,
585
+ column_names=baseline_mk_score_df.columns,
586
+ save_file_name=ld_score_file,
587
+ )
514
588
  # save baseline M
515
589
  self.calculate_M_use_SNP_gene_pair_dummy_by_chunk(
516
590
  baseline_mk_score_df,
@@ -519,7 +593,10 @@ class S_LDSC_Boost:
519
593
  drop_dummy_na=False,
520
594
  )
521
595
 
522
- def get_snp_gene_dummy(self, chrom, ):
596
+ def get_snp_gene_dummy(
597
+ self,
598
+ chrom,
599
+ ):
523
600
  """
524
601
  Get the dummy matrix of SNP-gene pairs.
525
602
  """
@@ -527,91 +604,126 @@ class S_LDSC_Boost:
527
604
  print("Loading bim data")
528
605
  bim, bim_pr = load_bim(self.config.bfile_root, chrom)
529
606
 
530
- if self.config.gene_window_enhancer_priority in ['gene_window_first', 'enhancer_first']:
531
-
532
- SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )
533
- SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )
607
+ if self.config.gene_window_enhancer_priority in ["gene_window_first", "enhancer_first"]:
608
+ SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(
609
+ bim,
610
+ bim_pr,
611
+ )
612
+ SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(
613
+ bim,
614
+ bim_pr,
615
+ )
534
616
  # total_SNP_gene_pair = SNP_gene_pair_gtf.join(SNP_gene_pair_enhancer, how='outer', lsuffix='_gtf', )
535
617
 
536
618
  mask_of_nan_gtf = SNP_gene_pair_gtf.gene_name.isna()
537
619
  mask_of_nan_enhancer = SNP_gene_pair_enhancer.gene_name.isna()
538
620
 
539
- if self.config.gene_window_enhancer_priority == 'gene_window_first':
621
+ if self.config.gene_window_enhancer_priority == "gene_window_first":
540
622
  SNP_gene_pair = SNP_gene_pair_gtf
541
- SNP_gene_pair.loc[mask_of_nan_gtf, 'gene_name'] = SNP_gene_pair_enhancer.loc[
542
- mask_of_nan_gtf, 'gene_name']
543
- elif self.config.gene_window_enhancer_priority == 'enhancer_first':
623
+ SNP_gene_pair.loc[mask_of_nan_gtf, "gene_name"] = SNP_gene_pair_enhancer.loc[
624
+ mask_of_nan_gtf, "gene_name"
625
+ ]
626
+ elif self.config.gene_window_enhancer_priority == "enhancer_first":
544
627
  SNP_gene_pair = SNP_gene_pair_enhancer
545
- SNP_gene_pair.loc[mask_of_nan_enhancer, 'gene_name'] = SNP_gene_pair_gtf.loc[
546
- mask_of_nan_enhancer, 'gene_name']
628
+ SNP_gene_pair.loc[mask_of_nan_enhancer, "gene_name"] = SNP_gene_pair_gtf.loc[
629
+ mask_of_nan_enhancer, "gene_name"
630
+ ]
547
631
  else:
548
632
  raise ValueError(
549
- f'Invalid self.config.gene_window_enhancer_priority: {self.config.gene_window_enhancer_priority}')
633
+ f"Invalid self.config.gene_window_enhancer_priority: {self.config.gene_window_enhancer_priority}"
634
+ )
550
635
 
551
636
  elif self.config.gene_window_enhancer_priority is None: # use gtf only
552
- SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(bim, bim_pr, )
637
+ SNP_gene_pair_gtf = self.get_SNP_gene_pair_from_gtf(
638
+ bim,
639
+ bim_pr,
640
+ )
553
641
  SNP_gene_pair = SNP_gene_pair_gtf
554
642
 
555
- elif self.config.gene_window_enhancer_priority == 'enhancer_only':
556
- SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(bim, bim_pr, )
643
+ elif self.config.gene_window_enhancer_priority == "enhancer_only":
644
+ SNP_gene_pair_enhancer = self.get_SNP_gene_pair_from_enhancer(
645
+ bim,
646
+ bim_pr,
647
+ )
557
648
  SNP_gene_pair = SNP_gene_pair_enhancer
558
649
  else:
559
- raise ValueError('gtf_pr and enhancer_pr cannot be None at the same time')
650
+ raise ValueError("gtf_pr and enhancer_pr cannot be None at the same time")
560
651
 
561
652
  # save the SNP_gene_pair to feather
562
- SNP_gene_pair_save_path = Path(
563
- self.config.ldscore_save_dir) / f'SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather'
653
+ SNP_gene_pair_save_path = (
654
+ Path(self.config.ldscore_save_dir) / f"SNP_gene_pair/SNP_gene_pair_chr{chrom}.feather"
655
+ )
564
656
  SNP_gene_pair_save_path.parent.mkdir(parents=True, exist_ok=True)
565
657
  SNP_gene_pair.reset_index().to_feather(SNP_gene_pair_save_path)
566
658
 
567
659
  # Get the dummy matrix
568
- SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair['gene_name'], dummy_na=True)
660
+ SNP_gene_pair_dummy = pd.get_dummies(SNP_gene_pair["gene_name"], dummy_na=True)
569
661
  return SNP_gene_pair_dummy
570
662
 
571
663
  def get_SNP_gene_pair_from_gtf(self, bim, bim_pr):
572
664
  logger.info(
573
- "Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)")
665
+ "Get SNP-gene pair from gtf, if a SNP is in multiple genes, it will be assigned to the most nearby gene (TSS)"
666
+ )
574
667
  overlaps_small = Overlaps_gtf_bim(self.gtf_pr, bim_pr)
575
668
  # Get the SNP-gene pair
576
669
  annot = bim[["CHR", "BP", "SNP", "CM"]]
577
- SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')
670
+ SNP_gene_pair = (
671
+ overlaps_small[["SNP", "gene_name"]]
672
+ .set_index("SNP")
673
+ .join(annot.set_index("SNP"), how="right")
674
+ )
578
675
  return SNP_gene_pair
579
676
 
580
- def get_SNP_gene_pair_from_enhancer(self, bim, bim_pr, ):
677
+ def get_SNP_gene_pair_from_enhancer(
678
+ self,
679
+ bim,
680
+ bim_pr,
681
+ ):
581
682
  logger.info(
582
- "Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score")
683
+ "Get SNP-gene pair from enhancer, if a SNP is in multiple genes, it will be assigned to the gene with highest marker score"
684
+ )
583
685
  # Get the SNP-gene pair
584
686
  overlaps_small = self.enhancer_pr.join(bim_pr).df
585
687
  annot = bim[["CHR", "BP", "SNP", "CM"]]
586
- if self.config.snp_multiple_enhancer_strategy == 'max_mkscore':
587
- logger.debug('select the gene with highest marker score')
588
- overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').avg_mkscore.idxmax()]
589
-
590
- elif self.config.snp_multiple_enhancer_strategy == 'nearest_TSS':
591
- logger.debug('select the gene with nearest TSS')
592
- overlaps_small['Distance'] = np.abs(overlaps_small['Start_b'] - overlaps_small['TSS'])
593
- overlaps_small = overlaps_small.loc[overlaps_small.groupby('SNP').Distance.idxmin()]
594
-
595
- SNP_gene_pair = overlaps_small[['SNP', 'gene_name']].set_index('SNP').join(annot.set_index('SNP'), how='right')
688
+ if self.config.snp_multiple_enhancer_strategy == "max_mkscore":
689
+ logger.debug("select the gene with highest marker score")
690
+ overlaps_small = overlaps_small.loc[overlaps_small.groupby("SNP").avg_mkscore.idxmax()]
691
+
692
+ elif self.config.snp_multiple_enhancer_strategy == "nearest_TSS":
693
+ logger.debug("select the gene with nearest TSS")
694
+ overlaps_small["Distance"] = np.abs(overlaps_small["Start_b"] - overlaps_small["TSS"])
695
+ overlaps_small = overlaps_small.loc[overlaps_small.groupby("SNP").Distance.idxmin()]
696
+
697
+ SNP_gene_pair = (
698
+ overlaps_small[["SNP", "gene_name"]]
699
+ .set_index("SNP")
700
+ .join(annot.set_index("SNP"), how="right")
701
+ )
596
702
 
597
703
  return SNP_gene_pair
598
704
 
599
705
 
600
706
  def run_generate_ldscore(config: GenerateLDScoreConfig):
601
- if config.ldscore_save_format == 'quick_mode':
602
- logger.info('Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore.')
707
+ if config.ldscore_save_format == "quick_mode":
708
+ logger.info(
709
+ "Running in quick_mode. Skip the process of generating ldscore. Using the pre-calculated ldscore."
710
+ )
603
711
  ldscore_save_dir = config.ldscore_save_dir
604
712
 
605
713
  # link the baseline annotation
606
714
  baseline_annotation_dir = Path(config.baseline_annotation_dir)
607
- (ldscore_save_dir / 'baseline').symlink_to(baseline_annotation_dir, target_is_directory=True)
715
+ (ldscore_save_dir / "baseline").symlink_to(
716
+ baseline_annotation_dir, target_is_directory=True
717
+ )
608
718
 
609
719
  # link the SNP_gene_pair
610
720
  SNP_gene_pair_dir = Path(config.SNP_gene_pair_dir)
611
- (ldscore_save_dir / 'SNP_gene_pair').symlink_to(SNP_gene_pair_dir, target_is_directory=True)
721
+ (ldscore_save_dir / "SNP_gene_pair").symlink_to(
722
+ SNP_gene_pair_dir, target_is_directory=True
723
+ )
612
724
  return
613
725
  s_ldsc_boost = S_LDSC_Boost(config)
614
- if config.chrom == 'all':
726
+ if config.chrom == "all":
615
727
  for chrom in range(1, 23):
616
728
  s_ldsc_boost.process_chromosome(chrom)
617
729
  else: