gsMap 1.60__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,560 @@
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import pprint
5
+ import time
6
+ from pathlib import Path
7
+
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ import pyranges as pr
12
+ from progress.bar import IncrementalBar
13
+
14
+ from gsMap.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
15
+
16
+
17
# Module-level logger with its own stream handler; records from DEBUG upwards
# are emitted as "[time] LEVEL    file message".
_LOG_FORMAT = '[{asctime}] {levelname:8s} {filename} {message}'

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(fmt=_LOG_FORMAT, style='{'))

logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
23
+ from dataclasses import dataclass, field
24
+ from typing import Optional
25
# Optional GPU backend. When CuPy cannot be imported, `cp` is set to None so
# the rest of the module can detect that the GPU path is unavailable.
try:
    import cupy as cp
except ImportError:
    logger.warning('Cupy not found, will not use GPU to compute LD score')
    cp = None
else:
    # Only the import itself is guarded, so a failure while configuring the
    # allocator is not misreported as "Cupy not found".
    # Route CuPy allocations through a memory pool built on cp.cuda.malloc_async.
    pool = cp.cuda.MemoryPool(cp.cuda.malloc_async)
    cp.cuda.set_allocator(pool.malloc)
32
@dataclass
class MakeAnnotationConfig:
    """
    Settings for the SNP annotation / LD score step.

    Only ``input_feather_file``, ``output_dir`` and ``sample_name`` are
    mandatory; every other field has a default.
    """

    # Required inputs
    input_feather_file: str
    output_dir: str
    sample_name: str

    # Optional reference inputs
    gtf_file: Optional[str] = None
    bfile_root: Optional[str] = None
    baseline_annotation: Optional[str] = None
    keep_snp_root: Optional[str] = None
    chr: Optional[int] = None

    # Annotation settings
    window_size: int = 50000
    cells_per_chunk: int = 500

    # LD score settings; the accepted units are recorded in the field metadata
    ld_wind: float = 1.0
    ld_wind_unit: str = field(default='CM', metadata={'choices': ['CM', 'BP', 'SNP']})
    r2_cache_dir: Optional[str] = None
    use_gpu: bool = False
    snps_per_chunk: int = 50_000

    def __post_init__(self):
        """Reject an ``ld_wind_unit`` that is not among the declared choices."""
        allowed_units = self.__dataclass_fields__['ld_wind_unit'].metadata['choices']
        if self.ld_wind_unit in allowed_units:
            return
        raise ValueError(f"Invalid ld_wind_unit: {self.ld_wind_unit}. Choose from 'CM', 'BP', or 'SNP'.")
53
+
54
+
55
+
56
class Snp_Annotator:
    """
    Map per-gene marker scores onto SNPs and write per-chromosome annotations.

    1. Annotate SNPs based on score of genes.
    2. Add baseline annotations.
    """

    def __init__(self, mk_score_file, gtf_file, bfile_root, annot_root, annot_name, chr=None, base_root=None,
                 window_size=50000, const_max_size=100):
        """
        :param mk_score_file: Feather file of marker scores; must contain a 'HUMAN_GENE_SYM' column
        :param gtf_file: Gene annotation (GTF) file
        :param bfile_root: Prefix of the per-chromosome bim files ('<bfile_root>.<chr>.bim')
        :param annot_root: Directory that receives the annotation files
        :param annot_name: Name used for the chunk folders and the annotation files
        :param chr: Chromosome to annotate; None annotates chromosomes 1-22
        :param base_root: Prefix of the baseline annotations ('<base_root>.<chr>.annot.gz'), or None
        :param window_size: Number of bases added on each side of a gene
        :param const_max_size: Maximum number of score columns written per chunk
        """
        # marker score
        self.mk_score_file = mk_score_file
        self.mk_score = self.load_marker_score()

        # chunk cells (the column count includes the inserted 'all_gene' column)
        self.n_cells = len(self.mk_score.columns)
        self.max_chunk = const_max_size

        # gtf data
        self.gtf_file = gtf_file
        self.window_size = window_size
        self.gtf_pr = self.load_gtf(mk_score=self.mk_score)

        self.bfile_root = bfile_root
        self.annot_root = annot_root
        self.base_root = base_root
        self.chr = chr

        self.data_name = annot_name

    def load_marker_score(self):
        """
        Load marker scores of each cell.

        Rows are indexed by gene name (taken from 'HUMAN_GENE_SYM') and a
        constant 'all_gene' column of ones is inserted as the first column.
        """
        mk_score = pd.read_feather(self.mk_score_file).set_index('HUMAN_GENE_SYM').rename_axis('gene_name')
        mk_score.insert(0, 'all_gene', 1)
        return mk_score

    def load_gtf(self, mk_score):
        """
        Load the gene annotation file (gtf).

        :param mk_score: Marker score frame indexed by gene name; only used to select the genes to keep
        :return: PyRanges of gene windows with columns Chromosome, Start, End, gene_name, TSS, TED
        """
        print("Loading gtf data")

        # Load GTF file and keep the gene records only
        gtf = pr.read_gtf(self.gtf_file).df
        gtf = gtf[gtf['Feature'] == 'gene']

        # Select the genes that also have a marker score. The score table is
        # left untouched: genes without a window never reach the SNP merge.
        common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
        gtf = gtf[gtf.gene_name.isin(common_gene)]

        # Remove duplicated lines
        gtf = gtf.drop_duplicates(subset='gene_name', keep="first")

        # Extend every gene by window_size on both sides, clipping at 0
        gtf_bed = gtf[['Chromosome', 'Start', 'End', 'gene_name', 'Strand']].copy()
        gtf_bed.loc[:, 'TSS'] = gtf_bed['Start']
        gtf_bed.loc[:, 'TED'] = gtf_bed['End']

        gtf_bed.loc[:, 'Start'] = gtf_bed['TSS'] - self.window_size
        gtf_bed.loc[:, 'End'] = gtf_bed['TED'] + self.window_size
        gtf_bed.loc[gtf_bed['Start'] < 0, 'Start'] = 0

        # Correct the negative strand: swap TSS and TED for those genes
        negative = gtf_bed['Strand'] == '-'
        tss_neg = gtf_bed.loc[negative, 'TSS']
        ted_neg = gtf_bed.loc[negative, 'TED']
        gtf_bed.loc[negative, 'TSS'] = ted_neg
        gtf_bed.loc[negative, 'TED'] = tss_neg
        gtf_bed = gtf_bed.drop('Strand', axis=1)

        # Transform the GTF to PyRanges
        return pr.PyRanges(gtf_bed)

    def load_baseline(self, chr):
        """
        Load baseline annotations.

        Reads '<base_root>.<chr>.annot.gz' (tab separated) and drops the
        'CHR', 'BP' and 'CM' columns.
        """
        baseline = pd.read_csv(f'{self.base_root}.{chr}.annot.gz', sep='\t')
        baseline.drop(['CHR', 'BP', 'CM'], axis=1, inplace=True)
        return baseline

    def Load_bim(self, chr):
        """
        Load bim files.

        :return: (bim_pr, bim) - the SNPs as single-position PyRanges with the
                 chromosome set to 'chr<chr>', and the raw bim table
        """
        bim_file = f'{self.bfile_root}.{chr}.bim'
        bim = pd.read_csv(bim_file, sep='\t', header=None)
        bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]

        # Transform bim to PyRanges (Start == End == SNP position)
        bim_pr = bim.copy()
        bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
        bim_pr['End'] = bim_pr['Start']
        bim_pr = pr.PyRanges(bim_pr)
        bim_pr.Chromosome = f'chr{chr}'
        return bim_pr, bim

    def Overlaps_gtf_bim(self, bim_pr):
        """
        Find overlaps between gtf and bim file.

        A SNP that falls into several gene windows is assigned to the gene
        whose TSS is closest to the SNP position.
        """
        # Select the overlapped regions (SNPs in gene windows)
        overlaps = self.gtf_pr.join(bim_pr).df
        overlaps['Distance'] = np.abs(overlaps['Start_b'] - overlaps['TSS'])

        # Keep one row per SNP: the one with the smallest distance
        closest_rows = overlaps.groupby('SNP').Distance.idxmin()
        return overlaps.copy().loc[closest_rows]

    def map_baseline(self, snp_score, baseline, chr):
        """
        Generate the baseline annotations for SNPs.

        :param snp_score: SNP score table of the first chunk (first six columns are the SNP header)
        :param baseline: Baseline annotation table, or None to write a uniform 'base' annotation
        :param chr: Chromosome the table belongs to
        :return: 0 (kept for backward compatibility)
        """
        header = snp_score.columns[0:6].to_list()

        if baseline is None:
            print(f'Baseline annotations of chr{chr} are not provided, using uniform annotations for genes and SNPs')
            baseline_score = snp_score[header + ['all_gene']].copy()
            baseline_score.loc[:, 'base'] = 1

        else:
            print(f'Mapping baseline annotations of chr{chr}')
            snp_score_baseline = pd.merge(snp_score, baseline, how='left', on='SNP').fillna(0).copy()

            baseline_score = snp_score_baseline[header + ['all_gene'] + baseline.columns.to_list()]
            # 'SNP' is selected twice (header and baseline columns); keep the first
            baseline_score = baseline_score.loc[:, ~baseline_score.columns.duplicated()].copy()

        # Create the folder (for baseline annotation)
        file_base_root = f'{self.annot_root}/baseline'
        os.makedirs(file_base_root, mode=0o777, exist_ok=True)

        # Save baseline annotations (in feather format)
        file_base = f'{file_base_root}/baseline.{chr}.feather'
        baseline_score.to_feather(file_base)

        return 0

    def annotate_chr(self, chr):
        """
        Annotate SNPs of each chr.

        Writes one feather file per chunk of score columns plus the baseline
        annotation, and returns the number of chunks written.
        """
        # Load the baseline file
        baseline = None
        if self.base_root is not None:
            baseline = self.load_baseline(chr)

        # Load the bim file
        bim_pr, bim = self.Load_bim(chr)

        # Find overlapping
        overlaps_small = self.Overlaps_gtf_bim(bim_pr)
        snp_to_gene = overlaps_small[['SNP', 'gene_name', 'TSS']]

        # Do annotations
        all_chunks = int(np.ceil(self.n_cells / self.max_chunk))
        bar = IncrementalBar(f'Mapping the gene marker scores to SNPs in chr{chr}', max=all_chunks)
        bar.check_tty = False

        # Preprocess bim outside the loop as it doesn't change
        # (pd.merge returns a new frame, so no per-chunk copy is needed)
        anno_template = bim[["CHR", "BP", "SNP", "CM"]]

        for chunk_index, left in enumerate(range(0, self.n_cells, self.max_chunk), start=1):
            right = min(left + self.max_chunk, self.n_cells)
            mk_score_current = self.mk_score.iloc[:, left:right]

            # Process marker scores for SNPs; SNPs outside every gene window get 0
            merged_data = snp_to_gene.merge(mk_score_current, on='gene_name', how='left')
            snp_score = pd.merge(anno_template, merged_data, how='left', on='SNP').fillna(0)
            snp_score = snp_score.rename(columns={'gene_name': 'Gene'})
            snp_score.loc[snp_score.Gene == 0, 'Gene'] = 'None'

            # Process baseline annotations for the first chunk, which is the
            # one that carries the 'all_gene' column
            if chunk_index == 1:
                self.map_baseline(snp_score, baseline, chr)
                snp_score = snp_score.drop('all_gene', axis=1)

            # Create the folder and save SNP annotations
            file_root = f'{self.annot_root}/{self.data_name}_chunk{chunk_index}'
            os.makedirs(file_root, mode=0o777, exist_ok=True)
            file_anno = f'{file_root}/{self.data_name}.{chr}.feather'
            snp_score.to_feather(file_anno)

            bar.next()

        bar.finish()

        return all_chunks

    def annotate(self):
        """
        Perform SNP annotations for each chromosome.

        :return: Number of chunks written for the (last) annotated chromosome
        """
        chromosomes = range(1, 23) if self.chr is None else [self.chr]
        for chr in chromosomes:
            const_max_size = self.annotate_chr(chr=chr)

        return const_max_size
272
+
273
+
274
class LDscore_Generator:
    """
    Compute LD scores for the baseline and per-chunk SNP annotations.

    Reads the feather annotations found under the configured output directory
    and writes '.l2.ldscore.feather', '.l2.M' and '.l2.M_5_50' files next to them.
    """

    def __init__(self, make_annotation_config: "MakeAnnotationConfig", const_max_size):
        """
        :param make_annotation_config: Run configuration (paths, LD window, cache and GPU settings)
        :param const_max_size: Number of annotation chunks to process per chromosome
        :raises ValueError: If an r2 cache directory is given without a chromosome
        """
        self.bfile_root = make_annotation_config.bfile_root
        self.annot_root = Path(make_annotation_config.output_dir)
        self.const_max_size = const_max_size
        self.data_name = make_annotation_config.sample_name
        self.chr = make_annotation_config.chr
        self.ld_wind = make_annotation_config.ld_wind
        self.ld_wind_unit = make_annotation_config.ld_wind_unit
        self.keep_snp = make_annotation_config.keep_snp_root
        self.r2_cache_dir = make_annotation_config.r2_cache_dir
        self.use_gpu = make_annotation_config.use_gpu
        self.config = make_annotation_config
        self.generate_r2_cache = False
        # Row-wise slices of the cached r2 matrix; filled by compute_ldscore_chr
        self.r2_matrix_chunk_list = []
        self.chr_r2_cache_dir = None

        # Set the r2 cache
        if self.r2_cache_dir is None:
            logger.info('No r2 cache directory specified, will not use r2 cache')
            return

        # Raised explicitly (not asserted) so the check survives `python -O`
        if self.chr is None:
            raise ValueError('Must specify chr when using r2 cache')

        chr_r2_cache_dir = os.path.join(self.r2_cache_dir, f'chr{self.chr}')
        self.chr_r2_cache_dir = chr_r2_cache_dir
        if not os.path.exists(os.path.join(chr_r2_cache_dir, 'combined_r2_matrix.npz')):
            logger.warning(
                f'No r2 cache found for chr{self.chr}, will generate r2 cache for this chromosome, first time may take a while')
            os.makedirs(chr_r2_cache_dir, exist_ok=True, mode=0o777, )
            self.generate_r2_cache = True
        else:
            logger.info(f'Found r2 cache for chr{self.chr}, will use r2 cache for this chromosome')

    def compute_ldscore(self):
        """
        Compute LD scores.

        Processes the configured chromosome, or chromosomes 1-22 when none is
        configured, and logs the elapsed time.
        """
        start_time = time.time()
        chromosomes = range(1, 23) if self.chr is None else [self.chr]
        for chr in chromosomes:
            logger.info(f'Computing LD scores for chr{chr}')
            self.compute_ldscore_chr(chr=chr)
            logger.info(f'Finished computing LD scores for chr{chr}')
        end_time = time.time()
        logger.info(f'Finished computing LD scores, time elapsed: {(end_time - start_time) / 60} minutes')

    def compute_ldscore_chunk(self, annot_file, ld_score_file, M_file, M_5_file,
                              geno_array: "PlinkBEDFileWithR2Cache", block_left, snp):
        """
        Compute and save LD scores for each chunk
        :param annot_file: Path to the annotation file
        :param ld_score_file: Path to the LD score file
        :param M_file: Path to the M file
        :param M_5_file: Path to the M_5_50 file
        :param geno_array: Genotype array
        :param block_left: Block left
        :param snp: SNP to be kept
        :return: None
        """
        annot_df = pd.read_feather(annot_file)

        # The first six columns are the SNP header; the rest are annotations
        annot_matrix = np.array(annot_df.iloc[:, 6:])

        # Reset the SNP point
        geno_array.__restart__()

        # Compute annotated LD score
        if self.chr_r2_cache_dir is None:
            # NOTE(review): 50 is passed as the second positional argument of
            # ldScoreVarBlocks - presumably a processing chunk size; confirm
            # against gsMap.generate_r2_matrix.
            lN_df = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 50, annot=annot_matrix))
        else:
            lN_df = pd.DataFrame(self.get_ldscore_use_cache(annot_matrix))

        ldscore = pd.concat([annot_df.iloc[:, 0:6], lN_df], axis=1)
        ldscore.columns = annot_df.columns

        # Keep the targeted SNPs
        if snp is not None:
            ldscore = ldscore.loc[ldscore.SNP.isin(snp)]

        # Save the LD score annotations (feather needs a default index)
        ldscore = ldscore.reset_index(drop=True)
        ldscore.to_feather(ld_score_file)

        # Compute the .M (.M_5_50) file
        M = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix, axis=0))))
        ii = geno_array.maf > 0.05
        M_5_50 = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix[ii, :], axis=0))))

        # Save the sum of score annotations (all and maf > 0.05)
        np.savetxt(M_file, M, delimiter='\t')
        np.savetxt(M_5_file, M_5_50, delimiter='\t')

    def get_ldscore_use_cache(self, annot_matrix, ):
        """
        Compute LD scores from the cached r2 matrix.

        Each slice in ``self.r2_matrix_chunk_list`` is multiplied with
        ``annot_matrix``; the products are cast to float16 and stacked row-wise.

        :param annot_matrix: Annotation matrix (one row per SNP, one column per annotation)
        :return: float16 array with one row per r2-matrix row
        """
        use_gpu = self.use_gpu
        if use_gpu and cp is None:
            # Fall back instead of failing with an AttributeError on `None`
            logger.warning('GPU requested but Cupy is not available, using CPU to compute LD score')
            use_gpu = False

        if use_gpu:
            logger.debug('Using GPU to compute LD score')
            annot_matrix = cp.asarray(annot_matrix, dtype=cp.float32)
        else:
            logger.debug('Using CPU to compute LD score')

        lN_chunks = []
        for r2_matrix_chunk in self.r2_matrix_chunk_list:
            if use_gpu:
                # NOTE(review): newer CuPy releases expose sparse matrices as
                # cupyx.scipy.sparse - confirm cp.sparse exists in the
                # supported CuPy version.
                r2_matrix_chunk = cp.sparse.csr_matrix(r2_matrix_chunk, dtype=cp.float32)
                lN_chunk = cp.asnumpy(r2_matrix_chunk @ annot_matrix)
            else:
                lN_chunk = r2_matrix_chunk @ annot_matrix
            # convert to float16
            lN_chunks.append(lN_chunk.astype(np.float16))

        if not lN_chunks:
            return np.empty((0, annot_matrix.shape[1]), dtype=np.float16)
        # Stack once at the end rather than growing the array per chunk
        return np.concatenate(lN_chunks, axis=0)

    def compute_ldscore_chr(self, chr):
        """
        Compute LD scores of the baseline and of every annotation chunk for one chromosome.

        :param chr: Chromosome whose '<bfile_root>.<chr>' bim/fam/bed files are read
        :raises ValueError: If ``ld_wind_unit`` is not 'SNP', 'BP' or 'CM'
        """
        PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
        PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

        bfile = f"{self.bfile_root}.{chr}"

        # Load bim file
        snp_file = bfile + '.bim'
        array_snps = PlinkBIMFile(snp_file)
        m = len(array_snps.IDList)
        print(f'Read list of {m} SNPs from {snp_file}')

        # Load fam
        ind_file = bfile + '.fam'
        array_indivs = PlinkFAMFile(ind_file)
        n = len(array_indivs.IDList)
        print(f'Read list of {n} individuals from {ind_file}')

        # Load genotype array
        array_file = bfile + '.bed'
        geno_array = PlinkBEDFileWithR2Cache(array_file, n, array_snps, keep_snps=None, keep_indivs=None,
                                             mafMin=None)

        # Load the SNPs to be written out
        if self.keep_snp is not None:
            snp = pd.read_csv(f'{self.keep_snp}.{chr}.snp', header=None)[0].to_list()
            num_snp = len(snp)
            print(f'Loading {num_snp} SNPs')
        else:
            snp = None

        # Translate the LD window into SNP coordinates and a maximum distance
        if self.ld_wind_unit == 'SNP':
            max_dist = self.ld_wind
            coords = np.arange(geno_array.m)
        elif self.ld_wind_unit == 'BP':
            # ld_wind is multiplied by 1000 before being compared to base-pair positions
            max_dist = self.ld_wind * 1000
            coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
        elif self.ld_wind_unit == 'CM':
            max_dist = self.ld_wind
            coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]
        else:
            raise ValueError(f"Invalid ld_wind_unit: {self.ld_wind_unit}. Choose from 'CM', 'BP', or 'SNP'.")
        block_left = getBlockLefts(coords, max_dist)

        if self.generate_r2_cache:
            logger.info(f'Generating r2 cache for chr{chr}, this may take a while')
            geno_array.compute_r2_cache(block_left,
                                        Path(self.chr_r2_cache_dir))
            logger.info(f'Finished generating r2 cache for chr{chr}')
        if self.chr_r2_cache_dir is not None:
            logger.info('Loading r2 cache')
            r2_matrix = geno_array.load_combined_r2_matrix(cached_r2_matrix_dir=self.chr_r2_cache_dir)
            # Slice the matrix row-wise so each product stays bounded in size
            step = self.config.snps_per_chunk
            self.r2_matrix_chunk_list = [r2_matrix[i:i + step, :] for i in range(0, r2_matrix.shape[0], step)]
            logger.info('Finished loading r2 cache')

        # Set the baseline root
        baseline_prefix = f'{self.annot_root}/baseline/baseline.{chr}'
        annot_file = f'{baseline_prefix}.feather'
        ld_score_file = f'{baseline_prefix}.l2.ldscore.feather'
        M_file = f'{baseline_prefix}.l2.M'
        M_5_file = f'{baseline_prefix}.l2.M_5_50'

        # Compute annotations of the baseline
        print(f"Computing LD score for baseline annotations of chr{chr}")
        self.compute_ldscore_chunk(annot_file, ld_score_file, M_file, M_5_file, geno_array, block_left, snp)

        # Load annotations of chunks
        bar = IncrementalBar(f"Computing LD scores for spatial data annotations of chr{chr}", max=self.const_max_size)
        bar.check_tty = False
        for chunk_index in range(1, self.const_max_size + 1):
            # Set the file root
            chunk_prefix = f'{self.annot_root}/{self.data_name}_chunk{chunk_index}/{self.data_name}.{chr}'
            annot_file = f'{chunk_prefix}.feather'
            ld_score_file = f'{chunk_prefix}.l2.ldscore.feather'
            M_file = f'{chunk_prefix}.l2.M'
            M_5_file = f'{chunk_prefix}.l2.M_5_50'

            # Compute annotations of the current chunk
            self.compute_ldscore_chunk(annot_file, ld_score_file, M_file, M_5_file, geno_array, block_left, snp)

            bar.next()

        bar.finish()
475
+
476
+
477
+
478
def add_make_annotation_args(parser):
    """
    Register the make_annotation command line options on ``parser``.

    The parsed option names match the fields of ``MakeAnnotationConfig``, so
    ``vars(parser.parse_args())`` can be passed to it as keyword arguments.

    :param parser: An ``argparse.ArgumentParser`` (or sub-parser) to extend
    """
    # Required inputs
    parser.add_argument('--input_feather_file', required=True, type=str,
                        help='Input feather file for marker genes score (output of gsMap latent_to_gene)')
    parser.add_argument('--output_dir', required=True, type=str,
                        help='Output directory to save the SNP annotation files')
    parser.add_argument('--sample_name', type=str, help='Name of the sample', required=True)
    # Stored as `gtf_file`, the name of the matching MakeAnnotationConfig field
    parser.add_argument('--gtf_annotation_file', dest='gtf_file', type=str, help='Path to the GTF file',
                        required=True)
    parser.add_argument('--bfile_root', type=str, help='Bfile root for LD score', required=True)

    # Optional inputs
    parser.add_argument('--baseline_annotation', default=None, type=str, help='Baseline annotation')
    parser.add_argument('--keep_snp_root', default=None, type=str,
                        help='Only keep these SNP file after calculating LD score')
    parser.add_argument('--chr', default=None, type=int, help='Chromosome ID', )

    # Annotation settings
    parser.add_argument('--window_size', default=50000, type=int,
                        help='Window size for SNP annotation')
    parser.add_argument('--cells_per_chunk', default=500, type=int,
                        help='Chunk size for number of cells for batch processing')

    # LD score settings
    parser.add_argument('--ld_wind', default=1.0, type=float,
                        help='LD window size, in units of --ld_wind_unit (BP windows are given in kb)')
    parser.add_argument('--ld_wind_unit', default='CM', type=str, choices=['CM', 'BP', 'SNP'],
                        help='LD window size unit')
    parser.add_argument('--r2_cache_dir', default=None, type=str, help='Directory for r2 cache')
    parser.add_argument('--use_gpu', action='store_true', help='Whether to use GPU to compute LD score')
    parser.add_argument('--snps_per_chunk', default=50_000, type=int,
                        help='Chunk size for number of SNPs for batch processing')
499
+
500
+
501
+ # Define the container for PLINK files
502
+
503
def run_make_annotation(args: MakeAnnotationConfig):
    """
    Run the two annotation stages for one sample.

    First the gene marker scores are mapped onto SNPs, then LD scores are
    computed for the annotation chunks that stage produced.

    :param args: Run configuration
    """
    annotator_options = dict(
        mk_score_file=args.input_feather_file,
        gtf_file=args.gtf_file,
        bfile_root=args.bfile_root,
        annot_root=Path(args.output_dir),
        annot_name=args.sample_name,
        chr=args.chr,
        base_root=args.baseline_annotation,
        window_size=args.window_size,
        const_max_size=args.cells_per_chunk,
    )

    # Stage 1: SNP annotation; returns the number of chunks it wrote
    chunk_count = Snp_Annotator(**annotator_options).annotate()

    # Stage 2: LD scores for the baseline and for each of those chunks
    LDscore_Generator(args, chunk_count).compute_ldscore()
520
+
521
+
522
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='make_annotations.py',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_make_annotation_args(parser)

    # Store the Params
    # Developer smoke test against fixed local paths. Off by default so that
    # running the module as a script honours the command line; set to True
    # only on a machine where the paths below exist.
    TEST = False
    if TEST:
        name = 'Cortex_151507'
        TASK_ID = 2
        test_dir = '/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/Nature_Neuroscience_2021'
        config = MakeAnnotationConfig(
            input_feather_file=f'{test_dir}/{name}/gene_markers/{name}_rank.feather',
            sample_name=name,
            output_dir=f'{test_dir}/{name}/snp_annotation/new_run',
            gtf_file='/storage/yangjianLab/songliyang/ReferenceGenome/GRCh37/gencode.v39lift37.annotation.gtf',
            bfile_root='/storage/yangjianLab/sharedata/LDSC_resource/1000G_EUR_Phase3_plink/1000G.EUR.QC',
            baseline_annotation=None,
            keep_snp_root='/storage/yangjianLab/sharedata/LDSC_resource/hapmap3_snps/hm',
            chr=TASK_ID,
            window_size=50000,
            cells_per_chunk=500,
            ld_wind=1,
            ld_wind_unit='CM',
            r2_cache_dir='/storage/yangjianLab/chenwenhao/projects/202312_gsMap/data/gsMap_test/r2_matrix',
            use_gpu=True,
            snps_per_chunk=100_000
        )

    else:
        args = parser.parse_args()
        cli_options = dict(vars(args))
        # The command line flag is --gtf_annotation_file while the config
        # field is gtf_file; translate the name if the parser kept the flag's.
        if 'gtf_annotation_file' in cli_options:
            cli_options['gtf_file'] = cli_options.pop('gtf_annotation_file')
        config = MakeAnnotationConfig(**cli_options)

    logger.info(f'Running make_annotation for {config.sample_name}')
    pprint.pprint(config)
    start_time = time.time()
    run_make_annotation(config)
    end_time = time.time()
    logger.info(f'Make SNP annotation for {config.sample_name} finished. Time spent: {(end_time - start_time) / 60:.2f} min.')