gsMap 1.71__py3-none-any.whl → 1.71.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,518 +1,518 @@
1
- import argparse
2
- import logging
3
- import os
4
- import pprint
5
- import time
6
- from pathlib import Path
7
-
8
-
9
- import numpy as np
10
- import pandas as pd
11
- import pyranges as pr
12
- from progress.bar import IncrementalBar
13
-
14
- from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
15
-
16
-
17
from dataclasses import dataclass, field
from typing import Optional

# Module-level logger: emits DEBUG and above through its own stream handler.
_log_format = '[{asctime}] {levelname:8s} {filename} {message}'
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(_log_format, style='{'))
logger.addHandler(handler)

# cupy is optional: when the import fails, `cp` is set to None so the rest of
# the module can detect that no GPU backend is available.
try:
    import cupy as cp

    # Route cupy allocations through a memory pool built on malloc_async.
    pool = cp.cuda.MemoryPool(cp.cuda.malloc_async)
    cp.cuda.set_allocator(pool.malloc)
except ImportError:
    logger.warning('Cupy not found, will not use GPU to compute LD score')
    cp = None
32
@dataclass
class MakeAnnotationConfig:
    """Settings for SNP annotation and LD-score generation."""

    # Feather file with the marker gene scores.
    input_feather_file: str
    # Directory that receives the SNP annotation / LD score files.
    output_dir: str
    # Sample name, used in output folder and file names.
    sample_name: str
    # Path to the GTF gene annotation file.
    gtf_file: Optional[str] = None
    # Prefix of the per-chromosome PLINK files.
    bfile_root: Optional[str] = None
    # Prefix of the per-chromosome baseline annotation files (optional).
    baseline_annotation: Optional[str] = None
    # Prefix of the per-chromosome '.snp' lists of SNPs to keep (optional).
    keep_snp_root: Optional[str] = None
    # Single chromosome to process; None means chromosomes 1 to 22.
    chr: Optional[int] = None
    # Window size for SNP annotation.
    window_size: int = 50000
    # Number of cells handled per chunk.
    cells_per_chunk: int = 500
    # LD window size, interpreted in `ld_wind_unit`.
    ld_wind: float = 1.0
    # Unit of `ld_wind`; the allowed values live in the field metadata.
    ld_wind_unit: str = field(default='CM', metadata={'choices': ['CM', 'BP', 'SNP']})
    # Directory of the r2 cache (optional).
    r2_cache_dir: Optional[str] = None
    # Whether to use the GPU to compute LD scores.
    use_gpu: bool = False
    # Number of SNPs handled per chunk of the cached r2 matrix.
    snps_per_chunk: int = 50_000

    def __post_init__(self):
        """Reject an `ld_wind_unit` that is not listed in the field's choices."""
        unit_field = self.__dataclass_fields__['ld_wind_unit']
        allowed_units = unit_field.metadata['choices']
        if self.ld_wind_unit in allowed_units:
            return
        raise ValueError(f"Invalid ld_wind_unit: {self.ld_wind_unit}. Choose from 'CM', 'BP', or 'SNP'.")
53
-
54
-
55
class Snp_Annotator:
    """
    Map per-gene marker scores onto SNPs and write per-chromosome annotation files.

    1. Annotate SNPs based on score of genes.
    2. Add baseline annotations.
    """

    def __init__(self, mk_score_file, gtf_file, bfile_root, annot_root, annot_name, chr=None, base_root=None,
                 window_size=50000, const_max_size=100):
        """
        Load the marker scores and the gene annotation.

        :param mk_score_file: Feather file of marker scores; must contain a 'HUMAN_GENE_SYM' column
        :param gtf_file: Path to the GTF gene annotation file
        :param bfile_root: Prefix of the per-chromosome bim files ('<bfile_root>.<chr>.bim')
        :param annot_root: Output directory for the annotation files
        :param annot_name: Sample name used in output folder and file names
        :param chr: Chromosome to annotate; None annotates chromosomes 1 to 22
        :param base_root: Prefix of the baseline files ('<base_root>.<chr>.annot.gz'), or None
        :param window_size: Number of base pairs added on each side of a gene when matching SNPs
        :param const_max_size: Maximum number of marker-score columns written per chunk
        :raises ValueError: If const_max_size is smaller than 1
        """
        # Fail before any file is read: a chunk size below 1 would otherwise
        # surface later as a ZeroDivisionError / range() error in annotate_chr.
        if const_max_size < 1:
            raise ValueError(f'const_max_size must be a positive integer, got {const_max_size}')

        # marker score
        self.mk_score_file = mk_score_file
        self.mk_score = self.load_marker_score()

        # chunk cells (the column count includes the synthetic 'all_gene' column)
        self.n_cells = len(self.mk_score.columns)
        self.max_chunk = const_max_size

        # gtf data
        self.gtf_file = gtf_file
        self.window_size = window_size
        self.gtf_pr = self.load_gtf(mk_score=self.mk_score)

        self.bfile_root = bfile_root
        self.annot_root = annot_root
        self.base_root = base_root
        self.chr = chr

        self.data_name = annot_name

    def load_marker_score(self):
        """
        Load marker scores of each cell.

        :return: DataFrame indexed by 'gene_name' whose first column, 'all_gene', is 1 for every gene
        """
        mk_score = pd.read_feather(self.mk_score_file).set_index('HUMAN_GENE_SYM').rename_axis('gene_name')
        mk_score.insert(0, 'all_gene', 1)
        return mk_score

    def load_gtf(self, mk_score):
        """
        Load the gene annotation file (gtf).

        :param mk_score: Marker-score DataFrame indexed by gene name
        :return: PyRanges of gene windows with 'gene_name', 'TSS' and 'TED' columns
        """
        print("Loading gtf data")

        # Load GTF file
        gtf = pr.read_gtf(self.gtf_file).df

        # Keep gene records that also have a marker score. The marker-score
        # table itself is left untouched: genes missing from the GTF never
        # match a SNP in the later left joins.
        gtf = gtf[gtf['Feature'] == 'gene']
        common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
        gtf = gtf[gtf.gene_name.isin(common_gene)]

        # Remove duplicated lines
        gtf = gtf.drop_duplicates(subset='gene_name', keep="first")

        # Extend every gene by window_size on both sides, clipping at 0
        gtf_bed = gtf[['Chromosome', 'Start', 'End', 'gene_name', 'Strand']].copy()
        gtf_bed.loc[:, 'TSS'] = gtf_bed['Start']
        gtf_bed.loc[:, 'TED'] = gtf_bed['End']

        gtf_bed.loc[:, 'Start'] = gtf_bed['TSS'] - self.window_size
        gtf_bed.loc[:, 'End'] = gtf_bed['TED'] + self.window_size
        gtf_bed.loc[gtf_bed['Start'] < 0, 'Start'] = 0

        # Correct the negative strand: swap TSS and TED
        negative = gtf_bed['Strand'] == '-'
        tss_neg = gtf_bed.loc[negative, 'TSS']
        ted_neg = gtf_bed.loc[negative, 'TED']
        gtf_bed.loc[negative, 'TSS'] = ted_neg
        gtf_bed.loc[negative, 'TED'] = tss_neg
        gtf_bed = gtf_bed.drop('Strand', axis=1)

        # Transform the GTF to PyRanges
        return pr.PyRanges(gtf_bed)

    def load_baseline(self, chr):
        """
        Load baseline annotations.

        :param chr: Chromosome whose '<base_root>.<chr>.annot.gz' file is read
        :return: Baseline DataFrame without the 'CHR', 'BP' and 'CM' columns
        """
        baseline = pd.read_csv(f'{self.base_root}.{chr}.annot.gz', sep='\t')
        baseline.drop(['CHR', 'BP', 'CM'], axis=1, inplace=True)
        return baseline

    def Load_bim(self, chr):
        """
        Load bim files.

        :param chr: Chromosome whose '<bfile_root>.<chr>.bim' file is read
        :return: Tuple (bim as PyRanges with Start == End == BP, bim as DataFrame)
        """
        bim_file = f'{self.bfile_root}.{chr}.bim'
        bim = pd.read_csv(bim_file, sep='\t', header=None)
        bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]

        # Transform bim to PyRanges
        bim_pr = bim.copy()
        bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
        bim_pr['End'] = bim_pr['Start']
        bim_pr = pr.PyRanges(bim_pr)
        # presumably aligns with 'chr'-prefixed chromosome names in the GTF - TODO confirm
        bim_pr.Chromosome = f'chr{chr}'
        return bim_pr, bim

    def Overlaps_gtf_bim(self, bim_pr):
        """
        Find overlaps between gtf and bim file.

        :param bim_pr: SNP positions as PyRanges
        :return: One row per SNP: the overlapping gene window whose TSS is closest
        """
        # Select the overlapped regions (SNPs in gene windows)
        overlaps = self.gtf_pr.join(bim_pr).df
        overlaps['Distance'] = np.abs(overlaps['Start_b'] - overlaps['TSS'])
        nearest_rows = overlaps.groupby('SNP').Distance.idxmin()
        return overlaps.loc[nearest_rows]

    def map_baseline(self, snp_score, baseline, chr):
        """
        Generate the baseline annotations for SNPs.

        :param snp_score: SNP annotation of the first chunk (six header columns plus 'all_gene')
        :param baseline: Baseline DataFrame, or None to write a uniform 'base' column
        :param chr: Chromosome, used in messages and in the output file name
        :return: 0
        """
        header = snp_score.columns[0:6].to_list()

        if baseline is None:
            print(f'Baseline annotations of chr{chr} are not provided, using uniform annotations for genes and SNPs')
            baseline_score = snp_score[header + ['all_gene']].copy()
            baseline_score.loc[:, 'base'] = 1
        else:
            print(f'Mapping baseline annotations of chr{chr}')
            snp_score_baseline = pd.merge(snp_score, baseline, how='left', on='SNP').fillna(0).copy()

            baseline_score = snp_score_baseline[header + ['all_gene'] + baseline.columns.to_list()]
            baseline_score = baseline_score.loc[:, ~baseline_score.columns.duplicated()].copy()

        # Create the folder (for baseline annotation)
        file_base_root = f'{self.annot_root}/baseline'
        os.makedirs(file_base_root, mode=0o777, exist_ok=True)

        # Save baseline annotations (in feather format)
        file_base = f'{file_base_root}/baseline.{chr}.feather'
        baseline_score.to_feather(file_base)

        return 0

    def annotate_chr(self, chr):
        """
        Annotate SNPs of each chr.

        :param chr: Chromosome to annotate
        :return: Number of chunks written for this chromosome
        """
        # Load the baseline file
        baseline = None
        if self.base_root is not None:
            baseline = self.load_baseline(chr)

        # Load the bim file
        bim_pr, bim = self.Load_bim(chr)

        # Find overlapping
        overlaps_small = self.Overlaps_gtf_bim(bim_pr)
        snp_to_gene = overlaps_small[['SNP', 'gene_name', 'TSS']]

        # Do annotations
        all_chunks = int(np.ceil(self.n_cells / self.max_chunk))
        bar = IncrementalBar(f'Mapping the gene marker scores to SNPs in chr{chr}', max=all_chunks)
        bar.check_tty = False

        # Preprocess bim outside the loop as it doesn't change
        anno_template = bim[["CHR", "BP", "SNP", "CM"]]

        for chunk_index, left in enumerate(range(0, self.n_cells, self.max_chunk), start=1):
            right = min(left + self.max_chunk, self.n_cells)
            mk_score_current = self.mk_score.iloc[:, left:right]

            # Process marker scores for SNPs; SNPs outside every gene window get 0 / 'None'
            anno = anno_template.copy()
            merged_data = snp_to_gene.merge(mk_score_current, on='gene_name', how='left')
            snp_score = pd.merge(anno, merged_data, how='left', on='SNP').fillna(0)
            snp_score = snp_score.rename(columns={'gene_name': 'Gene'})
            snp_score.loc[snp_score.Gene == 0, 'Gene'] = 'None'

            # Process baseline annotations for the first chunk (the one holding 'all_gene')
            if chunk_index == 1:
                self.map_baseline(snp_score, baseline, chr)
                snp_score = snp_score.drop('all_gene', axis=1)

            # Create the folder and save SNP annotations
            file_root = f'{self.annot_root}/{self.data_name}_chunk{chunk_index}'
            os.makedirs(file_root, mode=0o777, exist_ok=True)
            file_anno = f'{file_root}/{self.data_name}.{chr}.feather'
            snp_score.to_feather(file_anno)

            bar.next()

        bar.finish()

        return all_chunks

    def annotate(self):
        """
        Perform SNP annotations for each chromosome.

        :return: Number of chunks written per chromosome (value of the last chromosome processed)
        """
        if self.chr is None:
            for chrom in range(1, 23):
                n_chunks = self.annotate_chr(chr=chrom)
        else:
            n_chunks = self.annotate_chr(chr=self.chr)

        return n_chunks
271
-
272
-
273
class LDscore_Generator:
    """Compute LD scores for the baseline and per-chunk SNP annotation files."""

    def __init__(self, make_annotation_config: 'MakeAnnotationConfig', const_max_size):
        """
        Store the configuration and decide whether an r2 cache is used or must be built.

        :param make_annotation_config: Configuration object
        :param const_max_size: Number of annotation chunks per chromosome
        :raises ValueError: If an r2 cache directory is given without a chromosome
        """
        self.bfile_root = make_annotation_config.bfile_root
        self.annot_root = Path(make_annotation_config.output_dir)
        self.const_max_size = const_max_size
        self.data_name = make_annotation_config.sample_name
        self.chr = make_annotation_config.chr
        self.ld_wind = make_annotation_config.ld_wind
        self.ld_wind_unit = make_annotation_config.ld_wind_unit
        self.keep_snp = make_annotation_config.keep_snp_root
        self.r2_cache_dir = make_annotation_config.r2_cache_dir
        self.use_gpu = make_annotation_config.use_gpu
        self.config = make_annotation_config
        self.generate_r2_cache = False

        # Set the r2 cache
        if self.r2_cache_dir is None:
            logger.info('No r2 cache directory specified, will not use r2 cache')
            self.chr_r2_cache_dir = None
            return

        # An explicit raise (instead of assert) keeps the check under `python -O`.
        if self.chr is None:
            raise ValueError('Must specify chr when using r2 cache')

        chr_r2_cache_dir = os.path.join(self.r2_cache_dir, f'chr{self.chr}')
        self.chr_r2_cache_dir = chr_r2_cache_dir
        if not os.path.exists(os.path.join(chr_r2_cache_dir, 'combined_r2_matrix.npz')):
            logger.warning(
                f'No r2 cache found for chr{self.chr}, will generate r2 cache for this chromosome, first time may take a while')
            os.makedirs(chr_r2_cache_dir, exist_ok=True, mode=0o777)
            self.generate_r2_cache = True
        else:
            logger.info(f'Found r2 cache for chr{self.chr}, will use r2 cache for this chromosome')

    def compute_ldscore(self):
        """
        Compute LD scores.

        Processes the configured chromosome, or chromosomes 1 to 22 when none is set,
        and logs the elapsed time in minutes.
        """
        start_time = time.time()
        chromosomes = range(1, 23) if self.chr is None else [self.chr]
        for chrom in chromosomes:
            logger.info(f'Computing LD scores for chr{chrom}')
            self.compute_ldscore_chr(chr=chrom)
            logger.info(f'Finished computing LD scores for chr{chrom}')
        end_time = time.time()
        logger.info(f'Finished computing LD scores, time elapsed: {(end_time - start_time) / 60} minutes')

    def compute_ldscore_chunk(self, annot_file, ld_score_file, M_file, M_5_file,
                              geno_array: 'PlinkBEDFileWithR2Cache', block_left, snp):
        """
        Compute and save LD scores for each chunk
        :param annot_file: Path to the annotation file
        :param ld_score_file: Path to the LD score file
        :param M_file: Path to the M file
        :param M_5_file: Path to the M_5_50 file
        :param geno_array: Genotype array
        :param block_left: Block left
        :param snp: SNP to be kept
        :return: None
        """
        annot_df = pd.read_feather(annot_file)

        # The first six columns are SNP metadata; everything after is an annotation.
        annot_matrix = np.array(annot_df.iloc[:, 6:])

        # Reset the SNP point
        geno_array.__restart__()

        # Compute annotated LD score
        if self.chr_r2_cache_dir is None:
            lN_df = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 50, annot=annot_matrix))
        else:
            lN_df = pd.DataFrame(self.get_ldscore_use_cache(annot_matrix))

        ldscore = pd.concat([annot_df.iloc[:, 0:6], lN_df], axis=1)
        ldscore.columns = annot_df.columns

        # Keep the targeted SNPs
        if snp is not None:
            ldscore = ldscore.loc[ldscore.SNP.isin(snp)]

        # Save the LD score annotations (feather needs a default index)
        ldscore = ldscore.reset_index(drop=True)
        ldscore.to_feather(ld_score_file)

        # Compute the .M (.M_5_50) file
        M = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix, axis=0))))
        ii = geno_array.maf > 0.05
        M_5_50 = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix[ii, :], axis=0))))

        # Save the sum of score annotations (all and maf > 0.05)
        np.savetxt(M_file, M, delimiter='\t')
        np.savetxt(M_5_file, M_5_50, delimiter='\t')

    def get_ldscore_use_cache(self, annot_matrix, ):
        """
        Compute LD scores as (cached r2 matrix) @ (annotation matrix), chunk by chunk.

        :param annot_matrix: Annotation matrix with one row per SNP
        :return: float16 array with the per-chunk products stacked along axis 0
        """
        use_gpu = self.use_gpu
        if use_gpu and cp is None:
            # The module sets cp to None when cupy cannot be imported.
            logger.warning('GPU requested but cupy is not available, falling back to CPU to compute LD score')
            use_gpu = False

        if use_gpu:
            logger.debug('Using GPU to compute LD score')
            annot_matrix = cp.asarray(annot_matrix, dtype=cp.float32)
        else:
            logger.debug('Using CPU to compute LD score')

        # Collect the chunks and concatenate once, rather than re-copying the
        # growing result on every iteration.
        lN_chunks = []
        for r2_matrix_chunk in self.r2_matrix_chunk_list:
            if use_gpu:
                # NOTE(review): confirm `cp.sparse` exists in the supported cupy version
                r2_matrix_chunk = cp.sparse.csr_matrix(r2_matrix_chunk, dtype=cp.float32)
                lN_chunk = cp.asnumpy(r2_matrix_chunk @ annot_matrix)
            else:
                lN_chunk = r2_matrix_chunk @ annot_matrix
            # convert to float16
            lN_chunks.append(lN_chunk.astype(np.float16))

        if not lN_chunks:
            return np.empty((0, annot_matrix.shape[1]), dtype=np.float16)
        return np.concatenate(lN_chunks, axis=0)

    def compute_ldscore_chr(self, chr):
        """
        Compute LD scores of the baseline and of every annotation chunk for one chromosome.

        :param chr: Chromosome to process
        :raises ValueError: If ld_wind_unit is not 'CM', 'BP' or 'SNP'
        """
        PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
        PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])

        bfile = f"{self.bfile_root}.{chr}"

        # Load bim file
        snp_file = bfile + '.bim'
        array_snps = PlinkBIMFile(snp_file)
        m = len(array_snps.IDList)
        print(f'Read list of {m} SNPs from {snp_file}')

        # Load fam
        ind_file = bfile + '.fam'
        array_indivs = PlinkFAMFile(ind_file)
        n = len(array_indivs.IDList)
        print(f'Read list of {n} individuals from {ind_file}')

        # Load genotype array
        array_file = bfile + '.bed'
        geno_array = PlinkBEDFileWithR2Cache(array_file, n, array_snps, keep_snps=None, keep_indivs=None,
                                             mafMin=None)

        # Load the snp to be print
        if self.keep_snp is not None:
            snp = pd.read_csv(f'{self.keep_snp}.{chr}.snp', header=None)[0].to_list()
            num_snp = len(snp)
            print(f'Loading {num_snp} SNPs')
        else:
            snp = None

        # Translate the LD window into coordinates and a maximum distance
        if self.ld_wind_unit == 'SNP':
            max_dist = self.ld_wind
            coords = np.array(range(geno_array.m))
        elif self.ld_wind_unit == 'BP':
            # ld_wind is scaled by 1000 before being compared with BP coordinates
            max_dist = self.ld_wind * 1000
            coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
        elif self.ld_wind_unit == 'CM':
            max_dist = self.ld_wind
            coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]
        else:
            raise ValueError(f"Invalid ld_wind_unit: {self.ld_wind_unit}. Choose from 'CM', 'BP', or 'SNP'.")
        block_left = getBlockLefts(coords, max_dist)

        if self.generate_r2_cache:
            logger.info(f'Generating r2 cache for chr{chr}, this may take a while')
            geno_array.compute_r2_cache(block_left,
                                        Path(self.chr_r2_cache_dir))
            logger.info(f'Finished generating r2 cache for chr{chr}')
        if self.chr_r2_cache_dir is not None:
            logger.info('Loading r2 cache')
            r2_matrix = geno_array.load_combined_r2_matrix(cached_r2_matrix_dir=self.chr_r2_cache_dir)
            step = self.config.snps_per_chunk
            self.r2_matrix_chunk_list = [r2_matrix[i:i + step, :] for i in range(0, r2_matrix.shape[0], step)]
            logger.info('Finished loading r2 cache')

        # Set the baseline root
        baseline_prefix = f'{self.annot_root}/baseline/baseline.{chr}'
        annot_file = f'{baseline_prefix}.feather'
        ld_score_file = f'{baseline_prefix}.l2.ldscore.feather'
        M_file = f'{baseline_prefix}.l2.M'
        M_5_file = f'{baseline_prefix}.l2.M_5_50'

        # Compute annotations of the baseline
        print(f"Computing LD score for baseline annotations of chr{chr}")
        self.compute_ldscore_chunk(annot_file, ld_score_file, M_file, M_5_file, geno_array, block_left, snp)

        # Load annotations of chunks
        bar = IncrementalBar(f"Computing LD scores for spatial data annotations of chr{chr}", max=self.const_max_size)
        bar.check_tty = False
        for chunk_index in range(1, self.const_max_size + 1):
            # Set the file root
            chunk_prefix = f'{self.annot_root}/{self.data_name}_chunk{chunk_index}/{self.data_name}.{chr}'
            annot_file = f'{chunk_prefix}.feather'
            ld_score_file = f'{chunk_prefix}.l2.ldscore.feather'
            M_file = f'{chunk_prefix}.l2.M'
            M_5_file = f'{chunk_prefix}.l2.M_5_50'

            # Compute annotations of the current chunk
            self.compute_ldscore_chunk(annot_file, ld_score_file, M_file, M_5_file, geno_array, block_left, snp)

            bar.next()

        bar.finish()
474
-
475
-
476
-
477
def add_make_annotation_args(parser):
    """
    Register the command-line options of the annotation / LD-score step.

    :param parser: argparse parser (or sub-parser) that receives the options
    :return: The same parser, so calls can be chained
    """
    # NOTE(review): the GTF option is stored as `gtf_annotation_file`, while
    # MakeAnnotationConfig names the field `gtf_file` - confirm the caller maps it.
    parser.add_argument('--input_feather_file', required=True, type=str,
                        help='Input feather file for marker genes score (output of gsMap latent_to_gene)')
    parser.add_argument('--output_dir', required=True, type=str,
                        help='Output directory to save the SNP annotation files')
    parser.add_argument('--sample_name', required=True, type=str, help='Name of the sample')
    parser.add_argument('--gtf_annotation_file', required=True, type=str, help='Path to the GTF file')
    parser.add_argument('--bfile_root', required=True, type=str, help='Bfile root for LD score')
    parser.add_argument('--baseline_annotation', default=None, type=str, help='Baseline annotation')
    parser.add_argument('--keep_snp_root', default=None, type=str,
                        help='Only keep these SNP file after calculating LD score')
    parser.add_argument('--chr', default=None, type=int, help='Chromosome ID')
    parser.add_argument('--window_size', default=50000, type=int,
                        help='Window size for SNP annotation')
    parser.add_argument('--cells_per_chunk', default=500, type=int,
                        help='Chunk size for number of cells for batch processing')
    # Float default, matching the declared type and the config dataclass.
    parser.add_argument('--ld_wind', default=1.0, type=float,
                        help='LD window size, in the unit given by --ld_wind_unit '
                             '(a BP window is multiplied by 1000)')
    parser.add_argument('--ld_wind_unit', default='CM', type=str, choices=['CM', 'BP', 'SNP'],
                        help='LD window size unit')
    parser.add_argument('--r2_cache_dir', default=None, type=str, help='Directory for r2 cache')
    parser.add_argument('--use_gpu', action='store_true', help='Whether to use GPU to compute LD score')
    parser.add_argument('--snps_per_chunk', default=50_000, type=int,
                        help='Chunk size for number of SNPs for batch processing')
    return parser
498
-
499
-
500
- # Define the container for PLINK files
501
-
502
def run_make_annotation(args: MakeAnnotationConfig):
    """
    Run the annotation pipeline: annotate SNPs, then compute LD scores.

    :param args: Configuration of the run
    :return: None
    """
    # Step 1: write the SNP annotation files and learn how many chunks exist.
    annotator_options = dict(
        mk_score_file=args.input_feather_file,
        gtf_file=args.gtf_file,
        bfile_root=args.bfile_root,
        annot_root=Path(args.output_dir),
        annot_name=args.sample_name,
        chr=args.chr,
        base_root=args.baseline_annotation,
        window_size=args.window_size,
        const_max_size=args.cells_per_chunk,
    )
    n_chunks = Snp_Annotator(**annotator_options).annotate()

    # Step 2: compute LD scores for the baseline and for every chunk.
    LDscore_Generator(args, n_chunks).compute_ldscore()
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import pprint
5
+ import time
6
+ from pathlib import Path
7
+
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ import pyranges as pr
12
+ from progress.bar import IncrementalBar
13
+
14
+ from gsMap.utils.generate_r2_matrix import PlinkBEDFileWithR2Cache, getBlockLefts, ID_List_Factory
15
+
16
+
17
from dataclasses import dataclass, field
from typing import Optional

# Module-level logger: emits DEBUG and above through its own stream handler.
_log_format = '[{asctime}] {levelname:8s} {filename} {message}'
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(_log_format, style='{'))
logger.addHandler(handler)

# cupy is optional: when the import fails, `cp` is set to None so the rest of
# the module can detect that no GPU backend is available.
try:
    import cupy as cp

    # Route cupy allocations through a memory pool built on malloc_async.
    pool = cp.cuda.MemoryPool(cp.cuda.malloc_async)
    cp.cuda.set_allocator(pool.malloc)
except ImportError:
    logger.warning('Cupy not found, will not use GPU to compute LD score')
    cp = None
32
@dataclass
class MakeAnnotationConfig:
    """Settings for SNP annotation and LD-score generation."""

    # Feather file with the marker gene scores.
    input_feather_file: str
    # Directory that receives the SNP annotation / LD score files.
    output_dir: str
    # Sample name, used in output folder and file names.
    sample_name: str
    # Path to the GTF gene annotation file.
    gtf_file: Optional[str] = None
    # Prefix of the per-chromosome PLINK files.
    bfile_root: Optional[str] = None
    # Prefix of the per-chromosome baseline annotation files (optional).
    baseline_annotation: Optional[str] = None
    # Prefix of the per-chromosome '.snp' lists of SNPs to keep (optional).
    keep_snp_root: Optional[str] = None
    # Single chromosome to process; None means chromosomes 1 to 22.
    chr: Optional[int] = None
    # Window size for SNP annotation.
    window_size: int = 50000
    # Number of cells handled per chunk.
    cells_per_chunk: int = 500
    # LD window size, interpreted in `ld_wind_unit`.
    ld_wind: float = 1.0
    # Unit of `ld_wind`; the allowed values live in the field metadata.
    ld_wind_unit: str = field(default='CM', metadata={'choices': ['CM', 'BP', 'SNP']})
    # Directory of the r2 cache (optional).
    r2_cache_dir: Optional[str] = None
    # Whether to use the GPU to compute LD scores.
    use_gpu: bool = False
    # Number of SNPs handled per chunk of the cached r2 matrix.
    snps_per_chunk: int = 50_000

    def __post_init__(self):
        """Reject an `ld_wind_unit` that is not listed in the field's choices."""
        unit_field = self.__dataclass_fields__['ld_wind_unit']
        allowed_units = unit_field.metadata['choices']
        if self.ld_wind_unit in allowed_units:
            return
        raise ValueError(f"Invalid ld_wind_unit: {self.ld_wind_unit}. Choose from 'CM', 'BP', or 'SNP'.")
53
+
54
+
55
class Snp_Annotator:
    """
    Map per-gene marker scores onto SNPs and write per-chromosome annotation files.

    1. Annotate SNPs based on score of genes.
    2. Add baseline annotations.
    """

    def __init__(self, mk_score_file, gtf_file, bfile_root, annot_root, annot_name, chr=None, base_root=None,
                 window_size=50000, const_max_size=100):
        """
        Load the marker scores and the gene annotation.

        :param mk_score_file: Feather file of marker scores; must contain a 'HUMAN_GENE_SYM' column
        :param gtf_file: Path to the GTF gene annotation file
        :param bfile_root: Prefix of the per-chromosome bim files ('<bfile_root>.<chr>.bim')
        :param annot_root: Output directory for the annotation files
        :param annot_name: Sample name used in output folder and file names
        :param chr: Chromosome to annotate; None annotates chromosomes 1 to 22
        :param base_root: Prefix of the baseline files ('<base_root>.<chr>.annot.gz'), or None
        :param window_size: Number of base pairs added on each side of a gene when matching SNPs
        :param const_max_size: Maximum number of marker-score columns written per chunk
        :raises ValueError: If const_max_size is smaller than 1
        """
        # Fail before any file is read: a chunk size below 1 would otherwise
        # surface later as a ZeroDivisionError / range() error in annotate_chr.
        if const_max_size < 1:
            raise ValueError(f'const_max_size must be a positive integer, got {const_max_size}')

        # marker score
        self.mk_score_file = mk_score_file
        self.mk_score = self.load_marker_score()

        # chunk cells (the column count includes the synthetic 'all_gene' column)
        self.n_cells = len(self.mk_score.columns)
        self.max_chunk = const_max_size

        # gtf data
        self.gtf_file = gtf_file
        self.window_size = window_size
        self.gtf_pr = self.load_gtf(mk_score=self.mk_score)

        self.bfile_root = bfile_root
        self.annot_root = annot_root
        self.base_root = base_root
        self.chr = chr

        self.data_name = annot_name

    def load_marker_score(self):
        """
        Load marker scores of each cell.

        :return: DataFrame indexed by 'gene_name' whose first column, 'all_gene', is 1 for every gene
        """
        mk_score = pd.read_feather(self.mk_score_file).set_index('HUMAN_GENE_SYM').rename_axis('gene_name')
        mk_score.insert(0, 'all_gene', 1)
        return mk_score

    def load_gtf(self, mk_score):
        """
        Load the gene annotation file (gtf).

        :param mk_score: Marker-score DataFrame indexed by gene name
        :return: PyRanges of gene windows with 'gene_name', 'TSS' and 'TED' columns
        """
        print("Loading gtf data")

        # Load GTF file
        gtf = pr.read_gtf(self.gtf_file).df

        # Keep gene records that also have a marker score. The marker-score
        # table itself is left untouched: genes missing from the GTF never
        # match a SNP in the later left joins.
        gtf = gtf[gtf['Feature'] == 'gene']
        common_gene = np.intersect1d(mk_score.index, gtf.gene_name)
        gtf = gtf[gtf.gene_name.isin(common_gene)]

        # Remove duplicated lines
        gtf = gtf.drop_duplicates(subset='gene_name', keep="first")

        # Extend every gene by window_size on both sides, clipping at 0
        gtf_bed = gtf[['Chromosome', 'Start', 'End', 'gene_name', 'Strand']].copy()
        gtf_bed.loc[:, 'TSS'] = gtf_bed['Start']
        gtf_bed.loc[:, 'TED'] = gtf_bed['End']

        gtf_bed.loc[:, 'Start'] = gtf_bed['TSS'] - self.window_size
        gtf_bed.loc[:, 'End'] = gtf_bed['TED'] + self.window_size
        gtf_bed.loc[gtf_bed['Start'] < 0, 'Start'] = 0

        # Correct the negative strand: swap TSS and TED
        negative = gtf_bed['Strand'] == '-'
        tss_neg = gtf_bed.loc[negative, 'TSS']
        ted_neg = gtf_bed.loc[negative, 'TED']
        gtf_bed.loc[negative, 'TSS'] = ted_neg
        gtf_bed.loc[negative, 'TED'] = tss_neg
        gtf_bed = gtf_bed.drop('Strand', axis=1)

        # Transform the GTF to PyRanges
        return pr.PyRanges(gtf_bed)

    def load_baseline(self, chr):
        """
        Load baseline annotations.

        :param chr: Chromosome whose '<base_root>.<chr>.annot.gz' file is read
        :return: Baseline DataFrame without the 'CHR', 'BP' and 'CM' columns
        """
        baseline = pd.read_csv(f'{self.base_root}.{chr}.annot.gz', sep='\t')
        baseline.drop(['CHR', 'BP', 'CM'], axis=1, inplace=True)
        return baseline

    def Load_bim(self, chr):
        """
        Load bim files.

        :param chr: Chromosome whose '<bfile_root>.<chr>.bim' file is read
        :return: Tuple (bim as PyRanges with Start == End == BP, bim as DataFrame)
        """
        bim_file = f'{self.bfile_root}.{chr}.bim'
        bim = pd.read_csv(bim_file, sep='\t', header=None)
        bim.columns = ["CHR", "SNP", "CM", "BP", "A1", "A2"]

        # Transform bim to PyRanges
        bim_pr = bim.copy()
        bim_pr.columns = ["Chromosome", "SNP", "CM", "Start", "A1", "A2"]
        bim_pr['End'] = bim_pr['Start']
        bim_pr = pr.PyRanges(bim_pr)
        # presumably aligns with 'chr'-prefixed chromosome names in the GTF - TODO confirm
        bim_pr.Chromosome = f'chr{chr}'
        return bim_pr, bim

    def Overlaps_gtf_bim(self, bim_pr):
        """
        Find overlaps between gtf and bim file.

        :param bim_pr: SNP positions as PyRanges
        :return: One row per SNP: the overlapping gene window whose TSS is closest
        """
        # Select the overlapped regions (SNPs in gene windows)
        overlaps = self.gtf_pr.join(bim_pr).df
        overlaps['Distance'] = np.abs(overlaps['Start_b'] - overlaps['TSS'])
        nearest_rows = overlaps.groupby('SNP').Distance.idxmin()
        return overlaps.loc[nearest_rows]

    def map_baseline(self, snp_score, baseline, chr):
        """
        Generate the baseline annotations for SNPs.

        :param snp_score: SNP annotation of the first chunk (six header columns plus 'all_gene')
        :param baseline: Baseline DataFrame, or None to write a uniform 'base' column
        :param chr: Chromosome, used in messages and in the output file name
        :return: 0
        """
        header = snp_score.columns[0:6].to_list()

        if baseline is None:
            print(f'Baseline annotations of chr{chr} are not provided, using uniform annotations for genes and SNPs')
            baseline_score = snp_score[header + ['all_gene']].copy()
            baseline_score.loc[:, 'base'] = 1
        else:
            print(f'Mapping baseline annotations of chr{chr}')
            snp_score_baseline = pd.merge(snp_score, baseline, how='left', on='SNP').fillna(0).copy()

            baseline_score = snp_score_baseline[header + ['all_gene'] + baseline.columns.to_list()]
            baseline_score = baseline_score.loc[:, ~baseline_score.columns.duplicated()].copy()

        # Create the folder (for baseline annotation)
        file_base_root = f'{self.annot_root}/baseline'
        os.makedirs(file_base_root, mode=0o777, exist_ok=True)

        # Save baseline annotations (in feather format)
        file_base = f'{file_base_root}/baseline.{chr}.feather'
        baseline_score.to_feather(file_base)

        return 0

    def annotate_chr(self, chr):
        """
        Annotate SNPs of each chr.

        :param chr: Chromosome to annotate
        :return: Number of chunks written for this chromosome
        """
        # Load the baseline file
        baseline = None
        if self.base_root is not None:
            baseline = self.load_baseline(chr)

        # Load the bim file
        bim_pr, bim = self.Load_bim(chr)

        # Find overlapping
        overlaps_small = self.Overlaps_gtf_bim(bim_pr)
        snp_to_gene = overlaps_small[['SNP', 'gene_name', 'TSS']]

        # Do annotations
        all_chunks = int(np.ceil(self.n_cells / self.max_chunk))
        bar = IncrementalBar(f'Mapping the gene marker scores to SNPs in chr{chr}', max=all_chunks)
        bar.check_tty = False

        # Preprocess bim outside the loop as it doesn't change
        anno_template = bim[["CHR", "BP", "SNP", "CM"]]

        for chunk_index, left in enumerate(range(0, self.n_cells, self.max_chunk), start=1):
            right = min(left + self.max_chunk, self.n_cells)
            mk_score_current = self.mk_score.iloc[:, left:right]

            # Process marker scores for SNPs; SNPs outside every gene window get 0 / 'None'
            anno = anno_template.copy()
            merged_data = snp_to_gene.merge(mk_score_current, on='gene_name', how='left')
            snp_score = pd.merge(anno, merged_data, how='left', on='SNP').fillna(0)
            snp_score = snp_score.rename(columns={'gene_name': 'Gene'})
            snp_score.loc[snp_score.Gene == 0, 'Gene'] = 'None'

            # Process baseline annotations for the first chunk (the one holding 'all_gene')
            if chunk_index == 1:
                self.map_baseline(snp_score, baseline, chr)
                snp_score = snp_score.drop('all_gene', axis=1)

            # Create the folder and save SNP annotations
            file_root = f'{self.annot_root}/{self.data_name}_chunk{chunk_index}'
            os.makedirs(file_root, mode=0o777, exist_ok=True)
            file_anno = f'{file_root}/{self.data_name}.{chr}.feather'
            snp_score.to_feather(file_anno)

            bar.next()

        bar.finish()

        return all_chunks

    def annotate(self):
        """
        Perform SNP annotations for each chromosome.

        :return: Number of chunks written per chromosome (value of the last chromosome processed)
        """
        if self.chr is None:
            for chrom in range(1, 23):
                n_chunks = self.annotate_chr(chr=chrom)
        else:
            n_chunks = self.annotate_chr(chr=self.chr)

        return n_chunks
271
+
272
+
273
+ class LDscore_Generator:
274
def __init__(self, make_annotation_config: 'MakeAnnotationConfig', const_max_size):
    """
    Store the configuration and decide whether an r2 cache is used or must be built.

    :param make_annotation_config: Configuration object
    :param const_max_size: Number of annotation chunks per chromosome
    :raises ValueError: If an r2 cache directory is given without a chromosome
    """
    self.bfile_root = make_annotation_config.bfile_root
    self.annot_root = Path(make_annotation_config.output_dir)
    self.const_max_size = const_max_size
    self.data_name = make_annotation_config.sample_name
    self.chr = make_annotation_config.chr
    self.ld_wind = make_annotation_config.ld_wind
    self.ld_wind_unit = make_annotation_config.ld_wind_unit
    self.keep_snp = make_annotation_config.keep_snp_root
    self.r2_cache_dir = make_annotation_config.r2_cache_dir
    self.use_gpu = make_annotation_config.use_gpu
    self.config = make_annotation_config
    self.generate_r2_cache = False

    # Set the r2 cache
    if self.r2_cache_dir is None:
        logger.info('No r2 cache directory specified, will not use r2 cache')
        self.chr_r2_cache_dir = None
        return

    # An explicit raise (instead of assert) keeps the check under `python -O`.
    if self.chr is None:
        raise ValueError('Must specify chr when using r2 cache')

    chr_r2_cache_dir = os.path.join(self.r2_cache_dir, f'chr{self.chr}')
    self.chr_r2_cache_dir = chr_r2_cache_dir
    if not os.path.exists(os.path.join(chr_r2_cache_dir, 'combined_r2_matrix.npz')):
        logger.warning(
            f'No r2 cache found for chr{self.chr}, will generate r2 cache for this chromosome, first time may take a while')
        os.makedirs(chr_r2_cache_dir, exist_ok=True, mode=0o777)
        self.generate_r2_cache = True
    else:
        logger.info(f'Found r2 cache for chr{self.chr}, will use r2 cache for this chromosome')
303
+
304
def compute_ldscore(self):
    """
    Compute LD scores.

    Processes the configured chromosome, or chromosomes 1 to 22 when none is set,
    and logs the elapsed time in minutes.
    """
    start_time = time.time()
    # `is None` is the identity test; `== None` can be overridden by __eq__.
    chromosomes = range(1, 23) if self.chr is None else [self.chr]
    for chrom in chromosomes:
        logger.info(f'Computing LD scores for chr{chrom}')
        self.compute_ldscore_chr(chr=chrom)
        logger.info(f'Finished computing LD scores for chr{chrom}')
    end_time = time.time()
    logger.info(f'Finished computing LD scores, time elapsed: {(end_time - start_time) / 60} minutes')
320
+
321
+ def compute_ldscore_chunk(self, annot_file, ld_score_file, M_file, M_5_file, geno_array: PlinkBEDFileWithR2Cache,
322
+ block_left, snp):
323
+ """
324
+ Compute and save LD scores for each chunk
325
+ :param annot_file: Path to the annotation file
326
+ :param ld_score_file: Path to the LD score file
327
+ :param M_file: Path to the M file
328
+ :param M_5_file: Path to the M_5_50 file
329
+ :param geno_array: Genotype array
330
+ :param block_left: Block left
331
+ :param snp: SNP to be kept
332
+ :return: None
333
+ """
334
+ annot_df = pd.read_feather(annot_file)
335
+ n_annot, ma = len(annot_df.columns) - 6, len(annot_df)
336
+
337
+ # print("Read {A} annotations for {M} SNPs from {f}".format(f=annot_file, A=n_annot, M=ma))
338
+ annot_matrix = np.array(annot_df.iloc[:, 6:])
339
+ annot_colnames = annot_df.columns[6:]
340
+
341
+ # Reset the SNP point
342
+ geno_array.__restart__()
343
+
344
+ # Compute annotated LD score
345
+ if self.chr_r2_cache_dir is None:
346
+ lN_df = pd.DataFrame(geno_array.ldScoreVarBlocks(block_left, 50, annot=annot_matrix))
347
+ else:
348
+ lN_df = pd.DataFrame(self.get_ldscore_use_cache(annot_matrix))
349
+
350
+ ldscore = pd.concat([annot_df.iloc[:, 0:6], lN_df], axis=1)
351
+ ldscore.columns = annot_df.columns
352
+
353
+ # Keep the targeted SNPs
354
+ if not snp is None:
355
+ ldscore = ldscore.loc[ldscore.SNP.isin(snp)]
356
+
357
+ # Save the LD score annotations
358
+ ldscore = ldscore.reset_index()
359
+ ldscore.drop(columns=['index'], inplace=True)
360
+ ldscore.to_feather(ld_score_file)
361
+
362
+ # Compute the .M (.M_5_50) file
363
+ M = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix, axis=0))))
364
+ ii = geno_array.maf > 0.05
365
+ M_5_50 = np.atleast_1d(np.squeeze(np.asarray(np.sum(annot_matrix[ii, :], axis=0))))
366
+
367
+ # Save the sum of score annotations (all and maf > 0.05)
368
+ np.savetxt(M_file, M, delimiter='\t')
369
+ np.savetxt(M_5_file, M_5_50, delimiter='\t')
370
+
371
+ def get_ldscore_use_cache(self, annot_matrix, ):
372
+ if self.use_gpu:
373
+ logger.debug('Using GPU to compute LD score')
374
+ annot_matrix = cp.asarray(annot_matrix, dtype=cp.float32)
375
+ for i, r2_matrix_chunk in enumerate(self.r2_matrix_chunk_list):
376
+ r2_matrix_chunk = cp.sparse.csr_matrix(r2_matrix_chunk, dtype=cp.float32)
377
+ lN_chunk = cp.asnumpy(r2_matrix_chunk @ annot_matrix)
378
+ # convert to float16
379
+ lN_chunk = lN_chunk.astype(np.float16)
380
+ if i == 0:
381
+ lN = lN_chunk
382
+ else:
383
+ lN = np.concatenate([lN, lN_chunk], axis=0)
384
+ else:
385
+ logger.debug('Using CPU to compute LD score')
386
+ for i, r2_matrix_chunk in enumerate(self.r2_matrix_chunk_list):
387
+ lN_chunk = r2_matrix_chunk @ annot_matrix
388
+ # convert to float16
389
+ lN_chunk = lN_chunk.astype(np.float16)
390
+ if i == 0:
391
+ lN = lN_chunk
392
+ else:
393
+ lN = np.concatenate([lN, lN_chunk], axis=0)
394
+ return lN
395
+
396
+ def compute_ldscore_chr(self, chr):
397
+ PlinkBIMFile = ID_List_Factory(['CHR', 'SNP', 'CM', 'BP', 'A1', 'A2'], 1, '.bim', usecols=[0, 1, 2, 3, 4, 5])
398
+ PlinkFAMFile = ID_List_Factory(['IID'], 0, '.fam', usecols=[1])
399
+
400
+ bfile = f"{self.bfile_root}.{chr}"
401
+ #
402
+ # Load bim file
403
+ snp_file, snp_obj = bfile + '.bim', PlinkBIMFile
404
+ array_snps = snp_obj(snp_file)
405
+ m = len(array_snps.IDList)
406
+ print(f'Read list of {m} SNPs from {snp_file}')
407
+ #
408
+ # Load fam
409
+ ind_file, ind_obj = bfile + '.fam', PlinkFAMFile
410
+ array_indivs = ind_obj(ind_file)
411
+ n = len(array_indivs.IDList)
412
+ print(f'Read list of {n} individuals from {ind_file}')
413
+ #
414
+ # Load genotype array
415
+ array_file, array_obj = bfile + '.bed', PlinkBEDFileWithR2Cache
416
+ geno_array = array_obj(array_file, n, array_snps, keep_snps=None, keep_indivs=None, mafMin=None)
417
+
418
+ # Load the snp to be print
419
+ if not self.keep_snp is None:
420
+ snp = pd.read_csv(f'{self.keep_snp}.{chr}.snp', header=None)[0].to_list()
421
+ num_snp = len(snp)
422
+ print(f'Loading {num_snp} SNPs')
423
+ else:
424
+ snp = None
425
+
426
+ # Load the annotations of the baseline
427
+ if self.ld_wind_unit == 'SNP':
428
+ max_dist = self.ld_wind
429
+ coords = np.array(range(geno_array.m))
430
+ elif self.ld_wind_unit == 'BP':
431
+ max_dist = self.ld_wind * 1000
432
+ coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
433
+ elif self.ld_wind_unit == 'CM':
434
+ max_dist = self.ld_wind
435
+ coords = np.array(array_snps.df['CM'])[geno_array.kept_snps]
436
+ block_left = getBlockLefts(coords, max_dist)
437
+ if self.generate_r2_cache:
438
+ logger.info(f'Generating r2 cache for chr{chr}, this may take a while')
439
+ geno_array.compute_r2_cache(block_left,
440
+ Path(self.chr_r2_cache_dir))
441
+ logger.info(f'Finished generating r2 cache for chr{chr}')
442
+ if self.chr_r2_cache_dir is not None:
443
+ logger.info('Loading r2 cache')
444
+ r2_matrix = geno_array.load_combined_r2_matrix(cached_r2_matrix_dir=self.chr_r2_cache_dir)
445
+ self.r2_matrix_chunk_list = [r2_matrix[i:i + self.config.snps_per_chunk, :] for i in
446
+ range(0, r2_matrix.shape[0], self.config.snps_per_chunk)]
447
+ logger.info('Finished loading r2 cache')
448
+ # Set the baseline root
449
+ annot_file = f'{self.annot_root}/baseline/baseline.{chr}.feather'
450
+ ld_score_file = f'{self.annot_root}/baseline/baseline.{chr}.l2.ldscore.feather'
451
+ M_file = f'{self.annot_root}/baseline/baseline.{chr}.l2.M'
452
+ M_5_file = f'{self.annot_root}/baseline/baseline.{chr}.l2.M_5_50'
453
+
454
+ # Compute annotations of the baseline
455
+ print(f"Computing LD score for baseline annotations of chr{chr}")
456
+ self.compute_ldscore_chunk(annot_file, ld_score_file, M_file, M_5_file, geno_array, block_left, snp)
457
+
458
+ # Load annotations of chunks
459
+ bar = IncrementalBar(f"Computing LD scores for spatial data annotations of chr{chr}", max=self.const_max_size)
460
+ bar.check_tty = False
461
+ for chunk_index in range(1, self.const_max_size + 1):
462
+ # Set the file root
463
+ annot_file = f'{self.annot_root}/{self.data_name}_chunk{chunk_index}/{self.data_name}.{chr}.feather'
464
+ ld_score_file = f'{self.annot_root}/{self.data_name}_chunk{chunk_index}/{self.data_name}.{chr}.l2.ldscore.feather'
465
+ M_file = f'{self.annot_root}/{self.data_name}_chunk{chunk_index}/{self.data_name}.{chr}.l2.M'
466
+ M_5_file = f'{self.annot_root}/{self.data_name}_chunk{chunk_index}/{self.data_name}.{chr}.l2.M_5_50'
467
+
468
+ # Compute annotations of the current chunk
469
+ self.compute_ldscore_chunk(annot_file, ld_score_file, M_file, M_5_file, geno_array, block_left, snp)
470
+
471
+ bar.next()
472
+
473
+ bar.finish()
474
+
475
+
476
+
477
def add_make_annotation_args(parser):
    """Register the command-line options of the make-annotation step on *parser*.

    :param parser: An ``argparse`` parser (or sub-parser) to extend in place
    :return: The same parser, so calls can be chained
    """
    parser.add_argument('--input_feather_file', required=True, type=str,
                        help='Input feather file for marker genes score (output of gsMap latent_to_gene)')
    parser.add_argument('--output_dir', required=True, type=str,
                        help='Output directory to save the SNP annotation files')
    parser.add_argument('--sample_name', required=True, type=str, help='Name of the sample')
    # NOTE(review): this option is stored as ``gtf_annotation_file`` while
    # run_make_annotation reads ``gtf_file`` from its config - confirm that
    # the caller maps one onto the other.
    parser.add_argument('--gtf_annotation_file', required=True, type=str, help='Path to the GTF file')
    parser.add_argument('--bfile_root', required=True, type=str, help='Bfile root for LD score')
    parser.add_argument('--baseline_annotation', default=None, type=str, help='Baseline annotation')
    parser.add_argument('--keep_snp_root', default=None, type=str,
                        help='Only keep these SNP file after calculating LD score')
    parser.add_argument('--chr', default=None, type=int, help='Chromosome ID')
    parser.add_argument('--window_size', default=50000, type=int,
                        help='Window size for SNP annotation')
    parser.add_argument('--cells_per_chunk', default=500, type=int,
                        help='Chunk size for number of cells for batch processing')
    # Float default so the parsed value has the declared type whether or not
    # the option is given on the command line.
    parser.add_argument('--ld_wind', default=1.0, type=float,
                        help='LD window size, in the unit given by --ld_wind_unit')
    parser.add_argument('--ld_wind_unit', default='CM', type=str, choices=['CM', 'BP', 'SNP'],
                        help='LD window size unit')
    parser.add_argument('--r2_cache_dir', default=None, type=str, help='Directory for r2 cache')
    parser.add_argument('--use_gpu', action='store_true', help='Whether to use GPU to compute LD score')
    parser.add_argument('--snps_per_chunk', default=50_000, type=int,
                        help='Chunk size for number of SNPs for batch processing')
    return parser
498
+
499
+
500
+ # Define the container for plink files
501
+
502
def run_make_annotation(args: MakeAnnotationConfig):
    """Annotate SNPs for the given configuration, then compute their LD scores.

    The value returned by ``Snp_Annotator.annotate()`` is handed to
    ``LDscore_Generator`` as its ``const_max_size`` argument.

    :param args: Configuration of the make-annotation run
    :return: None
    """
    annotator_options = {
        'mk_score_file': args.input_feather_file,
        'gtf_file': args.gtf_file,
        'bfile_root': args.bfile_root,
        'annot_root': Path(args.output_dir),
        'annot_name': args.sample_name,
        'chr': args.chr,
        'base_root': args.baseline_annotation,
        'window_size': args.window_size,
        'const_max_size': args.cells_per_chunk,
    }
    annotator = Snp_Annotator(**annotator_options)
    chunk_count = annotator.annotate()

    generator = LDscore_Generator(args, chunk_count)
    generator.compute_ldscore()