gsMap 1.60__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/config.py ADDED
@@ -0,0 +1,734 @@
1
+ import argparse
2
+ import logging
3
+ from dataclasses import dataclass, field
4
+ from pprint import pprint
5
+ from typing import Union, Literal
6
+ from pathlib import Path
7
+
8
+ from collections import OrderedDict, namedtuple
9
+ from typing import Callable
10
+ from gsMap.__init__ import __version__
11
+ import pyfiglet
12
+
13
# Registry of CLI sub-commands, keyed by sub-command name in registration order.
cli_function_registry = OrderedDict()

# Record stored in the registry for each sub-command.
subcommand = namedtuple(
    'subcommand',
    ['name', 'func', 'add_args_function', 'description'],
)

# Module logger: DEBUG level, emitting through a stream handler that uses a
# brace-style format of timestamp, padded level name and message.
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter('[{asctime}] {levelname:6s} {message}', style='{')
)
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
22
+
23
+
24
+ # Decorator to register functions for cli parsing
25
def register_cli(name: str, description: str, add_args_function: Callable) -> Callable:
    """Return a decorator that registers a function as a CLI sub-command.

    The decorated function is wrapped so that every call prints the gsMap
    banner and version, logs a start message, runs the function, and logs a
    finish message. The wrapper is stored in ``cli_function_registry`` under
    ``name`` and is also what the decorator returns.

    Args:
        name: Key used in ``cli_function_registry`` and in the log messages.
        description: Description stored in the registry entry.
        add_args_function: Callable stored in the registry entry; the
            ``add_*_args`` functions in this module take an argument parser.

    Returns:
        A decorator that takes the function to register and returns its wrapper.
    """
    # Local import keeps this block self-contained without touching the
    # module-level import list.
    from functools import wraps

    def decorator(func: Callable) -> Callable:
        @wraps(func)  # keep the wrapped function's __name__/__doc__
        def wrapper(*args, **kwargs):
            # The original called name.replace('_', ' ') and discarded the
            # result; str.replace does not mutate, so that line was a no-op
            # and has been removed.
            gsMap_main_logo = pyfiglet.figlet_format(
                "gsMap", font='doom', width=80, justify='center').rstrip()
            print(gsMap_main_logo)
            version_number = 'Version: ' + __version__
            print(version_number.center(80))
            print('=' * 80)
            logger.info(f"Running {name}...")
            # Propagate the wrapped function's result instead of dropping it.
            result = func(*args, **kwargs)
            logger.info(f"Finished running {name}.")
            return result

        cli_function_registry[name] = subcommand(
            name=name,
            func=wrapper,
            add_args_function=add_args_function,
            description=description,
        )
        return wrapper

    return decorator
43
+
44
+
45
def add_find_latent_representations_args(parser):
    """Register the find-latent-representations options on ``parser``.

    Options are declared in a table and added in table order, so the order
    shown in ``--help`` follows the table.
    """
    option_table = (
        # Input / output and dataset description.
        ('--input_hdf5_path', dict(required=True, type=str, help='Path to the input hdf5 file.')),
        ('--output_hdf5_path', dict(required=True, type=str, help='Path to the output hdf5 file.')),
        ('--sample_name', dict(required=True, type=str, help='Name of the sample.')),
        ('--annotation', dict(default=None, type=str, help='Name of the annotation layer.')),
        ('--type', dict(default=None, type=str,
                        help="The gene expression layer of the input data (e.g., 'count', 'counts', 'log1p').")),
        # Model and training hyper-parameters.
        ('--epochs', dict(default=300, type=int,
                          help="Number of training epochs for the GNN-VAE model. Default is 300.")),
        ('--feat_hidden1', dict(default=256, type=int,
                                help="Number of neurons in the first hidden layer of the feature extraction network. Default is 256.")),
        ('--feat_hidden2', dict(default=128, type=int,
                                help="Number of neurons in the second hidden layer of the feature extraction network. Default is 128.")),
        ('--feat_cell', dict(default=3000, type=int,
                             help="Number of top variable genes to select. Default is 3000.")),
        ('--gcn_hidden1', dict(default=64, type=int,
                               help="Number of units in the first hidden layer of the GCN. Default is 64.")),
        ('--gcn_hidden2', dict(default=30, type=int,
                               help="Number of units in the second hidden layer of the GCN. Default is 30.")),
        ('--p_drop', dict(default=0.1, type=float,
                          help="Dropout rate used in the GNN-VAE model. Default is 0.1.")),
        ('--gcn_lr', dict(default=0.001, type=float,
                          help="Learning rate for the GCN network. Default is 0.001.")),
        ('--gcn_decay', dict(default=0.01, type=float,
                             help="Weight decay (L2 penalty) for the GCN network. Default is 0.01.")),
        ('--n_neighbors', dict(default=11, type=int,
                               help="Number of neighbors to consider for graph construction in GCN. Default is 11.")),
        ('--label_w', dict(default=1, type=float,
                           help="Weight of the label loss in the loss function. Default is 1.")),
        ('--rec_w', dict(default=1, type=float,
                         help="Weight of the reconstruction loss in the loss function. Default is 1.")),
        ('--n_comps', dict(default=300, type=int,
                           help="Number of principal components to keep if PCA is performed. Default is 300.")),
        ('--weighted_adj', dict(action='store_true',
                                help="Use a weighted adjacency matrix in GCN. Default is False.")),
        ('--nheads', dict(default=3, type=int,
                          help="Number of heads in the attention mechanism of the GNN. Default is 3.")),
        ('--var', dict(action='store_true',
                       help="Enable var. Use --var to enable. Default is False.")),
        ('--convergence_threshold', dict(default=1e-4, type=float,
                                         help="Threshold for convergence during training. Training stops if the loss change is below this threshold. Default is 1e-4.")),
        ('--hierarchically', dict(action='store_true',
                                  help="Find latent representations hierarchically. Use --hierarchically to enable. Default is False.")),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
88
+
89
+
90
def chrom_choice(value):
    """Convert a command-line chromosome choice (argparse ``type=`` callable).

    Args:
        value: Raw string from the command line.

    Returns:
        The chromosome number as an ``int`` for "1" through "22", or the
        string ``'all'`` for any capitalisation of "all".

    Raises:
        argparse.ArgumentTypeError: For anything else, including digit
            strings outside 1-22. The original fell off the end of the
            function for those and silently returned ``None``.
    """
    if value.isdigit():
        chrom = int(value)
        if 1 <= chrom <= 22:
            return chrom
    elif value.lower() == 'all':
        # Normalise so comparisons against the literal 'all' elsewhere in
        # this module also match 'ALL' / 'All'.
        return 'all'
    raise argparse.ArgumentTypeError(
        f"'{value}' is an invalid chromosome choice. Choose from 1-22 or 'all'.")
99
+
100
+
101
def filter_args_for_dataclass(args_dict, data_class: dataclass):
    """Return the entries of ``args_dict`` whose keys are fields of ``data_class``.

    Field names are read from ``data_class.__dataclass_fields__``; every
    other key is dropped. The input mapping is not modified.
    """
    known_fields = data_class.__dataclass_fields__
    selected = {}
    for key, value in args_dict.items():
        if key in known_fields:
            selected[key] = value
    return selected
103
+
104
+
105
def get_dataclass_from_parser(args: argparse.Namespace, data_class: dataclass):
    """Build a ``data_class`` instance from the matching attributes of ``args``.

    Only namespace attributes whose names are fields of ``data_class`` are
    passed to its constructor. The selected arguments are printed first.
    """
    namespace_values = vars(args)
    init_kwargs = filter_args_for_dataclass(namespace_values, data_class)
    print(f'Using the following arguments for {data_class.__name__}:')
    pprint(init_kwargs)
    config = data_class(**init_kwargs)
    return config
110
+
111
+
112
def add_generate_ldscore_args(parser):
    """Register the generate-ldscore options on ``parser``."""
    add = parser.add_argument

    def add_required_str(flag, help_text):
        # Shorthand for the mandatory string options below.
        add(flag, type=str, required=True, help=help_text)

    add_required_str('--sample_name', 'Sample name')
    add_required_str('--chrom', 'Chromosome number (1-22) or "all"')
    add_required_str('--ldscore_save_dir', 'Directory to save ld score files')
    add_required_str('--mkscore_feather_file', 'Mkscore feather file path')

    # Additional baseline annotation (declared without help text).
    add('--additional_baseline_annotation_dir_path', type=str, default=None)

    # Reference panel.
    add_required_str('--bfile_root', 'Bfile root path')
    add_required_str('--keep_snp_root', 'Keep SNP root path')

    # Annotation by gene distance.
    add_required_str('--gtf_annotation_file', 'GTF file path')
    add('--gene_window_size', type=int, default=50000, help='Gene window size')

    # Enhancer annotation.
    add('--enhancer_annotation_file', type=str, default=None,
        help='Enhancer annotation bed file path, optional.')
    add('--snp_multiple_enhancer_strategy', type=str, default='max_mkscore',
        choices=['max_mkscore', 'nearest_TSS'],
        help='Strategy for multiple enhancers per SNP')
    add('--gene_window_enhancer_priority', type=str, default=None,
        choices=['gene_window_first', 'enhancer_first', 'enhancer_only'],
        help='Priority between gene window and enhancer')

    # LD score calculation.
    add('--spots_per_chunk', type=int, default=5_000, help='Number of spots per chunk')
    add('--ld_wind', type=int, default=1, help='LD window size')
    add('--ld_unit', type=str, default='CM', choices=['SNP', 'KB', 'CM'],
        help='LD window unit (SNP/KB/CM)')
143
+
144
+
145
def add_latent_to_gene_args(parser):
    """Register the latent-to-gene options on ``parser``."""
    add = parser.add_argument

    # Required inputs and output.
    add('--input_hdf5_with_latent_path', type=str, required=True,
        help='Path to the input HDF5 file which contains latent representations.')
    add('--sample_name', type=str, required=True, help='Name of the sample.')
    add('--output_feather_path', type=str, required=True,
        help='Path to save output gene marker score feather file.')

    # Optional dataset description.
    add('--annotation', default=None, type=str, help='Name of the annotation layer.')
    add('--type', default=None, type=str,
        help="Type of input data (e.g., 'count', 'counts').")

    # Switch for the expression-fraction filter.
    add('--no_expression_fraction', action='store_true', default=False,
        help='Flag to not use expression fraction as filter when calculate the maker score. Default is False.')

    # Latent representation and neighbourhood sizes.
    add('--latent_representation', type=str, default='latent_GVAE',
        choices=['latent_GVAE', 'latent_PCA'],
        help='Type of latent representation. Default is "latent_GVAE".')
    add('--num_neighbour', type=int, default=21,
        help='Number of neighbours to consider. Default is 21.')
    add('--num_neighbour_spatial', type=int, default=101,
        help='Number of spatial neighbours to consider. Default is 101.')

    # Optional string options sharing the same shape (default None).
    for flag, help_text in (
        ('--species', 'Species name, which is the column name in the homolog gene conversion file.'),
        ('--gs_species', 'Homologous gene conversion file path, if applicable.'),
        ('--gM_slices', 'The mean Gene marker scores path across multiple slices.'),
    ):
        add(flag, type=str, default=None, help=help_text)
168
+
169
+
170
def add_spatial_ldsc_args(parser):
    """Register the spatial-LDSC options on ``parser`` and return ``parser``."""
    add = parser.add_argument

    # Dataset and GWAS inputs.
    add('--sample_name', required=True, help="Name of the spatial transcriptomic dataset.")
    add('--sumstats_file', default=None, help="Path to GWAS summary statistics file.")
    add('--sumstats_config_file', default=None,
        help="Path to GWAS summary statistics config file.")

    # Required file and directory locations.
    for flag, help_text in (
        ('--w_file', "Path to regression weight file."),
        ('--ldscore_input_dir', "Input directory for LD Score files."),
        ('--ldsc_save_dir', "Directory to save Spatial LDSC results."),
    ):
        add(flag, required=True, help=help_text)

    add('--trait_name', default=None, help="Name of the trait.")
    add('--not_M_5_50', action='store_true', help="Flag to not use M 5 50 in calculations.")
    add('--n_blocks', type=int, default=200, help="Number of blocks for jackknife resampling.")
    add('--chisq_max', type=int, help="Maximum chi-square value for filtering SNPs.")
    add('--all_chunk', type=int, help="Number of chunks for processing spatial data.")

    # Switch for the additional baseline annotation (declared without help text).
    add('--disable_additional_baseline_annotation', action='store_true', default=False)

    add('--num_processes', type=int, default=4,
        help="Number of processes for parallel computing.")

    return parser
191
+
192
+
193
def add_Cauchy_combination_args(parser):
    """Register the Cauchy-combination options on ``parser``."""
    # Mandatory string options, added in this order.
    required_options = (
        ('--input_hdf5_path', 'Path to the HDF5 file'),
        ('--input_ldsc_dir', 'Directory containing LDSC results'),
        ('--output_cauchy_dir', 'Output directory for Cauchy combination results'),
        ('--sample_name', 'Name of the sample'),
        ('--trait_name', 'Name of the trait'),
        ('--annotation', 'Annotation layer name'),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, required=True, type=str, help=help_text)

    # Optional string options (declared without help text).
    for flag in ('--meta', '--slide'):
        parser.add_argument(flag, default=None, type=str)
206
+
207
+
208
+
209
def add_Visualization_args(parser):
    """Register the visualization options on ``parser``."""
    add = parser.add_argument

    # Mandatory string options.
    for flag, help_text in (
        ('--input_hdf5_path', 'Path to the HDF5 file'),
        ('--input_ldsc_dir', 'Directory containing LDSC results'),
        ('--output_figure_dir', 'Output directory for figures'),
        ('--sample_name', 'Name of the sample'),
        ('--trait_name', 'Name of the trait'),
    ):
        add(flag, required=True, type=str, help=help_text)

    add('--annotation', default=None, type=str, help='Name of the annotation layer.')

    # Figure appearance, all optional.
    add('--fig_title', type=str, default=None, help='Title of figure')
    add('--fig_height', type=int, default=600, help='Height of figure in pixels')
    add('--fig_width', type=int, default=800, help='Width of figure in pixels')
    add('--point_size', type=int, default=None, help='Point size of figure')
    add('--fig_style', type=str, default='light', choices=['dark', 'light'],
        help='Plot style of figure')
225
+
226
def add_all_mode_args(parser):
    """Register the options of the run-all mode on ``parser``.

    The options cover the shared inputs plus the latent-to-gene,
    generate-ldscore and spatial-LDSC groups. Per-stage output paths are not
    exposed here: ``get_runall_mode_config`` derives them from ``--save_dir``
    and ``--sample_name``.

    ``--ldscore_input_dir`` and ``--ldsc_save_dir`` are still accepted, so
    existing command lines keep parsing, but they are no longer required:
    ``get_runall_mode_config`` always overwrites both values, so demanding
    them forced users to supply paths that were then ignored.
    """
    # Shared inputs.
    parser.add_argument('--input_hdf5_path', required=True, type=str, help='Path to the input hdf5 file.')
    parser.add_argument('--save_dir', required=True, type=str, help='Path to the running results.')
    parser.add_argument('--sample_name', required=True, type=str, help='Name of the sample.')
    parser.add_argument('--annotation', default=None, type=str, help='Name of the annotation layer.')
    parser.add_argument('--type', default=None, type=str,
                        help="The gene expression layer of the input data (e.g., 'count', 'counts', 'log1p').")

    # latent_to_gene
    parser.add_argument('--no_expression_fraction', action='store_true', default=False,
                        help='Flag to not use expression fraction as filter when calculate the maker score. Default is False.')
    parser.add_argument('--latent_representation', type=str, default='latent_GVAE',
                        choices=['latent_GVAE', 'latent_PCA'],
                        help='Type of latent representation. Default is "latent_GVAE".')
    parser.add_argument('--num_neighbour', type=int, default=21,
                        help='Number of neighbours to consider. Default is 21.')
    parser.add_argument('--num_neighbour_spatial', type=int, default=101,
                        help='Number of spatial neighbours to consider. Default is 101.')
    parser.add_argument('--species', type=str, default=None,
                        help='Species name, which is the column name in the homolog gene conversion file.')
    parser.add_argument('--gs_species', type=str, default=None,
                        help='Homologous gene conversion file path, if applicable.')
    parser.add_argument('--gM_slices', type=str, default=None,
                        help='The mean Gene marker scores path across multiple slices.')

    # generate_ldscore: reference panel
    parser.add_argument('--bfile_root', type=str, required=True, help='Bfile root path')
    parser.add_argument('--keep_snp_root', type=str, required=True, help='Keep SNP root path')

    # generate_ldscore: annotation by gene distance
    parser.add_argument('--gtf_annotation_file', type=str, required=True, help='GTF file path')
    parser.add_argument('--gene_window_size', type=int, default=50000, help='Gene window size')

    # generate_ldscore: enhancer annotation
    parser.add_argument('--enhancer_annotation_file', type=str, default=None,
                        help='Enhancer annotation bed file path, optional.')
    parser.add_argument('--snp_multiple_enhancer_strategy', type=str, default='max_mkscore',
                        choices=['max_mkscore', 'nearest_TSS'], help='Strategy for multiple enhancers per SNP')
    parser.add_argument('--gene_window_enhancer_priority', type=str, default=None,
                        choices=['gene_window_first', 'enhancer_first', 'enhancer_only'],
                        help='Priority between gene window and enhancer')

    # generate_ldscore: LD score calculation
    parser.add_argument('--spots_per_chunk', type=int, default=5_000, help='Number of spots per chunk')
    parser.add_argument('--ld_wind', type=int, default=1, help='LD window size')
    parser.add_argument('--ld_unit', type=str, default='CM', help='LD window unit (SNP/KB/CM)',
                        choices=['SNP', 'KB', 'CM'])

    # spatial_ldsc
    parser.add_argument('--sumstats_file', default=None, help="Path to GWAS summary statistics file.")
    parser.add_argument('--sumstats_config_file', default=None,
                        help="Path to GWAS summary statistics config file.")
    parser.add_argument('--w_file', required=True, help="Path to regression weight file.")
    # Optional: both values are replaced by get_runall_mode_config.
    parser.add_argument('--ldscore_input_dir', default=None,
                        help="Input directory for LD Score files. "
                             "Ignored in run-all mode; derived from --save_dir and --sample_name.")
    parser.add_argument('--ldsc_save_dir', default=None,
                        help="Directory to save Spatial LDSC results. "
                             "Ignored in run-all mode; derived from --save_dir and --sample_name.")
    parser.add_argument('--trait_name', default=None, help="Name of the trait.")
    parser.add_argument('--not_M_5_50', action='store_true', help="Flag to not use M 5 50 in calculations.")
    parser.add_argument('--n_blocks', type=int, default=200, help="Number of blocks for jackknife resampling.")
    parser.add_argument('--chisq_max', type=int, help="Maximum chi-square value for filtering SNPs.")
    parser.add_argument('--all_chunk', type=int, help="Number of chunks for processing spatial data.")
303
+
304
+
305
def get_runall_mode_config(args: argparse.Namespace):
    """Derive per-stage paths on ``args`` and build the run-all configuration.

    Output locations are placed under ``{save_dir}/{sample_name}`` and each
    stage's input is pointed at the previous stage's output. ``args`` is
    modified in place. The stage configs are then built in order
    (find-latent, latent-to-gene, generate-ldscore, spatial-LDSC, Cauchy
    combination) and bundled into a ``RunAllModeConfig``.
    """
    sample = args.sample_name
    sample_dir = f'{args.save_dir}/{sample}'

    # Outputs of each stage.
    args.output_hdf5_path = f'{sample_dir}/find_latent_representations/{sample}_add_latent.h5ad'
    args.output_feather_path = f'{sample_dir}/latent_to_gene/{sample}_gene_marker_score.feather'
    args.ldscore_save_dir = f'{sample_dir}/generate_ldscore'
    args.ldsc_save_dir = f'{sample_dir}/spatial_ldsc'
    args.output_cauchy_dir = f'{sample_dir}/cauchy_combination/'

    # Inputs of each stage, chained from the outputs above.
    args.input_hdf5_with_latent_path = args.output_hdf5_path
    args.mkscore_feather_file = args.output_feather_path
    args.ldscore_input_dir = args.ldscore_save_dir
    args.chrom = 'all'
    args.input_ldsc_dir = args.ldsc_save_dir
    args.input_spatial_ldsc = f'{sample_dir}/spatial_ldsc/{sample}_{args.trait_name}.gz'

    # Build the stage configs in pipeline order; dict order is insertion order.
    stage_classes = (
        ('flr_config', FindLatentRepresentationsConfig),
        ('ltg_config', LatentToGeneConfig),
        ('gls_config', GenerateLDScoreConfig),
        ('ldsc_config', SpatialLDSCConfig),
        ('cauchy_config', CauchyCombinationConfig),
    )
    stage_configs = {
        key: get_dataclass_from_parser(args, config_class)
        for key, config_class in stage_classes
    }
    return RunAllModeConfig(**stage_configs)
332
+
333
+
334
def add_format_sumstats_args(parser):
    """Register the format-sumstats options on ``parser``.

    ``--chunksize`` now defaults to the integer ``1_000_000``. The original
    default was the float literal ``1e+6``; argparse applies ``type`` only to
    string values, so the parsed default was ``1000000.0`` even though the
    option is declared ``type=int``.
    """
    # Required arguments
    parser.add_argument('--sumstats', required=True, type=str,
                        help='Path to gwas summary data')
    parser.add_argument('--out', required=True, type=str,
                        help='Path to save the formatted gwas data')

    # Column-name overrides; all are optional strings defaulting to None.
    column_options = (
        ('--snp', "Name of snp column (if not a name that gsMap understands)"),
        ('--a1', "Name of effect allele column (if not a name that gsMap understands)"),
        ('--a2', "Name of none-effect allele column (if not a name that gsMap understands)"),
        ('--info', "Name of info column (if not a name that gsMap understands)"),
        ('--beta', "Name of gwas beta column (if not a name that gsMap understands)."),
        ('--se', "Name of gwas standar error of beta column (if not a name that gsMap understands)"),
        ('--p', "Name of p-value column (if not a name that gsMap understands)"),
        ('--frq', "Name of A1 ferquency column (if not a name that gsMap understands)"),
        ('--n', "Name of sample size column (if not a name that gsMap understands)"),
        ('--z', "Name of gwas Z-statistics column (if not a name that gsMap understands)"),
        ('--OR', "Name of gwas OR column (if not a name that gsMap understands)"),
        ('--se_OR', "Name of standar error of OR column (if not a name that gsMap understands)"),
    )
    for flag, help_text in column_options:
        parser.add_argument(flag, default=None, type=str, help=help_text)

    # Arguments for converting SNP (chr, pos) to rsid
    parser.add_argument('--chr', default="Chr", type=str,
                        help="Name of SNP chromosome column (if not a name that gsMap understands)")
    parser.add_argument('--pos', default="Pos", type=str,
                        help="Name of SNP positions column (if not a name that gsMap understands)")
    parser.add_argument('--dbsnp', default=None, type=str,
                        help='Path to reference dnsnp file')
    # Integer default so the parsed value really is an int (see docstring).
    parser.add_argument('--chunksize', default=1_000_000, type=int,
                        help='Chunk size for loading dbsnp file')

    # Arguments for output format and quality
    parser.add_argument('--format', default='gsMap', type=str,
                        help='Format of output data', choices=['gsMap', 'COJO'])
    parser.add_argument('--info_min', default=0.9, type=float,
                        help='Minimum INFO score.')
    parser.add_argument('--maf_min', default=0.01, type=float,
                        help='Minimum MAF.')
    parser.add_argument('--keep_chr_pos', action='store_true', default=False,
                        help='Keep SNP chromosome and position columns in the output data')
386
+
387
@dataclass
class FindLatentRepresentationsConfig:
    """Settings for the find-latent-representations step.

    Construction fails with ``ValueError`` when ``hierarchically`` is set
    without an ``annotation``; otherwise it logs how ``annotation`` is used.
    """

    # Required paths and sample identity.
    input_hdf5_path: str
    output_hdf5_path: str
    sample_name: str
    # Optional dataset description.
    annotation: str = None
    type: str = None

    # Model and training hyper-parameters.
    epochs: int = 300
    feat_hidden1: int = 256
    feat_hidden2: int = 128
    feat_cell: int = 3000
    gcn_hidden1: int = 64
    gcn_hidden2: int = 30
    p_drop: float = 0.1
    gcn_lr: float = 0.001
    gcn_decay: float = 0.01
    n_neighbors: int = 11
    label_w: float = 1
    rec_w: float = 1
    input_pca: bool = True
    n_comps: int = 300
    weighted_adj: bool = False
    nheads: int = 3
    var: bool = False
    convergence_threshold: float = 1e-4
    hierarchically: bool = False

    def __post_init__(self):
        """Validate hierarchical mode and log how ``annotation`` is used."""
        missing_annotation = self.annotation is None

        if self.hierarchically:
            if missing_annotation:
                raise ValueError('annotation must be provided if hierarchically is True.')
            logger.info(
                f'------Hierarchical mode is enabled. This will find the latent representations within each annotation.')

        # Reminder when no annotation was given.
        if missing_annotation:
            logger.warning(
                'annotation is not provided. This will find the latent representations for the whole dataset.')
            return
        logger.info(f'------Find latent representations for {self.annotation}...')
428
+
429
+
430
@dataclass
class LatentToGeneConfig:
    """Settings for the latent-to-gene step."""

    # Required: input with latent representations, sample identity, output.
    input_hdf5_with_latent_path: str
    sample_name: str
    output_feather_path: str

    # Switch for the expression-fraction filter.
    no_expression_fraction: bool = False

    # Latent representation and neighbourhood sizes.
    latent_representation: str = 'latent_GVAE'
    num_neighbour: int = 21
    num_neighbour_spatial: int = 101

    # Optional species / homolog conversion and multi-slice inputs.
    species: str = None
    gs_species: str = None
    gM_slices: str = None

    # Optional dataset description.
    annotation: str = None
    type: str = None
444
+
445
+
446
@dataclass
class GenerateLDScoreConfig:
    """Settings for the generate-ldscore step.

    ``__post_init__`` reconciles ``gene_window_enhancer_priority`` with
    ``enhancer_annotation_file``, validates the priority value, logs the
    chosen annotation strategy and, when an additional baseline annotation
    directory is given, checks that the expected files exist.

    Raises:
        ValueError: If ``gene_window_enhancer_priority`` is not one of the
            supported values. The original used ``assert`` for this, which
            is skipped when Python runs with ``-O``.
        FileNotFoundError: If a ``baseline.<chrom>.annot.gz`` file is missing
            from ``additional_baseline_annotation_dir_path``.
    """

    sample_name: str
    chrom: Union[int, str]
    ldscore_save_dir: str
    mkscore_feather_file: str
    bfile_root: str
    keep_snp_root: str

    # annotation by gene distance
    gtf_annotation_file: str
    gene_window_size: int = 50000

    # annotation by enhancer
    enhancer_annotation_file: str = None
    snp_multiple_enhancer_strategy: Literal['max_mkscore', 'nearest_TSS'] = 'max_mkscore'
    gene_window_enhancer_priority: Literal['gene_window_first', 'enhancer_first', 'enhancer_only',] = None

    # for calculating ld score
    additional_baseline_annotation_dir_path: str = None
    spots_per_chunk: int = 5_000
    ld_wind: int = 1
    ld_unit: str = 'CM'

    def __post_init__(self):
        """Reconcile, validate and log the annotation settings."""
        # An enhancer file without a priority defaults to 'enhancer_only'.
        if self.enhancer_annotation_file is not None and self.gene_window_enhancer_priority is None:
            logger.warning("enhancer_annotation_file is provided but gene_window_enhancer_priority is not provided. "
                           "by default, gene_window_enhancer_priority is set to 'enhancer_only', when enhancer_annotation_file is provided.")
            self.gene_window_enhancer_priority = 'enhancer_only'
        # A priority without an enhancer file is reset to None.
        if self.enhancer_annotation_file is None and self.gene_window_enhancer_priority is not None:
            logger.warning("gene_window_enhancer_priority is provided but enhancer_annotation_file is not provided. "
                           "by default, gene_window_enhancer_priority is set to None, when enhancer_annotation_file is not provided.")
            self.gene_window_enhancer_priority = None

        # Explicit raise instead of assert so the check survives `python -O`.
        valid_priorities = (None, 'gene_window_first', 'enhancer_first', 'enhancer_only')
        if self.gene_window_enhancer_priority not in valid_priorities:
            raise ValueError(
                f"gene_window_enhancer_priority must be one of None, 'gene_window_first', 'enhancer_first', 'enhancer_only', but got {self.gene_window_enhancer_priority}.")

        if self.gene_window_enhancer_priority in ('gene_window_first', 'enhancer_first'):
            logger.info('Both gene_window and enhancer annotation will be used to calculate LD score. ')
            logger.info(
                f'SNP within +-{self.gene_window_size} bp of gene body will be used and enhancer annotation will be used to calculate LD score. If a snp maps to multiple enhancers, the strategy to choose by your select strategy: {self.snp_multiple_enhancer_strategy}.')
        elif self.gene_window_enhancer_priority == 'enhancer_only':
            logger.info('Only enhancer annotation will be used to calculate LD score. ')
        else:
            logger.info(
                f'Only gene window annotation will be used to calculate LD score. SNP within +-{self.gene_window_size} bp of gene body will be used. ')

        # Reminder about the baseline annotation.
        if self.additional_baseline_annotation_dir_path is None:
            logger.info('------Baseline annotation is not provided. Default baseline annotation will be used.')
            return

        logger.info('------Baseline annotation is provided. Additional baseline annotation will be used with the default baseline annotation.')
        logger.info(f'------Baseline annotation directory: {self.additional_baseline_annotation_dir_path}')

        # Check that the baseline annotation exists for every requested
        # chromosome: 1-22 for 'all', otherwise just the one given.
        baseline_dir = Path(self.additional_baseline_annotation_dir_path)
        chroms = range(1, 23) if self.chrom == 'all' else [self.chrom]
        for chrom in chroms:
            if not (baseline_dir / f'baseline.{chrom}.annot.gz').exists():
                raise FileNotFoundError(
                    f'baseline.{chrom}.annot.gz is not found in {self.additional_baseline_annotation_dir_path}.')
508
+
509
+
510
+
511
@dataclass
class SpatialLDSCConfig:
    """Settings for the spatial-LDSC step.

    After construction two extra attributes are available:

    * ``sumstats_config_dict``: mapping of trait name to summary statistics
      file, taken either from the YAML config file or from the single
      ``trait_name`` / ``sumstats_file`` pair.
    * ``use_additional_baseline_annotation``: ``True`` when an
      ``additional_baseline`` directory exists under ``ldscore_input_dir``
      and ``disable_additional_baseline_annotation`` is not set.

    Raises:
        ValueError: If the sumstats / trait-name options are inconsistent.
        FileNotFoundError: If the ``additional_baseline`` directory exists
            but a per-chromosome LD score file is missing from it.
    """

    sample_name: str
    w_file: str
    ldscore_input_dir: str
    ldsc_save_dir: str
    disable_additional_baseline_annotation: bool = False
    trait_name: str = None
    sumstats_file: str = None
    sumstats_config_file: str = None
    num_processes: int = 4
    not_M_5_50: bool = False
    n_blocks: int = 200
    chisq_max: int = None
    all_chunk: int = None

    def __post_init__(self):
        """Validate the sumstats options and resolve the derived attributes."""
        if self.sumstats_file is None and self.sumstats_config_file is None:
            raise ValueError('One of sumstats_file and sumstats_config_file must be provided.')
        if self.sumstats_file is not None and self.sumstats_config_file is not None:
            raise ValueError('Only one of sumstats_file and sumstats_config_file must be provided.')
        if self.sumstats_file is not None and self.trait_name is None:
            raise ValueError('trait_name must be provided if sumstats_file is provided.')
        if self.sumstats_config_file is not None and self.trait_name is not None:
            raise ValueError('trait_name must not be provided if sumstats_config_file is provided.')

        self.sumstats_config_dict = {}
        if self.sumstats_config_file is not None:
            # Load the trait -> sumstats file mapping from the config file.
            import yaml
            with open(self.sumstats_config_file) as f:
                # NOTE(review): FullLoader accepts more than plain YAML data;
                # if this file can come from an untrusted source, consider
                # yaml.safe_load, which should suffice for a flat mapping.
                config = yaml.load(f, Loader=yaml.FullLoader)
            for trait_name, sumstats_file in config.items():
                self.sumstats_config_dict[trait_name] = sumstats_file
        else:
            # The checks above guarantee sumstats_file is set on this path,
            # so the original's trailing `else: raise` was unreachable.
            self.sumstats_config_dict[self.trait_name] = self.sumstats_file

        # Check whether an additional baseline annotation exists.
        self.use_additional_baseline_annotation = False
        self.process_additional_baseline_annotation()

    def process_additional_baseline_annotation(self):
        """Decide whether the additional baseline annotation is used.

        Looks for ``<ldscore_input_dir>/additional_baseline``. If it is
        absent the method returns, or raises if
        ``use_additional_baseline_annotation`` was already ``True``. If it is
        present, the flag is set according to
        ``disable_additional_baseline_annotation`` and the per-chromosome
        files for chromosomes 1-22 are checked.

        Raises:
            FileNotFoundError: If the directory is required but missing, or
                a ``baseline.<chrom>.l2.ldscore.feather`` file is missing.
        """
        additional_baseline_annotation_dir_path = Path(self.ldscore_input_dir) / 'additional_baseline'

        if not additional_baseline_annotation_dir_path.exists():
            if self.use_additional_baseline_annotation:
                logger.warning(f"additional_baseline directory is not found in {self.ldscore_input_dir}.")
                print('if you want to use additional baseline annotation,\n'
                      'please provide additional baseline annotation when calculating ld score.\n')
                raise FileNotFoundError(
                    'additional_baseline directory is not found. You should disable use_additional_baseline_annotation')
            return

        if self.disable_additional_baseline_annotation:
            logger.warning(
                f"additional_baseline directory is found in {self.ldscore_input_dir}, but use_additional_baseline_annotation is disabled.")
            print('if you want to use additional baseline annotation,\n'
                  'please enable by not adding --disable_additional_baseline_annotation.\n')
            self.use_additional_baseline_annotation = False
        else:
            # The original wrote `flag or True`, which is always True.
            self.use_additional_baseline_annotation = True
            logger.info(
                '------Additional baseline annotation is provided. It will be used with the default baseline annotation.')
            logger.info(f'------Additional baseline annotation directory: {additional_baseline_annotation_dir_path}')

        for chrom in range(1, 23):
            file_name = f'baseline.{chrom}.l2.ldscore.feather'
            if not (additional_baseline_annotation_dir_path / file_name).exists():
                # Name the file actually checked; the original message
                # mentioned baseline.<chrom>.annot.gz instead.
                raise FileNotFoundError(
                    f'{file_name} is not found in {additional_baseline_annotation_dir_path}.')
593
@dataclass
class CauchyCombinationConfig:
    """Arguments for the ``run_cauchy_combination`` subcommand.

    ``run_Cauchy_combination_from_cli`` builds an instance from the parsed
    command line and passes it to ``run_Cauchy_combination``.
    """
    # Required fields (positional order is part of the interface).
    input_hdf5_path: str
    input_ldsc_dir: str
    output_cauchy_dir: str
    sample_name: str
    trait_name: str
    annotation: str
    # Optional fields: ``None`` means "not provided".
    meta: Union[str, None] = None
    slide: Union[str, None] = None
603
+
604
+
605
@dataclass
class VisualizeConfig:
    """Arguments for the ``run_visualize`` subcommand.

    ``run_Visualize_from_cli`` builds an instance from the parsed command
    line and passes it to ``run_Visualize``.
    """
    # Required fields (positional order is part of the interface).
    input_hdf5_path: str
    input_ldsc_dir: str
    output_figure_dir: str
    sample_name: str
    trait_name: str

    # Optional fields: ``None`` means "not provided".
    annotation: Union[str, None] = None
    fig_title: Union[str, None] = None
    fig_height: int = 600
    fig_width: int = 800
    point_size: Union[int, None] = None
    fig_style: Literal['dark', 'light'] = 'light'
619
+
620
+
621
@dataclass
class RunAllModeConfig:
    """Bundle of the per-step configs consumed by ``run_all_mode_from_cli``.

    Each attribute is handed to the matching pipeline step in that function.
    """
    # Passed to run_find_latent_representation.
    flr_config: FindLatentRepresentationsConfig
    # Passed to run_latent_to_gene.
    ltg_config: LatentToGeneConfig
    # Passed to run_generate_ldscore.
    gls_config: GenerateLDScoreConfig
    # Passed to run_spatial_ldsc.
    ldsc_config: SpatialLDSCConfig
    # Passed to run_Cauchy_combination when an annotation is given.
    cauchy_config: CauchyCombinationConfig
628
+
629
+
630
@dataclass
class FormatSumstatsConfig:
    """Arguments for the ``format_sumstats`` subcommand.

    ``gwas_format_from_cli`` builds an instance from the parsed command line
    and passes it to ``gwas_format``.
    """
    # Required fields (positional order is part of the interface).
    sumstats: str
    out: str
    dbsnp: str
    # Optional fields: ``None`` means "not provided".
    snp: Union[str, None] = None
    a1: Union[str, None] = None
    a2: Union[str, None] = None
    info: Union[str, None] = None
    beta: Union[str, None] = None
    se: Union[str, None] = None
    p: Union[str, None] = None
    frq: Union[str, None] = None
    n: Union[str, None] = None
    z: Union[str, None] = None
    OR: Union[str, None] = None
    se_OR: Union[str, None] = None
    format: Union[str, None] = None
    chr: Union[str, None] = None
    pos: Union[str, None] = None
    # An int literal: the previous default ``1e+7`` was a float despite the
    # ``int`` annotation. The value itself is unchanged (10_000_000 == 1e+7).
    chunksize: int = 10_000_000
    info_min: float = 0.9
    maf_min: float = 0.01
    keep_chr_pos: bool = False
654
+
655
+
656
@register_cli(
    name='run_find_latent_representations',
    description='Run Find_latent_representations \nFind the latent representations of each spot by running GNN-VAE',
    add_args_function=add_find_latent_representations_args,
)
def run_find_latent_representation_from_cli(args: argparse.Namespace):
    """CLI entry point for the ``run_find_latent_representations`` subcommand.

    Builds a ``FindLatentRepresentationsConfig`` from ``args`` with
    ``get_dataclass_from_parser`` and passes it to
    ``run_find_latent_representation``.
    """
    # Imported at call time rather than at module import.
    from gsMap.find_latent_representation import run_find_latent_representation

    run_find_latent_representation(
        get_dataclass_from_parser(args, FindLatentRepresentationsConfig)
    )
663
+
664
+
665
@register_cli(
    name='run_latent_to_gene',
    description='Run Latent_to_gene \nEstimate gene marker gene scores for each spot by using latent representations from nearby spots',
    add_args_function=add_latent_to_gene_args,
)
def run_latent_to_gene_from_cli(args: argparse.Namespace):
    """CLI entry point for the ``run_latent_to_gene`` subcommand.

    Builds a ``LatentToGeneConfig`` from ``args`` with
    ``get_dataclass_from_parser`` and passes it to ``run_latent_to_gene``.
    """
    # Imported at call time rather than at module import.
    from gsMap.latent_to_gene import run_latent_to_gene

    run_latent_to_gene(get_dataclass_from_parser(args, LatentToGeneConfig))
672
+
673
+
674
@register_cli(
    name='run_generate_ldscore',
    description='Run Generate_ldscore \nGenerate LD scores for each spot',
    add_args_function=add_generate_ldscore_args,
)
def run_generate_ldscore_from_cli(args: argparse.Namespace):
    """CLI entry point for the ``run_generate_ldscore`` subcommand.

    Builds a ``GenerateLDScoreConfig`` from ``args`` with
    ``get_dataclass_from_parser`` and passes it to ``run_generate_ldscore``.
    """
    # Imported at call time rather than at module import.
    from gsMap.generate_ldscore import run_generate_ldscore

    run_generate_ldscore(get_dataclass_from_parser(args, GenerateLDScoreConfig))
681
+
682
+
683
@register_cli(
    name='run_spatial_ldsc',
    description='Run Spatial_ldsc \nRun spatial LDSC for each spot',
    add_args_function=add_spatial_ldsc_args,
)
def run_spatial_ldsc_from_cli(args: argparse.Namespace):
    """CLI entry point for the ``run_spatial_ldsc`` subcommand.

    Builds a ``SpatialLDSCConfig`` from ``args`` with
    ``get_dataclass_from_parser`` and passes it to ``run_spatial_ldsc``.
    """
    # Imported at call time rather than at module import.
    from gsMap.spatial_ldsc_multiple_sumstats import run_spatial_ldsc

    run_spatial_ldsc(get_dataclass_from_parser(args, SpatialLDSCConfig))
690
+
691
+
692
@register_cli(
    name='run_cauchy_combination',
    description='Run Cauchy_combination for each annotation',
    add_args_function=add_Cauchy_combination_args,
)
def run_Cauchy_combination_from_cli(args: argparse.Namespace):
    """CLI entry point for the ``run_cauchy_combination`` subcommand.

    Builds a ``CauchyCombinationConfig`` from ``args`` with
    ``get_dataclass_from_parser`` and passes it to ``run_Cauchy_combination``.
    """
    # Imported at call time rather than at module import.
    from gsMap.cauchy_combination_test import run_Cauchy_combination

    run_Cauchy_combination(get_dataclass_from_parser(args, CauchyCombinationConfig))
699
+
700
+
701
@register_cli(
    name='run_visualize',
    description='Visualize the gsMap results',
    add_args_function=add_Visualization_args,
)
def run_Visualize_from_cli(args: argparse.Namespace):
    """CLI entry point for the ``run_visualize`` subcommand.

    Builds a ``VisualizeConfig`` from ``args`` with
    ``get_dataclass_from_parser`` and passes it to ``run_Visualize``.
    """
    # Imported at call time rather than at module import.
    from gsMap.visualize import run_Visualize

    run_Visualize(get_dataclass_from_parser(args, VisualizeConfig))
708
+
709
+
710
@register_cli(
    name='run_all_mode',
    description='Run gsMap method (the full process)',
    add_args_function=add_all_mode_args,
)
def run_all_mode_from_cli(args: argparse.Namespace):
    """CLI entry point for the ``run_all_mode`` subcommand.

    Obtains the combined config from ``get_runall_mode_config(args)`` and
    runs, in order: find latent representations, latent to gene, generate
    LD score, and spatial LDSC. The Cauchy combination step runs afterwards
    only when ``args.annotation`` is not ``None``.
    """
    # Imported at call time rather than at module import.
    from gsMap.find_latent_representation import run_find_latent_representation
    from gsMap.latent_to_gene import run_latent_to_gene
    from gsMap.generate_ldscore import run_generate_ldscore
    from gsMap.spatial_ldsc_multiple_sumstats import run_spatial_ldsc
    from gsMap.cauchy_combination_test import run_Cauchy_combination

    run_all_config = get_runall_mode_config(args)

    run_find_latent_representation(run_all_config.flr_config)
    run_latent_to_gene(run_all_config.ltg_config)
    run_generate_ldscore(run_all_config.gls_config)
    run_spatial_ldsc(run_all_config.ldsc_config)

    # Without an annotation there is nothing further to run.
    if args.annotation is None:
        return

    run_all_config.cauchy_config.annotation = args.annotation
    run_Cauchy_combination(run_all_config.cauchy_config)
727
+
728
@register_cli(
    name='format_sumstats',
    description='Format gwas summary statistics',
    add_args_function=add_format_sumstats_args,
)
def gwas_format_from_cli(args: argparse.Namespace):
    """CLI entry point for the ``format_sumstats`` subcommand.

    Builds a ``FormatSumstatsConfig`` from ``args`` with
    ``get_dataclass_from_parser`` and passes it to ``gwas_format``.
    """
    # Imported at call time rather than at module import.
    from gsMap.format_sumstats import gwas_format

    gwas_format(get_dataclass_from_parser(args, FormatSumstatsConfig))