gsMap 1.62__py3-none-any.whl → 1.64__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/config.py CHANGED
@@ -1,36 +1,42 @@
1
1
  import argparse
2
2
  import logging
3
- from dataclasses import dataclass, field
4
- from pprint import pprint
5
- from typing import Union, Literal
6
- from pathlib import Path
7
-
8
3
  from collections import OrderedDict, namedtuple
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from pprint import pprint
9
7
  from typing import Callable
10
- from gsMap.__init__ import __version__
8
+ from typing import Union, Literal, Tuple, Optional, List
9
+ from functools import wraps
11
10
  import pyfiglet
12
11
 
12
+ from gsMap.__init__ import __version__
13
+
13
14
  # Global registry to hold functions
14
15
  cli_function_registry = OrderedDict()
15
16
  subcommand = namedtuple('subcommand', ['name', 'func', 'add_args_function', 'description'])
16
- logger = logging.getLogger(__name__)
17
- logger.setLevel(logging.DEBUG)
18
- handler = logging.StreamHandler()
19
- handler.setFormatter(logging.Formatter(
20
- '[{asctime}] {levelname:6s} {message}', style='{'))
21
- logger.addHandler(handler)
22
17
 
23
18
 
19
def get_gsMap_logger(logger_name):
    """Return the named logger configured with gsMap's console format.

    The logger level is set to DEBUG and a ``logging.StreamHandler`` using
    the ``[time] LEVEL | name - message`` layout is attached.

    ``logging.getLogger`` hands back the same object for the same name, so
    the handler is attached only once per logger. Without this guard every
    additional call would add another handler and each record would be
    printed once per call.

    Args:
        logger_name: Name passed to ``logging.getLogger``.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.DEBUG)

    # Only look for the handler installed by this function, so handlers
    # added by other code do not suppress gsMap's own console output.
    already_configured = any(
        getattr(existing, '_gsMap_default_handler', False)
        for existing in logger.handlers
    )
    if not already_configured:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(
            '[{asctime}] {levelname:.5s} | {name} - {message}', style='{'))
        handler._gsMap_default_handler = True
        logger.addHandler(handler)
    return logger
27
+
28
+ logger = get_gsMap_logger('gsMap')
29
+
24
30
  # Decorator to register functions for cli parsing
25
31
  def register_cli(name: str, description: str, add_args_function: Callable) -> Callable:
26
32
  def decorator(func: Callable) -> Callable:
27
33
  def wrapper(*args, **kwargs):
28
34
  name.replace('_', ' ')
29
35
  gsMap_main_logo = pyfiglet.figlet_format("gsMap", font='doom', width=80, justify='center', ).rstrip()
30
- print(gsMap_main_logo, )
36
+ print(gsMap_main_logo, flush=True)
31
37
  version_number = 'Version: ' + __version__
32
- print(version_number.center(80))
33
- print('=' * 80)
38
+ print(version_number.center(80), flush=True)
39
+ print('=' * 80, flush=True)
34
40
  logger.info(f"Running {name}...")
35
41
  func(*args, **kwargs)
36
42
  logger.info(f"Finished running {name}.")
@@ -41,50 +47,28 @@ def register_cli(name: str, description: str, add_args_function: Callable) -> Ca
41
47
 
42
48
  return decorator
43
49
 
50
def add_shared_args(parser):
    """Register the options shared by the gsMap sub-commands on *parser*.

    Adds two required string options: ``--workdir`` and ``--sample_name``.

    Args:
        parser: An ``argparse`` parser (or sub-parser) to extend in place.
    """
    shared_options = (
        ('--workdir', 'Path to the working directory.'),
        ('--sample_name', 'Name of the sample.'),
    )
    for flag, help_text in shared_options:
        parser.add_argument(flag, type=str, required=True, help=help_text)
44
53
 
45
54
def add_find_latent_representations_args(parser):
    """Register the ``find_latent_representations`` options on *parser*.

    The shared options are added first via ``add_shared_args``; the
    command-specific options follow in the order listed below, which is
    also the order they appear in ``--help``.

    Args:
        parser: An ``argparse`` parser (or sub-parser) to extend in place.
    """
    add_shared_args(parser)

    # (flag, add_argument keyword arguments), kept in declaration order.
    option_specs = [
        ('--input_hdf5_path', dict(required=True, type=str, help='Path to the input HDF5 file.')),
        ('--annotation', dict(required=True, type=str, help='Name of the annotation in adata.obs to use.')),
        ('--data_layer', dict(required=True, type=str,
                              help='Data layer for gene expression (e.g., "counts", "log1p").')),
        ('--epochs', dict(type=int, default=300, help='Number of training epochs.')),
        ('--feat_hidden1', dict(type=int, default=256, help='Neurons in the first hidden layer.')),
        ('--feat_hidden2', dict(type=int, default=128, help='Neurons in the second hidden layer.')),
        ('--gat_hidden1', dict(type=int, default=64, help='Units in the first GAT hidden layer.')),
        ('--gat_hidden2', dict(type=int, default=30, help='Units in the second GAT hidden layer.')),
        ('--p_drop', dict(type=float, default=0.1, help='Dropout rate.')),
        ('--gat_lr', dict(type=float, default=0.001, help='Learning rate for the GAT.')),
        ('--n_neighbors', dict(type=int, default=11, help='Number of neighbors for GAT.')),
        ('--n_comps', dict(type=int, default=300, help='Number of principal components for PCA.')),
        ('--weighted_adj', dict(action='store_true', help='Use weighted adjacency in GAT.')),
        ('--var', dict(action='store_true', help='Enable variance calculations.')),
        ('--convergence_threshold', dict(type=float, default=1e-4, help='Threshold for convergence.')),
        ('--hierarchically', dict(action='store_true',
                                  help='Enable hierarchical latent representation finding.')),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
88
72
 
89
73
 
90
74
  def chrom_choice(value):
@@ -109,297 +93,284 @@ def get_dataclass_from_parser(args: argparse.Namespace, data_class: dataclass):
109
93
  return data_class(**remain_kwargs)
110
94
 
111
95
 
112
- def add_generate_ldscore_args(parser):
113
- parser.add_argument('--sample_name', type=str, required=True, help='Sample name')
114
- parser.add_argument('--chrom', type=str, required=True, help='Chromosome number (1-22) or "all"')
115
- parser.add_argument('--ldscore_save_dir', type=str, required=True, help='Directory to save ld score files')
116
- parser.add_argument('--mkscore_feather_file', type=str, required=True, help='Mkscore feather file path')
117
-
118
- # additional baseline annotation
119
- parser.add_argument('--additional_baseline_annotation_dir_path', type=str, default=None,)
120
-
121
- # reference panel
122
- parser.add_argument('--bfile_root', type=str, required=True, help='Bfile root path')
123
- parser.add_argument('--keep_snp_root', type=str, required=True, help='Keep SNP root path')
124
-
125
- # Annotation by gene distance
126
- parser.add_argument('--gtf_annotation_file', type=str, required=True, help='GTF file path')
127
- parser.add_argument('--gene_window_size', type=int, default=50000, help='Gene window size')
128
-
129
- # Enhancer annotation
130
- parser.add_argument('--enhancer_annotation_file', type=str, default=None,
131
- help='Enhancer annotation bed file path, optional.')
132
- parser.add_argument('--snp_multiple_enhancer_strategy', type=str, default='max_mkscore',
133
- choices=['max_mkscore', 'nearest_TSS'], help='Strategy for multiple enhancers per SNP')
134
- parser.add_argument('--gene_window_enhancer_priority', type=str, default=None,
135
- choices=['gene_window_first', 'enhancer_first', 'enhancer_only'],
136
- help='Priority between gene window and enhancer')
137
-
138
- # Arguments for calculating ld score
139
- parser.add_argument('--spots_per_chunk', type=int, default=5_000, help='Number of spots per chunk')
140
- parser.add_argument('--ld_wind', type=int, default=1, help='LD window size')
141
- parser.add_argument('--ld_unit', type=str, default='CM', help='LD window unit (SNP/KB/CM)',
142
- choices=['SNP', 'KB', 'CM'])
143
-
144
-
145
96
def add_latent_to_gene_args(parser):
    """Register the ``latent_to_gene`` options on *parser*.

    This function used to be defined twice in the module; the later
    definition silently replaced the earlier one. The single definition
    kept here reproduces the later (effective) one, so runtime behaviour
    is unchanged.

    NOTE(review): the shadowed earlier definition declared ``--annotation``
    as optional, while the effective one requires it. Confirm which is
    intended before relaxing ``required=True``.

    Args:
        parser: An ``argparse`` parser (or sub-parser) to extend in place.
    """
    add_shared_args(parser)
    parser.add_argument('--annotation', type=str, required=True, help='Name of the annotation layer.')
    parser.add_argument('--no_expression_fraction', action='store_true',
                        help='Skip expression fraction filtering.')
    parser.add_argument('--latent_representation', type=str, choices=['latent_GVAE', 'latent_PCA'],
                        default='latent_GVAE', help='Type of latent representation.')
    parser.add_argument('--num_neighbour', type=int, default=21, help='Number of neighbors.')
    parser.add_argument('--num_neighbour_spatial', type=int, default=101,
                        help='Number of spatial neighbors.')
    # A '--species' option exists only as commented-out code in the source
    # and is therefore not registered.
    parser.add_argument('--homolog_file', type=str,
                        help='Path to homologous gene conversion file (optional).')


def add_generate_ldscore_args(parser):
    """Register the ``generate_ldscore`` options on *parser*.

    The shared options are added first via ``add_shared_args``.

    Args:
        parser: An ``argparse`` parser (or sub-parser) to extend in place.
    """
    add_shared_args(parser)

    # Reference panel and annotation inputs.
    parser.add_argument('--chrom', type=str, required=True, help='Chromosome id (1-22) or "all".')
    parser.add_argument('--bfile_root', type=str, required=True,
                        help='Root path for genotype plink bfiles (.bim, .bed, .fam).')
    parser.add_argument('--keep_snp_root', type=str, required=True, help='Root path for SNP files.')
    parser.add_argument('--gtf_annotation_file', type=str, required=True,
                        help='Path to GTF annotation file.')
    parser.add_argument('--gene_window_size', type=int, default=50000,
                        help='Gene window size in base pairs.')

    # Enhancer handling.
    parser.add_argument('--enhancer_annotation_file', type=str,
                        help='Path to enhancer annotation file (optional).')
    parser.add_argument('--snp_multiple_enhancer_strategy', type=str,
                        choices=['max_mkscore', 'nearest_TSS'], default='max_mkscore',
                        help='Strategy for handling multiple enhancers per SNP.')
    parser.add_argument('--gene_window_enhancer_priority', type=str,
                        choices=['gene_window_first', 'enhancer_first', 'enhancer_only'],
                        help='Priority between gene window and enhancer annotations.')

    # LD score computation.
    parser.add_argument('--spots_per_chunk', type=int, default=1000, help='Number of spots per chunk.')
    parser.add_argument('--ld_wind', type=int, default=1, help='LD window size.')
    parser.add_argument('--ld_unit', type=str, choices=['SNP', 'KB', 'CM'], default='CM',
                        help='Unit for LD window.')
    parser.add_argument('--additional_baseline_annotation', type=str, default=None,
                        help='Path of additional baseline annotations')
187
136
 
188
- parser.add_argument('--num_processes', type=int, default=4, help="Number of processes for parallel computing.")
189
137
 
190
- return parser
138
def add_spatial_ldsc_args(parser):
    """Register the ``spatial_ldsc`` options on *parser*.

    The shared options are added first via ``add_shared_args``.

    ``--use_additional_baseline_annotation`` accepts an optional value.
    Given bare it is ``True``; omitted it defaults to ``True``; given with
    a value, that value is parsed as a boolean. The previous ``type=bool``
    converter turned every non-empty string, including ``"False"``, into
    ``True``, so the option could never be switched off.

    Args:
        parser: An ``argparse`` parser (or sub-parser) to extend in place.
    """
    add_shared_args(parser)
    parser.add_argument('--sumstats_file', type=str, required=True,
                        help='Path to GWAS summary statistics file.')
    parser.add_argument('--w_file', type=str, required=True, help='Path to regression weight file.')
    parser.add_argument('--trait_name', type=str, required=True,
                        help='Name of the trait being analyzed.')
    parser.add_argument('--n_blocks', type=int, default=200,
                        help='Number of blocks for jackknife resampling.')
    parser.add_argument('--chisq_max', type=int, help='Maximum chi-square value for filtering SNPs.')
    parser.add_argument('--num_processes', type=int, default=4,
                        help='Number of processes for parallel computing.')
    parser.add_argument('--use_additional_baseline_annotation', type=_parse_bool_flag, nargs='?',
                        const=True, default=True,
                        help='Use additional baseline annotations when provided')


def _parse_bool_flag(value):
    """Convert a command-line token to ``bool``; reject unrecognised text."""
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays falsy, matching what bool('') returned before.
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')
191
147
 
192
148
 
193
149
def add_Cauchy_combination_args(parser):
    """Register the Cauchy-combination options on *parser*.

    The shared options are added first via ``add_shared_args``, followed
    by two required options (``--trait_name``, ``--annotation``) and two
    optional ones (``--meta``, ``--slide``). All are strings.

    Args:
        parser: An ``argparse`` parser (or sub-parser) to extend in place.
    """
    add_shared_args(parser)

    mandatory = (
        ('--trait_name', 'Name of the trait being analyzed.'),
        ('--annotation', 'Name of the annotation in adata.obs to use.'),
    )
    for flag, help_text in mandatory:
        parser.add_argument(flag, type=str, required=True, help=help_text)

    optional = (
        ('--meta', 'Optional meta information.'),
        ('--slide', 'Optional slide information.'),
    )
    for flag, help_text in optional:
        parser.add_argument(flag, type=str, help=help_text)
155
+
156
+
157
def add_report_args(parser):
    """Register the report-generation options on *parser*.

    The shared options are added first via ``add_shared_args``. A
    ``--plot_type`` option exists only as commented-out code in the source
    and is not registered.

    Args:
        parser: An ``argparse`` parser (or sub-parser) to extend in place.
    """
    add_shared_args(parser)

    # (flag, add_argument keyword arguments), kept in declaration order.
    option_specs = [
        ('--trait_name', dict(type=str, required=True,
                              help='Name of the trait to generate the report for.')),
        ('--annotation', dict(type=str, required=True, help='Annotation layer name.')),
        ('--top_corr_genes', dict(type=int, default=50,
                                  help='Number of top correlated genes to display.')),
        ('--selected_genes', dict(type=str, nargs='*',
                                  help='List of specific genes to include in the report (optional).')),
        ('--sumstats_file', dict(type=str, required=True,
                                 help='Path to GWAS summary statistics file.')),
        # Optional figure customisation.
        ('--fig_width', dict(type=int, default=None,
                             help='Width of the generated figures in pixels.')),
        ('--fig_height', dict(type=int, default=None,
                              help='Height of the generated figures in pixels.')),
        ('--point_size', dict(type=int, default=None, help='Point size for the figures.')),
        ('--fig_style', dict(type=str, default='light', choices=['dark', 'light'],
                             help='Style of the generated figures.')),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
333
175
 
334
176
def add_format_sumstats_args(parser):
    """Register the ``format_sumstats`` options on *parser*.

    ``--chunksize`` now defaults to the integer ``1_000_000``. The previous
    default ``1e+6`` was a float, and argparse applies ``type=`` only to
    string defaults, so callers received ``1000000.0`` unless the option
    was given explicitly.

    Args:
        parser: An ``argparse`` parser (or sub-parser) to extend in place.
    """
    # Required arguments
    parser.add_argument('--sumstats', required=True, type=str,
                        help='Path to gwas summary data')
    parser.add_argument('--out', required=True, type=str,
                        help='Path to save the formatted gwas data')

    # Arguments for specify column name (all optional strings, default None)
    column_name_options = (
        ('--snp', "Name of snp column (if not a name that gsMap understands)"),
        ('--a1', "Name of effect allele column (if not a name that gsMap understands)"),
        ('--a2', "Name of none-effect allele column (if not a name that gsMap understands)"),
        ('--info', "Name of info column (if not a name that gsMap understands)"),
        ('--beta', "Name of gwas beta column (if not a name that gsMap understands)."),
        ('--se', "Name of gwas standar error of beta column (if not a name that gsMap understands)"),
        ('--p', "Name of p-value column (if not a name that gsMap understands)"),
        ('--frq', "Name of A1 ferquency column (if not a name that gsMap understands)"),
        ('--n', "Name of sample size column (if not a name that gsMap understands)"),
        ('--z', "Name of gwas Z-statistics column (if not a name that gsMap understands)"),
        ('--OR', "Name of gwas OR column (if not a name that gsMap understands)"),
        ('--se_OR', "Name of standar error of OR column (if not a name that gsMap understands)"),
    )
    for flag, help_text in column_name_options:
        parser.add_argument(flag, default=None, type=str, help=help_text)

    # Arguments for convert SNP (chr, pos) to rsid
    parser.add_argument('--chr', default="Chr", type=str,
                        help="Name of SNP chromosome column (if not a name that gsMap understands)")
    parser.add_argument('--pos', default="Pos", type=str,
                        help="Name of SNP positions column (if not a name that gsMap understands)")
    parser.add_argument('--dbsnp', default=None, type=str,
                        help='Path to reference dnsnp file')
    # Integer literal on purpose: argparse does not run type= on non-string defaults.
    parser.add_argument('--chunksize', default=1_000_000, type=int,
                        help='Chunk size for loading dbsnp file')

    # Arguments for output format and quality
    parser.add_argument('--format', default='gsMap', type=str,
                        help='Format of output data', choices=['gsMap', 'COJO'])
    parser.add_argument('--info_min', default=0.9, type=float,
                        help='Minimum INFO score.')
    parser.add_argument('--maf_min', default=0.01, type=float,
                        help='Minimum MAF.')
    parser.add_argument('--keep_chr_pos', action='store_true', default=False,
                        help='Keep SNP chromosome and position columns in the output data')
228
+
229
+ def add_run_all_mode_args(parser):
230
+ add_shared_args(parser)
231
+
232
+ # Required paths and configurations
233
+ parser.add_argument('--gsMap_resource_dir', type=str, required=True,
234
+ help='Directory containing gsMap resources (e.g., genome annotations, LD reference panel, etc.).')
235
+ parser.add_argument('--hdf5_path', type=str, required=True,
236
+ help='Path to the input spatial transcriptomics data (H5AD format).')
237
+ parser.add_argument('--annotation', type=str, required=True,
238
+ help='Name of the annotation in adata.obs to use.')
239
+ parser.add_argument('--data_layer', type=str, default='X',
240
+ help='Data layer of h5ad for gene expression (e.g., "counts", "log1p", "X").')
241
+
242
+ # GWAS Data Parameters
243
+ parser.add_argument('--trait_name', type=str, help='Name of the trait for GWAS analysis (required if sumstats_file is provided).')
244
+ parser.add_argument('--sumstats_file', type=str,
245
+ help='Path to GWAS summary statistics file. Either sumstats_file or sumstats_config_file is required.')
246
+ parser.add_argument('--sumstats_config_file', type=str,
247
+ help='Path to GWAS summary statistics config file. Either sumstats_file or sumstats_config_file is required.')
248
+
249
+ # Homolog Data Parameters
250
+ parser.add_argument('--homolog_file', type=str,
251
+ help='Path to homologous gene for converting gene names from different species to human (optional, used for cross-species analysis).')
252
+
253
+ # Maximum number of processes
254
+ parser.add_argument('--max_processes', type=int, default=10,
255
+ help='Maximum number of processes for parallel execution.')
256
+
257
+ # # Optional paths for customization
258
+ # parser.add_argument('--bfile_root', type=str,
259
+ # help='Root path to PLINK bfiles (LD reference panel). If not provided, it will use the default in gsMap_resource_dir.')
260
+ # parser.add_argument('--keep_snp_root', type=str,
261
+ # help='Root path for SNP filtering. If not provided, it will use the default in gsMap_resource_dir.')
262
+ # parser.add_argument('--w_file', type=str,
263
+ # help='Path to the regression weight file. If not provided, it will use the default in gsMap_resource_dir.')
264
+ # parser.add_argument('--snp_gene_weight_adata_path', type=str,
265
+ # help='Path to the SNP-gene weight matrix file. If not provided, it will use the default in gsMap_resource_dir.')
266
+ # parser.add_argument('--baseline_annotation_dir', type=str,
267
+ # help='Directory containing the baseline annotations for quick mode. If not provided, it will use the default in gsMap_resource_dir.')
268
+ # parser.add_argument('--SNP_gene_pair_dir', type=str,
269
+ # help='Directory for SNP-gene pair data. If not provided, it will use the default in gsMap_resource_dir.')
270
+
271
+
272
+ def ensure_path_exists(func):
273
+ @wraps(func)
274
+ def wrapper(*args, **kwargs):
275
+ result = func(*args, **kwargs)
276
+ if isinstance(result, Path):
277
+ if result.suffix:
278
+ result.parent.mkdir(parents=True, exist_ok=True, mode=0o755)
279
+ else: # It's a directory path
280
+ result.mkdir(parents=True, exist_ok=True, mode=0o755)
281
+ return result
282
+
283
+ return wrapper
284
+
386
285
 
387
286
  @dataclass
388
- class FindLatentRepresentationsConfig:
389
- input_hdf5_path: str
390
- output_hdf5_path: str
287
+ class ConfigWithAutoPaths:
288
+ workdir: str
391
289
  sample_name: str
290
+
291
+ def __post_init__(self):
292
+ if self.workdir is None:
293
+ raise ValueError('workdir must be provided.')
294
+
295
+ @property
296
+ @ensure_path_exists
297
+ def hdf5_with_latent_path(self) -> Path:
298
+ return Path(f'{self.workdir}/{self.sample_name}/find_latent_representations/{self.sample_name}_add_latent.h5ad')
299
+
300
+ @property
301
+ @ensure_path_exists
302
+ def mkscore_feather_path(self) -> Path:
303
+ return Path(f'{self.workdir}/{self.sample_name}/latent_to_gene/{self.sample_name}_gene_marker_score.feather')
304
+
305
+ @property
306
+ @ensure_path_exists
307
+ def ldscore_save_dir(self) -> Path:
308
+ return Path(f'{self.workdir}/{self.sample_name}/generate_ldscore')
309
+
310
+ @property
311
+ @ensure_path_exists
312
+ def ldsc_save_dir(self) -> Path:
313
+ return Path(f'{self.workdir}/{self.sample_name}/spatial_ldsc')
314
+
315
+ @property
316
+ @ensure_path_exists
317
+ def cauchy_save_dir(self) -> Path:
318
+ return Path(f'{self.workdir}/{self.sample_name}/cauchy_combination')
319
+
320
+ @ensure_path_exists
321
+ def get_report_dir(self, trait_name: str) -> Path:
322
+ return Path(f'{self.workdir}/{self.sample_name}/report/{trait_name}')
323
+
324
+ def get_gsMap_report_file(self, trait_name: str) -> Path:
325
+ return self.get_report_dir(trait_name) / f'{self.sample_name}_{trait_name}_gsMap_Report.html'
326
+
327
+ @ensure_path_exists
328
+ def get_manhattan_html_plot_path(self, trait_name: str) -> Path:
329
+ return Path(
330
+ f'{self.workdir}/{self.sample_name}/report/{trait_name}/manhattan_plot/{self.sample_name}_{trait_name}_Diagnostic_Manhattan_Plot.html')
331
+
332
+ @ensure_path_exists
333
+ def get_GSS_plot_dir(self, trait_name: str) -> Path:
334
+ return Path(f'{self.workdir}/{self.sample_name}/report/{trait_name}/GSS_plot')
335
+
336
+ def get_GSS_plot_select_gene_file(self, trait_name: str) -> Path:
337
+ return self.get_GSS_plot_dir(trait_name) / 'plot_genes.csv'
338
+
339
+ @ensure_path_exists
340
+ def get_ldsc_result_file(self, trait_name: str) -> Path:
341
+ return Path(f'{self.ldsc_save_dir}/{self.sample_name}_{trait_name}.csv.gz')
342
+
343
+ @ensure_path_exists
344
+ def get_cauchy_result_file(self, trait_name: str) -> Path:
345
+ return Path(f'{self.cauchy_save_dir}/{self.sample_name}_{trait_name}.Cauchy.csv.gz')
346
+
347
+ @ensure_path_exists
348
+ def get_gene_diagnostic_info_save_path(self, trait_name: str) -> Path:
349
+ return Path(
350
+ f'{self.workdir}/{self.sample_name}/report/{trait_name}/{self.sample_name}_{trait_name}_Gene_Diagnostic_Info.csv')
351
+
352
+ @ensure_path_exists
353
+ def get_gsMap_plot_save_dir(self, trait_name: str) -> Path:
354
+ return Path(f'{self.workdir}/{self.sample_name}/report/{trait_name}/gsMap_plot')
355
+
356
+ def get_gsMap_html_plot_save_path(self, trait_name: str) -> Path:
357
+ return self.get_gsMap_plot_save_dir(trait_name) / f'{self.sample_name}_{trait_name}_gsMap_plot.html'
358
+
359
+ @dataclass
360
+ class FindLatentRepresentationsConfig(ConfigWithAutoPaths):
361
+ input_hdf5_path: str
362
+ # output_hdf5_path: str
392
363
  annotation: str = None
393
- type: str = None
364
+ data_layer: str = None
394
365
 
395
366
  epochs: int = 300
396
367
  feat_hidden1: int = 256
397
368
  feat_hidden2: int = 128
398
369
  feat_cell: int = 3000
399
- gcn_hidden1: int = 64
400
- gcn_hidden2: int = 30
370
+ gat_hidden1: int = 64
371
+ gat_hidden2: int = 30
401
372
  p_drop: float = 0.1
402
- gcn_lr: float = 0.001
373
+ gat_lr: float = 0.001
403
374
  gcn_decay: float = 0.01
404
375
  n_neighbors: int = 11
405
376
  label_w: float = 1
@@ -413,6 +384,7 @@ class FindLatentRepresentationsConfig:
413
384
  hierarchically: bool = False
414
385
 
415
386
  def __post_init__(self):
387
+ # self.output_hdf5_path = self.hdf5_with_latent_path
416
388
  if self.hierarchically:
417
389
  if self.annotation is None:
418
390
  raise ValueError('annotation must be provided if hierarchically is True.')
@@ -428,29 +400,43 @@ class FindLatentRepresentationsConfig:
428
400
 
429
401
 
430
402
  @dataclass
431
- class LatentToGeneConfig:
432
- input_hdf5_with_latent_path: str
433
- sample_name: str
434
- output_feather_path: str
403
+ class LatentToGeneConfig(ConfigWithAutoPaths):
404
+ # input_hdf5_with_latent_path: str
405
+ # output_feather_path: str
435
406
  no_expression_fraction: bool = False
436
407
  latent_representation: str = 'latent_GVAE'
437
408
  num_neighbour: int = 21
438
409
  num_neighbour_spatial: int = 101
439
- species: str = None
440
- gs_species: str = None
410
+ homolog_file: str = None
441
411
  gM_slices: str = None
442
412
  annotation: str = None
443
- type: str = None
413
+
414
+ def __post_init__(self):
415
+ if self.homolog_file is not None:
416
+ logger.info(f"User provided homolog file to map gene names to human: {self.homolog_file}")
417
+ # check the format of the homolog file
418
+ with open(self.homolog_file, 'r') as f:
419
+ first_line = f.readline().strip()
420
+ _n_col = len(first_line.split())
421
+ if _n_col != 2:
422
+ raise ValueError(
423
+ f"Invalid homolog file format. Expected 2 columns, first column should be other species gene name, second column should be human gene name. "
424
+ f"Got {_n_col} columns in the first line.")
425
+ else:
426
+ first_col_name, second_col_name = first_line.split()
427
+ self.species = first_col_name
428
+ logger.info(
429
+ f"Homolog file provided and will map gene name from column1:{first_col_name} to column2:{second_col_name}")
430
+ else:
431
+ logger.info("No homolog file provided. Run in human mode.")
444
432
 
445
433
 
446
434
  @dataclass
447
- class GenerateLDScoreConfig:
448
- sample_name: str
435
+ class GenerateLDScoreConfig(ConfigWithAutoPaths):
449
436
  chrom: Union[int, str]
450
- ldscore_save_dir: str
451
- mkscore_feather_file: str
437
+
452
438
  bfile_root: str
453
- keep_snp_root: str
439
+ keep_snp_root: Optional[str]
454
440
 
455
441
  # annotation by gene distance
456
442
  gtf_annotation_file: str
@@ -459,15 +445,28 @@ class GenerateLDScoreConfig:
459
445
  # annotation by enhancer
460
446
  enhancer_annotation_file: str = None
461
447
  snp_multiple_enhancer_strategy: Literal['max_mkscore', 'nearest_TSS'] = 'max_mkscore'
462
- gene_window_enhancer_priority: Literal['gene_window_first', 'enhancer_first', 'enhancer_only',] = None
448
+ gene_window_enhancer_priority: Optional[Literal['gene_window_first', 'enhancer_first', 'enhancer_only',]] = None
463
449
 
464
450
  # for calculating ld score
465
- additional_baseline_annotation_dir_path: str = None
466
- spots_per_chunk: int = 5_000
451
+ additional_baseline_annotation: str = None
452
+ spots_per_chunk: int = 1_000
467
453
  ld_wind: int = 1
468
454
  ld_unit: str = 'CM'
469
455
 
456
+ # zarr config
457
+ ldscore_save_format: Literal['feather', 'zarr', 'quick_mode'] = 'feather'
458
+
459
+ zarr_chunk_size: Tuple[int, int] = None
460
+
461
+ # for pre calculating the SNP Gene ldscore Weight
462
+ save_pre_calculate_snp_gene_weight_matrix: bool = False
463
+
464
+ baseline_annotation_dir: Optional[str] = None
465
+ SNP_gene_pair_dir: Optional[str] = None
470
466
  def __post_init__(self):
467
+ # if self.mkscore_feather_file is None:
468
+ # self.mkscore_feather_file = self._get_mkscore_feather_path()
469
+
471
470
  if self.enhancer_annotation_file is not None and self.gene_window_enhancer_priority is None:
472
471
  logger.warning("enhancer_annotation_file is provided but gene_window_enhancer_priority is not provided. "
473
472
  "by default, gene_window_enhancer_priority is set to 'enhancer_only', when enhancer_annotation_file is provided.")
@@ -489,42 +488,55 @@ class GenerateLDScoreConfig:
489
488
  f'Only gene window annotation will be used to calculate LD score. SNP within +-{self.gene_window_size} bp of gene body will be used. ')
490
489
 
491
490
  # remind for baseline annotation
492
- if self.additional_baseline_annotation_dir_path is None:
491
+ if self.additional_baseline_annotation is None:
493
492
  logger.info(f'------Baseline annotation is not provided. Default baseline annotation will be used.')
494
493
  else:
495
- logger.info(f'------Baseline annotation is provided. Additional baseline annotation will be used with the default baseline annotation.')
496
- logger.info(f'------Baseline annotation directory: {self.additional_baseline_annotation_dir_path}')
494
+ logger.info(
495
+ f'------Baseline annotation is provided. Additional baseline annotation will be used with the default baseline annotation.')
496
+ logger.info(f'------Baseline annotation directory: {self.additional_baseline_annotation}')
497
497
  # check the existence of baseline annotation
498
498
  if self.chrom == 'all':
499
499
  for chrom in range(1, 23):
500
500
  chrom = str(chrom)
501
- baseline_annotation_path = Path(self.additional_baseline_annotation_dir_path) / f'baseline.{chrom}.annot.gz'
501
+ baseline_annotation_path = Path(
502
+ self.additional_baseline_annotation) / f'baseline.{chrom}.annot.gz'
502
503
  if not baseline_annotation_path.exists():
503
- raise FileNotFoundError(f'baseline.{chrom}.annot.gz is not found in {self.additional_baseline_annotation_dir_path}.')
504
+ raise FileNotFoundError(
505
+ f'baseline.{chrom}.annot.gz is not found in {self.additional_baseline_annotation}.')
504
506
  else:
505
- baseline_annotation_path = Path(self.additional_baseline_annotation_dir_path) / f'baseline.{self.chrom}.annot.gz'
507
+ baseline_annotation_path = Path(
508
+ self.additional_baseline_annotation) / f'baseline.{self.chrom}.annot.gz'
506
509
  if not baseline_annotation_path.exists():
507
- raise FileNotFoundError(f'baseline.{self.chrom}.annot.gz is not found in {self.additional_baseline_annotation_dir_path}.')
510
+ raise FileNotFoundError(
511
+ f'baseline.{self.chrom}.annot.gz is not found in {self.additional_baseline_annotation}.')
508
512
 
513
+ # set the default zarr chunk size
514
+ if self.ldscore_save_format == 'zarr' and self.zarr_chunk_size is None:
515
+ self.zarr_chunk_size = (10_000, self.spots_per_chunk)
509
516
 
510
517
 
511
518
  @dataclass
512
- class SpatialLDSCConfig:
513
- sample_name: str
519
+ class SpatialLDSCConfig(ConfigWithAutoPaths):
514
520
  w_file: str
515
- ldscore_input_dir: str
516
- ldsc_save_dir: str
517
- disable_additional_baseline_annotation: bool = False
518
- trait_name: str = None
519
- sumstats_file: str = None
520
- sumstats_config_file: str = None
521
+ # ldscore_save_dir: str
522
+ use_additional_baseline_annotation: bool = True
523
+ trait_name: Optional[str] = None
524
+ sumstats_file: Optional[str] = None
525
+ sumstats_config_file: Optional[str] = None
521
526
  num_processes: int = 4
522
527
  not_M_5_50: bool = False
523
528
  n_blocks: int = 200
524
- chisq_max: int = None
525
- all_chunk: int = None
529
+ chisq_max: Optional[int] = None
530
+ all_chunk: Optional[int] = None
531
+ chunk_range: Optional[Tuple[int, int]] = None
532
+
533
+ ldscore_save_format: Literal['feather', 'zarr', 'quick_mode'] = 'feather'
534
+
535
+ spots_per_chunk_quick_mode: int = 1_000
536
+ snp_gene_weight_adata_path: Optional[str] = None
526
537
 
527
538
  def __post_init__(self):
539
+ super().__post_init__()
528
540
  if self.sumstats_file is None and self.sumstats_config_file is None:
529
541
  raise ValueError('One of sumstats_file and sumstats_config_file must be provided.')
530
542
  if self.sumstats_file is not None and self.sumstats_config_file is not None:
@@ -540,62 +552,54 @@ class SpatialLDSCConfig:
540
552
  with open(self.sumstats_config_file) as f:
541
553
  config = yaml.load(f, Loader=yaml.FullLoader)
542
554
  for trait_name, sumstats_file in config.items():
543
- self.sumstats_config_dict[trait_name] = sumstats_file
555
+ assert Path(sumstats_file).exists(), f'{sumstats_file} does not exist.'
544
556
  # load the sumstats file
545
557
  elif self.sumstats_file is not None:
546
558
  self.sumstats_config_dict[self.trait_name] = self.sumstats_file
547
559
  else:
548
560
  raise ValueError('One of sumstats_file and sumstats_config_file must be provided.')
549
561
 
562
+ for sumstats_file in self.sumstats_config_dict.values():
563
+ assert Path(sumstats_file).exists(), f'{sumstats_file} does not exist.'
564
+
550
565
  # check if additional baseline annotation is exist
551
- self.use_additional_baseline_annotation = False
552
- self.process_additional_baseline_annotation()
566
+ # self.use_additional_baseline_annotation = False
567
+
568
+ if self.use_additional_baseline_annotation:
569
+ self.process_additional_baseline_annotation()
553
570
 
554
571
  def process_additional_baseline_annotation(self):
555
- additional_baseline_annotation_dir_path = Path(self.ldscore_input_dir) / 'additional_baseline'
556
- dir_exists = additional_baseline_annotation_dir_path.exists()
572
+ additional_baseline_annotation = Path(self.ldscore_save_dir) / 'additional_baseline'
573
+ dir_exists = additional_baseline_annotation.exists()
557
574
 
558
575
  if not dir_exists:
559
- if self.use_additional_baseline_annotation:
560
- logger.warning(f"additional_baseline directory is not found in {self.ldscore_input_dir}.")
561
- print('''\
562
- if you want to use additional baseline annotation,
563
- please provide additional baseline annotation when calculating ld score.
564
- ''')
565
- raise FileNotFoundError(
566
- f'additional_baseline directory is not found. You should disable use_additional_baseline_annotation')
567
- return
568
-
569
- self.use_additional_baseline_annotation = self.use_additional_baseline_annotation or True
570
-
571
- if self.disable_additional_baseline_annotation:
572
- logger.warning(
573
- f"additional_baseline directory is found in {self.ldscore_input_dir}, but use_additional_baseline_annotation is disabled.")
574
- print('''\
575
- if you want to use additional baseline annotation,
576
- please enable by not adding --disable_additional_baseline_annotation.
577
- ''')
578
576
  self.use_additional_baseline_annotation = False
577
+ # if self.use_additional_baseline_annotation:
578
+ # logger.warning(f"additional_baseline directory is not found in {self.ldscore_save_dir}.")
579
+ # print('''\
580
+ # if you want to use additional baseline annotation,
581
+ # please provide additional baseline annotation when calculating ld score.
582
+ # ''')
583
+ # raise FileNotFoundError(
584
+ # f'additional_baseline directory is not found.')
585
+ # return
586
+ # self.use_additional_baseline_annotation = self.use_additional_baseline_annotation or True
579
587
  else:
580
588
  logger.info(
581
589
  f'------Additional baseline annotation is provided. It will be used with the default baseline annotation.')
582
- logger.info(f'------Additional baseline annotation directory: {additional_baseline_annotation_dir_path}')
590
+ logger.info(f'------Additional baseline annotation directory: {additional_baseline_annotation}')
583
591
 
584
592
  chrom_list = range(1, 23)
585
593
  for chrom in chrom_list:
586
- baseline_annotation_path = additional_baseline_annotation_dir_path / f'baseline.{chrom}.l2.ldscore.feather'
594
+ baseline_annotation_path = additional_baseline_annotation / f'baseline.{chrom}.l2.ldscore.feather'
587
595
  if not baseline_annotation_path.exists():
588
596
  raise FileNotFoundError(
589
- f'baseline.{chrom}.annot.gz is not found in {additional_baseline_annotation_dir_path}.')
597
+ f'baseline.{chrom}.annot.gz is not found in {additional_baseline_annotation}.')
598
+ return None
590
599
 
591
600
 
592
- logger
593
601
  @dataclass
594
- class CauchyCombinationConfig:
595
- input_hdf5_path: str
596
- input_ldsc_dir: str
597
- output_cauchy_dir: str
598
- sample_name: str
602
+ class CauchyCombinationConfig(ConfigWithAutoPaths):
599
603
  trait_name: str
600
604
  annotation: str
601
605
  meta: str = None
@@ -603,11 +607,7 @@ class CauchyCombinationConfig:
603
607
 
604
608
 
605
609
  @dataclass
606
- class VisualizeConfig:
607
- input_hdf5_path: str
608
- input_ldsc_dir: str
609
- output_figure_dir: str
610
- sample_name: str
610
+ class VisualizeConfig(ConfigWithAutoPaths):
611
611
  trait_name: str
612
612
 
613
613
  annotation: str = None
@@ -619,12 +619,91 @@ class VisualizeConfig:
619
619
 
620
620
 
621
621
  @dataclass
622
- class RunAllModeConfig:
623
- flr_config: FindLatentRepresentationsConfig
624
- ltg_config: LatentToGeneConfig
625
- gls_config: GenerateLDScoreConfig
626
- ldsc_config: SpatialLDSCConfig
627
- cauchy_config: CauchyCombinationConfig
622
+ class DiagnosisConfig(ConfigWithAutoPaths):
623
+ annotation: str
624
+ # mkscore_feather_file: str
625
+
626
+ trait_name: str
627
+ sumstats_file: str
628
+ plot_type: Literal['manhattan', 'GSS', 'gsMap', 'all'] = 'all'
629
+ top_corr_genes: int = 50
630
+ selected_genes: Optional[List[str]] = None
631
+
632
+ fig_width: Optional[int] = None
633
+ fig_height: Optional[int] = None
634
+ point_size: Optional[int] = None
635
+ fig_style: Literal['dark', 'light'] = 'light'
636
+
637
+ def __post_init__(self):
638
+ if any([self.fig_width, self.fig_height, self.point_size]):
639
+ logger.info('Customizing the figure size and point size.')
640
+ assert all([self.fig_width, self.fig_height, self.point_size]), 'All of fig_width, fig_height, and point_size must be provided.'
641
+ self.customize_fig = True
642
+ else:
643
+ self.customize_fig = False
644
+ @dataclass
645
+ class ReportConfig(DiagnosisConfig):
646
+ pass
647
+
648
+
649
+ @dataclass
650
+ class RunAllModeConfig(ConfigWithAutoPaths):
651
+ gsMap_resource_dir: str
652
+
653
+ # == ST DATA PARAMETERS ==
654
+ hdf5_path: str
655
+ annotation: str
656
+ data_layer: str = 'X'
657
+
658
+ # ==GWAS DATA PARAMETERS==
659
+ trait_name: Optional[str] = None
660
+ sumstats_file: Optional[str] = None
661
+ sumstats_config_file: Optional[str] = None
662
+
663
+ # === homolog PARAMETERS ===
664
+ homolog_file: Optional[str] = None
665
+
666
+ max_processes: int = 10
667
+
668
+ def __post_init__(self):
669
+ super().__post_init__()
670
+ self.gtffile = f"{self.gsMap_resource_dir}/genome_annotation/gtf/gencode.v39lift37.annotation.gtf"
671
+ self.bfile_root = f"{self.gsMap_resource_dir}/LD_Reference_Panel/1000G_EUR_Phase3_plink/1000G.EUR.QC"
672
+ self.keep_snp_root = f"{self.gsMap_resource_dir}/LDSC_resource/hapmap3_snps/hm"
673
+ self.w_file = f"{self.gsMap_resource_dir}/LDSC_resource/weights_hm3_no_hla/weights."
674
+ self.snp_gene_weight_adata_path = f"{self.gsMap_resource_dir}/quick_mode/snp_gene_weight_matrix.h5ad"
675
+ self.baseline_annotation_dir = Path(f"{self.gsMap_resource_dir}/quick_mode/baseline").resolve()
676
+ self.SNP_gene_pair_dir = Path(f"{self.gsMap_resource_dir}/quick_mode/SNP_gene_pair").resolve()
677
+ # check the existence of the input files and resources files
678
+ for file in [self.hdf5_path, self.gtffile]:
679
+ if not Path(file).exists():
680
+ raise FileNotFoundError(f"File {file} does not exist.")
681
+
682
+ if self.sumstats_file is None and self.sumstats_config_file is None:
683
+ raise ValueError('One of sumstats_file and sumstats_config_file must be provided.')
684
+ if self.sumstats_file is not None and self.sumstats_config_file is not None:
685
+ raise ValueError('Only one of sumstats_file and sumstats_config_file must be provided.')
686
+ if self.sumstats_file is not None and self.trait_name is None:
687
+ raise ValueError('trait_name must be provided if sumstats_file is provided.')
688
+ if self.sumstats_config_file is not None and self.trait_name is not None:
689
+ raise ValueError('trait_name must not be provided if sumstats_config_file is provided.')
690
+ self.sumstats_config_dict = {}
691
+ # load the sumstats config file
692
+ if self.sumstats_config_file is not None:
693
+ import yaml
694
+ with open(self.sumstats_config_file) as f:
695
+ config = yaml.load(f, Loader=yaml.FullLoader)
696
+ for trait_name, sumstats_file in config.items():
697
+ assert Path(sumstats_file).exists(), f'{sumstats_file} does not exist.'
698
+ self.sumstats_config_dict[trait_name] = sumstats_file
699
+ # load the sumstats file
700
+ elif self.sumstats_file is not None and self.trait_name is not None:
701
+ self.sumstats_config_dict[self.trait_name] = self.sumstats_file
702
+ else:
703
+ raise ValueError('One of sumstats_file and sumstats_config_file must be provided.')
704
+
705
+ for sumstats_file in self.sumstats_config_dict.values():
706
+ assert Path(sumstats_file).exists(), f'{sumstats_file} does not exist.'
628
707
 
629
708
 
630
709
  @dataclass
@@ -650,7 +729,7 @@ class FormatSumstatsConfig:
650
729
  chunksize: int = 1e+7
651
730
  info_min: float = 0.9
652
731
  maf_min: float = 0.01
653
- keep_chr_pos:bool = False
732
+ keep_chr_pos: bool = False
654
733
 
655
734
 
656
735
  @register_cli(name='run_find_latent_representations',
@@ -698,37 +777,27 @@ def run_Cauchy_combination_from_cli(args: argparse.Namespace):
698
777
  run_Cauchy_combination(config)
699
778
 
700
779
 
701
- @register_cli(name='run_visualize',
702
- description='Visualize the gsMap results',
703
- add_args_function=add_Visualization_args)
704
- def run_Visualize_from_cli(args: argparse.Namespace):
705
- from gsMap.visualize import run_Visualize
706
- config = get_dataclass_from_parser(args, VisualizeConfig)
707
- run_Visualize(config)
780
+ @register_cli(name='run_report',
781
+ description='Run Report to generate diagnostic plots and tables',
782
+ add_args_function=add_report_args)
783
+ def run_Report_from_cli(args: argparse.Namespace):
784
+ from gsMap.report import run_report
785
+ config = get_dataclass_from_parser(args, ReportConfig)
786
+ run_report(config)
708
787
 
709
788
 
710
- @register_cli(name='run_all_mode',
711
- description='Run gsMap method (the full process)',
712
- add_args_function=add_all_mode_args)
713
- def run_all_mode_from_cli(args: argparse.Namespace):
714
- from gsMap.find_latent_representation import run_find_latent_representation
715
- from gsMap.latent_to_gene import run_latent_to_gene
716
- from gsMap.generate_ldscore import run_generate_ldscore
717
- from gsMap.spatial_ldsc_multiple_sumstats import run_spatial_ldsc
718
- from gsMap.cauchy_combination_test import run_Cauchy_combination
719
- config = get_runall_mode_config(args)
720
- run_find_latent_representation(config.flr_config)
721
- run_latent_to_gene(config.ltg_config)
722
- run_generate_ldscore(config.gls_config)
723
- run_spatial_ldsc(config.ldsc_config)
724
- if args.annotation is not None:
725
- config.cauchy_config.annotation = args.annotation
726
- run_Cauchy_combination(config.cauchy_config)
727
-
728
789
  @register_cli(name='format_sumstats',
729
790
  description='Format gwas summary statistics',
730
791
  add_args_function=add_format_sumstats_args)
731
792
  def gwas_format_from_cli(args: argparse.Namespace):
732
793
  from gsMap.format_sumstats import gwas_format
733
794
  config = get_dataclass_from_parser(args, FormatSumstatsConfig)
734
- gwas_format(config)
795
+ gwas_format(config)
796
+
797
+ @register_cli(name='quick_mode',
798
+ description='Run all the gsMap pipeline in quick mode',
799
+ add_args_function=add_run_all_mode_args)
800
+ def run_all_mode_from_cli(args: argparse.Namespace):
801
+ from gsMap.run_all_mode import run_pipeline
802
+ config = get_dataclass_from_parser(args, RunAllModeConfig)
803
+ run_pipeline(config)