gsMap 1.62__py3-none-any.whl → 1.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN_VAE/adjacency_matrix.py +1 -1
- gsMap/GNN_VAE/model.py +5 -5
- gsMap/GNN_VAE/train.py +1 -1
- gsMap/__init__.py +1 -1
- gsMap/cauchy_combination_test.py +14 -36
- gsMap/config.py +473 -404
- gsMap/diagnosis.py +273 -0
- gsMap/find_latent_representation.py +22 -86
- gsMap/format_sumstats.py +79 -82
- gsMap/generate_ldscore.py +145 -78
- gsMap/latent_to_gene.py +65 -104
- gsMap/main.py +1 -9
- gsMap/report.py +160 -0
- gsMap/run_all_mode.py +195 -0
- gsMap/spatial_ldsc_multiple_sumstats.py +188 -113
- gsMap/templates/report_template.html +198 -0
- gsMap/utils/__init__.py +0 -0
- gsMap/{generate_r2_matrix.py → utils/generate_r2_matrix.py} +2 -10
- gsMap/{make_annotations.py → utils/make_annotations.py} +1 -43
- gsMap/utils/manhattan_plot.py +639 -0
- gsMap/{regression_read.py → utils/regression_read.py} +1 -1
- gsMap/visualize.py +100 -55
- {gsmap-1.62.dist-info → gsmap-1.64.dist-info}/METADATA +21 -46
- gsmap-1.64.dist-info/RECORD +30 -0
- gsmap-1.62.dist-info/RECORD +0 -24
- /gsMap/{jackknife.py → utils/jackknife.py} +0 -0
- {gsmap-1.62.dist-info → gsmap-1.64.dist-info}/LICENSE +0 -0
- {gsmap-1.62.dist-info → gsmap-1.64.dist-info}/WHEEL +0 -0
- {gsmap-1.62.dist-info → gsmap-1.64.dist-info}/entry_points.txt +0 -0
gsMap/config.py
CHANGED
@@ -1,36 +1,42 @@
|
|
1
1
|
import argparse
|
2
2
|
import logging
|
3
|
-
from dataclasses import dataclass, field
|
4
|
-
from pprint import pprint
|
5
|
-
from typing import Union, Literal
|
6
|
-
from pathlib import Path
|
7
|
-
|
8
3
|
from collections import OrderedDict, namedtuple
|
4
|
+
from dataclasses import dataclass
|
5
|
+
from pathlib import Path
|
6
|
+
from pprint import pprint
|
9
7
|
from typing import Callable
|
10
|
-
from
|
8
|
+
from typing import Union, Literal, Tuple, Optional, List
|
9
|
+
from functools import wraps
|
11
10
|
import pyfiglet
|
12
11
|
|
12
|
+
from gsMap.__init__ import __version__
|
13
|
+
|
13
14
|
# Global registry to hold functions
|
14
15
|
cli_function_registry = OrderedDict()
|
15
16
|
subcommand = namedtuple('subcommand', ['name', 'func', 'add_args_function', 'description'])
|
16
|
-
logger = logging.getLogger(__name__)
|
17
|
-
logger.setLevel(logging.DEBUG)
|
18
|
-
handler = logging.StreamHandler()
|
19
|
-
handler.setFormatter(logging.Formatter(
|
20
|
-
'[{asctime}] {levelname:6s} {message}', style='{'))
|
21
|
-
logger.addHandler(handler)
|
22
17
|
|
23
18
|
|
19
|
+
def get_gsMap_logger(logger_name):
|
20
|
+
logger = logging.getLogger(logger_name)
|
21
|
+
logger.setLevel(logging.DEBUG)
|
22
|
+
handler = logging.StreamHandler()
|
23
|
+
handler.setFormatter(logging.Formatter(
|
24
|
+
'[{asctime}] {levelname:.5s} | {name} - {message}', style='{'))
|
25
|
+
logger.addHandler(handler)
|
26
|
+
return logger
|
27
|
+
|
28
|
+
logger = get_gsMap_logger('gsMap')
|
29
|
+
|
24
30
|
# Decorator to register functions for cli parsing
|
25
31
|
def register_cli(name: str, description: str, add_args_function: Callable) -> Callable:
|
26
32
|
def decorator(func: Callable) -> Callable:
|
27
33
|
def wrapper(*args, **kwargs):
|
28
34
|
name.replace('_', ' ')
|
29
35
|
gsMap_main_logo = pyfiglet.figlet_format("gsMap", font='doom', width=80, justify='center', ).rstrip()
|
30
|
-
print(gsMap_main_logo, )
|
36
|
+
print(gsMap_main_logo, flush=True)
|
31
37
|
version_number = 'Version: ' + __version__
|
32
|
-
print(version_number.center(80))
|
33
|
-
print('=' * 80)
|
38
|
+
print(version_number.center(80), flush=True)
|
39
|
+
print('=' * 80, flush=True)
|
34
40
|
logger.info(f"Running {name}...")
|
35
41
|
func(*args, **kwargs)
|
36
42
|
logger.info(f"Finished running {name}.")
|
@@ -41,50 +47,28 @@ def register_cli(name: str, description: str, add_args_function: Callable) -> Ca
|
|
41
47
|
|
42
48
|
return decorator
|
43
49
|
|
50
|
+
def add_shared_args(parser):
|
51
|
+
parser.add_argument('--workdir', type=str, required=True, help='Path to the working directory.')
|
52
|
+
parser.add_argument('--sample_name', type=str, required=True, help='Name of the sample.')
|
44
53
|
|
45
54
|
def add_find_latent_representations_args(parser):
|
46
|
-
parser
|
47
|
-
parser.add_argument('--
|
48
|
-
parser.add_argument('--
|
49
|
-
parser.add_argument('--
|
50
|
-
parser.add_argument('--
|
51
|
-
parser.add_argument('--
|
52
|
-
|
53
|
-
|
54
|
-
parser.add_argument('--
|
55
|
-
|
56
|
-
parser.add_argument('--
|
57
|
-
|
58
|
-
parser.add_argument('--
|
59
|
-
|
60
|
-
parser.add_argument('--
|
61
|
-
|
62
|
-
parser.add_argument('--
|
63
|
-
help="Number of units in the second hidden layer of the GCN. Default is 30.")
|
64
|
-
parser.add_argument('--p_drop', default=0.1, type=float,
|
65
|
-
help="Dropout rate used in the GNN-VAE model. Default is 0.1.")
|
66
|
-
parser.add_argument('--gcn_lr', default=0.001, type=float,
|
67
|
-
help="Learning rate for the GCN network. Default is 0.001.")
|
68
|
-
parser.add_argument('--gcn_decay', default=0.01, type=float,
|
69
|
-
help="Weight decay (L2 penalty) for the GCN network. Default is 0.01.")
|
70
|
-
parser.add_argument('--n_neighbors', default=11, type=int,
|
71
|
-
help="Number of neighbors to consider for graph construction in GCN. Default is 11.")
|
72
|
-
parser.add_argument('--label_w', default=1, type=float,
|
73
|
-
help="Weight of the label loss in the loss function. Default is 1.")
|
74
|
-
parser.add_argument('--rec_w', default=1, type=float,
|
75
|
-
help="Weight of the reconstruction loss in the loss function. Default is 1.")
|
76
|
-
parser.add_argument('--n_comps', default=300, type=int,
|
77
|
-
help="Number of principal components to keep if PCA is performed. Default is 300.")
|
78
|
-
parser.add_argument('--weighted_adj', action='store_true',
|
79
|
-
help="Use a weighted adjacency matrix in GCN. Default is False.")
|
80
|
-
parser.add_argument('--nheads', default=3, type=int,
|
81
|
-
help="Number of heads in the attention mechanism of the GNN. Default is 3.")
|
82
|
-
parser.add_argument('--var', action='store_true',
|
83
|
-
help="Enable var. Use --var to enable. Default is False.")
|
84
|
-
parser.add_argument('--convergence_threshold', default=1e-4, type=float,
|
85
|
-
help="Threshold for convergence during training. Training stops if the loss change is below this threshold. Default is 1e-4.")
|
86
|
-
parser.add_argument('--hierarchically', action='store_true',
|
87
|
-
help="Find latent representations hierarchically. Use --hierarchically to enable. Default is False.")
|
55
|
+
add_shared_args(parser)
|
56
|
+
parser.add_argument('--input_hdf5_path', required=True, type=str, help='Path to the input HDF5 file.')
|
57
|
+
parser.add_argument('--annotation', required=True, type=str, help='Name of the annotation in adata.obs to use.')
|
58
|
+
parser.add_argument('--data_layer', required=True, type=str, help='Data layer for gene expression (e.g., "counts", "log1p").')
|
59
|
+
parser.add_argument('--epochs', type=int, default=300, help='Number of training epochs.')
|
60
|
+
parser.add_argument('--feat_hidden1', type=int, default=256, help='Neurons in the first hidden layer.')
|
61
|
+
parser.add_argument('--feat_hidden2', type=int, default=128, help='Neurons in the second hidden layer.')
|
62
|
+
parser.add_argument('--gat_hidden1', type=int, default=64, help='Units in the first GAT hidden layer.')
|
63
|
+
parser.add_argument('--gat_hidden2', type=int, default=30, help='Units in the second GAT hidden layer.')
|
64
|
+
parser.add_argument('--p_drop', type=float, default=0.1, help='Dropout rate.')
|
65
|
+
parser.add_argument('--gat_lr', type=float, default=0.001, help='Learning rate for the GAT.')
|
66
|
+
parser.add_argument('--n_neighbors', type=int, default=11, help='Number of neighbors for GAT.')
|
67
|
+
parser.add_argument('--n_comps', type=int, default=300, help='Number of principal components for PCA.')
|
68
|
+
parser.add_argument('--weighted_adj', action='store_true', help='Use weighted adjacency in GAT.')
|
69
|
+
parser.add_argument('--var', action='store_true', help='Enable variance calculations.')
|
70
|
+
parser.add_argument('--convergence_threshold', type=float, default=1e-4, help='Threshold for convergence.')
|
71
|
+
parser.add_argument('--hierarchically', action='store_true', help='Enable hierarchical latent representation finding.')
|
88
72
|
|
89
73
|
|
90
74
|
def chrom_choice(value):
|
@@ -109,297 +93,284 @@ def get_dataclass_from_parser(args: argparse.Namespace, data_class: dataclass):
|
|
109
93
|
return data_class(**remain_kwargs)
|
110
94
|
|
111
95
|
|
112
|
-
def add_generate_ldscore_args(parser):
|
113
|
-
parser.add_argument('--sample_name', type=str, required=True, help='Sample name')
|
114
|
-
parser.add_argument('--chrom', type=str, required=True, help='Chromosome number (1-22) or "all"')
|
115
|
-
parser.add_argument('--ldscore_save_dir', type=str, required=True, help='Directory to save ld score files')
|
116
|
-
parser.add_argument('--mkscore_feather_file', type=str, required=True, help='Mkscore feather file path')
|
117
|
-
|
118
|
-
# additional baseline annotation
|
119
|
-
parser.add_argument('--additional_baseline_annotation_dir_path', type=str, default=None,)
|
120
|
-
|
121
|
-
# reference panel
|
122
|
-
parser.add_argument('--bfile_root', type=str, required=True, help='Bfile root path')
|
123
|
-
parser.add_argument('--keep_snp_root', type=str, required=True, help='Keep SNP root path')
|
124
|
-
|
125
|
-
# Annotation by gene distance
|
126
|
-
parser.add_argument('--gtf_annotation_file', type=str, required=True, help='GTF file path')
|
127
|
-
parser.add_argument('--gene_window_size', type=int, default=50000, help='Gene window size')
|
128
|
-
|
129
|
-
# Enhancer annotation
|
130
|
-
parser.add_argument('--enhancer_annotation_file', type=str, default=None,
|
131
|
-
help='Enhancer annotation bed file path, optional.')
|
132
|
-
parser.add_argument('--snp_multiple_enhancer_strategy', type=str, default='max_mkscore',
|
133
|
-
choices=['max_mkscore', 'nearest_TSS'], help='Strategy for multiple enhancers per SNP')
|
134
|
-
parser.add_argument('--gene_window_enhancer_priority', type=str, default=None,
|
135
|
-
choices=['gene_window_first', 'enhancer_first', 'enhancer_only'],
|
136
|
-
help='Priority between gene window and enhancer')
|
137
|
-
|
138
|
-
# Arguments for calculating ld score
|
139
|
-
parser.add_argument('--spots_per_chunk', type=int, default=5_000, help='Number of spots per chunk')
|
140
|
-
parser.add_argument('--ld_wind', type=int, default=1, help='LD window size')
|
141
|
-
parser.add_argument('--ld_unit', type=str, default='CM', help='LD window unit (SNP/KB/CM)',
|
142
|
-
choices=['SNP', 'KB', 'CM'])
|
143
|
-
|
144
|
-
|
145
96
|
def add_latent_to_gene_args(parser):
|
146
|
-
parser
|
147
|
-
|
148
|
-
parser.add_argument('--
|
149
|
-
parser.add_argument('--
|
150
|
-
help='
|
151
|
-
parser.add_argument('--
|
152
|
-
parser.add_argument('--
|
153
|
-
|
154
|
-
|
155
|
-
parser.add_argument('--no_expression_fraction', action='store_true', default=False,
|
156
|
-
help='Flag to not use expression fraction as filter when calculate the maker score. Default is False.')
|
157
|
-
|
158
|
-
parser.add_argument('--latent_representation', type=str, default='latent_GVAE',
|
159
|
-
choices=['latent_GVAE', 'latent_PCA'],
|
160
|
-
help='Type of latent representation. Default is "latent_GVAE".')
|
161
|
-
parser.add_argument('--num_neighbour', type=int, default=21,
|
162
|
-
help='Number of neighbours to consider. Default is 21.')
|
163
|
-
parser.add_argument('--num_neighbour_spatial', type=int, default=101,
|
164
|
-
help='Number of spatial neighbours to consider. Default is 101.')
|
165
|
-
parser.add_argument('--species', type=str, default=None, help='Species name, which is the column name in the homolog gene conversion file.')
|
166
|
-
parser.add_argument('--gs_species', type=str, default=None, help='Homologous gene conversion file path, if applicable.')
|
167
|
-
parser.add_argument('--gM_slices', type=str, default=None, help='The mean Gene marker scores path across multiple slices.')
|
97
|
+
add_shared_args(parser)
|
98
|
+
parser.add_argument('--annotation', type=str, help='Name of the annotation in adata.obs to use. (optional).')
|
99
|
+
parser.add_argument('--no_expression_fraction', action='store_true', help='Skip expression fraction filtering.')
|
100
|
+
parser.add_argument('--latent_representation', type=str, choices=['latent_GVAE', 'latent_PCA'], default='latent_GVAE',
|
101
|
+
help='Type of latent representation.')
|
102
|
+
parser.add_argument('--num_neighbour', type=int, default=21, help='Number of neighbors.')
|
103
|
+
parser.add_argument('--num_neighbour_spatial', type=int, default=101, help='Number of spatial neighbors.')
|
104
|
+
# parser.add_argument('--species', type=str, help='Species name for homolog gene mapping (optional).')
|
105
|
+
parser.add_argument('--homolog_file', type=str, help='Path to homologous gene conversion file (optional).')
|
168
106
|
|
169
107
|
|
170
|
-
def
|
171
|
-
|
172
|
-
parser.add_argument('--
|
108
|
+
def add_generate_ldscore_args(parser):
|
109
|
+
add_shared_args(parser)
|
110
|
+
parser.add_argument('--chrom', type=str, required=True, help='Chromosome id (1-22) or "all".')
|
111
|
+
parser.add_argument('--bfile_root', type=str, required=True, help='Root path for genotype plink bfiles (.bim, .bed, .fam).')
|
112
|
+
parser.add_argument('--keep_snp_root', type=str, required=True, help='Root path for SNP files.')
|
113
|
+
parser.add_argument('--gtf_annotation_file', type=str, required=True, help='Path to GTF annotation file.')
|
114
|
+
parser.add_argument('--gene_window_size', type=int, default=50000, help='Gene window size in base pairs.')
|
115
|
+
parser.add_argument('--enhancer_annotation_file', type=str, help='Path to enhancer annotation file (optional).')
|
116
|
+
parser.add_argument('--snp_multiple_enhancer_strategy', type=str, choices=['max_mkscore', 'nearest_TSS'], default='max_mkscore',
|
117
|
+
help='Strategy for handling multiple enhancers per SNP.')
|
118
|
+
parser.add_argument('--gene_window_enhancer_priority', type=str, choices=['gene_window_first', 'enhancer_first', 'enhancer_only'],
|
119
|
+
help='Priority between gene window and enhancer annotations.')
|
120
|
+
parser.add_argument('--spots_per_chunk', type=int, default=1000, help='Number of spots per chunk.')
|
121
|
+
parser.add_argument('--ld_wind', type=int, default=1, help='LD window size.')
|
122
|
+
parser.add_argument('--ld_unit', type=str, choices=['SNP', 'KB', 'CM'], default='CM', help='Unit for LD window.')
|
123
|
+
parser.add_argument('--additional_baseline_annotation', type=str, default=None, help='Path of additional baseline annotations')
|
173
124
|
|
174
|
-
parser.add_argument('--sumstats_file', default=None, help="Path to GWAS summary statistics file.")
|
175
|
-
parser.add_argument('--sumstats_config_file', default=None, help="Path to GWAS summary statistics config file.")
|
176
|
-
parser.add_argument('--w_file', required=True, help="Path to regression weight file.")
|
177
|
-
parser.add_argument('--ldscore_input_dir', required=True, help="Input directory for LD Score files.")
|
178
|
-
parser.add_argument('--ldsc_save_dir', required=True, help="Directory to save Spatial LDSC results.")
|
179
|
-
parser.add_argument('--trait_name', default=None, help="Name of the trait.")
|
180
|
-
parser.add_argument('--not_M_5_50', action='store_true', help="Flag to not use M 5 50 in calculations.")
|
181
|
-
parser.add_argument('--n_blocks', type=int, default=200, help="Number of blocks for jackknife resampling.")
|
182
|
-
parser.add_argument('--chisq_max', type=int, help="Maximum chi-square value for filtering SNPs.")
|
183
|
-
parser.add_argument('--all_chunk', type=int, help="Number of chunks for processing spatial data.")
|
184
125
|
|
185
|
-
|
186
|
-
parser
|
126
|
+
def add_latent_to_gene_args(parser):
|
127
|
+
add_shared_args(parser)
|
128
|
+
parser.add_argument('--annotation', type=str, required=True, help='Name of the annotation layer.')
|
129
|
+
parser.add_argument('--no_expression_fraction', action='store_true', help='Skip expression fraction filtering.')
|
130
|
+
parser.add_argument('--latent_representation', type=str, choices=['latent_GVAE', 'latent_PCA'], default='latent_GVAE',
|
131
|
+
help='Type of latent representation.')
|
132
|
+
parser.add_argument('--num_neighbour', type=int, default=21, help='Number of neighbors.')
|
133
|
+
parser.add_argument('--num_neighbour_spatial', type=int, default=101, help='Number of spatial neighbors.')
|
134
|
+
# parser.add_argument('--species', type=str, help='Species name for homolog gene mapping (optional).')
|
135
|
+
parser.add_argument('--homolog_file', type=str, help='Path to homologous gene conversion file (optional).')
|
187
136
|
|
188
|
-
parser.add_argument('--num_processes', type=int, default=4, help="Number of processes for parallel computing.")
|
189
137
|
|
190
|
-
|
138
|
+
def add_spatial_ldsc_args(parser):
|
139
|
+
add_shared_args(parser)
|
140
|
+
parser.add_argument('--sumstats_file', type=str, required=True, help='Path to GWAS summary statistics file.')
|
141
|
+
parser.add_argument('--w_file', type=str, required=True, help='Path to regression weight file.')
|
142
|
+
parser.add_argument('--trait_name', type=str, required=True, help='Name of the trait being analyzed.')
|
143
|
+
parser.add_argument('--n_blocks', type=int, default=200, help='Number of blocks for jackknife resampling.')
|
144
|
+
parser.add_argument('--chisq_max', type=int, help='Maximum chi-square value for filtering SNPs.')
|
145
|
+
parser.add_argument('--num_processes', type=int, default=4, help='Number of processes for parallel computing.')
|
146
|
+
parser.add_argument('--use_additional_baseline_annotation', type=bool, nargs='?', const=True, default=True, help='Use additional baseline annotations when provided')
|
191
147
|
|
192
148
|
|
193
149
|
def add_Cauchy_combination_args(parser):
|
194
|
-
|
195
|
-
parser.add_argument('--
|
196
|
-
parser.add_argument('--
|
197
|
-
parser.add_argument('--
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
parser.add_argument('--
|
205
|
-
parser.add_argument('--
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
parser.add_argument('--
|
212
|
-
|
213
|
-
|
214
|
-
parser.add_argument('--
|
215
|
-
parser.add_argument('--
|
216
|
-
parser.add_argument('--
|
217
|
-
|
218
|
-
|
219
|
-
# Arguments with defaults
|
220
|
-
parser.add_argument('--fig_title', type=str, default=None, help='Title of figure')
|
221
|
-
parser.add_argument('--fig_height', type=int, default=600, help='Height of figure in pixels')
|
222
|
-
parser.add_argument('--fig_width', type=int, default=800, help='Width of figure in pixels')
|
223
|
-
parser.add_argument('--point_size', type=int, default=None, help='Point size of figure')
|
224
|
-
parser.add_argument('--fig_style', type=str, default='light', choices=['dark', 'light'], help='Plot style of figure')
|
225
|
-
|
226
|
-
def add_all_mode_args(parser):
|
227
|
-
parser.add_argument('--input_hdf5_path', required=True, type=str, help='Path to the input hdf5 file.')
|
228
|
-
parser.add_argument('--save_dir', required=True, type=str, help='Path to the running results.')
|
229
|
-
# output
|
230
|
-
# parser.add_argument('--output_hdf5_path', required=True, type=str, help='Path to the output hdf5 file.')
|
231
|
-
parser.add_argument('--sample_name', required=True, type=str, help='Name of the sample.')
|
232
|
-
parser.add_argument('--annotation', default=None, type=str, help='Name of the annotation layer.')
|
233
|
-
parser.add_argument('--type', default=None, type=str, help="The gene expression layer of the input data (e.g., 'count', 'counts', 'log1p').")
|
234
|
-
|
235
|
-
# latent_to_gene
|
236
|
-
# input
|
237
|
-
# parser.add_argument('--input_hdf5_path', type=str, required=True, help='Path to the input HDF5 file.')
|
238
|
-
# parser.add_argument('--sample_name', type=str, required=True, help='Name of the sample.')
|
239
|
-
# output
|
240
|
-
# parser.add_argument('--output_feather_path', type=str, required=True,
|
241
|
-
# help='Path to save output gene marker score feather file.')
|
242
|
-
# parser.add_argument('--annotation', default=None, type=str, help='Name of the annotation layer.')
|
243
|
-
# parser.add_argument('--type', default=None, type=str, help="Type of input data (e.g., 'count', 'counts').")
|
244
|
-
|
245
|
-
# no_expression_fraction
|
246
|
-
# no_expression_fraction
|
247
|
-
parser.add_argument('--no_expression_fraction', action='store_true', default=False,
|
248
|
-
help='Flag to not use expression fraction as filter when calculate the maker score. Default is False.')
|
249
|
-
|
250
|
-
parser.add_argument('--latent_representation', type=str, default='latent_GVAE',
|
251
|
-
choices=['latent_GVAE', 'latent_PCA'],
|
252
|
-
help='Type of latent representation. Default is "latent_GVAE".')
|
253
|
-
parser.add_argument('--num_neighbour', type=int, default=21,
|
254
|
-
help='Number of neighbours to consider. Default is 21.')
|
255
|
-
parser.add_argument('--num_neighbour_spatial', type=int, default=101,
|
256
|
-
help='Number of spatial neighbours to consider. Default is 101.')
|
257
|
-
parser.add_argument('--species', type=str, default=None, help='Species name, which is the column name in the homolog gene conversion file.')
|
258
|
-
parser.add_argument('--gs_species', type=str, default=None, help='Homologous gene conversion file path, if applicable.')
|
259
|
-
parser.add_argument('--gM_slices', type=str, default=None, help='The mean Gene marker scores path across multiple slices.')
|
260
|
-
|
261
|
-
|
262
|
-
# generate_ldscore
|
263
|
-
# parser.add_argument('--sample_name', type=str, required=True, help='Sample name')
|
264
|
-
# should be all
|
265
|
-
# parser.add_argument('--chrom', type=chrom_choice, required=True, help='Chromosome number (1-22) or "all"')
|
266
|
-
# output
|
267
|
-
# parser.add_argument('--ldscore_save_dir', type=str, required=True, help='Directory to save ld score files')
|
268
|
-
|
269
|
-
# reference panel
|
270
|
-
parser.add_argument('--bfile_root', type=str, required=True, help='Bfile root path')
|
271
|
-
parser.add_argument('--keep_snp_root', type=str, required=True, help='Keep SNP root path')
|
272
|
-
|
273
|
-
# Annotation by gene distance
|
274
|
-
parser.add_argument('--gtf_annotation_file', type=str, required=True, help='GTF file path')
|
275
|
-
parser.add_argument('--gene_window_size', type=int, default=50000, help='Gene window size')
|
276
|
-
|
277
|
-
# Enhancer annotation
|
278
|
-
parser.add_argument('--enhancer_annotation_file', type=str, default=None,
|
279
|
-
help='Enhancer annotation bed file path, optional.')
|
280
|
-
parser.add_argument('--snp_multiple_enhancer_strategy', type=str, default='max_mkscore',
|
281
|
-
choices=['max_mkscore', 'nearest_TSS'], help='Strategy for multiple enhancers per SNP')
|
282
|
-
parser.add_argument('--gene_window_enhancer_priority', type=str, default=None,
|
283
|
-
choices=['gene_window_first', 'enhancer_first', 'enhancer_only'],
|
284
|
-
help='Priority between gene window and enhancer')
|
285
|
-
|
286
|
-
# Arguments for calculating ld score
|
287
|
-
parser.add_argument('--spots_per_chunk', type=int, default=5_000, help='Number of spots per chunk')
|
288
|
-
parser.add_argument('--ld_wind', type=int, default=1, help='LD window size')
|
289
|
-
parser.add_argument('--ld_unit', type=str, default='CM', help='LD window unit (SNP/KB/CM)',
|
290
|
-
choices=['SNP', 'KB', 'CM'])
|
291
|
-
|
292
|
-
# spatial ldsc args:
|
293
|
-
parser.add_argument('--sumstats_file', default=None, help="Path to GWAS summary statistics file.")
|
294
|
-
parser.add_argument('--sumstats_config_file', default=None, help="Path to GWAS summary statistics config file.")
|
295
|
-
parser.add_argument('--w_file', required=True, help="Path to regression weight file.")
|
296
|
-
parser.add_argument('--ldscore_input_dir', required=True, help="Input directory for LD Score files.")
|
297
|
-
parser.add_argument('--ldsc_save_dir', required=True, help="Directory to save Spatial LDSC results.")
|
298
|
-
parser.add_argument('--trait_name', default=None, help="Name of the trait.")
|
299
|
-
parser.add_argument('--not_M_5_50', action='store_true', help="Flag to not use M 5 50 in calculations.")
|
300
|
-
parser.add_argument('--n_blocks', type=int, default=200, help="Number of blocks for jackknife resampling.")
|
301
|
-
parser.add_argument('--chisq_max', type=int, help="Maximum chi-square value for filtering SNPs.")
|
302
|
-
parser.add_argument('--all_chunk', type=int, help="Number of chunks for processing spatial data.")
|
303
|
-
|
304
|
-
|
305
|
-
def get_runall_mode_config(args: argparse.Namespace):
|
306
|
-
# output
|
307
|
-
args.output_hdf5_path = f'{args.save_dir}/{args.sample_name}/find_latent_representations/{args.sample_name}_add_latent.h5ad'
|
308
|
-
args.output_feather_path = f'{args.save_dir}/{args.sample_name}/latent_to_gene/{args.sample_name}_gene_marker_score.feather'
|
309
|
-
args.ldscore_save_dir = f'{args.save_dir}/{args.sample_name}/generate_ldscore'
|
310
|
-
args.ldsc_save_dir = f'{args.save_dir}/{args.sample_name}/spatial_ldsc'
|
311
|
-
args.output_cauchy_dir = f'{args.save_dir}/{args.sample_name}/cauchy_combination/'
|
312
|
-
|
313
|
-
# input
|
314
|
-
args.input_hdf5_with_latent_path = args.output_hdf5_path
|
315
|
-
args.mkscore_feather_file = args.output_feather_path
|
316
|
-
args.ldscore_input_dir = args.ldscore_save_dir
|
317
|
-
args.chrom = 'all'
|
318
|
-
args.input_ldsc_dir = args.ldsc_save_dir
|
319
|
-
args.input_spatial_ldsc = f'{args.save_dir}/{args.sample_name}/spatial_ldsc/{args.sample_name}_{args.trait_name}.gz'
|
320
|
-
# find_latent_representations
|
321
|
-
flr_config = get_dataclass_from_parser(args, FindLatentRepresentationsConfig)
|
322
|
-
# latent_to_gene
|
323
|
-
ltg_config = get_dataclass_from_parser(args, LatentToGeneConfig)
|
324
|
-
# generate_ldscore
|
325
|
-
gls_config = get_dataclass_from_parser(args, GenerateLDScoreConfig)
|
326
|
-
# spatial ldsc
|
327
|
-
ldsc_config = get_dataclass_from_parser(args, SpatialLDSCConfig)
|
328
|
-
# cauchy combination
|
329
|
-
cauchy_config = get_dataclass_from_parser(args, CauchyCombinationConfig)
|
330
|
-
return RunAllModeConfig(flr_config=flr_config, ltg_config=ltg_config, gls_config=gls_config,
|
331
|
-
ldsc_config=ldsc_config, cauchy_config=cauchy_config)
|
332
|
-
|
150
|
+
add_shared_args(parser)
|
151
|
+
parser.add_argument('--trait_name', type=str, required=True, help='Name of the trait being analyzed.')
|
152
|
+
parser.add_argument('--annotation', type=str, required=True, help='Name of the annotation in adata.obs to use.')
|
153
|
+
parser.add_argument('--meta', type=str, help='Optional meta information.')
|
154
|
+
parser.add_argument('--slide', type=str, help='Optional slide information.')
|
155
|
+
|
156
|
+
|
157
|
+
def add_report_args(parser):
|
158
|
+
add_shared_args(parser)
|
159
|
+
parser.add_argument('--trait_name', type=str, required=True, help='Name of the trait to generate the report for.')
|
160
|
+
parser.add_argument('--annotation', type=str, required=True, help='Annotation layer name.')
|
161
|
+
# parser.add_argument('--plot_type', type=str, choices=['manhattan', 'GSS', 'gsMap', 'all'], default='all',
|
162
|
+
# help="Type of diagnostic plot to generate. Choose from 'manhattan', 'GSS', 'gsMap', or 'all'.")
|
163
|
+
parser.add_argument('--top_corr_genes', type=int, default=50,
|
164
|
+
help='Number of top correlated genes to display.')
|
165
|
+
parser.add_argument('--selected_genes', type=str, nargs='*',
|
166
|
+
help='List of specific genes to include in the report (optional).')
|
167
|
+
parser.add_argument('--sumstats_file', type=str, required=True, help='Path to GWAS summary statistics file.')
|
168
|
+
|
169
|
+
# Optional arguments for customization
|
170
|
+
parser.add_argument('--fig_width', type=int, default=None, help='Width of the generated figures in pixels.')
|
171
|
+
parser.add_argument('--fig_height', type=int, default=None, help='Height of the generated figures in pixels.')
|
172
|
+
parser.add_argument('--point_size', type=int, default=None, help='Point size for the figures.')
|
173
|
+
parser.add_argument('--fig_style', type=str, default='light', choices=['dark', 'light'],
|
174
|
+
help='Style of the generated figures.')
|
333
175
|
|
334
176
|
def add_format_sumstats_args(parser):
|
335
177
|
# Required arguments
|
336
|
-
parser.add_argument('--sumstats', required=True, type=str,
|
178
|
+
parser.add_argument('--sumstats', required=True, type=str,
|
337
179
|
help='Path to gwas summary data')
|
338
|
-
parser.add_argument('--out', required=True, type=str,
|
180
|
+
parser.add_argument('--out', required=True, type=str,
|
339
181
|
help='Path to save the formatted gwas data')
|
340
|
-
|
182
|
+
|
341
183
|
# Arguments for specify column name
|
342
|
-
parser.add_argument('--snp', default=None,type=str,
|
184
|
+
parser.add_argument('--snp', default=None, type=str,
|
343
185
|
help="Name of snp column (if not a name that gsMap understands)")
|
344
|
-
parser.add_argument('--a1', default=None,type=str,
|
186
|
+
parser.add_argument('--a1', default=None, type=str,
|
345
187
|
help="Name of effect allele column (if not a name that gsMap understands)")
|
346
|
-
parser.add_argument('--a2', default=None,type=str,
|
188
|
+
parser.add_argument('--a2', default=None, type=str,
|
347
189
|
help="Name of none-effect allele column (if not a name that gsMap understands)")
|
348
|
-
parser.add_argument('--info', default=None,type=str,
|
190
|
+
parser.add_argument('--info', default=None, type=str,
|
349
191
|
help="Name of info column (if not a name that gsMap understands)")
|
350
|
-
parser.add_argument('--beta', default=None,type=str,
|
192
|
+
parser.add_argument('--beta', default=None, type=str,
|
351
193
|
help="Name of gwas beta column (if not a name that gsMap understands).")
|
352
|
-
parser.add_argument('--se', default=None,type=str,
|
194
|
+
parser.add_argument('--se', default=None, type=str,
|
353
195
|
help="Name of gwas standar error of beta column (if not a name that gsMap understands)")
|
354
|
-
parser.add_argument('--p', default=None,type=str,
|
196
|
+
parser.add_argument('--p', default=None, type=str,
|
355
197
|
help="Name of p-value column (if not a name that gsMap understands)")
|
356
|
-
parser.add_argument('--frq', default=None,type=str,
|
198
|
+
parser.add_argument('--frq', default=None, type=str,
|
357
199
|
help="Name of A1 ferquency column (if not a name that gsMap understands)")
|
358
|
-
parser.add_argument('--n', default=None,type=str,
|
200
|
+
parser.add_argument('--n', default=None, type=str,
|
359
201
|
help="Name of sample size column (if not a name that gsMap understands)")
|
360
|
-
parser.add_argument('--z', default=None,type=str,
|
202
|
+
parser.add_argument('--z', default=None, type=str,
|
361
203
|
help="Name of gwas Z-statistics column (if not a name that gsMap understands)")
|
362
|
-
parser.add_argument('--OR', default=None,type=str,
|
204
|
+
parser.add_argument('--OR', default=None, type=str,
|
363
205
|
help="Name of gwas OR column (if not a name that gsMap understands)")
|
364
|
-
parser.add_argument('--se_OR', default=None,type=str,
|
206
|
+
parser.add_argument('--se_OR', default=None, type=str,
|
365
207
|
help="Name of standar error of OR column (if not a name that gsMap understands)")
|
366
|
-
|
208
|
+
|
367
209
|
# Arguments for convert SNP (chr, pos) to rsid
|
368
|
-
parser.add_argument('--chr', default="Chr",type=str,
|
210
|
+
parser.add_argument('--chr', default="Chr", type=str,
|
369
211
|
help="Name of SNP chromosome column (if not a name that gsMap understands)")
|
370
|
-
parser.add_argument('--pos', default="Pos",type=str,
|
212
|
+
parser.add_argument('--pos', default="Pos", type=str,
|
371
213
|
help="Name of SNP positions column (if not a name that gsMap understands)")
|
372
|
-
parser.add_argument('--dbsnp', default=None,type=str,
|
214
|
+
parser.add_argument('--dbsnp', default=None, type=str,
|
373
215
|
help='Path to reference dnsnp file')
|
374
|
-
parser.add_argument('--chunksize', default=1e+6,type=int,
|
216
|
+
parser.add_argument('--chunksize', default=1e+6, type=int,
|
375
217
|
help='Chunk size for loading dbsnp file')
|
376
|
-
|
218
|
+
|
377
219
|
# Arguments for output format and quality
|
378
|
-
parser.add_argument('--format',default='gsMap', type=str,
|
379
|
-
help='Format of output data',choices=['gsMap', 'COJO'])
|
380
|
-
parser.add_argument('--info_min', default=0.9,type=float,
|
220
|
+
parser.add_argument('--format', default='gsMap', type=str,
|
221
|
+
help='Format of output data', choices=['gsMap', 'COJO'])
|
222
|
+
parser.add_argument('--info_min', default=0.9, type=float,
|
381
223
|
help='Minimum INFO score.')
|
382
|
-
parser.add_argument('--maf_min', default=0.01,type=float,
|
224
|
+
parser.add_argument('--maf_min', default=0.01, type=float,
|
383
225
|
help='Minimum MAF.')
|
384
226
|
parser.add_argument('--keep_chr_pos', action='store_true', default=False,
|
385
|
-
|
227
|
+
help='Keep SNP chromosome and position columns in the output data')
|
228
|
+
|
229
|
+
def add_run_all_mode_args(parser):
    """Register the command-line options of the one-shot (run-all) pipeline.

    The shared options are added first via ``add_shared_args``; the
    pipeline-specific options are then registered in the order listed below.

    Args:
        parser: The ``argparse`` parser (or sub-parser) to extend in place.
    """
    add_shared_args(parser)

    # (flag, keyword arguments) pairs, in registration order.
    option_table = (
        # Required paths and configurations
        ('--gsMap_resource_dir', dict(
            type=str, required=True,
            help='Directory containing gsMap resources (e.g., genome annotations, LD reference panel, etc.).')),
        ('--hdf5_path', dict(
            type=str, required=True,
            help='Path to the input spatial transcriptomics data (H5AD format).')),
        ('--annotation', dict(
            type=str, required=True,
            help='Name of the annotation in adata.obs to use.')),
        ('--data_layer', dict(
            type=str, default='X',
            help='Data layer of h5ad for gene expression (e.g., "counts", "log1p", "X").')),
        # GWAS data parameters
        ('--trait_name', dict(
            type=str,
            help='Name of the trait for GWAS analysis (required if sumstats_file is provided).')),
        ('--sumstats_file', dict(
            type=str,
            help='Path to GWAS summary statistics file. Either sumstats_file or sumstats_config_file is required.')),
        ('--sumstats_config_file', dict(
            type=str,
            help='Path to GWAS summary statistics config file. Either sumstats_file or sumstats_config_file is required.')),
        # Homolog data parameters
        ('--homolog_file', dict(
            type=str,
            help='Path to homologous gene for converting gene names from different species to human (optional, used for cross-species analysis).')),
        # Maximum number of processes
        ('--max_processes', dict(
            type=int, default=10,
            help='Maximum number of processes for parallel execution.')),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
|
256
|
+
|
257
|
+
# # Optional paths for customization
|
258
|
+
# parser.add_argument('--bfile_root', type=str,
|
259
|
+
# help='Root path to PLINK bfiles (LD reference panel). If not provided, it will use the default in gsMap_resource_dir.')
|
260
|
+
# parser.add_argument('--keep_snp_root', type=str,
|
261
|
+
# help='Root path for SNP filtering. If not provided, it will use the default in gsMap_resource_dir.')
|
262
|
+
# parser.add_argument('--w_file', type=str,
|
263
|
+
# help='Path to the regression weight file. If not provided, it will use the default in gsMap_resource_dir.')
|
264
|
+
# parser.add_argument('--snp_gene_weight_adata_path', type=str,
|
265
|
+
# help='Path to the SNP-gene weight matrix file. If not provided, it will use the default in gsMap_resource_dir.')
|
266
|
+
# parser.add_argument('--baseline_annotation_dir', type=str,
|
267
|
+
# help='Directory containing the baseline annotations for quick mode. If not provided, it will use the default in gsMap_resource_dir.')
|
268
|
+
# parser.add_argument('--SNP_gene_pair_dir', type=str,
|
269
|
+
# help='Directory for SNP-gene pair data. If not provided, it will use the default in gsMap_resource_dir.')
|
270
|
+
|
271
|
+
|
272
|
+
def ensure_path_exists(func):
    """Decorator that creates the directory behind a returned ``Path``.

    After calling ``func``, a ``Path`` result is inspected:

    * if its final component has a suffix it is treated as a file path and
      only its parent directory is created;
    * otherwise it is treated as a directory and created itself.

    Results that are not ``Path`` instances are passed through untouched.
    Note that the file/directory decision is purely suffix-based, so a
    directory whose name yields a non-empty ``Path.suffix`` is handled like a
    file (only its parent is created).

    Args:
        func: Callable whose return value may be a ``Path``.

    Returns:
        The wrapped callable, returning exactly what ``func`` returns.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        path = func(*args, **kwargs)
        if not isinstance(path, Path):
            return path
        # A suffix marks a file: create its parent. No suffix: create the path itself.
        directory = path.parent if path.suffix else path
        directory.mkdir(parents=True, exist_ok=True, mode=0o755)
        return path

    return wrapper
|
284
|
+
|
386
285
|
|
387
286
|
@dataclass
class ConfigWithAutoPaths:
    """Base configuration that derives every output location from two values.

    All paths live under ``{workdir}/{sample_name}/``. Members decorated with
    ``ensure_path_exists`` create the required directory as a side effect of
    being read or called.
    """
    workdir: str
    sample_name: str

    def __post_init__(self):
        """Reject a missing working directory."""
        if self.workdir is None:
            raise ValueError('workdir must be provided.')

    # ---- per-step outputs -------------------------------------------------

    @property
    @ensure_path_exists
    def hdf5_with_latent_path(self) -> Path:
        """H5AD file written by the find_latent_representations step."""
        location = f'{self.workdir}/{self.sample_name}/find_latent_representations/{self.sample_name}_add_latent.h5ad'
        return Path(location)

    @property
    @ensure_path_exists
    def mkscore_feather_path(self) -> Path:
        """Feather file holding the gene marker scores (latent_to_gene step)."""
        location = f'{self.workdir}/{self.sample_name}/latent_to_gene/{self.sample_name}_gene_marker_score.feather'
        return Path(location)

    @property
    @ensure_path_exists
    def ldscore_save_dir(self) -> Path:
        """Directory of the generate_ldscore step."""
        location = f'{self.workdir}/{self.sample_name}/generate_ldscore'
        return Path(location)

    @property
    @ensure_path_exists
    def ldsc_save_dir(self) -> Path:
        """Directory of the spatial_ldsc step."""
        location = f'{self.workdir}/{self.sample_name}/spatial_ldsc'
        return Path(location)

    @property
    @ensure_path_exists
    def cauchy_save_dir(self) -> Path:
        """Directory of the cauchy_combination step."""
        location = f'{self.workdir}/{self.sample_name}/cauchy_combination'
        return Path(location)

    # ---- per-trait report locations ----------------------------------------

    @ensure_path_exists
    def get_report_dir(self, trait_name: str) -> Path:
        """Report directory of ``trait_name``."""
        location = f'{self.workdir}/{self.sample_name}/report/{trait_name}'
        return Path(location)

    def get_gsMap_report_file(self, trait_name: str) -> Path:
        """HTML report file inside the trait's report directory."""
        report_dir = self.get_report_dir(trait_name)
        return report_dir / f'{self.sample_name}_{trait_name}_gsMap_Report.html'

    @ensure_path_exists
    def get_manhattan_html_plot_path(self, trait_name: str) -> Path:
        """HTML file of the diagnostic Manhattan plot for ``trait_name``."""
        location = f'{self.workdir}/{self.sample_name}/report/{trait_name}/manhattan_plot/{self.sample_name}_{trait_name}_Diagnostic_Manhattan_Plot.html'
        return Path(location)

    @ensure_path_exists
    def get_GSS_plot_dir(self, trait_name: str) -> Path:
        """Directory of the GSS plots for ``trait_name``."""
        location = f'{self.workdir}/{self.sample_name}/report/{trait_name}/GSS_plot'
        return Path(location)

    def get_GSS_plot_select_gene_file(self, trait_name: str) -> Path:
        """``plot_genes.csv`` inside the trait's GSS plot directory."""
        gss_dir = self.get_GSS_plot_dir(trait_name)
        return gss_dir / 'plot_genes.csv'

    @ensure_path_exists
    def get_ldsc_result_file(self, trait_name: str) -> Path:
        """Compressed CSV with the spatial LDSC result for ``trait_name``."""
        location = f'{self.ldsc_save_dir}/{self.sample_name}_{trait_name}.csv.gz'
        return Path(location)

    @ensure_path_exists
    def get_cauchy_result_file(self, trait_name: str) -> Path:
        """Compressed CSV with the Cauchy combination result for ``trait_name``."""
        location = f'{self.cauchy_save_dir}/{self.sample_name}_{trait_name}.Cauchy.csv.gz'
        return Path(location)

    @ensure_path_exists
    def get_gene_diagnostic_info_save_path(self, trait_name: str) -> Path:
        """CSV file with the gene diagnostic information for ``trait_name``."""
        location = f'{self.workdir}/{self.sample_name}/report/{trait_name}/{self.sample_name}_{trait_name}_Gene_Diagnostic_Info.csv'
        return Path(location)

    @ensure_path_exists
    def get_gsMap_plot_save_dir(self, trait_name: str) -> Path:
        """Directory of the gsMap plots for ``trait_name``."""
        location = f'{self.workdir}/{self.sample_name}/report/{trait_name}/gsMap_plot'
        return Path(location)

    def get_gsMap_html_plot_save_path(self, trait_name: str) -> Path:
        """HTML gsMap plot file inside the trait's gsMap plot directory."""
        plot_dir = self.get_gsMap_plot_save_dir(trait_name)
        return plot_dir / f'{self.sample_name}_{trait_name}_gsMap_plot.html'
|
358
|
+
|
359
|
+
@dataclass
|
360
|
+
class FindLatentRepresentationsConfig(ConfigWithAutoPaths):
|
361
|
+
input_hdf5_path: str
|
362
|
+
# output_hdf5_path: str
|
392
363
|
annotation: str = None
|
393
|
-
|
364
|
+
data_layer: str = None
|
394
365
|
|
395
366
|
epochs: int = 300
|
396
367
|
feat_hidden1: int = 256
|
397
368
|
feat_hidden2: int = 128
|
398
369
|
feat_cell: int = 3000
|
399
|
-
|
400
|
-
|
370
|
+
gat_hidden1: int = 64
|
371
|
+
gat_hidden2: int = 30
|
401
372
|
p_drop: float = 0.1
|
402
|
-
|
373
|
+
gat_lr: float = 0.001
|
403
374
|
gcn_decay: float = 0.01
|
404
375
|
n_neighbors: int = 11
|
405
376
|
label_w: float = 1
|
@@ -413,6 +384,7 @@ class FindLatentRepresentationsConfig:
|
|
413
384
|
hierarchically: bool = False
|
414
385
|
|
415
386
|
def __post_init__(self):
|
387
|
+
# self.output_hdf5_path = self.hdf5_with_latent_path
|
416
388
|
if self.hierarchically:
|
417
389
|
if self.annotation is None:
|
418
390
|
raise ValueError('annotation must be provided if hierarchically is True.')
|
@@ -428,29 +400,43 @@ class FindLatentRepresentationsConfig:
|
|
428
400
|
|
429
401
|
|
430
402
|
@dataclass
class LatentToGeneConfig(ConfigWithAutoPaths):
    """Configuration of the latent-to-gene step.

    Input and output locations are derived from ``workdir`` / ``sample_name``
    by the base class rather than being passed explicitly.

    After initialisation ``species`` holds the first column name of the
    homolog file, or ``None`` when no homolog file was given.
    """
    no_expression_fraction: bool = False
    latent_representation: str = 'latent_GVAE'
    num_neighbour: int = 21
    num_neighbour_spatial: int = 101
    homolog_file: Optional[str] = None
    gM_slices: Optional[str] = None
    annotation: Optional[str] = None

    def __post_init__(self):
        """Validate ``workdir`` and, if given, the header of the homolog file.

        Raises:
            ValueError: If ``workdir`` is missing, or the first line of the
                homolog file does not contain exactly two whitespace-separated
                columns.
        """
        # Run the base-class validation, as the other config classes do.
        super().__post_init__()

        # Always define the attribute so it can be read in human mode as well.
        self.species = None

        if self.homolog_file is None:
            logger.info("No homolog file provided. Run in human mode.")
            return

        logger.info(f"User provided homolog file to map gene names to human: {self.homolog_file}")
        # Only the header line is needed to check the format of the homolog file.
        with open(self.homolog_file, 'r') as f:
            header_fields = f.readline().split()

        _n_col = len(header_fields)
        if _n_col != 2:
            raise ValueError(
                f"Invalid homolog file format. Expected 2 columns, first column should be other species gene name, second column should be human gene name. "
                f"Got {_n_col} columns in the first line.")

        first_col_name, second_col_name = header_fields
        self.species = first_col_name
        logger.info(
            f"Homolog file provided and will map gene name from column1:{first_col_name} to column2:{second_col_name}")
|
444
432
|
|
445
433
|
|
446
434
|
@dataclass
|
447
|
-
class GenerateLDScoreConfig:
|
448
|
-
sample_name: str
|
435
|
+
class GenerateLDScoreConfig(ConfigWithAutoPaths):
|
449
436
|
chrom: Union[int, str]
|
450
|
-
|
451
|
-
mkscore_feather_file: str
|
437
|
+
|
452
438
|
bfile_root: str
|
453
|
-
keep_snp_root: str
|
439
|
+
keep_snp_root: Optional[str]
|
454
440
|
|
455
441
|
# annotation by gene distance
|
456
442
|
gtf_annotation_file: str
|
@@ -459,15 +445,28 @@ class GenerateLDScoreConfig:
|
|
459
445
|
# annotation by enhancer
|
460
446
|
enhancer_annotation_file: str = None
|
461
447
|
snp_multiple_enhancer_strategy: Literal['max_mkscore', 'nearest_TSS'] = 'max_mkscore'
|
462
|
-
gene_window_enhancer_priority: Literal['gene_window_first', 'enhancer_first', 'enhancer_only',] = None
|
448
|
+
gene_window_enhancer_priority: Optional[Literal['gene_window_first', 'enhancer_first', 'enhancer_only',]] = None
|
463
449
|
|
464
450
|
# for calculating ld score
|
465
|
-
|
466
|
-
spots_per_chunk: int =
|
451
|
+
additional_baseline_annotation: str = None
|
452
|
+
spots_per_chunk: int = 1_000
|
467
453
|
ld_wind: int = 1
|
468
454
|
ld_unit: str = 'CM'
|
469
455
|
|
456
|
+
# zarr config
|
457
|
+
ldscore_save_format: Literal['feather', 'zarr', 'quick_mode'] = 'feather'
|
458
|
+
|
459
|
+
zarr_chunk_size: Tuple[int, int] = None
|
460
|
+
|
461
|
+
# for pre calculating the SNP Gene ldscore Weight
|
462
|
+
save_pre_calculate_snp_gene_weight_matrix: bool = False
|
463
|
+
|
464
|
+
baseline_annotation_dir: Optional[str] = None
|
465
|
+
SNP_gene_pair_dir: Optional[str] = None
|
470
466
|
def __post_init__(self):
|
467
|
+
# if self.mkscore_feather_file is None:
|
468
|
+
# self.mkscore_feather_file = self._get_mkscore_feather_path()
|
469
|
+
|
471
470
|
if self.enhancer_annotation_file is not None and self.gene_window_enhancer_priority is None:
|
472
471
|
logger.warning("enhancer_annotation_file is provided but gene_window_enhancer_priority is not provided. "
|
473
472
|
"by default, gene_window_enhancer_priority is set to 'enhancer_only', when enhancer_annotation_file is provided.")
|
@@ -489,42 +488,55 @@ class GenerateLDScoreConfig:
|
|
489
488
|
f'Only gene window annotation will be used to calculate LD score. SNP within +-{self.gene_window_size} bp of gene body will be used. ')
|
490
489
|
|
491
490
|
# remind for baseline annotation
|
492
|
-
if self.
|
491
|
+
if self.additional_baseline_annotation is None:
|
493
492
|
logger.info(f'------Baseline annotation is not provided. Default baseline annotation will be used.')
|
494
493
|
else:
|
495
|
-
logger.info(
|
496
|
-
|
494
|
+
logger.info(
|
495
|
+
f'------Baseline annotation is provided. Additional baseline annotation will be used with the default baseline annotation.')
|
496
|
+
logger.info(f'------Baseline annotation directory: {self.additional_baseline_annotation}')
|
497
497
|
# check the existence of baseline annotation
|
498
498
|
if self.chrom == 'all':
|
499
499
|
for chrom in range(1, 23):
|
500
500
|
chrom = str(chrom)
|
501
|
-
baseline_annotation_path = Path(
|
501
|
+
baseline_annotation_path = Path(
|
502
|
+
self.additional_baseline_annotation) / f'baseline.{chrom}.annot.gz'
|
502
503
|
if not baseline_annotation_path.exists():
|
503
|
-
raise FileNotFoundError(
|
504
|
+
raise FileNotFoundError(
|
505
|
+
f'baseline.{chrom}.annot.gz is not found in {self.additional_baseline_annotation}.')
|
504
506
|
else:
|
505
|
-
baseline_annotation_path = Path(
|
507
|
+
baseline_annotation_path = Path(
|
508
|
+
self.additional_baseline_annotation) / f'baseline.{self.chrom}.annot.gz'
|
506
509
|
if not baseline_annotation_path.exists():
|
507
|
-
raise FileNotFoundError(
|
510
|
+
raise FileNotFoundError(
|
511
|
+
f'baseline.{self.chrom}.annot.gz is not found in {self.additional_baseline_annotation}.')
|
508
512
|
|
513
|
+
# set the default zarr chunk size
|
514
|
+
if self.ldscore_save_format == 'zarr' and self.zarr_chunk_size is None:
|
515
|
+
self.zarr_chunk_size = (10_000, self.spots_per_chunk)
|
509
516
|
|
510
517
|
|
511
518
|
@dataclass
|
512
|
-
class SpatialLDSCConfig:
|
513
|
-
sample_name: str
|
519
|
+
class SpatialLDSCConfig(ConfigWithAutoPaths):
|
514
520
|
w_file: str
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
sumstats_config_file: str = None
|
521
|
+
# ldscore_save_dir: str
|
522
|
+
use_additional_baseline_annotation: bool = True
|
523
|
+
trait_name: Optional[str] = None
|
524
|
+
sumstats_file: Optional[str] = None
|
525
|
+
sumstats_config_file: Optional[str] = None
|
521
526
|
num_processes: int = 4
|
522
527
|
not_M_5_50: bool = False
|
523
528
|
n_blocks: int = 200
|
524
|
-
chisq_max: int = None
|
525
|
-
all_chunk: int = None
|
529
|
+
chisq_max: Optional[int] = None
|
530
|
+
all_chunk: Optional[int] = None
|
531
|
+
chunk_range: Optional[Tuple[int, int]] = None
|
532
|
+
|
533
|
+
ldscore_save_format: Literal['feather', 'zarr', 'quick_mode'] = 'feather'
|
534
|
+
|
535
|
+
spots_per_chunk_quick_mode: int = 1_000
|
536
|
+
snp_gene_weight_adata_path: Optional[str] = None
|
526
537
|
|
527
538
|
def __post_init__(self):
|
539
|
+
super().__post_init__()
|
528
540
|
if self.sumstats_file is None and self.sumstats_config_file is None:
|
529
541
|
raise ValueError('One of sumstats_file and sumstats_config_file must be provided.')
|
530
542
|
if self.sumstats_file is not None and self.sumstats_config_file is not None:
|
@@ -540,62 +552,54 @@ class SpatialLDSCConfig:
|
|
540
552
|
with open(self.sumstats_config_file) as f:
|
541
553
|
config = yaml.load(f, Loader=yaml.FullLoader)
|
542
554
|
for trait_name, sumstats_file in config.items():
|
543
|
-
|
555
|
+
assert Path(sumstats_file).exists(), f'{sumstats_file} does not exist.'
|
544
556
|
# load the sumstats file
|
545
557
|
elif self.sumstats_file is not None:
|
546
558
|
self.sumstats_config_dict[self.trait_name] = self.sumstats_file
|
547
559
|
else:
|
548
560
|
raise ValueError('One of sumstats_file and sumstats_config_file must be provided.')
|
549
561
|
|
562
|
+
for sumstats_file in self.sumstats_config_dict.values():
|
563
|
+
assert Path(sumstats_file).exists(), f'{sumstats_file} does not exist.'
|
564
|
+
|
550
565
|
# check if additional baseline annotation is exist
|
551
|
-
self.use_additional_baseline_annotation = False
|
552
|
-
|
566
|
+
# self.use_additional_baseline_annotation = False
|
567
|
+
|
568
|
+
if self.use_additional_baseline_annotation:
|
569
|
+
self.process_additional_baseline_annotation()
|
553
570
|
|
554
571
|
def process_additional_baseline_annotation(self):
    """Detect and validate the optional additional baseline annotation.

    Looks for ``additional_baseline`` inside ``self.ldscore_save_dir``. If the
    directory is absent, ``use_additional_baseline_annotation`` is switched
    off and nothing else happens. If it is present, a
    ``baseline.{chrom}.l2.ldscore.feather`` file must exist for each of
    chromosomes 1-22.

    Raises:
        FileNotFoundError: If the directory exists but one of the
            per-chromosome feather files is missing.
    """
    additional_baseline_annotation = Path(self.ldscore_save_dir) / 'additional_baseline'

    if not additional_baseline_annotation.exists():
        # Nothing was generated for this sample: silently fall back to the default baseline.
        self.use_additional_baseline_annotation = False
        return

    logger.info(
        f'------Additional baseline annotation is provided. It will be used with the default baseline annotation.')
    logger.info(f'------Additional baseline annotation directory: {additional_baseline_annotation}')

    for chrom in range(1, 23):
        ldscore_file_name = f'baseline.{chrom}.l2.ldscore.feather'
        if not (additional_baseline_annotation / ldscore_file_name).exists():
            # Report the file that was actually looked for.
            raise FileNotFoundError(
                f'{ldscore_file_name} is not found in {additional_baseline_annotation}.')
|
590
599
|
|
591
600
|
|
592
|
-
logger
|
593
601
|
@dataclass
|
594
|
-
class CauchyCombinationConfig:
|
595
|
-
input_hdf5_path: str
|
596
|
-
input_ldsc_dir: str
|
597
|
-
output_cauchy_dir: str
|
598
|
-
sample_name: str
|
602
|
+
class CauchyCombinationConfig(ConfigWithAutoPaths):
|
599
603
|
trait_name: str
|
600
604
|
annotation: str
|
601
605
|
meta: str = None
|
@@ -603,11 +607,7 @@ class CauchyCombinationConfig:
|
|
603
607
|
|
604
608
|
|
605
609
|
@dataclass
|
606
|
-
class VisualizeConfig:
|
607
|
-
input_hdf5_path: str
|
608
|
-
input_ldsc_dir: str
|
609
|
-
output_figure_dir: str
|
610
|
-
sample_name: str
|
610
|
+
class VisualizeConfig(ConfigWithAutoPaths):
|
611
611
|
trait_name: str
|
612
612
|
|
613
613
|
annotation: str = None
|
@@ -619,12 +619,91 @@ class VisualizeConfig:
|
|
619
619
|
|
620
620
|
|
621
621
|
@dataclass
class DiagnosisConfig(ConfigWithAutoPaths):
    """Configuration of the diagnosis step.

    ``fig_width``, ``fig_height`` and ``point_size`` must be given together or
    not at all. After initialisation ``customize_fig`` records whether they
    were given.
    """
    annotation: str

    trait_name: str
    sumstats_file: str
    plot_type: Literal['manhattan', 'GSS', 'gsMap', 'all'] = 'all'
    top_corr_genes: int = 50
    selected_genes: Optional[List[str]] = None

    fig_width: Optional[int] = None
    fig_height: Optional[int] = None
    point_size: Optional[int] = None
    fig_style: Literal['dark', 'light'] = 'light'

    def __post_init__(self):
        """Validate ``workdir`` and the figure customisation options.

        Raises:
            ValueError: If ``workdir`` is missing, or only some of
                ``fig_width``, ``fig_height`` and ``point_size`` are provided.
        """
        # Run the base-class validation, as the other config classes do.
        super().__post_init__()

        figure_options = (self.fig_width, self.fig_height, self.point_size)
        if not any(figure_options):
            self.customize_fig = False
            return

        logger.info('Customizing the figure size and point size.')
        # Raise explicitly: an assert would be skipped when Python runs with -O.
        if not all(figure_options):
            raise ValueError('All of fig_width, fig_height, and point_size must be provided.')
        self.customize_fig = True
|
644
|
+
@dataclass
class ReportConfig(DiagnosisConfig):
    """Configuration of the report step; identical to ``DiagnosisConfig``."""
|
647
|
+
|
648
|
+
|
649
|
+
@dataclass
class RunAllModeConfig(ConfigWithAutoPaths):
    """Configuration of the one-shot pipeline.

    Resource locations are derived from ``gsMap_resource_dir`` during
    initialisation. ``sumstats_config_dict`` then maps each trait name to its
    GWAS summary statistics file.
    """
    gsMap_resource_dir: str

    # == ST DATA PARAMETERS ==
    hdf5_path: str
    annotation: str
    data_layer: str = 'X'

    # == GWAS DATA PARAMETERS ==
    trait_name: Optional[str] = None
    sumstats_file: Optional[str] = None
    sumstats_config_file: Optional[str] = None

    # == HOMOLOG PARAMETERS ==
    homolog_file: Optional[str] = None

    max_processes: int = 10

    def __post_init__(self):
        """Derive resource paths and validate the inputs.

        Raises:
            ValueError: If ``workdir`` is missing, the sumstats options are
                inconsistent, or the sumstats config file is not a mapping.
            FileNotFoundError: If the H5AD file, the GTF file or any summary
                statistics file does not exist.
        """
        super().__post_init__()

        resource_dir = self.gsMap_resource_dir
        self.gtffile = f"{resource_dir}/genome_annotation/gtf/gencode.v39lift37.annotation.gtf"
        self.bfile_root = f"{resource_dir}/LD_Reference_Panel/1000G_EUR_Phase3_plink/1000G.EUR.QC"
        self.keep_snp_root = f"{resource_dir}/LDSC_resource/hapmap3_snps/hm"
        self.w_file = f"{resource_dir}/LDSC_resource/weights_hm3_no_hla/weights."
        self.snp_gene_weight_adata_path = f"{resource_dir}/quick_mode/snp_gene_weight_matrix.h5ad"
        self.baseline_annotation_dir = Path(f"{resource_dir}/quick_mode/baseline").resolve()
        self.SNP_gene_pair_dir = Path(f"{resource_dir}/quick_mode/SNP_gene_pair").resolve()

        # Check the existence of the input file and the resource file.
        for file in [self.hdf5_path, self.gtffile]:
            if not Path(file).exists():
                raise FileNotFoundError(f"File {file} does not exist.")

        # Exactly one sumstats source; trait_name accompanies sumstats_file only.
        if self.sumstats_file is None and self.sumstats_config_file is None:
            raise ValueError('One of sumstats_file and sumstats_config_file must be provided.')
        if self.sumstats_file is not None and self.sumstats_config_file is not None:
            raise ValueError('Only one of sumstats_file and sumstats_config_file must be provided.')
        if self.sumstats_file is not None and self.trait_name is None:
            raise ValueError('trait_name must be provided if sumstats_file is provided.')
        if self.sumstats_config_file is not None and self.trait_name is not None:
            raise ValueError('trait_name must not be provided if sumstats_config_file is provided.')

        self.sumstats_config_dict = {}
        if self.sumstats_config_file is not None:
            import yaml
            with open(self.sumstats_config_file) as f:
                # NOTE(review): yaml.load is run on a user-supplied file; consider
                # yaml.safe_load, which is sufficient for a plain trait -> path mapping.
                config = yaml.load(f, Loader=yaml.FullLoader)
            # An empty file loads as None and a list has no .items(); fail clearly.
            if not isinstance(config, dict):
                raise ValueError(
                    f'{self.sumstats_config_file} must contain a mapping of trait name to sumstats file.')
            self.sumstats_config_dict.update(config)
        else:
            # The checks above guarantee that sumstats_file and trait_name are both set here.
            self.sumstats_config_dict[self.trait_name] = self.sumstats_file

        # Raise explicitly: an assert would be skipped when Python runs with -O.
        for sumstats_file in self.sumstats_config_dict.values():
            if not Path(sumstats_file).exists():
                raise FileNotFoundError(f'{sumstats_file} does not exist.')
|
628
707
|
|
629
708
|
|
630
709
|
@dataclass
|
@@ -650,7 +729,7 @@ class FormatSumstatsConfig:
|
|
650
729
|
chunksize: int = 1e+7
|
651
730
|
info_min: float = 0.9
|
652
731
|
maf_min: float = 0.01
|
653
|
-
keep_chr_pos:bool = False
|
732
|
+
keep_chr_pos: bool = False
|
654
733
|
|
655
734
|
|
656
735
|
@register_cli(name='run_find_latent_representations',
|
@@ -698,37 +777,27 @@ def run_Cauchy_combination_from_cli(args: argparse.Namespace):
|
|
698
777
|
run_Cauchy_combination(config)
|
699
778
|
|
700
779
|
|
701
|
-
@register_cli(name='
|
702
|
-
description='
|
703
|
-
add_args_function=
|
704
|
-
def
|
705
|
-
from gsMap.
|
706
|
-
config = get_dataclass_from_parser(args,
|
707
|
-
|
780
|
+
@register_cli(
    name='run_report',
    description='Run Report to generate diagnostic plots and tables',
    add_args_function=add_report_args,
)
def run_Report_from_cli(args: argparse.Namespace):
    """CLI entry point: build a ``ReportConfig`` from ``args`` and run the report."""
    # Imported lazily, inside the handler, as in the other CLI entry points.
    from gsMap.report import run_report
    run_report(get_dataclass_from_parser(args, ReportConfig))
|
708
787
|
|
709
788
|
|
710
|
-
@register_cli(name='run_all_mode',
|
711
|
-
description='Run gsMap method (the full process)',
|
712
|
-
add_args_function=add_all_mode_args)
|
713
|
-
def run_all_mode_from_cli(args: argparse.Namespace):
|
714
|
-
from gsMap.find_latent_representation import run_find_latent_representation
|
715
|
-
from gsMap.latent_to_gene import run_latent_to_gene
|
716
|
-
from gsMap.generate_ldscore import run_generate_ldscore
|
717
|
-
from gsMap.spatial_ldsc_multiple_sumstats import run_spatial_ldsc
|
718
|
-
from gsMap.cauchy_combination_test import run_Cauchy_combination
|
719
|
-
config = get_runall_mode_config(args)
|
720
|
-
run_find_latent_representation(config.flr_config)
|
721
|
-
run_latent_to_gene(config.ltg_config)
|
722
|
-
run_generate_ldscore(config.gls_config)
|
723
|
-
run_spatial_ldsc(config.ldsc_config)
|
724
|
-
if args.annotation is not None:
|
725
|
-
config.cauchy_config.annotation = args.annotation
|
726
|
-
run_Cauchy_combination(config.cauchy_config)
|
727
|
-
|
728
789
|
@register_cli(
    name='format_sumstats',
    description='Format gwas summary statistics',
    add_args_function=add_format_sumstats_args,
)
def gwas_format_from_cli(args: argparse.Namespace):
    """CLI entry point: build a ``FormatSumstatsConfig`` from ``args`` and format the sumstats."""
    # Imported lazily, inside the handler, as in the other CLI entry points.
    from gsMap.format_sumstats import gwas_format
    gwas_format(get_dataclass_from_parser(args, FormatSumstatsConfig))
|
796
|
+
|
797
|
+
@register_cli(
    name='quick_mode',
    description='Run all the gsMap pipeline in quick mode',
    add_args_function=add_run_all_mode_args,
)
def run_all_mode_from_cli(args: argparse.Namespace):
    """CLI entry point: build a ``RunAllModeConfig`` from ``args`` and run the pipeline."""
    # Imported lazily, inside the handler, as in the other CLI entry points.
    from gsMap.run_all_mode import run_pipeline
    run_pipeline(get_dataclass_from_parser(args, RunAllModeConfig))
|