gsMap 1.71.2__py3-none-any.whl → 1.73.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN/adjacency_matrix.py +25 -27
- gsMap/GNN/model.py +9 -7
- gsMap/GNN/train.py +8 -11
- gsMap/__init__.py +3 -3
- gsMap/__main__.py +3 -2
- gsMap/cauchy_combination_test.py +78 -75
- gsMap/config.py +948 -322
- gsMap/create_slice_mean.py +168 -0
- gsMap/diagnosis.py +179 -101
- gsMap/find_latent_representation.py +29 -27
- gsMap/format_sumstats.py +239 -201
- gsMap/generate_ldscore.py +334 -222
- gsMap/latent_to_gene.py +128 -68
- gsMap/main.py +23 -14
- gsMap/report.py +39 -25
- gsMap/run_all_mode.py +87 -46
- gsMap/setup.py +1 -1
- gsMap/spatial_ldsc_multiple_sumstats.py +154 -80
- gsMap/utils/generate_r2_matrix.py +100 -346
- gsMap/utils/jackknife.py +84 -80
- gsMap/utils/manhattan_plot.py +180 -207
- gsMap/utils/regression_read.py +83 -176
- gsMap/visualize.py +82 -64
- gsmap-1.73.0.dist-info/METADATA +169 -0
- gsmap-1.73.0.dist-info/RECORD +31 -0
- {gsmap-1.71.2.dist-info → gsmap-1.73.0.dist-info}/WHEEL +1 -1
- {gsmap-1.71.2.dist-info → gsmap-1.73.0.dist-info/licenses}/LICENSE +6 -6
- gsMap/utils/make_annotations.py +0 -518
- gsmap-1.71.2.dist-info/METADATA +0 -105
- gsmap-1.71.2.dist-info/RECORD +0 -31
- {gsmap-1.71.2.dist-info → gsmap-1.73.0.dist-info}/entry_points.txt +0 -0
gsMap/config.py
CHANGED
@@ -1,75 +1,237 @@
|
|
1
|
-
import sys
|
2
1
|
import argparse
|
2
|
+
import dataclasses
|
3
3
|
import logging
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
import threading
|
7
|
+
import time
|
4
8
|
from collections import OrderedDict, namedtuple
|
9
|
+
from collections.abc import Callable
|
5
10
|
from dataclasses import dataclass
|
11
|
+
from functools import wraps
|
6
12
|
from pathlib import Path
|
7
13
|
from pprint import pprint
|
8
|
-
from typing import
|
9
|
-
|
10
|
-
|
14
|
+
from typing import Literal
|
15
|
+
|
16
|
+
import psutil
|
11
17
|
import pyfiglet
|
18
|
+
import yaml
|
12
19
|
|
13
20
|
from gsMap.__init__ import __version__
|
14
21
|
|
15
22
|
# Global registry to hold functions
|
16
23
|
cli_function_registry = OrderedDict()
|
17
|
-
subcommand = namedtuple(
|
24
|
+
subcommand = namedtuple("subcommand", ["name", "func", "add_args_function", "description"])
|
18
25
|
|
19
26
|
|
20
27
|
def get_gsMap_logger(logger_name):
    """Return the logger called *logger_name*, configured for gsMap console output.

    The logger level is set to ``DEBUG`` and a ``StreamHandler`` using the
    gsMap line format is attached.

    ``logging.getLogger`` hands back the same object for the same name, so the
    handler is attached only once per logger; calling this function again for
    the same name no longer stacks handlers (which duplicated every message).

    Args:
        logger_name: Name passed to ``logging.getLogger``.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.DEBUG)

    # Handlers installed by other code are left alone; only our own marked
    # handler is checked, so a console handler is still added next to them.
    already_configured = any(
        getattr(existing, "_gsmap_stream_handler", False) for existing in logger.handlers
    )
    if not already_configured:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("[{asctime}] {levelname:.5s} | {name} - {message}", style="{")
        )
        handler._gsmap_stream_handler = True
        logger.addHandler(handler)
    return logger
|
28
36
|
|
29
|
-
|
37
|
+
|
38
|
+
logger = get_gsMap_logger("gsMap")
|
39
|
+
|
40
|
+
|
41
|
+
def track_resource_usage(func):
    """
    Decorator to track resource usage during function execution.

    While *func* runs, a daemon thread samples the current process roughly
    every 0.5 seconds for resident memory (RSS) and CPU utilisation. When
    *func* returns or raises, a summary (wall clock time, CPU time, average
    CPU utilisation, peak memory) is logged at INFO level on the ``"gsMap"``
    logger.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same call signature that returns whatever *func*
        returns; exceptions from *func* propagate after the summary is logged.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Get the current process
        process = psutil.Process(os.getpid())

        # Tracking state shared with the monitor thread
        peak_memory = 0  # in MB
        cpu_percent_samples = []
        stop_event = threading.Event()

        def resource_monitor():
            nonlocal peak_memory
            while not stop_event.is_set():
                try:
                    # Current memory usage in MB
                    current_memory = process.memory_info().rss / (1024 * 1024)
                    peak_memory = max(peak_memory, current_memory)

                    # CPU usage percentage since the previous call
                    cpu_percent = process.cpu_percent(interval=None)
                    if cpu_percent > 0:  # Skip initial zero readings
                        cpu_percent_samples.append(cpu_percent)
                except Exception:  # noqa: BLE001 - sampling is best-effort
                    pass
                # Wait outside the try so a failing sample can never turn the
                # loop into a busy spin; the wait ends early once stop_event is
                # set, so the wrapped call is not delayed on exit.
                stop_event.wait(0.5)

        # Start resource monitoring in a separate thread
        monitor_thread = threading.Thread(target=resource_monitor, daemon=True)
        monitor_thread.start()

        # Get start times (one cpu_times() snapshot so user/system are consistent)
        start_wall_time = time.time()
        cpu_at_start = process.cpu_times()
        start_cpu_time = cpu_at_start.user + cpu_at_start.system

        try:
            # Run the actual function
            return func(*args, **kwargs)
        finally:
            # Stop the monitoring thread
            stop_event.set()
            monitor_thread.join(timeout=1.0)

            # Calculate elapsed times
            cpu_at_end = process.cpu_times()
            wall_time = time.time() - start_wall_time
            cpu_time = (cpu_at_end.user + cpu_at_end.system) - start_cpu_time

            # Calculate average CPU percentage
            avg_cpu_percent = (
                sum(cpu_percent_samples) / len(cpu_percent_samples) if cpu_percent_samples else 0
            )

            wall_time_str = _format_duration(wall_time)
            cpu_time_str = _format_duration(cpu_time)
            memory_str = _format_memory_mb(peak_memory)

            # Log the resource usage
            usage_logger = logging.getLogger("gsMap")
            usage_logger.info("Resource usage summary:")
            usage_logger.info(f" • Wall clock time: {wall_time_str}")
            usage_logger.info(f" • CPU time: {cpu_time_str}")
            usage_logger.info(f" • Average CPU utilization: {avg_cpu_percent:.1f}%")
            usage_logger.info(f" • Peak memory usage: {memory_str}")

    return wrapper


def _format_duration(seconds):
    """Render a duration in seconds as seconds, minutes or hours (two decimals)."""
    if seconds < 60:
        return f"{seconds:.2f} seconds"
    if seconds < 3600:
        return f"{seconds / 60:.2f} minutes"
    return f"{seconds / 3600:.2f} hours"


def _format_memory_mb(megabytes):
    """Render a size given in MB as MB below 1024 and as GB otherwise."""
    if megabytes < 1024:
        return f"{megabytes:.2f} MB"
    return f"{megabytes / 1024:.2f} GB"
|
137
|
+
|
30
138
|
|
31
139
|
# Decorator to register functions for cli parsing
|
32
140
|
def register_cli(name: str, description: str, add_args_function: Callable) -> Callable:
    """Build a decorator that registers a function as a gsMap CLI subcommand.

    The decorated function is wrapped so that each invocation prints the gsMap
    banner and version, logs start and finish timestamps, and is measured by
    ``track_resource_usage``. The wrapper is stored in ``cli_function_registry``
    under *name* together with *add_args_function* and *description*.

    Args:
        name: Subcommand name; also the registry key and the name used in logs.
        description: Description stored in the registry entry.
        add_args_function: Callable stored in the registry entry for adding the
            subcommand's arguments.

    Returns:
        A decorator that returns the wrapped function.
    """

    def decorator(func: Callable) -> Callable:
        @track_resource_usage  # Use enhanced resource tracking
        @wraps(func)
        def wrapper(*args, **kwargs):
            # The previous bare ``name.replace("_", " ")`` statement discarded
            # its result and had no effect, so it is not reproduced here.
            gsMap_main_logo = pyfiglet.figlet_format(
                "gsMap",
                font="doom",
                width=80,
                justify="center",
            ).rstrip()
            print(gsMap_main_logo, flush=True)
            version_number = "Version: " + __version__
            print(version_number.center(80), flush=True)
            print("=" * 80, flush=True)
            logger.info(f"Running {name}...")

            # Record start time for the log message
            start_time = time.strftime("%Y-%m-%d %H:%M:%S")
            logger.info(f"Started at: {start_time}")

            result = func(*args, **kwargs)

            # Record end time for the log message
            end_time = time.strftime("%Y-%m-%d %H:%M:%S")
            logger.info(f"Finished running {name} at: {end_time}.")

            # Hand the wrapped function's result back instead of dropping it,
            # so programmatic callers get the same value as calling func directly.
            return result

        cli_function_registry[name] = subcommand(
            name=name, func=wrapper, add_args_function=add_args_function, description=description
        )
        return wrapper

    return decorator
|
50
174
|
|
175
|
+
|
176
|
+
def str_or_float(value):
    """Convert *value* to an ``int`` when it denotes a whole number, else return it unchanged.

    Used as an argparse ``type`` for an option that accepts either a number or
    a column name. Integer literals (``"5000"``) are converted as before.
    Whole-number float spellings such as ``"1e5"`` or ``"50000.0"`` are now
    recognised too, instead of being passed through as if they were names.
    Anything else, including non-integral numbers, ``"nan"`` and ``"inf"``, is
    returned as the original string.

    Args:
        value: The raw command-line string.

    Returns:
        An ``int`` for whole-number input, otherwise *value* itself.
    """
    try:
        return int(value)
    except ValueError:
        pass

    try:
        numeric = float(value)
    except ValueError:
        return value

    # is_integer() is False for inf and nan, so those stay strings.
    if numeric.is_integer():
        return int(numeric)
    return value
|
181
|
+
|
182
|
+
|
51
183
|
def add_shared_args(parser):
    """Register the two options shared by the sample-level subcommands.

    Adds ``--workdir`` and ``--sample_name`` to *parser*; both are required
    string options.
    """
    shared_options = (
        ("--workdir", "Path to the working directory."),
        ("--sample_name", "Name of the sample."),
    )
    for flag, help_text in shared_options:
        parser.add_argument(flag, type=str, required=True, help=help_text)
|
188
|
+
|
54
189
|
|
55
190
|
def add_find_latent_representations_args(parser):
    """Register the CLI options of the latent-representation subcommand on *parser*.

    Adds the shared ``--workdir`` / ``--sample_name`` options via
    ``add_shared_args``, the required input path and annotation name, and the
    optional model and training hyper-parameters with their defaults.
    """
    add_shared_args(parser)
    parser.add_argument(
        "--input_hdf5_path", required=True, type=str, help="Path to the input HDF5 file."
    )
    parser.add_argument(
        "--annotation", required=True, type=str, help="Name of the annotation in adata.obs to use."
    )
    # This option used to combine default="counts" with required=True, which
    # made the default unreachable. It is now optional so the declared default
    # applies; command lines that pass --data_layer behave exactly as before.
    parser.add_argument(
        "--data_layer",
        type=str,
        default="counts",
        help='Data layer for gene expression (e.g., "count", "counts", "log1p").',
    )
    parser.add_argument("--epochs", type=int, default=300, help="Number of training epochs.")
    parser.add_argument(
        "--feat_hidden1", type=int, default=256, help="Neurons in the first hidden layer."
    )
    parser.add_argument(
        "--feat_hidden2", type=int, default=128, help="Neurons in the second hidden layer."
    )
    parser.add_argument(
        "--gat_hidden1", type=int, default=64, help="Units in the first GAT hidden layer."
    )
    parser.add_argument(
        "--gat_hidden2", type=int, default=30, help="Units in the second GAT hidden layer."
    )
    parser.add_argument("--p_drop", type=float, default=0.1, help="Dropout rate.")
    parser.add_argument("--gat_lr", type=float, default=0.001, help="Learning rate for the GAT.")
    parser.add_argument("--n_neighbors", type=int, default=11, help="Number of neighbors for GAT.")
    parser.add_argument(
        "--n_comps", type=int, default=300, help="Number of principal components for PCA."
    )
    parser.add_argument(
        "--weighted_adj", action="store_true", help="Use weighted adjacency in GAT."
    )
    parser.add_argument(
        "--convergence_threshold", type=float, default=1e-4, help="Threshold for convergence."
    )
    parser.add_argument(
        "--hierarchically",
        action="store_true",
        help="Enable hierarchical latent representation finding.",
    )
|
73
235
|
|
74
236
|
|
75
237
|
def chrom_choice(value):
|
@@ -77,10 +239,12 @@ def chrom_choice(value):
|
|
77
239
|
ivalue = int(value)
|
78
240
|
if 1 <= ivalue <= 22:
|
79
241
|
return ivalue
|
80
|
-
elif value.lower() ==
|
242
|
+
elif value.lower() == "all":
|
81
243
|
return value
|
82
244
|
else:
|
83
|
-
raise argparse.ArgumentTypeError(
|
245
|
+
raise argparse.ArgumentTypeError(
|
246
|
+
f"'{value}' is an invalid chromosome choice. Choose from 1-22 or 'all'."
|
247
|
+
)
|
84
248
|
|
85
249
|
|
86
250
|
def filter_args_for_dataclass(args_dict, data_class: dataclass):
|
@@ -89,7 +253,7 @@ def filter_args_for_dataclass(args_dict, data_class: dataclass):
|
|
89
253
|
|
90
254
|
def get_dataclass_from_parser(args: argparse.Namespace, data_class: dataclass):
    """Instantiate *data_class* from parsed command-line arguments.

    The namespace is turned into a dict and passed through
    ``filter_args_for_dataclass`` (presumably keeping only the entries that
    *data_class* accepts - confirm against that helper). The resulting keyword
    arguments are echoed to stdout before the instance is built.

    Args:
        args: Namespace produced by argparse.
        data_class: The dataclass type to construct.

    Returns:
        ``data_class(**kwargs)`` built from the filtered arguments.
    """
    selected_kwargs = filter_args_for_dataclass(vars(args), data_class)

    banner = f"Using the following arguments for {data_class.__name__}:"
    print(banner, flush=True)
    pprint(selected_kwargs, indent=4)
    sys.stdout.flush()

    instance = data_class(**selected_kwargs)
    return instance
|
@@ -97,178 +261,423 @@ def get_dataclass_from_parser(args: argparse.Namespace, data_class: dataclass):
|
|
97
261
|
|
98
262
|
def add_latent_to_gene_args(parser):
    """Register the CLI options of the latent-to-gene subcommand on *parser*.

    The shared ``--workdir`` / ``--sample_name`` options are added first via
    ``add_shared_args``; every option declared here is optional.
    """
    add_shared_args(parser)

    # (flag, add_argument keyword arguments), in the order they are registered.
    option_specs = [
        (
            "--input_hdf5_path",
            {
                "type": str,
                "default": None,
                "help": "Path to the input HDF5 file with latent representations, if --latent_representation is specified.",
            },
        ),
        (
            "--no_expression_fraction",
            {"action": "store_true", "help": "Skip expression fraction filtering."},
        ),
        (
            "--latent_representation",
            {
                "type": str,
                "default": None,
                "help": "Type of latent representation. This should exist in the h5ad obsm.",
            },
        ),
        ("--num_neighbour", {"type": int, "default": 21, "help": "Number of neighbors."}),
        (
            "--num_neighbour_spatial",
            {"type": int, "default": 101, "help": "Number of spatial neighbors."},
        ),
        (
            "--homolog_file",
            {
                "type": str,
                "default": None,
                "help": "Path to homologous gene conversion file (optional).",
            },
        ),
        (
            "--gM_slices",
            {"type": str, "default": None, "help": "Path to the slice mean file (optional)."},
        ),
        (
            "--annotation",
            {
                "type": str,
                "default": None,
                "help": "Name of the annotation in adata.obs to use (optional).",
            },
        ),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
|
126
299
|
|
127
300
|
|
128
|
-
def
|
301
|
+
def add_generate_ldscore_args(parser):
    """Register the CLI options of the LD-score generation subcommand on *parser*.

    After the shared options from ``add_shared_args``, this adds the required
    chromosome selector and reference-file paths, followed by the optional
    window, enhancer, chunking and LD settings.
    """
    add_shared_args(parser)

    # (flag, add_argument keyword arguments), in the order they are registered.
    option_specs = [
        ("--chrom", {"type": str, "required": True, "help": 'Chromosome id (1-22) or "all".'}),
        (
            "--bfile_root",
            {
                "type": str,
                "required": True,
                "help": "Root path for genotype plink bfiles (.bim, .bed, .fam).",
            },
        ),
        ("--keep_snp_root", {"type": str, "required": True, "help": "Root path for SNP files."}),
        (
            "--gtf_annotation_file",
            {"type": str, "required": True, "help": "Path to GTF annotation file."},
        ),
        (
            "--gene_window_size",
            {"type": int, "default": 50000, "help": "Gene window size in base pairs."},
        ),
        (
            "--enhancer_annotation_file",
            {"type": str, "help": "Path to enhancer annotation file (optional)."},
        ),
        (
            "--snp_multiple_enhancer_strategy",
            {
                "type": str,
                "choices": ["max_mkscore", "nearest_TSS"],
                "default": "max_mkscore",
                "help": "Strategy for handling multiple enhancers per SNP.",
            },
        ),
        (
            "--gene_window_enhancer_priority",
            {
                "type": str,
                "choices": ["gene_window_first", "enhancer_first", "enhancer_only"],
                "help": "Priority between gene window and enhancer annotations.",
            },
        ),
        ("--spots_per_chunk", {"type": int, "default": 1000, "help": "Number of spots per chunk."}),
        ("--ld_wind", {"type": int, "default": 1, "help": "LD window size."}),
        (
            "--ld_unit",
            {
                "type": str,
                "choices": ["SNP", "KB", "CM"],
                "default": "CM",
                "help": "Unit for LD window.",
            },
        ),
        (
            "--additional_baseline_annotation",
            {"type": str, "default": None, "help": "Path of additional baseline annotations"},
        ),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
|
138
352
|
|
139
353
|
|
140
354
|
def add_spatial_ldsc_args(parser):
    """Register the CLI options of the spatial LDSC subcommand on *parser*.

    Adds the shared options via ``add_shared_args``, the required summary
    statistics, weight file and trait name, and the optional resampling,
    filtering and parallelism settings.

    ``--use_additional_baseline_annotation`` may be given bare (meaning true)
    or with an explicit value such as ``True`` / ``False``.
    """
    add_shared_args(parser)
    parser.add_argument(
        "--sumstats_file", type=str, required=True, help="Path to GWAS summary statistics file."
    )
    parser.add_argument(
        "--w_file", type=str, required=True, help="Path to regression weight file."
    )
    parser.add_argument(
        "--trait_name", type=str, required=True, help="Name of the trait being analyzed."
    )
    parser.add_argument(
        "--n_blocks", type=int, default=200, help="Number of blocks for jackknife resampling."
    )
    parser.add_argument(
        "--chisq_max", type=int, help="Maximum chi-square value for filtering SNPs."
    )
    parser.add_argument(
        "--num_processes", type=int, default=4, help="Number of processes for parallel computing."
    )
    # type=bool called bool() on the raw string, so "--flag False" evaluated to
    # True (any non-empty string is truthy). An explicit parser fixes that.
    parser.add_argument(
        "--use_additional_baseline_annotation",
        type=_parse_bool_flag,
        nargs="?",
        const=True,
        default=True,
        help="Use additional baseline annotations when provided",
    )


def _parse_bool_flag(value):
    """Parse a command-line boolean spelling into ``True`` or ``False``.

    Accepts true/t/yes/y/1 and false/f/no/n/0 (case-insensitive); an empty
    string maps to ``False``, matching the old ``bool("")`` result. Any other
    text raises ``argparse.ArgumentTypeError`` so the parser reports a usage error.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in {"true", "t", "yes", "y", "1"}:
        return True
    if normalized in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value (true/false), got '{value}'.")
|
149
382
|
|
150
383
|
|
151
384
|
def add_Cauchy_combination_args(parser):
    """Register the CLI options of the Cauchy combination subcommand on *parser*.

    Unlike most subcommands this does not call ``add_shared_args``:
    ``--workdir`` is required but ``--sample_name`` is optional here, and a
    space-separated ``--sample_name_list`` plus ``--output_file`` are offered
    alongside it.
    """
    # (flag, add_argument keyword arguments), in the order they are registered.
    option_specs = (
        ("--workdir", {"type": str, "required": True, "help": "Path to the working directory."}),
        ("--sample_name", {"type": str, "required": False, "help": "Name of the sample."}),
        (
            "--trait_name",
            {"type": str, "required": True, "help": "Name of the trait being analyzed."},
        ),
        (
            "--annotation",
            {"type": str, "required": True, "help": "Name of the annotation in adata.obs to use."},
        ),
        (
            "--sample_name_list",
            {
                "type": str,
                "nargs": "+",
                "required": False,
                "help": "List of sample names to process. Provide as a space-separated list.",
            },
        ),
        (
            "--output_file",
            {
                "type": str,
                "required": False,
                "help": "Path to save the combined Cauchy results. Required when using multiple samples.",
            },
        ),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
|
157
410
|
|
158
411
|
|
159
412
|
def add_report_args(parser):
    """Register the CLI options of the report subcommand on *parser*.

    Adds the shared options via ``add_shared_args``, the required trait,
    annotation and summary-statistics options, the gene-selection options,
    and the optional figure-appearance settings.
    """
    add_shared_args(parser)

    report_options = [
        (
            "--trait_name",
            {
                "type": str,
                "required": True,
                "help": "Name of the trait to generate the report for.",
            },
        ),
        ("--annotation", {"type": str, "required": True, "help": "Annotation layer name."}),
        # parser.add_argument('--plot_type', type=str, choices=['manhattan', 'GSS', 'gsMap', 'all'], default='all',
        # help="Type of diagnostic plot to generate. Choose from 'manhattan', 'GSS', 'gsMap', or 'all'.")
        (
            "--top_corr_genes",
            {"type": int, "default": 50, "help": "Number of top correlated genes to display."},
        ),
        (
            "--selected_genes",
            {
                "type": str,
                "nargs": "*",
                "help": "List of specific genes to include in the report (optional).",
            },
        ),
        (
            "--sumstats_file",
            {"type": str, "required": True, "help": "Path to GWAS summary statistics file."},
        ),
    ]

    # Optional arguments for customization
    appearance_options = [
        (
            "--fig_width",
            {"type": int, "default": None, "help": "Width of the generated figures in pixels."},
        ),
        (
            "--fig_height",
            {"type": int, "default": None, "help": "Height of the generated figures in pixels."},
        ),
        ("--point_size", {"type": int, "default": None, "help": "Point size for the figures."}),
        (
            "--fig_style",
            {
                "type": str,
                "default": "light",
                "choices": ["dark", "light"],
                "help": "Style of the generated figures.",
            },
        ),
    ]

    for flag, spec in report_options + appearance_options:
        parser.add_argument(flag, **spec)
|
451
|
+
|
452
|
+
|
453
|
+
def add_create_slice_mean_args(parser):
    """Register the CLI options of the slice-mean creation subcommand on *parser*.

    Requires ``--sample_name_list`` and ``--slice_mean_output_file``. The h5ad
    inputs can be given through ``--h5ad_list`` or ``--h5ad_yaml``; neither is
    enforced by the parser itself. ``--data_layer`` falls back to ``"counts"``
    when omitted.
    """
    parser.add_argument(
        "--sample_name_list",
        type=str,
        nargs="+",
        required=True,
        help="List of sample names to process. Provide as a space-separated list.",
    )

    parser.add_argument(
        "--h5ad_list",
        type=str,
        nargs="+",
        help="List of h5ad file paths corresponding to the sample names. Provide as a space-separated list.",
    )
    parser.add_argument(
        "--h5ad_yaml",
        type=str,
        default=None,
        help="Path to the YAML file containing sample names and associated h5ad file paths",
    )
    parser.add_argument(
        "--slice_mean_output_file",
        type=str,
        required=True,
        help="Path to the output file for the slice mean",
    )
    parser.add_argument(
        "--homolog_file", type=str, help="Path to homologous gene conversion file (optional)."
    )
    # This option used to combine default="counts" with required=True, which
    # made the default unreachable. It is now optional so the declared default
    # applies; command lines that pass --data_layer behave exactly as before.
    parser.add_argument(
        "--data_layer",
        type=str,
        default="counts",
        help='Data layer for gene expression (e.g., "count", "counts", "log1p").',
    )
|
490
|
+
|
177
491
|
|
178
492
|
def add_format_sumstats_args(parser):
    """Register the CLI options of the summary-statistics formatting subcommand.

    Adds the required input and output paths, the optional column-name
    overrides, the options used to map (chromosome, position) to rsids from a
    dbSNP reference, and the output format and quality filters.
    """
    # Required arguments
    parser.add_argument("--sumstats", required=True, type=str, help="Path to gwas summary data")
    parser.add_argument(
        "--out", required=True, type=str, help="Path to save the formatted gwas data"
    )

    # Arguments for specify column name: (flag, converter, help), all default to None.
    column_options = (
        ("--snp", str, "Name of snp column (if not a name that gsMap understands)"),
        ("--a1", str, "Name of effect allele column (if not a name that gsMap understands)"),
        ("--a2", str, "Name of none-effect allele column (if not a name that gsMap understands)"),
        ("--info", str, "Name of info column (if not a name that gsMap understands)"),
        ("--beta", str, "Name of gwas beta column (if not a name that gsMap understands)."),
        (
            "--se",
            str,
            "Name of gwas standar error of beta column (if not a name that gsMap understands)",
        ),
        ("--p", str, "Name of p-value column (if not a name that gsMap understands)"),
        ("--frq", str, "Name of A1 ferquency column (if not a name that gsMap understands)"),
        # --n goes through str_or_float, so a numeric value is converted rather than kept as text.
        ("--n", str_or_float, "Name of sample size column (if not a name that gsMap understands)"),
        ("--z", str, "Name of gwas Z-statistics column (if not a name that gsMap understands)"),
        ("--OR", str, "Name of gwas OR column (if not a name that gsMap understands)"),
        (
            "--se_OR",
            str,
            "Name of standar error of OR column (if not a name that gsMap understands)",
        ),
    )
    for flag, converter, help_text in column_options:
        parser.add_argument(flag, default=None, type=converter, help=help_text)

    # Arguments for convert SNP (chr, pos) to rsid
    parser.add_argument(
        "--chr",
        default="Chr",
        type=str,
        help="Name of SNP chromosome column (if not a name that gsMap understands)",
    )
    parser.add_argument(
        "--pos",
        default="Pos",
        type=str,
        help="Name of SNP positions column (if not a name that gsMap understands)",
    )
    parser.add_argument("--dbsnp", default=None, type=str, help="Path to reference dnsnp file")
    # argparse applies ``type`` only to string defaults, so the former default
    # of 1e6 reached callers as the float 1000000.0 despite type=int. An
    # integer literal keeps the default the same type as user-supplied values.
    parser.add_argument(
        "--chunksize", default=1_000_000, type=int, help="Chunk size for loading dbsnp file"
    )

    # Arguments for output format and quality
    parser.add_argument(
        "--format",
        default="gsMap",
        type=str,
        help="Format of output data",
        choices=["gsMap", "COJO"],
    )
    parser.add_argument("--info_min", default=0.9, type=float, help="Minimum INFO score.")
    parser.add_argument("--maf_min", default=0.01, type=float, help="Minimum MAF.")
    parser.add_argument(
        "--keep_chr_pos",
        action="store_true",
        default=False,
        help="Keep SNP chromosome and position columns in the output data",
    )
|
607
|
+
|
230
608
|
|
231
609
|
def add_run_all_mode_args(parser):
    """Register the CLI options of the run-all pipeline subcommand on *parser*.

    Adds the shared options via ``add_shared_args``, the required resource
    directory, input h5ad path and annotation name, the GWAS inputs (a single
    summary-statistics file with a trait name, or a config file; the parser
    itself does not enforce that one is given), and the optional homolog,
    parallelism and neighbourhood settings.
    """
    add_shared_args(parser)

    # Required paths and configurations
    parser.add_argument(
        "--gsMap_resource_dir",
        type=str,
        required=True,
        help="Directory containing gsMap resources (e.g., genome annotations, LD reference panel, etc.).",
    )
    parser.add_argument(
        "--hdf5_path",
        type=str,
        required=True,
        help="Path to the input spatial transcriptomics data (H5AD format).",
    )
    parser.add_argument(
        "--annotation", type=str, required=True, help="Name of the annotation in adata.obs to use."
    )
    # This option used to combine default="counts" with required=True, which
    # made the default unreachable. It is now optional so the declared default
    # applies; command lines that pass --data_layer behave exactly as before.
    parser.add_argument(
        "--data_layer",
        type=str,
        default="counts",
        help='Data layer for gene expression (e.g., "count", "counts", "log1p").',
    )

    # GWAS Data Parameters
    parser.add_argument(
        "--trait_name",
        type=str,
        help="Name of the trait for GWAS analysis (required if sumstats_file is provided).",
    )
    parser.add_argument(
        "--sumstats_file",
        type=str,
        help="Path to GWAS summary statistics file. Either sumstats_file or sumstats_config_file is required.",
    )
    parser.add_argument(
        "--sumstats_config_file",
        type=str,
        help="Path to GWAS summary statistics config file. Either sumstats_file or sumstats_config_file is required.",
    )

    # Homolog Data Parameters
    parser.add_argument(
        "--homolog_file",
        type=str,
        help="Path to homologous gene for converting gene names from different species to human (optional, used for cross-species analysis).",
    )

    # Maximum number of processes
    parser.add_argument(
        "--max_processes",
        type=int,
        default=10,
        help="Maximum number of processes for parallel execution.",
    )

    parser.add_argument(
        "--latent_representation",
        type=str,
        default=None,
        help="Type of latent representation. This should exist in the h5ad obsm.",
    )
    parser.add_argument("--num_neighbour", type=int, default=21, help="Number of neighbors.")
    parser.add_argument(
        "--num_neighbour_spatial", type=int, default=101, help="Number of spatial neighbors."
    )
    parser.add_argument(
        "--gM_slices", type=str, default=None, help="Path to the slice mean file (optional)."
    )
|
272
681
|
|
273
682
|
|
274
683
|
def ensure_path_exists(func):
|
@@ -288,75 +697,136 @@ def ensure_path_exists(func):
|
|
288
697
|
@dataclass
class ConfigWithAutoPaths:
    """Base configuration that derives every output location from ``workdir`` and ``sample_name``.

    All generated paths live below ``{workdir}/{sample_name}/``. Accessors wrapped
    with ``ensure_path_exists`` hand their result to that decorator before it is
    returned (presumably it creates the missing directories -- confirm against the
    decorator's definition).
    """

    workdir: str
    sample_name: str | None

    def __post_init__(self):
        """Reject a configuration that has no working directory."""
        if self.workdir is None:
            raise ValueError("workdir must be provided.")

    def _sample_path(self, *parts: str) -> Path:
        """Return ``{workdir}/{sample_name}/<parts...>`` built by plain "/" joining."""
        return Path("/".join((f"{self.workdir}", f"{self.sample_name}", *parts)))

    # ---- per-step outputs -------------------------------------------------

    @property
    @ensure_path_exists
    def hdf5_with_latent_path(self) -> Path:
        """h5ad file written by the find_latent_representations step."""
        return self._sample_path(
            "find_latent_representations", f"{self.sample_name}_add_latent.h5ad"
        )

    @property
    @ensure_path_exists
    def mkscore_feather_path(self) -> Path:
        """Feather file holding the gene marker scores of the latent_to_gene step."""
        return self._sample_path(
            "latent_to_gene", f"{self.sample_name}_gene_marker_score.feather"
        )

    @property
    @ensure_path_exists
    def ldscore_save_dir(self) -> Path:
        """Directory of the generate_ldscore step."""
        return self._sample_path("generate_ldscore")

    @property
    @ensure_path_exists
    def ldsc_save_dir(self) -> Path:
        """Directory of the spatial_ldsc step."""
        return self._sample_path("spatial_ldsc")

    @property
    @ensure_path_exists
    def cauchy_save_dir(self) -> Path:
        """Directory of the cauchy_combination step."""
        return self._sample_path("cauchy_combination")

    # ---- per-trait report outputs ------------------------------------------

    @ensure_path_exists
    def get_report_dir(self, trait_name: str) -> Path:
        """Report directory of one trait."""
        return self._sample_path("report", f"{trait_name}")

    def get_gsMap_report_file(self, trait_name: str) -> Path:
        """HTML report file of one trait, located inside its report directory."""
        report_name = f"{self.sample_name}_{trait_name}_gsMap_Report.html"
        return self.get_report_dir(trait_name) / report_name

    @ensure_path_exists
    def get_manhattan_html_plot_path(self, trait_name: str) -> Path:
        """HTML file of the diagnostic Manhattan plot of one trait."""
        return self._sample_path(
            "report",
            f"{trait_name}",
            "manhattan_plot",
            f"{self.sample_name}_{trait_name}_Diagnostic_Manhattan_Plot.html",
        )

    @ensure_path_exists
    def get_GSS_plot_dir(self, trait_name: str) -> Path:
        """Directory of the GSS plots of one trait."""
        return self._sample_path("report", f"{trait_name}", "GSS_plot")

    def get_GSS_plot_select_gene_file(self, trait_name: str) -> Path:
        """CSV file listing the genes plotted in the GSS plot directory."""
        return self.get_GSS_plot_dir(trait_name) / "plot_genes.csv"

    @ensure_path_exists
    def get_ldsc_result_file(self, trait_name: str) -> Path:
        """Compressed CSV with the spatial LDSC result of one trait."""
        result_name = f"{self.sample_name}_{trait_name}.csv.gz"
        return Path(f"{self.ldsc_save_dir}/{result_name}")

    @ensure_path_exists
    def get_cauchy_result_file(self, trait_name: str) -> Path:
        """Compressed CSV with the Cauchy combination result of one trait."""
        result_name = f"{self.sample_name}_{trait_name}.Cauchy.csv.gz"
        return Path(f"{self.cauchy_save_dir}/{result_name}")

    @ensure_path_exists
    def get_gene_diagnostic_info_save_path(self, trait_name: str) -> Path:
        """CSV file with the gene diagnostic information of one trait."""
        return self._sample_path(
            "report",
            f"{trait_name}",
            f"{self.sample_name}_{trait_name}_Gene_Diagnostic_Info.csv",
        )

    @ensure_path_exists
    def get_gsMap_plot_save_dir(self, trait_name: str) -> Path:
        """Directory of the gsMap plots of one trait."""
        return self._sample_path("report", f"{trait_name}", "gsMap_plot")

    def get_gsMap_html_plot_save_path(self, trait_name: str) -> Path:
        """HTML gsMap plot of one trait, located inside its gsMap plot directory."""
        plot_name = f"{self.sample_name}_{trait_name}_gsMap_plot.html"
        return self.get_gsMap_plot_save_dir(trait_name) / plot_name
|
780
|
+
|
781
|
+
|
782
|
+
@dataclass
|
783
|
+
class CreateSliceMeanConfig:
|
784
|
+
slice_mean_output_file: str | Path
|
785
|
+
h5ad_yaml: str | dict | None = None
|
786
|
+
sample_name_list: list | None = None
|
787
|
+
h5ad_list: list | None = None
|
788
|
+
homolog_file: str | None = None
|
789
|
+
species: str | None = None
|
790
|
+
data_layer: str = None
|
791
|
+
|
792
|
+
def __post_init__(self):
|
793
|
+
if self.h5ad_list is None and self.h5ad_yaml is None:
|
794
|
+
raise ValueError("At least one of --h5ad_list or --h5ad_yaml must be provided.")
|
795
|
+
if self.h5ad_yaml is not None:
|
796
|
+
if isinstance(self.h5ad_yaml, str):
|
797
|
+
logger.info(f"Reading h5ad yaml file: {self.h5ad_yaml}")
|
798
|
+
h5ad_dict = (
|
799
|
+
yaml.safe_load(open(self.h5ad_yaml))
|
800
|
+
if isinstance(self.h5ad_yaml, str)
|
801
|
+
else self.h5ad_yaml
|
802
|
+
)
|
803
|
+
elif self.sample_name_list and self.h5ad_list:
|
804
|
+
logger.info("Reading sample name list and h5ad list")
|
805
|
+
h5ad_dict = dict(zip(self.sample_name_list, self.h5ad_list, strict=False))
|
806
|
+
else:
|
807
|
+
raise ValueError(
|
808
|
+
"Please provide either h5ad_yaml or both sample_name_list and h5ad_list."
|
809
|
+
)
|
810
|
+
|
811
|
+
# check if sample names is unique
|
812
|
+
assert len(h5ad_dict) == len(set(h5ad_dict)), "Sample names must be unique."
|
813
|
+
assert len(h5ad_dict) > 1, "At least two samples are required."
|
814
|
+
|
815
|
+
logger.info(f"Input h5ad files: {h5ad_dict}")
|
816
|
+
|
817
|
+
# Check if all files exist
|
818
|
+
self.h5ad_dict = {}
|
819
|
+
for sample_name, h5ad_file in h5ad_dict.items():
|
820
|
+
h5ad_file = Path(h5ad_file)
|
821
|
+
if not h5ad_file.exists():
|
822
|
+
raise FileNotFoundError(f"{h5ad_file} does not exist.")
|
823
|
+
self.h5ad_dict[sample_name] = h5ad_file
|
824
|
+
|
825
|
+
self.slice_mean_output_file = Path(self.slice_mean_output_file)
|
826
|
+
self.slice_mean_output_file.parent.mkdir(parents=True, exist_ok=True)
|
827
|
+
|
828
|
+
verify_homolog_file_format(self)
|
829
|
+
|
360
830
|
|
361
831
|
@dataclass
|
362
832
|
class FindLatentRepresentationsConfig(ConfigWithAutoPaths):
|
@@ -389,24 +859,27 @@ class FindLatentRepresentationsConfig(ConfigWithAutoPaths):
|
|
389
859
|
# self.output_hdf5_path = self.hdf5_with_latent_path
|
390
860
|
if self.hierarchically:
|
391
861
|
if self.annotation is None:
|
392
|
-
raise ValueError(
|
862
|
+
raise ValueError("annotation must be provided if hierarchically is True.")
|
393
863
|
logger.info(
|
394
|
-
|
864
|
+
"------Hierarchical mode is enabled. This will find the latent representations within each annotation."
|
865
|
+
)
|
395
866
|
|
396
867
|
# remind for not providing annotation
|
397
868
|
if self.annotation is None:
|
398
869
|
logger.warning(
|
399
|
-
|
870
|
+
"annotation is not provided. This will find the latent representations for the whole dataset."
|
871
|
+
)
|
400
872
|
else:
|
401
|
-
logger.info(f
|
873
|
+
logger.info(f"------Find latent representations for {self.annotation}...")
|
402
874
|
|
403
875
|
|
404
876
|
@dataclass
|
405
877
|
class LatentToGeneConfig(ConfigWithAutoPaths):
|
406
878
|
# input_hdf5_with_latent_path: str
|
407
879
|
# output_feather_path: str
|
880
|
+
input_hdf5_path: str | Path = None
|
408
881
|
no_expression_fraction: bool = False
|
409
|
-
latent_representation: str =
|
882
|
+
latent_representation: str = None
|
410
883
|
num_neighbour: int = 21
|
411
884
|
num_neighbour_spatial: int = 101
|
412
885
|
homolog_file: str = None
|
@@ -415,31 +888,61 @@ class LatentToGeneConfig(ConfigWithAutoPaths):
|
|
415
888
|
species: str = None
|
416
889
|
|
417
890
|
def __post_init__(self):
    """Resolve the input h5ad file and validate the optional inputs.

    * Without ``input_hdf5_path`` the file at ``hdf5_with_latent_path`` is used
      and must already exist.
    * With ``input_hdf5_path`` that file must exist; it is copied to
      ``hdf5_with_latent_path`` unless it already is that very file.
    * ``latent_representation`` falls back to ``"latent_GVAE"``.
    * ``gM_slices``, when given, must point to an existing file.

    Raises:
        ValueError: ``workdir`` is missing (raised by the parent check).
        FileNotFoundError: a required input file does not exist.
    """
    super().__post_init__()

    latent_h5ad = self.hdf5_with_latent_path
    if self.input_hdf5_path is None:
        self.input_hdf5_path = latent_h5ad
        if not latent_h5ad.exists():
            raise FileNotFoundError(
                f"{self.input_hdf5_path} does not exist. Please run FindLatentRepresentations first."
            )
    else:
        source = Path(self.input_hdf5_path)
        if not source.exists():
            raise FileNotFoundError(f"{self.input_hdf5_path} does not exist.")
        # shutil.copy2 raises SameFileError when source and destination are the
        # same file, so skip the copy when the input already sits in the work dir.
        if not (latent_h5ad.exists() and source.samefile(latent_h5ad)):
            import shutil

            shutil.copy2(source, latent_h5ad)

    if self.latent_representation is not None:
        logger.info(f"Using the provided latent representation: {self.latent_representation}")
    else:
        self.latent_representation = "latent_GVAE"
        logger.info(f"Using default latent representation: {self.latent_representation}")

    if self.gM_slices is not None:
        if not Path(self.gM_slices).exists():
            raise FileNotFoundError(f"{self.gM_slices} does not exist.")
        logger.info(f"Using the provided slice mean file: {self.gM_slices}.")

    verify_homolog_file_format(self)
|
914
|
+
|
915
|
+
|
916
|
+
def verify_homolog_file_format(config):
    """Check the first line of ``config.homolog_file`` and record the source species.

    When no homolog file is configured, only an informational message is logged.
    Otherwise the first line must split (on whitespace) into exactly two fields;
    the first field is stored in ``config.species``.

    Raises:
        ValueError: the first line does not contain exactly two fields.
    """
    if config.homolog_file is None:
        logger.info("No homolog file provided. Run in human mode.")
        return

    logger.info(f"User provided homolog file to map gene names to human: {config.homolog_file}")

    # check the format of the homolog file
    with open(config.homolog_file) as handle:
        header_fields = handle.readline().strip().split()

    n_fields = len(header_fields)
    if n_fields != 2:
        raise ValueError(
            "Invalid homolog file format. Expected 2 columns, first column should be other species gene name, second column should be human gene name. "
            f"Got {n_fields} columns in the first line."
        )

    first_col_name, second_col_name = header_fields
    config.species = first_col_name
    logger.info(
        f"Homolog file provided and will map gene name from column1:{first_col_name} to column2:{second_col_name}"
    )
|
435
938
|
|
436
939
|
|
437
940
|
@dataclass
|
438
941
|
class GenerateLDScoreConfig(ConfigWithAutoPaths):
|
439
|
-
chrom:
|
942
|
+
chrom: int | str
|
440
943
|
|
441
944
|
bfile_root: str
|
442
|
-
keep_snp_root:
|
945
|
+
keep_snp_root: str | None
|
443
946
|
|
444
947
|
# annotation by gene distance
|
445
948
|
gtf_annotation_file: str
|
@@ -447,74 +950,106 @@ class GenerateLDScoreConfig(ConfigWithAutoPaths):
|
|
447
950
|
|
448
951
|
# annotation by enhancer
|
449
952
|
enhancer_annotation_file: str = None
|
450
|
-
snp_multiple_enhancer_strategy: Literal[
|
451
|
-
gene_window_enhancer_priority:
|
953
|
+
snp_multiple_enhancer_strategy: Literal["max_mkscore", "nearest_TSS"] = "max_mkscore"
|
954
|
+
gene_window_enhancer_priority: (
|
955
|
+
Literal["gene_window_first", "enhancer_first", "enhancer_only"] | None
|
956
|
+
) = None
|
452
957
|
|
453
958
|
# for calculating ld score
|
454
959
|
additional_baseline_annotation: str = None
|
455
960
|
spots_per_chunk: int = 1_000
|
456
961
|
ld_wind: int = 1
|
457
|
-
ld_unit: str =
|
962
|
+
ld_unit: str = "CM"
|
458
963
|
|
459
964
|
# zarr config
|
460
|
-
ldscore_save_format: Literal[
|
965
|
+
ldscore_save_format: Literal["feather", "zarr", "quick_mode"] = "feather"
|
461
966
|
|
462
|
-
zarr_chunk_size:
|
967
|
+
zarr_chunk_size: tuple[int, int] = None
|
463
968
|
|
464
969
|
# for pre calculating the SNP Gene ldscore Weight
|
465
970
|
save_pre_calculate_snp_gene_weight_matrix: bool = False
|
466
971
|
|
467
|
-
baseline_annotation_dir:
|
468
|
-
SNP_gene_pair_dir:
|
972
|
+
baseline_annotation_dir: str | None = None
|
973
|
+
SNP_gene_pair_dir: str | None = None
|
974
|
+
|
469
975
|
def __post_init__(self):
    """Reconcile the enhancer options, log the annotation strategy, validate inputs.

    Steps, in order:
      1. Make ``gene_window_enhancer_priority`` agree with the presence of
         ``enhancer_annotation_file`` (a warning is logged for each correction).
      2. Assert that the priority is one of the accepted values and log which
         annotations will be used.
      3. If ``additional_baseline_annotation`` is given, require
         ``baseline.<chrom>.annot.gz`` for the requested chromosome, or for
         chromosomes 1-22 when ``chrom`` is ``"all"``.
      4. Default ``zarr_chunk_size`` for the zarr save format.

    Raises:
        AssertionError: the priority value is not recognised.
        FileNotFoundError: a required baseline annotation file is missing.
    """
    has_enhancer_file = self.enhancer_annotation_file is not None
    has_priority = self.gene_window_enhancer_priority is not None

    # The two corrections cannot both apply: one needs an enhancer file, the
    # other needs its absence.
    if has_enhancer_file and not has_priority:
        logger.warning(
            "enhancer_annotation_file is provided but gene_window_enhancer_priority is not provided. "
            "by default, gene_window_enhancer_priority is set to 'enhancer_only', when enhancer_annotation_file is provided."
        )
        self.gene_window_enhancer_priority = "enhancer_only"
    elif has_priority and not has_enhancer_file:
        logger.warning(
            "gene_window_enhancer_priority is provided but enhancer_annotation_file is not provided. "
            "by default, gene_window_enhancer_priority is set to None, when enhancer_annotation_file is not provided."
        )
        self.gene_window_enhancer_priority = None

    accepted_priorities = [None, "gene_window_first", "enhancer_first", "enhancer_only"]
    assert self.gene_window_enhancer_priority in accepted_priorities, (
        f"gene_window_enhancer_priority must be one of None, 'gene_window_first', 'enhancer_first', 'enhancer_only', but got {self.gene_window_enhancer_priority}."
    )

    if self.gene_window_enhancer_priority == "enhancer_only":
        logger.info("Only enhancer annotation will be used to calculate LD score. ")
    elif self.gene_window_enhancer_priority in ["gene_window_first", "enhancer_first"]:
        logger.info(
            "Both gene_window and enhancer annotation will be used to calculate LD score. "
        )
        logger.info(
            f"SNP within +-{self.gene_window_size} bp of gene body will be used and enhancer annotation will be used to calculate LD score. If a snp maps to multiple enhancers, the strategy to choose by your select strategy: {self.snp_multiple_enhancer_strategy}."
        )
    else:
        logger.info(
            f"Only gene window annotation will be used to calculate LD score. SNP within +-{self.gene_window_size} bp of gene body will be used. "
        )

    # remind for baseline annotation
    if self.additional_baseline_annotation is None:
        logger.info(
            "------Baseline annotation is not provided. Default baseline annotation will be used."
        )
    else:
        logger.info(
            "------Baseline annotation is provided. Additional baseline annotation will be used with the default baseline annotation."
        )
        logger.info(
            f"------Baseline annotation directory: {self.additional_baseline_annotation}"
        )
        # check the existence of baseline annotation
        if self.chrom == "all":
            chromosomes = [str(number) for number in range(1, 23)]
        else:
            chromosomes = [self.chrom]
        for chrom in chromosomes:
            annotation_file = (
                Path(self.additional_baseline_annotation) / f"baseline.{chrom}.annot.gz"
            )
            if not annotation_file.exists():
                raise FileNotFoundError(
                    f"baseline.{chrom}.annot.gz is not found in {self.additional_baseline_annotation}."
                )

    # set the default zarr chunk size
    if self.zarr_chunk_size is None and self.ldscore_save_format == "zarr":
        self.zarr_chunk_size = (10_000, self.spots_per_chunk)
|
519
1054
|
|
520
1055
|
|
@@ -523,56 +1058,61 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):
|
|
523
1058
|
w_file: str
|
524
1059
|
# ldscore_save_dir: str
|
525
1060
|
use_additional_baseline_annotation: bool = True
|
526
|
-
trait_name:
|
527
|
-
sumstats_file:
|
528
|
-
sumstats_config_file:
|
1061
|
+
trait_name: str | None = None
|
1062
|
+
sumstats_file: str | None = None
|
1063
|
+
sumstats_config_file: str | None = None
|
529
1064
|
num_processes: int = 4
|
530
1065
|
not_M_5_50: bool = False
|
531
1066
|
n_blocks: int = 200
|
532
|
-
chisq_max:
|
533
|
-
all_chunk:
|
534
|
-
chunk_range:
|
1067
|
+
chisq_max: int | None = None
|
1068
|
+
all_chunk: int | None = None
|
1069
|
+
chunk_range: tuple[int, int] | None = None
|
535
1070
|
|
536
|
-
ldscore_save_format: Literal[
|
1071
|
+
ldscore_save_format: Literal["feather", "zarr", "quick_mode"] = "feather"
|
537
1072
|
|
538
1073
|
spots_per_chunk_quick_mode: int = 1_000
|
539
|
-
snp_gene_weight_adata_path:
|
1074
|
+
snp_gene_weight_adata_path: str | None = None
|
540
1075
|
|
541
1076
|
def __post_init__(self):
    """Validate the GWAS inputs and build ``sumstats_config_dict``.

    Exactly one of ``sumstats_file`` (together with ``trait_name``) or
    ``sumstats_config_file`` (a YAML mapping of trait name -> sumstats file,
    without ``trait_name``) must be supplied. Afterwards ``sumstats_config_dict``
    maps every trait name to its summary statistics file, for both input forms.

    Raises:
        ValueError: the combination of sumstats options is invalid, or
            ``workdir`` is missing (raised by the parent check).
        AssertionError: a listed summary statistics file does not exist.
    """
    super().__post_init__()
    if self.sumstats_file is None and self.sumstats_config_file is None:
        raise ValueError("One of sumstats_file and sumstats_config_file must be provided.")
    if self.sumstats_file is not None and self.sumstats_config_file is not None:
        raise ValueError(
            "Only one of sumstats_file and sumstats_config_file must be provided."
        )
    if self.sumstats_file is not None and self.trait_name is None:
        raise ValueError("trait_name must be provided if sumstats_file is provided.")
    if self.sumstats_config_file is not None and self.trait_name is not None:
        raise ValueError(
            "trait_name must not be provided if sumstats_config_file is provided."
        )

    self.sumstats_config_dict = {}
    if self.sumstats_config_file is not None:
        # load the sumstats config file
        import yaml

        with open(self.sumstats_config_file) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        for trait_name, sumstats_file in config.items():
            assert Path(sumstats_file).exists(), f"{sumstats_file} does not exist."
            # Record the trait; otherwise traits from the config file would be
            # validated here but never exposed through sumstats_config_dict.
            self.sumstats_config_dict[trait_name] = sumstats_file
    else:
        # The checks above guarantee sumstats_file and trait_name are both set here.
        self.sumstats_config_dict[self.trait_name] = self.sumstats_file

    for sumstats_file in self.sumstats_config_dict.values():
        assert Path(sumstats_file).exists(), f"{sumstats_file} does not exist."

    # check if additional baseline annotation is exist
    if self.use_additional_baseline_annotation:
        self.process_additional_baseline_annotation()
|
573
1113
|
|
574
1114
|
def process_additional_baseline_annotation(self):
|
575
|
-
additional_baseline_annotation = Path(self.ldscore_save_dir) /
|
1115
|
+
additional_baseline_annotation = Path(self.ldscore_save_dir) / "additional_baseline"
|
576
1116
|
dir_exists = additional_baseline_annotation.exists()
|
577
1117
|
|
578
1118
|
if not dir_exists:
|
@@ -580,7 +1120,7 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):
|
|
580
1120
|
# if self.use_additional_baseline_annotation:
|
581
1121
|
# logger.warning(f"additional_baseline directory is not found in {self.ldscore_save_dir}.")
|
582
1122
|
# print('''\
|
583
|
-
# if you want to use additional baseline annotation,
|
1123
|
+
# if you want to use additional baseline annotation,
|
584
1124
|
# please provide additional baseline annotation when calculating ld score.
|
585
1125
|
# ''')
|
586
1126
|
# raise FileNotFoundError(
|
@@ -589,15 +1129,21 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):
|
|
589
1129
|
# self.use_additional_baseline_annotation = self.use_additional_baseline_annotation or True
|
590
1130
|
else:
|
591
1131
|
logger.info(
|
592
|
-
|
593
|
-
|
1132
|
+
"------Additional baseline annotation is provided. It will be used with the default baseline annotation."
|
1133
|
+
)
|
1134
|
+
logger.info(
|
1135
|
+
f"------Additional baseline annotation directory: {additional_baseline_annotation}"
|
1136
|
+
)
|
594
1137
|
|
595
1138
|
chrom_list = range(1, 23)
|
596
1139
|
for chrom in chrom_list:
|
597
|
-
baseline_annotation_path =
|
1140
|
+
baseline_annotation_path = (
|
1141
|
+
additional_baseline_annotation / f"baseline.{chrom}.l2.ldscore.feather"
|
1142
|
+
)
|
598
1143
|
if not baseline_annotation_path.exists():
|
599
1144
|
raise FileNotFoundError(
|
600
|
-
f
|
1145
|
+
f"baseline.{chrom}.annot.gz is not found in {additional_baseline_annotation}."
|
1146
|
+
)
|
601
1147
|
return None
|
602
1148
|
|
603
1149
|
|
@@ -605,8 +1151,25 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):
|
|
605
1151
|
class CauchyCombinationConfig(ConfigWithAutoPaths):
    """Configuration of the Cauchy combination step for one trait.

    Works on a single sample (``sample_name`` set) or on several samples
    (``sample_name`` is ``None`` and ``sample_name_list`` is filled).
    """

    trait_name: str
    annotation: str
    sample_name_list: list[str] = dataclasses.field(default_factory=list)
    output_file: str | Path | None = None

    def __post_init__(self):
        """Normalise the single-sample / multi-sample input forms.

        Single sample: ``sample_name_list`` must be empty; it becomes
        ``[sample_name]`` and ``output_file`` defaults to the trait's Cauchy
        result file. Multiple samples: the list must be non-empty and
        ``output_file`` is mandatory.

        Raises:
            ValueError: both ``sample_name`` and a non-empty list were given.
            AssertionError: multi-sample mode without samples or output file.
        """
        if self.sample_name is None:
            assert len(self.sample_name_list) > 0, "At least one sample name must be provided."
            assert self.output_file is not None, (
                "Output_file must be provided if sample_name_list is provided."
            )
            return

        if self.sample_name_list:
            raise ValueError("Only one of sample_name and sample_name_list must be provided.")

        self.sample_name_list = [self.sample_name]
        if self.output_file is None:
            self.output_file = self.get_cauchy_result_file(self.trait_name)
|
610
1173
|
|
611
1174
|
|
612
1175
|
@dataclass
|
@@ -618,7 +1181,7 @@ class VisualizeConfig(ConfigWithAutoPaths):
|
|
618
1181
|
fig_height: int = 600
|
619
1182
|
fig_width: int = 800
|
620
1183
|
point_size: int = None
|
621
|
-
fig_style: Literal[
|
1184
|
+
fig_style: Literal["dark", "light"] = "light"
|
622
1185
|
|
623
1186
|
|
624
1187
|
@dataclass
|
@@ -628,22 +1191,26 @@ class DiagnosisConfig(ConfigWithAutoPaths):
|
|
628
1191
|
|
629
1192
|
trait_name: str
|
630
1193
|
sumstats_file: str
|
631
|
-
plot_type: Literal[
|
1194
|
+
plot_type: Literal["manhattan", "GSS", "gsMap", "all"] = "all"
|
632
1195
|
top_corr_genes: int = 50
|
633
|
-
selected_genes:
|
1196
|
+
selected_genes: list[str] | None = None
|
634
1197
|
|
635
|
-
fig_width:
|
636
|
-
fig_height:
|
637
|
-
point_size:
|
638
|
-
fig_style: Literal[
|
1198
|
+
fig_width: int | None = None
|
1199
|
+
fig_height: int | None = None
|
1200
|
+
point_size: int | None = None
|
1201
|
+
fig_style: Literal["dark", "light"] = "light"
|
639
1202
|
|
640
1203
|
def __post_init__(self):
    """Set ``customize_fig`` from the optional figure settings.

    ``customize_fig`` is ``False`` when none of ``fig_width``, ``fig_height`` and
    ``point_size`` is truthy, and ``True`` when all of them are.

    Raises:
        AssertionError: only some of the three settings were provided.
    """
    figure_settings = [self.fig_width, self.fig_height, self.point_size]
    if not any(figure_settings):
        self.customize_fig = False
        return

    logger.info("Customizing the figure size and point size.")
    assert all(figure_settings), (
        "All of fig_width, fig_height, and point_size must be provided."
    )
    self.customize_fig = True
|
1212
|
+
|
1213
|
+
|
647
1214
|
@dataclass
class ReportConfig(DiagnosisConfig):
    """Configuration of the report step; it adds nothing to ``DiagnosisConfig``."""
|
@@ -656,57 +1223,79 @@ class RunAllModeConfig(ConfigWithAutoPaths):
|
|
656
1223
|
# == ST DATA PARAMETERS ==
|
657
1224
|
hdf5_path: str
|
658
1225
|
annotation: str
|
659
|
-
data_layer: str =
|
1226
|
+
data_layer: str = "X"
|
1227
|
+
|
1228
|
+
# == Find Latent Representation PARAMETERS ==
|
1229
|
+
n_comps: int = 300
|
1230
|
+
|
1231
|
+
# == latent 2 Gene PARAMETERS ==
|
1232
|
+
gM_slices: str | None = None
|
1233
|
+
latent_representation: str = None
|
1234
|
+
num_neighbour: int = 21
|
1235
|
+
num_neighbour_spatial: int = 101
|
660
1236
|
|
661
1237
|
# ==GWAS DATA PARAMETERS==
|
662
|
-
trait_name:
|
663
|
-
sumstats_file:
|
664
|
-
sumstats_config_file:
|
1238
|
+
trait_name: str | None = None
|
1239
|
+
sumstats_file: str | None = None
|
1240
|
+
sumstats_config_file: str | None = None
|
665
1241
|
|
666
1242
|
# === homolog PARAMETERS ===
|
667
|
-
homolog_file:
|
1243
|
+
homolog_file: str | None = None
|
668
1244
|
|
669
1245
|
max_processes: int = 10
|
670
1246
|
|
671
1247
|
def __post_init__(self):
    """Derive the resource locations and validate the inputs of the quick mode.

    Every resource path is built below ``gsMap_resource_dir``. The ST h5ad file
    and the GTF file must exist. Exactly one of ``sumstats_file`` (together with
    ``trait_name``) or ``sumstats_config_file`` (without ``trait_name``) must be
    given; the result is stored in ``sumstats_config_dict`` as a mapping of
    trait name -> summary statistics file.

    Raises:
        ValueError: invalid combination of sumstats options, or ``workdir`` is
            missing (raised by the parent check).
        FileNotFoundError: the h5ad file or the GTF file does not exist.
        AssertionError: a listed summary statistics file does not exist.
    """
    super().__post_init__()

    resource_dir = self.gsMap_resource_dir
    self.gtffile = f"{resource_dir}/genome_annotation/gtf/gencode.v46lift37.basic.annotation.gtf"
    self.bfile_root = f"{resource_dir}/LD_Reference_Panel/1000G_EUR_Phase3_plink/1000G.EUR.QC"
    self.keep_snp_root = f"{resource_dir}/LDSC_resource/hapmap3_snps/hm"
    self.w_file = f"{resource_dir}/LDSC_resource/weights_hm3_no_hla/weights."
    self.snp_gene_weight_adata_path = f"{resource_dir}/quick_mode/snp_gene_weight_matrix.h5ad"
    self.baseline_annotation_dir = Path(f"{resource_dir}/quick_mode/baseline").resolve()
    self.SNP_gene_pair_dir = Path(f"{resource_dir}/quick_mode/SNP_gene_pair").resolve()

    # check the existence of the input files and resources files
    for required_file in (self.hdf5_path, self.gtffile):
        if not Path(required_file).exists():
            raise FileNotFoundError(f"File {required_file} does not exist.")

    has_sumstats_file = self.sumstats_file is not None
    has_config_file = self.sumstats_config_file is not None
    has_trait_name = self.trait_name is not None

    if not has_sumstats_file and not has_config_file:
        raise ValueError("One of sumstats_file and sumstats_config_file must be provided.")
    if has_sumstats_file and has_config_file:
        raise ValueError(
            "Only one of sumstats_file and sumstats_config_file must be provided."
        )
    if has_sumstats_file and not has_trait_name:
        raise ValueError("trait_name must be provided if sumstats_file is provided.")
    if has_config_file and has_trait_name:
        raise ValueError(
            "trait_name must not be provided if sumstats_config_file is provided."
        )

    self.sumstats_config_dict = {}
    if has_config_file:
        # load the sumstats config file
        import yaml

        with open(self.sumstats_config_file) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        for trait_name, sumstats_file in config.items():
            assert Path(sumstats_file).exists(), f"{sumstats_file} does not exist."
            self.sumstats_config_dict[trait_name] = sumstats_file
    elif has_sumstats_file and has_trait_name:
        # load the sumstats file
        self.sumstats_config_dict[self.trait_name] = self.sumstats_file
    else:
        raise ValueError("One of sumstats_file and sumstats_config_file must be provided.")

    for sumstats_file in self.sumstats_config_dict.values():
        assert Path(sumstats_file).exists(), f"{sumstats_file} does not exist."
|
710
1299
|
|
711
1300
|
|
712
1301
|
@dataclass
|
@@ -722,85 +1311,122 @@ class FormatSumstatsConfig:
|
|
722
1311
|
se: str = None
|
723
1312
|
p: str = None
|
724
1313
|
frq: str = None
|
725
|
-
n: str = None
|
1314
|
+
n: str | int = None
|
726
1315
|
z: str = None
|
727
1316
|
OR: str = None
|
728
1317
|
se_OR: str = None
|
729
1318
|
format: str = None
|
730
1319
|
chr: str = None
|
731
1320
|
pos: str = None
|
732
|
-
chunksize: int =
|
1321
|
+
chunksize: int = 1e7
|
733
1322
|
info_min: float = 0.9
|
734
1323
|
maf_min: float = 0.01
|
735
1324
|
keep_chr_pos: bool = False
|
736
1325
|
|
737
1326
|
|
738
|
-
@register_cli(
|
739
|
-
|
740
|
-
|
1327
|
+
@register_cli(
    name="quick_mode",
    description="Run the entire gsMap pipeline in quick mode, utilizing pre-computed weights for faster execution.",
    add_args_function=add_run_all_mode_args,
)
def run_all_mode_from_cli(args: argparse.Namespace):
    """Build a ``RunAllModeConfig`` from the parsed arguments and run the pipeline."""
    # The pipeline module is imported only when this sub-command is invoked.
    from gsMap.run_all_mode import run_pipeline

    run_pipeline(get_dataclass_from_parser(args, RunAllModeConfig))
|
1337
|
+
|
1338
|
+
|
1339
|
+
@register_cli(
    name="run_find_latent_representations",
    description="Run Find_latent_representations \nFind the latent representations of each spot by running GNN",
    add_args_function=add_find_latent_representations_args,
)
def run_find_latent_representation_from_cli(args: argparse.Namespace):
    """Build a ``FindLatentRepresentationsConfig`` from the parsed arguments and run the step."""
    # The step module is imported only when this sub-command is invoked.
    from gsMap.find_latent_representation import run_find_latent_representation

    run_find_latent_representation(
        get_dataclass_from_parser(args, FindLatentRepresentationsConfig)
    )
|
745
1349
|
|
746
1350
|
|
747
|
-
@register_cli(
|
748
|
-
|
749
|
-
|
1351
|
+
@register_cli(
    name="run_latent_to_gene",
    description="Run Latent_to_gene \nEstimate gene marker gene scores for each spot by using latent representations from nearby spots",
    add_args_function=add_latent_to_gene_args,
)
def run_latent_to_gene_from_cli(args: argparse.Namespace):
    """Build a ``LatentToGeneConfig`` from the parsed arguments and run the step."""
    # The step module is imported only when this sub-command is invoked.
    from gsMap.latent_to_gene import run_latent_to_gene

    run_latent_to_gene(get_dataclass_from_parser(args, LatentToGeneConfig))
|
754
1361
|
|
755
1362
|
|
756
|
-
@register_cli(
|
757
|
-
|
758
|
-
|
1363
|
+
@register_cli(
    name="run_generate_ldscore",
    description="Run Generate_ldscore \nGenerate LD scores for each spot",
    add_args_function=add_generate_ldscore_args,
)
def run_generate_ldscore_from_cli(args: argparse.Namespace):
    """Build a ``GenerateLDScoreConfig`` from the parsed arguments and run the step."""
    # The step module is imported only when this sub-command is invoked.
    from gsMap.generate_ldscore import run_generate_ldscore

    run_generate_ldscore(get_dataclass_from_parser(args, GenerateLDScoreConfig))
|
763
1373
|
|
764
1374
|
|
765
|
-
@register_cli(
|
766
|
-
|
767
|
-
|
1375
|
+
@register_cli(
    name="run_spatial_ldsc",
    description="Run Spatial_ldsc \nRun spatial LDSC for each spot",
    add_args_function=add_spatial_ldsc_args,
)
def run_spatial_ldsc_from_cli(args: argparse.Namespace):
    """Build a ``SpatialLDSCConfig`` from the parsed arguments and run the step."""
    # The step module is imported only when this sub-command is invoked.
    from gsMap.spatial_ldsc_multiple_sumstats import run_spatial_ldsc

    run_spatial_ldsc(get_dataclass_from_parser(args, SpatialLDSCConfig))
|
772
1385
|
|
773
1386
|
|
774
|
-
@register_cli(
|
775
|
-
|
776
|
-
|
1387
|
+
@register_cli(
    name="run_cauchy_combination",
    description="Run Cauchy_combination for each annotation",
    add_args_function=add_Cauchy_combination_args,
)
def run_Cauchy_combination_from_cli(args: argparse.Namespace):
    """Build a ``CauchyCombinationConfig`` from the parsed arguments and run the step."""
    # The step module is imported only when this sub-command is invoked.
    from gsMap.cauchy_combination_test import run_Cauchy_combination

    run_Cauchy_combination(get_dataclass_from_parser(args, CauchyCombinationConfig))
|
781
1397
|
|
782
1398
|
|
783
|
-
@register_cli(
|
784
|
-
|
785
|
-
|
1399
|
+
@register_cli(
    name="run_report",
    description="Run Report to generate diagnostic plots and tables",
    add_args_function=add_report_args,
)
def run_Report_from_cli(args: argparse.Namespace):
    """Build a ``ReportConfig`` from the parsed arguments and run the report step."""
    # The step module is imported only when this sub-command is invoked.
    from gsMap.report import run_report

    run_report(get_dataclass_from_parser(args, ReportConfig))
|
790
1409
|
|
791
1410
|
|
792
|
-
@register_cli(
|
793
|
-
|
794
|
-
|
1411
|
+
@register_cli(
    name="format_sumstats",
    description="Format GWAS summary statistics",
    add_args_function=add_format_sumstats_args,
)
def gwas_format_from_cli(args: argparse.Namespace):
    """Build a ``FormatSumstatsConfig`` from the parsed arguments and format the sumstats."""
    # The step module is imported only when this sub-command is invoked.
    from gsMap.format_sumstats import gwas_format

    gwas_format(get_dataclass_from_parser(args, FormatSumstatsConfig))
|
799
1421
|
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
1422
|
+
|
1423
|
+
@register_cli(
    name="create_slice_mean",
    description="Create slice mean from multiple h5ad files",
    add_args_function=add_create_slice_mean_args,
)
def create_slice_mean_from_cli(args: argparse.Namespace):
    """Build a ``CreateSliceMeanConfig`` from the parsed arguments and run the step."""
    # The step module is imported only when this sub-command is invoked.
    from gsMap.create_slice_mean import run_create_slice_mean

    run_create_slice_mean(get_dataclass_from_parser(args, CreateSliceMeanConfig))
|