gsMap 1.71.2__py3-none-any.whl → 1.73.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/config.py CHANGED
@@ -1,75 +1,237 @@
1
- import sys
2
1
  import argparse
2
+ import dataclasses
3
3
  import logging
4
+ import os
5
+ import sys
6
+ import threading
7
+ import time
4
8
  from collections import OrderedDict, namedtuple
9
+ from collections.abc import Callable
5
10
  from dataclasses import dataclass
11
+ from functools import wraps
6
12
  from pathlib import Path
7
13
  from pprint import pprint
8
- from typing import Callable
9
- from typing import Union, Literal, Tuple, Optional, List
10
- from functools import wraps
14
+ from typing import Literal
15
+
16
+ import psutil
11
17
  import pyfiglet
18
+ import yaml
12
19
 
13
20
  from gsMap.__init__ import __version__
14
21
 
15
22
  # Global registry to hold functions
16
23
  cli_function_registry = OrderedDict()
17
- subcommand = namedtuple('subcommand', ['name', 'func', 'add_args_function', 'description'])
24
+ subcommand = namedtuple("subcommand", ["name", "func", "add_args_function", "description"])
18
25
 
19
26
 
20
27
def get_gsMap_logger(logger_name):
    """
    Return the logger named ``logger_name`` configured for gsMap console output.

    The logger level is set to DEBUG and a ``StreamHandler`` using the
    ``[time] LEVEL | name - message`` format is attached. The handler is only
    attached when the logger has no handlers yet, so calling this function
    more than once with the same name does not duplicate every log line.

    Args:
        logger_name: Name passed to ``logging.getLogger``.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.DEBUG)

    # logging.getLogger returns the same object for the same name, so guard
    # against stacking a new handler on every call.
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("[{asctime}] {levelname:.5s} | {name} - {message}", style="{")
        )
        logger.addHandler(handler)
    return logger
28
36
 
29
- logger = get_gsMap_logger('gsMap')
37
+
38
+ logger = get_gsMap_logger("gsMap")
39
+
40
+
41
def track_resource_usage(func):
    """
    Decorator to track resource usage during function execution.

    While ``func`` runs, a daemon thread samples the current process roughly
    every 0.5 seconds and records the peak resident memory and the CPU
    utilisation readings. When ``func`` finishes, whether it returns or
    raises, a summary (wall clock time, CPU time, average CPU utilisation and
    peak memory) is logged to the "gsMap" logger. The return value or
    exception of ``func`` is passed through unchanged.
    """

    def _format_duration(seconds):
        # Render a duration in the largest convenient unit.
        if seconds < 60:
            return f"{seconds:.2f} seconds"
        if seconds < 3600:
            return f"{seconds / 60:.2f} minutes"
        return f"{seconds / 3600:.2f} hours"

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Get the current process
        process = psutil.Process(os.getpid())

        # Tracking state shared with the monitor thread
        peak_memory = 0  # in MB
        cpu_percent_samples = []
        stop_event = threading.Event()

        def resource_monitor():
            nonlocal peak_memory
            while not stop_event.is_set():
                try:
                    # Current resident memory in MB
                    current_memory = process.memory_info().rss / (1024 * 1024)
                    peak_memory = max(peak_memory, current_memory)

                    # CPU usage percentage since the previous call
                    cpu_percent = process.cpu_percent(interval=None)
                    if cpu_percent > 0:  # Skip initial zero readings
                        cpu_percent_samples.append(cpu_percent)
                except Exception:  # noqa: BLE001 - monitoring is best effort
                    pass
                # Wait outside the try block: if sampling keeps failing the
                # loop must still pause instead of spinning. Using the event
                # also lets the thread stop immediately when it is set.
                stop_event.wait(0.5)

        # Start resource monitoring in a separate thread
        monitor_thread = threading.Thread(target=resource_monitor, daemon=True)
        monitor_thread.start()

        # Get start times
        start_wall_time = time.time()
        start_cpu = process.cpu_times()
        start_cpu_time = start_cpu.user + start_cpu.system

        try:
            # Run the actual function
            return func(*args, **kwargs)
        finally:
            # Stop the monitoring thread
            stop_event.set()
            monitor_thread.join(timeout=1.0)

            # Calculate elapsed times
            end_wall_time = time.time()
            end_cpu = process.cpu_times()
            end_cpu_time = end_cpu.user + end_cpu.system

            wall_time = end_wall_time - start_wall_time
            cpu_time = end_cpu_time - start_cpu_time

            # Average over the non-zero samples collected by the monitor
            avg_cpu_percent = (
                sum(cpu_percent_samples) / len(cpu_percent_samples) if cpu_percent_samples else 0
            )

            # Format memory for display
            if peak_memory < 1024:
                memory_str = f"{peak_memory:.2f} MB"
            else:
                memory_str = f"{peak_memory / 1024:.2f} GB"

            wall_time_str = _format_duration(wall_time)
            cpu_time_str = _format_duration(cpu_time)

            # Log the resource usage
            usage_logger = logging.getLogger("gsMap")
            usage_logger.info("Resource usage summary:")
            usage_logger.info(f" • Wall clock time: {wall_time_str}")
            usage_logger.info(f" • CPU time: {cpu_time_str}")
            usage_logger.info(f" • Average CPU utilization: {avg_cpu_percent:.1f}%")
            usage_logger.info(f" • Peak memory usage: {memory_str}")

    return wrapper
137
+
30
138
 
31
139
  # Decorator to register functions for cli parsing
32
140
def register_cli(name: str, description: str, add_args_function: Callable) -> Callable:
    """
    Build a decorator that registers a function as a CLI subcommand.

    The decorated function is wrapped so that each call prints the gsMap
    banner and version, logs start and finish timestamps and, through
    ``track_resource_usage``, a resource usage summary. The wrapper is stored
    in ``cli_function_registry`` under ``name`` together with
    ``add_args_function`` and ``description``.

    Args:
        name: Key of the subcommand in ``cli_function_registry``; also used in
            the log messages.
        description: Description stored with the subcommand entry.
        add_args_function: Callable stored with the subcommand entry.

    Returns:
        A decorator that returns the wrapped function. The wrapper returns
        whatever the decorated function returns.
    """

    def decorator(func: Callable) -> Callable:
        @track_resource_usage  # Use enhanced resource tracking
        @wraps(func)
        def wrapper(*args, **kwargs):
            gsMap_main_logo = pyfiglet.figlet_format(
                "gsMap",
                font="doom",
                width=80,
                justify="center",
            ).rstrip()
            print(gsMap_main_logo, flush=True)
            version_number = "Version: " + __version__
            print(version_number.center(80), flush=True)
            print("=" * 80, flush=True)
            logger.info(f"Running {name}...")

            # Record start time for the log message
            start_time = time.strftime("%Y-%m-%d %H:%M:%S")
            logger.info(f"Started at: {start_time}")

            # Keep the result so the wrapper stays transparent to callers.
            result = func(*args, **kwargs)

            # Record end time for the log message
            end_time = time.strftime("%Y-%m-%d %H:%M:%S")
            logger.info(f"Finished running {name} at: {end_time}.")
            return result

        cli_function_registry[name] = subcommand(
            name=name, func=wrapper, add_args_function=add_args_function, description=description
        )
        return wrapper

    return decorator
50
174
 
175
+
176
def str_or_float(value):
    """
    Convert ``value`` to ``int`` when possible, otherwise return it unchanged.

    Despite the name, only an integer conversion is attempted: a string such
    as ``"1.5"`` raises ``ValueError`` in ``int`` and is returned as-is.
    """
    try:
        converted = int(value)
    except ValueError:
        return value
    return converted
181
+
182
+
51
183
def add_shared_args(parser):
    """Add the required ``--workdir`` and ``--sample_name`` string options to ``parser``."""
    shared_options = (
        ("--workdir", "Path to the working directory."),
        ("--sample_name", "Name of the sample."),
    )
    for flag, help_text in shared_options:
        parser.add_argument(flag, type=str, required=True, help=help_text)
188
+
54
189
 
55
190
def add_find_latent_representations_args(parser):
    """
    Add the options of the latent-representation step to ``parser``.

    The shared ``--workdir`` / ``--sample_name`` options are added first,
    followed by the input, model and training options listed below.
    """
    add_shared_args(parser)

    option_table = [
        ("--input_hdf5_path", {"required": True, "type": str, "help": "Path to the input HDF5 file."}),
        (
            "--annotation",
            {"required": True, "type": str, "help": "Name of the annotation in adata.obs to use."},
        ),
        # NOTE(review): the option is required, so the "counts" default can never take effect.
        (
            "--data_layer",
            {
                "type": str,
                "default": "counts",
                "required": True,
                "help": 'Data layer for gene expression (e.g., "count", "counts", "log1p").',
            },
        ),
        ("--epochs", {"type": int, "default": 300, "help": "Number of training epochs."}),
        ("--feat_hidden1", {"type": int, "default": 256, "help": "Neurons in the first hidden layer."}),
        ("--feat_hidden2", {"type": int, "default": 128, "help": "Neurons in the second hidden layer."}),
        ("--gat_hidden1", {"type": int, "default": 64, "help": "Units in the first GAT hidden layer."}),
        ("--gat_hidden2", {"type": int, "default": 30, "help": "Units in the second GAT hidden layer."}),
        ("--p_drop", {"type": float, "default": 0.1, "help": "Dropout rate."}),
        ("--gat_lr", {"type": float, "default": 0.001, "help": "Learning rate for the GAT."}),
        ("--n_neighbors", {"type": int, "default": 11, "help": "Number of neighbors for GAT."}),
        ("--n_comps", {"type": int, "default": 300, "help": "Number of principal components for PCA."}),
        ("--weighted_adj", {"action": "store_true", "help": "Use weighted adjacency in GAT."}),
        (
            "--convergence_threshold",
            {"type": float, "default": 1e-4, "help": "Threshold for convergence."},
        ),
        (
            "--hierarchically",
            {"action": "store_true", "help": "Enable hierarchical latent representation finding."},
        ),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)
73
235
 
74
236
 
75
237
  def chrom_choice(value):
@@ -77,10 +239,12 @@ def chrom_choice(value):
77
239
  ivalue = int(value)
78
240
  if 1 <= ivalue <= 22:
79
241
  return ivalue
80
- elif value.lower() == 'all':
242
+ elif value.lower() == "all":
81
243
  return value
82
244
  else:
83
- raise argparse.ArgumentTypeError(f"'{value}' is an invalid chromosome choice. Choose from 1-22 or 'all'.")
245
+ raise argparse.ArgumentTypeError(
246
+ f"'{value}' is an invalid chromosome choice. Choose from 1-22 or 'all'."
247
+ )
84
248
 
85
249
 
86
250
  def filter_args_for_dataclass(args_dict, data_class: dataclass):
@@ -89,7 +253,7 @@ def filter_args_for_dataclass(args_dict, data_class: dataclass):
89
253
 
90
254
def get_dataclass_from_parser(args: argparse.Namespace, data_class: dataclass):
    """
    Instantiate ``data_class`` from parsed command-line arguments.

    The namespace is turned into a dict and passed through
    ``filter_args_for_dataclass``; the resulting keyword arguments are echoed
    to stdout and then used to construct ``data_class``.
    """
    namespace_items = vars(args)
    init_kwargs = filter_args_for_dataclass(namespace_items, data_class)

    header = f"Using the following arguments for {data_class.__name__}:"
    print(header, flush=True)
    pprint(init_kwargs, indent=4)
    sys.stdout.flush()

    return data_class(**init_kwargs)
@@ -97,178 +261,423 @@ def get_dataclass_from_parser(args: argparse.Namespace, data_class: dataclass):
97
261
 
98
262
def add_latent_to_gene_args(parser):
    """
    Add the options of the latent-to-gene step to ``parser``.

    Besides the shared ``--workdir`` / ``--sample_name`` options, every option
    registered here is optional and has a default.
    """
    add_shared_args(parser)

    def optional_text(flag, help_text):
        # String option that defaults to None when omitted.
        parser.add_argument(flag, type=str, default=None, help=help_text)

    def integer(flag, default, help_text):
        parser.add_argument(flag, type=int, default=default, help=help_text)

    optional_text(
        "--input_hdf5_path",
        "Path to the input HDF5 file with latent representations, if --latent_representation is specified.",
    )
    parser.add_argument(
        "--no_expression_fraction", action="store_true", help="Skip expression fraction filtering."
    )
    optional_text(
        "--latent_representation",
        "Type of latent representation. This should exist in the h5ad obsm.",
    )
    integer("--num_neighbour", 21, "Number of neighbors.")
    integer("--num_neighbour_spatial", 101, "Number of spatial neighbors.")
    optional_text("--homolog_file", "Path to homologous gene conversion file (optional).")
    optional_text("--gM_slices", "Path to the slice mean file (optional).")
    optional_text("--annotation", "Name of the annotation in adata.obs to use (optional).")
126
299
 
127
300
 
128
- def add_latent_to_gene_args(parser):
301
def add_generate_ldscore_args(parser):
    """
    Add the options of the LD-score generation step to ``parser``.

    The shared ``--workdir`` / ``--sample_name`` options are added first; the
    remaining options are registered in the order of ``option_table``.
    """
    add_shared_args(parser)

    option_table = [
        ("--chrom", {"type": str, "required": True, "help": 'Chromosome id (1-22) or "all".'}),
        (
            "--bfile_root",
            {
                "type": str,
                "required": True,
                "help": "Root path for genotype plink bfiles (.bim, .bed, .fam).",
            },
        ),
        ("--keep_snp_root", {"type": str, "required": True, "help": "Root path for SNP files."}),
        (
            "--gtf_annotation_file",
            {"type": str, "required": True, "help": "Path to GTF annotation file."},
        ),
        (
            "--gene_window_size",
            {"type": int, "default": 50000, "help": "Gene window size in base pairs."},
        ),
        (
            "--enhancer_annotation_file",
            {"type": str, "help": "Path to enhancer annotation file (optional)."},
        ),
        (
            "--snp_multiple_enhancer_strategy",
            {
                "type": str,
                "choices": ["max_mkscore", "nearest_TSS"],
                "default": "max_mkscore",
                "help": "Strategy for handling multiple enhancers per SNP.",
            },
        ),
        (
            "--gene_window_enhancer_priority",
            {
                "type": str,
                "choices": ["gene_window_first", "enhancer_first", "enhancer_only"],
                "help": "Priority between gene window and enhancer annotations.",
            },
        ),
        ("--spots_per_chunk", {"type": int, "default": 1000, "help": "Number of spots per chunk."}),
        ("--ld_wind", {"type": int, "default": 1, "help": "LD window size."}),
        (
            "--ld_unit",
            {
                "type": str,
                "choices": ["SNP", "KB", "CM"],
                "default": "CM",
                "help": "Unit for LD window.",
            },
        ),
        (
            "--additional_baseline_annotation",
            {"type": str, "default": None, "help": "Path of additional baseline annotations"},
        ),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)
138
352
 
139
353
 
140
354
def add_spatial_ldsc_args(parser):
    """
    Add the options of the spatial LDSC step to ``parser``.

    The shared ``--workdir`` / ``--sample_name`` options are added first.
    ``--use_additional_baseline_annotation`` may be given without a value
    (meaning True) or with an explicit boolean such as ``True``/``False``; it
    defaults to True when omitted.
    """
    add_shared_args(parser)
    parser.add_argument(
        "--sumstats_file", type=str, required=True, help="Path to GWAS summary statistics file."
    )
    parser.add_argument(
        "--w_file", type=str, required=True, help="Path to regression weight file."
    )
    parser.add_argument(
        "--trait_name", type=str, required=True, help="Name of the trait being analyzed."
    )
    parser.add_argument(
        "--n_blocks", type=int, default=200, help="Number of blocks for jackknife resampling."
    )
    parser.add_argument(
        "--chisq_max", type=int, help="Maximum chi-square value for filtering SNPs."
    )
    parser.add_argument(
        "--num_processes", type=int, default=4, help="Number of processes for parallel computing."
    )
    parser.add_argument(
        "--use_additional_baseline_annotation",
        # type=bool would turn every non-empty string, including "False",
        # into True, so the value is parsed explicitly instead.
        type=_parse_bool,
        nargs="?",
        const=True,
        default=True,
        help="Use additional baseline annotations when provided",
    )


def _parse_bool(value):
    """Convert a command-line token such as "true"/"false"/"1"/"0" to a bool."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in {"true", "t", "yes", "y", "1", "on"}:
        return True
    # The empty string stays False, as it was under type=bool.
    if text in {"false", "f", "no", "n", "0", "off", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")
149
382
 
150
383
 
151
384
def add_Cauchy_combination_args(parser):
    """
    Add the options of the Cauchy combination step to ``parser``.

    ``--workdir``, ``--trait_name`` and ``--annotation`` are required;
    ``--sample_name``, ``--sample_name_list`` and ``--output_file`` are
    optional.
    """
    # (flag, required, help text, extra keyword arguments)
    option_table = (
        ("--workdir", True, "Path to the working directory.", {}),
        ("--sample_name", False, "Name of the sample.", {}),
        ("--trait_name", True, "Name of the trait being analyzed.", {}),
        ("--annotation", True, "Name of the annotation in adata.obs to use.", {}),
        (
            "--sample_name_list",
            False,
            "List of sample names to process. Provide as a space-separated list.",
            {"nargs": "+"},
        ),
        (
            "--output_file",
            False,
            "Path to save the combined Cauchy results. Required when using multiple samples.",
            {},
        ),
    )
    for flag, is_required, help_text, extra in option_table:
        parser.add_argument(flag, type=str, required=is_required, help=help_text, **extra)
157
410
 
158
411
 
159
412
def add_report_args(parser):
    """
    Add the options of the report step to ``parser``.

    The shared ``--workdir`` / ``--sample_name`` options are added first,
    then the report inputs and finally the optional figure settings.
    """
    add_shared_args(parser)

    # NOTE: a `--plot_type` option (choices 'manhattan', 'GSS', 'gsMap', 'all';
    # default 'all') is currently disabled and therefore not registered here.
    report_options = [
        (
            "--trait_name",
            {
                "type": str,
                "required": True,
                "help": "Name of the trait to generate the report for.",
            },
        ),
        ("--annotation", {"type": str, "required": True, "help": "Annotation layer name."}),
        (
            "--top_corr_genes",
            {"type": int, "default": 50, "help": "Number of top correlated genes to display."},
        ),
        (
            "--selected_genes",
            {
                "type": str,
                "nargs": "*",
                "help": "List of specific genes to include in the report (optional).",
            },
        ),
        (
            "--sumstats_file",
            {"type": str, "required": True, "help": "Path to GWAS summary statistics file."},
        ),
    ]

    # Optional arguments for customization
    figure_options = [
        (
            "--fig_width",
            {"type": int, "default": None, "help": "Width of the generated figures in pixels."},
        ),
        (
            "--fig_height",
            {"type": int, "default": None, "help": "Height of the generated figures in pixels."},
        ),
        ("--point_size", {"type": int, "default": None, "help": "Point size for the figures."}),
        (
            "--fig_style",
            {
                "type": str,
                "default": "light",
                "choices": ["dark", "light"],
                "help": "Style of the generated figures.",
            },
        ),
    ]

    for flag, options in report_options + figure_options:
        parser.add_argument(flag, **options)
451
+
452
+
453
def add_create_slice_mean_args(parser):
    """
    Add the options of the slice-mean creation step to ``parser``.

    ``--sample_name_list``, ``--slice_mean_output_file`` and ``--data_layer``
    are required; the h5ad inputs can be given as ``--h5ad_list`` or
    ``--h5ad_yaml``, neither of which is marked as required here.
    """
    space_separated = {"type": str, "nargs": "+"}

    option_table = [
        (
            "--sample_name_list",
            {
                **space_separated,
                "required": True,
                "help": "List of sample names to process. Provide as a space-separated list.",
            },
        ),
        (
            "--h5ad_list",
            {
                **space_separated,
                "help": "List of h5ad file paths corresponding to the sample names. Provide as a space-separated list.",
            },
        ),
        (
            "--h5ad_yaml",
            {
                "type": str,
                "default": None,
                "help": "Path to the YAML file containing sample names and associated h5ad file paths",
            },
        ),
        (
            "--slice_mean_output_file",
            {"type": str, "required": True, "help": "Path to the output file for the slice mean"},
        ),
        (
            "--homolog_file",
            {"type": str, "help": "Path to homologous gene conversion file (optional)."},
        ),
        # NOTE(review): the option is required, so the "counts" default can never take effect.
        (
            "--data_layer",
            {
                "type": str,
                "default": "counts",
                "required": True,
                "help": 'Data layer for gene expression (e.g., "count", "counts", "log1p").',
            },
        ),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)
490
+
177
491
 
178
492
def add_format_sumstats_args(parser):
    """
    Add the options of the GWAS summary-statistics formatting step to ``parser``.

    The options fall into four groups: required input/output paths, column
    name overrides, options for converting SNP (chr, pos) to rsid, and output
    format / quality filters.
    """
    # Required arguments
    parser.add_argument("--sumstats", required=True, type=str, help="Path to gwas summary data")
    parser.add_argument(
        "--out", required=True, type=str, help="Path to save the formatted gwas data"
    )

    # Arguments for specifying column names. All default to None; `--n` is
    # parsed with str_or_float, every other option is a plain string.
    column_options = [
        ("--snp", str, "Name of snp column (if not a name that gsMap understands)"),
        ("--a1", str, "Name of effect allele column (if not a name that gsMap understands)"),
        ("--a2", str, "Name of none-effect allele column (if not a name that gsMap understands)"),
        ("--info", str, "Name of info column (if not a name that gsMap understands)"),
        ("--beta", str, "Name of gwas beta column (if not a name that gsMap understands)."),
        (
            "--se",
            str,
            "Name of gwas standar error of beta column (if not a name that gsMap understands)",
        ),
        ("--p", str, "Name of p-value column (if not a name that gsMap understands)"),
        ("--frq", str, "Name of A1 ferquency column (if not a name that gsMap understands)"),
        (
            "--n",
            str_or_float,
            "Name of sample size column (if not a name that gsMap understands)",
        ),
        ("--z", str, "Name of gwas Z-statistics column (if not a name that gsMap understands)"),
        ("--OR", str, "Name of gwas OR column (if not a name that gsMap understands)"),
        (
            "--se_OR",
            str,
            "Name of standar error of OR column (if not a name that gsMap understands)",
        ),
    ]
    for flag, converter, help_text in column_options:
        parser.add_argument(flag, default=None, type=converter, help=help_text)

    # Arguments for converting SNP (chr, pos) to rsid
    parser.add_argument(
        "--chr",
        default="Chr",
        type=str,
        help="Name of SNP chromosome column (if not a name that gsMap understands)",
    )
    parser.add_argument(
        "--pos",
        default="Pos",
        type=str,
        help="Name of SNP positions column (if not a name that gsMap understands)",
    )
    parser.add_argument("--dbsnp", default=None, type=str, help="Path to reference dnsnp file")
    parser.add_argument(
        "--chunksize",
        # argparse applies `type` only to string defaults, so a float literal
        # such as 1e6 would stay a float; use an int literal instead.
        default=1_000_000,
        type=int,
        help="Chunk size for loading dbsnp file",
    )

    # Arguments for output format and quality
    parser.add_argument(
        "--format",
        default="gsMap",
        type=str,
        help="Format of output data",
        choices=["gsMap", "COJO"],
    )
    parser.add_argument("--info_min", default=0.9, type=float, help="Minimum INFO score.")
    parser.add_argument("--maf_min", default=0.01, type=float, help="Minimum MAF.")
    parser.add_argument(
        "--keep_chr_pos",
        action="store_true",
        default=False,
        help="Keep SNP chromosome and position columns in the output data",
    )
607
+
230
608
 
231
609
  def add_run_all_mode_args(parser):
232
610
  add_shared_args(parser)
233
611
 
234
612
  # Required paths and configurations
235
- parser.add_argument('--gsMap_resource_dir', type=str, required=True,
236
- help='Directory containing gsMap resources (e.g., genome annotations, LD reference panel, etc.).')
237
- parser.add_argument('--hdf5_path', type=str, required=True,
238
- help='Path to the input spatial transcriptomics data (H5AD format).')
239
- parser.add_argument('--annotation', type=str, required=True,
240
- help='Name of the annotation in adata.obs to use.')
241
- parser.add_argument('--data_layer', type=str, default='counts', required=True,
242
- help='Data layer for gene expression (e.g., "count", "counts", "log1p").')
613
+ parser.add_argument(
614
+ "--gsMap_resource_dir",
615
+ type=str,
616
+ required=True,
617
+ help="Directory containing gsMap resources (e.g., genome annotations, LD reference panel, etc.).",
618
+ )
619
+ parser.add_argument(
620
+ "--hdf5_path",
621
+ type=str,
622
+ required=True,
623
+ help="Path to the input spatial transcriptomics data (H5AD format).",
624
+ )
625
+ parser.add_argument(
626
+ "--annotation", type=str, required=True, help="Name of the annotation in adata.obs to use."
627
+ )
628
+ parser.add_argument(
629
+ "--data_layer",
630
+ type=str,
631
+ default="counts",
632
+ required=True,
633
+ help='Data layer for gene expression (e.g., "count", "counts", "log1p").',
634
+ )
243
635
 
244
636
  # GWAS Data Parameters
245
- parser.add_argument('--trait_name', type=str, help='Name of the trait for GWAS analysis (required if sumstats_file is provided).')
246
- parser.add_argument('--sumstats_file', type=str,
247
- help='Path to GWAS summary statistics file. Either sumstats_file or sumstats_config_file is required.')
248
- parser.add_argument('--sumstats_config_file', type=str,
249
- help='Path to GWAS summary statistics config file. Either sumstats_file or sumstats_config_file is required.')
637
+ parser.add_argument(
638
+ "--trait_name",
639
+ type=str,
640
+ help="Name of the trait for GWAS analysis (required if sumstats_file is provided).",
641
+ )
642
+ parser.add_argument(
643
+ "--sumstats_file",
644
+ type=str,
645
+ help="Path to GWAS summary statistics file. Either sumstats_file or sumstats_config_file is required.",
646
+ )
647
+ parser.add_argument(
648
+ "--sumstats_config_file",
649
+ type=str,
650
+ help="Path to GWAS summary statistics config file. Either sumstats_file or sumstats_config_file is required.",
651
+ )
250
652
 
251
653
  # Homolog Data Parameters
252
- parser.add_argument('--homolog_file', type=str,
253
- help='Path to homologous gene for converting gene names from different species to human (optional, used for cross-species analysis).')
654
+ parser.add_argument(
655
+ "--homolog_file",
656
+ type=str,
657
+ help="Path to homologous gene for converting gene names from different species to human (optional, used for cross-species analysis).",
658
+ )
254
659
 
255
660
  # Maximum number of processes
256
- parser.add_argument('--max_processes', type=int, default=10,
257
- help='Maximum number of processes for parallel execution.')
258
-
259
- # # Optional paths for customization
260
- # parser.add_argument('--bfile_root', type=str,
261
- # help='Root path to PLINK bfiles (LD reference panel). If not provided, it will use the default in gsMap_resource_dir.')
262
- # parser.add_argument('--keep_snp_root', type=str,
263
- # help='Root path for SNP filtering. If not provided, it will use the default in gsMap_resource_dir.')
264
- # parser.add_argument('--w_file', type=str,
265
- # help='Path to the regression weight file. If not provided, it will use the default in gsMap_resource_dir.')
266
- # parser.add_argument('--snp_gene_weight_adata_path', type=str,
267
- # help='Path to the SNP-gene weight matrix file. If not provided, it will use the default in gsMap_resource_dir.')
268
- # parser.add_argument('--baseline_annotation_dir', type=str,
269
- # help='Directory containing the baseline annotations for quick mode. If not provided, it will use the default in gsMap_resource_dir.')
270
- # parser.add_argument('--SNP_gene_pair_dir', type=str,
271
- # help='Directory for SNP-gene pair data. If not provided, it will use the default in gsMap_resource_dir.')
661
+ parser.add_argument(
662
+ "--max_processes",
663
+ type=int,
664
+ default=10,
665
+ help="Maximum number of processes for parallel execution.",
666
+ )
667
+
668
+ parser.add_argument(
669
+ "--latent_representation",
670
+ type=str,
671
+ default=None,
672
+ help="Type of latent representation. This should exist in the h5ad obsm.",
673
+ )
674
+ parser.add_argument("--num_neighbour", type=int, default=21, help="Number of neighbors.")
675
+ parser.add_argument(
676
+ "--num_neighbour_spatial", type=int, default=101, help="Number of spatial neighbors."
677
+ )
678
+ parser.add_argument(
679
+ "--gM_slices", type=str, default=None, help="Path to the slice mean file (optional)."
680
+ )
272
681
 
273
682
 
274
683
  def ensure_path_exists(func):
@@ -288,75 +697,136 @@ def ensure_path_exists(func):
288
697
  @dataclass
289
698
  class ConfigWithAutoPaths:
290
699
  workdir: str
291
- sample_name: str
700
+ sample_name: str | None
292
701
 
293
702
  def __post_init__(self):
294
703
  if self.workdir is None:
295
- raise ValueError('workdir must be provided.')
704
+ raise ValueError("workdir must be provided.")
296
705
 
297
706
  @property
298
707
  @ensure_path_exists
299
708
  def hdf5_with_latent_path(self) -> Path:
300
- return Path(f'{self.workdir}/{self.sample_name}/find_latent_representations/{self.sample_name}_add_latent.h5ad')
709
+ return Path(
710
+ f"{self.workdir}/{self.sample_name}/find_latent_representations/{self.sample_name}_add_latent.h5ad"
711
+ )
301
712
 
302
713
  @property
303
714
  @ensure_path_exists
304
715
  def mkscore_feather_path(self) -> Path:
305
- return Path(f'{self.workdir}/{self.sample_name}/latent_to_gene/{self.sample_name}_gene_marker_score.feather')
716
+ return Path(
717
+ f"{self.workdir}/{self.sample_name}/latent_to_gene/{self.sample_name}_gene_marker_score.feather"
718
+ )
306
719
 
307
720
  @property
308
721
  @ensure_path_exists
309
722
  def ldscore_save_dir(self) -> Path:
310
- return Path(f'{self.workdir}/{self.sample_name}/generate_ldscore')
723
+ return Path(f"{self.workdir}/{self.sample_name}/generate_ldscore")
311
724
 
312
725
  @property
313
726
  @ensure_path_exists
314
727
  def ldsc_save_dir(self) -> Path:
315
- return Path(f'{self.workdir}/{self.sample_name}/spatial_ldsc')
728
+ return Path(f"{self.workdir}/{self.sample_name}/spatial_ldsc")
316
729
 
317
730
  @property
318
731
  @ensure_path_exists
319
732
  def cauchy_save_dir(self) -> Path:
320
- return Path(f'{self.workdir}/{self.sample_name}/cauchy_combination')
733
+ return Path(f"{self.workdir}/{self.sample_name}/cauchy_combination")
321
734
 
322
735
  @ensure_path_exists
323
736
  def get_report_dir(self, trait_name: str) -> Path:
324
- return Path(f'{self.workdir}/{self.sample_name}/report/{trait_name}')
737
+ return Path(f"{self.workdir}/{self.sample_name}/report/{trait_name}")
325
738
 
326
739
  def get_gsMap_report_file(self, trait_name: str) -> Path:
327
- return self.get_report_dir(trait_name) / f'{self.sample_name}_{trait_name}_gsMap_Report.html'
740
+ return (
741
+ self.get_report_dir(trait_name) / f"{self.sample_name}_{trait_name}_gsMap_Report.html"
742
+ )
328
743
 
329
744
  @ensure_path_exists
330
745
  def get_manhattan_html_plot_path(self, trait_name: str) -> Path:
331
746
  return Path(
332
- f'{self.workdir}/{self.sample_name}/report/{trait_name}/manhattan_plot/{self.sample_name}_{trait_name}_Diagnostic_Manhattan_Plot.html')
747
+ f"{self.workdir}/{self.sample_name}/report/{trait_name}/manhattan_plot/{self.sample_name}_{trait_name}_Diagnostic_Manhattan_Plot.html"
748
+ )
333
749
 
334
750
  @ensure_path_exists
335
751
  def get_GSS_plot_dir(self, trait_name: str) -> Path:
336
- return Path(f'{self.workdir}/{self.sample_name}/report/{trait_name}/GSS_plot')
752
+ return Path(f"{self.workdir}/{self.sample_name}/report/{trait_name}/GSS_plot")
337
753
 
338
754
  def get_GSS_plot_select_gene_file(self, trait_name: str) -> Path:
339
- return self.get_GSS_plot_dir(trait_name) / 'plot_genes.csv'
755
+ return self.get_GSS_plot_dir(trait_name) / "plot_genes.csv"
340
756
 
341
757
  @ensure_path_exists
342
758
  def get_ldsc_result_file(self, trait_name: str) -> Path:
343
- return Path(f'{self.ldsc_save_dir}/{self.sample_name}_{trait_name}.csv.gz')
759
+ return Path(f"{self.ldsc_save_dir}/{self.sample_name}_{trait_name}.csv.gz")
344
760
 
345
761
  @ensure_path_exists
346
762
  def get_cauchy_result_file(self, trait_name: str) -> Path:
347
- return Path(f'{self.cauchy_save_dir}/{self.sample_name}_{trait_name}.Cauchy.csv.gz')
763
+ return Path(f"{self.cauchy_save_dir}/{self.sample_name}_{trait_name}.Cauchy.csv.gz")
348
764
 
349
765
  @ensure_path_exists
350
766
  def get_gene_diagnostic_info_save_path(self, trait_name: str) -> Path:
351
767
  return Path(
352
- f'{self.workdir}/{self.sample_name}/report/{trait_name}/{self.sample_name}_{trait_name}_Gene_Diagnostic_Info.csv')
768
+ f"{self.workdir}/{self.sample_name}/report/{trait_name}/{self.sample_name}_{trait_name}_Gene_Diagnostic_Info.csv"
769
+ )
353
770
 
354
771
  @ensure_path_exists
355
772
  def get_gsMap_plot_save_dir(self, trait_name: str) -> Path:
356
- return Path(f'{self.workdir}/{self.sample_name}/report/{trait_name}/gsMap_plot')
773
+ return Path(f"{self.workdir}/{self.sample_name}/report/{trait_name}/gsMap_plot")
357
774
 
358
775
  def get_gsMap_html_plot_save_path(self, trait_name: str) -> Path:
359
- return self.get_gsMap_plot_save_dir(trait_name) / f'{self.sample_name}_{trait_name}_gsMap_plot.html'
776
+ return (
777
+ self.get_gsMap_plot_save_dir(trait_name)
778
+ / f"{self.sample_name}_{trait_name}_gsMap_plot.html"
779
+ )
780
+
781
+
782
+ @dataclass
783
+ class CreateSliceMeanConfig:
784
+ slice_mean_output_file: str | Path
785
+ h5ad_yaml: str | dict | None = None
786
+ sample_name_list: list | None = None
787
+ h5ad_list: list | None = None
788
+ homolog_file: str | None = None
789
+ species: str | None = None
790
+ data_layer: str = None
791
+
792
+ def __post_init__(self):
793
+ if self.h5ad_list is None and self.h5ad_yaml is None:
794
+ raise ValueError("At least one of --h5ad_list or --h5ad_yaml must be provided.")
795
+ if self.h5ad_yaml is not None:
796
+ if isinstance(self.h5ad_yaml, str):
797
+ logger.info(f"Reading h5ad yaml file: {self.h5ad_yaml}")
798
+ h5ad_dict = (
799
+ yaml.safe_load(open(self.h5ad_yaml))
800
+ if isinstance(self.h5ad_yaml, str)
801
+ else self.h5ad_yaml
802
+ )
803
+ elif self.sample_name_list and self.h5ad_list:
804
+ logger.info("Reading sample name list and h5ad list")
805
+ h5ad_dict = dict(zip(self.sample_name_list, self.h5ad_list, strict=False))
806
+ else:
807
+ raise ValueError(
808
+ "Please provide either h5ad_yaml or both sample_name_list and h5ad_list."
809
+ )
810
+
811
+ # check if sample names is unique
812
+ assert len(h5ad_dict) == len(set(h5ad_dict)), "Sample names must be unique."
813
+ assert len(h5ad_dict) > 1, "At least two samples are required."
814
+
815
+ logger.info(f"Input h5ad files: {h5ad_dict}")
816
+
817
+ # Check if all files exist
818
+ self.h5ad_dict = {}
819
+ for sample_name, h5ad_file in h5ad_dict.items():
820
+ h5ad_file = Path(h5ad_file)
821
+ if not h5ad_file.exists():
822
+ raise FileNotFoundError(f"{h5ad_file} does not exist.")
823
+ self.h5ad_dict[sample_name] = h5ad_file
824
+
825
+ self.slice_mean_output_file = Path(self.slice_mean_output_file)
826
+ self.slice_mean_output_file.parent.mkdir(parents=True, exist_ok=True)
827
+
828
+ verify_homolog_file_format(self)
829
+
360
830
 
361
831
  @dataclass
362
832
  class FindLatentRepresentationsConfig(ConfigWithAutoPaths):
@@ -389,24 +859,27 @@ class FindLatentRepresentationsConfig(ConfigWithAutoPaths):
389
859
  # self.output_hdf5_path = self.hdf5_with_latent_path
390
860
  if self.hierarchically:
391
861
  if self.annotation is None:
392
- raise ValueError('annotation must be provided if hierarchically is True.')
862
+ raise ValueError("annotation must be provided if hierarchically is True.")
393
863
  logger.info(
394
- f'------Hierarchical mode is enabled. This will find the latent representations within each annotation.')
864
+ "------Hierarchical mode is enabled. This will find the latent representations within each annotation."
865
+ )
395
866
 
396
867
  # remind for not providing annotation
397
868
  if self.annotation is None:
398
869
  logger.warning(
399
- 'annotation is not provided. This will find the latent representations for the whole dataset.')
870
+ "annotation is not provided. This will find the latent representations for the whole dataset."
871
+ )
400
872
  else:
401
- logger.info(f'------Find latent representations for {self.annotation}...')
873
+ logger.info(f"------Find latent representations for {self.annotation}...")
402
874
 
403
875
 
404
876
  @dataclass
405
877
  class LatentToGeneConfig(ConfigWithAutoPaths):
406
878
  # input_hdf5_with_latent_path: str
407
879
  # output_feather_path: str
880
+ input_hdf5_path: str | Path = None
408
881
  no_expression_fraction: bool = False
409
- latent_representation: str = 'latent_GVAE'
882
+ latent_representation: str = None
410
883
  num_neighbour: int = 21
411
884
  num_neighbour_spatial: int = 101
412
885
  homolog_file: str = None
@@ -415,31 +888,61 @@ class LatentToGeneConfig(ConfigWithAutoPaths):
415
888
  species: str = None
416
889
 
417
890
  def __post_init__(self):
418
- if self.homolog_file is not None:
419
- logger.info(f"User provided homolog file to map gene names to human: {self.homolog_file}")
420
- # check the format of the homolog file
421
- with open(self.homolog_file, 'r') as f:
422
- first_line = f.readline().strip()
423
- _n_col = len(first_line.split())
424
- if _n_col != 2:
425
- raise ValueError(
426
- f"Invalid homolog file format. Expected 2 columns, first column should be other species gene name, second column should be human gene name. "
427
- f"Got {_n_col} columns in the first line.")
428
- else:
429
- first_col_name, second_col_name = first_line.split()
430
- self.species = first_col_name
431
- logger.info(
432
- f"Homolog file provided and will map gene name from column1:{first_col_name} to column2:{second_col_name}")
891
+ if self.input_hdf5_path is None:
892
+ self.input_hdf5_path = self.hdf5_with_latent_path
893
+ assert self.input_hdf5_path.exists(), (
894
+ f"{self.input_hdf5_path} does not exist. Please run FindLatentRepresentations first."
895
+ )
433
896
  else:
434
- logger.info("No homolog file provided. Run in human mode.")
897
+ assert Path(self.input_hdf5_path).exists(), f"{self.input_hdf5_path} does not exist."
898
+ # copy to self.hdf5_with_latent_path
899
+ import shutil
900
+
901
+ shutil.copy2(self.input_hdf5_path, self.hdf5_with_latent_path)
902
+
903
+ if self.latent_representation is not None:
904
+ logger.info(f"Using the provided latent representation: {self.latent_representation}")
905
+ else:
906
+ self.latent_representation = "latent_GVAE"
907
+ logger.info(f"Using default latent representation: {self.latent_representation}")
908
+
909
+ if self.gM_slices is not None:
910
+ assert Path(self.gM_slices).exists(), f"{self.gM_slices} does not exist."
911
+ logger.info(f"Using the provided slice mean file: {self.gM_slices}.")
912
+
913
+ verify_homolog_file_format(self)
914
+
915
+
916
+ def verify_homolog_file_format(config):
917
+ if config.homolog_file is not None:
918
+ logger.info(
919
+ f"User provided homolog file to map gene names to human: {config.homolog_file}"
920
+ )
921
+ # check the format of the homolog file
922
+ with open(config.homolog_file) as f:
923
+ first_line = f.readline().strip()
924
+ _n_col = len(first_line.split())
925
+ if _n_col != 2:
926
+ raise ValueError(
927
+ f"Invalid homolog file format. Expected 2 columns, first column should be other species gene name, second column should be human gene name. "
928
+ f"Got {_n_col} columns in the first line."
929
+ )
930
+ else:
931
+ first_col_name, second_col_name = first_line.split()
932
+ config.species = first_col_name
933
+ logger.info(
934
+ f"Homolog file provided and will map gene name from column1:{first_col_name} to column2:{second_col_name}"
935
+ )
936
+ else:
937
+ logger.info("No homolog file provided. Run in human mode.")
435
938
 
436
939
 
437
940
  @dataclass
438
941
  class GenerateLDScoreConfig(ConfigWithAutoPaths):
439
- chrom: Union[int, str]
942
+ chrom: int | str
440
943
 
441
944
  bfile_root: str
442
- keep_snp_root: Optional[str]
945
+ keep_snp_root: str | None
443
946
 
444
947
  # annotation by gene distance
445
948
  gtf_annotation_file: str
@@ -447,74 +950,106 @@ class GenerateLDScoreConfig(ConfigWithAutoPaths):
447
950
 
448
951
  # annotation by enhancer
449
952
  enhancer_annotation_file: str = None
450
- snp_multiple_enhancer_strategy: Literal['max_mkscore', 'nearest_TSS'] = 'max_mkscore'
451
- gene_window_enhancer_priority: Optional[Literal['gene_window_first', 'enhancer_first', 'enhancer_only',]] = None
953
+ snp_multiple_enhancer_strategy: Literal["max_mkscore", "nearest_TSS"] = "max_mkscore"
954
+ gene_window_enhancer_priority: (
955
+ Literal["gene_window_first", "enhancer_first", "enhancer_only"] | None
956
+ ) = None
452
957
 
453
958
  # for calculating ld score
454
959
  additional_baseline_annotation: str = None
455
960
  spots_per_chunk: int = 1_000
456
961
  ld_wind: int = 1
457
- ld_unit: str = 'CM'
962
+ ld_unit: str = "CM"
458
963
 
459
964
  # zarr config
460
- ldscore_save_format: Literal['feather', 'zarr', 'quick_mode'] = 'feather'
965
+ ldscore_save_format: Literal["feather", "zarr", "quick_mode"] = "feather"
461
966
 
462
- zarr_chunk_size: Tuple[int, int] = None
967
+ zarr_chunk_size: tuple[int, int] = None
463
968
 
464
969
  # for pre calculating the SNP Gene ldscore Weight
465
970
  save_pre_calculate_snp_gene_weight_matrix: bool = False
466
971
 
467
- baseline_annotation_dir: Optional[str] = None
468
- SNP_gene_pair_dir: Optional[str] = None
972
+ baseline_annotation_dir: str | None = None
973
+ SNP_gene_pair_dir: str | None = None
974
+
469
975
  def __post_init__(self):
470
976
  # if self.mkscore_feather_file is None:
471
977
  # self.mkscore_feather_file = self._get_mkscore_feather_path()
472
978
 
473
- if self.enhancer_annotation_file is not None and self.gene_window_enhancer_priority is None:
474
- logger.warning("enhancer_annotation_file is provided but gene_window_enhancer_priority is not provided. "
475
- "by default, gene_window_enhancer_priority is set to 'enhancer_only', when enhancer_annotation_file is provided.")
476
- self.gene_window_enhancer_priority = 'enhancer_only'
477
- if self.enhancer_annotation_file is None and self.gene_window_enhancer_priority is not None:
478
- logger.warning("gene_window_enhancer_priority is provided but enhancer_annotation_file is not provided. "
479
- "by default, gene_window_enhancer_priority is set to None, when enhancer_annotation_file is not provided.")
979
+ if (
980
+ self.enhancer_annotation_file is not None
981
+ and self.gene_window_enhancer_priority is None
982
+ ):
983
+ logger.warning(
984
+ "enhancer_annotation_file is provided but gene_window_enhancer_priority is not provided. "
985
+ "by default, gene_window_enhancer_priority is set to 'enhancer_only', when enhancer_annotation_file is provided."
986
+ )
987
+ self.gene_window_enhancer_priority = "enhancer_only"
988
+ if (
989
+ self.enhancer_annotation_file is None
990
+ and self.gene_window_enhancer_priority is not None
991
+ ):
992
+ logger.warning(
993
+ "gene_window_enhancer_priority is provided but enhancer_annotation_file is not provided. "
994
+ "by default, gene_window_enhancer_priority is set to None, when enhancer_annotation_file is not provided."
995
+ )
480
996
  self.gene_window_enhancer_priority = None
481
- assert self.gene_window_enhancer_priority in [None, 'gene_window_first', 'enhancer_first', 'enhancer_only', ], \
997
+ assert self.gene_window_enhancer_priority in [
998
+ None,
999
+ "gene_window_first",
1000
+ "enhancer_first",
1001
+ "enhancer_only",
1002
+ ], (
482
1003
  f"gene_window_enhancer_priority must be one of None, 'gene_window_first', 'enhancer_first', 'enhancer_only', but got {self.gene_window_enhancer_priority}."
483
- if self.gene_window_enhancer_priority in ['gene_window_first', 'enhancer_first']:
484
- logger.info(f'Both gene_window and enhancer annotation will be used to calculate LD score. ')
1004
+ )
1005
+ if self.gene_window_enhancer_priority in ["gene_window_first", "enhancer_first"]:
485
1006
  logger.info(
486
- f'SNP within +-{self.gene_window_size} bp of gene body will be used and enhancer annotation will be used to calculate LD score. If a snp maps to multiple enhancers, the strategy to choose by your select strategy: {self.snp_multiple_enhancer_strategy}.')
487
- elif self.gene_window_enhancer_priority == 'enhancer_only':
488
- logger.info(f'Only enhancer annotation will be used to calculate LD score. ')
1007
+ "Both gene_window and enhancer annotation will be used to calculate LD score. "
1008
+ )
1009
+ logger.info(
1010
+ f"SNP within +-{self.gene_window_size} bp of gene body will be used and enhancer annotation will be used to calculate LD score. If a snp maps to multiple enhancers, the strategy to choose by your select strategy: {self.snp_multiple_enhancer_strategy}."
1011
+ )
1012
+ elif self.gene_window_enhancer_priority == "enhancer_only":
1013
+ logger.info("Only enhancer annotation will be used to calculate LD score. ")
489
1014
  else:
490
1015
  logger.info(
491
- f'Only gene window annotation will be used to calculate LD score. SNP within +-{self.gene_window_size} bp of gene body will be used. ')
1016
+ f"Only gene window annotation will be used to calculate LD score. SNP within +-{self.gene_window_size} bp of gene body will be used. "
1017
+ )
492
1018
 
493
1019
  # remind for baseline annotation
494
1020
  if self.additional_baseline_annotation is None:
495
- logger.info(f'------Baseline annotation is not provided. Default baseline annotation will be used.')
1021
+ logger.info(
1022
+ "------Baseline annotation is not provided. Default baseline annotation will be used."
1023
+ )
496
1024
  else:
497
1025
  logger.info(
498
- f'------Baseline annotation is provided. Additional baseline annotation will be used with the default baseline annotation.')
499
- logger.info(f'------Baseline annotation directory: {self.additional_baseline_annotation}')
1026
+ "------Baseline annotation is provided. Additional baseline annotation will be used with the default baseline annotation."
1027
+ )
1028
+ logger.info(
1029
+ f"------Baseline annotation directory: {self.additional_baseline_annotation}"
1030
+ )
500
1031
  # check the existence of baseline annotation
501
- if self.chrom == 'all':
1032
+ if self.chrom == "all":
502
1033
  for chrom in range(1, 23):
503
1034
  chrom = str(chrom)
504
- baseline_annotation_path = Path(
505
- self.additional_baseline_annotation) / f'baseline.{chrom}.annot.gz'
1035
+ baseline_annotation_path = (
1036
+ Path(self.additional_baseline_annotation) / f"baseline.{chrom}.annot.gz"
1037
+ )
506
1038
  if not baseline_annotation_path.exists():
507
1039
  raise FileNotFoundError(
508
- f'baseline.{chrom}.annot.gz is not found in {self.additional_baseline_annotation}.')
1040
+ f"baseline.{chrom}.annot.gz is not found in {self.additional_baseline_annotation}."
1041
+ )
509
1042
  else:
510
- baseline_annotation_path = Path(
511
- self.additional_baseline_annotation) / f'baseline.{self.chrom}.annot.gz'
1043
+ baseline_annotation_path = (
1044
+ Path(self.additional_baseline_annotation) / f"baseline.{self.chrom}.annot.gz"
1045
+ )
512
1046
  if not baseline_annotation_path.exists():
513
1047
  raise FileNotFoundError(
514
- f'baseline.{self.chrom}.annot.gz is not found in {self.additional_baseline_annotation}.')
1048
+ f"baseline.{self.chrom}.annot.gz is not found in {self.additional_baseline_annotation}."
1049
+ )
515
1050
 
516
1051
  # set the default zarr chunk size
517
- if self.ldscore_save_format == 'zarr' and self.zarr_chunk_size is None:
1052
+ if self.ldscore_save_format == "zarr" and self.zarr_chunk_size is None:
518
1053
  self.zarr_chunk_size = (10_000, self.spots_per_chunk)
519
1054
 
520
1055
 
@@ -523,56 +1058,61 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):
523
1058
  w_file: str
524
1059
  # ldscore_save_dir: str
525
1060
  use_additional_baseline_annotation: bool = True
526
- trait_name: Optional[str] = None
527
- sumstats_file: Optional[str] = None
528
- sumstats_config_file: Optional[str] = None
1061
+ trait_name: str | None = None
1062
+ sumstats_file: str | None = None
1063
+ sumstats_config_file: str | None = None
529
1064
  num_processes: int = 4
530
1065
  not_M_5_50: bool = False
531
1066
  n_blocks: int = 200
532
- chisq_max: Optional[int] = None
533
- all_chunk: Optional[int] = None
534
- chunk_range: Optional[Tuple[int, int]] = None
1067
+ chisq_max: int | None = None
1068
+ all_chunk: int | None = None
1069
+ chunk_range: tuple[int, int] | None = None
535
1070
 
536
- ldscore_save_format: Literal['feather', 'zarr', 'quick_mode'] = 'feather'
1071
+ ldscore_save_format: Literal["feather", "zarr", "quick_mode"] = "feather"
537
1072
 
538
1073
  spots_per_chunk_quick_mode: int = 1_000
539
- snp_gene_weight_adata_path: Optional[str] = None
1074
+ snp_gene_weight_adata_path: str | None = None
540
1075
 
541
1076
  def __post_init__(self):
542
1077
  super().__post_init__()
543
1078
  if self.sumstats_file is None and self.sumstats_config_file is None:
544
- raise ValueError('One of sumstats_file and sumstats_config_file must be provided.')
1079
+ raise ValueError("One of sumstats_file and sumstats_config_file must be provided.")
545
1080
  if self.sumstats_file is not None and self.sumstats_config_file is not None:
546
- raise ValueError('Only one of sumstats_file and sumstats_config_file must be provided.')
1081
+ raise ValueError(
1082
+ "Only one of sumstats_file and sumstats_config_file must be provided."
1083
+ )
547
1084
  if self.sumstats_file is not None and self.trait_name is None:
548
- raise ValueError('trait_name must be provided if sumstats_file is provided.')
1085
+ raise ValueError("trait_name must be provided if sumstats_file is provided.")
549
1086
  if self.sumstats_config_file is not None and self.trait_name is not None:
550
- raise ValueError('trait_name must not be provided if sumstats_config_file is provided.')
1087
+ raise ValueError(
1088
+ "trait_name must not be provided if sumstats_config_file is provided."
1089
+ )
551
1090
  self.sumstats_config_dict = {}
552
1091
  # load the sumstats config file
553
1092
  if self.sumstats_config_file is not None:
554
1093
  import yaml
1094
+
555
1095
  with open(self.sumstats_config_file) as f:
556
1096
  config = yaml.load(f, Loader=yaml.FullLoader)
557
- for trait_name, sumstats_file in config.items():
558
- assert Path(sumstats_file).exists(), f'{sumstats_file} does not exist.'
1097
+ for _trait_name, sumstats_file in config.items():
1098
+ assert Path(sumstats_file).exists(), f"{sumstats_file} does not exist."
559
1099
  # load the sumstats file
560
1100
  elif self.sumstats_file is not None:
561
1101
  self.sumstats_config_dict[self.trait_name] = self.sumstats_file
562
1102
  else:
563
- raise ValueError('One of sumstats_file and sumstats_config_file must be provided.')
1103
+ raise ValueError("One of sumstats_file and sumstats_config_file must be provided.")
564
1104
 
565
1105
  for sumstats_file in self.sumstats_config_dict.values():
566
- assert Path(sumstats_file).exists(), f'{sumstats_file} does not exist.'
1106
+ assert Path(sumstats_file).exists(), f"{sumstats_file} does not exist."
567
1107
 
568
1108
  # check if additional baseline annotation is exist
569
1109
  # self.use_additional_baseline_annotation = False
570
-
1110
+
571
1111
  if self.use_additional_baseline_annotation:
572
1112
  self.process_additional_baseline_annotation()
573
1113
 
574
1114
  def process_additional_baseline_annotation(self):
575
- additional_baseline_annotation = Path(self.ldscore_save_dir) / 'additional_baseline'
1115
+ additional_baseline_annotation = Path(self.ldscore_save_dir) / "additional_baseline"
576
1116
  dir_exists = additional_baseline_annotation.exists()
577
1117
 
578
1118
  if not dir_exists:
@@ -580,7 +1120,7 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):
580
1120
  # if self.use_additional_baseline_annotation:
581
1121
  # logger.warning(f"additional_baseline directory is not found in {self.ldscore_save_dir}.")
582
1122
  # print('''\
583
- # if you want to use additional baseline annotation,
1123
+ # if you want to use additional baseline annotation,
584
1124
  # please provide additional baseline annotation when calculating ld score.
585
1125
  # ''')
586
1126
  # raise FileNotFoundError(
@@ -589,15 +1129,21 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):
589
1129
  # self.use_additional_baseline_annotation = self.use_additional_baseline_annotation or True
590
1130
  else:
591
1131
  logger.info(
592
- f'------Additional baseline annotation is provided. It will be used with the default baseline annotation.')
593
- logger.info(f'------Additional baseline annotation directory: {additional_baseline_annotation}')
1132
+ "------Additional baseline annotation is provided. It will be used with the default baseline annotation."
1133
+ )
1134
+ logger.info(
1135
+ f"------Additional baseline annotation directory: {additional_baseline_annotation}"
1136
+ )
594
1137
 
595
1138
  chrom_list = range(1, 23)
596
1139
  for chrom in chrom_list:
597
- baseline_annotation_path = additional_baseline_annotation / f'baseline.{chrom}.l2.ldscore.feather'
1140
+ baseline_annotation_path = (
1141
+ additional_baseline_annotation / f"baseline.{chrom}.l2.ldscore.feather"
1142
+ )
598
1143
  if not baseline_annotation_path.exists():
599
1144
  raise FileNotFoundError(
600
- f'baseline.{chrom}.annot.gz is not found in {additional_baseline_annotation}.')
1145
+ f"baseline.{chrom}.annot.gz is not found in {additional_baseline_annotation}."
1146
+ )
601
1147
  return None
602
1148
 
603
1149
 
@@ -605,8 +1151,25 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):
605
1151
  class CauchyCombinationConfig(ConfigWithAutoPaths):
606
1152
  trait_name: str
607
1153
  annotation: str
608
- meta: str = None
609
- slide: str = None
1154
+ sample_name_list: list[str] = dataclasses.field(default_factory=list)
1155
+ output_file: str | Path | None = None
1156
+
1157
+ def __post_init__(self):
1158
+ if self.sample_name is not None:
1159
+ if self.sample_name_list and len(self.sample_name_list) > 0:
1160
+ raise ValueError("Only one of sample_name and sample_name_list must be provided.")
1161
+ else:
1162
+ self.sample_name_list = [self.sample_name]
1163
+ self.output_file = (
1164
+ self.get_cauchy_result_file(self.trait_name)
1165
+ if self.output_file is None
1166
+ else self.output_file
1167
+ )
1168
+ else:
1169
+ assert len(self.sample_name_list) > 0, "At least one sample name must be provided."
1170
+ assert self.output_file is not None, (
1171
+ "Output_file must be provided if sample_name_list is provided."
1172
+ )
610
1173
 
611
1174
 
612
1175
  @dataclass
@@ -618,7 +1181,7 @@ class VisualizeConfig(ConfigWithAutoPaths):
618
1181
  fig_height: int = 600
619
1182
  fig_width: int = 800
620
1183
  point_size: int = None
621
- fig_style: Literal['dark', 'light'] = 'light'
1184
+ fig_style: Literal["dark", "light"] = "light"
622
1185
 
623
1186
 
624
1187
  @dataclass
@@ -628,22 +1191,26 @@ class DiagnosisConfig(ConfigWithAutoPaths):
628
1191
 
629
1192
  trait_name: str
630
1193
  sumstats_file: str
631
- plot_type: Literal['manhattan', 'GSS', 'gsMap', 'all'] = 'all'
1194
+ plot_type: Literal["manhattan", "GSS", "gsMap", "all"] = "all"
632
1195
  top_corr_genes: int = 50
633
- selected_genes: Optional[List[str]] = None
1196
+ selected_genes: list[str] | None = None
634
1197
 
635
- fig_width: Optional[int] = None
636
- fig_height: Optional[int] = None
637
- point_size: Optional[int] = None
638
- fig_style: Literal['dark', 'light'] = 'light'
1198
+ fig_width: int | None = None
1199
+ fig_height: int | None = None
1200
+ point_size: int | None = None
1201
+ fig_style: Literal["dark", "light"] = "light"
639
1202
 
640
1203
  def __post_init__(self):
641
1204
  if any([self.fig_width, self.fig_height, self.point_size]):
642
- logger.info('Customizing the figure size and point size.')
643
- assert all([self.fig_width, self.fig_height, self.point_size]), 'All of fig_width, fig_height, and point_size must be provided.'
1205
+ logger.info("Customizing the figure size and point size.")
1206
+ assert all([self.fig_width, self.fig_height, self.point_size]), (
1207
+ "All of fig_width, fig_height, and point_size must be provided."
1208
+ )
644
1209
  self.customize_fig = True
645
1210
  else:
646
1211
  self.customize_fig = False
1212
+
1213
+
647
1214
  @dataclass
648
1215
  class ReportConfig(DiagnosisConfig):
649
1216
  pass
@@ -656,57 +1223,79 @@ class RunAllModeConfig(ConfigWithAutoPaths):
656
1223
  # == ST DATA PARAMETERS ==
657
1224
  hdf5_path: str
658
1225
  annotation: str
659
- data_layer: str = 'X'
1226
+ data_layer: str = "X"
1227
+
1228
+ # == Find Latent Representation PARAMETERS ==
1229
+ n_comps: int = 300
1230
+
1231
+ # == latent 2 Gene PARAMETERS ==
1232
+ gM_slices: str | None = None
1233
+ latent_representation: str = None
1234
+ num_neighbour: int = 21
1235
+ num_neighbour_spatial: int = 101
660
1236
 
661
1237
  # ==GWAS DATA PARAMETERS==
662
- trait_name: Optional[str] = None
663
- sumstats_file: Optional[str] = None
664
- sumstats_config_file: Optional[str] = None
1238
+ trait_name: str | None = None
1239
+ sumstats_file: str | None = None
1240
+ sumstats_config_file: str | None = None
665
1241
 
666
1242
  # === homolog PARAMETERS ===
667
- homolog_file: Optional[str] = None
1243
+ homolog_file: str | None = None
668
1244
 
669
1245
  max_processes: int = 10
670
1246
 
671
1247
  def __post_init__(self):
672
1248
  super().__post_init__()
673
- self.gtffile = f"{self.gsMap_resource_dir}/genome_annotation/gtf/gencode.v39lift37.annotation.gtf"
674
- self.bfile_root = f"{self.gsMap_resource_dir}/LD_Reference_Panel/1000G_EUR_Phase3_plink/1000G.EUR.QC"
1249
+ self.gtffile = f"{self.gsMap_resource_dir}/genome_annotation/gtf/gencode.v46lift37.basic.annotation.gtf"
1250
+ self.bfile_root = (
1251
+ f"{self.gsMap_resource_dir}/LD_Reference_Panel/1000G_EUR_Phase3_plink/1000G.EUR.QC"
1252
+ )
675
1253
  self.keep_snp_root = f"{self.gsMap_resource_dir}/LDSC_resource/hapmap3_snps/hm"
676
1254
  self.w_file = f"{self.gsMap_resource_dir}/LDSC_resource/weights_hm3_no_hla/weights."
677
- self.snp_gene_weight_adata_path = f"{self.gsMap_resource_dir}/quick_mode/snp_gene_weight_matrix.h5ad"
678
- self.baseline_annotation_dir = Path(f"{self.gsMap_resource_dir}/quick_mode/baseline").resolve()
679
- self.SNP_gene_pair_dir = Path(f"{self.gsMap_resource_dir}/quick_mode/SNP_gene_pair").resolve()
1255
+ self.snp_gene_weight_adata_path = (
1256
+ f"{self.gsMap_resource_dir}/quick_mode/snp_gene_weight_matrix.h5ad"
1257
+ )
1258
+ self.baseline_annotation_dir = Path(
1259
+ f"{self.gsMap_resource_dir}/quick_mode/baseline"
1260
+ ).resolve()
1261
+ self.SNP_gene_pair_dir = Path(
1262
+ f"{self.gsMap_resource_dir}/quick_mode/SNP_gene_pair"
1263
+ ).resolve()
680
1264
  # check the existence of the input files and resources files
681
1265
  for file in [self.hdf5_path, self.gtffile]:
682
1266
  if not Path(file).exists():
683
1267
  raise FileNotFoundError(f"File {file} does not exist.")
684
1268
 
685
1269
  if self.sumstats_file is None and self.sumstats_config_file is None:
686
- raise ValueError('One of sumstats_file and sumstats_config_file must be provided.')
1270
+ raise ValueError("One of sumstats_file and sumstats_config_file must be provided.")
687
1271
  if self.sumstats_file is not None and self.sumstats_config_file is not None:
688
- raise ValueError('Only one of sumstats_file and sumstats_config_file must be provided.')
1272
+ raise ValueError(
1273
+ "Only one of sumstats_file and sumstats_config_file must be provided."
1274
+ )
689
1275
  if self.sumstats_file is not None and self.trait_name is None:
690
- raise ValueError('trait_name must be provided if sumstats_file is provided.')
1276
+ raise ValueError("trait_name must be provided if sumstats_file is provided.")
691
1277
  if self.sumstats_config_file is not None and self.trait_name is not None:
692
- raise ValueError('trait_name must not be provided if sumstats_config_file is provided.')
1278
+ raise ValueError(
1279
+ "trait_name must not be provided if sumstats_config_file is provided."
1280
+ )
693
1281
  self.sumstats_config_dict = {}
694
1282
  # load the sumstats config file
695
1283
  if self.sumstats_config_file is not None:
696
1284
  import yaml
1285
+
697
1286
  with open(self.sumstats_config_file) as f:
698
1287
  config = yaml.load(f, Loader=yaml.FullLoader)
699
1288
  for trait_name, sumstats_file in config.items():
700
- assert Path(sumstats_file).exists(), f'{sumstats_file} does not exist.'
1289
+ assert Path(sumstats_file).exists(), f"{sumstats_file} does not exist."
701
1290
  self.sumstats_config_dict[trait_name] = sumstats_file
702
1291
  # load the sumstats file
703
1292
  elif self.sumstats_file is not None and self.trait_name is not None:
704
1293
  self.sumstats_config_dict[self.trait_name] = self.sumstats_file
705
1294
  else:
706
- raise ValueError('One of sumstats_file and sumstats_config_file must be provided.')
1295
+ raise ValueError("One of sumstats_file and sumstats_config_file must be provided.")
707
1296
 
708
1297
  for sumstats_file in self.sumstats_config_dict.values():
709
- assert Path(sumstats_file).exists(), f'{sumstats_file} does not exist.'
1298
+ assert Path(sumstats_file).exists(), f"{sumstats_file} does not exist."
710
1299
 
711
1300
 
712
1301
  @dataclass
@@ -722,85 +1311,122 @@ class FormatSumstatsConfig:
722
1311
  se: str = None
723
1312
  p: str = None
724
1313
  frq: str = None
725
- n: str = None
1314
+ n: str | int = None
726
1315
  z: str = None
727
1316
  OR: str = None
728
1317
  se_OR: str = None
729
1318
  format: str = None
730
1319
  chr: str = None
731
1320
  pos: str = None
732
- chunksize: int = 1e+7
1321
+ chunksize: int = 1e7
733
1322
  info_min: float = 0.9
734
1323
  maf_min: float = 0.01
735
1324
  keep_chr_pos: bool = False
736
1325
 
737
1326
 
738
- @register_cli(name='run_find_latent_representations',
739
- description='Run Find_latent_representations \nFind the latent representations of each spot by running GNN-VAE',
740
- add_args_function=add_find_latent_representations_args)
1327
+ @register_cli(
1328
+ name="quick_mode",
1329
+ description="Run the entire gsMap pipeline in quick mode, utilizing pre-computed weights for faster execution.",
1330
+ add_args_function=add_run_all_mode_args,
1331
+ )
1332
+ def run_all_mode_from_cli(args: argparse.Namespace):
1333
+ from gsMap.run_all_mode import run_pipeline
1334
+
1335
+ config = get_dataclass_from_parser(args, RunAllModeConfig)
1336
+ run_pipeline(config)
1337
+
1338
+
1339
+ @register_cli(
1340
+ name="run_find_latent_representations",
1341
+ description="Run Find_latent_representations \nFind the latent representations of each spot by running GNN",
1342
+ add_args_function=add_find_latent_representations_args,
1343
+ )
741
1344
  def run_find_latent_representation_from_cli(args: argparse.Namespace):
742
1345
  from gsMap.find_latent_representation import run_find_latent_representation
1346
+
743
1347
  config = get_dataclass_from_parser(args, FindLatentRepresentationsConfig)
744
1348
  run_find_latent_representation(config)
745
1349
 
746
1350
 
747
- @register_cli(name='run_latent_to_gene',
748
- description='Run Latent_to_gene \nEstimate gene marker gene scores for each spot by using latent representations from nearby spots',
749
- add_args_function=add_latent_to_gene_args)
1351
+ @register_cli(
1352
+ name="run_latent_to_gene",
1353
+ description="Run Latent_to_gene \nEstimate gene marker gene scores for each spot by using latent representations from nearby spots",
1354
+ add_args_function=add_latent_to_gene_args,
1355
+ )
750
1356
  def run_latent_to_gene_from_cli(args: argparse.Namespace):
751
1357
  from gsMap.latent_to_gene import run_latent_to_gene
1358
+
752
1359
  config = get_dataclass_from_parser(args, LatentToGeneConfig)
753
1360
  run_latent_to_gene(config)
754
1361
 
755
1362
 
756
- @register_cli(name='run_generate_ldscore',
757
- description='Run Generate_ldscore \nGenerate LD scores for each spot',
758
- add_args_function=add_generate_ldscore_args)
1363
+ @register_cli(
1364
+ name="run_generate_ldscore",
1365
+ description="Run Generate_ldscore \nGenerate LD scores for each spot",
1366
+ add_args_function=add_generate_ldscore_args,
1367
+ )
759
1368
  def run_generate_ldscore_from_cli(args: argparse.Namespace):
760
1369
  from gsMap.generate_ldscore import run_generate_ldscore
1370
+
761
1371
  config = get_dataclass_from_parser(args, GenerateLDScoreConfig)
762
1372
  run_generate_ldscore(config)
763
1373
 
764
1374
 
765
- @register_cli(name='run_spatial_ldsc',
766
- description='Run Spatial_ldsc \nRun spatial LDSC for each spot',
767
- add_args_function=add_spatial_ldsc_args)
1375
+ @register_cli(
1376
+ name="run_spatial_ldsc",
1377
+ description="Run Spatial_ldsc \nRun spatial LDSC for each spot",
1378
+ add_args_function=add_spatial_ldsc_args,
1379
+ )
768
1380
  def run_spatial_ldsc_from_cli(args: argparse.Namespace):
769
1381
  from gsMap.spatial_ldsc_multiple_sumstats import run_spatial_ldsc
1382
+
770
1383
  config = get_dataclass_from_parser(args, SpatialLDSCConfig)
771
1384
  run_spatial_ldsc(config)
772
1385
 
773
1386
 
774
- @register_cli(name='run_cauchy_combination',
775
- description='Run Cauchy_combination for each annotation',
776
- add_args_function=add_Cauchy_combination_args)
1387
+ @register_cli(
1388
+ name="run_cauchy_combination",
1389
+ description="Run Cauchy_combination for each annotation",
1390
+ add_args_function=add_Cauchy_combination_args,
1391
+ )
777
1392
  def run_Cauchy_combination_from_cli(args: argparse.Namespace):
778
1393
  from gsMap.cauchy_combination_test import run_Cauchy_combination
1394
+
779
1395
  config = get_dataclass_from_parser(args, CauchyCombinationConfig)
780
1396
  run_Cauchy_combination(config)
781
1397
 
782
1398
 
783
- @register_cli(name='run_report',
784
- description='Run Report to generate diagnostic plots and tables',
785
- add_args_function=add_report_args)
1399
+ @register_cli(
1400
+ name="run_report",
1401
+ description="Run Report to generate diagnostic plots and tables",
1402
+ add_args_function=add_report_args,
1403
+ )
786
1404
  def run_Report_from_cli(args: argparse.Namespace):
787
1405
  from gsMap.report import run_report
1406
+
788
1407
  config = get_dataclass_from_parser(args, ReportConfig)
789
1408
  run_report(config)
790
1409
 
791
1410
 
792
- @register_cli(name='format_sumstats',
793
- description='Format gwas summary statistics',
794
- add_args_function=add_format_sumstats_args)
1411
+ @register_cli(
1412
+ name="format_sumstats",
1413
+ description="Format GWAS summary statistics",
1414
+ add_args_function=add_format_sumstats_args,
1415
+ )
795
1416
  def gwas_format_from_cli(args: argparse.Namespace):
796
1417
  from gsMap.format_sumstats import gwas_format
1418
+
797
1419
  config = get_dataclass_from_parser(args, FormatSumstatsConfig)
798
1420
  gwas_format(config)
799
1421
 
800
- @register_cli(name='quick_mode',
801
- description='Run all the gsMap pipeline in quick mode',
802
- add_args_function=add_run_all_mode_args)
803
- def run_all_mode_from_cli(args: argparse.Namespace):
804
- from gsMap.run_all_mode import run_pipeline
805
- config = get_dataclass_from_parser(args, RunAllModeConfig)
806
- run_pipeline(config)
1422
+
1423
+ @register_cli(
1424
+ name="create_slice_mean",
1425
+ description="Create slice mean from multiple h5ad files",
1426
+ add_args_function=add_create_slice_mean_args,
1427
+ )
1428
+ def create_slice_mean_from_cli(args: argparse.Namespace):
1429
+ from gsMap.create_slice_mean import run_create_slice_mean
1430
+
1431
+ config = get_dataclass_from_parser(args, CreateSliceMeanConfig)
1432
+ run_create_slice_mean(config)