gsMap 1.72.3__py3-none-any.whl → 1.73.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsMap/GNN/train.py CHANGED
@@ -17,7 +17,7 @@ def reconstruction_loss(decoded, x):
 
 def label_loss(pred_label, true_label):
     """Compute the cross-entropy loss."""
-    return F.cross_entropy(pred_label, true_label)
+    return F.cross_entropy(pred_label, true_label.long())
 
 
 class ModelTrainer:
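
The `.long()` cast is the substance of this fix: `F.cross_entropy` expects integer (LongTensor) class indices as targets, and float targets either trip a dtype error or, when their shape matches the logits (PyTorch ≥ 1.10), are silently reinterpreted as class probabilities. A minimal sketch with hypothetical tensors, not from the package:

    import torch
    import torch.nn.functional as F

    logits = torch.randn(4, 3)                    # 4 spots, 3 classes
    labels = torch.tensor([0.0, 2.0, 1.0, 2.0])   # float labels, e.g. converted from numpy
    loss = F.cross_entropy(logits, labels.long()) # cast to class indices before the loss
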
gsMap/__init__.py CHANGED
@@ -2,4 +2,4 @@
 Genetics-informed pathogenic spatial mapping
 """
 
-__version__ = "1.72.3"
+__version__ = "1.73.1"
gsMap/cauchy_combination_test.py CHANGED
@@ -48,16 +48,16 @@ def acat_test(pvalues, weights=None):
     elif any(i < 0 for i in weights):
         raise Exception("All weights must be positive.")
     else:
-        weights = [i / len(weights) for i in weights]
+        weights = [i / np.sum(weights) for i in weights]
 
     pvalues = np.array(pvalues)
     weights = np.array(weights)
 
-    if not any(i < 1e-16 for i in pvalues):
+    if not any(i < 1e-15 for i in pvalues):
         cct_stat = sum(weights * np.tan((0.5 - pvalues) * np.pi))
     else:
-        is_small = [i < (1e-16) for i in pvalues]
-        is_large = [i >= (1e-16) for i in pvalues]
+        is_small = [i < (1e-15) for i in pvalues]
+        is_large = [i >= (1e-15) for i in pvalues]
         cct_stat = sum((weights[is_small] / pvalues[is_small]) / np.pi)
         cct_stat += sum(weights[is_large] * np.tan((0.5 - pvalues[is_large]) * np.pi))
 
@@ -118,7 +118,7 @@ def run_Cauchy_combination(config: CauchyCombinationConfig):
     n_removed = len(p_values) - len(p_values_filtered)
 
     # Remove outliers if the number is reasonable
-    if 0 < n_removed < 20:
+    if 0 < n_removed < max(len(p_values) * 0.01, 20):
         logger.info(f"Removed {n_removed}/{len(p_values)} outliers (median + 3IQR) for {ct}.")
         p_cauchy_temp = acat_test(p_values_filtered)
     else:
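
For context, `acat_test` implements the ACAT/Cauchy combination: each p-value is mapped through tan((0.5 − p)·π), averaged with the weights, and the statistic is referred back to a standard Cauchy distribution. A minimal sketch of the no-tiny-p branch, with scipy's `cauchy.sf` standing in for the package's own conversion step (which is outside the hunks shown):

    import numpy as np
    from scipy.stats import cauchy

    pvalues = np.array([0.01, 0.20, 0.35])
    weights = np.ones(3) / 3                          # normalized to sum to 1, as in the fix above
    cct_stat = np.sum(weights * np.tan((0.5 - pvalues) * np.pi))
    p_combined = cauchy.sf(cct_stat)                  # survival function of the standard Cauchy

The `np.sum(weights)` fix matters whenever user-supplied weights do not already sum to 1; dividing by `len(weights)` only normalizes the uniform case.
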
gsMap/config.py CHANGED
@@ -1,7 +1,10 @@
 import argparse
 import dataclasses
 import logging
+import os
 import sys
+import threading
+import time
 from collections import OrderedDict, namedtuple
 from collections.abc import Callable
 from dataclasses import dataclass
@@ -10,6 +13,7 @@ from pathlib import Path
 from pprint import pprint
 from typing import Literal
 
+import psutil
 import pyfiglet
 import yaml
 
@@ -34,9 +38,109 @@ def get_gsMap_logger(logger_name):
 logger = get_gsMap_logger("gsMap")
 
 
+def track_resource_usage(func):
+    """
+    Decorator to track resource usage during function execution.
+    Logs memory usage, CPU time, and wall clock time at the end of the function.
+    """
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        # Get the current process
+        process = psutil.Process(os.getpid())
+
+        # Initialize tracking variables
+        peak_memory = 0
+        cpu_percent_samples = []
+        stop_thread = False
+
+        # Function to monitor resource usage
+        def resource_monitor():
+            nonlocal peak_memory, cpu_percent_samples
+            while not stop_thread:
+                try:
+                    # Get current memory usage in MB
+                    current_memory = process.memory_info().rss / (1024 * 1024)
+                    peak_memory = max(peak_memory, current_memory)
+
+                    # Get CPU usage percentage
+                    cpu_percent = process.cpu_percent(interval=None)
+                    if cpu_percent > 0:  # Skip initial zero readings
+                        cpu_percent_samples.append(cpu_percent)
+
+                    time.sleep(0.5)
+                except Exception:  # Catching all exceptions here because...  # noqa: BLE001
+                    pass
+
+        # Start resource monitoring in a separate thread
+        monitor_thread = threading.Thread(target=resource_monitor)
+        monitor_thread.daemon = True
+        monitor_thread.start()
+
+        # Get start times
+        start_wall_time = time.time()
+        start_cpu_time = process.cpu_times().user + process.cpu_times().system
+
+        try:
+            # Run the actual function
+            result = func(*args, **kwargs)
+            return result
+        finally:
+            # Stop the monitoring thread
+            stop_thread = True
+            monitor_thread.join(timeout=1.0)
+
+            # Calculate elapsed times
+            end_wall_time = time.time()
+            end_cpu_time = process.cpu_times().user + process.cpu_times().system
+
+            wall_time = end_wall_time - start_wall_time
+            cpu_time = end_cpu_time - start_cpu_time
+
+            # Calculate average CPU percentage
+            avg_cpu_percent = (
+                sum(cpu_percent_samples) / len(cpu_percent_samples) if cpu_percent_samples else 0
+            )
+
+            # Format memory for display
+            if peak_memory < 1024:
+                memory_str = f"{peak_memory:.2f} MB"
+            else:
+                memory_str = f"{peak_memory / 1024:.2f} GB"
+
+            # Format times for display
+            if wall_time < 60:
+                wall_time_str = f"{wall_time:.2f} seconds"
+            elif wall_time < 3600:
+                wall_time_str = f"{wall_time / 60:.2f} minutes"
+            else:
+                wall_time_str = f"{wall_time / 3600:.2f} hours"
+
+            if cpu_time < 60:
+                cpu_time_str = f"{cpu_time:.2f} seconds"
+            elif cpu_time < 3600:
+                cpu_time_str = f"{cpu_time / 60:.2f} minutes"
+            else:
+                cpu_time_str = f"{cpu_time / 3600:.2f} hours"
+
+            # Log the resource usage
+            import logging
+
+            logger = logging.getLogger("gsMap")
+            logger.info("Resource usage summary:")
+            logger.info(f"  • Wall clock time: {wall_time_str}")
+            logger.info(f"  • CPU time: {cpu_time_str}")
+            logger.info(f"  • Average CPU utilization: {avg_cpu_percent:.1f}%")
+            logger.info(f"  • Peak memory usage: {memory_str}")
+
+    return wrapper
+
+
 # Decorator to register functions for cli parsing
 def register_cli(name: str, description: str, add_args_function: Callable) -> Callable:
     def decorator(func: Callable) -> Callable:
+        @track_resource_usage  # Use enhanced resource tracking
+        @wraps(func)
         def wrapper(*args, **kwargs):
             name.replace("_", " ")
             gsMap_main_logo = pyfiglet.figlet_format(
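
Because the decorator is generic, its effect is easiest to see on a toy function; a sketch with a hypothetical function, not from the package. On return, it emits the "Resource usage summary:" lines defined above through the "gsMap" logger:

    @track_resource_usage
    def dummy_workload():
        return sum(i * i for i in range(10_000_000))  # CPU-bound work for the monitor to sample

    dummy_workload()

Note the monitor thread samples RSS and CPU only every 0.5 s, so peak-memory figures for very short functions are approximate.
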
@@ -50,8 +154,16 @@ def register_cli(name: str, description: str, add_args_function: Callable) -> Callable:
             print(version_number.center(80), flush=True)
             print("=" * 80, flush=True)
             logger.info(f"Running {name}...")
+
+            # Record start time for the log message
+            start_time = time.strftime("%Y-%m-%d %H:%M:%S")
+            logger.info(f"Started at: {start_time}")
+
             func(*args, **kwargs)
-            logger.info(f"Finished running {name}.")
+
+            # Record end time for the log message
+            end_time = time.strftime("%Y-%m-%d %H:%M:%S")
+            logger.info(f"Finished running {name} at: {end_time}.")
 
         cli_function_registry[name] = subcommand(
             name=name, func=wrapper, add_args_function=add_args_function, description=description
@@ -61,6 +173,13 @@ def register_cli(name: str, description: str, add_args_function: Callable) -> Callable:
     return decorator
 
 
+def str_or_float(value):
+    try:
+        return int(value)
+    except ValueError:
+        return value
+
+
 def add_shared_args(parser):
     parser.add_argument(
         "--workdir", type=str, required=True, help="Path to the working directory."
@@ -113,6 +232,9 @@ def add_find_latent_representations_args(parser):
         action="store_true",
         help="Enable hierarchical latent representation finding.",
     )
+    parser.add_argument(
+        "--pearson_residuals", action="store_true", help="Using the pearson residuals."
+    )
 
 
 def chrom_choice(value):
@@ -189,7 +311,7 @@ def add_generate_ldscore_args(parser):
         help="Root path for genotype plink bfiles (.bim, .bed, .fam).",
     )
     parser.add_argument(
-        "--keep_snp_root", type=str, required=True, help="Root path for SNP files."
+        "--keep_snp_root", type=str, required=False, help="Root path for SNP files"
     )
     parser.add_argument(
         "--gtf_annotation_file", type=str, required=True, help="Path to GTF annotation file."
@@ -238,7 +360,11 @@ def add_spatial_ldsc_args(parser):
         "--sumstats_file", type=str, required=True, help="Path to GWAS summary statistics file."
     )
     parser.add_argument(
-        "--w_file", type=str, required=True, help="Path to regression weight file."
+        "--w_file",
+        type=str,
+        required=False,
+        default=None,
+        help="Path to regression weight file. If not provided, will use weights generated in the generate_ldscore step.",
     )
     parser.add_argument(
         "--trait_name", type=str, required=True, help="Name of the trait being analyzed."
@@ -429,7 +555,7 @@ def add_format_sumstats_args(parser):
     parser.add_argument(
         "--n",
         default=None,
-        type=str,
+        type=str_or_float,
        help="Name of sample size column (if not a name that gsMap understands)",
    )
    parser.add_argument(
@@ -559,6 +685,9 @@ def add_run_all_mode_args(parser):
     parser.add_argument(
         "--gM_slices", type=str, default=None, help="Path to the slice mean file (optional)."
     )
+    parser.add_argument(
+        "--pearson_residuals", action="store_true", help="Using the pearson residuals."
+    )
 
 
 def ensure_path_exists(func):
@@ -735,6 +864,7 @@ class FindLatentRepresentationsConfig(ConfigWithAutoPaths):
     var: bool = False
     convergence_threshold: float = 1e-4
     hierarchically: bool = False
+    pearson_residuals: bool = False
 
     def __post_init__(self):
         # self.output_hdf5_path = self.hdf5_with_latent_path
@@ -823,11 +953,11 @@ class GenerateLDScoreConfig(ConfigWithAutoPaths):
     chrom: int | str
 
     bfile_root: str
-    keep_snp_root: str | None
 
     # annotation by gene distance
     gtf_annotation_file: str
     gene_window_size: int = 50000
+    keep_snp_root: str | None = None
 
     # annotation by enhancer
     enhancer_annotation_file: str = None
@@ -936,7 +1066,7 @@ class GenerateLDScoreConfig(ConfigWithAutoPaths):
 
 @dataclass
 class SpatialLDSCConfig(ConfigWithAutoPaths):
-    w_file: str
+    w_file: str | None = None
     # ldscore_save_dir: str
     use_additional_baseline_annotation: bool = True
     trait_name: str | None = None
@@ -986,8 +1116,19 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):
         for sumstats_file in self.sumstats_config_dict.values():
             assert Path(sumstats_file).exists(), f"{sumstats_file} does not exist."
 
-        # check if additional baseline annotation is exist
-        # self.use_additional_baseline_annotation = False
+        # Handle w_file
+        if self.w_file is None:
+            w_ld_dir = Path(self.ldscore_save_dir) / "w_ld"
+            if w_ld_dir.exists():
+                self.w_file = str(w_ld_dir / "weights.")
+                logger.info(f"Using weights generated in the generate_ldscore step: {self.w_file}")
+            else:
+                raise ValueError(
+                    "No w_file provided and no weights found in generate_ldscore output. "
+                    "Either provide --w_file or run generate_ldscore first."
+                )
+        else:
+            logger.info(f"Using provided weights file: {self.w_file}")
 
         if self.use_additional_baseline_annotation:
             self.process_additional_baseline_annotation()
@@ -998,16 +1139,6 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):
 
         if not dir_exists:
             self.use_additional_baseline_annotation = False
-            # if self.use_additional_baseline_annotation:
-            #     logger.warning(f"additional_baseline directory is not found in {self.ldscore_save_dir}.")
-            #     print('''\
-            #         if you want to use additional baseline annotation,
-            #         please provide additional baseline annotation when calculating ld score.
-            #         ''')
-            #     raise FileNotFoundError(
-            #         f'additional_baseline directory is not found.')
-            #     return
-            # self.use_additional_baseline_annotation = self.use_additional_baseline_annotation or True
         else:
             logger.info(
                 "------Additional baseline annotation is provided. It will be used with the default baseline annotation."
@@ -1037,7 +1168,7 @@ class CauchyCombinationConfig(ConfigWithAutoPaths):
 
     def __post_init__(self):
         if self.sample_name is not None:
-            if len(self.sample_name_list) > 0:
+            if self.sample_name_list and len(self.sample_name_list) > 0:
                 raise ValueError("Only one of sample_name and sample_name_list must be provided.")
             else:
                 self.sample_name_list = [self.sample_name]
@@ -1106,6 +1237,10 @@ class RunAllModeConfig(ConfigWithAutoPaths):
     annotation: str
     data_layer: str = "X"
 
+    # == Find Latent Representation PARAMETERS ==
+    n_comps: int = 300
+    pearson_residuals: bool = False
+
     # == latent 2 Gene PARAMETERS ==
     gM_slices: str | None = None
     latent_representation: str = None
@@ -1124,9 +1259,7 @@
 
     def __post_init__(self):
         super().__post_init__()
-        self.gtffile = (
-            f"{self.gsMap_resource_dir}/genome_annotation/gtf/gencode.v39lift37.annotation.gtf"
-        )
+        self.gtffile = f"{self.gsMap_resource_dir}/genome_annotation/gtf/gencode.v46lift37.basic.annotation.gtf"
         self.bfile_root = (
             f"{self.gsMap_resource_dir}/LD_Reference_Panel/1000G_EUR_Phase3_plink/1000G.EUR.QC"
         )
@@ -1191,7 +1324,7 @@ class FormatSumstatsConfig:
     se: str = None
     p: str = None
     frq: str = None
-    n: str = None
+    n: str | int = None
     z: str = None
     OR: str = None
     se_OR: str = None
@@ -1204,9 +1337,21 @@
     keep_chr_pos: bool = False
 
 
+@register_cli(
+    name="quick_mode",
+    description="Run the entire gsMap pipeline in quick mode, utilizing pre-computed weights for faster execution.",
+    add_args_function=add_run_all_mode_args,
+)
+def run_all_mode_from_cli(args: argparse.Namespace):
+    from gsMap.run_all_mode import run_pipeline
+
+    config = get_dataclass_from_parser(args, RunAllModeConfig)
+    run_pipeline(config)
+
+
 @register_cli(
     name="run_find_latent_representations",
-    description="Run Find_latent_representations \nFind the latent representations of each spot by running GNN-VAE",
+    description="Run Find_latent_representations \nFind the latent representations of each spot by running GNN",
     add_args_function=add_find_latent_representations_args,
 )
 def run_find_latent_representation_from_cli(args: argparse.Namespace):
@@ -1278,7 +1423,7 @@ def run_Report_from_cli(args: argparse.Namespace):
 
 @register_cli(
     name="format_sumstats",
-    description="Format gwas summary statistics",
+    description="Format GWAS summary statistics",
     add_args_function=add_format_sumstats_args,
 )
 def gwas_format_from_cli(args: argparse.Namespace):
@@ -1288,18 +1433,6 @@ def gwas_format_from_cli(args: argparse.Namespace):
     gwas_format(config)
 
 
-@register_cli(
-    name="quick_mode",
-    description="Run all the gsMap pipeline in quick mode",
-    add_args_function=add_run_all_mode_args,
-)
-def run_all_mode_from_cli(args: argparse.Namespace):
-    from gsMap.run_all_mode import run_pipeline
-
-    config = get_dataclass_from_parser(args, RunAllModeConfig)
-    run_pipeline(config)
-
-
 @register_cli(
     name="create_slice_mean",
     description="Create slice mean from multiple h5ad files",
gsMap/create_slice_mean.py CHANGED
@@ -5,8 +5,9 @@ import anndata
 import numpy as np
 import pandas as pd
 import scanpy as sc
+import scipy
 import zarr
-from scipy.stats import rankdata
+from scipy.stats import gmean, rankdata
 from tqdm import tqdm
 
 from gsMap.config import CreateSliceMeanConfig
@@ -22,6 +23,7 @@ def get_common_genes(h5ad_files, config: CreateSliceMeanConfig):
     common_genes = None
     for file in tqdm(h5ad_files, desc="Finding common genes"):
         adata = sc.read_h5ad(file)
+        sc.pp.filter_genes(adata, min_cells=1)
         adata.var_names_make_unique()
         if common_genes is None:
             common_genes = adata.var_names
@@ -62,22 +64,27 @@ def calculate_one_slice_mean(
 
     adata = adata[:, common_genes].copy()
     n_cells = adata.shape[0]
-    log_ranks = np.zeros((n_cells, adata.n_vars), dtype=np.float32)
-    # Compute log of ranks to avoid overflow when computing geometric mean
-    for i in tqdm(range(n_cells), desc=f"Computing log ranks for {sample_name}"):
-        data = adata.X[i, :].toarray().flatten()
-        ranks = rankdata(data, method="average")
-        log_ranks[i, :] = np.log(ranks)  # Adding small value to avoid log(0)
 
-    # Calculate geometric mean via log trick: exp(mean(log(values)))
-    gmean = (np.exp(np.mean(log_ranks, axis=0))).reshape(-1, 1)
+    if not scipy.sparse.issparse(adata.X):
+        adata_X = scipy.sparse.csr_matrix(adata.X)
+    elif isinstance(adata.X, scipy.sparse.csr_matrix):
+        adata_X = adata.X  # Avoid copying if already CSR
+    else:
+        adata_X = adata.X.tocsr()
+
+    ranks = np.zeros((n_cells, adata.n_vars), dtype=np.float16)
+    for i in tqdm(range(n_cells), desc="Computing ranks per cell"):
+        data = adata_X[i, :].toarray().flatten()
+        ranks[i, :] = rankdata(data, method="average")
+
+    gM = gmean(ranks, axis=0).reshape(-1, 1)
 
     # Calculate the expression fractio
     adata_X_bool = adata.X.astype(bool)
     frac = (np.asarray(adata_X_bool.sum(axis=0)).flatten()).reshape(-1, 1)
 
     # Save to zarr group
-    gmean_frac = np.concatenate([gmean, frac], axis=1)
+    gmean_frac = np.concatenate([gM, frac], axis=1)
     s1_zarr = gmean_zarr_group.array(sample_name, data=gmean_frac, chunks=None, dtype="f4")
     s1_zarr.attrs["spot_number"] = adata.shape[0]
 
@@ -85,34 +92,42 @@
 def merge_zarr_means(zarr_group_path, output_file, common_genes):
     """
     Merge all Zarr arrays into a weighted geometric mean and save to a Parquet file.
-    Instead of calculating the mean, it sums the logs and applies the exponential.
     """
     gmean_zarr_group = zarr.open(zarr_group_path, mode="a")
-    log_sum = None
+
+    sample_gmeans = []
+    sample_weights = []
     frac_sum = None
     total_spot_number = 0
+
+    # Collect all geometric means and their weights (spot numbers)
     for key in tqdm(gmean_zarr_group.array_keys(), desc="Merging Zarr arrays"):
         s1 = gmean_zarr_group[key]
         s1_array_gmean = s1[:][:, 0]
         s1_array_frac = s1[:][:, 1]
         n = s1.attrs["spot_number"]
 
-        if log_sum is None:
-            log_sum = np.log(s1_array_gmean) * n
+        sample_gmeans.append(s1_array_gmean)
+        sample_weights.append(n)
+
+        if frac_sum is None:
            frac_sum = s1_array_frac
        else:
-            log_sum += np.log(s1_array_gmean) * n
            frac_sum += s1_array_frac
 
        total_spot_number += n
 
-    # Apply the geometric mean via exponentiation of the averaged logs
-    final_mean = np.exp(log_sum / total_spot_number)
+    # Convert to arrays
+    sample_gmeans = np.array(sample_gmeans)
+    sample_weights = np.array(sample_weights)
+
+    final_gmean = gmean(sample_gmeans, axis=0, weights=sample_weights[:, np.newaxis])
+
     final_frac = frac_sum / total_spot_number
 
     # Save the final mean to a Parquet file
    gene_names = common_genes
-    final_df = pd.DataFrame({"gene": gene_names, "G_Mean": final_mean, "frac": final_frac})
+    final_df = pd.DataFrame({"gene": gene_names, "G_Mean": final_gmean, "frac": final_frac})
     final_df.set_index("gene", inplace=True)
     final_df.to_parquet(output_file)
     return final_df
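
The rewrite replaces the manual log-sum accumulation with `scipy.stats.gmean`, whose `weights` argument reproduces the same spot-count-weighted geometric mean. A small sketch with made-up numbers showing the equivalence:

    import numpy as np
    from scipy.stats import gmean

    slice_gmeans = np.array([[2.0, 4.0, 8.0],   # per-slice geometric means for 3 genes
                             [4.0, 4.0, 2.0]])
    spot_counts = np.array([100, 300])          # weights = spot numbers per slice

    combined = gmean(slice_gmeans, axis=0, weights=spot_counts[:, np.newaxis])
    # identical to the old log trick: exp(sum_i n_i * log(g_i) / sum_i n_i)
    expected = np.exp((spot_counts[:, None] * np.log(slice_gmeans)).sum(axis=0) / spot_counts.sum())
    assert np.allclose(combined, expected)
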
gsMap/diagnosis.py CHANGED
@@ -49,7 +49,10 @@ def compute_gene_diagnostic_info(config: DiagnosisConfig):
 
     # Align marker scores with trait LDSC results
     mk_score = mk_score.loc[trait_ldsc_result.index]
-    mk_score = mk_score.loc[:, mk_score.sum(axis=0) != 0]
+
+    # Filter out genes with no variation
+    non_zero_std_cols = mk_score.columns[mk_score.std() > 0]
+    mk_score = mk_score.loc[:, non_zero_std_cols]
 
     logger.info("Calculating correlation between gene marker scores and trait logp-values...")
     corr = mk_score.corrwith(trait_ldsc_result["logp"])
@@ -88,19 +91,6 @@ def compute_gene_diagnostic_info(config: DiagnosisConfig):
     gene_diagnostic_info.to_csv(gene_diagnostic_info_save_path, index=False)
     logger.info(f"Gene diagnostic information saved to {gene_diagnostic_info_save_path}.")
 
-    # TODO: A new script is needed to save the gene diagnostic info to adata.var and trait_ldsc_result to adata.obs when running multiple traits
-    # # Save to adata.var with the trait_name prefix
-    # logger.info('Saving gene diagnostic info to adata.var...')
-    # gene_diagnostic_info.set_index('Gene', inplace=True)  # Use 'Gene' as the index to align with adata.var
-    # adata.var[f'{config.trait_name}_Annotation'] = gene_diagnostic_info['Annotation']
-    # adata.var[f'{config.trait_name}_Median_GSS'] = gene_diagnostic_info['Median_GSS']
-    # adata.var[f'{config.trait_name}_PCC'] = gene_diagnostic_info['PCC']
-    #
-    # # Save trait_ldsc_result to adata.obs
-    # logger.info(f'Saving trait LDSC results to adata.obs as gsMap_{config.trait_name}_p_value...')
-    # adata.obs[f'gsMap_{config.trait_name}_p_value'] = trait_ldsc_result['p']
-    # adata.write(config.hdf5_with_latent_path, )
-
     return gene_diagnostic_info.reset_index()
 
 
gsMap/find_latent_representation.py CHANGED
@@ -38,7 +38,7 @@ def preprocess_data(adata, params):
 
     if params.data_layer in adata.layers.keys():
         logger.info(f"Using data layer: {params.data_layer}...")
-        adata.X = adata.layers[params.data_layer]
+        adata.X = adata.layers[params.data_layer].copy()
     elif params.data_layer == "X":
         logger.info(f"Using data layer: {params.data_layer}...")
         if adata.X.dtype == "float32" or adata.X.dtype == "float64":
@@ -50,6 +50,15 @@ def preprocess_data(adata, params):
         # HVGs based on count
         logger.info("Dealing with count data...")
         sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=params.feat_cell)
+
+        # Get the pearson residuals
+        if params.pearson_residuals:
+            sc.experimental.pp.normalize_pearson_residuals(adata, inplace=False)
+            pearson_residuals = sc.experimental.pp.normalize_pearson_residuals(
+                adata, inplace=False, clip=10
+            )
+            adata.layers["pearson_residuals"] = pearson_residuals["X"]
+
     # Normalize the data
     sc.pp.normalize_total(adata, target_sum=1e4)
     sc.pp.log1p(adata)
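
For reference, `sc.experimental.pp.normalize_pearson_residuals` with `inplace=False` returns a dict whose `"X"` entry holds the residual matrix; that is what lands in the new layer (only the clipped second call's result is kept above). A self-contained sketch with toy counts, assuming scanpy ≥ 1.9:

    import anndata
    import numpy as np
    import scanpy as sc

    counts = np.random.default_rng(0).poisson(2.0, size=(50, 20)).astype(np.float32)
    adata = anndata.AnnData(counts)
    sc.pp.filter_genes(adata, min_cells=1)  # residuals need genes observed at least once

    res = sc.experimental.pp.normalize_pearson_residuals(adata, inplace=False, clip=10)
    adata.layers["pearson_residuals"] = res["X"]  # same shape as adata.X
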
@@ -64,8 +73,13 @@ class LatentRepresentationFinder:
     def __init__(self, adata, args: FindLatentRepresentationsConfig):
         self.params = args
 
-        self.expression_array = adata[:, adata.var.highly_variable].X.copy()
-        self.expression_array = sc.pp.scale(self.expression_array, max_value=10)
+        if "pearson_residuals" in adata.layers:
+            self.expression_array = (
+                adata[:, adata.var.highly_variable].layers["pearson_residuals"].copy()
+            )
+        else:
+            self.expression_array = adata[:, adata.var.highly_variable].X.copy()
+            self.expression_array = sc.pp.scale(self.expression_array, max_value=10)
 
         # Construct the neighboring graph
         self.graph_dict = construct_adjacency_matrix(adata, self.params)
@@ -103,6 +117,8 @@ def run_find_latent_representation(args: FindLatentRepresentationsConfig):
     # Load the ST data
     logger.info(f"Loading ST data of {args.sample_name}...")
     adata = sc.read_h5ad(args.input_hdf5_path)
+    sc.pp.filter_genes(adata, min_cells=1)
+
     logger.info(f"The ST data contains {adata.shape[0]} cells, {adata.shape[1]} genes.")
 
     # Load the cell type annotation
gsMap/format_sumstats.py CHANGED
@@ -409,6 +409,12 @@ def gwas_format(config: FormatSumstatsConfig):
         compression=compression_type,
         na_values=[".", "NA"],
     )
+
+    if isinstance(config.n, int | float):
+        logger.info(f"Set the sample size of gwas data as {config.n}.")
+        gwas["N"] = config.n
+        config.n = "N"
+
     logger.info(f"Read {len(gwas)} SNPs from {config.sumstats}.")
 
     # Check name and format
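
This block lets `--n` carry a literal sample size: after `str_or_float` parsing, a numeric `config.n` fills a constant `N` column and `config.n` is rewired to that column name, so the downstream column checks are unchanged. A toy sketch with a hypothetical frame, not package code (the `int | float` union check requires Python ≥ 3.10):

    import pandas as pd

    gwas = pd.DataFrame({"SNP": ["rs1", "rs2"], "P": [0.01, 0.5]})
    n = 100000                       # what str_or_float("100000") yields
    if isinstance(n, int | float):
        gwas["N"] = n                # constant sample size for every SNP
        n = "N"                      # downstream code now reads the N column by name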