gsMap 1.73.0__py3-none-any.whl → 1.73.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN/train.py +1 -1
- gsMap/__init__.py +1 -1
- gsMap/config.py +29 -16
- gsMap/create_slice_mean.py +1 -0
- gsMap/diagnosis.py +18 -18
- gsMap/find_latent_representation.py +18 -2
- gsMap/generate_ldscore.py +1068 -441
- gsMap/latent_to_gene.py +15 -5
- gsMap/run_all_mode.py +1 -0
- gsMap/utils/generate_r2_matrix.py +2 -2
- gsMap/utils/manhattan_plot.py +15 -7
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/METADATA +9 -1
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/RECORD +16 -16
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/WHEEL +1 -1
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/entry_points.txt +0 -0
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/licenses/LICENSE +0 -0
gsMap/GNN/train.py
CHANGED
gsMap/__init__.py
CHANGED
gsMap/config.py
CHANGED
@@ -232,6 +232,9 @@ def add_find_latent_representations_args(parser):
         action="store_true",
         help="Enable hierarchical latent representation finding.",
     )
+    parser.add_argument(
+        "--pearson_residuals", action="store_true", help="Using the pearson residuals."
+    )


 def chrom_choice(value):
@@ -308,7 +311,7 @@ def add_generate_ldscore_args(parser):
         help="Root path for genotype plink bfiles (.bim, .bed, .fam).",
     )
     parser.add_argument(
-        "--keep_snp_root", type=str, required=
+        "--keep_snp_root", type=str, required=False, help="Root path for SNP files"
     )
     parser.add_argument(
         "--gtf_annotation_file", type=str, required=True, help="Path to GTF annotation file."
@@ -357,7 +360,11 @@ def add_spatial_ldsc_args(parser):
         "--sumstats_file", type=str, required=True, help="Path to GWAS summary statistics file."
     )
     parser.add_argument(
-        "--w_file",
+        "--w_file",
+        type=str,
+        required=False,
+        default=None,
+        help="Path to regression weight file. If not provided, will use weights generated in the generate_ldscore step.",
     )
     parser.add_argument(
         "--trait_name", type=str, required=True, help="Name of the trait being analyzed."
@@ -678,6 +685,9 @@ def add_run_all_mode_args(parser):
     parser.add_argument(
         "--gM_slices", type=str, default=None, help="Path to the slice mean file (optional)."
     )
+    parser.add_argument(
+        "--pearson_residuals", action="store_true", help="Using the pearson residuals."
+    )


 def ensure_path_exists(func):
@@ -854,6 +864,7 @@ class FindLatentRepresentationsConfig(ConfigWithAutoPaths):
     var: bool = False
     convergence_threshold: float = 1e-4
     hierarchically: bool = False
+    pearson_residuals: bool = False

     def __post_init__(self):
         # self.output_hdf5_path = self.hdf5_with_latent_path
@@ -942,11 +953,11 @@ class GenerateLDScoreConfig(ConfigWithAutoPaths):
     chrom: int | str

     bfile_root: str
-    keep_snp_root: str | None

     # annotation by gene distance
     gtf_annotation_file: str
     gene_window_size: int = 50000
+    keep_snp_root: str | None = None

     # annotation by enhancer
     enhancer_annotation_file: str = None
@@ -1055,7 +1066,7 @@ class GenerateLDScoreConfig(ConfigWithAutoPaths):

 @dataclass
 class SpatialLDSCConfig(ConfigWithAutoPaths):
-    w_file: str
+    w_file: str | None = None
     # ldscore_save_dir: str
     use_additional_baseline_annotation: bool = True
     trait_name: str | None = None
@@ -1105,8 +1116,19 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):
         for sumstats_file in self.sumstats_config_dict.values():
             assert Path(sumstats_file).exists(), f"{sumstats_file} does not exist."

-        #
-
+        # Handle w_file
+        if self.w_file is None:
+            w_ld_dir = Path(self.ldscore_save_dir) / "w_ld"
+            if w_ld_dir.exists():
+                self.w_file = str(w_ld_dir / "weights.")
+                logger.info(f"Using weights generated in the generate_ldscore step: {self.w_file}")
+            else:
+                raise ValueError(
+                    "No w_file provided and no weights found in generate_ldscore output. "
+                    "Either provide --w_file or run generate_ldscore first."
+                )
+        else:
+            logger.info(f"Using provided weights file: {self.w_file}")

         if self.use_additional_baseline_annotation:
             self.process_additional_baseline_annotation()
@@ -1117,16 +1139,6 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):

         if not dir_exists:
             self.use_additional_baseline_annotation = False
-            # if self.use_additional_baseline_annotation:
-            # logger.warning(f"additional_baseline directory is not found in {self.ldscore_save_dir}.")
-            # print('''\
-            # if you want to use additional baseline annotation,
-            # please provide additional baseline annotation when calculating ld score.
-            # ''')
-            # raise FileNotFoundError(
-            #     f'additional_baseline directory is not found.')
-            # return
-            # self.use_additional_baseline_annotation = self.use_additional_baseline_annotation or True
         else:
             logger.info(
                 "------Additional baseline annotation is provided. It will be used with the default baseline annotation."
@@ -1227,6 +1239,7 @@ class RunAllModeConfig(ConfigWithAutoPaths):

     # == Find Latent Representation PARAMETERS ==
     n_comps: int = 300
+    pearson_residuals: bool = False

     # == latent 2 Gene PARAMETERS ==
     gM_slices: str | None = None
gsMap/create_slice_mean.py
CHANGED
@@ -23,6 +23,7 @@ def get_common_genes(h5ad_files, config: CreateSliceMeanConfig):
     common_genes = None
     for file in tqdm(h5ad_files, desc="Finding common genes"):
         adata = sc.read_h5ad(file)
+        sc.pp.filter_genes(adata, min_cells=1)
         adata.var_names_make_unique()
         if common_genes is None:
             common_genes = adata.var_names
gsMap/diagnosis.py
CHANGED
@@ -49,7 +49,10 @@ def compute_gene_diagnostic_info(config: DiagnosisConfig):

     # Align marker scores with trait LDSC results
     mk_score = mk_score.loc[trait_ldsc_result.index]
-
+
+    # Filter out genes with no variation
+    has_variation = (~mk_score.eq(mk_score.iloc[0], axis=1)).any()
+    mk_score = mk_score.loc[:, has_variation]

     logger.info("Calculating correlation between gene marker scores and trait logp-values...")
     corr = mk_score.corrwith(trait_ldsc_result["logp"])
@@ -66,10 +69,6 @@ def compute_gene_diagnostic_info(config: DiagnosisConfig):
         }
     )

-    # Filter based on median GSS score
-    high_GSS_Gene_annotation_pair = high_GSS_Gene_annotation_pair[
-        high_GSS_Gene_annotation_pair["Median_GSS"] >= 1.0
-    ]
     high_GSS_Gene_annotation_pair = high_GSS_Gene_annotation_pair.merge(
         corr, left_on="Gene", right_index=True
     )
@@ -88,19 +87,6 @@ def compute_gene_diagnostic_info(config: DiagnosisConfig):
     gene_diagnostic_info.to_csv(gene_diagnostic_info_save_path, index=False)
     logger.info(f"Gene diagnostic information saved to {gene_diagnostic_info_save_path}.")

-    # TODO: A new script is needed to save the gene diagnostic info to adata.var and trait_ldsc_result to adata.obs when running multiple traits
-    # # Save to adata.var with the trait_name prefix
-    # logger.info('Saving gene diagnostic info to adata.var...')
-    # gene_diagnostic_info.set_index('Gene', inplace=True)  # Use 'Gene' as the index to align with adata.var
-    # adata.var[f'{config.trait_name}_Annotation'] = gene_diagnostic_info['Annotation']
-    # adata.var[f'{config.trait_name}_Median_GSS'] = gene_diagnostic_info['Median_GSS']
-    # adata.var[f'{config.trait_name}_PCC'] = gene_diagnostic_info['PCC']
-    #
-    # # Save trait_ldsc_result to adata.obs
-    # logger.info(f'Saving trait LDSC results to adata.obs as gsMap_{config.trait_name}_p_value...')
-    # adata.obs[f'gsMap_{config.trait_name}_p_value'] = trait_ldsc_result['p']
-    # adata.write(config.hdf5_with_latent_path, )
-
     return gene_diagnostic_info.reset_index()


@@ -171,6 +157,20 @@ def generate_manhattan_plot(config: DiagnosisConfig):
         + gwas_data_to_plot["Annotation"].astype(str)
     )

+    # Verify data integrity
+    if gwas_data_with_gene_annotation_sort.empty:
+        logger.error("Filtered GWAS data is empty, cannot create Manhattan plot")
+        return
+
+    if len(gwas_data_to_plot) == 0:
+        logger.error("No SNPs passed filtering criteria for Manhattan plot")
+        return
+
+    # Log some diagnostic information
+    logger.info(f"Creating Manhattan plot with {len(gwas_data_to_plot)} SNPs")
+    logger.info(f"Columns available: {list(gwas_data_to_plot.columns)}")
+    logger.info(f"Chromosome column values: {gwas_data_to_plot['CHR'].unique()}")
+
     fig = ManhattanPlot(
         dataframe=gwas_data_to_plot,
         title="gsMap Diagnosis Manhattan Plot",
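The new has_variation filter in compute_gene_diagnostic_info drops marker-score columns that are constant across all spots, which would otherwise yield undefined correlations with the trait logp-values. It works by comparing every row against the first row; the same pandas idiom on toy data (hypothetical gene names):

import pandas as pd

mk_score = pd.DataFrame(
    {"geneA": [0.1, 0.5, 0.9], "geneB": [1.0, 1.0, 1.0], "geneC": [2.0, 0.0, 3.0]}
)

# A column has variation if any row differs from the first row.
has_variation = (~mk_score.eq(mk_score.iloc[0], axis=1)).any()
print(has_variation.tolist())                           # [True, False, True]
print(mk_score.loc[:, has_variation].columns.tolist())  # ['geneA', 'geneC']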
gsMap/find_latent_representation.py
CHANGED
@@ -50,6 +50,15 @@ def preprocess_data(adata, params):
         # HVGs based on count
         logger.info("Dealing with count data...")
         sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=params.feat_cell)
+
+        # Get the pearson residuals
+        if params.pearson_residuals:
+            sc.experimental.pp.normalize_pearson_residuals(adata, inplace=False)
+            pearson_residuals = sc.experimental.pp.normalize_pearson_residuals(
+                adata, inplace=False, clip=10
+            )
+            adata.layers["pearson_residuals"] = pearson_residuals["X"]
+
         # Normalize the data
         sc.pp.normalize_total(adata, target_sum=1e4)
         sc.pp.log1p(adata)
@@ -64,8 +73,13 @@ class LatentRepresentationFinder:
     def __init__(self, adata, args: FindLatentRepresentationsConfig):
         self.params = args

-
-
+        if "pearson_residuals" in adata.layers:
+            self.expression_array = (
+                adata[:, adata.var.highly_variable].layers["pearson_residuals"].copy()
+            )
+        else:
+            self.expression_array = adata[:, adata.var.highly_variable].X.copy()
+            self.expression_array = sc.pp.scale(self.expression_array, max_value=10)

         # Construct the neighboring graph
         self.graph_dict = construct_adjacency_matrix(adata, self.params)
@@ -103,6 +117,8 @@ def run_find_latent_representation(args: FindLatentRepresentationsConfig):
     # Load the ST data
     logger.info(f"Loading ST data of {args.sample_name}...")
     adata = sc.read_h5ad(args.input_hdf5_path)
+    sc.pp.filter_genes(adata, min_cells=1)
+
     logger.info(f"The ST data contains {adata.shape[0]} cells, {adata.shape[1]} genes.")

     # Load the cell type annotation