gsMap 1.73.0__py3-none-any.whl → 1.73.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsMap/GNN/train.py +1 -1
- gsMap/__init__.py +1 -1
- gsMap/config.py +29 -16
- gsMap/create_slice_mean.py +1 -0
- gsMap/diagnosis.py +18 -18
- gsMap/find_latent_representation.py +18 -2
- gsMap/generate_ldscore.py +1068 -441
- gsMap/latent_to_gene.py +15 -5
- gsMap/run_all_mode.py +1 -0
- gsMap/utils/generate_r2_matrix.py +2 -2
- gsMap/utils/manhattan_plot.py +15 -7
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/METADATA +9 -1
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/RECORD +16 -16
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/WHEEL +1 -1
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/entry_points.txt +0 -0
- {gsmap-1.73.0.dist-info → gsmap-1.73.2.dist-info}/licenses/LICENSE +0 -0
gsMap/GNN/train.py
CHANGED
gsMap/__init__.py
CHANGED
gsMap/config.py
CHANGED
@@ -232,6 +232,9 @@ def add_find_latent_representations_args(parser):
         action="store_true",
         help="Enable hierarchical latent representation finding.",
     )
+    parser.add_argument(
+        "--pearson_residuals", action="store_true", help="Using the pearson residuals."
+    )


 def chrom_choice(value):
@@ -308,7 +311,7 @@ def add_generate_ldscore_args(parser):
         help="Root path for genotype plink bfiles (.bim, .bed, .fam).",
     )
     parser.add_argument(
-        "--keep_snp_root", type=str, required=
+        "--keep_snp_root", type=str, required=False, help="Root path for SNP files"
     )
     parser.add_argument(
         "--gtf_annotation_file", type=str, required=True, help="Path to GTF annotation file."
@@ -357,7 +360,11 @@ def add_spatial_ldsc_args(parser):
         "--sumstats_file", type=str, required=True, help="Path to GWAS summary statistics file."
     )
     parser.add_argument(
-        "--w_file",
+        "--w_file",
+        type=str,
+        required=False,
+        default=None,
+        help="Path to regression weight file. If not provided, will use weights generated in the generate_ldscore step.",
     )
     parser.add_argument(
         "--trait_name", type=str, required=True, help="Name of the trait being analyzed."
@@ -678,6 +685,9 @@ def add_run_all_mode_args(parser):
     parser.add_argument(
         "--gM_slices", type=str, default=None, help="Path to the slice mean file (optional)."
     )
+    parser.add_argument(
+        "--pearson_residuals", action="store_true", help="Using the pearson residuals."
+    )


 def ensure_path_exists(func):
@@ -854,6 +864,7 @@ class FindLatentRepresentationsConfig(ConfigWithAutoPaths):
     var: bool = False
     convergence_threshold: float = 1e-4
     hierarchically: bool = False
+    pearson_residuals: bool = False

     def __post_init__(self):
         # self.output_hdf5_path = self.hdf5_with_latent_path
@@ -942,11 +953,11 @@ class GenerateLDScoreConfig(ConfigWithAutoPaths):
     chrom: int | str

     bfile_root: str
-    keep_snp_root: str | None

     # annotation by gene distance
     gtf_annotation_file: str
     gene_window_size: int = 50000
+    keep_snp_root: str | None = None

     # annotation by enhancer
     enhancer_annotation_file: str = None
@@ -1055,7 +1066,7 @@ class GenerateLDScoreConfig(ConfigWithAutoPaths):

 @dataclass
 class SpatialLDSCConfig(ConfigWithAutoPaths):
-    w_file: str
+    w_file: str | None = None
     # ldscore_save_dir: str
     use_additional_baseline_annotation: bool = True
     trait_name: str | None = None
@@ -1105,8 +1116,19 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):
         for sumstats_file in self.sumstats_config_dict.values():
             assert Path(sumstats_file).exists(), f"{sumstats_file} does not exist."

-        #
-
+        # Handle w_file
+        if self.w_file is None:
+            w_ld_dir = Path(self.ldscore_save_dir) / "w_ld"
+            if w_ld_dir.exists():
+                self.w_file = str(w_ld_dir / "weights.")
+                logger.info(f"Using weights generated in the generate_ldscore step: {self.w_file}")
+            else:
+                raise ValueError(
+                    "No w_file provided and no weights found in generate_ldscore output. "
+                    "Either provide --w_file or run generate_ldscore first."
+                )
+        else:
+            logger.info(f"Using provided weights file: {self.w_file}")

         if self.use_additional_baseline_annotation:
             self.process_additional_baseline_annotation()
@@ -1117,16 +1139,6 @@ class SpatialLDSCConfig(ConfigWithAutoPaths):

         if not dir_exists:
             self.use_additional_baseline_annotation = False
-            # if self.use_additional_baseline_annotation:
-            # logger.warning(f"additional_baseline directory is not found in {self.ldscore_save_dir}.")
-            # print('''\
-            # if you want to use additional baseline annotation,
-            # please provide additional baseline annotation when calculating ld score.
-            # ''')
-            # raise FileNotFoundError(
-            #     f'additional_baseline directory is not found.')
-            # return
-            # self.use_additional_baseline_annotation = self.use_additional_baseline_annotation or True
         else:
             logger.info(
                 "------Additional baseline annotation is provided. It will be used with the default baseline annotation."
@@ -1227,6 +1239,7 @@ class RunAllModeConfig(ConfigWithAutoPaths):

     # == Find Latent Representation PARAMETERS ==
     n_comps: int = 300
+    pearson_residuals: bool = False

     # == latent 2 Gene PARAMETERS ==
     gM_slices: str | None = None
gsMap/create_slice_mean.py
CHANGED
@@ -23,6 +23,7 @@ def get_common_genes(h5ad_files, config: CreateSliceMeanConfig):
     common_genes = None
     for file in tqdm(h5ad_files, desc="Finding common genes"):
         adata = sc.read_h5ad(file)
+        sc.pp.filter_genes(adata, min_cells=1)
         adata.var_names_make_unique()
         if common_genes is None:
             common_genes = adata.var_names
gsMap/diagnosis.py
CHANGED
@@ -49,7 +49,10 @@ def compute_gene_diagnostic_info(config: DiagnosisConfig):

     # Align marker scores with trait LDSC results
     mk_score = mk_score.loc[trait_ldsc_result.index]
-
+
+    # Filter out genes with no variation
+    has_variation = (~mk_score.eq(mk_score.iloc[0], axis=1)).any()
+    mk_score = mk_score.loc[:, has_variation]

     logger.info("Calculating correlation between gene marker scores and trait logp-values...")
     corr = mk_score.corrwith(trait_ldsc_result["logp"])
@@ -66,10 +69,6 @@ def compute_gene_diagnostic_info(config: DiagnosisConfig):
         }
     )

-    # Filter based on median GSS score
-    high_GSS_Gene_annotation_pair = high_GSS_Gene_annotation_pair[
-        high_GSS_Gene_annotation_pair["Median_GSS"] >= 1.0
-    ]
     high_GSS_Gene_annotation_pair = high_GSS_Gene_annotation_pair.merge(
         corr, left_on="Gene", right_index=True
     )
@@ -88,19 +87,6 @@ def compute_gene_diagnostic_info(config: DiagnosisConfig):
     gene_diagnostic_info.to_csv(gene_diagnostic_info_save_path, index=False)
     logger.info(f"Gene diagnostic information saved to {gene_diagnostic_info_save_path}.")

-    # TODO: A new script is needed to save the gene diagnostic info to adata.var and trait_ldsc_result to adata.obs when running multiple traits
-    # # Save to adata.var with the trait_name prefix
-    # logger.info('Saving gene diagnostic info to adata.var...')
-    # gene_diagnostic_info.set_index('Gene', inplace=True)  # Use 'Gene' as the index to align with adata.var
-    # adata.var[f'{config.trait_name}_Annotation'] = gene_diagnostic_info['Annotation']
-    # adata.var[f'{config.trait_name}_Median_GSS'] = gene_diagnostic_info['Median_GSS']
-    # adata.var[f'{config.trait_name}_PCC'] = gene_diagnostic_info['PCC']
-    #
-    # # Save trait_ldsc_result to adata.obs
-    # logger.info(f'Saving trait LDSC results to adata.obs as gsMap_{config.trait_name}_p_value...')
-    # adata.obs[f'gsMap_{config.trait_name}_p_value'] = trait_ldsc_result['p']
-    # adata.write(config.hdf5_with_latent_path, )
-
     return gene_diagnostic_info.reset_index()


@@ -171,6 +157,20 @@ def generate_manhattan_plot(config: DiagnosisConfig):
         + gwas_data_to_plot["Annotation"].astype(str)
     )

+    # Verify data integrity
+    if gwas_data_with_gene_annotation_sort.empty:
+        logger.error("Filtered GWAS data is empty, cannot create Manhattan plot")
+        return
+
+    if len(gwas_data_to_plot) == 0:
+        logger.error("No SNPs passed filtering criteria for Manhattan plot")
+        return
+
+    # Log some diagnostic information
+    logger.info(f"Creating Manhattan plot with {len(gwas_data_to_plot)} SNPs")
+    logger.info(f"Columns available: {list(gwas_data_to_plot.columns)}")
+    logger.info(f"Chromosome column values: {gwas_data_to_plot['CHR'].unique()}")
+
     fig = ManhattanPlot(
         dataframe=gwas_data_to_plot,
         title="gsMap Diagnosis Manhattan Plot",
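The new has_variation filter in compute_gene_diagnostic_info drops marker-score columns that are constant across all spots, which would otherwise yield undefined correlations with the trait logp-values. It works by comparing every row against the first row; the same pandas idiom on toy data (hypothetical gene names):

import pandas as pd

mk_score = pd.DataFrame(
    {"geneA": [0.1, 0.5, 0.9], "geneB": [1.0, 1.0, 1.0], "geneC": [2.0, 0.0, 3.0]}
)

# A column has variation if any row differs from the first row.
has_variation = (~mk_score.eq(mk_score.iloc[0], axis=1)).any()
print(has_variation.tolist())                           # [True, False, True]
print(mk_score.loc[:, has_variation].columns.tolist())  # ['geneA', 'geneC']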
gsMap/find_latent_representation.py
CHANGED
@@ -50,6 +50,15 @@ def preprocess_data(adata, params):
         # HVGs based on count
         logger.info("Dealing with count data...")
         sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=params.feat_cell)
+
+        # Get the pearson residuals
+        if params.pearson_residuals:
+            sc.experimental.pp.normalize_pearson_residuals(adata, inplace=False)
+            pearson_residuals = sc.experimental.pp.normalize_pearson_residuals(
+                adata, inplace=False, clip=10
+            )
+            adata.layers["pearson_residuals"] = pearson_residuals["X"]
+
         # Normalize the data
         sc.pp.normalize_total(adata, target_sum=1e4)
         sc.pp.log1p(adata)
@@ -64,8 +73,13 @@ class LatentRepresentationFinder:
     def __init__(self, adata, args: FindLatentRepresentationsConfig):
         self.params = args

-
-
+        if "pearson_residuals" in adata.layers:
+            self.expression_array = (
+                adata[:, adata.var.highly_variable].layers["pearson_residuals"].copy()
+            )
+        else:
+            self.expression_array = adata[:, adata.var.highly_variable].X.copy()
+            self.expression_array = sc.pp.scale(self.expression_array, max_value=10)

         # Construct the neighboring graph
         self.graph_dict = construct_adjacency_matrix(adata, self.params)
@@ -103,6 +117,8 @@ def run_find_latent_representation(args: FindLatentRepresentationsConfig):
     # Load the ST data
     logger.info(f"Loading ST data of {args.sample_name}...")
     adata = sc.read_h5ad(args.input_hdf5_path)
+    sc.pp.filter_genes(adata, min_cells=1)
+
     logger.info(f"The ST data contains {adata.shape[0]} cells, {adata.shape[1]} genes.")

     # Load the cell type annotation