PyPI - gsg - Versions diffs - 0.6.0__tar.gz - Mend

gsg 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

gsg-0.6.0/GSG/__init__.py +4 -0
gsg-0.6.0/GSG/models/__init__.py +46 -0
gsg-0.6.0/GSG/models/edcoder.py +197 -0
gsg-0.6.0/GSG/models/gin.py +202 -0
gsg-0.6.0/GSG/models/loss_func.py +11 -0
gsg-0.6.0/GSG/models/utils.py +177 -0
gsg-0.6.0/GSG/preprocess.py +183 -0
gsg-0.6.0/GSG/train.py +21 -0
gsg-0.6.0/GSG/utils.py +32 -0
gsg-0.6.0/LICENSE.txt +21 -0
gsg-0.6.0/PKG-INFO +64 -0
gsg-0.6.0/README.md +54 -0
gsg-0.6.0/gsg.egg-info/PKG-INFO +64 -0
gsg-0.6.0/gsg.egg-info/SOURCES.txt +17 -0
gsg-0.6.0/gsg.egg-info/dependency_links.txt +1 -0
gsg-0.6.0/gsg.egg-info/requires.txt +10 -0
gsg-0.6.0/gsg.egg-info/top_level.txt +1 -0
gsg-0.6.0/pyproject.toml +31 -0
gsg-0.6.0/setup.cfg +4 -0

gsg-0.6.0/GSG/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from . import preprocess as pp
+from . import train
+from . import utils
+from . import models as model

gsg-0.6.0/GSG/models/__init__.py ADDED Viewed

@@ -0,0 +1,46 @@
+from .edcoder import PreModel
+#####################################################################################################################################
+#   Adapted from:                                                                                                                   #
+#   @inproceedings{hou2022graphmae,                                                                                                 #
+#    title={GraphMAE: Self-Supervised Masked Graph Autoencoders},                                                                   #
+#    author={Hou, Zhenyu and Liu, Xiao and Cen, Yukuo and Dong, Yuxiao and Yang, Hongxia and Wang, Chunjie and Tang, Jie},          #
+#    booktitle={Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},                              #
+#    pages={594--604},                                                                                                              #
+#    year={2022}                                                                                                                    #
+#   }                                                                                                                               #
+#####################################################################################################################################
+def build_model(args):
+    num_hidden = args.num_hidden
+    num_layers = args.num_layers
+    in_drop = args.in_drop
+    norm = args.norm
+    encoder_type = "gin"
+    if args.imputation:
+        decoder_type = "mlp"
+    else:
+        decoder_type = 'gin'
+    mask_rate = args.mask_rate
+    replace_rate = args.replace_rate
+    activation = args.activation
+    alpha_l = args.alpha_l
+    num_features = args.num_features
+    model = PreModel(
+        in_dim=num_features,
+        num_hidden=num_hidden,
+        num_layers=num_layers,
+        activation=activation,
+        feat_drop=in_drop,
+        encoder_type=encoder_type,
+        decoder_type=decoder_type,
+        mask_rate=mask_rate,
+        norm=norm,
+        replace_rate=replace_rate,
+        alpha_l=alpha_l,
+    )
+    return model

gsg-0.6.0/GSG/models/edcoder.py ADDED Viewed

@@ -0,0 +1,197 @@
+from typing import Optional
+from itertools import chain
+from functools import partial
+import torch
+import torch.nn as nn
+from .gin import GIN
+from .loss_func import sce_loss
+from .utils import drop_edge
+def setup_module(m_type, enc_dec, in_dim, num_hidden, out_dim, num_layers, dropout, activation, residual, norm) -> nn.Module:
+    if m_type == "gin":
+        mod = GIN(
+            in_dim=in_dim,
+            num_hidden=num_hidden,
+            out_dim=out_dim,
+            num_layers=num_layers,
+            dropout=dropout,
+            activation=activation,
+            residual=residual,
+            norm=norm,
+            encoding=(enc_dec == "encoding"),
+        )
+    elif m_type == "mlp":
+        # * just for decoder
+        mod = nn.Sequential(
+            nn.Linear(in_dim, num_hidden),
+            nn.PReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(num_hidden, out_dim)
+        )
+    elif m_type == "linear":
+        mod = nn.Linear(in_dim, out_dim)
+    else:
+        raise NotImplementedError
+    return mod
+class PreModel(nn.Module):
+    def __init__(
+            self,
+            in_dim: int,
+            num_hidden: int,
+            num_layers: int,
+            activation: str,
+            feat_drop: float,
+            norm: Optional[str],
+            mask_rate: float = 0.3,
+            encoder_type: str = "gin",
+            decoder_type: str = "gin",
+            loss_fn: str = "sce",
+            drop_edge_rate: float = 0.0,
+            replace_rate: float = 0.1,
+            alpha_l: float = 2,
+            residual: bool = False,
+            concat_hidden: bool = False,
+         ):
+        super(PreModel, self).__init__()
+        self._mask_rate = mask_rate
+        self._encoder_type = encoder_type
+        self._decoder_type = decoder_type
+        self._drop_edge_rate = drop_edge_rate
+        self._output_hidden_size = num_hidden
+        self._concat_hidden = concat_hidden
+        self._replace_rate = replace_rate
+        self._mask_token_rate = 1 - self._replace_rate
+        enc_num_hidden = num_hidden
+        dec_in_dim = num_hidden
+        dec_num_hidden = num_hidden
+        # build encoder
+        self.encoder = setup_module(
+            m_type=encoder_type,
+            enc_dec="encoding",
+            in_dim=in_dim,
+            num_hidden=enc_num_hidden,
+            out_dim=enc_num_hidden,
+            num_layers=num_layers,
+            activation=activation,
+            dropout=feat_drop,
+            residual=residual,
+            norm=norm,
+        )
+        # build decoder for attribute prediction
+        self.decoder = setup_module(
+            m_type=decoder_type,
+            enc_dec="decoding",
+            in_dim=dec_in_dim,
+            num_hidden=dec_num_hidden,
+            out_dim=in_dim,
+            num_layers=1,
+            activation=activation,
+            dropout=feat_drop,
+            residual=residual,
+            norm=norm,
+        )
+        self.enc_mask_token = nn.Parameter(torch.zeros(1, in_dim))
+        if concat_hidden:
+            self.encoder_to_decoder = nn.Linear(dec_in_dim * num_layers, dec_in_dim, bias=False)
+        else:
+            self.encoder_to_decoder = nn.Linear(dec_in_dim, dec_in_dim, bias=False)
+        # * setup loss function
+        self.criterion = partial(sce_loss, alpha=alpha_l)
+    @property
+    def output_hidden_dim(self):
+        return self._output_hidden_size
+    def encoding_mask_noise(self, g, x, mask_rate=0.3):
+        num_nodes = g.num_nodes()
+        perm = torch.randperm(num_nodes, device=x.device)
+        num_mask_nodes = int(mask_rate * num_nodes)
+        # random masking
+        num_mask_nodes = int(mask_rate * num_nodes)
+        mask_nodes = perm[: num_mask_nodes]
+        keep_nodes = perm[num_mask_nodes: ]
+        if self._replace_rate > 0:
+            num_noise_nodes = int(self._replace_rate * num_mask_nodes)
+            perm_mask = torch.randperm(num_mask_nodes, device=x.device)
+            token_nodes = mask_nodes[perm_mask[: int(self._mask_token_rate * num_mask_nodes)]]
+            noise_nodes = mask_nodes[perm_mask[-int(self._replace_rate * num_mask_nodes):]]
+            noise_to_be_chosen = torch.randperm(num_nodes, device=x.device)[:num_noise_nodes]
+            out_x = x.clone()
+            out_x[token_nodes] = 0.0
+            out_x[noise_nodes] = x[noise_to_be_chosen]
+        else:
+            out_x = x.clone()
+            token_nodes = mask_nodes
+            out_x[mask_nodes] = 0.0
+        out_x[token_nodes] += self.enc_mask_token
+        use_g = g.clone()
+        return use_g, out_x, (mask_nodes, keep_nodes)
+    def forward(self, g, x):
+        # ---- attribute reconstruction ----
+        loss = self.mask_attr_prediction(g, x)
+        loss_item = {"loss": loss.item()}
+        return loss, loss_item
+    def mask_attr_prediction(self, g, x, test=False):
+        pre_use_g, use_x, (mask_nodes, keep_nodes) = self.encoding_mask_noise(g, x, self._mask_rate)
+        if self._drop_edge_rate > 0:
+            use_g, masked_edges = drop_edge(pre_use_g, self._drop_edge_rate, return_edges=True)
+        else:
+            use_g = pre_use_g
+        enc_rep, all_hidden = self.encoder(use_g, use_x, return_hidden=True)
+        if self._concat_hidden:
+            enc_rep = torch.cat(all_hidden, dim=1)
+        # ---- attribute reconstruction ----
+        rep = self.encoder_to_decoder(enc_rep)
+        if self._decoder_type not in ("mlp", "linear"):
+            rep[mask_nodes] = 0
+        if self._decoder_type in ("mlp", "linear") :
+            recon = self.decoder(rep)
+        else:
+            recon = self.decoder(pre_use_g, rep)
+        x_init = x[mask_nodes]
+        x_rec = recon[mask_nodes]
+        # x_init = x
+        # x_rec = recon
+        if not test:
+            loss = self.criterion(x_rec, x_init)
+            return loss
+        return recon.detach().cpu().numpy()
+    def embed(self, g, x):
+        rep = self.encoder(g, x)
+        return rep
+    @property
+    def enc_params(self):
+        return self.encoder.parameters()
+    @property
+    def dec_params(self):
+        return chain(*[self.encoder_to_decoder.parameters(), self.decoder.parameters()])

gsg-0.6.0/GSG/models/gin.py ADDED Viewed

@@ -0,0 +1,202 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import dgl.function as fn
+from dgl.utils import expand_as_pair
+from .utils import create_activation, create_norm
+class GIN(nn.Module):
+    def __init__(self,
+                 in_dim,
+                 num_hidden,
+                 out_dim,
+                 num_layers,
+                 dropout,
+                 activation,
+                 residual,
+                 norm,
+                 encoding=False,
+                 learn_eps=False,
+                 aggr="sum",
+                 ):
+        super(GIN, self).__init__()
+        self.out_dim = out_dim
+        self.num_layers = num_layers
+        self.layers = nn.ModuleList()
+        self.activation = activation
+        self.dropout = dropout
+        last_activation = create_activation(activation) if encoding else None
+        last_residual = encoding and residual
+        last_norm = norm if encoding else None
+        if num_layers == 1:
+            apply_func = MLP(2, in_dim, num_hidden, out_dim, activation=activation, norm=norm)
+            if last_norm:
+                apply_func = ApplyNodeFunc(apply_func, norm=norm, activation=activation)
+            self.layers.append(GINConv(in_dim, out_dim, apply_func, init_eps=0, learn_eps=learn_eps, residual=last_residual))
+        else:
+            # input projection (no residual)
+            self.layers.append(GINConv(
+                in_dim,
+                num_hidden,
+                ApplyNodeFunc(MLP(2, in_dim, num_hidden, num_hidden, activation=activation, norm=norm), activation=activation, norm=norm),
+                init_eps=0,
+                learn_eps=learn_eps,
+                residual=residual)
+                )
+            # hidden layers
+            for l in range(1, num_layers - 1):
+                # due to multi-head, the in_dim = num_hidden * num_heads
+                self.layers.append(GINConv(
+                    num_hidden, num_hidden,
+                    ApplyNodeFunc(MLP(2, num_hidden, num_hidden, num_hidden, activation=activation, norm=norm), activation=activation, norm=norm),
+                    init_eps=0,
+                    learn_eps=learn_eps,
+                    residual=residual)
+                )
+            # output projection
+            apply_func = MLP(2, num_hidden, num_hidden, out_dim, activation=activation, norm=norm)
+            if last_norm:
+                apply_func = ApplyNodeFunc(apply_func, activation=activation, norm=norm)
+            self.layers.append(GINConv(num_hidden, out_dim, apply_func, init_eps=0, learn_eps=learn_eps, residual=last_residual))
+        self.head = nn.Identity()
+    def forward(self, g, inputs, return_hidden=False):
+        h = inputs
+        hidden_list = []
+        for l in range(self.num_layers):
+            h = F.dropout(h, p=self.dropout, training=self.training)
+            h = self.layers[l](g, h)
+            hidden_list.append(h)
+        # output projection
+        if return_hidden:
+            return self.head(h), hidden_list
+        else:
+            return self.head(h)
+    def reset_classifier(self, num_classes):
+        self.head = nn.Linear(self.out_dim, num_classes)
+class GINConv(nn.Module):
+    def __init__(self,
+                 in_dim,
+                 out_dim,
+                 apply_func,
+                 aggregator_type="sum",
+                 init_eps=0,
+                 learn_eps=False,
+                 residual=False,
+                 ):
+        super().__init__()
+        self._in_feats = in_dim
+        self._out_feats = out_dim
+        self.apply_func = apply_func
+        self._aggregator_type = aggregator_type
+        if aggregator_type == 'sum':
+            self._reducer = fn.sum
+        elif aggregator_type == 'max':
+            self._reducer = fn.max
+        elif aggregator_type == 'mean':
+            self._reducer = fn.mean
+        else:
+            raise KeyError('Aggregator type {} not recognized.'.format(aggregator_type))
+        if learn_eps:
+            self.eps = torch.nn.Parameter(torch.FloatTensor([init_eps]))
+        else:
+            self.register_buffer('eps', torch.FloatTensor([init_eps]))
+        if residual:
+            if self._in_feats != self._out_feats:
+                self.res_fc = nn.Linear(
+                    self._in_feats, self._out_feats, bias=False)
+                print("! Linear Residual !")
+            else:
+                print("Identity Residual ")
+                self.res_fc = nn.Identity()
+        else:
+            self.register_buffer('res_fc', None)
+    def forward(self, graph, feat):
+        with graph.local_scope():
+            aggregate_fn = fn.copy_src('h', 'm')
+            feat_src, feat_dst = expand_as_pair(feat, graph)
+            graph.srcdata['h'] = feat_src
+            graph.update_all(aggregate_fn, self._reducer('m', 'neigh'))
+            rst = (1 + self.eps) * feat_dst + graph.dstdata['neigh']
+            if self.apply_func is not None:
+                rst = self.apply_func(rst)
+            if self.res_fc is not None:
+                rst = rst + self.res_fc(feat_dst)
+            return rst
+class ApplyNodeFunc(nn.Module):
+    """Update the node feature hv with MLP, BN and ReLU."""
+    def __init__(self, mlp, norm="batchnorm", activation="relu"):
+        super(ApplyNodeFunc, self).__init__()
+        self.mlp = mlp
+        norm_func = create_norm(norm)
+        if norm_func is None:
+            self.norm = nn.Identity()
+        else:
+            self.norm = norm_func(self.mlp.output_dim)
+        self.act = create_activation(activation)
+    def forward(self, h):
+        h = self.mlp(h)
+        h = self.norm(h)
+        h = self.act(h)
+        return h
+class MLP(nn.Module):
+    """MLP with linear output"""
+    def __init__(self, num_layers, input_dim, hidden_dim, output_dim, activation="relu", norm="batchnorm"):
+        super(MLP, self).__init__()
+        self.linear_or_not = True  # default is linear model
+        self.num_layers = num_layers
+        self.output_dim = output_dim
+        if num_layers < 1:
+            raise ValueError("number of layers should be positive!")
+        elif num_layers == 1:
+            # Linear model
+            self.linear = nn.Linear(input_dim, output_dim)
+        else:
+            # Multi-layer model
+            self.linear_or_not = False
+            self.linears = torch.nn.ModuleList()
+            self.norms = torch.nn.ModuleList()
+            self.activations = torch.nn.ModuleList()
+            self.linears.append(nn.Linear(input_dim, hidden_dim))
+            for layer in range(num_layers - 2):
+                self.linears.append(nn.Linear(hidden_dim, hidden_dim))
+            self.linears.append(nn.Linear(hidden_dim, output_dim))
+            for layer in range(num_layers - 1):
+                self.norms.append(create_norm(norm)(hidden_dim))
+                self.activations.append(create_activation(activation))
+    def forward(self, x):
+        if self.linear_or_not:
+            # If linear model
+            return self.linear(x)
+        else:
+            # If MLP
+            h = x
+            for i in range(self.num_layers - 1):
+                h = self.norms[i](self.linears[i](h))
+                h = self.activations[i](h)
+            return self.linears[-1](h)

gsg-0.6.0/GSG/models/loss_func.py ADDED Viewed

@@ -0,0 +1,11 @@
+import torch.nn.functional as F
+def sce_loss(x, y, alpha=3):
+    x = F.normalize(x, p=2, dim=-1)
+    y = F.normalize(y, p=2, dim=-1)
+    loss = (1 - (x * y).sum(dim=-1)).pow_(alpha)
+    loss = loss.mean()
+    return loss

gsg-0.6.0/GSG/models/utils.py ADDED Viewed

@@ -0,0 +1,177 @@
+import random
+import logging
+import dgl
+import torch
+import numpy as np
+import torch.nn as nn
+from tqdm import tqdm
+from functools import partial
+from torch import optim as optim
+logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
+def accuracy(y_pred, y_true):
+    y_true = y_true.squeeze().long()
+    preds = y_pred.max(1)[1].type_as(y_true)
+    correct = preds.eq(y_true).double()
+    correct = correct.sum().item()
+    return correct / len(y_true)
+def set_random_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.determinstic = True
+def get_current_lr(optimizer):
+    return optimizer.state_dict()["param_groups"][0]["lr"]
+def create_activation(name):
+    if name == "relu":
+        return nn.ReLU()
+    elif name == "gelu":
+        return nn.GELU()
+    elif name == "prelu":
+        return nn.PReLU()
+    elif name is None:
+        return nn.Identity()
+    elif name == "elu":
+        return nn.ELU()
+    else:
+        raise NotImplementedError(f"{name} is not implemented.")
+def create_norm(name):
+    if name == "layernorm":
+        return nn.LayerNorm
+    elif name == "batchnorm":
+        return nn.BatchNorm1d
+    elif name == "graphnorm":
+        return partial(NormLayer, norm_type="groupnorm")
+    else:
+        return None
+def create_optimizer(opt, model, lr, weight_decay, get_num_layer=None, get_layer_scale=None):
+    opt_lower = opt.lower()
+    parameters = model.parameters()
+    opt_args = dict(lr=lr, weight_decay=weight_decay)
+    opt_split = opt_lower.split("_")
+    opt_lower = opt_split[-1]
+    if opt_lower == "adam":
+        optimizer = optim.Adam(parameters, **opt_args)
+    elif opt_lower == "adamw":
+        optimizer = optim.AdamW(parameters, **opt_args)
+    elif opt_lower == "adadelta":
+        optimizer = optim.Adadelta(parameters, **opt_args)
+    elif opt_lower == "radam":
+        optimizer = optim.RAdam(parameters, **opt_args)
+    elif opt_lower == "sgd":
+        opt_args["momentum"] = 0.9
+        return optim.SGD(parameters, **opt_args)
+    else:
+        assert False and "Invalid optimizer"
+    return optimizer
+# -------------------
+def pretrain(model, graph, feat, optimizer, max_epoch, device):
+    logging.info("start training..")
+    graph = graph.to(device)
+    x = feat.to(device)
+    epoch_iter = tqdm(range(max_epoch))
+    for epoch in epoch_iter:
+        model.train()
+        loss, loss_dict = model(graph, x)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        epoch_iter.set_description(f"# Epoch {epoch}: train_loss: {loss.item():.4f}")
+    return model
+def mask_edge(graph, mask_prob):
+    E = graph.num_edges()
+    mask_rates = torch.FloatTensor(np.ones(E) * mask_prob)
+    masks = torch.bernoulli(1 - mask_rates)
+    mask_idx = masks.nonzero().squeeze(1)
+    return mask_idx
+def drop_edge(graph, drop_rate, return_edges=False):
+    if drop_rate <= 0:
+        return graph
+    n_node = graph.num_nodes()
+    edge_mask = mask_edge(graph, drop_rate)
+    src = graph.edges()[0]
+    dst = graph.edges()[1]
+    nsrc = src[edge_mask]
+    ndst = dst[edge_mask]
+    ng = dgl.graph((nsrc, ndst), num_nodes=n_node)
+    ng = ng.add_self_loop()
+    dsrc = src[~edge_mask]
+    ddst = dst[~edge_mask]
+    if return_edges:
+        return ng, (dsrc, ddst)
+    return ng
+class NormLayer(nn.Module):
+    def __init__(self, hidden_dim, norm_type):
+        super().__init__()
+        if norm_type == "batchnorm":
+            self.norm = nn.BatchNorm1d(hidden_dim)
+        elif norm_type == "layernorm":
+            self.norm = nn.LayerNorm(hidden_dim)
+        elif norm_type == "graphnorm":
+            self.norm = norm_type
+            self.weight = nn.Parameter(torch.ones(hidden_dim))
+            self.bias = nn.Parameter(torch.zeros(hidden_dim))
+            self.mean_scale = nn.Parameter(torch.ones(hidden_dim))
+        else:
+            raise NotImplementedError
+    def forward(self, graph, x):
+        tensor = x
+        if self.norm is not None and type(self.norm) != str:
+            return self.norm(tensor)
+        elif self.norm is None:
+            return tensor
+        batch_list = graph.batch_num_nodes
+        batch_size = len(batch_list)
+        batch_list = torch.Tensor(batch_list).long().to(tensor.device)
+        batch_index = torch.arange(batch_size).to(tensor.device).repeat_interleave(batch_list)
+        batch_index = batch_index.view((-1,) + (1,) * (tensor.dim() - 1)).expand_as(tensor)
+        mean = torch.zeros(batch_size, *tensor.shape[1:]).to(tensor.device)
+        mean = mean.scatter_add_(0, batch_index, tensor)
+        mean = (mean.T / batch_list).T
+        mean = mean.repeat_interleave(batch_list, dim=0)
+        sub = tensor - mean * self.mean_scale
+        std = torch.zeros(batch_size, *tensor.shape[1:]).to(tensor.device)
+        std = std.scatter_add_(0, batch_index, sub.pow(2))
+        std = ((std.T / batch_list).T + 1e-6).sqrt()
+        std = std.repeat_interleave(batch_list, dim=0)
+        return self.weight * sub / std + self.bias

gsg-0.6.0/GSG/preprocess.py ADDED Viewed

@@ -0,0 +1,183 @@
+import os
+import warnings
+import itertools
+warnings.filterwarnings("ignore")
+import dgl
+import torch
+import anndata as ad
+import scanpy as sc
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from tqdm import tqdm
+from scipy import sparse
+from sklearn.cluster import KMeans
+from sklearn.neighbors import BallTree
+from scipy.spatial.distance import pdist, squareform
+from . import utils
+def read_10X_Visium(path,
+                    genome=None,
+                    count_file='filtered_feature_bc_matrix.h5',
+                    library_id=None,
+                    load_images=True,
+                    quality='hires',
+                    image_path = None):
+    adata = sc.read_visium(path,
+                        genome=genome,
+                        count_file=count_file,
+                        library_id=library_id,
+                        load_images=load_images,)
+    adata.var_names_make_unique()
+    if library_id is None:
+        library_id = list(adata.uns["spatial"].keys())[0]
+    if quality == "fulres":
+        image_coor = adata.obsm["spatial"]
+        img = plt.imread(image_path, 0)
+        adata.uns["spatial"][library_id]["images"]["fulres"] = img
+    else:
+        scale = adata.uns["spatial"][library_id]["scalefactors"][
+            "tissue_" + quality + "_scalef"]
+        image_coor = adata.obsm["spatial"] * scale
+    adata.obs["imagecol"] = image_coor[:, 0]
+    adata.obs["imagerow"] = image_coor[:, 1]
+    adata.uns["spatial"][library_id]["use_quality"] = quality
+    return adata
+def read_10X_Visium_with_label(path,
+                    genome=None,
+                    count_file='filtered_feature_bc_matrix.h5',
+                    library_id=None,
+                    load_images=True,
+                    quality='hires',
+                    image_path = None):
+    adata = sc.read_visium(path,
+                        genome=genome,
+                        count_file=count_file,
+                        library_id=library_id,
+                        load_images=load_images,)
+    adata.var_names_make_unique()
+    if library_id is None:
+        library_id = list(adata.uns["spatial"].keys())[0]
+    if quality == "fulres":
+        image_coor = adata.obsm["spatial"]
+        img = plt.imread(image_path, 0)
+        adata.uns["spatial"][library_id]["images"]["fulres"] = img
+    else:
+        scale = adata.uns["spatial"][library_id]["scalefactors"][
+            "tissue_" + quality + "_scalef"]
+        image_coor = adata.obsm["spatial"] * scale
+    if(os.path.exists(path + "/metadata.tsv")):
+        adata.obs = pd.read_table(path + "/metadata.tsv",sep="\t",index_col=0)
+    adata.obs["imagecol"] = image_coor[:, 0]
+    adata.obs["imagerow"] = image_coor[:, 1]
+    adata.uns["spatial"][library_id]["use_quality"] = quality
+    return adata
+def read_stereo_seq(counts_data_path, position_path):
+    counts_file = os.path.join(counts_data_path)
+    coor_file = os.path.join(position_path)
+    coor_df = pd.read_csv(coor_file, sep='\t')
+    counts = pd.read_csv(counts_file, sep='\t', index_col=0)
+    counts.columns = ['Spot_' + str(x) for x in counts.columns]
+    coor_df.index = coor_df['label'].map(lambda x: 'Spot_' + str(x))
+    adata = sc.AnnData(counts.T)
+    adata.obs = coor_df
+    adata.var_names_make_unique()
+    coor_df = coor_df.loc[adata.obs_names, ['y', 'x']]
+    adata.obsm["spatial"] = coor_df.to_numpy()
+    sc.pp.calculate_qc_metrics(adata, inplace=True)
+    adata.obs['imagecol'] = coor_df.iloc[:, 1]
+    adata.obs['imagerow'] = coor_df.iloc[:, 0]
+    return adata
+def read_slide_seq(path,
+                      library_id=None,
+                      scale=None,
+                      quality="hires",
+                      spot_diameter_fullres=50,
+                      background_color="white",):
+    count = pd.read_csv(os.path.join(path, "count_matrix.count"))
+    meta = pd.read_csv(os.path.join(path, "spatial.idx"))
+    adata = AnnData(count.iloc[:, 1:].set_index("gene").T)
+    adata.var["ENSEMBL"] = count["ENSEMBL"].values
+    adata.obs["index"] = meta["index"].values
+    if scale == None:
+        max_coor = np.max(meta[["x", "y"]].values)
+        scale = 2000 / max_coor
+    adata.obs["imagecol"] = meta["x"].values * scale
+    adata.obs["imagerow"] = meta["y"].values * scale
+    # Create image
+    max_size = np.max([adata.obs["imagecol"].max(), adata.obs["imagerow"].max()])
+    max_size = int(max_size + 0.1 * max_size)
+    if background_color == "black":
+        image = Image.new("RGBA", (max_size, max_size), (0, 0, 0, 0))
+    else:
+        image = Image.new("RGBA", (max_size, max_size), (255, 255, 255, 255))
+    imgarr = np.array(image)
+    if library_id is None:
+        library_id = "Slide-seq"
+    adata.uns["spatial"] = {}
+    adata.uns["spatial"][library_id] = {}
+    adata.uns["spatial"][library_id]["images"] = {}
+    adata.uns["spatial"][library_id]["images"][quality] = imgarr
+    adata.uns["spatial"][library_id]["use_quality"] = quality
+    adata.uns["spatial"][library_id]["scalefactors"] = {}
+    adata.uns["spatial"][library_id]["scalefactors"][
+        "tissue_" + quality + "_scalef"] = scale
+    adata.uns["spatial"][library_id]["scalefactors"][
+        "spot_diameter_fullres"
+    ] = spot_diameter_fullres
+    adata.obsm["spatial"] = meta[["x", "y"]].values
+    return adata
+def Graph_10X(adata, args):
+    cell_loc = adata.obs[["imagerow", "imagecol"]].values
+    if args.graph == 'radius':
+        distance_np = pdist(cell_loc, metric = "euclidean")
+        distance_np_X = squareform(distance_np)
+        threshold = args.threshold_radius
+        num_big = np.where((0< distance_np_X)&(distance_np_X < threshold))[0].shape[0]
+        adj_matrix = np.zeros(distance_np_X.shape)
+        non_zero_point = np.where((0< distance_np_X)&(distance_np_X<threshold))
+        for i in tqdm(range(num_big)):
+            x = non_zero_point[0][i]
+            y = non_zero_point[1][i]
+            adj_matrix[x][y] = 1
+        adj_matrix = adj_matrix + np.eye(distance_np_X.shape[0])
+        adj_matrix  = np.float32(adj_matrix)
+        adj_matrix_crs = sparse.csr_matrix(adj_matrix)
+    elif args.graph == 'knn':
+        tree = BallTree(cell_loc)
+        distances, tail_list = tree.query(cell_loc, k=args.num_neighbors)
+        head_list = []
+        head_list = [head_list + [i] * len(tail_list[i]) for i in range(len(tail_list))]
+        head_list = list(itertools.chain.from_iterable(head_list))
+        tail_list = list(itertools.chain.from_iterable(tail_list))
+        distances = np.ones_like(head_list)
+        adj_matrix_crs = sparse.coo_matrix((distances, (head_list, tail_list)), shape=(cell_loc.shape[0], cell_loc.shape[0])).tocsr()
+    graph = dgl.from_scipy(adj_matrix_crs, eweight_name='w')
+    adata.var_names=[i.upper() for i in list(adata.var_names)]
+    adata.var["genename"] = adata.var.index.astype("str")
+    adata.var_names_make_unique()
+    if(args.feature_dim_method == "PCA"):
+        sc.pp.filter_genes(adata, min_cells=5)
+        adata_X = sc.pp.normalize_total(adata, target_sum=1, exclude_highly_expressed=True, inplace=False)['X']
+        adata_X = sc.pp.scale(adata_X)
+        adata_X = sc.pp.pca(adata_X, n_comps=args.num_features)
+    else:
+        sc.pp.filter_genes(adata, min_cells=5)
+        sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=args.num_features)
+        sc.pp.normalize_total(adata, target_sum=1e4)
+        sc.pp.log1p(adata)
+        adata_Vars =  adata[:, adata.var['highly_variable']]
+        adata_X = adata_Vars.X.todense()
+    graph.ndata["feat"] = torch.tensor(adata_X.copy())
+    return adata,graph

gsg-0.6.0/GSG/train.py ADDED Viewed

@@ -0,0 +1,21 @@
+from . import models
+from . import utils
+def GSG_train(adata, graph, args):
+    device = args.device if args.device >= 0 else "cpu"
+    utils.set_random_seed(args.seeds)
+    model = models.build_model(args)
+    model.to(device)
+    optimizer = models.utils.create_optimizer(args.optimizer, model, args.lr, args.weight_decay)
+    x = graph.ndata["feat"]
+    if not args.load_model:
+        model = models.utils.pretrain(model, graph, x, optimizer, args.max_epoch, device)
+    model.train(False)
+    x = graph.ndata["feat"]
+    embedding = model.embed(graph.to(device), x.to(device))
+    adata.obsm["GSG_embedding"] = embedding.cpu().detach().numpy()
+    if args.imputation:
+        latten_embedding = model.encoder_to_decoder(embedding)
+        imputation_embedding =  model.decoder(graph.to(device),latten_embedding)
+        adata.obsm["GSG_imputation"] = imputation_embedding.cpu().detach().numpy()
+    return adata, model

gsg-0.6.0/GSG/utils.py ADDED Viewed

@@ -0,0 +1,32 @@
+import os
+import random
+import torch
+import numpy as np
+from scipy import sparse
+from sklearn.cluster import KMeans
+def mkdir(path):
+	folder = os.path.exists(path)
+	if not folder:
+		os.makedirs(path)
+		print("---  new folder...  ---")
+		print("---  OK  ---")
+	else:
+		print("---  There is this folder!  ---")
+def KMeans_use(embedding,cluster_number):
+    kmeans = KMeans(n_clusters=cluster_number,
+                init="k-means++",
+                random_state=0)
+    pred = kmeans.fit_predict(embedding)
+    return pred
+def set_random_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.determinstic = True

gsg-0.6.0/LICENSE.txt ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2023 keaml-Guan
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

gsg-0.6.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,64 @@
+Metadata-Version: 2.1
+Name: gsg
+Version: 0.6.0
+Summary: GSG: A generative self-supervised graph learning framework for spatial transcriptomics
+Author-email: Chuyao Wang <chuyao25@mails.jlu.edu.cn>
+License: MIT
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+License-File: LICENSE.txt
+# A masked generative graph representation learning framework empowering precise spatial domain identification
+![GitHub Repo stars](https://img.shields.io/github/stars/keaml-Guan/GSG) &nbsp;&nbsp; ![GitHub watchers](https://img.shields.io/github/watchers/keaml-Guan/GSG) &nbsp;&nbsp; ![GitHub](https://img.shields.io/github/license/keaml-Guan/GSG)
+#
+![](https://raw.githubusercontent.com/keaml-Guan/GSG/main/figures/Fig1_11_reduce.jpg)
+<br>
+## ✨ Overview
+Recent advances in spatial transcriptomics (ST) have opened new avenues for preserving spatial information while measuring gene expression. Yet, the challenge of seamlessly integrating this data into accurate and transferable representation remains. Here, we introduce a generative self-supervised graph (GSG) learning framework to achieve an effective joint embedding of location and gene expression within ST data. Our approach surpasses existing methods in identifying spatial domains within the human dorsolateral prefrontal cortex. Moreover, it can offer reliable analyses across various techniques, including Stereo-seq, Slide-seq, and seqFISH, irrespective of spatial resolution. Furthermore, GSG addresses dropout defects, enhancing gene expression by smoothing spatial patterns, extracting critical features, reducing batch effects, and enabling the integration of disparate datasets. Additionally, we performed spatial transcriptomic analysis on fetal human hearts, and applied GSG to extract biological insights. These experiments highlight GSG's accuracy in identifying spatial domains, uncovering specific *APCDD1* expression in fetal endocardium, and implicating its role in congenital heart disease. Our results showcase GSG's superiority and underscore its valuable contributions to advancing spatial-omics analysis.
+## 🛠️ Installation
+> [!NOTE]
+> **!!! The recommended operating system is Ubuntu 18.04 LTS.** Some packages may not download correctly on Windows.
+### Use a Python virtual environment with conda
+```sh
+conda create -n gsg python=3.8
+conda activate gsg
+# Install the cuDNN build that matches your CUDA version. Refer to https://developer.nvidia.com/cudnn-archive
+# conda install cudnn[=version]
+```
+### Install GSG
+Install GSG and DGL (GPU build) from PyPI:
+```sh
+pip install GSG==0.5.8
+pip install dgl-cu110 -f https://data.dgl.ai/wheels/repo.html
+```
+Required packages include:
+```sh
+torch==1.9.0, cudnn==8.4, numpy==1.22.0, scanpy==1.8.2, anndata==0.8.0, dgl==0.9.0,
+pandas==1.2.4, scipy==1.7.3, scikit-learn==1.0.1, tqdm==4.64.1, matplotlib==3.5.3,
+tensorboardX==2.5.1, pyyaml==6.0.1, plotly==5.21.0, kaleido==0.2.1, igraph==0.9.8
+```
+## 🚀 Quick Start
+See our model document details from [Docs](https://keaml-guan.github.io/GSG/).
+We provide the scripts for reproducing the quantitative and visualization results of the paper in [/docs/tutorials/](https://github.com/keaml-Guan/GSG/tree/main/docs/tutorials/).
+## 📚 Citation
+Wang C, Zhang T, Sun H, et al. A masked generative graph representation learning framework empowering precise spatial domain identification[J]. *Bioinformatics*, 2026, 42(6). [https://doi.org/10.1093/bioinformatics/btag333](https://doi.org/10.1093/bioinformatics/btag333).
+## 📩 Contact
+If you have any questions, feel free to contact [chuyao25@mails.jlu.edu.cn](mailto:chuyao25@mails.jlu.edu.cn).

gsg-0.6.0/README.md ADDED Viewed

@@ -0,0 +1,54 @@
+# A masked generative graph representation learning framework empowering precise spatial domain identification
+![GitHub Repo stars](https://img.shields.io/github/stars/keaml-Guan/GSG) &nbsp;&nbsp; ![GitHub watchers](https://img.shields.io/github/watchers/keaml-Guan/GSG) &nbsp;&nbsp; ![GitHub](https://img.shields.io/github/license/keaml-Guan/GSG)
+#
+![](https://raw.githubusercontent.com/keaml-Guan/GSG/main/figures/Fig1_11_reduce.jpg)
+<br>
+## ✨ Overview
+Recent advances in spatial transcriptomics (ST) have opened new avenues for preserving spatial information while measuring gene expression. Yet, the challenge of seamlessly integrating this data into accurate and transferable representation remains. Here, we introduce a generative self-supervised graph (GSG) learning framework to achieve an effective joint embedding of location and gene expression within ST data. Our approach surpasses existing methods in identifying spatial domains within the human dorsolateral prefrontal cortex. Moreover, it can offer reliable analyses across various techniques, including Stereo-seq, Slide-seq, and seqFISH, irrespective of spatial resolution. Furthermore, GSG addresses dropout defects, enhancing gene expression by smoothing spatial patterns, extracting critical features, reducing batch effects, and enabling the integration of disparate datasets. Additionally, we performed spatial transcriptomic analysis on fetal human hearts, and applied GSG to extract biological insights. These experiments highlight GSG's accuracy in identifying spatial domains, uncovering specific *APCDD1* expression in fetal endocardium, and implicating its role in congenital heart disease. Our results showcase GSG's superiority and underscore its valuable contributions to advancing spatial-omics analysis.
+## 🛠️ Installation
+> [!NOTE]
+> **!!! The recommended operating system is Ubuntu 18.04 LTS.** Some packages may not download correctly on Windows.
+### Use a Python virtual environment with conda
+```sh
+conda create -n gsg python=3.8
+conda activate gsg
+# Install the cuDNN build that matches your CUDA version. Refer to https://developer.nvidia.com/cudnn-archive
+# conda install cudnn[=version]
+```
+### Install GSG
+Install GSG and DGL (GPU build) from PyPI:
+```sh
+pip install GSG==0.5.8
+pip install dgl-cu110 -f https://data.dgl.ai/wheels/repo.html
+```
+Required packages include:
+```sh
+torch==1.9.0, cudnn==8.4, numpy==1.22.0, scanpy==1.8.2, anndata==0.8.0, dgl==0.9.0,
+pandas==1.2.4, scipy==1.7.3, scikit-learn==1.0.1, tqdm==4.64.1, matplotlib==3.5.3,
+tensorboardX==2.5.1, pyyaml==6.0.1, plotly==5.21.0, kaleido==0.2.1, igraph==0.9.8
+```
+## 🚀 Quick Start
+See our model document details from [Docs](https://keaml-guan.github.io/GSG/).
+We provide the scripts for reproducing the quantitative and visualization results of the paper in [/docs/tutorials/](https://github.com/keaml-Guan/GSG/tree/main/docs/tutorials/).
+## 📚 Citation
+Wang C, Zhang T, Sun H, et al. A masked generative graph representation learning framework empowering precise spatial domain identification[J]. *Bioinformatics*, 2026, 42(6). [https://doi.org/10.1093/bioinformatics/btag333](https://doi.org/10.1093/bioinformatics/btag333).
+## 📩 Contact
+If you have any questions, feel free to contact [chuyao25@mails.jlu.edu.cn](mailto:chuyao25@mails.jlu.edu.cn).

gsg-0.6.0/gsg.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,64 @@
+Metadata-Version: 2.1
+Name: gsg
+Version: 0.6.0
+Summary: GSG: A generative self-supervised graph learning framework for spatial transcriptomics
+Author-email: Chuyao Wang <chuyao25@mails.jlu.edu.cn>
+License: MIT
+Requires-Python: >=3.7
+Description-Content-Type: text/markdown
+License-File: LICENSE.txt
+# A masked generative graph representation learning framework empowering precise spatial domain identification
+![GitHub Repo stars](https://img.shields.io/github/stars/keaml-Guan/GSG) &nbsp;&nbsp; ![GitHub watchers](https://img.shields.io/github/watchers/keaml-Guan/GSG) &nbsp;&nbsp; ![GitHub](https://img.shields.io/github/license/keaml-Guan/GSG)
+#
+![](https://raw.githubusercontent.com/keaml-Guan/GSG/main/figures/Fig1_11_reduce.jpg)
+<br>
+## ✨ Overview
+Recent advances in spatial transcriptomics (ST) have opened new avenues for preserving spatial information while measuring gene expression. Yet, the challenge of seamlessly integrating this data into accurate and transferable representation remains. Here, we introduce a generative self-supervised graph (GSG) learning framework to achieve an effective joint embedding of location and gene expression within ST data. Our approach surpasses existing methods in identifying spatial domains within the human dorsolateral prefrontal cortex. Moreover, it can offer reliable analyses across various techniques, including Stereo-seq, Slide-seq, and seqFISH, irrespective of spatial resolution. Furthermore, GSG addresses dropout defects, enhancing gene expression by smoothing spatial patterns, extracting critical features, reducing batch effects, and enabling the integration of disparate datasets. Additionally, we performed spatial transcriptomic analysis on fetal human hearts, and applied GSG to extract biological insights. These experiments highlight GSG's accuracy in identifying spatial domains, uncovering specific *APCDD1* expression in fetal endocardium, and implicating its role in congenital heart disease. Our results showcase GSG's superiority and underscore its valuable contributions to advancing spatial-omics analysis.
+## 🛠️ Installation
+> [!NOTE]
+> **!!! The recommended operating system is Ubuntu 18.04 LTS.** Some packages may not download correctly on Windows.
+### Use a Python virtual environment with conda
+```sh
+conda create -n gsg python=3.8
+conda activate gsg
+# Install the cuDNN build that matches your CUDA version. Refer to https://developer.nvidia.com/cudnn-archive
+# conda install cudnn[=version]
+```
+### Install GSG
+Install GSG and DGL (GPU build) from PyPI:
+```sh
+pip install GSG==0.5.8
+pip install dgl-cu110 -f https://data.dgl.ai/wheels/repo.html
+```
+Required packages include:
+```sh
+torch==1.9.0, cudnn==8.4, numpy==1.22.0, scanpy==1.8.2, anndata==0.8.0, dgl==0.9.0,
+pandas==1.2.4, scipy==1.7.3, scikit-learn==1.0.1, tqdm==4.64.1, matplotlib==3.5.3,
+tensorboardX==2.5.1, pyyaml==6.0.1, plotly==5.21.0, kaleido==0.2.1, igraph==0.9.8
+```
+## 🚀 Quick Start
+See our model document details from [Docs](https://keaml-guan.github.io/GSG/).
+We provide the scripts for reproducing the quantitative and visualization results of the paper in [/docs/tutorials/](https://github.com/keaml-Guan/GSG/tree/main/docs/tutorials/).
+## 📚 Citation
+Wang C, Zhang T, Sun H, et al. A masked generative graph representation learning framework empowering precise spatial domain identification[J]. *Bioinformatics*, 2026, 42(6). [https://doi.org/10.1093/bioinformatics/btag333](https://doi.org/10.1093/bioinformatics/btag333).
+## 📩 Contact
+If you have any questions, feel free to contact [chuyao25@mails.jlu.edu.cn](mailto:chuyao25@mails.jlu.edu.cn).

gsg-0.6.0/gsg.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,17 @@
+LICENSE.txt
+README.md
+pyproject.toml
+GSG/__init__.py
+GSG/preprocess.py
+GSG/train.py
+GSG/utils.py
+GSG/models/__init__.py
+GSG/models/edcoder.py
+GSG/models/gin.py
+GSG/models/loss_func.py
+GSG/models/utils.py
+gsg.egg-info/PKG-INFO
+gsg.egg-info/SOURCES.txt
+gsg.egg-info/dependency_links.txt
+gsg.egg-info/requires.txt
+gsg.egg-info/top_level.txt

gsg-0.6.0/gsg.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

gsg-0.6.0/gsg.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,10 @@
+numpy==1.21.6
+pandas==1.2.4
+scipy
+scikit-learn
+torch==1.9.0
+dgl==0.9.0
+scanpy==1.8.2
+anndata==0.8.0
+squidpy==1.1.2
+leidenalg<0.11,>=0.8.2

gsg-0.6.0/gsg.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ GSG

gsg-0.6.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,31 @@
+[build-system]
+requires = ["setuptools>=59.0", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "gsg"
+version = "0.6.0"
+description = "GSG: A generative self-supervised graph learning framework for spatial transcriptomics"
+readme = "README.md"
+requires-python = ">=3.7"
+authors = [
+    {name = "Chuyao Wang", email = "chuyao25@mails.jlu.edu.cn"}
+]
+license = {text = "MIT"}
+dependencies = [
+    "numpy==1.21.6",
+    "pandas==1.2.4",
+    "scipy",
+    "scikit-learn",
+    "torch==1.9.0",
+    "dgl==0.9.0",
+    "scanpy==1.8.2",
+    "anndata==0.8.0",
+    "squidpy==1.1.2",
+    "leidenalg>=0.8.2,<0.11",
+]
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["GSG*"]

gsg-0.6.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0