hidt 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hidt-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Li Junping <lijunping02@qq.com>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
hidt-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.1
2
+ Name: hidt
3
+ Version: 0.1.0
4
+ Summary: A computational pipeline for identifying differential TADs from 3D genome contact maps
5
+ Home-page: https://github.com/GaoLabXDU/HiDT
6
+ Author: Li Junping
7
+ Author-email: lijunping02@qq.com
8
+ License: MIT Licence
9
+ Keywords: 3D genome,Comparative analysis,Topologically associating domains
10
+ Platform: any
11
+ License-File: LICENSE
@@ -0,0 +1,11 @@
1
# -*- coding: utf-8 -*-
"""
HiDT package initializer.

Created on June 29 2025

@author: Li Junping
"""
__author__ = 'Li Junping'
__version__ = '0.1.0'

# Absolute path of this module file.
# NOTE(review): purpose unclear — presumably used to locate packaged files
# (e.g. the pretrained model); confirm against callers.
Me = __file__
11
+
@@ -0,0 +1,12 @@
1
+ import torch
2
+ from sklearn import metrics
3
+ import numpy as np
4
+
5
def euclidean_distance(x, y):
    """Return the squared Euclidean distance along the last dimension."""
    diff = x - y
    return torch.sum(diff * diff, dim=-1)
8
+
9
def compute_similarity(x, y):
    """Similarity between x and y: the negated squared Euclidean distance.

    Larger (less negative) values mean the vectors are closer.
    """
    return -euclidean_distance(x, y)
@@ -0,0 +1,72 @@
1
+ import numpy as np
2
+ import hicstraw
3
+ from scipy.sparse import coo_matrix
4
+
5
def load_binsNum(hicfile, res):
    """Return {chromosome name: number of bins} for a .hic file.

    Chromosome lengths are read from the .hic header; chromosomes whose
    name contains 'Y', 'MT', 'All' or 'chrY' are excluded.

    :param hicfile: path to the input .hic file
    :param res: resolution (bin size in bp)
    :return: dict mapping chromosome name to its bin count
    NOTE(review): (length + res) // res yields one extra bin when the
    chromosome length is an exact multiple of res — confirm intended.
    """
    exclude_chroms = {'Y', 'MT', 'All', 'chrY'}
    hic = hicstraw.HiCFile(hicfile)
    chrom_bins = {}
    for chrom in hic.getChromosomes():
        # substring match, so e.g. 'chrY' is caught by 'Y' as well
        if any(tag in chrom.name for tag in exclude_chroms):
            continue
        chrom_bins[chrom.name] = (chrom.length + res) // res
    return chrom_bins
17
+
18
def normalizationMat(matrix, binsNum):
    """Row-normalize a sparse contact matrix.

    Every stored entry is divided by its row sum so that each non-empty
    row sums to 1; entries in rows whose sum is zero are left unchanged.

    Improvements over the original: the per-entry Python loop is replaced
    by a vectorized division over the COO data array, and the result is
    always computed in floating point (the old `np.zeros_like(matrix.data)`
    buffer silently truncated normalized values to 0 for integer inputs).

    :param matrix: scipy sparse matrix of raw contact counts
    :param binsNum: number of bins (the matrix is binsNum x binsNum)
    :return: row-normalized matrix in COO format
    """
    row_sums = np.array(matrix.sum(axis=1)).flatten()
    matrix = matrix.tocoo()
    # Divide by the row sum where it is non-zero; divide by 1 (keep the
    # original value) where the whole row sums to zero.
    per_entry_sums = row_sums[matrix.row]
    divisors = np.where(per_entry_sums != 0, per_entry_sums, 1.0)
    normed_count = matrix.data / divisors
    normed_mat = coo_matrix((normed_count, (matrix.row, matrix.col)),
                            shape=(binsNum, binsNum))
    return normed_mat
33
+
34
def constructSpaMat(result, binsNum, res):
    """Build a symmetric sparse contact matrix from straw records.

    The upper-triangular records from straw are mirrored (A + A.T) and the
    diagonal, which would otherwise be counted twice, is subtracted once.

    :param result: list of straw records with binX, binY, counts attributes
    :param binsNum: number of bins (the matrix is binsNum x binsNum)
    :param res: resolution in bp used to convert coordinates to bin indices
    :return: symmetric COO contact matrix
    """
    bin_x = np.array([int(rec.binX / res) for rec in result])
    bin_y = np.array([int(rec.binY / res) for rec in result])
    vals = np.array([rec.counts for rec in result])
    shape = (binsNum, binsNum)
    upper = coo_matrix((vals, (bin_x, bin_y)), shape=shape)
    lower = coo_matrix((vals, (bin_y, bin_x)), shape=shape)
    on_diag = bin_x == bin_y
    diagonal = coo_matrix((vals[on_diag], (bin_x[on_diag], bin_y[on_diag])), shape=shape)
    return upper + lower - diagonal
52
+
53
def filter_matrix(mat):
    """Return True when the matrix is valid (every column sum >= 0.1)."""
    column_totals = np.sum(mat, axis=0)
    return not np.any(column_totals < 0.1)
59
+
60
def dumpMatrix(chrom, binsNum, res, hicfile):
    """
    Convert one chromosome of a .hic file into a row-normalized sparse matrix.

    :param chrom: chromosome ID
    :param binsNum: number of bins for this chromosome
    :param res: resolution (bin size in bp) of the Hi-C data
    :param hicfile: path to the input .hic file
    :return: row-normalized contact matrix in CSR format
    """
    # KR-balanced observed counts for the intra-chromosomal contact map
    result = hicstraw.straw('observed', 'KR', hicfile, str(chrom), str(chrom), 'BP', res)
    # mirror the upper triangle into a full symmetric matrix, then row-normalize
    sp_mat = constructSpaMat(result, binsNum, res)
    normed_mat = normalizationMat(sp_mat, binsNum)
    return normed_mat.tocsr()
@@ -0,0 +1,9 @@
1
+ import torch
2
+
3
def euclidean_distance(x, y):
    """Squared Euclidean distance along the last axis."""
    return ((x - y) ** 2).sum(dim=-1)
6
+
7
def pairwise_loss(x, y, labels, margin=1.0):
    """Margin-based pairwise loss on the squared Euclidean distance.

    :param x: first batch of embeddings
    :param y: second batch of embeddings
    :param labels: +1 for similar pairs, -1 for dissimilar pairs
    :param margin: hinge margin
    :return: (per-pair loss, per-pair squared distance)
    """
    distance = euclidean_distance(x, y)
    hinge = torch.relu(margin - labels * (1 - distance))
    return hinge, distance
@@ -0,0 +1,192 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+
7
def DSN2(t):
    """Normalize one square matrix toward doubly stochastic form.

    Adds a rank-one correction so row and column sums become equal, then
    rescales; entries that were zero in the input stay exactly zero.
    NOTE(review): assumes t is a square 2-D tensor — confirm with callers.
    """
    a = t.sum(dim=1, keepdim=True)   # row sums, shape (n, 1)
    b = t.sum(dim=0, keepdim=True)   # column sums, shape (1, n)
    # largest row or column sum acts as the normalization target
    lamb = torch.cat([a.squeeze(), b.squeeze()], dim=0).max()
    # total deficit of the matrix relative to n * lamb
    r = t.shape[0] * lamb - t.sum(dim=0).sum(dim=0)
    a = a.expand(-1, t.shape[1])
    b = b.expand(t.shape[0], -1)
    # rank-one additive correction equalizing row/column sums
    tt = t + (lamb ** 2 - lamb * (a + b) + a * b) / r
    # rescale by the first column's sum
    ttmatrix = tt / tt.sum(dim=0)[0]
    # preserve the sparsity pattern: zero entries remain zero
    ttmatrix = torch.where(t > 0, ttmatrix, t)
    return ttmatrix
18
+
19
+
20
def DSN(x):
    """Doubly stochastic normalization applied to each matrix in a batch.

    :param x: (batch, n, n) tensor of matrices
    :return: (batch, n, n) tensor with DSN2 applied to every matrix
    """
    normalized = [DSN2(x[idx]) for idx in range(x.shape[0])]
    return torch.stack(normalized, dim=0)
28
+
29
+
30
def unsorted_segment_sum(data, segment_ids, num_segments):
    """
    Computes the sum along segments of a tensor. Analogous to tf.unsorted_segment_sum.

    :param data: A tensor whose segments are to be summed.
    :param segment_ids: The segment indices tensor.
    :param num_segments: The number of segments.
    :return: A tensor of same data type as the data argument.
    """

    assert all([i in data.shape for i in segment_ids.shape]), "segment_ids.shape should be a prefix of data.shape"

    # Encourage to use the below code when a deterministic result is
    # needed (reproducibility). However, the code below is with low efficiency.

    # tensor = torch.zeros(num_segments, data.shape[1], device=data.device)
    # for index in range(num_segments):
    #     tensor[index, :] = torch.sum(data[segment_ids == index, :], dim=0)
    # return tensor

    # Expand 1-D segment ids to match data's shape, as scatter_add requires
    # an index tensor with the same shape as the source.
    if len(segment_ids.shape) == 1:
        s = torch.prod(torch.tensor(data.shape[1:], device=data.device)).long()
        segment_ids = segment_ids.repeat_interleave(s).view(segment_ids.shape[0], *data.shape[1:])

    assert data.shape == segment_ids.shape, "data.shape and segment_ids.shape should be equal"

    # Row i of the result accumulates every data row whose segment id is i.
    # NOTE(review): scatter_add on CUDA is non-deterministic — hence the
    # commented-out loop above for reproducibility.
    shape = [num_segments] + list(data.shape[1:])
    tensor = torch.zeros(*shape, device=data.device).scatter_add(0, segment_ids, data)
    tensor = tensor.type(data.dtype)
    return tensor
60
+
61
+ # reference Beaconet (https://github.com/GaoLabXDU/Beaconet)
62
class BatchSpecificNorm(nn.Module):
    """Learned per-batch affine transform: x * scale[b] + shift[b].

    Each batch index owns its own scale and shift vectors; scales are
    initialized to 1 and shifts to 0, i.e. the identity transform.
    (Adapted from Beaconet, https://github.com/GaoLabXDU/Beaconet)
    """

    def __init__(self, n_batches, feature_dim, eps=1e-8):
        super(BatchSpecificNorm, self).__init__()
        self.scale = nn.Embedding(n_batches, feature_dim)
        self.shift = nn.Embedding(n_batches, feature_dim)
        nn.init.ones_(self.scale.weight)
        nn.init.zeros_(self.shift.weight)
        # eps is stored but currently unused by forward()
        self.eps = eps

    def forward(self, x, batch_idx):
        """Apply the batch-specific affine transform row-wise.

        :param x: (n, feature_dim) features
        :param batch_idx: (n,) long tensor of batch indices
        """
        return x * self.scale(batch_idx) + self.shift(batch_idx)
75
+
76
class GraphAttentionLayer(nn.Module):
    """
    Simple GAT layer, similar to https://arxiv.org/abs/1710.10903

    Attention scores are modulated by per-channel edge attributes and then
    re-normalized with DSN (doubly stochastic normalization).
    """
    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        # NOTE(review): `dropout` is accepted but never used in this layer.
        super(GraphAttentionLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha        # LeakyReLU negative slope
        self.concat = concat      # True: concat per-channel outputs + ELU; False: sum them
        self.W = nn.Parameter(torch.empty(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        # a stacks both halves of the attention vector: [a_source; a_target]
        self.a = nn.Parameter(torch.empty(size=(2 * out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)
        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, h, edge_attr):
        # h: (N, in_features) node features; edge_attr: (C, N, N) per-channel
        # edge attribute matrices. Returns (node outputs, raw scores e).
        Wh = torch.mm(h, self.W)  # h.shape: (N, in_features), Wh.shape: (N, out_features)
        e = self._prepare_attentional_mechanism_input(Wh)
        e = e * edge_attr          # weight pairwise scores by edge attributes
        attention = DSN(e)         # doubly stochastic re-normalization
        h_prime = []
        for i in range(edge_attr.shape[0]):
            h_prime.append(torch.matmul(attention[i], Wh))
        if self.concat:
            # concatenate per-channel results -> (N, C * out_features)
            h_prime = torch.cat(h_prime, dim=1)
            return F.elu(h_prime), e
        else:
            # sum per-channel results -> (N, out_features)
            h_prime = torch.stack(h_prime, dim=0)
            h_prime = torch.sum(h_prime, dim=0)
            return h_prime, e

    # compute attention coefficient
    def _prepare_attentional_mechanism_input(self, Wh):
        Wh1 = torch.matmul(Wh, self.a[:self.out_features, :])  # source term
        Wh2 = torch.matmul(Wh, self.a[self.out_features:, :])  # target term
        # broadcast add -> (N, N) pairwise attention scores
        e = Wh1 + Wh2.T
        return self.leakyrelu(e)

    def __repr__(self):
        return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')'
118
+
119
+
120
class GraphAggregator(nn.Module):
    """This module computes graph representations by aggregating from parts."""
    def __init__(self,
                 node_hidden_sizes,
                 input_size):
        """
        :param node_hidden_sizes: list; the last entry sets the graph state dim
        :param input_size: list; input_size[0] is the node feature dimension
        """
        super(GraphAggregator, self).__init__()
        self._node_hidden_sizes = node_hidden_sizes
        self._graph_state_dim = node_hidden_sizes[-1]
        self._graph_transform_sizes = node_hidden_sizes[-1]
        self._input_size = input_size
        self.MLP1, self.MLP2 = self.build_model()

    def build_model(self):
        """Build the gating MLP (MLP1) and the graph-transform MLP (MLP2)."""
        node_hidden_sizes = self._node_hidden_sizes
        # MLP1 outputs 2x the state dim: one half is sigmoid gates, the
        # other half the gated values (see forward).
        # NOTE(review): this mutates the caller-supplied list in place.
        node_hidden_sizes[-1] = self._graph_state_dim * 2
        layer = [nn.Linear(self._input_size[0], 64)]
        layer.append(nn.ReLU())
        layer.append(nn.Linear(64, node_hidden_sizes[0]))
        MLP1 = nn.Sequential(*layer)
        layer = []
        layer.append(nn.Linear(self._graph_state_dim, 32))
        layer.append(nn.ReLU())
        layer.append(nn.Linear(32, 16))
        MLP2 = nn.Sequential(*layer)
        return MLP1, MLP2

    def forward(self, node_states, graph_idx):
        """Compute aggregated graph representations.

        :param node_states: (n_nodes, input_size[0]) node features
        :param graph_idx: (n_nodes,) tensor mapping each node to its graph id
        :return: (n_graphs, 16) graph representations
        """
        node_states_g = self.MLP1(node_states)
        # Gated sum: sigmoid gates select which node features contribute.
        gates = torch.sigmoid(node_states_g[:, :self._graph_state_dim])
        node_states_g = node_states_g[:, self._graph_state_dim:] * gates
        n_graphs = max(graph_idx) + 1
        graph_states = unsorted_segment_sum(node_states_g, graph_idx, n_graphs)
        graph_states = self.MLP2(graph_states)
        return graph_states
156
+
157
+
158
class EdgeGNN(nn.Module):
    """Edge-aware graph attention network producing per-graph embeddings.

    Multi-head GraphAttentionLayers, with attention modulated by edge
    features, followed by batch-specific normalization and a gated
    graph-level aggregator.
    """
    def __init__(self, node_hidden_dims, edge_feature_dim, dropout, alpha, nheads):
        super(EdgeGNN, self).__init__()
        # Plain list is fine: each head is registered via add_module below.
        self.attentions = [GraphAttentionLayer(node_hidden_dims[0], node_hidden_dims[1], dropout=dropout, alpha=alpha, concat=True) for _ in range(nheads[0])]
        for i, attention in enumerate(self.attentions):
            self.add_module('attention_{}'.format(i), attention)
        self.out_att = GraphAttentionLayer(node_hidden_dims[1] * nheads[0] * edge_feature_dim, node_hidden_dims[2], dropout=dropout, alpha=alpha,
                                           concat=False)
        # 8 sequencing-depth buckets (see the depth-estimation helpers)
        self.bs_norm = BatchSpecificNorm(n_batches=8, feature_dim=16)
        self.bn1 = nn.BatchNorm1d(node_hidden_dims[2])
        # NOTE(review): bn2 appears unused in forward().
        self.bn2 = nn.BatchNorm1d(node_hidden_dims[1] * nheads[0] * edge_feature_dim)
        self.aggregator = GraphAggregator(node_hidden_sizes=[node_hidden_dims[3]], input_size=[node_hidden_dims[2]])
        self.batch_norm = nn.BatchNorm1d(node_hidden_dims[2])
    def forward(self, node_features, edge_features, graph_idx, depth_idx):
        """
        :param node_features: (n_nodes, node_hidden_dims[0]) node inputs
        :param edge_features: (n_nodes * n_nodes, edge_feature_dim) edge inputs
        :param graph_idx: node -> graph id mapping
        :param depth_idx: node -> sequencing-depth bucket mapping
        :return: (graph embeddings, final edge attention scores)
        """
        x = node_features
        n_nodes = x.shape[0]
        # Unstack edge features into edge_feature_dim adjacency matrices.
        splits = edge_features.split(1, dim=1)
        edge_attr = [split.view(n_nodes, n_nodes) for split in splits]
        edge_attr = torch.stack(edge_attr, dim=0)
        edge_attr = DSN(edge_attr)

        # Multi-head attention; edge_attr is refined by each head in turn.
        temp_x = []
        for att in self.attentions:
            inn_x, edge_attr = att(x, edge_attr)
            temp_x.append(inn_x)

        x = torch.cat(temp_x, dim=1)
        x, edge_attr = self.out_att(x, edge_attr)
        x = self.bn1(x)
        x = F.elu(x)
        # Depth-specific affine correction, then per-graph aggregation.
        x = self.bs_norm(x, depth_idx)
        x = F.relu(x)
        graph_states = self.aggregator(x, graph_idx)
        graph_states = self.batch_norm(graph_states)
        return graph_states, edge_attr
@@ -0,0 +1,107 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import torch
4
+
5
def load_TAD_region(TADfile1, TADfile2):
    """Combine two tab-separated differential-TAD tables, sorted by position.

    :param TADfile1: first TAD file (all of its columns are kept)
    :param TADfile2: second TAD file (reduced to chrom/start/end)
    :return: concatenated DataFrame sorted by chrom, then start
    NOTE(review): TADfile1 is not reduced to chrom/start/end like TADfile2;
    any extra columns become NaN for TADfile2 rows — confirm intended.
    """
    # load diffTADs
    region1 = pd.read_csv(TADfile1, sep='\t')
    region2 = pd.read_csv(TADfile2, sep='\t')
    region2 = region2[['chrom', 'start', 'end']]
    # combine
    all_TAD_region = pd.concat([region1, region2], ignore_index=True)
    all_TAD_region = all_TAD_region.sort_values(by=['chrom', 'start'])
    return all_TAD_region
14
+
15
def Slice_matrix(mat, start, end):
    """Extract the square sub-matrix of one TAD as a dense array.

    :param mat: sparse contact matrix
    :param start: first bin index (inclusive)
    :param end: last bin index (inclusive; the slice uses end + 1)
    :return: dense numpy array of the TAD sub-matrix
    :raises ValueError: when a boundary lies outside the matrix
    NOTE(review): the guard uses `>` so end == row passes and the slice is
    silently clipped by the sparse indexer — confirm whether `>=` was meant.
    """
    row = mat.shape[0]
    if start > row or end > row:
        print(start, end, row)
        raise ValueError("invalid TAD boundary")
    else:
        cut_mat = mat[start:end + 1, start:end + 1]
        return cut_mat.toarray()
24
+
25
def filter_matrix(mat):
    """Filter invalid TADs: reject matrices containing a near-empty column.

    :param mat: 2-D numpy array (dense normalized TAD sub-matrix)
    :return: True when every column sum is >= 0.1, False otherwise
    """
    # Removed the unused `rows` local and collapsed the if/else into a
    # single boolean expression.
    col_sum = np.sum(mat, axis=0)
    return not np.any(col_sum < 0.1)
33
+
34
def reshape_and_split_tensor(tensor, n_splits):
    """Reshape and split a 2D tensor along the last dimension.

    Args:
        tensor: a [num_examples, feature_dim] tensor. num_examples must be a
            multiple of `n_splits`.
        n_splits: int, number of splits to split the tensor into.

    Returns:
        splits: a list of `n_splits` tensors. The first split is [tensor[0],
            tensor[n_splits], tensor[n_splits * 2], ...], the second split is
            [tensor[1], tensor[n_splits + 1], tensor[n_splits * 2 + 1], ...], etc..
    """
    feature_dim = tensor.shape[-1]
    # Group n_splits consecutive rows into one wide row, then slice the
    # wide rows back into n_splits column blocks.
    grouped = torch.reshape(tensor, [-1, feature_dim * n_splits])
    return [grouped[:, k * feature_dim:(k + 1) * feature_dim] for k in range(n_splits)]
53
+
54
+
55
def get_graphs(graphs, n_features, graph_idx, depth_idx, labels, edge_features):
    """Convert packed numpy batch pieces into torch tensors for the model.

    :param graphs: combined adjacency matrix (numpy)
    :param n_features: node feature matrix (numpy)
    :param graph_idx: per-node graph id array (numpy int)
    :param depth_idx: per-node depth bucket array (numpy int)
    :param labels: per-pair labels
    :param edge_features: int, number of edge feature channels to replicate
    :return: (edge_features, node_features, graph_idx, depth_idx, labels) tensors
    """
    adjacency = torch.FloatTensor(graphs)
    # Each adjacency entry becomes one edge row, replicated across the
    # edge feature channels.
    edge_feat = adjacency.reshape(-1, 1).repeat(1, edge_features)
    node_feat = torch.FloatTensor(n_features)
    graph_ids = torch.from_numpy(graph_idx).long()
    depth_ids = torch.from_numpy(depth_idx).long()
    label_tensor = torch.FloatTensor(labels)
    return edge_feat, node_feat, graph_ids, depth_ids, label_tensor
65
+
66
def pack_batches(graphs, depths):
    """Pack graph pairs into one block-diagonal adjacency matrix.

    :param graphs: list of (adj1, adj2) numpy matrix pairs
    :param depths: list of (depth1, depth2) sequencing-depth bucket pairs
    :return: (combine_adj, node_features, graph_idx, depth_idx)
    """
    n_graph = len(graphs)
    # init adj matrix
    # Total node count over the first graph of each pair.
    # NOTE(review): assumes both graphs of a pair have the same node count
    # (cur_node from graph_1 is reused for graph_2 below) — confirm.
    sum_node = 0
    for i in range(n_graph):
        cur_graph = graphs[i][0]
        cur_node = np.shape(cur_graph)[0]
        sum_node += cur_node
    combine_adj = np.zeros((sum_node*2, sum_node*2))
    # add
    # Place every graph on the diagonal and record, per node, which graph
    # and which depth bucket it belongs to.
    graph_idx = []
    depth_idx = []
    cur_row = 0
    idx = 0
    for i in range(n_graph):
        graph_1 = graphs[i][0]
        graph_2 = graphs[i][1]
        cur_depth1 = depths[i][0]
        cur_depth2 = depths[i][1]
        cur_node = np.shape(graph_1)[0]
        combine_adj[cur_row:cur_row + cur_node, cur_row:cur_row + cur_node] = graph_1
        cur_row += cur_node
        combine_adj[cur_row:cur_row + cur_node, cur_row:cur_row + cur_node] = graph_2
        cur_row += cur_node
        graph_idx.append(np.ones(cur_node, dtype=np.int32) * idx)
        depth_idx.append(np.ones(cur_node, dtype=np.int32) * cur_depth1)
        idx += 1
        graph_idx.append(np.ones(cur_node, dtype=np.int32) * idx)
        depth_idx.append(np.ones(cur_node, dtype=np.int32) * cur_depth2)
        idx += 1

    depth_idx = np.concatenate(depth_idx, axis=0)
    graph_idx = np.concatenate(graph_idx, axis=0)
    # Constant all-ones node features, 8 channels per node.
    node_features = np.ones((sum_node*2, 8), dtype=np.float64)
    return combine_adj, node_features, graph_idx, depth_idx
101
+
102
def generate_valid_batches(idx, graphs, labels, depths, bs):
    """Slice one batch starting at `idx` and pack it for the model.

    :param idx: start offset into the full dataset
    :param graphs: list of (adj1, adj2) pairs
    :param labels: list of per-pair labels
    :param depths: list of (depth1, depth2) pairs
    :param bs: batch size
    :return: (combined adjacency, node features, labels, graph ids, depth ids)
    """
    graph_slice = graphs[idx:idx + bs]
    depth_slice = depths[idx:idx + bs]
    label_slice = labels[idx:idx + bs]
    packed_adj, packed_features, graph_ids, depth_ids = pack_batches(graph_slice, depth_slice)
    return packed_adj, packed_features, label_slice, graph_ids, depth_ids
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.1
2
+ Name: hidt
3
+ Version: 0.1.0
4
+ Summary: A computational pipeline for identifying differential TADs from 3D genome contact maps
5
+ Home-page: https://github.com/GaoLabXDU/HiDT
6
+ Author: Li Junping
7
+ Author-email: lijunping02@qq.com
8
+ License: MIT Licence
9
+ Keywords: 3D genome,Comparative analysis,Topologically associating domains
10
+ Platform: any
11
+ License-File: LICENSE
@@ -0,0 +1,14 @@
1
+ LICENSE
2
+ setup.py
3
+ hidt/__init__.py
4
+ hidt/evaluation.py
5
+ hidt/load_hic_format.py
6
+ hidt/loss.py
7
+ hidt/model.py
8
+ hidt/utils.py
9
+ hidt.egg-info/PKG-INFO
10
+ hidt.egg-info/SOURCES.txt
11
+ hidt.egg-info/dependency_links.txt
12
+ hidt.egg-info/requires.txt
13
+ hidt.egg-info/top_level.txt
14
+ scripts/HiDT
@@ -0,0 +1,4 @@
1
+ numpy
2
+ pandas
3
+ scikit-learn
4
+ hic-straw==1.3.1
@@ -0,0 +1 @@
1
+ hidt
@@ -0,0 +1,267 @@
1
+ #!/usr/bin/env python
2
+
3
+ from hidt.utils import *
4
+ from hidt.model import EdgeGNN
5
+ from hidt.loss import *
6
+ from hidt.evaluation import *
7
+ import pandas as pd
8
+ import numpy as np
9
+ from hidt.load_hic_format import *
10
+ import os, argparse, sys
11
+
12
def load_TAD(TADfile, res):
    """
    Load TADs assuming the first 3 columns are chrom, x1, x2.
    Check that the file has at least 3 columns.

    :param TADfile: path to a whitespace-delimited TAD boundary file
    :param res: Hi-C resolution in bp; coordinates are converted to bin indices
    :return: ({chrom: DataFrame[x1, x2]}, list of chromosome names)
    """
    try:
        # sep=r'\s+' replaces delim_whitespace=True, which is deprecated
        # since pandas 2.1 (identical whitespace-splitting behavior).
        df = pd.read_csv(TADfile, sep=r'\s+')
        if df.shape[1] < 3:
            raise ValueError(f"TAD file '{TADfile}' has fewer than 3 columns.")
        df = df.iloc[:, :3]
        df.columns = ['chrom', 'x1', 'x2']
        # strip the 'chr' prefix and convert bp coordinates to bin indices
        df['chrom'] = df['chrom'].astype(str).str.replace('chr', '', regex=False)
        df['x1'] = df['x1'] // res
        df['x2'] = df['x2'] // res
        TADInfo = {}
        chrom_list = []
        for chrom, group in df.groupby('chrom'):
            TADInfo[chrom] = group[['x1', 'x2']].reset_index(drop=True)
            chrom_list.append(chrom)
        return TADInfo, chrom_list
    except Exception as e:
        # best-effort CLI behavior: report and abort the whole run
        print(f"Error loading TAD file: {e}")
        sys.exit(1)
35
+
36
def scale_total_contacts(total_contacts, resolution):
    """Rescale a contact total to be comparable to the 25 kb base resolution.

    Coarser resolutions multiply the total by the integer resolution ratio;
    finer resolutions divide by it. At 25 kb the total is returned as-is.
    """
    base_resolution = 25000
    if resolution > base_resolution:
        return int(total_contacts * (resolution // base_resolution))
    if resolution < base_resolution:
        return int(total_contacts // (base_resolution // resolution))
    return total_contacts
46
+
47
def load_intra_counts_hic(hicfile):
    """Bucket a Hi-C file's sequencing depth from intra-chromosomal counts.

    Sums all intra-chromosomal raw ('NONE'-normalized) contacts, rescales
    the total to the 25 kb base resolution, and maps it to one of 8 depth
    buckets (0 = shallowest, 7 = deepest).

    NOTE(review): reads the module-global `resolution` assigned in the
    __main__ section instead of taking it as a parameter — fragile if this
    function is called before `resolution` is set.

    :param hicfile: path to the input .hic file
    :return: int depth bucket in [0, 7]
    """
    hic = hicstraw.HiCFile(hicfile)
    chromosomes = hic.getChromosomes()
    chrom_names = [chrom.name for chrom in chromosomes]
    chrom_names = [chrom for chrom in chrom_names if chrom not in ['All', 'Y', 'MT', 'ALL', 'chrY', 'chrM', 'M']]
    # total intra-chromosomal counts
    total_contacts = 0
    for chrom in chrom_names:
        result = hicstraw.straw('observed', 'NONE', hicfile, chrom, chrom, 'BP', resolution)
        for i in range(len(result)):
            total_contacts += result[i].counts
    # scale contacts based on resolution
    total_contacts = scale_total_contacts(total_contacts, resolution)
    print(f"Hi-C file '{hicfile}' - Total intra contacts: {total_contacts}")
    # thresholds (in contacts) separating the 8 depth buckets
    if total_contacts < 50000000:
        depth = 0
    elif 50000000 <= total_contacts < 100000000:
        depth = 1
    elif 100000000 <= total_contacts < 200000000:
        depth = 2
    elif 200000000 <= total_contacts < 250000000:
        depth = 3
    elif 250000000 <= total_contacts < 450000000:
        depth = 4
    elif 450000000 <= total_contacts < 500000000:
        depth = 5
    elif 500000000 <= total_contacts < 600000000:
        depth = 6
    else:
        depth = 7
    return depth
78
+
79
+
80
def load_total_counts_hic(hicfile):
    """Bucket a Hi-C file's sequencing depth from ALL (intra + inter) counts.

    Sums raw ('NONE'-normalized) contacts over every unordered chromosome
    pair, rescales the total to the 25 kb base resolution, and maps it to
    one of 8 depth buckets (0 = shallowest, 7 = deepest). Note the bucket
    thresholds differ from load_intra_counts_hic.

    NOTE(review): reads the module-global `resolution` assigned in the
    __main__ section instead of taking it as a parameter — fragile.

    :param hicfile: path to the input .hic file
    :return: int depth bucket in [0, 7]
    """
    hic = hicstraw.HiCFile(hicfile)
    chromosomes = hic.getChromosomes()
    chrom_names = [chrom.name for chrom in chromosomes]
    chrom_names = [chrom for chrom in chrom_names if chrom not in ['All', 'Y', 'MT', 'ALL', 'chrY', 'chrM', 'M']]
    # total counts
    total_contacts = 0
    for chrom1 in chrom_names:
        for chrom2 in chrom_names:
            # lexicographic skip so each unordered pair is counted once
            if chrom1 < chrom2:
                continue
            result = hicstraw.straw('observed', 'NONE', hicfile, chrom1, chrom2, 'BP', resolution)
            for i in range(len(result)):
                total_contacts += result[i].counts

    total_contacts = scale_total_contacts(total_contacts, resolution)
    print(f"Hi-C file '{hicfile}' - Total contacts: {total_contacts}")
    # thresholds (in contacts) separating the 8 depth buckets
    if total_contacts < 50000000:
        depth = 0
    elif 50000000 <= total_contacts < 100000000:
        depth = 1
    elif 100000000 <= total_contacts < 200000000:
        depth = 2
    elif 200000000 <= total_contacts < 300000000:
        depth = 3
    elif 300000000 <= total_contacts < 400000000:
        depth = 4
    elif 400000000 <= total_contacts < 650000000:
        depth = 5
    elif 650000000 <= total_contacts < 900000000:
        depth = 6
    else:
        depth = 7
    return depth
114
+
115
+
116
def check_file(path, desc, suffix=None):
    """Validate that `path` is an existing file, optionally with a suffix.

    :param path: filesystem path to check
    :param desc: human-readable description used in error messages
    :param suffix: when given, the required file extension (e.g. '.hic')
    :raises FileNotFoundError: when the file does not exist
    :raises ValueError: when the suffix requirement is not met
    """
    if not os.path.isfile(path):
        raise FileNotFoundError(f"{desc} not found: {path}")
    if suffix and not path.endswith(suffix):
        raise ValueError(f"{desc} must be a {suffix} file: {path}")
121
+
122
def check_resolution_in_hic(hic_path, resolution):
    """Ensure the requested resolution is stored in the .hic file.

    :param hic_path: path to the .hic file
    :param resolution: resolution (bp) requested on the command line
    :raises ValueError: when the file does not contain that resolution
    """
    available_res = hicstraw.HiCFile(hic_path).getResolutions()
    if resolution not in available_res:
        raise ValueError(f"Resolution {resolution} not found in {hic_path}. "
                         f"Available: {available_res}")
128
+
129
def getargs():
    """Build the command-line parser and parse sys.argv.

    Falls back to '-h' (help) when the script is invoked without arguments.

    :return: (parsed args namespace, the raw argument list that was parsed)
    """
    parser = argparse.ArgumentParser(description='Identify differential TADs from Hi-C contact maps.',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--hicfile1', help='Hi-C file (in .hic format) for condition 1.')
    parser.add_argument('--hicfile2', help='Hi-C file (in .hic format) for condition 2.')
    parser.add_argument('--TADfile', help='TAD boundary file used for differential analysis.')
    parser.add_argument('--res', type=int, help='Resolution of the Hi-C contact maps (e.g., 25000 for 25 kb).')
    parser.add_argument('--depth', type=str, default='intra', help='Method to compute sequencing depth: "intra" for intra-chromosomal counts or "total" for all contacts.')
    parser.add_argument('--output', help='Path to the output result file.')

    commands = sys.argv[1:] or ['-h']
    return parser.parse_args(commands), commands
147
+
148
+
149
if __name__ == '__main__':
    # --- parse and validate command-line inputs -------------------------
    args, commands = getargs()
    check_file(args.TADfile, "TAD file")
    check_file(args.hicfile1, "Hi-C file 1")
    check_file(args.hicfile2, "Hi-C file 2")
    check_resolution_in_hic(args.hicfile1, args.res)
    check_resolution_in_hic(args.hicfile2, args.res)

    # `resolution` is read as a module-global by load_*_counts_hic below.
    resolution = args.res
    result_file = args.output
    TADInfo, chroms = load_TAD(args.TADfile, resolution)
    hicfile1 = args.hicfile1
    hicfile2 = args.hicfile2

    # --- estimate sequencing-depth buckets for both conditions ----------
    # NOTE(review): if args.depth is neither 'intra' nor 'total',
    # hic1_depth/hic2_depth stay undefined and a NameError follows later.
    if args.depth == 'intra':
        hic1_depth = load_intra_counts_hic(hicfile1)
        hic2_depth = load_intra_counts_hic(hicfile2)
    elif args.depth == 'total':
        hic1_depth = load_total_counts_hic(hicfile1)
        hic2_depth = load_total_counts_hic(hicfile2)
    chrom_bins = load_binsNum(hicfile1, resolution)
    exclude_chroms = {'All', 'Y', 'MT', 'ALL', 'chrY', 'chrM', 'M'}
    # --- build per-TAD normalized matrix pairs per chromosome -----------
    adjs = []
    labels = []
    depths = []
    chrom_list = []
    start_list = []
    end_list = []
    for i in range(len(chroms)):
        chrom = chroms[i]
        if any(exclude == chrom for exclude in exclude_chroms):
            continue
        print(f"Processing chromosome: {chrom}")
        normed_mat_1 = dumpMatrix(chrom, chrom_bins[chrom], resolution, hicfile1)
        normed_mat_2 = dumpMatrix(chrom, chrom_bins[chrom], resolution, hicfile2)
        positions = TADInfo[chrom]
        for start, end in zip(positions['x1'], positions['x2']):
            # `end` is exclusive here (mat[start:end]); TADs with a
            # near-empty column in either condition are dropped.
            normed_M1 = normed_mat_1[start:end, start:end].toarray()
            normed_M2 = normed_mat_2[start:end, start:end].toarray()
            if filter_matrix(normed_M1) and filter_matrix(normed_M2):
                # save TAD information (converted back to bp coordinates)
                chrom_list.append(chrom)
                start_list.append(start*resolution)
                end_list.append(end*resolution)
                # save graphs; -1 labels are placeholders (no ground truth)
                adjs.append((normed_M1, normed_M2))
                depths.append((hic1_depth, hic2_depth))
                labels.append(-1)
    valid_graphs = adjs
    valid_labels = labels
    valid_depths = depths

    # --- load the trained model and score every TAD pair ----------------
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda:0' if use_cuda else 'cpu')
    edge_feature_dim = 8
    model = EdgeGNN(node_hidden_dims=[8, 64, 16, 64],
                    edge_feature_dim=edge_feature_dim,
                    dropout=0.5,
                    nheads=[1],
                    alpha=0.2)
    model.to(device)
    # NOTE(review): hard-coded absolute checkpoint path — this should load
    # the pretrained model shipped with the package instead.
    model.load_state_dict(torch.load('/home/li/detectTAD/DiffGNN/new_trained_model_25K.pth'))
    model.eval()
    batch_size = 40
    simis = []       # per-TAD distances (negated model similarities)
    all_label = []
    with torch.no_grad():
        accumulated_pair_auc = []
        for k_iter in range(0, len(valid_graphs), batch_size):
            batch_graphs, batch_features, batch_labels, graphs_idx, depth_idx = generate_valid_batches(k_iter,
                                                                                                       valid_graphs,
                                                                                                       valid_labels,
                                                                                                       valid_depths,
                                                                                                       batch_size)
            batch_labels = np.array(batch_labels)
            cur_edge_features, cur_node_features, cur_graphs_idx, cur_depth_idx, cur_batch_labels = get_graphs(batch_graphs,
                                                                                                               batch_features,
                                                                                                               graphs_idx,
                                                                                                               depth_idx,
                                                                                                               batch_labels,
                                                                                                               edge_feature_dim)

            graph_states, edges = model(cur_node_features.to(device),
                                        cur_edge_features.to(device),
                                        cur_graphs_idx.to(device),
                                        cur_depth_idx.to(device))
            # split interleaved embeddings into condition-1 / condition-2
            x, y = reshape_and_split_tensor(graph_states, 2)
            similarity = compute_similarity(x, y)
            for elem in similarity:
                # similarity is -distance, so negate back to a distance
                simis.append(-elem.item())
            for cur_label in cur_batch_labels:
                all_label.append(cur_label.item())

    # --- summarize: a TAD with distance > 1 is called differential ------
    count = 0
    pos = []          # NOTE(review): pos/neg are never used
    neg = []
    pos_count = 0
    for i in range(len(simis)):
        pos_count += 1
        if simis[i] > 1:
            count += 1
    print(
        f"Total TADs: {pos_count}, "
        f"Differential TADs: {count}, "
        f"Percentage: {count / pos_count * 100:.2f}%"
    )
    # Save result: chrom, start(bp), end(bp), distance score per line
    with open(result_file, 'w') as out:
        for i in range(len(simis)):
            out.write(chrom_list[i] + '\t' + str(start_list[i]) + '\t' + str(end_list[i]) + '\t' + str(simis[i]) + '\n')
    print(f"Results have been saved to '{result_file}'.")
hidt-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
hidt-0.1.0/setup.py ADDED
@@ -0,0 +1,45 @@
1

"""
Setup script for HiDT.

"""
import os, sys, glob
import setuptools

def read(fname):
    """Read a file located next to this setup script."""
    return open(os.path.join(os.path.dirname(__file__), fname)).read()

# Require Python >= 3.7.
if (sys.version_info.major < 3) or (sys.version_info.major == 3 and sys.version_info.minor < 7):
    print(
        f"Python >=3.7 is required. You are currently using Python {sys.version.split()[0]}"
    )
    sys.exit(2)

# Guarantee Unix Format
for src in glob.glob('scripts/*'):
    text = open(src, 'r').read().replace('\r\n', '\n')
    open(src, 'w').write(text)

setuptools.setup(
    name = 'hidt',
    version = "0.1.0",
    author = "Li Junping",
    author_email = 'lijunping02@qq.com',
    url = 'https://github.com/GaoLabXDU/HiDT',
    description = 'A computational pipeline for identifying differential TADs from 3D genome contact maps',
    keywords = ("3D genome", "Comparative analysis", "Topologically associating domains"),
    scripts = glob.glob('scripts/*'),
    packages = setuptools.find_packages(),
    include_package_data = True,
    # Fixed template leftover: the key must be the real package name and the
    # paths are relative to that package directory (was "your_package.model"
    # with a "hidt/..." path, which matched nothing).
    package_data={
        "hidt": ["pretrained_model.pth"],
    },
    platforms = "any",
    license="MIT Licence",
    install_requires = [
        "numpy",
        "pandas",
        "scikit-learn",
        "hic-straw==1.3.1"
    ]
)