PyPI - graph-structure - Versions diffs - 0.0.1__py3-none-any.whl - Mend

graph-structure 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

graph_structure/__init__.py +1 -0
graph_structure/graph_module.py +215 -0
graph_structure/graph_structure.py +251 -0
graph_structure/version.py +1 -0
graph_structure-0.0.1.data/data/LICENSE +661 -0
graph_structure-0.0.1.dist-info/METADATA +115 -0
graph_structure-0.0.1.dist-info/RECORD +11 -0
graph_structure-0.0.1.dist-info/WHEEL +5 -0
graph_structure-0.0.1.dist-info/entry_points.txt +2 -0
graph_structure-0.0.1.dist-info/licenses/LICENSE +661 -0
graph_structure-0.0.1.dist-info/top_level.txt +1 -0

graph_structure/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+

graph_structure/graph_module.py ADDED Viewed

@@ -0,0 +1,215 @@
+import networkx as nx
+import pandas as pd
+import numpy as np
+# CONSTANTS
+# Labels identifying the two supported kinds of graph.
+UW = "Unweighted"
+W = "Weighted"
+# Graph-type labels accepted by GraphObject; anything else raises ValueError.
+VALID_TYPES = {W, UW}
+class GraphObject:
+    """
+    A class used to represent graphs and their related metrics.
+    Attributes:
+        edges: Pandas dataframe of edges.
+        nodes: Pandas dataframe of node attributes.
+        graph_type: Type of graph.
+        graph: NetworkX graph.
+        weight: weight distribution of the graph edges.
+        node_number: Number of nodes in the graph.
+        egde_number: NUmber of edges in the graph.
+        density: density of the graph.
+        connec_comp: Number of connected components.
+        transitivity: Transitivity of the graph.
+        degree: node degree distribution.
+        betweenness: node betweenness centrality distribution.
+        closeness: node closeness centrality distribution.
+        node_distribution: Pandas dataframe of all nodes distribution.
+        assortativity: assortativity of a given attribute.
+    """
+    def __init__(
+        self,
+        edges: pd.DataFrame,
+        nodes: pd.DataFrame,
+        graph_type: str
+    ) -> None:
+        """Initialize Graph Object that will store all the metrics.
+        Args:
+            edges: pandas dataframe of edges in a network.
+            nodes: pandas dataframe of nodes attributes of a network.
+            graph_type: string identifying the type of graph: weighted or unweighted
+        Raises:
+            ValueError: If the graph type is not valid.
+        """
+        if graph_type not in VALID_TYPES:
+            raise ValueError(f"Unknown graph type: '{self.graph_type}'. "
+                             f"Use '{W}' or '{UW}'.")
+        self.graph_type = graph_type
+        self.edges = edges
+        self.nodes = nodes.set_index("NodeID").to_dict("index")
+        if self.graph_type == W:
+            graph = nx.Graph()
+            self.weight = pd.to_numeric(self.edges["Weight"], errors='coerce')
+            for _, row in self.edges.iterrows():
+                graph.add_edge(row["Source"], row["Target"], weight=float(row["Weight"]))
+        elif self.graph_type == UW:
+            pairs = [tuple(x) for x in self.edges[["Source", "Target"]].to_numpy(dtype = str)]
+            graph = nx.Graph(pairs)
+        else:
+            raise ValueError(f"Unknown graph type: '{self.graph_type}'. "
+                             f"Use '{W}' or '{UW}'.")
+        self.graph = graph
+        nx.set_node_attributes(self.graph,self.nodes)
+        self.node_number = self.graph.number_of_nodes()
+        self.edge_number = self.graph.number_of_edges()
+        self.density = nx.density(self.graph)
+        self.connec_comp = nx.number_connected_components(self.graph)
+        self.transitivity = nx.transitivity(self.graph)
+        self.degree = {node:deg for (node, deg) in self.graph.degree()}
+        self.betweenness = nx.betweenness_centrality(self.graph, normalized = False)
+        self.closeness = nx.closeness_centrality(self.graph)
+        self.node_distributions =  pd.DataFrame.from_records([self.degree, self.betweenness,self.closeness], index = ["Degree", "Betweeness", "Closeness"]).transpose()
+    def assortativity(self, attribute: str) -> float:
+        """Identify assortativity of the value
+        Args:
+            attribute: node attribute to calculate the assortativity.
+        Returns:
+            float
+        Raises:
+            raise ValueError"""
+        if attribute not in pd.DataFrame.from_dict(self.nodes, orient='index').columns:
+            raise ValueError(f"Unknown attribute: '{attribute}'")
+        self.assortativity = nx.attribute_assortativity_coefficient(self.graph, attribute)
+        return self.assortativity
+    def distributions_statistic(self) -> list[pd.DataFrame]:
+        """Return basic statistics of distributions"""
+        if self.graph_type == W:
+            return self.node_distributions, self.node_distributions.describe().apply(lambda s: s.apply('{0:.4f}'.format)), self.weight.describe().apply(lambda x: format(x, '.4f'))
+        elif self.graph_type == UW:
+            return self.node_distributions, self.node_distributions.describe()
+    def nodes_dict(self) -> dict[dict]:
+        """Return the node dictionary"""
+        return self.nodes
+    def graph_type(self) -> str:
+        """Return graph type"""
+        return self.graph_type
+    def stats(self) -> list[numeric]:
+        """Return stats"""
+        return self.graph_type, self.node_number, self.edge_number, self.density,\
+               self.connec_comp, self.transitivity
+    def base_graph(self) -> dict[dict]:
+        """Return the node dictionary"""
+        return self.graph
+class SubGraphObject():
+    """
+    A class used to represent subgraphs and their related metrics.
+    Attributes:
+        attribute: Attribute used to defined subgraphs.
+        nodes: pandas DataFrame of node attributes.
+        unique_values: values of the attribute of interest.
+        sub_graphs: subgraphs divided based on unique_values attribute.
+        subgraph_metrics: dictionary with main metrics of each subgraph.
+    """
+    def __init__(
+        self,
+        main_graph: GraphObject,
+        attribute: str
+    ) -> None:
+        """
+        Initialize Sub Graph Object that will store all the metrics.
+        """
+        self.attribute = attribute
+        self.graph_type = main_graph.graph_type
+        self.nodes = pd.DataFrame.from_dict(main_graph.nodes_dict(), orient='index')
+        if self.attribute not in self.nodes.columns:
+            raise ValueError("Atribute '{self.attribute}' is not available in the given graph node directory")
+        self.unique_values = self.nodes[self.attribute].unique()
+        if len(self.unique_values) == 1:
+            raise Exception("Attribute selected only have one value. No subgraph can be extracted")
+        sub_graphs = dict()
+        for value in self.unique_values:
+            sub_graphs.update({value : main_graph.base_graph().subgraph(self.nodes[(self.nodes[attribute]== value)].index.to_list())})
+        self.sub_graphs = sub_graphs
+    def calculate_metrics(self) -> None:
+        """Calculate metrics for the stuff"""
+        self.subgraph_metrics = dict()
+        self.subgraph_distributions = dict()
+        for key in self.sub_graphs:
+            sub_graph = self.sub_graphs.get(key)
+            metrics = dict()
+            metrics.update({'node_number' : sub_graph.number_of_nodes()})
+            metrics.update({'edge_number' : sub_graph.number_of_edges()})
+            metrics.update({'density' : nx.density(sub_graph)})
+            metrics.update({'connect_components' : nx.number_connected_components(sub_graph)})
+            metrics.update({'transitivity' : nx.transitivity(sub_graph)})
+            self.subgraph_metrics.update({key : metrics})
+            distributions = dict()
+            degree = {node:deg for (node, deg) in sub_graph.degree()}
+            betweenness = nx.betweenness_centrality(sub_graph, normalized = False)
+            closeness = nx.closeness_centrality(sub_graph)
+            node_distributions =  pd.DataFrame.from_records([degree, betweenness,closeness], index = ["Degree", "Betweeness", "Closeness"]).transpose()
+            distributions.update({'nodes' : node_distributions})
+            if self.graph_type == W:
+                distributions.update({'Weight' : nx.to_pandas_edgelist(sub_graph).rename(columns = { 'source' : 'Source', 'target' : 'Target', 'weight' : 'Weight'})})
+            self.subgraph_distributions.update({key : distributions})
+    def subgraphs(self) -> dict[Graph]:
+        """Return dictionary of subgraphs"""
+        return self.sub_graphs
+    def metrics(self) -> dict[dict]:
+        """Return dictionary of metrics"""
+        return self.subgraph_metrics
+    def distributions(self) -> list[objects]:
+        """Return dictionary of distributions"""
+        return self.subgraph_distributions

graph_structure/graph_structure.py ADDED Viewed

@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+"""
+Graph Structural Analyzing tool
+This tool performs a structural analysis of an undirected graph
+and its attribute-defined subgraphs, returning multiple metrics
+related to composition, connectivity, assortativity, and centrality.
+"""
+import sys
+import os
+import argparse
+import numpy as np
+import pandas as pd
+import networkx as nx
+from graph_structure.graph_module import GraphObject
+from graph_structure.graph_module import SubGraphObject
+from graph_structure.version import __version__
+# CONSTANTS
+# Graph-type labels; they duplicate the values defined in graph_structure.graph_module.
+UW = "Unweighted"
+W = "Weighted"
+#exec(open("version.py").read())
+def load_graph(edges_file: str,
+               node_file: str,
+               attribute: str
+    ) -> list[pd.DataFrame, pd.DataFrame, str]:
+    """Check and store the input files of the graph
+    Args:
+        gff_path: Path to the input GFF file.
+        fasta_path: Path to the input fasta file.
+        attribute: String representing the attribute for subgraphs.
+    Returns:
+        list [edges_df, nodes_df, graph_type] where:
+            - edges_df: pandas dataframe of edges in a network.
+            - nodes_df: pandas dataframe of nodes attributes of a network.
+            - graph_type: String representing type of graph to analyze.
+    Raises:
+        FileNotFoundError: If the file does not exist.
+        PermissionError: If the file cannot be read.
+        pd.errors.ParserError: If either input file cannot be parsed.
+        OSError: If any other I/O error occurs.
+    """
+    try:
+        edges_df = pd.read_csv(
+            edges_file,
+            sep='\t',
+            dtype={'Source': str, 'Target': str,'Weight': float}
+        )
+        if not set(["Source","Target"]).issubset(edges_df.columns.values):
+            print(f"'Source' or 'Target' columns were not found in the edge file")
+            sys.exit(1)
+        else:
+            if len(edges_df.columns) == 3:
+                if "Weight" not in edges_df.columns.values:
+                    print(f"'Weight' column has a bad name in the edge file")
+                    sys.exit(1)
+                else:
+                    graph_type = W
+            elif len(edges_df.columns) > 3:
+                sys.exit(f"edge file have more than three colums")
+            else:
+                graph_type = UW
+    except FileNotFoundError:
+        sys.exit(f"Error: Data file '{edges_df}' not found.")
+    except PermissionError:
+        sys.exit(f"Error: No read permission for '{edges_df}'.")
+    except pd.errors.ParserError as e:
+        sys.exit(f"Error parsing data file '{edges_df}': {e}")
+    try:
+        nodes_df = pd.read_csv(
+            node_file,
+            sep='\t',
+            dtype={'NodeID': str}
+        )
+        if "NodeID" not in nodes_df.columns.values:
+            sys.exit(f"'NodeID' column is missing in node attributes file")
+        else:
+            if len(nodes_df.columns) == 1:
+                sys.exit(f"nodes attribute file only have the NodeID column")
+    except FileNotFoundError:
+        sys.exit(f"Error: Data file '{node_file}' not found.")
+    except PermissionError:
+        sys.exit(f"Error: No read permission for '{node_file}'.")
+    except pd.errors.ParserError as e:
+        sys.exit(f"Error parsing data file '{node_file}': {e}")
+    if attribute not in nodes_df.columns:
+        sys.exit(f"Atribute '{attribute}' is not available in the given node atribute table")
+    nodes_id_nodes = np.sort(nodes_df["NodeID"].to_numpy())
+    nodes_id_edges = np.unique(np.array([edges_df["Source"].to_numpy(),edges_df["Target"].to_numpy()]))
+    if not np.array_equal(nodes_id_nodes, nodes_id_edges):
+        difference = np.setxor1d(nodes_id_nodes,nodes_id_edges)
+        print(f"The following nodes differ between files:")
+        print(f"{difference}")
+        sys.exit(1)
+    print(f"Input is a {graph_type} graph")
+    return edges_df, nodes_df, graph_type
+def process_graph(edges_file: str,
+                  node_file: str,
+                  attribute: str,
+                  output_dir: str
+    ) -> None:
+    """Process the data and write the output.
+    Args:
+        gff_path: Path to the input GFF file.
+        fasta_path: Path to the input fasta file.
+        attribute: String representing the attribute for subgraphs.
+        output_dir: Path to the output directory.
+    Returns:
+        None
+    Raises:
+        PermissionError: If the file cannot be written.
+        OSError: If any other I/O error occurs.
+    """
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    edges, nodes, graph_type = load_graph(edges_file, node_file, attribute)
+    input_graph = GraphObject(edges, nodes, graph_type)
+    assortativity = input_graph.assortativity(attribute)
+    input_graph.distributions_statistic()
+    main_stats = os.path.join(output_dir, "Main_graph_stats.txt")
+    graph_stats = input_graph.stats()
+    print(f"Writing results in {output_dir} directory")
+    try:
+        with open(main_stats, 'w') as f:
+            f.write(f"Statistics of graph '{edges_file}':\n")
+            f.write(f"  Graph type: {graph_stats[0]}\n")
+            f.write(f"  Number of nodes: {graph_stats[1]}\n")
+            f.write(f"  Number of edges: {graph_stats[2]}\n")
+            f.write(f"  Density: {graph_stats[3]}\n")
+            f.write(f"  Number of connected components: {graph_stats[4]}\n")
+            f.write(f"  Transitivity: {graph_stats[5]}\n")
+            f.write(f"  Assortativity of '{attribute}': {assortativity}\n")
+    except PermissionError:
+        sys.exit(f"Error: No write permission for '{main_stats}'.")
+    except OSError as e:
+        sys.exit(f"Error writing file '{main_stats}': {e}")
+    distributions = input_graph.distributions_statistic()
+    if graph_type == W:
+        distributions[0].to_csv(os.path.join(output_dir, "Node_characteristics.txt"),
+                                sep='\t', index_label = "NodeID")
+        distributions[1].to_csv(os.path.join(output_dir, "Node_stats.txt"),
+                                sep='\t')
+        distributions[2].to_csv(os.path.join(output_dir, "Edge_stats.txt"),
+                                sep='\t')
+    elif graph_type == UW:
+        distributions[0].to_csv(os.path.join(output_dir, "Node_characteristics.txt"),
+                                sep='\t')
+        distributions[1].to_csv(os.path.join(output_dir, "Node_stats.txt"),
+                                sep='\t', index_label = "NodeID")
+    subgraph = SubGraphObject(input_graph, attribute)
+    general_sub = subgraph.subgraphs()
+    subgraph.calculate_metrics()
+    subgraph_metric = subgraph.metrics()
+    subgraph_distribution = subgraph.distributions()
+    subgraph_dir = os.path.join(output_dir, "SubGraphs")
+    if not os.path.exists(subgraph_dir):
+            os.makedirs(subgraph_dir)
+    for key in general_sub:
+        graph = general_sub.get(key)
+        metrics = subgraph_metric.get(key)
+        distributions = subgraph_distribution.get(key)
+        info_dir = os.path.join(subgraph_dir, key)
+        if not os.path.exists(info_dir):
+            os.makedirs(info_dir)
+        edge_file = os.path.join(info_dir, f"{key}_edges_file.txt")
+        nx.to_pandas_edgelist(graph).rename(columns = { 'source' : 'Source', 'target' : 'Target', 'weight' : 'Weight'}).to_csv(edge_file, sep='\t', index = False)
+        node_file = os.path.join(info_dir, f"{key}_nodes_file.txt")
+        pd.DataFrame.from_dict(dict(graph.nodes(data=True)), orient='index').to_csv(node_file, sep='\t', index_label = "NodeID")
+        filename = os.path.join(info_dir, "Main_subgraph_stats.txt")
+        try:
+            with open(filename, 'w') as f:
+                f.write(f"Statistics of subgraph '{key}':\n")
+                f.write(f"  Graph type: {graph_stats[0]}\n")
+                f.write(f"  Number of nodes: {metrics.get('node_number')}\n")
+                f.write(f"  Number of edges: {metrics.get('edge_number')}\n")
+                f.write(f"  Density: {metrics.get('density')}\n")
+                f.write(f"  Number of connected components: {metrics.get('connect_components')}\n")
+                f.write(f"  Transitivity: {metrics.get('transitivity')}\n")
+        except PermissionError:
+            sys.exit(f"Error: No write permission for '{filename}'.")
+        except OSError as e:
+            sys.exit(f"Error writing file '{filename}': {e}")
+        if graph_type == W:
+            distributions.get('nodes').to_csv(os.path.join(info_dir, "Node_characteristics.txt"),
+                                sep='\t', index_label = "NodeID")
+            distributions.get('nodes').describe().to_csv(os.path.join(info_dir, "Node_stats.txt"),
+                                sep='\t')
+            if metrics.get('edge_number') > 0:
+                distributions.get('Weight')['Weight'].describe().to_csv(os.path.join(info_dir, "Edge_stats.txt"),
+                                    sep='\t')
+        elif graph_type == UW:
+            distributions.get('nodes').to_csv(os.path.join(info_dir, "Node_characteristics.txt"),
+                                sep='\t')
+            distributions.get('nodes').describe().to_csv(os.path.join(info_dir, "Node_stats.txt"),
+                                sep='\t')
+def main() -> None:
+    """Parse command-line arguments and launch the graph analyzer pipeline."""
+    parser = argparse.ArgumentParser(prog = f"graph_structure", \
+        description=f"Structural properties analysis of graph and attribute-base subgraphs v{__version__}")
+    parser.add_argument('-e', '--edges-file', type=str, required=True, help="Input TSV file with edges.")
+    parser.add_argument('-n', '--node-file', type=str, required=True, help="Input TSV file with node attributes.")
+    parser.add_argument('-a', '--attribute', type=str, required=True, help="Name of the attribute for subgraphs")
+    parser.add_argument('-o', '--output-dir', type=str, default='./', help="Directory for output files.")
+    args = parser.parse_args()
+    process_graph(args.edges_file, args.node_file, args.attribute, args.output_dir)
+if __name__ == "__main__":
+    main()

graph_structure/version.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.0.1"