graph-structure 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,215 @@
1
+ import networkx as nx
2
+ import pandas as pd
3
+ import numpy as np
4
+
5
+ # CONSTANTS
6
+
7
+ UW = "Unweighted"
8
+ W = "Weighted"
9
+ VALID_TYPES = {W, UW}
10
+
11
+
12
class GraphObject:
    """
    A class used to represent graphs and their related metrics.

    ``graph_type`` is exposed as a plain attribute. The former ``graph_type()``
    accessor method was always shadowed by that attribute on instances and
    could never be called, so it is not defined here.

    Attributes:
        edges: Pandas dataframe of edges.
        nodes: Dictionary of node attributes keyed by node ID.
        graph_type: Type of graph ("Weighted" or "Unweighted").
        graph: NetworkX graph.
        weight: Weight distribution of the graph edges (None when unweighted).
        node_number: Number of nodes in the graph.
        edge_number: Number of edges in the graph.
        density: Density of the graph.
        connec_comp: Number of connected components.
        transitivity: Transitivity of the graph.
        degree: Node degree distribution.
        betweenness: Node betweenness centrality distribution.
        closeness: Node closeness centrality distribution.
        node_distributions: Pandas dataframe of all node distributions.
        assortativity_value: Result of the last ``assortativity()`` call
            (None until that method has been called).
    """

    def __init__(
        self,
        edges: pd.DataFrame,
        nodes: pd.DataFrame,
        graph_type: str
    ) -> None:
        """Initialize Graph Object that will store all the metrics.

        Args:
            edges: pandas dataframe of edges in a network. Must contain the
                "Source" and "Target" columns, plus "Weight" for weighted
                graphs.
            nodes: pandas dataframe of node attributes of a network. Must
                contain a "NodeID" column.
            graph_type: string identifying the type of graph: weighted or
                unweighted.

        Raises:
            ValueError: If the graph type is not valid.
        """
        # Use the argument here: the attribute is not assigned yet.
        if graph_type not in VALID_TYPES:
            raise ValueError(f"Unknown graph type: '{graph_type}'. "
                             f"Use '{W}' or '{UW}'.")

        self.graph_type = graph_type
        self.edges = edges
        self.nodes = nodes.set_index("NodeID").to_dict("index")
        self.assortativity_value = None

        if self.graph_type == W:
            self.weight = pd.to_numeric(self.edges["Weight"], errors='coerce')
            graph = nx.Graph()
            graph.add_weighted_edges_from(
                (source, target, float(weight))
                for source, target, weight in zip(
                    self.edges["Source"],
                    self.edges["Target"],
                    self.edges["Weight"],
                )
            )
        else:
            # graph_type was validated above, so this is the unweighted case.
            self.weight = None
            pairs = [
                tuple(pair)
                for pair in self.edges[["Source", "Target"]].to_numpy(dtype=str)
            ]
            graph = nx.Graph(pairs)

        self.graph = graph
        nx.set_node_attributes(self.graph, self.nodes)
        self.node_number = self.graph.number_of_nodes()
        self.edge_number = self.graph.number_of_edges()
        self.density = nx.density(self.graph)
        self.connec_comp = nx.number_connected_components(self.graph)
        self.transitivity = nx.transitivity(self.graph)

        self.degree = dict(self.graph.degree())
        self.betweenness = nx.betweenness_centrality(self.graph, normalized=False)
        self.closeness = nx.closeness_centrality(self.graph)
        # One row per node, one column per metric. The column labels are
        # written to the output files, so they are kept exactly as they were.
        self.node_distributions = pd.DataFrame.from_records(
            [self.degree, self.betweenness, self.closeness],
            index=["Degree", "Betweeness", "Closeness"],
        ).transpose()

    def assortativity(self, attribute: str) -> float:
        """Compute the assortativity coefficient of a node attribute.

        The result is also stored in ``assortativity_value``. It is not stored
        under the name ``assortativity`` because that would replace this
        method on the instance and make a second call fail.

        Args:
            attribute: node attribute to calculate the assortativity.

        Returns:
            The attribute assortativity coefficient of the graph.

        Raises:
            ValueError: If no node carries the given attribute.
        """
        if not any(attribute in attrs for attrs in self.nodes.values()):
            raise ValueError(f"Unknown attribute: '{attribute}'")

        self.assortativity_value = nx.attribute_assortativity_coefficient(
            self.graph, attribute
        )
        return self.assortativity_value

    def distributions_statistic(self) -> tuple:
        """Return the node distributions and their basic statistics.

        Returns:
            For weighted graphs, a 3-tuple: the node distributions, their
            ``describe()`` summary formatted to four decimals (as strings),
            and the edge weight summary formatted the same way.
            For unweighted graphs, a 2-tuple: the node distributions and
            their unformatted ``describe()`` summary.
        """
        if self.graph_type == W:
            node_summary = self.node_distributions.describe().apply(
                lambda column: column.apply('{0:.4f}'.format)
            )
            weight_summary = self.weight.describe().apply(
                lambda value: format(value, '.4f')
            )
            return self.node_distributions, node_summary, weight_summary
        elif self.graph_type == UW:
            return self.node_distributions, self.node_distributions.describe()

    def nodes_dict(self) -> dict:
        """Return the node attribute dictionary keyed by node ID."""
        return self.nodes

    def stats(self) -> tuple:
        """Return the main graph statistics.

        Returns:
            Tuple of (graph type, node count, edge count, density,
            number of connected components, transitivity).
        """
        return (
            self.graph_type,
            self.node_number,
            self.edge_number,
            self.density,
            self.connec_comp,
            self.transitivity,
        )

    def base_graph(self) -> "nx.Graph":
        """Return the underlying NetworkX graph."""
        return self.graph
131
+
132
+
133
class SubGraphObject:
    """
    A class used to represent subgraphs and their related metrics.

    Attributes:
        attribute: Attribute used to define subgraphs.
        graph_type: Graph type copied from the main graph.
        nodes: pandas DataFrame of node attributes.
        unique_values: values of the attribute of interest.
        sub_graphs: subgraphs keyed by each unique attribute value.
        subgraph_metrics: dictionary with main metrics of each subgraph
            (empty until ``calculate_metrics()`` is called).
        subgraph_distributions: dictionary with the per-node (and, for
            weighted graphs, per-edge) tables of each subgraph (empty until
            ``calculate_metrics()`` is called).
    """

    def __init__(
        self,
        main_graph: "GraphObject",
        attribute: str
    ) -> None:
        """Split the main graph into one subgraph per attribute value.

        Args:
            main_graph: Graph object to split.
            attribute: Node attribute whose values define the subgraphs.

        Raises:
            ValueError: If the attribute is not a node attribute, or if it
                has a single value so no subgraph can be extracted.
        """
        self.attribute = attribute
        self.graph_type = main_graph.graph_type
        self.nodes = pd.DataFrame.from_dict(main_graph.nodes_dict(), orient='index')
        if self.attribute not in self.nodes.columns:
            raise ValueError(f"Atribute '{self.attribute}' is not available in the given graph node directory")

        self.unique_values = self.nodes[self.attribute].unique()
        if len(self.unique_values) == 1:
            raise ValueError("Attribute selected only have one value. No subgraph can be extracted")

        base = main_graph.base_graph()
        self.sub_graphs = {
            value: base.subgraph(
                self.nodes.index[self.nodes[attribute] == value].to_list()
            )
            for value in self.unique_values
        }

        # Initialised here so the accessors work before calculate_metrics().
        self.subgraph_metrics = dict()
        self.subgraph_distributions = dict()

    def calculate_metrics(self) -> None:
        """Compute the metrics and distributions of every subgraph.

        Fills ``subgraph_metrics`` and ``subgraph_distributions``, both keyed
        by attribute value.
        """
        self.subgraph_metrics = dict()
        self.subgraph_distributions = dict()

        for key, sub_graph in self.sub_graphs.items():
            self.subgraph_metrics[key] = {
                'node_number': sub_graph.number_of_nodes(),
                'edge_number': sub_graph.number_of_edges(),
                'density': nx.density(sub_graph),
                'connect_components': nx.number_connected_components(sub_graph),
                'transitivity': nx.transitivity(sub_graph),
            }

            degree = dict(sub_graph.degree())
            betweenness = nx.betweenness_centrality(sub_graph, normalized=False)
            closeness = nx.closeness_centrality(sub_graph)
            # One row per node, one column per metric.
            node_distributions = pd.DataFrame.from_records(
                [degree, betweenness, closeness],
                index=["Degree", "Betweeness", "Closeness"],
            ).transpose()

            distributions = {'nodes': node_distributions}
            if self.graph_type == W:
                distributions['Weight'] = nx.to_pandas_edgelist(sub_graph).rename(
                    columns={'source': 'Source', 'target': 'Target', 'weight': 'Weight'}
                )

            self.subgraph_distributions[key] = distributions

    def subgraphs(self) -> dict:
        """Return dictionary of subgraphs"""
        return self.sub_graphs

    def metrics(self) -> dict:
        """Return dictionary of metrics"""
        return self.subgraph_metrics

    def distributions(self) -> dict:
        """Return dictionary of distributions"""
        return self.subgraph_distributions
215
+
@@ -0,0 +1,251 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Graph Structural Analyzing tool
4
+
5
+ This tool performs a structural analysis of an undirected graph
6
+ and its attribute-defined subgraphs, returning multiple metrics
7
+ related to composition, connectivity, assortativity, and centrality.
8
+ """
9
+
10
+ import sys
11
+ import os
12
+ import argparse
13
+ import numpy as np
14
+ import pandas as pd
15
+ import networkx as nx
16
+ from graph_structure.graph_module import GraphObject
17
+ from graph_structure.graph_module import SubGraphObject
18
+ from graph_structure.version import __version__
19
+
20
+ # CONSTANTS
21
+
22
+ UW = "Unweighted"
23
+ W = "Weighted"
24
+ #exec(open("version.py").read())
25
+
26
+
27
def _read_tsv(path: str, dtype: dict) -> pd.DataFrame:
    """Read a tab-separated file, exiting with a message if it cannot be read or parsed."""
    try:
        return pd.read_csv(path, sep='\t', dtype=dtype)
    except FileNotFoundError:
        sys.exit(f"Error: Data file '{path}' not found.")
    except PermissionError:
        sys.exit(f"Error: No read permission for '{path}'.")
    except pd.errors.ParserError as e:
        sys.exit(f"Error parsing data file '{path}': {e}")
    except OSError as e:
        sys.exit(f"Error reading data file '{path}': {e}")


def load_graph(edges_file: str,
               node_file: str,
               attribute: str
               ) -> tuple[pd.DataFrame, pd.DataFrame, str]:
    """Check and store the input files of the graph

    The edge file must have "Source" and "Target" columns. An optional third
    column must be named "Weight" and makes the graph weighted. The node file
    must have a "NodeID" column plus at least one attribute column, and the
    set of node IDs must match the nodes used in the edge file.

    Args:
        edges_file: Path to the input TSV file with edges.
        node_file: Path to the input TSV file with node attributes.
        attribute: String representing the attribute for subgraphs.

    Returns:
        Tuple (edges_df, nodes_df, graph_type) where:
            - edges_df: pandas dataframe of edges in a network.
            - nodes_df: pandas dataframe of nodes attributes of a network.
            - graph_type: String representing type of graph to analyze.

    Raises:
        SystemExit: If a file cannot be read or parsed, or if any of the
            validations above fails.
    """
    edges_df = _read_tsv(
        edges_file,
        {'Source': str, 'Target': str, 'Weight': float},
    )

    if not {"Source", "Target"}.issubset(edges_df.columns):
        sys.exit("'Source' or 'Target' columns were not found in the edge file")

    edge_columns = len(edges_df.columns)
    if edge_columns > 3:
        sys.exit("edge file have more than three colums")
    if edge_columns == 3:
        if "Weight" not in edges_df.columns:
            sys.exit("'Weight' column has a bad name in the edge file")
        graph_type = W
    else:
        graph_type = UW

    nodes_df = _read_tsv(node_file, {'NodeID': str})

    if "NodeID" not in nodes_df.columns:
        sys.exit("'NodeID' column is missing in node attributes file")
    if len(nodes_df.columns) == 1:
        sys.exit("nodes attribute file only have the NodeID column")

    if attribute not in nodes_df.columns:
        sys.exit(f"Atribute '{attribute}' is not available in the given node atribute table")

    # Both files must describe exactly the same set of nodes.
    nodes_id_nodes = np.sort(nodes_df["NodeID"].to_numpy())
    nodes_id_edges = np.unique(
        np.concatenate([edges_df["Source"].to_numpy(), edges_df["Target"].to_numpy()])
    )

    if not np.array_equal(nodes_id_nodes, nodes_id_edges):
        difference = np.setxor1d(nodes_id_nodes, nodes_id_edges)
        sys.exit(f"The following nodes differ between files:\n{difference}")

    print(f"Input is a {graph_type} graph")

    return edges_df, nodes_df, graph_type
113
+
114
+
115
def process_graph(edges_file: str,
                  node_file: str,
                  attribute: str,
                  output_dir: str
                  ) -> None:
    """Process the data and write the output.

    Writes the main graph statistics and node/edge tables into ``output_dir``,
    then one directory per attribute value under ``output_dir/SubGraphs`` with
    the edge list, node table and statistics of that subgraph.

    Args:
        edges_file: Path to the input TSV file with edges.
        node_file: Path to the input TSV file with node attributes.
        attribute: String representing the attribute for subgraphs.
        output_dir: Path to the output directory.

    Returns:
        None

    Raises:
        SystemExit: If a statistics file cannot be written.
    """
    os.makedirs(output_dir, exist_ok=True)

    edges, nodes, graph_type = load_graph(edges_file, node_file, attribute)
    input_graph = GraphObject(edges, nodes, graph_type)
    assortativity = input_graph.assortativity(attribute)

    main_stats = os.path.join(output_dir, "Main_graph_stats.txt")
    graph_stats = input_graph.stats()

    print(f"Writing results in {output_dir} directory")

    try:
        with open(main_stats, 'w') as f:
            f.write(f"Statistics of graph '{edges_file}':\n")
            f.write(f"  Graph type: {graph_stats[0]}\n")
            f.write(f"  Number of nodes: {graph_stats[1]}\n")
            f.write(f"  Number of edges: {graph_stats[2]}\n")
            f.write(f"  Density: {graph_stats[3]}\n")
            f.write(f"  Number of connected components: {graph_stats[4]}\n")
            f.write(f"  Transitivity: {graph_stats[5]}\n")
            f.write(f"  Assortativity of '{attribute}': {assortativity}\n")
    except PermissionError:
        sys.exit(f"Error: No write permission for '{main_stats}'.")
    except OSError as e:
        sys.exit(f"Error writing file '{main_stats}': {e}")

    main_distributions = input_graph.distributions_statistic()

    # The per-node table is indexed by node ID for both graph types; the
    # summary tables are indexed by statistic name and carry no index label.
    main_distributions[0].to_csv(os.path.join(output_dir, "Node_characteristics.txt"),
                                 sep='\t', index_label="NodeID")
    main_distributions[1].to_csv(os.path.join(output_dir, "Node_stats.txt"),
                                 sep='\t')
    if graph_type == W:
        main_distributions[2].to_csv(os.path.join(output_dir, "Edge_stats.txt"),
                                     sep='\t')

    subgraph = SubGraphObject(input_graph, attribute)
    general_sub = subgraph.subgraphs()
    subgraph.calculate_metrics()
    subgraph_metric = subgraph.metrics()
    subgraph_distribution = subgraph.distributions()

    subgraph_dir = os.path.join(output_dir, "SubGraphs")
    os.makedirs(subgraph_dir, exist_ok=True)

    for key, graph in general_sub.items():
        metrics = subgraph_metric.get(key)
        sub_distributions = subgraph_distribution.get(key)
        # Attribute values may be numbers; path components must be strings.
        info_dir = os.path.join(subgraph_dir, str(key))
        os.makedirs(info_dir, exist_ok=True)

        sub_edge_file = os.path.join(info_dir, f"{key}_edges_file.txt")
        nx.to_pandas_edgelist(graph).rename(
            columns={'source': 'Source', 'target': 'Target', 'weight': 'Weight'}
        ).to_csv(sub_edge_file, sep='\t', index=False)

        sub_node_file = os.path.join(info_dir, f"{key}_nodes_file.txt")
        pd.DataFrame.from_dict(
            dict(graph.nodes(data=True)), orient='index'
        ).to_csv(sub_node_file, sep='\t', index_label="NodeID")

        filename = os.path.join(info_dir, "Main_subgraph_stats.txt")
        try:
            with open(filename, 'w') as f:
                f.write(f"Statistics of subgraph '{key}':\n")
                f.write(f"  Graph type: {graph_stats[0]}\n")
                f.write(f"  Number of nodes: {metrics.get('node_number')}\n")
                f.write(f"  Number of edges: {metrics.get('edge_number')}\n")
                f.write(f"  Density: {metrics.get('density')}\n")
                f.write(f"  Number of connected components: {metrics.get('connect_components')}\n")
                f.write(f"  Transitivity: {metrics.get('transitivity')}\n")
        except PermissionError:
            sys.exit(f"Error: No write permission for '{filename}'.")
        except OSError as e:
            sys.exit(f"Error writing file '{filename}': {e}")

        node_table = sub_distributions.get('nodes')
        node_table.to_csv(os.path.join(info_dir, "Node_characteristics.txt"),
                          sep='\t', index_label="NodeID")
        node_table.describe().to_csv(os.path.join(info_dir, "Node_stats.txt"),
                                     sep='\t')
        # A subgraph without edges has no "Weight" column to summarise.
        if graph_type == W and metrics.get('edge_number') > 0:
            sub_distributions.get('Weight')['Weight'].describe().to_csv(
                os.path.join(info_dir, "Edge_stats.txt"), sep='\t')
235
+
236
+
237
def main() -> None:
    """Read the command-line options and run the graph analysis on them."""
    cli = argparse.ArgumentParser(
        prog="graph_structure",
        description=f"Structural properties analysis of graph and attribute-base subgraphs v{__version__}",
    )

    # (short flag, long flag, extra keyword arguments) for every option.
    options = (
        ('-e', '--edges-file', {'required': True, 'help': "Input TSV file with edges."}),
        ('-n', '--node-file', {'required': True, 'help': "Input TSV file with node attributes."}),
        ('-a', '--attribute', {'required': True, 'help': "Name of the attribute for subgraphs"}),
        ('-o', '--output-dir', {'default': './', 'help': "Directory for output files."}),
    )
    for short_flag, long_flag, extra in options:
        cli.add_argument(short_flag, long_flag, type=str, **extra)

    parsed = cli.parse_args()
    process_graph(
        parsed.edges_file,
        parsed.node_file,
        parsed.attribute,
        parsed.output_dir,
    )
248
+
249
+
250
+ if __name__ == "__main__":
251
+ main()
@@ -0,0 +1 @@
1
+ __version__ = "0.0.1"