agtools 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agtools/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """agtools: Tools for manipulating assembly graphs"""
2
+
3
+ __author__ = "Vijini Mallawaarachchi"
4
+ __copyright__ = "Copyright 2025, agtools Project"
5
+ __credits__ = ["Vijini Mallawaarachchi"]
6
+ __license__ = "MIT"
7
+ __version__ = "0.1.0"
8
+ __maintainer__ = "Vijini Mallawaarachchi"
9
+ __email__ = "viji.mallawaarachchi@gmail.com"
10
+ __status__ = "Alpha"
@@ -0,0 +1,10 @@
1
+ """agtools: Tools for manipulating assembly graphs"""
2
+
3
+ __author__ = "Vijini Mallawaarachchi"
4
+ __copyright__ = "Copyright 2025, agtools Project"
5
+ __credits__ = ["Vijini Mallawaarachchi"]
6
+ __license__ = "MIT"
7
+ __version__ = "0.1.0"
8
+ __maintainer__ = "Vijini Mallawaarachchi"
9
+ __email__ = "viji.mallawaarachchi@gmail.com"
10
+ __status__ = "Alpha"
@@ -0,0 +1,269 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from collections import defaultdict
4
+
5
+ from bidict import bidict
6
+ from igraph import Graph
7
+
8
+ from agtools.core.contig_graph import ContigGraph
9
+ from agtools.core.fasta_parser import FastaParser
10
+ from agtools.core.unitig_graph import UnitigGraph
11
+
12
+
13
def _get_links(contig_paths_file: str) -> tuple:
    """
    Parse contig paths file to extract paths and segment-contig mappings.

    Lines starting with ``#`` or ``seq_name`` are treated as headers and
    skipped, and so are blank lines. On every other line the first
    whitespace-separated field is taken as the contig name and the last
    field as the comma-separated segment path. ``*`` markers are removed
    from the path and empty path entries are discarded, so a path made up
    only of ``*`` markers yields an empty segment list.

    Parameters
    ----------
    contig_paths_file : str
        Path to the file containing contig path information.

    Returns
    -------
    tuple
        A tuple of:
        - contig_names : bidict
            Bidict mapping contig indices to contig names.
        - paths : dict
            Mapping from contig index to list of segment identifiers.
        - segment_contigs : dict
            Mapping from segment ID to a set of contig indices it belongs to.
    """

    contig_names = bidict()
    paths = {}
    segment_contigs = {}

    contig_num = 0

    with open(contig_paths_file, "r") as file:
        # Iterate the handle directly instead of materialising every line.
        for line in file:
            if line.startswith(("#", "seq_name")):
                continue

            strings = line.split()

            # A blank (or whitespace-only) line has no fields to index.
            if not strings:
                continue

            contig_name = strings[0]

            # Drop the "*" markers, then discard the empty entries they (or
            # stray commas) leave behind. Keeping "" as a segment ID would
            # make otherwise unrelated contigs appear to share a segment.
            path = strings[-1].replace("*", "")
            segments = [segment for segment in path.split(",") if segment]

            contig_names[contig_num] = contig_name
            paths[contig_num] = segments

            for segment in segments:
                segment_contigs.setdefault(segment, set()).add(contig_num)

            contig_num += 1

    return contig_names, paths, segment_contigs
72
+
73
+
74
+ def _get_graph_edges(graph_file: str, paths: dict, segment_contigs: dict) -> list:
75
+ """
76
+ Generate edges for a contig-level graph based on GFA link information and segment-to-contig mappings.
77
+
78
+ Parameters
79
+ ----------
80
+ graph_file : str
81
+ Path to the GFA file containing link (`L`) lines.
82
+ paths : dict
83
+ Mapping from contig index to list of segment IDs in the path.
84
+ segment_contigs : dict
85
+ Mapping from segment ID to set of contig indices that contain it.
86
+
87
+ Returns
88
+ -------
89
+ list of tuple
90
+ List of edges as (source_index, target_index) pairs.
91
+ """
92
+
93
+ links_map = defaultdict(set)
94
+
95
+ # Get links from assembly_graph_with_scaffolds.gfa
96
+ with open(graph_file) as file:
97
+ line = file.readline()
98
+
99
+ while line != "":
100
+ # Identify lines with link information
101
+ if "L" in line:
102
+ strings = line.split("\t")
103
+
104
+ f1, f2 = "", ""
105
+
106
+ if strings[2] == "+":
107
+ f1 = strings[1][5:]
108
+ if strings[2] == "-":
109
+ f1 = "-" + strings[1][5:]
110
+ if strings[4] == "+":
111
+ f2 = strings[3][5:]
112
+ if strings[4] == "-":
113
+ f2 = "-" + strings[3][5:]
114
+
115
+ links_map[f1].add(f2)
116
+ links_map[f2].add(f1)
117
+
118
+ line = file.readline()
119
+
120
+ # Create list of edges
121
+ edge_list = []
122
+
123
+ for i in paths:
124
+ segments = paths[i]
125
+
126
+ new_links = []
127
+
128
+ for segment in segments:
129
+ my_segment = segment
130
+ my_segment_num = ""
131
+
132
+ my_segment_rev = ""
133
+
134
+ if my_segment.startswith("-"):
135
+ my_segment_rev = my_segment[1:]
136
+ my_segment_num = my_segment[1:]
137
+ else:
138
+ my_segment_rev = "-" + my_segment
139
+ my_segment_num = my_segment
140
+
141
+ if my_segment in links_map:
142
+ new_links.extend(list(links_map[my_segment]))
143
+
144
+ if my_segment_rev in links_map:
145
+ new_links.extend(list(links_map[my_segment_rev]))
146
+
147
+ if my_segment in segment_contigs:
148
+ for contig in segment_contigs[my_segment]:
149
+ if i != contig:
150
+ # Add edge to list of edges
151
+ edge_list.append((i, contig))
152
+
153
+ if my_segment_rev in segment_contigs:
154
+ for contig in segment_contigs[my_segment_rev]:
155
+ if i != contig:
156
+ # Add edge to list of edges
157
+ edge_list.append((i, contig))
158
+
159
+ if my_segment_num in segment_contigs:
160
+ for contig in segment_contigs[my_segment_num]:
161
+ if i != contig:
162
+ # Add edge to list of edges
163
+ edge_list.append((i, contig))
164
+
165
+ for new_link in new_links:
166
+ if new_link in segment_contigs:
167
+ for contig in segment_contigs[new_link]:
168
+ if i != contig:
169
+ # Add edge to list of edges
170
+ edge_list.append((i, contig))
171
+
172
+ if new_link.startswith("-"):
173
+ if new_link[1:] in segment_contigs:
174
+ for contig in segment_contigs[new_link[1:]]:
175
+ if i != contig:
176
+ # Add edge to list of edges
177
+ edge_list.append((i, contig))
178
+
179
+ return edge_list
180
+
181
+
182
def get_contig_graph(
    graph_file: str, contigs_file: str, contig_paths_file: str
) -> ContigGraph:
    """
    Assemble a contig-level graph from a GFA file and a contig paths file.

    Each contig listed in the paths file becomes one vertex of an undirected
    igraph graph. Edges come from `_get_graph_edges`, i.e. from segments
    shared between contigs and from link records in the GFA file.

    Parameters
    ----------
    graph_file : str
        Path to the GFA file.
    contigs_file : str
        Path to the FASTA file with contig sequences.
    contig_paths_file : str
        Path to the file with segment paths used to build contigs.

    Returns
    -------
    ContigGraph
        An object representing the contig-level graph with node metadata.
    """

    # Contig index <-> name map, per-contig segment paths and the reverse
    # segment -> contigs lookup.
    contig_names, paths, segment_contigs = _get_links(contig_paths_file)
    node_count = len(contig_names)

    # One vertex per contig.
    graph = Graph()
    graph.add_vertices(node_count)

    # Vertex "id" is the contig index, "label" is the contig name.
    for index in range(node_count):
        vertex = graph.vs[index]
        vertex["id"] = index
        vertex["label"] = contig_names[index]

    graph.add_edges(
        _get_graph_edges(
            graph_file=graph_file,
            paths=paths,
            segment_contigs=segment_contigs,
        )
    )

    # Collapse parallel edges; loop edges are not removed (loops=False).
    graph.simplify(multiple=True, loops=False, combine_edges=None)

    # Parser for the contig sequences
    parser = FastaParser(contigs_file)

    return ContigGraph(
        graph=graph,
        vcount=graph.vcount(),
        ecount=graph.ecount(),
        file_path=graph_file,
        contig_names=contig_names,
        contig_parser=parser,
    )
251
+
252
+
253
def get_unitig_graph(graph_file) -> UnitigGraph:
    """
    Load the unitig-level assembly graph stored in a GFA file.

    Thin wrapper that delegates all parsing to `UnitigGraph.from_gfa`.

    Parameters
    ----------
    graph_file : str
        Path to the GFA file.

    Returns
    -------
    UnitigGraph
        Parsed unitig graph object.
    """

    return UnitigGraph.from_gfa(graph_file)
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from bidict import bidict
4
+ from Bio import SeqIO
5
+ from igraph import Graph
6
+
7
+ from agtools.core.contig_graph import ContigGraph
8
+ from agtools.core.fasta_parser import FastaParser
9
+
10
+
11
def _get_links_megahit(gfa_file: str) -> tuple:
    """
    Read segment and link records from a MEGAHIT-style GFA file.

    Segment (`S`) lines are numbered in file order and their sequence length
    is recorded; link (`L`) lines contribute the pair of segment IDs they
    connect. Link orientations are not kept.

    Parameters
    ----------
    gfa_file : str
        Path to the MEGAHIT-style GFA file.

    Returns
    -------
    node_count : int
        Number of segment lines read.
    graph_contig_seqs : dict
        Mapping of segment ID -> sequence length in graph file.
    links : list of list
        List of 2-element lists representing linked segment IDs.
    contig_names : bidict
        Mapping of numeric node ID -> segment ID.
    """

    contig_names = bidict()
    graph_contig_seqs = {}
    links = []
    node_count = 0

    with open(gfa_file) as handle:
        for record in handle:
            if record.startswith("L"):
                # Link record: fields 1 and 3 hold the two segment IDs.
                fields = record.split("\t")
                links.append([fields[1], fields[3]])

            elif record.startswith("S"):
                # Segment record: field 1 is the ID, field 2 the sequence.
                fields = record.split()
                segment_id = fields[1]

                contig_names[node_count] = segment_id
                graph_contig_seqs[segment_id] = len(fields[2])

                node_count += 1

    return node_count, graph_contig_seqs, links, contig_names
70
+
71
+
72
def _get_graph_edges_megahit(links: list, contig_names_rev: bidict) -> tuple:
    """
    Translate segment-ID links into numeric edges and self-loop nodes.

    Parameters
    ----------
    links : list of list
        Pairs of linked segment IDs.
    contig_names_rev : bidict
        Mapping of segment ID -> numeric node ID.

    Returns
    -------
    edge_list : list of tuple
        List of edges as tuples of node IDs.
    self_loops : list of int
        List of node IDs that form self-loops (one entry per self-linking
        record, so a node may appear more than once).
    """

    edge_list = []
    self_loops = []

    for link in links:
        source, target = link[0], link[1]

        # A link from a segment to itself is reported separately instead of
        # becoming an edge.
        if source == target:
            self_loops.append(contig_names_rev[source])
            continue

        edge_list.append((contig_names_rev[source], contig_names_rev[target]))

    return edge_list, self_loops
104
+
105
+
106
def get_contig_graph(gfa_file: str, contigs_file: str) -> ContigGraph:
    """
    Construct a contig-level graph from a MEGAHIT GFA file and its contigs FASTA.

    Segments of the GFA file become vertices and link records become edges.
    GFA segment IDs are paired with FASTA record IDs by walking both files in
    order and keeping only the pairs whose sequence lengths are equal.

    Parameters
    ----------
    gfa_file : str
        Path to the GFA file.
    contigs_file : str
        Path to the contigs FASTA file.

    Returns
    -------
    ContigGraph
        Parsed contig graph object.
    """

    # FASTA record ID -> sequence length, and -> full description line
    original_contig_seqs = {}
    contig_descriptions = {}

    for record in SeqIO.parse(contigs_file, "fasta"):
        original_contig_seqs[record.id] = len(record.seq)
        contig_descriptions[record.id] = record.description

    # Segments and links of the assembly graph
    node_count, graph_contig_seqs, links, contig_names = _get_links_megahit(
        gfa_file
    )

    # Numeric edges plus the nodes that link to themselves
    edge_list, self_loops = _get_graph_edges_megahit(
        links=links, contig_names_rev=contig_names.inverse
    )

    # One vertex per segment
    graph = Graph()
    graph.add_vertices(node_count)

    # Vertex "id" is the node index, "label" is the segment ID.
    for index in range(node_count):
        vertex = graph.vs[index]
        vertex["id"] = index
        vertex["label"] = contig_names[index]

    graph.add_edges(edge_list)

    # Collapse parallel edges; loop edges are not removed (loops=False).
    graph.simplify(multiple=True, loops=False, combine_edges=None)

    # GFA segment ID <-> FASTA record ID. Entries are paired positionally;
    # a pair is kept only when both sequences have the same length.
    graph_to_contig_map = bidict()

    paired_entries = zip(graph_contig_seqs.items(), original_contig_seqs.items())
    for (graph_id, graph_len), (fasta_id, fasta_len) in paired_entries:
        if graph_len != fasta_len:
            continue
        graph_to_contig_map[graph_id] = fasta_id

    # The length tables are only needed for the matching above.
    del graph_contig_seqs
    del original_contig_seqs

    # Parser for the contig sequences
    parser = FastaParser(contigs_file, assembler="megahit", mapping=graph_to_contig_map)

    return ContigGraph(
        graph=graph,
        vcount=graph.vcount(),
        ecount=graph.ecount(),
        file_path=gfa_file,
        contig_names=contig_names,
        contig_parser=parser,
        contig_descriptions=contig_descriptions,
        graph_to_contig_map=graph_to_contig_map,
        self_loops=self_loops,
    )
@@ -0,0 +1,160 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from bidict import bidict
4
+ from igraph import Graph
5
+
6
+ from agtools.core.contig_graph import ContigGraph
7
+ from agtools.core.fasta_parser import FastaParser
8
+
9
+
10
def _get_links_myloasm(gfa_file: str, contig_index: dict) -> tuple:
    """
    Parse a GFA file to extract contig information and connectivity
    information (links) between contigs.

    Only segments whose ID is present in `contig_index` are numbered, and
    only links whose two endpoints are both present in `contig_index` are
    kept.

    Parameters
    ----------
    gfa_file : str
        Path to the myloasm-style GFA file.
    contig_index : dict
        Collection of known contig IDs. Only membership tests (``in``) are
        performed on it.

    Returns
    -------
    node_count : int
        Number of segments that were found in `contig_index`.
    links : list of list
        List of 4-element lists
        ``[from_id, from_orientation, to_id, to_orientation]``.
    contig_names : bidict
        Mapping of numeric node ID -> contig ID.
    """
    node_count = 0

    links = []

    contig_names = bidict()

    # Get links from .gfa file
    with open(gfa_file) as file:
        # Stream the file line by line: readlines() would hold the whole GFA
        # (segment records may embed full sequences) in memory at once.
        for line in file:
            # Identify lines with link information
            if line.startswith("L"):
                strings = line.strip().split("\t")

                link1 = strings[1]
                link1_orient = strings[2]
                link2 = strings[3]
                link2_orient = strings[4]

                # Ignore links touching segments that are not known contigs.
                if link1 in contig_index and link2 in contig_index:
                    links.append([link1, link1_orient, link2, link2_orient])

            # Identify lines with contig information
            elif line.startswith("S"):
                strings = line.strip().split("\t")

                if strings[1] in contig_index:
                    contig_names[node_count] = strings[1]
                    node_count += 1

    return node_count, links, contig_names
67
+
68
+
69
def _get_graph_edges_myloasm(links: list, contig_names_rev: bidict) -> tuple:
    """
    Translate contig-ID links into numeric edges and self-loop nodes.

    Parameters
    ----------
    links : list of list
        Link entries whose element 0 is the source contig ID and element 2
        is the target contig ID (elements 1 and 3 are not read here).
    contig_names_rev : bidict
        Mapping of segment ID -> numeric node ID.

    Returns
    -------
    edge_list : list of tuple
        List of edges as tuples of node IDs.
    self_loops : list of int
        List of node IDs that form self-loops (one entry per self-linking
        record, so a node may appear more than once).
    """

    edge_list = []
    self_loops = []

    for link in links:
        source, target = link[0], link[2]

        # A link from a contig to itself is reported separately instead of
        # becoming an edge.
        if source == target:
            self_loops.append(contig_names_rev[source])
            continue

        edge_list.append((contig_names_rev[source], contig_names_rev[target]))

    return edge_list, self_loops
101
+
102
+
103
def get_contig_graph(gfa_file: str, contigs_file: str) -> ContigGraph:
    """
    Construct a contig-level graph from a myloasm GFA file and its contigs FASTA.

    The FASTA parser's index decides which GFA segments and links are used:
    `_get_links_myloasm` keeps only the IDs it finds in `parser.index`.

    Parameters
    ----------
    gfa_file : str
        Path to the GFA file.
    contigs_file : str
        Path to the contigs FASTA file.

    Returns
    -------
    ContigGraph
        Parsed contig graph object.
    """

    # Parser for the contig sequences; its index filters the GFA records.
    parser = FastaParser(contigs_file, assembler="myloasm")

    # Contigs and links of the assembly graph
    node_count, links, contig_names = _get_links_myloasm(gfa_file, parser.index)

    # Numeric edges plus the nodes that link to themselves
    edge_list, self_loops = _get_graph_edges_myloasm(
        links=links, contig_names_rev=contig_names.inverse
    )

    # One vertex per contig
    graph = Graph()
    graph.add_vertices(node_count)

    # Vertex "id" is the node index, "label" is the contig ID.
    for index in range(node_count):
        vertex = graph.vs[index]
        vertex["id"] = index
        vertex["label"] = contig_names[index]

    graph.add_edges(edge_list)

    # Collapse parallel edges; loop edges are not removed (loops=False).
    graph.simplify(multiple=True, loops=False, combine_edges=None)

    return ContigGraph(
        graph=graph,
        vcount=graph.vcount(),
        ecount=graph.ecount(),
        file_path=gfa_file,
        contig_names=contig_names,
        contig_parser=parser,
        contig_descriptions=None,
        graph_to_contig_map=None,
        self_loops=self_loops,
    )