agtools 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agtools/__init__.py +10 -0
- agtools/assemblers/__init__.py +10 -0
- agtools/assemblers/flye.py +269 -0
- agtools/assemblers/megahit.py +192 -0
- agtools/assemblers/myloasm.py +160 -0
- agtools/assemblers/spades.py +258 -0
- agtools/cli.py +298 -0
- agtools/commands/__init__.py +24 -0
- agtools/commands/asqg2gfa.py +133 -0
- agtools/commands/clean.py +127 -0
- agtools/commands/component.py +128 -0
- agtools/commands/concat.py +133 -0
- agtools/commands/fastg2gfa.py +153 -0
- agtools/commands/filter.py +129 -0
- agtools/commands/gfa2adj.py +49 -0
- agtools/commands/gfa2dot.py +99 -0
- agtools/commands/gfa2fasta.py +104 -0
- agtools/commands/gfa2fastg.py +185 -0
- agtools/commands/rename.py +186 -0
- agtools/commands/stats.py +125 -0
- agtools/core/__init__.py +10 -0
- agtools/core/contig_graph.py +362 -0
- agtools/core/fasta_parser.py +166 -0
- agtools/core/unitig_graph.py +530 -0
- agtools/log_config.py +24 -0
- agtools-0.1.0.dist-info/LICENSE +21 -0
- agtools-0.1.0.dist-info/METADATA +104 -0
- agtools-0.1.0.dist-info/RECORD +30 -0
- agtools-0.1.0.dist-info/WHEEL +4 -0
- agtools-0.1.0.dist-info/entry_points.txt +3 -0
agtools/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""agtools: Tools for manipulating assembly graphs"""
|
|
2
|
+
|
|
3
|
+
__author__ = "Vijini Mallawaarachchi"
|
|
4
|
+
__copyright__ = "Copyright 2025, agtools Project"
|
|
5
|
+
__credits__ = ["Vijini Mallawaarachchi"]
|
|
6
|
+
__license__ = "MIT"
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
__maintainer__ = "Vijini Mallawaarachchi"
|
|
9
|
+
__email__ = "viji.mallawaarachchi@gmail.com"
|
|
10
|
+
__status__ = "Alpha"
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""agtools: Tools for manipulating assembly graphs"""
|
|
2
|
+
|
|
3
|
+
__author__ = "Vijini Mallawaarachchi"
|
|
4
|
+
__copyright__ = "Copyright 2025, agtools Project"
|
|
5
|
+
__credits__ = ["Vijini Mallawaarachchi"]
|
|
6
|
+
__license__ = "MIT"
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
__maintainer__ = "Vijini Mallawaarachchi"
|
|
9
|
+
__email__ = "viji.mallawaarachchi@gmail.com"
|
|
10
|
+
__status__ = "Alpha"
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
|
|
5
|
+
from bidict import bidict
|
|
6
|
+
from igraph import Graph
|
|
7
|
+
|
|
8
|
+
from agtools.core.contig_graph import ContigGraph
|
|
9
|
+
from agtools.core.fasta_parser import FastaParser
|
|
10
|
+
from agtools.core.unitig_graph import UnitigGraph
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _get_links(contig_paths_file: str) -> tuple:
    """
    Parse contig paths file to extract paths and segment-contig mappings.

    Lines starting with ``#`` or ``seq_name`` are treated as headers and
    skipped, and so are blank lines. For every other line the first
    whitespace-separated field is used as the contig name and the last field
    as a comma-separated segment path. All ``*`` characters are removed from
    the path, and one leading and one trailing comma are trimmed before the
    path is split.

    Parameters
    ----------
    contig_paths_file : str
        Path to the file containing contig path information.

    Returns
    -------
    tuple
        A tuple of:
        - contig_names : bidict
            Bidict mapping contig indices to contig names.
        - paths : dict
            Mapping from contig index to list of segment identifiers.
            Empty tokens left behind by removed ``*`` entries are kept here.
        - segment_contigs : dict
            Mapping from segment ID to a set of contig indices it belongs to.
            Empty tokens are never used as keys.
    """

    contig_names = bidict()
    paths = {}
    segment_contigs = {}

    # Index assigned to the next contig; only data lines advance it
    contig_num = 0

    with open(contig_paths_file, "r") as file:
        for line in file:
            if line.startswith(("#", "seq_name")):
                continue

            strings = line.split()

            # A blank line has no fields and would fail on strings[0]
            if not strings:
                continue

            contig_name = strings[0]

            # Dropping "*" entries can leave a stray comma at either end
            path = strings[-1].replace("*", "")

            if path.startswith(","):
                path = path[1:]

            if path.endswith(","):
                path = path[:-1]

            segments = path.split(",")

            contig_names[contig_num] = contig_name
            paths[contig_num] = segments

            for segment in segments:
                # An empty token is not a real segment. Indexing it would make
                # every contig containing one look like it shares a segment
                # with all the others.
                if segment:
                    segment_contigs.setdefault(segment, set()).add(contig_num)

            contig_num += 1

    return contig_names, paths, segment_contigs
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _get_graph_edges(graph_file: str, paths: dict, segment_contigs: dict) -> list:
    """
    Generate edges for a contig-level graph based on GFA link information and segment-to-contig mappings.

    Two contigs are connected when they contain the same segment (in either
    orientation) or when a segment of one is joined by a GFA link to a
    segment of the other. The returned list may contain duplicate pairs and
    both directions of the same pair.

    Parameters
    ----------
    graph_file : str
        Path to the GFA file containing link (`L`) lines.
    paths : dict
        Mapping from contig index to list of segment IDs in the path.
    segment_contigs : dict
        Mapping from segment ID to set of contig indices that contain it.

    Returns
    -------
    list of tuple
        List of edges as (source_index, target_index) pairs.
    """

    links_map = defaultdict(set)

    # Get links from the GFA file
    with open(graph_file) as file:
        for line in file:
            # Only link records are relevant. Testing the record type with
            # startswith avoids matching other records that merely contain
            # the letter "L" somewhere (for example in an optional tag).
            if not line.startswith("L"):
                continue

            # Strip the line terminator so it cannot end up glued to the
            # last field that is read below
            strings = line.rstrip("\r\n").split("\t")

            f1, f2 = "", ""

            # The first five characters of a segment name are dropped
            # (presumably an "edge_" prefix - verify against the assembler
            # output); reverse orientation is encoded with a leading "-".
            if strings[2] == "+":
                f1 = strings[1][5:]
            if strings[2] == "-":
                f1 = "-" + strings[1][5:]
            if strings[4] == "+":
                f2 = strings[3][5:]
            if strings[4] == "-":
                f2 = "-" + strings[3][5:]

            links_map[f1].add(f2)
            links_map[f2].add(f1)

    # Create list of edges
    edge_list = []

    def _connect(source: int, segment_id: str) -> None:
        # Link `source` to every other contig that contains `segment_id`
        for contig in segment_contigs.get(segment_id, ()):
            if contig != source:
                edge_list.append((source, contig))

    for i, segments in paths.items():
        new_links = []

        for segment in segments:
            if segment.startswith("-"):
                segment_rev = segment[1:]
                segment_num = segment[1:]
            else:
                segment_rev = "-" + segment
                segment_num = segment

            # Membership is tested first so that the defaultdict does not
            # grow with keys that never appeared in a link record
            for key in (segment, segment_rev):
                if key in links_map:
                    new_links.extend(links_map[key])

            # Contigs sharing this segment, its reverse or its unsigned ID
            for key in (segment, segment_rev, segment_num):
                _connect(i, key)

        # Contigs reachable through a GFA link from any segment of contig i
        for new_link in new_links:
            _connect(i, new_link)

            if new_link.startswith("-"):
                _connect(i, new_link[1:])

    return edge_list
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def get_contig_graph(
    graph_file: str, contigs_file: str, contig_paths_file: str
) -> ContigGraph:
    """
    Build a contig-level graph from an assembly GFA file and contig path mappings.

    Contigs become vertices (attributes ``id`` and ``label``), and edges come
    from shared segments or GFA link records. Multi-edges are collapsed
    afterwards; self-loops are not removed by the simplification step.

    Parameters
    ----------
    graph_file : str
        Path to the GFA file.
    contigs_file : str
        Path to the FASTA file with contig sequences.
    contig_paths_file : str
        Path to the file with segment paths used to build contigs.

    Returns
    -------
    ContigGraph
        An object representing the contig-level graph with node metadata.
    """

    # Contig naming, per-contig segment paths and the reverse segment index
    contig_names, paths, segment_contigs = _get_links(contig_paths_file)
    node_count = len(contig_names)

    # One vertex per contig, labelled with the contig name
    graph = Graph()
    graph.add_vertices(node_count)

    for idx in range(node_count):
        vertex = graph.vs[idx]
        vertex["id"] = idx
        vertex["label"] = contig_names[idx]

    # Connect contigs and collapse duplicate edges
    graph.add_edges(
        _get_graph_edges(
            graph_file=graph_file,
            paths=paths,
            segment_contigs=segment_contigs,
        )
    )
    graph.simplify(multiple=True, loops=False, combine_edges=None)

    # Parser giving access to the contig sequences
    parser = FastaParser(contigs_file)

    return ContigGraph(
        graph=graph,
        vcount=graph.vcount(),
        ecount=graph.ecount(),
        file_path=graph_file,
        contig_names=contig_names,
        contig_parser=parser,
    )
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def get_unitig_graph(graph_file) -> UnitigGraph:
    """
    Build a unitig-level assembly graph from a GFA file.

    Thin wrapper that delegates all parsing to ``UnitigGraph.from_gfa``.

    Parameters
    ----------
    graph_file : str
        Path to the GFA file.

    Returns
    -------
    UnitigGraph
        Parsed unitig graph object.
    """

    return UnitigGraph.from_gfa(graph_file)
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
from bidict import bidict
|
|
4
|
+
from Bio import SeqIO
|
|
5
|
+
from igraph import Graph
|
|
6
|
+
|
|
7
|
+
from agtools.core.contig_graph import ContigGraph
|
|
8
|
+
from agtools.core.fasta_parser import FastaParser
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _get_links_megahit(gfa_file: str) -> tuple:
    """
    Parse a GFA file to extract segment sequences and connectivity (links) between segments.

    ``S`` records register a segment and the length of its sequence field;
    ``L`` records contribute one pair of segment IDs each. Segments are
    numbered in the order in which they appear in the file.

    Parameters
    ----------
    gfa_file : str
        Path to the MEGAHIT-style GFA file.

    Returns
    -------
    node_count : int
        Number of unique segments.
    graph_contig_seqs : dict
        Mapping of segment ID -> sequence length in graph file.
    links : list of list
        List of 2-element lists representing linked segment IDs.
    contig_names : bidict
        Mapping of numeric node ID -> segment ID.
    """

    contig_names = bidict()
    graph_contig_seqs = {}
    links = []
    node_count = 0

    with open(gfa_file) as handle:
        for record in handle:
            if record.startswith("L"):
                # Link record: keep only the two segment IDs
                fields = record.split("\t")
                links.append([fields[1], fields[3]])

            elif record.startswith("S"):
                # Segment record: remember its ID and sequence length
                fields = record.split()
                segment_id = fields[1]

                contig_names[node_count] = segment_id
                graph_contig_seqs[segment_id] = len(fields[2])
                node_count += 1

    return node_count, graph_contig_seqs, links, contig_names
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _get_graph_edges_megahit(links: list, contig_names_rev: bidict) -> tuple:
    """
    Convert a list of segment links into igraph-compatible edges.

    A link whose two segment IDs are equal is reported as a self-loop
    instead of being turned into an edge.

    Parameters
    ----------
    links : list of list
        Pairs of linked segment IDs.
    contig_names_rev : bidict
        Mapping of segment ID -> numeric node ID.

    Returns
    -------
    edge_list : list of tuple
        List of edges as tuples of node IDs.
    self_loops : list of int
        List of node IDs that form self-loops.
    """

    edge_list = []
    self_loops = []

    for link in links:
        source, target = link[0], link[1]

        # Self loops are collected separately rather than added as edges
        if source == target:
            self_loops.append(contig_names_rev[source])
            continue

        edge_list.append((contig_names_rev[source], contig_names_rev[target]))

    return edge_list, self_loops
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def get_contig_graph(gfa_file: str, contigs_file: str) -> ContigGraph:
    """
    Build a contig-level graph from a MEGAHIT GFA file and a contig FASTA file.

    Segments of the GFA file become vertices and its link records become
    edges. GFA segment IDs are then paired with FASTA record IDs so that the
    resulting ContigGraph can translate between the two naming schemes.

    Parameters
    ----------
    gfa_file : str
        Path to the GFA file.
    contigs_file : str
        Path to the contigs FASTA file.

    Returns
    -------
    ContigGraph
        Parsed contig graph object.
    """

    # Sequence length and full description line of every FASTA record
    original_contig_seqs = {}
    contig_descriptions = {}

    for record in SeqIO.parse(contigs_file, "fasta"):
        original_contig_seqs[record.id] = len(record.seq)
        contig_descriptions[record.id] = record.description

    # Segments and links of the assembly graph
    node_count, graph_contig_seqs, links, contig_names = _get_links_megahit(gfa_file)

    # Edges between distinct segments, plus the self loops found on the way
    edge_list, self_loops = _get_graph_edges_megahit(
        links=links, contig_names_rev=contig_names.inverse
    )

    # One vertex per segment, labelled with the segment ID
    graph = Graph()
    graph.add_vertices(node_count)

    for idx in range(node_count):
        vertex = graph.vs[idx]
        vertex["id"] = idx
        vertex["label"] = contig_names[idx]

    graph.add_edges(edge_list)

    # Collapse duplicate edges
    graph.simplify(multiple=True, loops=False, combine_edges=None)

    # Pair GFA segments with FASTA records position by position; a pair is
    # only accepted when both sequences have the same length.
    # NOTE(review): this relies on both files listing contigs in the same
    # order - confirm against MEGAHIT output.
    graph_to_contig_map = bidict()

    paired = zip(graph_contig_seqs.items(), original_contig_seqs.items())
    for (graph_id, graph_len), (fasta_id, fasta_len) in paired:
        if graph_len == fasta_len:
            graph_to_contig_map[graph_id] = fasta_id

    # The length tables are no longer needed
    del graph_contig_seqs
    del original_contig_seqs

    # Parser giving access to the contig sequences
    parser = FastaParser(contigs_file, assembler="megahit", mapping=graph_to_contig_map)

    return ContigGraph(
        graph=graph,
        vcount=graph.vcount(),
        ecount=graph.ecount(),
        file_path=gfa_file,
        contig_names=contig_names,
        contig_parser=parser,
        contig_descriptions=contig_descriptions,
        graph_to_contig_map=graph_to_contig_map,
        self_loops=self_loops,
    )
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
from bidict import bidict
|
|
4
|
+
from igraph import Graph
|
|
5
|
+
|
|
6
|
+
from agtools.core.contig_graph import ContigGraph
|
|
7
|
+
from agtools.core.fasta_parser import FastaParser
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _get_links_myloasm(gfa_file: str, contig_index: dict) -> tuple:
    """
    Parse a GFA file to extract contig information and connectivity
    information (links) between contigs.

    Only segments whose ID is present in ``contig_index`` are kept, and a
    link is kept only when both of its endpoints are present.

    Parameters
    ----------
    gfa_file : str
        Path to the myloasm-style GFA file.
    contig_index : dict
        Container of known contig IDs; only membership tests are performed
        on it.

    Returns
    -------
    node_count : int
        Number of unique segments.
    links : list of list
        List of 4-element lists ``[id1, orientation1, id2, orientation2]``
        representing linked segment IDs.
    contig_names : bidict
        Mapping of numeric node ID -> contig ID.
    """

    contig_names = bidict()
    links = []
    node_count = 0

    with open(gfa_file) as handle:
        for record in handle:
            if record.startswith("L"):
                # Link record: both endpoints together with their orientation
                fields = record.strip().split("\t")

                source, source_orient = fields[1], fields[2]
                target, target_orient = fields[3], fields[4]

                if source in contig_index and target in contig_index:
                    links.append([source, source_orient, target, target_orient])

            elif record.startswith("S"):
                # Segment record: number it if it is a known contig
                segment_id = record.strip().split("\t")[1]

                if segment_id in contig_index:
                    contig_names[node_count] = segment_id
                    node_count += 1

    return node_count, links, contig_names
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _get_graph_edges_myloasm(links: list, contig_names_rev: bidict) -> tuple:
    """
    Convert a list of segment links into igraph-compatible edges.

    Each link is read as ``[id1, orientation1, id2, orientation2]``; only
    the two IDs (positions 0 and 2) are used. A link joining a segment to
    itself is reported as a self-loop instead of being turned into an edge.

    Parameters
    ----------
    links : list of list
        Links between segment IDs.
    contig_names_rev : bidict
        Mapping of segment ID -> numeric node ID.

    Returns
    -------
    edge_list : list of tuple
        List of edges as tuples of node IDs.
    self_loops : list of int
        List of node IDs that form self-loops.
    """

    edge_list = []
    self_loops = []

    for link in links:
        source, target = link[0], link[2]

        # Self loops are collected separately rather than added as edges
        if source == target:
            self_loops.append(contig_names_rev[source])
            continue

        edge_list.append((contig_names_rev[source], contig_names_rev[target]))

    return edge_list, self_loops
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def get_contig_graph(gfa_file: str, contigs_file: str) -> ContigGraph:
    """
    Build a contig-level graph from a myloasm GFA file and a contig FASTA file.

    Only GFA segments that are present in the index of the FASTA parser
    become vertices; links between two such segments become edges.

    Parameters
    ----------
    gfa_file : str
        Path to the GFA file.
    contigs_file : str
        Path to the contigs FASTA file.

    Returns
    -------
    ContigGraph
        Parsed contig graph object.
    """

    # The parser's index decides which GFA segments are kept
    parser = FastaParser(contigs_file, assembler="myloasm")

    # Contigs and links of the assembly graph
    node_count, links, contig_names = _get_links_myloasm(gfa_file, parser.index)

    # Edges between distinct contigs, plus the self loops found on the way
    edge_list, self_loops = _get_graph_edges_myloasm(
        links=links, contig_names_rev=contig_names.inverse
    )

    # One vertex per contig, labelled with the contig ID
    graph = Graph()
    graph.add_vertices(node_count)

    for idx in range(node_count):
        vertex = graph.vs[idx]
        vertex["id"] = idx
        vertex["label"] = contig_names[idx]

    graph.add_edges(edge_list)

    # Collapse duplicate edges
    graph.simplify(multiple=True, loops=False, combine_edges=None)

    return ContigGraph(
        graph=graph,
        vcount=graph.vcount(),
        ecount=graph.ecount(),
        file_path=gfa_file,
        contig_names=contig_names,
        contig_parser=parser,
        contig_descriptions=None,
        graph_to_contig_map=None,
        self_loops=self_loops,
    )
|