mgnify-pipelines-toolkit 0.1.9__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the two published versions.

Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.

@@ -0,0 +1,424 @@
+ #!/usr/bin/env python3
+
+ import argparse
+ import json
+ import logging
+ import os
+ import re
+
+ from Bio import SeqIO
+
+ __version__ = "1.0.4"
+
+
+ class Region:
+     def __init__(self, start, end):
+         # if end < start:  # assuming that for +/- start always lower
+         #     start, end = end, start
+         self.start = int(start)
+         self.end = int(end)
+
+     def __str__(self):
+         return "[" + str(self.start) + "," + str(self.end) + "]"
+
+     def __ge__(self, other):
+         return self.start >= other.end
+
+     def __gt__(self, other):
+         return self.start > other.end
+
+     def __le__(self, other):
+         return self.end <= other.start
+
+     def __lt__(self, other):
+         return self.end < other.start
+
+     def length(self):
+         return self.end - self.start + 1
+
+     # If 'other' overlaps and has a greater end position
+     def extends_right(self, other):
+         if self.overlaps(other) and self.end > other.end:
+             return True
+         return False
+
+     # For overlapping fragments extend start and end to match other
+     def extend(self, other):
+         if self.overlaps(other):
+             if other.end > self.end:
+                 self.end = other.end
+             if other.start < self.start:
+                 self.start = other.start
+
+     def within(self, other):
+         if self.start >= other.start and self.end <= other.end:
+             return True
+         return False
+
+     # Return length of overlap between regions
+     def overlaps(self, other):
+         if self > other or other > self:
+             return False
+         # overlap = sum of the individual lengths ...
+         ltot = self.length() + other.length()
+         # ... minus length of the combined region (i.e. min start to max end)
+         lmax = max(self.end, other.end) - min(self.start, other.start) + 1
+         return ltot - lmax
+
+
+ # FGS has seq_id/start/end in the fasta files - use those to extract the sequences we want to keep;
+ # for prodigal it uses a seq_id/index_number, so need to add an extra field
+ class NumberedRegion(Region):
+     def __init__(self, start, end, nid):
+         super().__init__(start, end)
+         self.nid = nid
+
+
+ def flatten_regions(regions):
+     """Take a list of regions (possibly overlapping) and return the non-overlapping set"""
+     if len(regions) < 2:
+         return regions
+
+     flattened = []
+     regions = sorted(regions, key=lambda x: x.start)  # sort by start
+     flattened = [regions[0]]
+     regions = regions[1:]  # store the first
+     for region in regions:
+         if not region.overlaps(flattened[-1]):  # doesn't overlap: store new region
+             flattened.append(region)
+         elif region.extends_right(flattened[-1]):  # overlaps to the right: extend previous region
+             flattened[-1].extend(region)
+         # else end < prev end => new region within old: do nothing
+     return flattened
+
+
+ def check_against_gaps(regions, candidates):
+     """Given a set of non-overlapping gaps and a list of candidate regions, return the candidates that do not overlap"""
+     regions = sorted(regions, key=lambda line: line.start)
+     candidates = sorted(candidates, key=lambda line: line.start)
+     selected = []
+     r = 0
+     if not len(regions):
+         return candidates  # no existing predictions - all candidates accepted
+
+     for c in candidates:
+         if c < regions[0] or c > regions[-1]:  # outside any of the regions: just append
+             selected.append(c)
+         else:
+             while r < len(regions) - 1 and c >= regions[r]:
+                 r += 1
+             if c < regions[r]:  # found a gap
+                 selected.append(c)
+
+     return selected
+
+
+ def output_prodigal(predictions, files, outputs):
+     """From the combined predictions output the prodigal data"""
+
+     sequence_set = set()
+     for seq in predictions:
+         for strand in ["-", "+"]:
+             for region in predictions[seq][strand]:
+                 sequence_set.add("_".join([seq, str(region.nid)]))
+
+     # files contains the .faa and .ffn fasta files
+     for index in [1, 2]:
+         sequences = []
+         for record in SeqIO.parse(files[index], "fasta"):
+             # remove anything after the first space
+             seq_name = record.id.split(" ")[0]
+             # Replace ending * #
+             record.seq = record.seq.rstrip("*")
+             if seq_name in sequence_set:
+                 sequences.append(record)
+
+         with open(outputs[index], "a") as output_handle:
+             SeqIO.write(sequences, output_handle, "fasta")
+
+
+ def output_fgs(predictions, files, outputs):
+     """From the combined predictions output the FGS data"""
+     sequence_set = set()
+     for seq in predictions:
+         for strand in ["-", "+"]:
+             for region in predictions[seq][strand]:
+                 sequence_set.add("_".join([seq, str(region.start), str(region.end), strand]))
+
+     # files contains the .faa and .ffn fasta files
+     for index in [1, 2]:
+         sequences = []
+         for record in SeqIO.parse(files[index], "fasta"):
+             # remove anything after the first space
+             seq_name = record.id.split(" ")[0]
+             # Replace "*" with "X"
+             record.seq = record.seq.replace("*", "X")
+             if seq_name in sequence_set:
+                 sequences.append(record)
+
+         with open(outputs[index], "a") as output_handle:
+             SeqIO.write(sequences, output_handle, "fasta")
+
+
+ def output_files(predictions, summary, files):
+     """Output all files"""
+     # To avoid that sequences get appended to the merged output files after restart,
+     # make sure the files get deleted if they exist
+     logging.info("Removing output files if they exist.")
+     for file_ in files["merged"]:
+         if os.path.exists(file_):
+             logging.info(f"Removing {file_}")
+             os.remove(file_)
+
+     for caller in predictions:
+         if caller == "fgs":
+             output_fgs(predictions["fgs"], files["fgs"], files["merged"])
+         if caller == "prodigal":
+             output_prodigal(predictions["prodigal"], files["prodigal"], files["merged"])
+
+     with open(files["merged"][0], "w") as sf:
+         sf.write(json.dumps(summary, sort_keys=True, indent=4) + "\n")
+
+
+ def get_regions_fgs(fn):
+     """Parse FGS output.
+     Example:
+     # >Bifidobacterium-longum-subsp-infantis-MC2-contig1
+     # 256 2133 - 1 1.263995 I: D:
+     """
+     regions = {}
+     with open(fn) as f:
+         for line in f:
+             if line[0] == ">":
+                 id_ = line.split()[0][1:]
+                 regions[id_] = {}
+                 regions[id_]["+"] = []
+                 regions[id_]["-"] = []
+             else:
+                 r = line.split()  # start end strand
+                 s = int(r[0])
+                 e = int(r[1])
+                 regions[id_][r[2]].append(Region(s, e))
+     return regions
+
+
+ """
+ # noqa: E501
+ This is from cmsearch
+ ERR855786.1000054-HWI-M02024:111:000000000-A8H14:1:1115:23473:14586-1 - LSU_rRNA_bacteria RF02541 hmm 1224 1446 5 227 + - 6 0.61 0.8 135.2 2.8e-38 ! -
+ """
+
+
+ def get_regions_mask(mask_file):
+     """Parse masked region file (i.e. ncRNA)"""
+     regions = {}
+     with open(mask_file) as f:
+         for line in f:
+             if line[:1] == "#":
+                 continue
+             r = line.rstrip().split()
+             id_ = r[0]
+             start = int(r[7])
+             end = int(r[8])
+             if id_ not in regions:
+                 regions[id_] = []
+             if start > end:
+                 start, end = end, start
+             regions[id_].append(Region(start, end))
+     return regions
+
+
+ # # Sequence Data: seqnum=1;seqlen=25479;seqhdr="Bifidobacterium-longum-subsp-infantis-MC2-contig1"
+ # # Model Data: version=Prodigal.v2.6.3;run_type=Single;model="Ab initio";gc_cont=59.94;transl_table=11;uses_sd=1
+ # >1_1_279_+
+ def get_regions_prodigal(fn):
+     """Parse prodigal output"""
+     regions = {}
+     with open(fn) as f:
+         for line in f:
+             if line[:12] == "# Model Data":
+                 continue
+             if line[:15] == "# Sequence Data":
+                 m = re.search(r'seqhdr="(\S+)"', line)
+                 if m:
+                     id_ = m.group(1)
+                     regions[id_] = {}
+                     regions[id_]["+"] = []
+                     regions[id_]["-"] = []
+             else:
+                 r = line[1:].rstrip().split("_")
+                 n = int(
+                     r[0]
+                 )  # also store the index of the fragment - prodigal uses these (rather than coords) to identify sequences in the fasta output
+                 s = int(r[1])
+                 e = int(r[2])
+                 regions[id_][r[3]].append(NumberedRegion(s, e, n))
+     return regions
+
+
+ def mask_regions(regions, mask):
+     """Look for overlaps of more than 5 base pairs of the supplied regions against a set of masks
+     This is probably O(N^2) but, in theory, there shouldn't be many mask regions
+     """
+     new_regions = {}
+     for seq in regions:
+         new_regions[seq] = {}
+         for strand in ["-", "+"]:
+             new_regions[seq][strand] = []
+             for r in regions[seq][strand]:
+                 if seq in mask:
+                     overlap = 0
+                     for r2 in mask[seq]:
+                         if r.overlaps(r2) > 5:
+                             overlap = 1
+                     if not overlap:
+                         new_regions[seq][strand].append(r)
+                 else:
+                     new_regions[seq][strand].append(r)
+
+     return new_regions
+
+
+ # FIXME - This won't work if we have only a single set of predictions, but then
+ # there's no point in trying to merge
+ def merge_predictions(predictions, callers):
+     """Check that we have priorities set of for all callers we have data for"""
+     p = set(callers)
+     new_predictions = {}
+     for type_ in predictions:
+         if type_ not in p:
+             return None
+             # throw here? - if we've used a caller that we don't have a priority for
+
+     # first set of predictions takes priority - just transfer them
+     new_predictions[callers[0]] = predictions[callers[0]]
+
+     # for now assume only two callers, but can be extended
+     new_predictions[callers[1]] = {}  # empty set for second priority caller
+     for seq in predictions[callers[1]]:
+         new_predictions[callers[1]][seq] = {}
+         for strand in ["-", "+"]:
+             new_predictions[callers[1]][seq][strand] = []
+             if seq in predictions[callers[0]]:  # if this sequence already has predictions
+                 prev_predictions = flatten_regions(
+                     predictions[callers[0]][seq][strand]
+                 )  # non-overlapping set of existing predictions/regions
+                 new_predictions[callers[1]][seq][strand] = check_against_gaps(
+                     prev_predictions, predictions[callers[1]][seq][strand]
+                 )  # plug new predictions/regions into gaps
+             else:  # no existing predictions: just add them
+                 new_predictions[callers[1]][seq][strand] = predictions[callers[1]][seq][strand]
+
+     return new_predictions
+
+
+ def get_counts(predictions):
+     total = {}
+     for caller in predictions:
+         total[caller] = 0
+         for sample in predictions[caller]:
+             for strand in ["-", "+"]:
+                 total[caller] += len(predictions[caller][sample][strand])
+     return total
+
+
+ def combine_main():
+     parser = argparse.ArgumentParser(
+         "MGnify gene caller combiner. This script will merge the gene called by prodigal and fraggenescan (in any order)"
+     )
+     parser.add_argument("-n", "--name", action="store", dest="name", required=True, help="basename")
+     parser.add_argument("-k", "--mask", action="store", dest="mask", required=False, help="Sequence mask file")
+
+     parser.add_argument("-a", "--prodigal-out", action="store", dest="prodigal_out", required=False, help="Stats out prodigal")
+     parser.add_argument("-b", "--prodigal-ffn", action="store", dest="prodigal_ffn", required=False, help="Stats ffn prodigal")
+     parser.add_argument("-c", "--prodigal-faa", action="store", dest="prodigal_faa", required=False, help="Stats faa prodigal")
+
+     parser.add_argument("-d", "--fgs-out", action="store", dest="fgs_out", required=False, help="Stats out FGS")
+     parser.add_argument("-e", "--fgs-ffn", action="store", dest="fgs_ffn", required=False, help="Stats ffn FGS")
+     parser.add_argument("-f", "--fgs-faa", action="store", dest="fgs_faa", required=False, help="Stats faa FGS")
+
+     parser.add_argument(
+         "-p",
+         "--caller-priority",
+         action="store",
+         dest="caller_priority",
+         required=False,
+         choices=["prodigal_fgs", "fgs_prodigal"],
+         default="prodigal_fgs",
+         help="Caller priority.",
+     )
+
+     parser.add_argument("-v", "--verbose", help="verbose output", dest="verbose", action="count", required=False)
+
+     parser.add_argument("--version", action="version", version=f"{__version__}")
+
+     args = parser.parse_args()
+
+     # Set up logging system
+     verbose_mode = args.verbose or 0
+
+     log_level = logging.WARNING
+     if verbose_mode:
+         log_level = logging.DEBUG if verbose_mode > 1 else logging.INFO
+
+     logging.basicConfig(level=log_level, format="%(levelname)s %(asctime)s - %(message)s", datefmt="%Y/%m/%d %I:%M:%S %p")
+
+     summary = {}
+     all_predictions = {}
+     files = {}
+     caller_priority = []
+     if args.caller_priority:
+         caller_priority = args.caller_priority.split("_")
+     else:
+         caller_priority = ["prodigal", "fgs"]
+
+     logging.info(f"Caller priority: 1. {caller_priority[0]}, 2. {caller_priority[1]}")
+
+     if args.prodigal_out:
+         logging.info("Prodigal presented")
+         logging.info("Getting Prodigal regions...")
+         all_predictions["prodigal"] = get_regions_prodigal(args.prodigal_out)
+
+         files["prodigal"] = [args.prodigal_out, args.prodigal_ffn, args.prodigal_faa]
+
+     if args.fgs_out:
+         logging.info("FGS presented")
+         logging.info("Getting FragGeneScan regions ...")
+         all_predictions["fgs"] = get_regions_fgs(args.fgs_out)
+
+         files["fgs"] = [args.fgs_out, args.fgs_ffn, args.fgs_faa]
+
+     summary["all"] = get_counts(all_predictions)
+
+     # Apply mask of ncRNA search
+     logging.info("Masking non coding RNA regions...")
+     if args.mask:
+         logging.info("Reading regions for masking...")
+         mask = get_regions_mask(args.mask)
+         if "prodigal" in all_predictions:
+             logging.info("Masking Prodigal outputs...")
+             all_predictions["prodigal"] = mask_regions(all_predictions["prodigal"], mask)
+         if "fgs" in all_predictions:
+             logging.info("Masking FragGeneScan outputs...")
+             all_predictions["fgs"] = mask_regions(all_predictions["fgs"], mask)
+     summary["masked"] = get_counts(all_predictions)
+
+     # Run the merging step
+     if len(all_predictions) > 1:
+         logging.info("Merging combined gene caller results...")
+         merged_predictions = merge_predictions(all_predictions, caller_priority)
+     else:
+         logging.info("Skipping merging step...")
+         merged_predictions = all_predictions
+     summary["merged"] = get_counts(merged_predictions)
+
+     # Output fasta files and summary (json)
+     logging.info("Writing output files...")
+
+     files["merged"] = [args.name + ext for ext in [".out", ".ffn", ".faa"]]
+
+     output_files(merged_predictions, summary, files)
+
+
+ if __name__ == "__main__":
+     combine_main()
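
For orientation, a minimal illustrative sketch of the region-merging primitives in this file (hypothetical coordinates; it assumes the module is importable as mgnify_pipelines_toolkit.analysis.assembly.cgc_merge once the wheel is installed): overlapping first-priority predictions are flattened into blocks, and second-priority candidates are only kept where they fall into the remaining gaps.

# Illustrative sketch only, not part of the released wheel.
from mgnify_pipelines_toolkit.analysis.assembly.cgc_merge import (
    Region,
    flatten_regions,
    check_against_gaps,
)

# First-priority caller (e.g. Prodigal) predictions on one strand of a contig
priority = [Region(100, 400), Region(350, 600), Region(900, 1200)]

# Overlapping predictions collapse into non-overlapping blocks
blocks = flatten_regions(priority)
print([str(b) for b in blocks])  # ['[100,600]', '[900,1200]']

# Second-priority candidates (e.g. FragGeneScan) are only kept in the gaps
candidates = [Region(450, 550), Region(650, 850), Region(1300, 1500)]
kept = check_against_gaps(blocks, candidates)
print([str(k) for k in kept])  # ['[650,850]', '[1300,1500]']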
@@ -0,0 +1,117 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the 'License');
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an 'AS IS' BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ import os
+ import logging
+
+ from mgnify_pipelines_toolkit.analysis.assembly.go_utils import parse_interproscan_tsv
+
+
+ def parse_args():
+
+     description = "Go slim pipeline for processing InterProScan results"
+     parser = argparse.ArgumentParser(description=description)
+     parser.add_argument(
+         "-i", "--ips_input", help="InterProScan result file", required=True
+     )
+     parser.add_argument("-o", "--output", help="GO summary output file", required=True)
+     args = parser.parse_args()
+
+     ips_input = args.ips_input
+     output = args.output
+
+     return ips_input, output
+
+
+ # Constants
+ PROJECT_NAME = "EBI Metagenomics"
+ PROJECT_URL = "http://www.ebi.ac.uk/metagenomics"
+ PROJECT_CONTACT = "metagenomics-help@ebi.ac.uk"
+ FIXED_TIMESTAMP = "20160528"  # What is this timestamp?
+
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s: %(message)s"
+ )
+
+
+ def write_gaf_file(gaf_input_file_path: str, go_id_set: set[str]) -> None:
+     """
+     Create a GO Annotation File (GAF) from a set of GO IDs.
+
+     :param gaf_input_file_path: Path to output GAF file
+     :param go_id_set: Set of GO IDs to include in the file
+     """
+     with open(gaf_input_file_path, "w") as fw:
+         # Write GAF header
+         fw.write("!gaf-version: 2.1\n")
+         fw.write(f"!Project_name: {PROJECT_NAME}\n")
+         fw.write(f"!URL: {PROJECT_URL}\n")
+         fw.write(f"!Contact Email: {PROJECT_CONTACT}\n")
+
+         # Write GO entries
+         for go_id in go_id_set:
+             gaf_entry = "\t".join(
+                 [
+                     "EMG",
+                     go_id,
+                     "GO",
+                     "",
+                     go_id,
+                     "PMID:12069591",
+                     "IEA",
+                     "",
+                     "P",
+                     "",
+                     "",
+                     "protein",
+                     "taxon:1310605",
+                     FIXED_TIMESTAMP,
+                     "InterPro",
+                     "",
+                 ]
+             )
+             fw.write(gaf_entry + "\n")
+
+     logging.info(f"GAF file created successfully: {gaf_input_file_path}")
+
+
+ def main():
+     """
+     Process the InterProScan TSV output and generate a GO annotation file (GAF)).
+     """
+
+     ips_input, output = parse_args()
+
+     # Validate input file
+     if not os.path.exists(ips_input):
+         raise FileNotFoundError(f"Input file not found: {ips_input}")
+
+     if os.path.getsize(ips_input) == 0:
+         logging.warning("Input file is empty. Skipping processing.")
+         return
+
+     # Parse InterProScan result file
+     logging.info(f"Parsing InterProScan input: {ips_input}")
+     go2protein_count_dict = parse_interproscan_tsv(ips_input)
+     logging.info("Finished parsing InterProScan file")
+
+     logging.info("Writing the GAF file")
+     write_gaf_file(f"{output}_ips_annotations.gaf", go2protein_count_dict.keys())
+
+
+ if __name__ == "__main__":
+     main()
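
As an illustration of the GAF writer above, a minimal sketch (hypothetical GO IDs and output path; assumes the module is importable as mgnify_pipelines_toolkit.analysis.assembly.generate_gaf once the wheel is installed):

# Illustrative sketch only, not part of the released wheel.
from mgnify_pipelines_toolkit.analysis.assembly.generate_gaf import write_gaf_file

# Each GO ID becomes one tab-separated GAF 2.1 row (database "EMG", evidence
# code "IEA"), written after the "!gaf-version: 2.1" header block.
write_gaf_file("example_ips_annotations.gaf", {"GO:0003824", "GO:0008152"})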
@@ -0,0 +1,135 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the 'License');
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an 'AS IS' BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from collections import defaultdict
+ import logging
+ import os
+ from pathlib import Path
+ import re
+
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s: %(message)s"
+ )
+
+
+ def count_and_assign_go_annotations(
+     go2protein_count: defaultdict[int],
+     go_annotations: set[str],
+     num_of_proteins: int,
+     mapped_go_terms: defaultdict[set] = None,
+ ) -> defaultdict[int]:
+     """Increments counts dictionary for GO terms found on a protein.
+     If used for GO-slim terms, then a mapped_go_terms dictionary is required
+     (with default value of None).
+     :param go2protein_count: Current state of the count dictionary
+     :type go2protein_count: defaultdict[int]
+     :param go_annotations: GO-terms to be incremented
+     :type go_annotations: set[str]
+     :param num_of_proteins: Number of proteins to be incremented (not sure if we need this, see TODO below)
+     :type num_of_proteins: int
+     :param mapped_go_terms: Dictionary containin the GO-slim conversion
+     :type mapped_go_terms: defaultdict(set)
+     :return: _description_
+     :rtype: _type_
+     """
+
+     if not mapped_go_terms:
+         for go_id in go_annotations:
+             go2protein_count[go_id] += num_of_proteins
+     else:
+         slim_go_ids_set = set()
+         for go_annotation in go_annotations:
+             mapped_go_ids = mapped_go_terms.get(go_annotation)
+             if mapped_go_ids:
+                 slim_go_ids_set.update(mapped_go_ids)
+         for slim_go_id in slim_go_ids_set:
+             go2protein_count[slim_go_id] += num_of_proteins
+
+     return go2protein_count
+
+
+ def parse_interproscan_tsv(ips_file: Path, mapped_go_terms: dict = None) -> dict:
+     """Parses an InterProScan output line by line and return a dictionary of counts for the different GO terms.
+     The structure of the IPS file is one annotation per line, some of which will be GO terms. If a protein
+     has multiple annotations, then those annotations will follow one by one in order. This function therefore
+     parses the file by keeping some flags to track which proteins it's currently on, and which GO terms were found
+     for said protein. It then finally increments the count of said protein's GO terms when it's done being parsed.
+     :param ips_file: InterProScan .tsv file
+     :type ips_file: Path
+     :return: Dictionary containing GO term counts in the input InterProScan file
+     :rtype: dict
+     """
+
+     go2protein_count = defaultdict(int)
+     if not os.path.exists(ips_file):
+         logging.error(f"The InterProScan file {ips_file} could not be found. Exiting.")
+         exit(1)
+
+     num_of_proteins_with_go = 0
+     total_num_of_proteins = 0
+     line_counter = 0
+     previous_protein_acc = None
+     go_annotations_single_protein = set()
+
+     fr = open(ips_file, "r")
+     go_pattern = re.compile("GO:\\d+")
+
+     for line in fr:
+         # IPS files are parsed line by line - the same protein accession will appear multiple lines in a row with different annotation
+         line_counter += 1
+         line = line.strip()
+         chunks = line.split("\t")
+         # Get protein accession
+         current_protein_acc = chunks[0]
+
+         # TODO: not sure if this line is needed - do we ever have more than one protein in a single line of IPS?
+         # Will keep just in case
+         num_of_proteins = len(current_protein_acc.split("|"))
+
+         # If we're at a new protein accession in the IPS file then we finally increment
+         # the go2protein_count dictionary for each term that was found in that protein
+         if current_protein_acc != previous_protein_acc:
+             total_num_of_proteins += 1
+             if len(go_annotations_single_protein) > 0:
+                 num_of_proteins_with_go += 1
+                 go2protein_count = count_and_assign_go_annotations(
+                     go2protein_count,
+                     go_annotations_single_protein,
+                     num_of_proteins,
+                     mapped_go_terms,
+                 )
+             # reset GO id set because we hit a new protein accession
+             go_annotations_single_protein = set()
+             previous_protein_acc = current_protein_acc
+
+         # Parse out GO annotations
+         # GO annotations are associated to InterPro entries (InterPro entries start with 'IPR')
+         # Than use the regex to extract the GO Ids (e.g. GO:0009842)
+         if len(chunks) >= 13 and chunks[11].startswith("IPR"):
+             for go_annotation in go_pattern.findall(line):
+                 go_annotations_single_protein.add(go_annotation)
+
+     # Do final counting for the last protein
+     go2protein_count = count_and_assign_go_annotations(
+         go2protein_count,
+         go_annotations_single_protein,
+         num_of_proteins,
+         mapped_go_terms,
+     )
+
+     fr.close()
+
+     return go2protein_count
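
A minimal sketch of the parser above on a toy InterProScan TSV (hypothetical accessions and values; only columns 0, 11 and 13 — protein accession, InterPro entry and GO terms — matter to the parser; assumes the module is importable as shown once the wheel is installed):

# Illustrative sketch only, not part of the released wheel.
import os
import tempfile

from mgnify_pipelines_toolkit.analysis.assembly.go_utils import parse_interproscan_tsv

rows = [
    # protein acc, md5, length, analysis, signature, desc, start, stop, score,
    # status, date, InterPro acc, InterPro desc, GO terms
    ["prot_1", "md5", "300", "Pfam", "PF00001", "desc", "1", "300", "1e-10",
     "T", "01-01-2024", "IPR000001", "entry desc", "GO:0003824|GO:0008152"],
    ["prot_2", "md5", "150", "Pfam", "PF00002", "desc", "1", "150", "1e-08",
     "T", "01-01-2024", "IPR000002", "entry desc", "GO:0008152"],
]

with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False) as tsv:
    tsv.write("\n".join("\t".join(row) for row in rows) + "\n")

counts = parse_interproscan_tsv(tsv.name)
print(sorted(counts.items()))  # [('GO:0003824', 1), ('GO:0008152', 2)]
os.remove(tsv.name)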
@@ -0,0 +1,181 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the 'License');
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an 'AS IS' BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ from collections import defaultdict
+ import logging
+ import os
+ from pathlib import Path
+
+ from mgnify_pipelines_toolkit.analysis.assembly.go_utils import parse_interproscan_tsv
+
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s: %(message)s"
+ )
+
+
+ def parse_args():
+
+     description = "Go slim pipeline."
+     parser = argparse.ArgumentParser(description=description)
+     parser.add_argument(
+         "-go", "--go_obo", help="Gene Ontology basic file.", required=True
+     )
+     parser.add_argument(
+         "-gb", "--go_banding", help="Subset GO banding file.", required=True
+     )
+     parser.add_argument(
+         "-gaf",
+         "--gaf_input",
+         help="GAF file, generated by generate_gaf.py",
+         required=True,
+     )
+     parser.add_argument(
+         "-i", "--ips_input", help="InterProScan result file.", required=True
+     )
+     parser.add_argument("-o", "--output", help="GO summary output file.", required=True)
+     args = parser.parse_args()
+
+     go_obo = args.go_obo
+     go_banding = args.go_banding
+     gaf_input = args.gaf_input
+     ips_input = args.ips_input
+     output = args.output
+
+     return go_obo, go_banding, gaf_input, ips_input, output
+
+
+ def parse_mapped_gaf_file(gaf_file: Path) -> defaultdict[set]:
+
+     mapped_go_dict = defaultdict(set)
+     if os.path.exists(gaf_file):
+         handle = open(gaf_file, "r")
+         for line in handle:
+             if not line.startswith("!"):
+                 line = line.strip()
+                 splitted_line = line.split("\t")
+                 go_id = splitted_line[1]
+                 mapped_go_id = splitted_line[4]
+                 mapped_go_dict[go_id].add(mapped_go_id)
+
+     return mapped_go_dict
+
+
+ def get_go_slim_summary(go_slim_banding_file, goslims2_protein_count):
+     summary = []
+
+     fr = open(go_slim_banding_file, "r")
+
+     for line in fr:
+         if line.startswith("GO"):
+             line = line.strip()
+             line_chunks = line.split("\t")
+             go_id = line_chunks[0]
+             term = line_chunks[1]
+             category = line_chunks[2]
+             # Default value for the count
+             count = 0
+             if go_id in goslims2_protein_count:
+                 count = goslims2_protein_count[go_id]
+             summary.append((go_id, term, category, count))
+     return summary
+
+
+ def write_go_summary_to_file(go_summary, output_file):
+     fw = open(output_file, "w")
+     for go, term, category, count in go_summary:
+         fw.write('","'.join(['"' + go, term, category, str(count) + '"']) + "\n")
+     fw.close()
+
+
+ def parse_gene_ontology(obo_file):
+     """
+     Parses OBO formatted file.
+     :param obo_file:
+     :return:
+     """
+     go_term_tuples = []
+     fr = open(obo_file, "r")
+     id, term, category = "", "", ""
+     for line in fr:
+         line = line.strip()
+         split_line = line.split(": ")
+         if line.startswith("id:"):
+             id = split_line[1]
+         elif line.startswith("name:"):
+             term = split_line[1]
+         elif line.startswith("namespace"):
+             category = split_line[1]
+         else:
+             if id.startswith("GO:") and id and term and category:
+                 item = (id, term, category)
+                 go_term_tuples.append(item)
+                 id, term, category = "", "", ""
+     fr.close()
+     return go_term_tuples
+
+
+ def get_full_go_summary(core_gene_ontology, go2protein_count_dict, top_level_go_ids):
+     summary = []
+
+     for go_id, term, category in core_gene_ontology:
+
+         if (go_id in go2protein_count_dict) and (
+             go_id not in top_level_go_ids
+         ):  # make sure that top level terms are not included (they tell you nothing!)
+             count = go2protein_count_dict[go_id]
+             summary.append((go_id, term, category, count))
+     summary.sort(key=lambda x: (x[2], -x[3]))
+     return summary
+
+
+ def main():
+
+     go_obo, go_banding, gaf_input, ips_input, output = parse_args()
+
+     logging.info("Parsing the InterProScan input: " + ips_input)
+     go2protein_count_dict = parse_interproscan_tsv(ips_input)
+     logging.info("Finished parsing.")
+
+     # Generate GO summary
+     logging.info("Loading full Gene ontology: " + go_obo)
+     go_term_tuples = parse_gene_ontology(go_obo)
+     logging.info("Finished loading.")
+
+     logging.info("Generating full GO summary...")
+     top_level_go_ids = ["GO:0008150", "GO:0003674", "GO:0005575"]
+     full_go_summary = get_full_go_summary(
+         go_term_tuples, go2protein_count_dict, top_level_go_ids
+     )
+     logging.info("Finished generation.")
+
+     logging.info("Writing full GO summary: " + output)
+     write_go_summary_to_file(full_go_summary, output)
+     logging.info("Finished writing.")
+
+     mapped_go_terms = parse_mapped_gaf_file(gaf_input)
+     logging.info("Getting GO slim counts")
+     goslims2_protein_count = parse_interproscan_tsv(ips_input, mapped_go_terms)
+
+     go_slim_summary = get_go_slim_summary(go_banding, goslims2_protein_count)
+     go_slim_output_file = output + "_slim"
+     logging.info("Writing GO slim summary: " + go_slim_output_file)
+     write_go_summary_to_file(go_slim_summary, go_slim_output_file)
+     logging.info("Finished writing.")
+
+
+ if __name__ == "__main__":
+     main()
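
And a minimal sketch of the summary writer above (hypothetical GO term and file name; assumes the module is importable as shown once the wheel is installed), showing the quoted, comma-separated row format produced for both the full and the GO-slim summaries:

# Illustrative sketch only, not part of the released wheel.
from mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims import write_go_summary_to_file

summary = [("GO:0008152", "metabolic process", "biological_process", 42)]
write_go_summary_to_file(summary, "example_go_summary.csv")
# example_go_summary.csv now contains:
# "GO:0008152","metabolic process","biological_process","42"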
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: mgnify_pipelines_toolkit
- Version: 0.1.9
+ Version: 0.2.0
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -14,6 +14,10 @@ mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=19NgCYE
  mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=RDPsaWKf0wIDwvCHXyRCh2zSJf3y9E7uOhHjaAeX8bY,11099
  mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py,sha256=69iK8vtG5xFgYQ-KJiSQlaxuhSoxzcO59eNLyDS3nm0,4323
  mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=OODl3XhLvksvG5RZn1iHZlg9L3DXiWIkyxJ6o-y6oeg,6949
+ mgnify_pipelines_toolkit/analysis/assembly/cgc_merge.py,sha256=u6r_1GRGgBAJQvU_t5Rtl3ZYjTtGJGd5yHCobtL9ob0,15405
+ mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py,sha256=U1Ls3O0CQmukmoyUwEAEN11jHUKuCdS-qVkr5ai243I,3582
+ mgnify_pipelines_toolkit/analysis/assembly/go_utils.py,sha256=vsYaFJ_cmbo6DXlWs_X8wpZJfMQOq1CrLX4-3owmYjI,5447
+ mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=RthgLO3YTO_JGMC7Nx2JDrowXRimnOtVUDkM1l31rt4,5834
  mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py,sha256=H5ccd1e_e5dk8vhVOvHLK1lknYbRPbnqPjULCYnU0FQ,4021
  mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=xl5HduWtGPWiI9yqsjQ3itIzwHSxF2ig5KgjLXmj9EE,4772
@@ -31,9 +35,9 @@ mgnify_pipelines_toolkit/schemas/schemas.py,sha256=fd2xCoA1Ty-XaMG9U_gxNcBokHiYE
  mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=GbNT7clHso21w_1PbPpWKVRd5bNs_MDbGXt8XVIGl2o,3991
  mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=zsQ4TuR4vpqYa67MgIdopdscsS0DVJdy4enRe1nCjSs,793
- mgnify_pipelines_toolkit-0.1.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- mgnify_pipelines_toolkit-0.1.9.dist-info/METADATA,sha256=JttpU3vw2IbWoETCweOs6T5guAH_ip3aiEQ2HhJmGVo,6068
- mgnify_pipelines_toolkit-0.1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- mgnify_pipelines_toolkit-0.1.9.dist-info/entry_points.txt,sha256=SHRTFOo4hHWv3ORwhjnKQY554et4dXS1vVwUVmfKFHM,1844
- mgnify_pipelines_toolkit-0.1.9.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
- mgnify_pipelines_toolkit-0.1.9.dist-info/RECORD,,
+ mgnify_pipelines_toolkit-0.2.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ mgnify_pipelines_toolkit-0.2.0.dist-info/METADATA,sha256=TR0FyKtC0Xyj0zvDCPiYsI6bGbZI9GkQ8fiC1WWomEk,6068
+ mgnify_pipelines_toolkit-0.2.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ mgnify_pipelines_toolkit-0.2.0.dist-info/entry_points.txt,sha256=60Nov738JAon-uZXUqqjOGy4TXxgS4xtxqYhAi12HY0,2084
+ mgnify_pipelines_toolkit-0.2.0.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+ mgnify_pipelines_toolkit-0.2.0.dist-info/RECORD,,
@@ -3,10 +3,12 @@ add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_
  are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
  assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
  assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
+ cgc_merge = mgnify_pipelines_toolkit.analysis.assembly.cgc_merge:combine_main
  classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
  fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
  fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
  find_mcp_inflection_points = mgnify_pipelines_toolkit.analysis.amplicon.find_mcp_inflection_points:main
+ generate_gaf = mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main
  get_mpt_version = mgnify_pipelines_toolkit.utils.get_mpt_version:main
  get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
  get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
@@ -19,3 +21,4 @@ remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambig
  rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
  standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
  study_summary_generator = mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:main
+ summarise_goslims = mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims:main