mgnify-pipelines-toolkit 1.1.2__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

@@ -1,214 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- import argparse
18
-
19
- from Bio.Seq import Seq
20
- import numpy as np
21
- import pandas as pd
22
-
23
- from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
24
- get_read_count,
25
- build_cons_seq,
26
- build_mcp_cons_dict_list,
27
- fetch_mcp,
28
- )
29
- from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT
30
-
31
-
32
def parse_args():
    """Parse command-line arguments for the inflection-point selection step.

    Returns:
        tuple: (fastq path, inflection points file path, sample ID, output path).
    """

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-i",
        "--input",
        required=True,
        type=str,
        help="Path to fastq file to choose inflection point",
    )
    parser.add_argument(
        "-p", "--points", required=True, type=str, help="Path to inflection points file"
    )
    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
    parser.add_argument("-o", "--output", required=True, type=str, help="Output path")

    args = parser.parse_args()

    # Return the parsed values directly rather than via intermediate locals.
    return args.input, args.points, args.sample, args.output
57
-
58
-
59
def assess_inflection_point_mcp_for_sample(path: str, inf_point_list: list, rev: bool = False):
    """
    Assess inflection point list, selecting one for automatic primer trimming.

    Takes as input a fastq file and a list of inflection points generated by "find_mcp_inflection_points.py".
    Computes the average conservation of mcp before inflection point and after.
    Gets the difference in avg. conservation between the pre- and post- points.
    Selects the inflection point with the maximum difference as the cutoff.
    If an inf point has a similar difference and is earlier than the max, we make a 'conservative' choice and
    replace it with the earlier cutoff.

    :param path: Path to the fastq file to assess.
    :param inf_point_list: Candidate inflection point indices (1-based read positions).
    :param rev: If True, assess the reverse strand and complement the predicted primer.

    Returns the cutoff point and the consensus sequence 'forming' the automatically predicted primer
    """

    # TODO error handle for empty inflection point list (an empty list would
    # raise IndexError further down when taking diff_res_sorted[0])

    start_confs = []  # pre-inf point conservations
    end_confs = []  # post-inf point conservations
    start_cons_lens = []  # list for storing lengths of pre-inflection point sequences
    cons_seq_list = []  # list for storing consensus sequences pre-inflection points

    do_not_include_list = [
        i + 5 for i in inf_point_list
    ]  # ignore conservation of inflection point in calculation

    read_count = get_read_count(path)  # get readcount from fastq

    # Cap the number of processed lines for very large files
    max_line_count = None
    if read_count > MCP_MAX_LINE_COUNT:
        max_line_count = MCP_MAX_LINE_COUNT

    n_prop = 0.8  # minimum base proportion threshold passed to build_cons_seq

    for start in inf_point_list:  # Looping through the pre-inflection point mcps
        mcp_len = start + 4  # length of pre-inf mcps is inflection point + 4

        mcp_count_dict = fetch_mcp(
            path, mcp_len, rev=rev, max_line_count=max_line_count
        )  # get MCP count dict
        mcp_cons_list = build_mcp_cons_dict_list(
            mcp_count_dict, mcp_len
        )  # list of base conservation dicts for mcps
        cons_seq, cons_confs = build_cons_seq(
            mcp_cons_list,
            read_count,
            n_prop,
            do_not_include_list,
            max_line_count=max_line_count,
        )  # get list of max base conservations for each index
        # also get consensus sequence
        cons_seq_list.append(cons_seq)
        start_confs.append(np.mean(cons_confs))
        start_cons_lens.append(len(cons_seq))

    for i, end in enumerate(
        inf_point_list
    ):  # Looping through the post-inflection point mcps
        mcp_len = end + 5  # start of post-inf mcps is inflection point + 5
        subs_len = start_cons_lens[i]  # length of respective pre-inf point sequence
        l = mcp_len + subs_len - 1  # final index of MCP (NOTE: poor name, shadows-prone)

        mcp_count_dict = fetch_mcp(
            path, l, mcp_len, rev=rev, max_line_count=max_line_count
        )
        mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, subs_len)
        cons_seq, cons_confs = build_cons_seq(
            mcp_cons_list,
            read_count,
            n_prop,
            do_not_include_list,
            subs_len,
            max_line_count=max_line_count,
        )

        end_confs.append(np.mean(cons_confs))

    diff_res = [
        start_confs[i] - end_confs[i] for i in range(len(start_confs))
    ]  # get differences between pre- and -post avg conservation values
    diff_res_sorted = sorted(
        diff_res, reverse=True
    )  # sort differences from highest to lowest

    ini_max_res = diff_res_sorted[0]  # maximum differences
    curr_max_index = diff_res.index(ini_max_res)  # index of maximum differences

    for res in diff_res_sorted[1:]:  # Loop through the rest of the differences
        # NOTE: diff_res is a Python list; the comparison relies on numpy
        # broadcasting (res is a numpy scalar). Duplicate values resolve to the
        # first matching index.
        curr_res_index = np.where(diff_res == res)[0][0]

        index_diff = inf_point_list[curr_max_index] - inf_point_list[curr_res_index]

        # if difference between the max and the current is negligible and the index of the current is earlier then..
        if ini_max_res - res < 0.05 and (index_diff <= 3 and index_diff > 0):
            curr_max_index = (
                curr_res_index  # replace the selected index with the current one
            )

    cutoff = (
        inf_point_list[curr_max_index] + 5
    )  # cutoff is the inflection point index + 5
    primer = cons_seq_list[
        curr_max_index
    ]  # grab the correct consensus sequence as primer

    # if the requested strand is reverse..
    if rev:
        primer = str(
            Seq(primer).complement()
        )  # ..get the complement of consensus sequence

    return cutoff, primer
170
-
171
-
172
def main():
    """Entry point: select primer cutoff(s) for a sample and write the cutoff
    text file plus a fasta of the auto-predicted primer(s)."""
    path, points, sample, output = parse_args()
    inf_df = pd.read_csv(points, sep="\t")

    fwd_slice = inf_df[inf_df.strand == "F"]  # forward inflection points
    rev_slice = inf_df[inf_df.strand == "R"].reset_index(drop=True)  # reverse ones

    # Empty string means "strand not assessed"
    f_cutoff, f_primer = "", ""
    r_cutoff, r_primer = "", ""

    # Assess and select a cutoff for each strand that has inflection points
    if not fwd_slice.empty:
        f_cutoff, f_primer = assess_inflection_point_mcp_for_sample(
            path, fwd_slice.inf_point.tolist()
        )

    if not rev_slice.empty:
        r_cutoff, r_primer = assess_inflection_point_mcp_for_sample(
            path, rev_slice.inf_point.tolist(), rev=True
        )

    # Output cutoff point(s) to .txt file
    with open(f"{output}/{sample}_cutoff.txt", "w") as fw:
        if f_cutoff != "":
            fw.write(f"F: {f_cutoff}\n")
        if r_cutoff != "":
            fw.write(f"R: {r_cutoff}\n")

    # Output consensus primer sequence(s) to .fasta file
    with open(f"{output}/{sample}_auto_primers.fasta", "w") as fw:
        if f_cutoff != "":
            fw.write(f">F_auto\n{f_primer}\n")
        if r_cutoff != "":
            fw.write(f">R_auto\n{r_primer}\n")


if __name__ == "__main__":
    main()
@@ -1,175 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- import argparse
18
- from collections import defaultdict
19
-
20
- import pandas as pd
21
- import numpy as np
22
-
23
- from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
24
- get_read_count,
25
- build_cons_seq,
26
- build_mcp_cons_dict_list,
27
- fetch_mcp,
28
- )
29
- from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT
30
-
31
-
32
def parse_args():
    """Parse command-line arguments for the mcp-proportion assessment step.

    Returns:
        tuple: (fastq path, sample ID, strand selection, output path).
    """

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-i",
        "--input",
        required=True,
        type=str,
        help="Path to fastq file to assess mcps",
    )
    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
    parser.add_argument(
        "-st",
        "--strand",
        required=True,
        choices=["FR", "F", "R"],
        help="F: Forward, R: Reverse",
    )
    parser.add_argument("-o", "--output", required=True, type=str, help="Output path")

    args = parser.parse_args()

    # Return the parsed values directly rather than via intermediate locals.
    return args.input, args.sample, args.strand, args.output
61
-
62
-
63
def find_mcp_props_for_sample(path, rev=False):
    """
    Generate mcp proportions in a stepwise and windowed manner for a fastq file.

    For a continuous range of starting indices (2 to 24 inclusive), generate mcps
    of window size of 5 bases. Calculate the average conservation of the most
    common base at each index of a window. The resulting list of mcp conservations
    can be considered a conservation curve and used to identify inflection points
    where the conservation suddenly changes.

    :param path: Path to the fastq file to process.
    :param rev: If True, process the reverse strand.
    :return: Dict where:
        key -> an index starting point e.g. base 10
        val -> the average conservation of the most common base for the mcp
               window going from base 10 to 14 (inclusive)
    """

    res_dict = defaultdict(float)
    start_range = range(2, 25, 1)  # Range of starting indices

    print(f"Processing {path}")

    mcp_len = 5  # length of generated mcps

    # The read count (and the derived line cap) do not depend on the window
    # start index, so compute them once instead of once per loop iteration.
    read_count = get_read_count(path, file_type="fastq")  # get read count for fastq file

    max_line_count = None
    if read_count > MCP_MAX_LINE_COUNT:
        max_line_count = MCP_MAX_LINE_COUNT

    for start in start_range:

        # compute the final index for the mcp (inclusive). Indices are of base 1 not 0.
        end = start + mcp_len - 1

        mcp_count_dict = fetch_mcp(
            path, end, start, rev, max_line_count
        )  # get MCP count dict
        mcp_cons_list = build_mcp_cons_dict_list(
            mcp_count_dict, mcp_len
        )  # list of base conservation dicts for mcps
        cons_seq, cons_conf = build_cons_seq(
            mcp_cons_list, read_count, max_line_count=max_line_count
        )  # get list of max base conservations for each index

        res_dict[start] = np.mean(cons_conf)  # compute the mean

    return res_dict
111
-
112
-
113
def concat_out(fwd_out="", rev_out=""):
    """
    Generate Pandas dataframe out of mcp dictionary.

    Output looks like this (when both F and R are requested):
            2                   3                   4
    F       0.7814975041597337  0.8736772046589019  0.9434276206322796
    R       0.9010981697171381  0.9082861896838601  0.90369384359401

    Columns are the starting indices. Row labels are the strand.
    An empty-string default means "strand not requested".
    """

    total_res_dict = defaultdict(list)
    df_ind = []

    # Append the forward strand's values first, if it was requested
    if fwd_out != "":
        for key in fwd_out:
            total_res_dict[key].append(fwd_out[key])
        df_ind.append("F")

    # Then the reverse strand's, if it was requested
    if rev_out != "":
        for key in rev_out:
            total_res_dict[key].append(rev_out[key])
        df_ind.append("R")

    res_df = pd.DataFrame.from_dict(total_res_dict)
    res_df.index = df_ind

    return res_df
142
-
143
-
144
def main():
    """Entry point: compute mcp conservation curves for the requested strand(s)
    and save them as a tsv."""
    path, sample, strand, output = parse_args()

    res_df = ""

    # TODO: match-case statement is python 3.10>. We are currently locking the version
    # at version 3.9. The day we bump the version we should replace these if statements
    # with a match-case block.

    if strand == "FR":
        # Arguments evaluate left-to-right: forward strand first, then reverse.
        res_df = concat_out(
            find_mcp_props_for_sample(path),
            find_mcp_props_for_sample(path, rev=True),
        )
    elif strand == "F":
        res_df = concat_out(find_mcp_props_for_sample(path))
    elif strand == "R":
        res_df = concat_out(rev_out=find_mcp_props_for_sample(path, rev=True))
    else:
        # Unreachable in practice: argparse restricts --strand via choices.
        print(
            "Incorrect strand input. Should be F for forward, R for reverse, or FR for both."
        )
        exit(1)

    # Save resulting dataframe to a tsv file
    res_df.to_csv(f"{output}/{sample}_mcp_cons.tsv", sep="\t")


if __name__ == "__main__":
    main()
@@ -1,111 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- from collections import defaultdict
18
- import argparse
19
-
20
- import pandas as pd
21
- import numpy as np
22
-
23
-
24
def parse_args():
    """Parse command-line arguments for the inflection-point finding step.

    Returns:
        tuple: (mcp tsv path, sample ID, output path).
    """

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-i",
        "--input",
        required=True,
        type=str,
        help="Path to mcp tsv file to find inflection points",
    )
    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
    parser.add_argument("-o", "--output", required=True, type=str, help="Output path")

    args = parser.parse_args()

    # Return the parsed values directly rather than via intermediate locals.
    return args.input, args.sample, args.output
45
-
46
-
47
def find_mcp_inf_points(mcp_df):
    """
    Find inflection points from an mcp_df file output by "assess_mcp_proportions_MERGED.py"

    For each strand, negates the list of average mcp conservations and takes the
    derivative of the resulting curve. Any derivative value above its own 80th
    percentile is an inflection point candidate. Candidates outside the index
    range [10, 20] are discarded: 10 means a cutoff of 15 and 20 a cutoff of 25,
    and literature points to no primers existing that are shorter or bigger
    than these lengths.

    :param mcp_df: DataFrame whose columns are starting indices and whose row
        labels are strands.
    :return: defaultdict(list) with two parallel lists:
        'strand' -> strand list
        'inf_point' -> inf_point list
    """

    inf_point_dict = defaultdict(list)
    start_indices = [int(col) for col in mcp_df.columns.tolist()]

    # Loop through both possible strands of the mcp_df
    for row_pos in range(len(mcp_df)):
        strand = mcp_df.index[row_pos]
        negated = [-val for val in mcp_df.iloc[row_pos].tolist()]

        # Derivative of the (negated) conservation curve
        gradient = np.diff(negated) / np.diff(start_indices)
        # Keep only the points above the 80th percentile of the derivative
        candidates = np.where(gradient > np.percentile(gradient, 80))[0]

        for cand in candidates:
            point = start_indices[cand]

            # Rule to facilitate results - won't accept points below index 10
            # or above index 20 (see docstring for why).
            if point < 10 or point > 20:
                continue

            inf_point_dict["strand"].append(strand)
            inf_point_dict["inf_point"].append(point)

    return inf_point_dict
88
-
89
-
90
def main():
    """Entry point: read an mcp conservation tsv, find inflection points, and
    write them to a tsv (an empty file if none were found)."""
    path, sample, output = parse_args()

    mcp_df = pd.read_csv(path, sep="\t", index_col=0)  # Read mcp_df
    inf_point_dict = find_mcp_inf_points(mcp_df)  # Generate inflection points dict

    out_path = f"{output}/{sample}_inf_points.tsv"

    if len(inf_point_dict) > 0:  # If the inf_point_dict isn't empty..
        inf_point_df = pd.DataFrame.from_dict(
            inf_point_dict
        )  # .. turn it into a dataframe
        inf_point_df.to_csv(out_path, sep="\t", index=False)  # ..save it to a .tsv file
    else:
        # No inflection points found: create an empty file so downstream steps
        # still have an output to consume. Use a context manager instead of a
        # bare open()/close() pair so the handle is always released.
        with open(out_path, "w"):
            pass


if __name__ == "__main__":
    main()