mgnify-pipelines-toolkit 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mgnify-pipelines-toolkit might be problematic.
- mgnify_pipelines_toolkit/constants/thresholds.py +0 -4
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/METADATA +1 -2
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/RECORD +7 -14
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/entry_points.txt +0 -5
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -221
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -164
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -214
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -175
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -111
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -327
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -43
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py
@@ -1,214 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-# Copyright 2024-2025 EMBL - European Bioinformatics Institute
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-
-from Bio.Seq import Seq
-import numpy as np
-import pandas as pd
-
-from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
-    get_read_count,
-    build_cons_seq,
-    build_mcp_cons_dict_list,
-    fetch_mcp,
-)
-from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT
-
-
-def parse_args():
-
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "-i",
-        "--input",
-        required=True,
-        type=str,
-        help="Path to fastq file to choose inflection point",
-    )
-    parser.add_argument(
-        "-p", "--points", required=True, type=str, help="Path to inflection points file"
-    )
-    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
-    parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
-
-    args = parser.parse_args()
-
-    path = args.input
-    points = args.points
-    sample = args.sample
-    output = args.output
-
-    return path, points, sample, output
-
-
-def assess_inflection_point_mcp_for_sample(path, inf_point_list, rev=False):
-    """
-    Assess inflection point list, selecting one for automatic primer trimming.
-
-    Takes as input a fastq file and a list of inflection points generated by "find_mcp_inflection_points.py".
-    Computes the average conservation of mcp before inflection point and after.
-    Gets the difference in avg. conservation between the pre- and post- points.
-    Selects the inflection point with the maximum difference as the cutoff.
-    If an inf point has a similar difference and is earlier than the max, we make a 'conservative' choice and
-    replace it with the earlier cutoff.
-
-    Returns the cutoff point and the consensus sequence 'forming' the automatically predicted primer.
-    """
-
-    # TODO error handle for empty inflection point list
-
-    start_confs = []  # pre-inf point conservations
-    end_confs = []  # post-inf point conservations
-    start_cons_lens = []  # list for storing lengths of pre-inflection point sequences
-    cons_seq_list = []  # list for storing consensus sequences pre-inflection points
-
-    do_not_include_list = [
-        i + 5 for i in inf_point_list
-    ]  # ignore conservation of inflection point in calculation
-
-    read_count = get_read_count(path)  # get readcount from fastq
-
-    max_line_count = None
-    if read_count > MCP_MAX_LINE_COUNT:
-        max_line_count = MCP_MAX_LINE_COUNT
-
-    n_prop = 0.8
-
-    for start in inf_point_list:  # Looping through the pre-inflection point mcps
-        mcp_len = start + 4  # length of pre-inf mcps is inflection point + 4
-
-        mcp_count_dict = fetch_mcp(
-            path, mcp_len, rev=rev, max_line_count=max_line_count
-        )  # get MCP count dict
-        mcp_cons_list = build_mcp_cons_dict_list(
-            mcp_count_dict, mcp_len
-        )  # list of base conservation dicts for mcps
-        cons_seq, cons_confs = build_cons_seq(
-            mcp_cons_list,
-            read_count,
-            n_prop,
-            do_not_include_list,
-            max_line_count=max_line_count,
-        )  # get list of max base conservations for each index
-        # also get consensus sequence
-        cons_seq_list.append(cons_seq)
-        start_confs.append(np.mean(cons_confs))
-        start_cons_lens.append(len(cons_seq))
-
-    for i, end in enumerate(
-        inf_point_list
-    ):  # Looping through the post-inflection point mcps
-        mcp_len = end + 5  # length of post-inf mcps is inflection point + 5
-        subs_len = start_cons_lens[i]  # length of respective pre-inf point sequence
-        l = mcp_len + subs_len - 1  # final index of MCP
-
-        mcp_count_dict = fetch_mcp(
-            path, l, mcp_len, rev=rev, max_line_count=max_line_count
-        )
-        mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, subs_len)
-        cons_seq, cons_confs = build_cons_seq(
-            mcp_cons_list,
-            read_count,
-            n_prop,
-            do_not_include_list,
-            subs_len,
-            max_line_count=max_line_count,
-        )
-
-        end_confs.append(np.mean(cons_confs))
-
-    diff_res = [
-        start_confs[i] - end_confs[i] for i in range(len(start_confs))
-    ]  # get differences between pre- and post- avg conservation values
-    diff_res_sorted = sorted(
-        diff_res, reverse=True
-    )  # sort differences from highest to lowest
-
-    ini_max_res = diff_res_sorted[0]  # maximum difference
-    curr_max_index = diff_res.index(ini_max_res)  # index of maximum difference
-
-    for res in diff_res_sorted[1:]:  # Loop through the rest of the differences
-        curr_res_index = np.where(diff_res == res)[0][0]
-
-        index_diff = inf_point_list[curr_max_index] - inf_point_list[curr_res_index]
-
-        # if the difference between the max and the current is negligible and the index of the current is earlier then..
-        if ini_max_res - res < 0.05 and (index_diff <= 3 and index_diff > 0):
-            curr_max_index = (
-                curr_res_index  # replace the selected index with the current one
-            )
-
-    cutoff = (
-        inf_point_list[curr_max_index] + 5
-    )  # cutoff is the inflection point index + 5
-    primer = cons_seq_list[
-        curr_max_index
-    ]  # grab the correct consensus sequence as primer
-
-    # if the requested strand is reverse..
-    if rev:
-        primer = str(
-            Seq(primer).complement()
-        )  # ..get the complement of consensus sequence
-
-    return cutoff, primer
-
-
-def main():
-
-    path, points, sample, output = parse_args()
-    inf_df = pd.read_csv(points, sep="\t")
-
-    f_slice = inf_df[inf_df.strand == "F"]  # get forward inflection points
-    r_slice = inf_df[inf_df.strand == "R"]  # get reverse inflection points
-    r_slice = r_slice.reset_index(drop=True)
-
-    f_cutoff = ""
-    r_cutoff = ""
-    f_primer = ""
-    r_primer = ""
-
-    if not f_slice.empty:  # if there is a forward inflection point..
-        inf_list = f_slice.inf_point.tolist()
-        f_cutoff, f_primer = assess_inflection_point_mcp_for_sample(
-            path, inf_list
-        )  # .. assess and select
-
-    if not r_slice.empty:  # if there is a reverse inflection point..
-        inf_list = r_slice.inf_point.tolist()
-        r_cutoff, r_primer = assess_inflection_point_mcp_for_sample(
-            path, inf_list, rev=True
-        )  # .. assess and select
-
-    # Output cutoff point(s) to .txt file
-    with open(f"{output}/{sample}_cutoff.txt", "w") as fw:
-        if f_cutoff != "":
-            fw.write(f"F: {f_cutoff}\n")
-        if r_cutoff != "":
-            fw.write(f"R: {r_cutoff}\n")

-    # Output consensus primer sequence(s) to .fasta file
-    with open(f"{output}/{sample}_auto_primers.fasta", "w") as fw:
-        if f_cutoff != "":
-            fw.write(f">F_auto\n{f_primer}\n")
-        if r_cutoff != "":
-            fw.write(f">R_auto\n{r_primer}\n")
-
-
-if __name__ == "__main__":
-    main()
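For context, the selection rule deleted above fits in a few lines. The sketch below is hypothetical and self-contained (select_cutoff is not a toolkit function); it assumes the per-point drops in average conservation have already been computed, as assess_inflection_point_mcp_for_sample did via build_cons_seq:

def select_cutoff(inf_points, diffs, tolerance=0.05, max_shift=3):
    """Pick the inflection point with the largest conservation drop,
    preferring a slightly earlier point whose drop is nearly as large."""
    ranked = sorted(range(len(diffs)), key=lambda i: diffs[i], reverse=True)
    best = ranked[0]  # index of the maximum drop
    for i in ranked[1:]:
        shift = inf_points[best] - inf_points[i]  # positive if candidate is earlier
        # negligible difference in drop, and the candidate sits up to 3 bases earlier
        if diffs[ranked[0]] - diffs[i] < tolerance and 0 < shift <= max_shift:
            best = i  # make the 'conservative' earlier choice
    return inf_points[best] + 5  # cutoff = inflection point index + 5

print(select_cutoff([12, 14], [0.28, 0.30]))  # two close drops -> 17 (= 12 + 5)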
mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py
@@ -1,175 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-# Copyright 2024-2025 EMBL - European Bioinformatics Institute
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-from collections import defaultdict
-
-import pandas as pd
-import numpy as np
-
-from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
-    get_read_count,
-    build_cons_seq,
-    build_mcp_cons_dict_list,
-    fetch_mcp,
-)
-from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT
-
-
-def parse_args():
-
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "-i",
-        "--input",
-        required=True,
-        type=str,
-        help="Path to fastq file to assess mcps",
-    )
-    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
-    parser.add_argument(
-        "-st",
-        "--strand",
-        required=True,
-        choices=["FR", "F", "R"],
-        help="F: Forward, R: Reverse",
-    )
-    parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
-
-    args = parser.parse_args()
-
-    path = args.input
-    sample = args.sample
-    strand = args.strand
-    output = args.output
-
-    return path, sample, strand, output
-
-
-def find_mcp_props_for_sample(path, rev=False):
-    """
-    Generate mcp proportions in a stepwise and windowed manner for a fastq file.
-
-    For a continuous range of starting indices (2 to 25), generate mcps of window size of 5 bases.
-    Calculate the average conservation of the most common base at each index of a window.
-    The resulting list of mcp conservations can be considered a conservation curve and used to
-    identify inflection points where the conservation suddenly changes.
-
-    Output a dictionary where:
-        key -> an index starting point e.g. base 10
-        val -> the average conservation of the most common base for the mcp window going from base 10 to 15 (inclusive)
-    """
-
-    res_dict = defaultdict(float)
-    start_range = range(2, 25, 1)  # Range of starting indices
-
-    print(f"Processing {path}")
-
-    mcp_len = 5  # length of generated mcps
-
-    for start in start_range:
-
-        end = (
-            start + mcp_len - 1
-        )  # compute the final index for the mcp (inclusive). Indices are of base 1 not 0.
-
-        read_count = get_read_count(
-            path, file_type="fastq"
-        )  # get read count for fastq file
-
-        max_line_count = None
-        if read_count > MCP_MAX_LINE_COUNT:
-            max_line_count = MCP_MAX_LINE_COUNT
-
-        mcp_count_dict = fetch_mcp(
-            path, end, start, rev, max_line_count
-        )  # get MCP count dict
-        mcp_cons_list = build_mcp_cons_dict_list(
-            mcp_count_dict, mcp_len
-        )  # list of base conservation dicts for mcps
-        cons_seq, cons_conf = build_cons_seq(
-            mcp_cons_list, read_count, max_line_count=max_line_count
-        )  # get list of max base conservations for each index
-
-        res_dict[start] = np.mean(cons_conf)  # compute the mean
-
-    return res_dict
-
-
-def concat_out(fwd_out="", rev_out=""):
-    """
-    Generate Pandas dataframe out of mcp dictionary.
-
-    Output looks like this (when both F and R are requested):
-             2                   3                   4
-        F    0.7814975041597337  0.8736772046589019  0.9434276206322796
-        R    0.9010981697171381  0.9082861896838601  0.90369384359401
-
-    Columns are the starting indices. Row labels are the strand.
-    """
-
-    total_res_dict = defaultdict(list)
-    df_ind = []
-
-    # Check if fwd strand was requested
-    if fwd_out != "":
-        [total_res_dict[key].append(fwd_out[key]) for key in fwd_out.keys()]
-        df_ind.append("F")
-
-    # Check if rev strand was requested
-    if rev_out != "":
-        [total_res_dict[key].append(rev_out[key]) for key in rev_out.keys()]
-        df_ind.append("R")
-
-    res_df = pd.DataFrame.from_dict(total_res_dict)
-    res_df.index = df_ind
-
-    return res_df
-
-
-def main():
-
-    path, sample, strand, output = parse_args()
-
-    res_df = ""
-
-    # TODO: match-case statements require Python 3.10+. We are currently locking the
-    # version at 3.9. The day we bump the version we should replace these if statements
-    # with a match-case block.
-
-    if strand == "FR":
-        fwd_out = find_mcp_props_for_sample(path)
-        rev_out = find_mcp_props_for_sample(path, rev=True)
-        res_df = concat_out(fwd_out, rev_out)
-    elif strand == "F":
-        fwd_out = find_mcp_props_for_sample(path)
-        res_df = concat_out(fwd_out)
-    elif strand == "R":
-        rev_out = find_mcp_props_for_sample(path, rev=True)
-        res_df = concat_out(rev_out=rev_out)
-    else:
-        print(
-            "Incorrect strand input. Should be F for forward, R for reverse, or FR for both."
-        )
-        exit(1)
-
-    # Save resulting dataframe to a tsv file
-    res_df.to_csv(f"{output}/{sample}_mcp_cons.tsv", sep="\t")
-
-
-if __name__ == "__main__":
-    main()
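The core idea behind find_mcp_props_for_sample is simple despite the fastq plumbing: slide a 5-base window along the read starts and record how conserved the most common base is in each column. A minimal sketch, assuming reads are already in memory as plain strings (the real script streams fastq files through fetch_mcp, build_mcp_cons_dict_list and build_cons_seq from amplicon_utils; window_conservation below is a hypothetical helper):

from collections import Counter

def window_conservation(reads, start, length=5):
    """Mean proportion of the most common base per column in reads[start:start+length]."""
    windows = [r[start : start + length] for r in reads if len(r) >= start + length]
    # zip(*windows) yields one tuple of bases per column of the window
    props = [Counter(col).most_common(1)[0][1] / len(col) for col in zip(*windows)]
    return sum(props) / len(props)

reads = ["ACGTACGTAA", "ACGTACGTCC", "ACGTTCGTGA"]
# conservation curve over window start positions, as in the removed script
print({s: round(window_conservation(reads, s), 3) for s in range(0, 5)})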
mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py
@@ -1,111 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-# Copyright 2024-2025 EMBL - European Bioinformatics Institute
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from collections import defaultdict
-import argparse
-
-import pandas as pd
-import numpy as np
-
-
-def parse_args():
-
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "-i",
-        "--input",
-        required=True,
-        type=str,
-        help="Path to mcp tsv file to find inflection points",
-    )
-    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
-    parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
-
-    args = parser.parse_args()
-
-    path = args.input
-    sample = args.sample
-    output = args.output
-
-    return path, sample, output
-
-
-def find_mcp_inf_points(mcp_df):
-    """
-    Find inflection points from an mcp_df file output by "assess_mcp_proportions_MERGED.py"
-
-    Takes the list of average mcp conservations and gets the derivative of the curve.
-    Keeps any points of the derivative whose value is above the 80th percentile.
-
-    Outputs a dictionary with key-val pairs where vals are lists:
-        'strand' -> strand list
-        'inf_point' -> inf_point list
-    """
-
-    inf_point_dict = defaultdict(list)
-    start_indices = [int(i) for i in mcp_df.columns.tolist()]
-
-    for i in range(len(mcp_df)):  # Loop through both possible strands of the mcp_df
-        strand = mcp_df.index[i]
-        props = mcp_df.iloc[i].tolist()
-        props = [-val for val in props]
-
-        prop_diff = np.diff(props) / np.diff(start_indices)  # Get the derivative
-        infl_points = np.where(prop_diff > np.percentile(prop_diff, 80))[
-            0
-        ]  # Grab points above 80th percentile
-
-        for ind in infl_points:
-            inf_point = start_indices[ind]
-
-            if (
-                inf_point < 10 or inf_point > 20
-            ):  # Rule to facilitate results - won't accept
-                continue  # points below index 10 or above index 20
-                # 10 means a cutoff of 15 and 20 a cutoff of 25
-                # literature points to no primers existing that are
-                # shorter or longer than these lengths
-
-            inf_point_dict["strand"].append(strand)
-            inf_point_dict["inf_point"].append(inf_point)
-
-    return inf_point_dict
-
-
-def main():
-
-    path, sample, output = parse_args()
-
-    mcp_df = pd.read_csv(path, sep="\t", index_col=0)  # Read mcp_df
-    inf_point_dict = find_mcp_inf_points(mcp_df)  # Generate inflection points dict
-
-    if len(inf_point_dict) > 0:  # If the inf_point_dict isn't empty..
-        inf_point_df = pd.DataFrame.from_dict(
-            inf_point_dict
-        )  # .. turn it into a dataframe
-        inf_point_df.to_csv(
-            f"{output}/{sample}_inf_points.tsv", sep="\t", index=False
-        )  # ..save it to a .tsv file
-
-    else:  # If it is empty..
-        fw = open(f"{output}/{sample}_inf_points.tsv", "w")  # ..make an empty file
-        fw.close()
-
-
-if __name__ == "__main__":
-    main()
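Numerically, find_mcp_inf_points treats the conservation curve as a signal: negate it so that drops become rises, differentiate, and keep the steep points. A minimal sketch under the same assumptions (the curve is already in memory; inflection_points is a hypothetical stand-in for find_mcp_inf_points, which additionally carries a strand label per row):

import numpy as np

def inflection_points(starts, conservations, lo=10, hi=20, pct=80):
    props = [-v for v in conservations]       # negate so conservation drops become rises
    slope = np.diff(props) / np.diff(starts)  # discrete derivative of the curve
    candidates = np.where(slope > np.percentile(slope, pct))[0]
    # keep only plausible primer positions, mirroring the 10..20 rule above
    return [starts[i] for i in candidates if lo <= starts[i] <= hi]

starts = list(range(2, 25))
curve = [0.95] * 13 + [0.60] * 10             # sharp drop after start index 14
print(inflection_points(starts, curve))       # -> [14]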