mgnify-pipelines-toolkit 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
Potentially problematic release: this version of mgnify-pipelines-toolkit has been flagged as possibly problematic.
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +74 -54
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +69 -42
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +120 -66
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +74 -45
- mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +277 -148
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +45 -28
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +53 -32
- mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +54 -16
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +29 -12
- mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +29 -19
- mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +23 -13
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +127 -89
- mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +138 -0
- mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +55 -26
- mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +19 -13
- mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +66 -0
- mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +2 -2
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +3 -5
- mgnify_pipelines_toolkit/constants/regex_fasta_header.py +20 -0
- mgnify_pipelines_toolkit/constants/tax_ranks.py +21 -2
- mgnify_pipelines_toolkit/constants/thresholds.py +4 -1
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py +4 -4
- mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +144 -0
- mgnify_pipelines_toolkit/utils/get_mpt_version.py +26 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/METADATA +18 -1
- mgnify_pipelines_toolkit-0.1.6.dist-info/RECORD +34 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/entry_points.txt +4 -0
- mgnify_pipelines_toolkit-0.1.4.dist-info/RECORD +0 -28
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py

@@ -20,28 +20,43 @@ from Bio.Seq import Seq
 import numpy as np
 import pandas as pd
 
-from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import …
+from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
+    get_read_count,
+    build_cons_seq,
+    build_mcp_cons_dict_list,
+    fetch_mcp,
+)
 from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT
 
+
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(…
-        …
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to fastq file to choose inflection point",
+    )
+    parser.add_argument(
+        "-p", "--points", required=True, type=str, help="Path to inflection points file"
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
 
     args = parser.parse_args()
-
-    _PATH = args.input
-    _POINTS = args.points
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
 
-
+    path = args.input
+    points = args.points
+    sample = args.sample
+    output = args.output
+
+    return path, points, sample, output
 
-def assess_inflection_point_mcp_for_sample(_PATH, inf_point_list, rev=False):
+
+def assess_inflection_point_mcp_for_sample(path, inf_point_list, rev=False):
     """
     Assess inflection point list, selecting one for automatic primer trimming.
 
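The refactor above replaces the module-level _PATH-style assignments with a parse_args() that returns its values, so the script can be imported and unit-tested without touching global state. A minimal sketch of exercising the new contract (the file paths and sample name are hypothetical, chosen only for illustration):

    import sys

    from mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp import (
        parse_args,
    )

    # Emulate a command line, then unpack the tuple parse_args() now returns.
    sys.argv = [
        "assess_inflection_point_mcp",
        "-i", "reads.fastq",   # hypothetical fastq path
        "-p", "points.tsv",    # hypothetical inflection points file
        "-s", "SAMPLE1",
        "-o", "out_dir",
    ]
    path, points, sample, output = parse_args()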
@@ -54,17 +69,19 @@ def assess_inflection_point_mcp_for_sample(_PATH, inf_point_list, rev=False):
 
     Returns the cutoff point and the consensus sequence 'forming' the automatically predicted primer
     """
-
+
     # TODO error handle for empty inflection point list
 
-    start_confs = []
-    end_confs = []
-    start_cons_lens = []
-    cons_seq_list = []
+    start_confs = []  # pre-inf point conservations
+    end_confs = []  # post-inf point conservations
+    start_cons_lens = []  # list for storing lengths of pre-inflection point sequences
+    cons_seq_list = []  # list for storing consensus sequences pre-inflection points
 
-    do_not_include_list = […
+    do_not_include_list = [
+        i + 5 for i in inf_point_list
+    ]  # ignore conservation of inflection point in calculation
 
-    read_count = get_read_count(…
+    read_count = get_read_count(path)  # get readcount from fastq
 
     max_line_count = None
     if read_count > MCP_MAX_LINE_COUNT:
@@ -72,89 +89,126 @@ def assess_inflection_point_mcp_for_sample(_PATH, inf_point_list, rev=False):
 
     n_prop = 0.8
 
-    for start in inf_point_list:
-        mcp_len = start + 4
-
-        mcp_count_dict = fetch_mcp(…
-        …
-        …
-        …
+    for start in inf_point_list:  # Looping through the pre-inflection point mcps
+        mcp_len = start + 4  # length of pre-inf mcps is inflection point + 4
+
+        mcp_count_dict = fetch_mcp(
+            path, mcp_len, rev=rev, max_line_count=max_line_count
+        )  # get MCP count dict
+        mcp_cons_list = build_mcp_cons_dict_list(
+            mcp_count_dict, mcp_len
+        )  # list of base conservation dicts for mcps
+        cons_seq, cons_confs = build_cons_seq(
+            mcp_cons_list,
+            read_count,
+            n_prop,
+            do_not_include_list,
+            max_line_count=max_line_count,
+        )  # get list of max base conservations for each index
+        # also get consensus sequence
         cons_seq_list.append(cons_seq)
         start_confs.append(np.mean(cons_confs))
         start_cons_lens.append(len(cons_seq))
 
-    for i, end in enumerate(…
-        …
-        …
-        …
+    for i, end in enumerate(
+        inf_point_list
+    ):  # Looping through the post-inflection point mcps
+        mcp_len = end + 5  # length of pre-inf mcps is inflection point + 5
+        subs_len = start_cons_lens[i]  # length of respective pre-inf point sequence
+        l = mcp_len + subs_len - 1  # final index of MCP
 
-        mcp_count_dict = fetch_mcp(…
+        mcp_count_dict = fetch_mcp(
+            path, l, mcp_len, rev=rev, max_line_count=max_line_count
+        )
         mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, subs_len)
-        cons_seq, cons_confs = build_cons_seq(…
+        cons_seq, cons_confs = build_cons_seq(
+            mcp_cons_list,
+            read_count,
+            n_prop,
+            do_not_include_list,
+            subs_len,
+            max_line_count=max_line_count,
+        )
 
         end_confs.append(np.mean(cons_confs))
 
-    diff_res = […
-    …
+    diff_res = [
+        start_confs[i] - end_confs[i] for i in range(len(start_confs))
+    ]  # get differences between pre- and -post avg conservation values
+    diff_res_sorted = sorted(
+        diff_res, reverse=True
+    )  # sort differences from highest to lowest
 
-    ini_max_res = diff_res_sorted[0]
-    curr_max_index = diff_res.index(ini_max_res)
+    ini_max_res = diff_res_sorted[0]  # maximum differences
+    curr_max_index = diff_res.index(ini_max_res)  # index of maximum differences
 
-    for res in diff_res_sorted[1:]:
-        curr_res_index = np.where(diff_res == res)[0][0]
+    for res in diff_res_sorted[1:]:  # Loop through the rest of the differences
+        curr_res_index = np.where(diff_res == res)[0][0]
 
         index_diff = inf_point_list[curr_max_index] - inf_point_list[curr_res_index]
 
         # if difference between the max and the current is negligible and the index of the current is earlier then..
-        if ini_max_res - res < 0.05 and (…
-            curr_max_index = …
-
-    …
-    …
+        if ini_max_res - res < 0.05 and (index_diff <= 3 and index_diff > 0):
+            curr_max_index = (
+                curr_res_index  # replace the selected index with the current one
+            )
+
+    cutoff = (
+        inf_point_list[curr_max_index] + 5
+    )  # cutoff is the inflection point index + 5
+    primer = cons_seq_list[
+        curr_max_index
+    ]  # grab the correct consensus sequence as primer
 
     # if the requested strand is reverse..
     if rev:
-        primer = str(…
+        primer = str(
+            Seq(primer).complement()
+        )  # ..get the complement of consensus sequence
 
     return cutoff, primer
 
+
 def main():
 
-
-    inf_df = pd.read_csv(…
+    path, points, sample, output = parse_args()
+    inf_df = pd.read_csv(points, sep="\t")
 
-    f_slice = inf_df[inf_df.strand == …
-    r_slice = inf_df[inf_df.strand == …
+    f_slice = inf_df[inf_df.strand == "F"]  # get forward inflection points
+    r_slice = inf_df[inf_df.strand == "R"]  # get reverse inflection points
     r_slice = r_slice.reset_index(drop=True)
 
-    f_cutoff = …
-    r_cutoff = …
-    f_primer = …
-    r_primer = …
+    f_cutoff = ""
+    r_cutoff = ""
+    f_primer = ""
+    r_primer = ""
 
-    if not f_slice.empty:
+    if not f_slice.empty:  # if there is a forward inflection point..
         inf_list = f_slice.inf_point.tolist()
-        f_cutoff, f_primer = assess_inflection_point_mcp_for_sample(…
+        f_cutoff, f_primer = assess_inflection_point_mcp_for_sample(
+            path, inf_list
+        )  # .. assess and select
 
-    if not r_slice.empty:
+    if not r_slice.empty:  # if there is a reverse inflection point..
         inf_list = r_slice.inf_point.tolist()
-        r_cutoff, r_primer = assess_inflection_point_mcp_for_sample(…
+        r_cutoff, r_primer = assess_inflection_point_mcp_for_sample(
+            path, inf_list, rev=True
+        )  # .. assess and select
 
     # Output cutoff point(s) to .txt file
-    with open(f…
-        if f_cutoff != …
-            fw.write(f…
-        if r_cutoff != …
-            fw.write(f…
+    with open(f"{output}/{sample}_cutoff.txt", "w") as fw:
+        if f_cutoff != "":
+            fw.write(f"F: {f_cutoff}\n")
+        if r_cutoff != "":
+            fw.write(f"R: {r_cutoff}\n")
 
     # Output consensus primer sequence(s) to .fasta file
-    with open(f…
-        if f_cutoff != …
-            fw.write(f…
-        if r_cutoff != …
-            fw.write(f…
-
+    with open(f"{output}/{sample}_auto_primers.fasta", "w") as fw:
+        if f_cutoff != "":
+            fw.write(f">F_auto\n{f_primer}\n")
+        if r_cutoff != "":
+            fw.write(f">R_auto\n{r_primer}\n")
 
 
 if __name__ == "__main__":
-    main()
+    main()
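Unwrapped from Black's formatting, the selection heuristic in assess_inflection_point_mcp_for_sample reads: compute, for each candidate inflection point, the drop between pre- and post-point average conservation; take the point with the largest drop, but let a point sitting 1 to 3 positions earlier win whenever its drop is within 0.05 of the maximum. A self-contained sketch of just that rule, with toy values rather than package code:

    def pick_cutoff(inf_points, start_confs, end_confs):
        # drop in average conservation across each candidate inflection point
        diff_res = [s - e for s, e in zip(start_confs, end_confs)]
        ini_max_res = max(diff_res)
        curr_max_index = diff_res.index(ini_max_res)

        for res in sorted(diff_res, reverse=True)[1:]:
            curr_res_index = diff_res.index(res)
            index_diff = inf_points[curr_max_index] - inf_points[curr_res_index]
            # a near-equal drop (within 0.05) at a slightly earlier point wins
            if ini_max_res - res < 0.05 and 0 < index_diff <= 3:
                curr_max_index = curr_res_index

        return inf_points[curr_max_index] + 5  # cutoff = chosen point + 5

    # The earlier point (12) displaces the marginally better one (15): prints 17
    print(pick_cutoff([15, 12], [0.90, 0.88], [0.40, 0.42]))

The chosen cutoffs are then written as F:/R: lines to {sample}_cutoff.txt, and the matching consensus sequences to {sample}_auto_primers.fasta under >F_auto/>R_auto headers.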
mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py

@@ -16,33 +16,51 @@
 
 import argparse
 from collections import defaultdict
-import subprocess
 
 import pandas as pd
 import numpy as np
 
-from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import …
+from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
+    get_read_count,
+    build_cons_seq,
+    build_mcp_cons_dict_list,
+    fetch_mcp,
+)
 from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT
 
+
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(…
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to fastq file to assess mcps",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
-    parser.add_argument(…
+    parser.add_argument(
+        "-st",
+        "--strand",
+        required=True,
+        choices=["FR", "F", "R"],
+        help="F: Forward, R: Reverse",
+    )
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
 
     args = parser.parse_args()
-
-    _PATH = args.input
-    _SAMPLE = args.sample
-    _STRAND = args.strand
-    _OUTPUT = args.output
 
-
+    path = args.input
+    sample = args.sample
+    strand = args.strand
+    output = args.output
+
+    return path, sample, strand, output
+
 
-def find_mcp_props_for_sample(…
+def find_mcp_props_for_sample(path, rev=False):
     """
     Generate mcp proportions in a stepwise and windowed manner for a fastq file.
 
@@ -50,38 +68,47 @@ def find_mcp_props_for_sample(_PATH, rev=False):
     Calculate the average conservation of the most common base at each index of a window.
     The resulting list of mcp conservations can be considered a conservation curve and used to
     identify inflection points where the conservation suddenly changes.
-
+
     Output a dictionary where:
     key -> an index starting point e.g. base 10
     val -> the average conservation of the most common base for the mcp window goign from base 10 to 15 (inclusive)
     """
 
     res_dict = defaultdict(float)
-    start_range = range(2, 25, 1)
-
-    print(f…
+    start_range = range(2, 25, 1)  # Range of starting indices
+
+    print(f"Processing {path}")
 
-    mcp_len = 5
+    mcp_len = 5  # length of generated mcps
 
     for start in start_range:
 
-        end = …
+        end = (
+            start + mcp_len - 1
+        )  # compute the final index for the mcp (inclusive). Indices are of base 1 not 0.
+
+        read_count = get_read_count(path, type="fastq")  # get read count for fastq file
 
-        read_count = get_read_count(_PATH, type='fastq') # get read count for fastq file
-
         max_line_count = None
         if read_count > MCP_MAX_LINE_COUNT:
             max_line_count = MCP_MAX_LINE_COUNT
 
-        mcp_count_dict = fetch_mcp(…
-        …
-        …
-        …
-        …
+        mcp_count_dict = fetch_mcp(
+            path, end, start, rev, max_line_count
+        )  # get MCP count dict
+        mcp_cons_list = build_mcp_cons_dict_list(
+            mcp_count_dict, mcp_len
+        )  # list of base conservation dicts for mcps
+        cons_seq, cons_conf = build_cons_seq(
+            mcp_cons_list, read_count, max_line_count=max_line_count
+        )  # get list of max base conservations for each index
+
+        res_dict[start] = np.mean(cons_conf)  # compute the mean
 
     return res_dict
 
-def concat_out(fwd_out='', rev_out=''):
+
+def concat_out(fwd_out="", rev_out=""):
     """
     Generate Pandas dataframe out of mcp dictionary.
 
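For orientation: find_mcp_props_for_sample slides a 5-base window across start positions 2-24 and records, per window, the mean frequency of the most common base at each position (delegating to fetch_mcp, build_mcp_cons_dict_list and build_cons_seq from amplicon_utils). A simplified, self-contained illustration of that windowed-conservation idea, not the package implementation:

    from collections import Counter

    def window_conservation(reads, start, mcp_len=5):
        # Mean frequency of the most common base at each position of the
        # 1-based window [start, start + mcp_len - 1], across all reads.
        windows = [read[start - 1 : start - 1 + mcp_len] for read in reads]
        per_pos = []
        for i in range(mcp_len):
            counts = Counter(w[i] for w in windows if len(w) > i)
            per_pos.append(counts.most_common(1)[0][1] / sum(counts.values()))
        return sum(per_pos) / len(per_pos)

    reads = ["ACGTACGTAC", "ACGTTCGTAC", "ACGTACGAAC"]
    print(round(window_conservation(reads, start=2), 3))  # prints 0.933

Highly conserved windows (values near 1.0) followed by a sharp fall in this curve are what the companion inflection-point scripts treat as likely primer sequence.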
@@ -97,16 +124,16 @@ def concat_out(fwd_out='', rev_out=''):
     df_ind = []
 
     # Check if fwd strand was requested
-    if fwd_out != …
-        […
-        df_ind.append(…
+    if fwd_out != "":
+        [total_res_dict[key].append(fwd_out[key]) for key in fwd_out.keys()]
+        df_ind.append("F")
 
     # Check if rev strand was requested
-    if rev_out != …
-        […
-        df_ind.append(…
+    if rev_out != "":
+        [total_res_dict[key].append(rev_out[key]) for key in rev_out.keys()]
+        df_ind.append("R")
 
-    res_df= pd.DataFrame.from_dict(total_res_dict)
+    res_df = pd.DataFrame.from_dict(total_res_dict)
     res_df.index = df_ind
 
     return res_df
@@ -114,31 +141,33 @@ def concat_out(fwd_out='', rev_out=''):
 
 def main():
 
-
+    path, sample, strand, output = parse_args()
 
-    res_df = …
+    res_df = ""
 
     # TODO: match-case statement is python 3.10>. We are currently locking the version
     # at version 3.9. The day we bump the version we should replace these if statements
     # with a match-case block.
 
-    if …
-        fwd_out = find_mcp_props_for_sample(…
-        rev_out = find_mcp_props_for_sample(…
+    if strand == "FR":
+        fwd_out = find_mcp_props_for_sample(path)
+        rev_out = find_mcp_props_for_sample(path, rev=True)
         res_df = concat_out(fwd_out, rev_out)
-    elif …
-        fwd_out = find_mcp_props_for_sample(…
+    elif strand == "F":
+        fwd_out = find_mcp_props_for_sample(path)
         res_df = concat_out(fwd_out)
-    elif …
-        rev_out = find_mcp_props_for_sample(…
+    elif strand == "R":
+        rev_out = find_mcp_props_for_sample(path, rev=True)
         res_df = concat_out(rev_out=rev_out)
     else:
-        print(…
+        print(
+            "Incorrect strand input. Should be F for forward, R for reverse, or FR for both."
+        )
         exit(1)
 
     # Save resulting dataframe to a tsv file
-    res_df.to_csv(f…
+    res_df.to_csv(f"{output}/{sample}_mcp_cons.tsv", sep="\t")
+
 
-
 if __name__ == "__main__":
-    main()
+    main()
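For reference, the {sample}_mcp_cons.tsv written by main() has one column per window start position (2 through 24) and one row per requested strand. With -st FR it looks roughly like this (values invented for illustration):

            2       3       4       ...     24
    F       0.91    0.90    0.88    ...     0.47
    R       0.91    0.89    0.86    ...     0.45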