mgnify-pipelines-toolkit 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (31)
  1. mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +74 -54
  2. mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +69 -42
  3. mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +120 -66
  4. mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +74 -45
  5. mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +277 -148
  6. mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +45 -28
  7. mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +71 -40
  8. mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +54 -16
  9. mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +29 -12
  10. mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +29 -19
  11. mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +23 -13
  12. mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +127 -89
  13. mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +140 -0
  14. mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +55 -26
  15. mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +19 -13
  16. mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +66 -0
  17. mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +2 -2
  18. mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +3 -5
  19. mgnify_pipelines_toolkit/constants/regex_fasta_header.py +20 -0
  20. mgnify_pipelines_toolkit/constants/tax_ranks.py +21 -2
  21. mgnify_pipelines_toolkit/constants/thresholds.py +4 -1
  22. mgnify_pipelines_toolkit/constants/var_region_coordinates.py +4 -4
  23. mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  24. mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +144 -0
  25. {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/METADATA +18 -1
  26. mgnify_pipelines_toolkit-0.1.5.dist-info/RECORD +33 -0
  27. {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/WHEEL +1 -1
  28. {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/entry_points.txt +3 -0
  29. mgnify_pipelines_toolkit-0.1.3.dist-info/RECORD +0 -28
  30. {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/LICENSE +0 -0
  31. {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/top_level.txt +0 -0
@@ -20,71 +20,85 @@ import gzip
20
20
  import os
21
21
  import subprocess
22
22
 
23
- from mgnify_pipelines_toolkit.constants.regex_ambiguous_bases import _AMBIGUOUS_BASES_DICT, _AMBIGUOUS_BASES_DICT_REV
23
+ from mgnify_pipelines_toolkit.constants.regex_ambiguous_bases import (
24
+ _AMBIGUOUS_BASES_DICT,
25
+ _AMBIGUOUS_BASES_DICT_REV,
26
+ )
24
27
 
25
28
  logging.basicConfig(level=logging.DEBUG)
26
29
 
27
- def split_dir_into_sample_paths(_DIR):
28
30
 
29
- file_list = os.listdir(_DIR)
30
- file_list = [ file for file in file_list if '.fastq' in file and ('_1' in file or '_2' in file) ]
31
+ def split_dir_into_sample_paths(dir):
32
+
33
+ file_list = os.listdir(dir)
34
+ file_list = [
35
+ file
36
+ for file in file_list
37
+ if ".fastq" in file and ("_1" in file or "_2" in file)
38
+ ]
31
39
  sample_set = set()
32
- [ sample_set.add(f"{_DIR}/{file.split('_')[0]}") for file in file_list ]
40
+ [sample_set.add(f"{dir}/{file.split('_')[0]}") for file in file_list]
33
41
  sample_list = sorted(list(sample_set))
34
42
 
35
43
  return sample_list
36
44
 
37
- def get_read_count(read_path, type='fastq'):
45
+
46
+ def get_read_count(read_path, type="fastq"):
38
47
 
39
48
  cmd = []
40
- stdout = ''
41
-
42
- if type == 'fastq':
43
- cmd = [
44
- 'zcat',
45
- read_path
46
- ]
47
- zcat_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
48
-
49
- cmd = [
50
- 'wc',
51
- '-l'
52
- ]
53
- wc_proc = subprocess.Popen(cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
49
+ stdout = ""
50
+
51
+ if type == "fastq":
52
+ cmd = ["zcat", read_path]
53
+ zcat_proc = subprocess.Popen(
54
+ cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
55
+ )
56
+
57
+ cmd = ["wc", "-l"]
58
+ wc_proc = subprocess.Popen(
59
+ cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE
60
+ )
54
61
  stdout, stderr = wc_proc.communicate()
55
62
 
56
- elif type == 'fasta':
57
- cmd = [
58
- 'grep',
59
- '-c',
60
- '^>',
61
- read_path
62
- ]
63
- grep_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
63
+ elif type == "fasta":
64
+ cmd = ["grep", "-c", "^>", read_path]
65
+ grep_proc = subprocess.Popen(
66
+ cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
67
+ )
64
68
  stdout, stderr = grep_proc.communicate()
65
69
 
66
- read_count = stdout.strip() if stdout is not None else ""
70
+ read_count = stdout.strip() if stdout is not None else ""
67
71
 
68
72
  if not read_count.isdigit():
69
- logging.error(f"Read count is not a digit, something is wrong. stdout: '{stdout}', stderr: '{stderr}'")
73
+ logging.error(
74
+ f"Read count is not a digit, something is wrong. stdout: '{stdout}', stderr: '{stderr}'"
75
+ )
70
76
  exit(1)
71
77
 
72
78
  read_count = int(read_count)
73
79
 
74
- if type == 'fastq':
80
+ if type == "fastq":
75
81
  read_count /= 4
76
82
 
77
83
  return read_count
78
84
 
79
- def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=None, counter=1, max_line_count=None):
85
+
86
+ def build_cons_seq(
87
+ cons_list,
88
+ read_count,
89
+ cons_threshold=0.80,
90
+ do_not_include=None,
91
+ counter=1,
92
+ max_line_count=None,
93
+ ):
80
94
  """
81
95
  Generate consensus sequence using a list of base conservation dictionaries most likely
82
96
  generated by the `build_mcp_cons_dict_list()` function.
83
97
  Also returns a list containing the conservation value of the most conserved base at every
84
- position in the list of base conservation dictionaries.
98
+ position in the list of base conservation dictionaries.
85
99
  """
86
100
 
87
- cons_seq = ''
101
+ cons_seq = ""
88
102
  cons_confs = []
89
103
 
90
104
  if do_not_include is None:
@@ -96,29 +110,31 @@ def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=No
96
110
 
97
111
  if counter in do_not_include:
98
112
  counter += 1
99
- cons_seq += 'N'
100
- continue
101
-
113
+ cons_seq += "N"
114
+ continue
115
+
102
116
  for base, count in count_dict.items():
103
- if base not in ('A', 'T', 'C', 'G'):
117
+ if base not in ("A", "T", "C", "G"):
104
118
  continue
105
119
 
106
120
  if max_line_count is None:
107
- cons_dict[base] = count/read_count
121
+ cons_dict[base] = count / read_count
108
122
  else:
109
- cons_dict[base] = count/max_line_count
110
-
123
+ cons_dict[base] = count / max_line_count
124
+
111
125
  if count > max_count:
112
126
  max_count = count
113
127
 
114
128
  counter += 1
115
-
129
+
116
130
  try:
117
- max_prop = max_count/read_count
131
+ max_prop = max_count / read_count
118
132
 
119
133
  cons_bases = []
120
134
  curr_prop = 0.0
121
- sorted_cons_dict = dict(sorted(cons_dict.items(), key=lambda x:x[1], reverse=True))
135
+ sorted_cons_dict = dict(
136
+ sorted(cons_dict.items(), key=lambda x: x[1], reverse=True)
137
+ )
122
138
 
123
139
  for base, prop in sorted_cons_dict.items():
124
140
  cons_bases.append(base)
@@ -131,18 +147,18 @@ def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=No
131
147
  if len(cons_bases) == 1:
132
148
  cons_seq += cons_bases[0]
133
149
  else:
134
- amb_string = ','.join(cons_bases)
150
+ amb_string = ",".join(cons_bases)
135
151
  amb_base = _AMBIGUOUS_BASES_DICT_REV[amb_string]
136
152
  cons_seq += amb_base
137
-
153
+
138
154
  except ZeroDivisionError:
139
155
  max_prop = 0.0
140
156
 
141
157
  cons_confs.append(max_prop)
142
158
 
143
-
144
159
  return cons_seq, cons_confs
145
160
 
161
+
146
162
  def primer_regex_query_builder(primer):
147
163
  """
148
164
  Takes an input nucleotide sequence that can contain IUPAC ambiguous codes
@@ -150,10 +166,10 @@ def primer_regex_query_builder(primer):
150
166
  potential bases valid at a position with am abiguity code.
151
167
  """
152
168
 
153
- query = ''
169
+ query = ""
154
170
 
155
171
  for char in primer:
156
- if char in ('A', 'C', 'T', 'G'):
172
+ if char in ("A", "C", "T", "G"):
157
173
  query += char
158
174
  else:
159
175
  query += str(_AMBIGUOUS_BASES_DICT[char])
@@ -162,6 +178,7 @@ def primer_regex_query_builder(primer):
162
178
 
163
179
  return query
164
180
 
181
+
165
182
  def build_mcp_cons_dict_list(mcp_count_dict, mcp_len):
166
183
  """
167
184
  Generate list of dictionaries of base conservation for mcp output (mcp_cons_list)
@@ -178,9 +195,10 @@ def build_mcp_cons_dict_list(mcp_count_dict, mcp_len):
178
195
  base = mcp[i]
179
196
  index_base_dict[base] += mcp_count_dict[mcp]
180
197
  mcp_cons_list.append(index_base_dict)
181
-
198
+
182
199
  return mcp_cons_list
183
200
 
201
+
184
202
  def fetch_mcp(fastq, prefix_len, start=1, rev=False, max_line_count=None):
185
203
  """
186
204
  Generates the most common prefix sequences along with their counts in a fastq file.
@@ -194,15 +212,17 @@ def fetch_mcp(fastq, prefix_len, start=1, rev=False, max_line_count=None):
194
212
  line = line.strip()
195
213
  if i % 4 == 1:
196
214
  if not rev:
197
- selected_lines.append(line[start-1:start+prefix_len-1])
215
+ selected_lines.append(line[start - 1 : start + prefix_len - 1])
198
216
  else:
199
217
  rev_line = line[::-1]
200
- selected_lines.append(rev_line[start-1:start+prefix_len-1])
201
- if max_line_count != None:
218
+ selected_lines.append(rev_line[start - 1 : start + prefix_len - 1])
219
+ if max_line_count is not None:
202
220
  if len(selected_lines) > max_line_count:
203
221
  break
204
222
 
205
223
  sequence_counts = Counter(selected_lines)
206
- mcp_count_dict = dict(sorted(sequence_counts.items(), key=lambda x: x[1], reverse=True))
224
+ mcp_count_dict = dict(
225
+ sorted(sequence_counts.items(), key=lambda x: x[1], reverse=True)
226
+ )
207
227
 
208
228
  return mcp_count_dict
@@ -15,33 +15,45 @@
15
15
  # limitations under the License.
16
16
 
17
17
  import argparse
18
- from collections import defaultdict
19
18
 
20
19
  import numpy as np
21
20
 
22
- from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import get_read_count, build_cons_seq, build_mcp_cons_dict_list, fetch_mcp
21
+ from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
22
+ get_read_count,
23
+ build_cons_seq,
24
+ build_mcp_cons_dict_list,
25
+ fetch_mcp,
26
+ )
27
+
23
28
 
24
29
  def parse_args(argv=None):
25
30
 
26
31
  parser = argparse.ArgumentParser()
27
32
 
28
- parser.add_argument("-i", "--input", required=True, type=str, help="Path to fastq file to check for primers")
33
+ parser.add_argument(
34
+ "-i",
35
+ "--input",
36
+ required=True,
37
+ type=str,
38
+ help="Path to fastq file to check for primers",
39
+ )
29
40
  parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
30
41
  parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
31
42
  args = parser.parse_args(argv)
32
-
33
- _PATH = args.input
34
- _SAMPLE = args.sample
35
- _OUTPUT = args.output
36
43
 
37
- return _PATH, _SAMPLE, _OUTPUT
44
+ path = args.input
45
+ sample = args.sample
46
+ output = args.output
47
+
48
+ return path, sample, output
49
+
38
50
 
39
- def are_there_primers_in_this_sample(_PATH, rev=False):
51
+ def are_there_primers_in_this_sample(path, rev=False):
40
52
  """
41
53
  Predict the presence of primers based on windows of base conservation.
42
54
 
43
55
  Takes a fastq file as input. Extracts proportion of most common base for the first 100 bases.
44
- Computes the a threshold (Q3 - 0.15) based on this proportion and counts the number of bases below
56
+ Computes the a threshold (Q3 - 0.15) based on this proportion and counts the number of bases below
45
57
  it in windows of 10 bases.
46
58
  If at least one of the first two windows contains at most one such a base, then the presence of a primer is flagged as true.
47
59
  A primer is also flagged as true if the combined count of bases below Q3 is at most 4.
@@ -51,14 +63,19 @@ def are_there_primers_in_this_sample(_PATH, rev=False):
51
63
  False if a primer was not identified
52
64
  """
53
65
 
54
- read_count = get_read_count(_PATH, 'fastq') # Get read count for fastq file
55
- mcp_len = 100 # Script will look at first 100 base mcps (for rev=True, it will look at first 100 from 3' to 5')
66
+ read_count = get_read_count(path, "fastq") # Get read count for fastq file
67
+ mcp_len = 100 # Script will look at first 100 base mcps (for rev=True, it will look at first 100 from 3' to 5')
56
68
 
57
- mcp_count_dict = fetch_mcp(_PATH, mcp_len, rev=rev) # mcp dict where key is the mcp and value is the count
58
- mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, mcp_len) # list of base conservation dicts for mcps
59
- cons_seq, cons_confs = build_cons_seq(mcp_cons_list, read_count) # get list of max base conservations for each index
69
+ mcp_count_dict = fetch_mcp(
70
+ path, mcp_len, rev=rev
71
+ ) # mcp dict where key is the mcp and value is the count
72
+ mcp_cons_list = build_mcp_cons_dict_list(
73
+ mcp_count_dict, mcp_len
74
+ ) # list of base conservation dicts for mcps
75
+ cons_seq, cons_confs = build_cons_seq(
76
+ mcp_cons_list, read_count
77
+ ) # get list of max base conservations for each index
60
78
 
61
-
62
79
  window_size = 10
63
80
  # Counter that will reset to 0 every 10 bases
64
81
  window_count = 0
@@ -66,7 +83,7 @@ def are_there_primers_in_this_sample(_PATH, rev=False):
66
83
  window_count_list = []
67
84
  # Compute Q3-based threshold
68
85
  max_cons = np.quantile(cons_confs, 0.75)
69
- threshold = max_cons - 0.15
86
+ threshold = max_cons - 0.15
70
87
 
71
88
  if max_cons < 0.75:
72
89
  threshold = 0.75
@@ -76,19 +93,25 @@ def are_there_primers_in_this_sample(_PATH, rev=False):
76
93
 
77
94
  # Loop through every base
78
95
  for i, val in enumerate(cons_confs):
79
- if i%window_size == 0 and i !=0: # After looping through a window..
80
- window_count_list.append(window_count) # ..append window count
81
- window_count = 0 # ..reset window count
96
+ if i % window_size == 0 and i != 0: # After looping through a window..
97
+ window_count_list.append(window_count) # ..append window count
98
+ window_count = 0 # ..reset window count
82
99
 
83
- if val < threshold: # If the conservation at i is less than threshold, increment count for the window
100
+ if (
101
+ val < threshold
102
+ ): # If the conservation at i is less than threshold, increment count for the window
84
103
  window_count += 1
85
104
 
86
- primer_flag = False # Initialise primer flag as false
105
+ primer_flag = False # Initialise primer flag as false
87
106
 
88
- if 1 in window_count_list[:2] or 0 in window_count_list[:2]: # If window count is at most 1 of first two windows...
89
- primer_flag = True # ..primer flag is true
90
- elif sum(window_count_list[:2]) <= 4: # If sum of window counts of the first two windows is at most 4..
91
- primer_flag = True # ..primer flag is true
107
+ if (
108
+ 1 in window_count_list[:2] or 0 in window_count_list[:2]
109
+ ): # If window count is at most 1 of first two windows...
110
+ primer_flag = True # ..primer flag is true
111
+ elif (
112
+ sum(window_count_list[:2]) <= 4
113
+ ): # If sum of window counts of the first two windows is at most 4..
114
+ primer_flag = True # ..primer flag is true
92
115
 
93
116
  return primer_flag
94
117
 
@@ -99,39 +122,43 @@ def save_out(results, sample_id, output):
99
122
 
100
123
  1: primer exists
101
124
  0: primer doesn't exist
102
-
125
+
103
126
  First line will be the forward strand
104
127
  Second line will be the reverse strand
105
128
  """
106
129
 
107
- with open(f'{output}/{sample_id}_general_primer_out.txt', 'w') as fw:
108
- fw.write(f'{results[0]}\n')
109
- fw.write(f'{results[1]}\n')
130
+ with open(f"{output}/{sample_id}_general_primer_out.txt", "w") as fw:
131
+ fw.write(f"{results[0]}\n")
132
+ fw.write(f"{results[1]}\n")
110
133
 
111
134
 
112
135
  def main(argv=None):
113
136
 
114
- _PATH, _SAMPLE, _OUTPUT = parse_args(argv)
137
+ path, sample, output = parse_args(argv)
115
138
 
116
- fwd_primer_flag = are_there_primers_in_this_sample(_PATH) # Check for general primers in fwd
117
- rev_primer_flag = are_there_primers_in_this_sample(_PATH, rev=True) # Check for general primers in rev
139
+ fwd_primer_flag = are_there_primers_in_this_sample(
140
+ path
141
+ ) # Check for general primers in fwd
142
+ rev_primer_flag = are_there_primers_in_this_sample(
143
+ path, rev=True
144
+ ) # Check for general primers in rev
118
145
 
119
- fwd_status = '0'
120
- rev_status = '0'
146
+ fwd_status = "0"
147
+ rev_status = "0"
121
148
  # Flag for primer presence: 1 for yes 0 for no
122
149
  if fwd_primer_flag:
123
- print('Forward primer detected!')
150
+ print("Forward primer detected!")
124
151
  fwd_status = 1
125
152
  else:
126
- print('No forward primer detected')
153
+ print("No forward primer detected")
127
154
  if rev_primer_flag:
128
- print('Reverse primer detected!')
155
+ print("Reverse primer detected!")
129
156
  rev_status = 1
130
157
  else:
131
- print('No reverse primer detected')
158
+ print("No reverse primer detected")
159
+
160
+ save_out((fwd_status, rev_status), sample, output) # Save primer flags to .txt file
132
161
 
133
- save_out((fwd_status, rev_status), _SAMPLE, _OUTPUT) # Save primer flags to .txt file
134
-
135
162
 
136
163
  if __name__ == "__main__":
137
- main()
164
+ main()