mgnify-pipelines-toolkit 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.
- mgnify_pipelines_toolkit/constants/thresholds.py +0 -4
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/METADATA +1 -2
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/RECORD +7 -14
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/entry_points.txt +0 -5
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -221
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -164
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -214
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -175
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -111
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -327
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -43
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/top_level.txt +0 -0
|
@@ -14,9 +14,6 @@
|
|
|
14
14
|
# See the License for the specific language governing permissions and
|
|
15
15
|
# limitations under the License.
|
|
16
16
|
|
|
17
|
-
# used by fetch_mcp in analysis.amplicon
|
|
18
|
-
MCP_MAX_LINE_COUNT = 300_000
|
|
19
|
-
|
|
20
17
|
# used by classify_var_regions in analysis.amplicon
|
|
21
18
|
MIN_OVERLAP = 0.95
|
|
22
19
|
MIN_SEQ_COUNT = 5000
|
|
@@ -26,7 +23,6 @@ MAX_INTERNAL_PRIMER_PROPORTION = 0.2
|
|
|
26
23
|
# used by library_strategy_checker in analysis.shared
|
|
27
24
|
MIN_AMPLICON_STRATEGY_CHECK = 0.30
|
|
28
25
|
|
|
29
|
-
|
|
30
26
|
# used by markergene_study_summary in analysis.shared
|
|
31
27
|
MAJORITY_MARKER_PROPORTION = 0.45
|
|
32
28
|
# used by gff_toolkit in analysis.assembly
|
{mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mgnify_pipelines_toolkit
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.2.1
|
|
4
4
|
Summary: Collection of scripts and tools for MGnify pipelines
|
|
5
5
|
Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
|
|
6
6
|
License: Apache Software License 2.0
|
|
@@ -14,7 +14,6 @@ License-File: LICENSE
|
|
|
14
14
|
Requires-Dist: biopython>=1.85
|
|
15
15
|
Requires-Dist: numpy<3,>=2.2.4
|
|
16
16
|
Requires-Dist: pandas<3,>=2.2.3
|
|
17
|
-
Requires-Dist: regex>=2024.11.6
|
|
18
17
|
Requires-Dist: requests<3,>=2.32.3
|
|
19
18
|
Requires-Dist: click<9,>=8.1.8
|
|
20
19
|
Requires-Dist: pandera<0.24,>=0.23.1
|
{mgnify_pipelines_toolkit-1.2.0.dist-info → mgnify_pipelines_toolkit-1.2.1.dist-info}/RECORD
RENAMED
|
@@ -1,17 +1,11 @@
|
|
|
1
1
|
mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py,sha256=8qmb57E2XBrwqo6YcJYyvPyuaIMu82Ifho7yyyUdnSM,6572
|
|
4
|
-
mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py,sha256=2-URxvcl13_8O9bUmoa3-KMPSvdTaLbxfFDY-ycs_4M,5316
|
|
5
|
-
mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py,sha256=cRoHPM-VB_L3NWYgkNWuyzqIqhzwHJuU3-6BiiS2lnw,7553
|
|
6
|
-
mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py,sha256=RAdqakH05Qt_LG9jlV7P2M90o5KmlAXmDFQ4X51NIBE,5387
|
|
7
3
|
mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=W0ob9z_8sjrB1Ck48Ac-_5Vw2kyoRFalcxhrR6KSXpI,20196
|
|
8
|
-
mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py,sha256=vC3nKxggnSljfw4HNkugXbXfGvLx7XnryEE7eEGqfqs,3552
|
|
9
4
|
mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=soTewFddtebW-EcejGh9whs3cBLWJrGCYdPc0KukoAw,8756
|
|
10
5
|
mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=BLqhflblUegCvuQic16PrFXfIXlFWmGkmWJyl4wJoLQ,5040
|
|
11
6
|
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=Bmc4Yu8inpT6AVTG1zwxp9F9mknIDLY33-UuFdaZuq0,3756
|
|
12
7
|
mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=Wu4tRtuRkgd3hoeuwPl_E5ghxIW7e_1vrcvFGWv_U4A,3173
|
|
13
8
|
mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=yLpzkRJXAeXRUNgz60zopEwHcdprM2UDjquE-GkrFys,1722
|
|
14
|
-
mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=K6gniytuItq5WzHLi1BsaUCOdP4Zm0_ZzW2_ns7-BTI,11114
|
|
15
9
|
mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py,sha256=epVClL10QcllL8yu7YGjx0rXNVHL2GxHi-Ek0MOjsjo,13859
|
|
16
10
|
mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py,sha256=NZSNY2bqs_TQyz8riDqiEFPLKcwTgzh1C7DeVHT6V8Q,4366
|
|
17
11
|
mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=vZdDIcG09hulgCp0FylwHXVSGSlwl2RsDU4_xvsrUC0,6732
|
|
@@ -40,18 +34,17 @@ mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=7-U0DN1joVu0ifLOo
|
|
|
40
34
|
mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py,sha256=sKAo_rKEyVAZXSaIFMkpSoYZxiWwXMA3XDA6Z-hbHgg,7904
|
|
41
35
|
mgnify_pipelines_toolkit/constants/db_labels.py,sha256=omPINMylAjO2PxeFhSk2MbYNcGZH3P82optSlMey3dw,858
|
|
42
36
|
mgnify_pipelines_toolkit/constants/ncrna.py,sha256=a_5hWp446S7BhRbe_JcydFgZM7sgPLuMlaiBvKWN_XM,1928
|
|
43
|
-
mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=7nEOODQq35y9wx9YnvJuo29oBpwTpXg_kIbf_t7N4TQ,1093
|
|
44
37
|
mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=G-xrc9b8zdmPTaOICD2b3RCVeFAEOVkfRkIfotQ7gek,1193
|
|
45
38
|
mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=kMq__kOJcbiwsgolkdvb-XLo3WMnJdEXgedjUyMOYjI,1081
|
|
46
|
-
mgnify_pipelines_toolkit/constants/thresholds.py,sha256=
|
|
39
|
+
mgnify_pipelines_toolkit/constants/thresholds.py,sha256=1AMBmoHBR0WjXZpkwJ7_Q-gfJtHXuCA4tZ-uvPhF0Xc,1085
|
|
47
40
|
mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=0bM4MwarFiM5yTcp5AbAmQ0o-q-gWy7kknir9zJ9R0A,1312
|
|
48
41
|
mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pyDZvCuWbwccQF0D7c5BN1vv36wQdgcAUXU43_zAu74,18164
|
|
49
42
|
mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
50
43
|
mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
|
|
51
44
|
mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
|
|
52
|
-
mgnify_pipelines_toolkit-1.2.
|
|
53
|
-
mgnify_pipelines_toolkit-1.2.
|
|
54
|
-
mgnify_pipelines_toolkit-1.2.
|
|
55
|
-
mgnify_pipelines_toolkit-1.2.
|
|
56
|
-
mgnify_pipelines_toolkit-1.2.
|
|
57
|
-
mgnify_pipelines_toolkit-1.2.
|
|
45
|
+
mgnify_pipelines_toolkit-1.2.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
46
|
+
mgnify_pipelines_toolkit-1.2.1.dist-info/METADATA,sha256=CIH7XH5WIzQMc0e2MfYFlWCnV1hokgYaQskYsfobzao,5775
|
|
47
|
+
mgnify_pipelines_toolkit-1.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
48
|
+
mgnify_pipelines_toolkit-1.2.1.dist-info/entry_points.txt,sha256=d7r4_VUS1hWNMnTJOy8u2kTRSFcy-sDN5NLRUXz-IhU,3041
|
|
49
|
+
mgnify_pipelines_toolkit-1.2.1.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
|
|
50
|
+
mgnify_pipelines_toolkit-1.2.1.dist-info/RECORD,,
|
|
@@ -2,17 +2,13 @@
|
|
|
2
2
|
add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main
|
|
3
3
|
amplicon_study_summary_generator = mgnify_pipelines_toolkit.analysis.amplicon.study_summary_generator:cli
|
|
4
4
|
antismash_gff_builder = mgnify_pipelines_toolkit.analysis.assembly.antismash_gff_builder:main
|
|
5
|
-
are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
|
|
6
5
|
assembly_study_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.study_summary_generator:cli
|
|
7
|
-
assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
|
|
8
|
-
assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
|
|
9
6
|
classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
|
|
10
7
|
combined_gene_caller_merge = mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main
|
|
11
8
|
convert_cmscan_to_cmsearch_tblout = mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main
|
|
12
9
|
dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:main
|
|
13
10
|
fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
|
|
14
11
|
fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
|
|
15
|
-
find_mcp_inflection_points = mgnify_pipelines_toolkit.analysis.amplicon.find_mcp_inflection_points:main
|
|
16
12
|
generate_gaf = mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main
|
|
17
13
|
genomes_extract_bacterial_rrnas_as_tsv = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_bacterial_rrnas_as_tsv:main
|
|
18
14
|
genomes_extract_rrnas_as_fasta = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_rrnas_as_fasta:main
|
|
@@ -32,7 +28,6 @@ process_dbcan_cazys = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_r
|
|
|
32
28
|
process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_clusters:main
|
|
33
29
|
remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
|
|
34
30
|
rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
|
|
35
|
-
standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
|
|
36
31
|
summarise_antismash_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_antismash_bgcs:main
|
|
37
32
|
summarise_goslims = mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims:main
|
|
38
33
|
summarise_sanntis_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_sanntis_bgcs:main
|
|
@@ -1,221 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
|
|
4
|
-
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
|
-
#
|
|
6
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
-
# you may not use this file except in compliance with the License.
|
|
8
|
-
# You may obtain a copy of the License at
|
|
9
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
-
#
|
|
11
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
-
# See the License for the specific language governing permissions and
|
|
15
|
-
# limitations under the License.
|
|
16
|
-
|
|
17
|
-
from collections import defaultdict, Counter
|
|
18
|
-
import logging
|
|
19
|
-
import gzip
|
|
20
|
-
import os
|
|
21
|
-
import pyfastx
|
|
22
|
-
|
|
23
|
-
from mgnify_pipelines_toolkit.constants.regex_ambiguous_bases import (
|
|
24
|
-
_AMBIGUOUS_BASES_DICT,
|
|
25
|
-
_AMBIGUOUS_BASES_DICT_REV,
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
logging.basicConfig(level=logging.DEBUG)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def split_dir_into_sample_paths(dir):
    """
    List the unique sample path prefixes of the paired FASTQ files in a directory.

    A file is taken into account when its name contains ".fastq" and also
    contains "_1" or "_2". The sample prefix is the part of the file name
    before the first underscore, joined to `dir` with a "/".

    :param dir: Directory to scan. The name shadows the builtin but is kept
        so that existing keyword callers keep working.
    :type dir: str
    :return: Sorted list of unique "<dir>/<sample>" strings.
    :rtype: list[str]
    """
    # A set comprehension de-duplicates the two mates of each sample directly,
    # instead of abusing a list comprehension for its side effects.
    sample_set = {
        f"{dir}/{file_name.split('_')[0]}"
        for file_name in os.listdir(dir)
        if ".fastq" in file_name and ("_1" in file_name or "_2" in file_name)
    }

    return sorted(sample_set)
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def get_read_count(read_path: str, file_type: str = "fastq") -> int:
    """
    Get the read count of a FASTQ or FASTA file.

    :param read_path: The path to the FASTQ or FASTA file.
    :type read_path: str
    :param file_type: The type of the file, either "fastq" or "fasta". Defaults to "fastq".
    :type file_type: str
    :return: The number of reads in the file.
    :rtype: int
    :raises ValueError: If the file type is not supported or the read count is not a positive integer.
    """
    if file_type == "fasta":
        records = pyfastx.Fasta(read_path, build_index=False)
    elif file_type == "fastq":
        records = pyfastx.Fastq(read_path, build_index=False)
    else:
        raise ValueError(
            f"Invalid file_type {file_type}, it needs to be either 'fasta' or 'fastq'"
        )

    # Records are counted by iteration; no index is built for the file.
    read_count = sum(1 for _ in records)

    if read_count <= 0:
        raise ValueError(f"Read count is not a positive integer: {read_count}")

    return read_count
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
def build_cons_seq(
    cons_list,
    read_count,
    cons_threshold=0.80,
    do_not_include=None,
    counter=1,
    max_line_count=None,
):
    """
    Generate a consensus sequence from a list of per-position base count dictionaries,
    most likely generated by the `build_mcp_cons_dict_list()` function.

    For every position the A/C/G/T counts are turned into proportions, the bases are
    added from most to least frequent until their cumulative proportion reaches
    `cons_threshold`, and the selected base (or the ambiguity code of the selected
    bases) is appended to the consensus.

    :param cons_list: One dictionary per position, mapping a base to its count.
    :param read_count: Denominator of the proportions when `max_line_count` is None.
    :param cons_threshold: Cumulative proportion at which base selection stops.
    :param do_not_include: Positions (numbered from `counter`) that are emitted as "N".
        Such positions do NOT get an entry in the returned conservation list.
    :param counter: Number given to the first position of `cons_list`.
    :param max_line_count: When not None, used as the denominator instead of `read_count`.
    :return: Tuple of (consensus sequence, list with the proportion of the most
        common base at every non-skipped position).
    :rtype: tuple[str, list[float]]
    :raises ZeroDivisionError: If the denominator is 0 and a position has A/C/G/T counts.

    NOTE(review): a position without any A/C/G/T count (with a non-zero denominator)
    looks up the empty string in `_AMBIGUOUS_BASES_DICT_REV` - confirm that mapping
    handles it, otherwise a KeyError is raised.
    """
    if do_not_include is None:
        do_not_include = []

    # The denominator never changes inside the loop, so pick it once.
    denominator = read_count if max_line_count is None else max_line_count

    # Collect the symbols and join once at the end instead of growing a string.
    cons_symbols = []
    cons_confs = []

    for position, count_dict in enumerate(cons_list, start=counter):
        if position in do_not_include:
            cons_symbols.append("N")
            continue

        max_count = 0
        cons_dict = {}

        for base, count in count_dict.items():
            if base not in ("A", "T", "C", "G"):
                continue

            cons_dict[base] = count / denominator

            if count > max_count:
                max_count = count

        try:
            max_prop = max_count / denominator
        except ZeroDivisionError:
            # Only reachable for a zero denominator on a position without
            # A/C/G/T counts: record zero conservation and emit no symbol.
            cons_confs.append(0.0)
            continue

        cons_bases = []
        curr_prop = 0.0
        ranked_bases = sorted(cons_dict.items(), key=lambda x: x[1], reverse=True)

        for base, prop in ranked_bases:
            cons_bases.append(base)
            curr_prop += prop
            if curr_prop >= cons_threshold:
                break

        cons_bases = sorted(cons_bases)

        if len(cons_bases) == 1:
            cons_symbols.append(cons_bases[0])
        else:
            # Several bases were needed: look up their ambiguity code.
            cons_symbols.append(_AMBIGUOUS_BASES_DICT_REV[",".join(cons_bases)])

        cons_confs.append(max_prop)

    return "".join(cons_symbols), cons_confs
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
def primer_regex_query_builder(primer):
    """
    Takes an input nucleotide sequence that can contain IUPAC ambiguous codes.
    Returns a string formatted as a regex query that considers the different
    potential bases valid at a position with an ambiguity code.

    A, C, G and T are copied as they are; any other character is replaced by
    its entry in `_AMBIGUOUS_BASES_DICT`. The result is wrapped as
    "(.*<query>){e<=1}".

    NOTE(review): "{e<=1}" is presumably the fuzzy-matching syntax of the
    third-party `regex` module (at most one error) - confirm with the caller.

    :param primer: Primer sequence, possibly containing ambiguity codes.
    :type primer: str
    :return: The regex query string.
    :rtype: str
    :raises KeyError: If a character is neither A/C/G/T nor a key of `_AMBIGUOUS_BASES_DICT`.
    """
    query = "".join(
        char if char in ("A", "C", "T", "G") else str(_AMBIGUOUS_BASES_DICT[char])
        for char in primer
    )

    return f"(.*{query}){{e<=1}}"
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
def build_mcp_cons_dict_list(mcp_count_dict, mcp_len):
    """
    Generate a list of per-position base count dictionaries for mcp output (mcp_cons_list)
    e.g. [{'A': 90, 'C': 10}, {'T': 100}, ....] for every base position.

    The values are summed counts, not proportions. Prefixes shorter than
    `mcp_len` are ignored.

    :param mcp_count_dict: Mapping of prefix sequence to its count.
    :type mcp_count_dict: dict[str, int]
    :param mcp_len: Number of leading positions to summarise.
    :type mcp_len: int
    :return: One `defaultdict(int)` per position, mapping base to summed count.
    :rtype: list[defaultdict]
    """
    # Filter out the too-short prefixes once instead of once per position.
    usable_mcps = [
        (mcp, count) for mcp, count in mcp_count_dict.items() if len(mcp) >= mcp_len
    ]

    mcp_cons_list = []

    for position in range(mcp_len):
        index_base_dict = defaultdict(int)
        for mcp, count in usable_mcps:
            index_base_dict[mcp[position]] += count
        mcp_cons_list.append(index_base_dict)

    return mcp_cons_list
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
def fetch_mcp(fastq, prefix_len, start=1, rev=False, max_line_count=None):
    """
    Generates the most common prefix sequences along with their counts in a fastq file.
    Outputs dictionary containing counts for each generated MCP in the fastq.

    The file is read as gzip-compressed text and every second line of each
    four-line record is treated as the sequence line.

    :param fastq: Path to the gzip-compressed fastq file.
    :param prefix_len: Length of the prefix to extract from every read.
    :param start: 1-based position of the first base of the prefix.
    :param rev: When True, the read is reversed (not complemented) before the
        prefix is extracted.
    :param max_line_count: Maximum number of reads to process; None means all reads.
    :return: Mapping of prefix to count, ordered from most to least common.
    :rtype: dict[str, int]
    """
    first = start - 1
    last = first + prefix_len

    sequence_counts = Counter()
    reads_seen = 0

    with gzip.open(fastq, "rt") as file:
        for i, line in enumerate(file):
            if i % 4 != 1:
                continue

            # Stop before exceeding the cap; the previous check ran after
            # appending with ">" and so let max_line_count + 1 reads through.
            if max_line_count is not None and reads_seen >= max_line_count:
                break

            sequence = line.strip()
            if rev:
                sequence = sequence[::-1]

            sequence_counts[sequence[first:last]] += 1
            reads_seen += 1

    # most_common() sorts by descending count and keeps first-seen order for ties.
    return dict(sequence_counts.most_common())
|
|
@@ -1,164 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: utf-8 -*-
|
|
3
|
-
|
|
4
|
-
# Copyright 2024-2025 EMBL - European Bioinformatics Institute
|
|
5
|
-
#
|
|
6
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
-
# you may not use this file except in compliance with the License.
|
|
8
|
-
# You may obtain a copy of the License at
|
|
9
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
-
#
|
|
11
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
-
# See the License for the specific language governing permissions and
|
|
15
|
-
# limitations under the License.
|
|
16
|
-
|
|
17
|
-
import argparse
|
|
18
|
-
|
|
19
|
-
import numpy as np
|
|
20
|
-
|
|
21
|
-
from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
|
|
22
|
-
get_read_count,
|
|
23
|
-
build_cons_seq,
|
|
24
|
-
build_mcp_cons_dict_list,
|
|
25
|
-
fetch_mcp,
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def parse_args(argv=None):
    """
    Parse the command line options of the primer presence check.

    :param argv: Argument list to parse; None lets argparse read `sys.argv`.
    :return: Tuple of (input fastq path, sample ID, output path).
    :rtype: tuple[str, str, str]
    """
    # All three options are mandatory strings and differ only in flags and help.
    required_options = (
        ("-i", "--input", "Path to fastq file to check for primers"),
        ("-s", "--sample", "Sample ID"),
        ("-o", "--output", "Output path"),
    )

    parser = argparse.ArgumentParser()
    for short_flag, long_flag, help_text in required_options:
        parser.add_argument(
            short_flag, long_flag, required=True, type=str, help=help_text
        )

    parsed = parser.parse_args(argv)

    return parsed.input, parsed.sample, parsed.output
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
def are_there_primers_in_this_sample(path, rev=False):
    """
    Predict the presence of primers based on windows of base conservation.

    Takes a fastq file as input and extracts the proportion of the most common
    base for the first 100 bases (with rev=True, the first 100 bases of the
    reversed reads). A threshold is derived from the third quartile (Q3) of
    these proportions: Q3 - 0.15, or 0.75 when Q3 is below 0.75. When Q3 is
    below 0.6 no primer is reported at all.

    The number of positions below the threshold is then counted in windows of
    10 bases. A primer is flagged when at least one of the first two windows
    contains at most one such position, or when the first two windows together
    contain at most 4 of them.

    :param path: Path to the fastq file.
    :param rev: Whether to look at the reversed reads.
    :return: True if a primer was identified, False otherwise.
    :rtype: bool
    """
    mcp_len = 100  # number of leading bases that are inspected
    window_size = 10

    total_reads = get_read_count(path, file_type="fastq")
    # key: most common prefix, value: its count
    prefix_counts = fetch_mcp(path, mcp_len, rev=rev)
    per_position_counts = build_mcp_cons_dict_list(prefix_counts, mcp_len)
    # only the per-position conservation values are needed here
    _, cons_confs = build_cons_seq(per_position_counts, total_reads)

    upper_quartile = np.quantile(cons_confs, 0.75)

    # Too little conservation overall: no primer.
    if upper_quartile < 0.6:
        return False

    threshold = 0.75 if upper_quartile < 0.75 else upper_quartile - 0.15

    # A window's count is stored when the first position of the next window
    # is reached, so the final window is never stored.
    window_count_list = []
    below_threshold = 0
    for position, conservation in enumerate(cons_confs):
        if position and position % window_size == 0:
            window_count_list.append(below_threshold)
            below_threshold = 0

        if conservation < threshold:
            below_threshold += 1

    first_two = window_count_list[:2]

    return any(count <= 1 for count in first_two) or sum(first_two) <= 4
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
def save_out(results, sample_id, output):
    """
    Write the primer presence flags to "<output>/<sample_id>_general_primer_out.txt".

    1: primer exists
    0: primer doesn't exist

    The forward strand flag goes on the first line, the reverse strand flag
    on the second line.

    :param results: Indexable holding the forward flag at 0 and the reverse flag at 1.
    :param sample_id: Sample ID used as prefix of the output file name.
    :param output: Directory the file is written to.
    """
    out_path = f"{output}/{sample_id}_general_primer_out.txt"

    with open(out_path, "w") as handle:
        for strand_index in (0, 1):
            handle.write(f"{results[strand_index]}\n")
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
def main(argv=None):
    """
    Check a fastq file for primers on both strands and save the two flags.

    :param argv: Argument list forwarded to `parse_args`; None means `sys.argv`.
    """
    path, sample, output = parse_args(argv)

    # Check for general primers in fwd, then in rev
    fwd_primer_flag = are_there_primers_in_this_sample(path)
    rev_primer_flag = are_there_primers_in_this_sample(path, rev=True)

    # Flag for primer presence: 1 for yes 0 for no. Both are always ints now
    # (they used to be "0" or 1); the text written to the file is unchanged.
    fwd_status = 1 if fwd_primer_flag else 0
    rev_status = 1 if rev_primer_flag else 0

    if fwd_primer_flag:
        print("Forward primer detected!")
    else:
        print("No forward primer detected")
    if rev_primer_flag:
        print("Reverse primer detected!")
    else:
        print("No reverse primer detected")

    save_out((fwd_status, rev_status), sample, output)  # Save primer flags to .txt file


if __name__ == "__main__":
    main()
|