mgnify-pipelines-toolkit 1.1.2__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (62) hide show
  1. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/PKG-INFO +6 -7
  2. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +24 -27
  3. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -4
  4. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +6 -7
  5. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +0 -7
  6. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +0 -5
  7. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit.egg-info/requires.txt +1 -2
  8. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/pyproject.toml +3 -9
  9. mgnify_pipelines_toolkit-1.1.2/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -221
  10. mgnify_pipelines_toolkit-1.1.2/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -164
  11. mgnify_pipelines_toolkit-1.1.2/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -214
  12. mgnify_pipelines_toolkit-1.1.2/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -175
  13. mgnify_pipelines_toolkit-1.1.2/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -111
  14. mgnify_pipelines_toolkit-1.1.2/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -327
  15. mgnify_pipelines_toolkit-1.1.2/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -43
  16. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/LICENSE +0 -0
  17. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/README.md +0 -0
  18. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/__init__.py +0 -0
  19. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  20. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  21. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
  22. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
  23. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
  24. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  25. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  26. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py +0 -0
  27. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +0 -0
  28. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +0 -0
  29. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +0 -0
  30. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +0 -0
  31. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +0 -0
  32. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +0 -0
  33. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +0 -0
  34. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +0 -0
  35. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +0 -0
  36. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +0 -0
  37. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +0 -0
  38. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +0 -0
  39. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +0 -0
  40. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +0 -0
  41. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/genomes/__init__.py +0 -0
  42. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  43. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +0 -0
  44. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +0 -0
  45. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
  46. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
  47. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  48. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  49. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  50. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +0 -0
  51. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/constants/db_labels.py +0 -0
  52. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/constants/ncrna.py +0 -0
  53. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  54. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/constants/tax_ranks.py +0 -0
  55. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  56. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/schemas/schemas.py +0 -0
  57. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  58. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
  59. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  60. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  61. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  62. {mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 1.1.2
3
+ Version: 1.2.1
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -8,22 +8,21 @@ Keywords: bioinformatics,pipelines,metagenomics
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: License :: OSI Approved :: Apache Software License
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.10
11
+ Requires-Python: >=3.11
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: biopython>=1.85
15
15
  Requires-Dist: numpy<3,>=2.2.4
16
16
  Requires-Dist: pandas<3,>=2.2.3
17
- Requires-Dist: regex>=2024.11.6
18
17
  Requires-Dist: requests<3,>=2.32.3
19
18
  Requires-Dist: click<9,>=8.1.8
20
19
  Requires-Dist: pandera<0.24,>=0.23.1
21
20
  Requires-Dist: pyfastx<3,>=2.2.0
22
21
  Requires-Dist: intervaltree<4,>=3.1.0
23
- Provides-Extra: tests
24
- Requires-Dist: pytest<9,>=8.3.5; extra == "tests"
25
- Requires-Dist: pytest-md>=0.2.0; extra == "tests"
26
- Requires-Dist: pytest-workflow==2.1.0; extra == "tests"
22
+ Provides-Extra: test
23
+ Requires-Dist: pytest<9,>=8.3.5; extra == "test"
24
+ Requires-Dist: pytest-md>=0.2.0; extra == "test"
25
+ Requires-Dist: pytest-workflow==2.1.0; extra == "test"
27
26
  Provides-Extra: dev
28
27
  Requires-Dist: pre-commit>=4.2.0; extra == "dev"
29
28
  Requires-Dist: black>=25.1.0; extra == "dev"
@@ -22,7 +22,6 @@ import pandas as pd
22
22
 
23
23
 
24
24
  def parse_args():
25
-
26
25
  parser = argparse.ArgumentParser()
27
26
  parser.add_argument(
28
27
  "-i", "--input", required=True, type=str, help="Input JSON from antiSMASH"
@@ -64,7 +63,6 @@ def main():
64
63
  region_name = None
65
64
 
66
65
  for feature in record["features"]:
67
-
68
66
  if feature["type"] == "region":
69
67
  # Annotate region features
70
68
  region_name = (
@@ -129,35 +127,34 @@ def main():
129
127
  cds_by_protocluster = record["modules"][
130
128
  "antismash.detection.hmm_detection"
131
129
  ]["rule_results"]["cds_by_protocluster"]
132
- if len(cds_by_protocluster) > 0:
133
- for feature in cds_by_protocluster[0][1]:
134
- if "cds_name" in feature.keys():
135
- locus_tag = feature["cds_name"]
136
- as_clusters = ",".join(
137
- list(feature["definition_domains"].keys())
130
+
131
+ if not cds_by_protocluster:
132
+ continue
133
+
134
+ for feature in cds_by_protocluster[0][1]:
135
+ if locus_tag := feature.get("cds_name"):
136
+ as_clusters = ",".join(list(feature["definition_domains"].keys()))
137
+ if locus_tag in attributes_dict:
138
+ attributes_dict[locus_tag].update(
139
+ {"as_gene_clusters": as_clusters}
138
140
  )
139
- if locus_tag in attributes_dict.keys():
140
- attributes_dict[locus_tag].update(
141
- {"as_gene_clusters": as_clusters}
142
- )
143
141
 
144
142
  if "antismash.detection.genefunctions" in record["modules"].keys():
145
- for tool in record["modules"]["antismash.detection.genefunctions"]["tools"]:
146
- if tool["tool"] == "smcogs":
147
- for locus_tag in tool["best_hits"]:
148
- hit_id = tool["best_hits"][locus_tag]["hit_id"].split(":")[0]
149
- hit_desc = (
150
- tool["best_hits"][locus_tag]["hit_id"]
151
- .split(":")[1]
152
- .replace(" ", "_")
153
- )
154
- score = tool["best_hits"][locus_tag]["bitscore"]
155
- e_value = tool["best_hits"][locus_tag]["evalue"]
143
+ gene_function_tools = record["modules"][
144
+ "antismash.detection.genefunctions"
145
+ ]["tools"]
146
+ if tool_data := gene_function_tools.get("smcogs"):
147
+
148
+ for locus_tag in tool_data["best_hits"]:
149
+ smcog_id = tool_data["best_hits"][locus_tag]["reference_id"]
150
+ smcog_description = tool_data["best_hits"][locus_tag]["description"]
151
+
152
+ score = tool_data["best_hits"][locus_tag]["bitscore"]
153
+ e_value = tool_data["best_hits"][locus_tag]["evalue"]
156
154
 
157
- smcog_note = f"smCOG:{hit_id}:{hit_desc.replace(' ', '_')}(Score:{score}%3BE-value:{e_value})"
158
- if locus_tag in attributes_dict.keys():
159
- attributes_dict[locus_tag].update({"as_notes": smcog_note})
160
- break
155
+ smcog_note = f"smCOG:{smcog_id}:{smcog_description.replace(' ', '_')}(Score:{score}%3BE-value:{e_value})"
156
+ if locus_tag in attributes_dict.keys():
157
+ attributes_dict[locus_tag].update({"as_notes": smcog_note})
161
158
 
162
159
  attributes = [
163
160
  ";".join(f"{k}={v}" for k, v in attrib_data.items() if v)
@@ -14,9 +14,6 @@
14
14
  # See the License for the specific language governing permissions and
15
15
  # limitations under the License.
16
16
 
17
- # used by fetch_mcp in analysis.amplicon
18
- MCP_MAX_LINE_COUNT = 300_000
19
-
20
17
  # used by classify_var_regions in analysis.amplicon
21
18
  MIN_OVERLAP = 0.95
22
19
  MIN_SEQ_COUNT = 5000
@@ -26,7 +23,6 @@ MAX_INTERNAL_PRIMER_PROPORTION = 0.2
26
23
  # used by library_strategy_checker in analysis.shared
27
24
  MIN_AMPLICON_STRATEGY_CHECK = 0.30
28
25
 
29
-
30
26
  # used by markergene_study_summary in analysis.shared
31
27
  MAJORITY_MARKER_PROPORTION = 0.45
32
28
  # used by gff_toolkit in analysis.assembly
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 1.1.2
3
+ Version: 1.2.1
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -8,22 +8,21 @@ Keywords: bioinformatics,pipelines,metagenomics
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: License :: OSI Approved :: Apache Software License
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.10
11
+ Requires-Python: >=3.11
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: biopython>=1.85
15
15
  Requires-Dist: numpy<3,>=2.2.4
16
16
  Requires-Dist: pandas<3,>=2.2.3
17
- Requires-Dist: regex>=2024.11.6
18
17
  Requires-Dist: requests<3,>=2.32.3
19
18
  Requires-Dist: click<9,>=8.1.8
20
19
  Requires-Dist: pandera<0.24,>=0.23.1
21
20
  Requires-Dist: pyfastx<3,>=2.2.0
22
21
  Requires-Dist: intervaltree<4,>=3.1.0
23
- Provides-Extra: tests
24
- Requires-Dist: pytest<9,>=8.3.5; extra == "tests"
25
- Requires-Dist: pytest-md>=0.2.0; extra == "tests"
26
- Requires-Dist: pytest-workflow==2.1.0; extra == "tests"
22
+ Provides-Extra: test
23
+ Requires-Dist: pytest<9,>=8.3.5; extra == "test"
24
+ Requires-Dist: pytest-md>=0.2.0; extra == "test"
25
+ Requires-Dist: pytest-workflow==2.1.0; extra == "test"
27
26
  Provides-Extra: dev
28
27
  Requires-Dist: pre-commit>=4.2.0; extra == "dev"
29
28
  Requires-Dist: black>=25.1.0; extra == "dev"
@@ -9,18 +9,12 @@ mgnify_pipelines_toolkit.egg-info/entry_points.txt
9
9
  mgnify_pipelines_toolkit.egg-info/requires.txt
10
10
  mgnify_pipelines_toolkit.egg-info/top_level.txt
11
11
  mgnify_pipelines_toolkit/analysis/__init__.py
12
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py
13
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py
14
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py
15
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py
16
12
  mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py
17
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py
18
13
  mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
19
14
  mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
20
15
  mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py
21
16
  mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
22
17
  mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
23
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py
24
18
  mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py
25
19
  mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py
26
20
  mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py
@@ -49,7 +43,6 @@ mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py
49
43
  mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py
50
44
  mgnify_pipelines_toolkit/constants/db_labels.py
51
45
  mgnify_pipelines_toolkit/constants/ncrna.py
52
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py
53
46
  mgnify_pipelines_toolkit/constants/regex_fasta_header.py
54
47
  mgnify_pipelines_toolkit/constants/tax_ranks.py
55
48
  mgnify_pipelines_toolkit/constants/thresholds.py
@@ -2,17 +2,13 @@
2
2
  add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main
3
3
  amplicon_study_summary_generator = mgnify_pipelines_toolkit.analysis.amplicon.study_summary_generator:cli
4
4
  antismash_gff_builder = mgnify_pipelines_toolkit.analysis.assembly.antismash_gff_builder:main
5
- are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
6
5
  assembly_study_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.study_summary_generator:cli
7
- assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
8
- assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
9
6
  classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
10
7
  combined_gene_caller_merge = mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main
11
8
  convert_cmscan_to_cmsearch_tblout = mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main
12
9
  dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:main
13
10
  fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
14
11
  fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
15
- find_mcp_inflection_points = mgnify_pipelines_toolkit.analysis.amplicon.find_mcp_inflection_points:main
16
12
  generate_gaf = mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main
17
13
  genomes_extract_bacterial_rrnas_as_tsv = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_bacterial_rrnas_as_tsv:main
18
14
  genomes_extract_rrnas_as_fasta = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_rrnas_as_fasta:main
@@ -32,7 +28,6 @@ process_dbcan_cazys = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_r
32
28
  process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_clusters:main
33
29
  remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
34
30
  rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
35
- standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
36
31
  summarise_antismash_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_antismash_bgcs:main
37
32
  summarise_goslims = mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims:main
38
33
  summarise_sanntis_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_sanntis_bgcs:main
@@ -1,7 +1,6 @@
1
1
  biopython>=1.85
2
2
  numpy<3,>=2.2.4
3
3
  pandas<3,>=2.2.3
4
- regex>=2024.11.6
5
4
  requests<3,>=2.32.3
6
5
  click<9,>=8.1.8
7
6
  pandera<0.24,>=0.23.1
@@ -14,7 +13,7 @@ black>=25.1.0
14
13
  flake8>=7.1.2
15
14
  pep8-naming>=0.14.1
16
15
 
17
- [tests]
16
+ [test]
18
17
  pytest<9,>=8.3.5
19
18
  pytest-md>=0.2.0
20
19
  pytest-workflow==2.1.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "mgnify_pipelines_toolkit"
3
- version = "1.1.2"
3
+ version = "1.2.1"
4
4
  readme = "README.md"
5
5
  license = { text = "Apache Software License 2.0" }
6
6
  authors = [
@@ -8,7 +8,7 @@ authors = [
8
8
  ]
9
9
  keywords = ["bioinformatics", "pipelines", "metagenomics"]
10
10
  description = "Collection of scripts and tools for MGnify pipelines"
11
- requires-python = ">=3.10"
11
+ requires-python = ">=3.11"
12
12
  classifiers = [
13
13
  "Programming Language :: Python :: 3",
14
14
  "License :: OSI Approved :: Apache Software License",
@@ -19,7 +19,6 @@ dependencies = [
19
19
  "biopython>=1.85",
20
20
  "numpy>=2.2.4,<3",
21
21
  "pandas>=2.2.3,<3",
22
- "regex>=2024.11.6",
23
22
  "requests>=2.32.3,<3",
24
23
  "click>=8.1.8,<9",
25
24
  "pandera>=0.23.1,<0.24",
@@ -54,15 +53,10 @@ markergene_study_summary = "mgnify_pipelines_toolkit.analysis.shared.markergene_
54
53
  convert_cmscan_to_cmsearch_tblout = "mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main"
55
54
  dwc_summary_generator = "mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:main"
56
55
  # analysis.amplicon #
57
- are_there_primers = "mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main"
58
- assess_inflection_point_mcp = "mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main"
59
- assess_mcp_proportions = "mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main"
60
56
  classify_var_regions = "mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main"
61
- find_mcp_inflection_points = "mgnify_pipelines_toolkit.analysis.amplicon.find_mcp_inflection_points:main"
62
57
  make_asv_count_table = "mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main"
63
58
  remove_ambiguous_reads = "mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main"
64
59
  rev_comp_se_primers = "mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main"
65
- standard_primer_matching = "mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main"
66
60
  mapseq_to_asv_table = "mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main"
67
61
  primer_val_classification = "mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main"
68
62
  amplicon_study_summary_generator = "mgnify_pipelines_toolkit.analysis.amplicon.study_summary_generator:cli"
@@ -89,7 +83,7 @@ fasta_to_delimited = "mgnify_pipelines_toolkit.utils.fasta_to_delimited:main"
89
83
  get_mpt_version = "mgnify_pipelines_toolkit.utils.get_mpt_version:main"
90
84
 
91
85
  [project.optional-dependencies]
92
- tests = [
86
+ test = [
93
87
  "pytest>=8.3.5,<9",
94
88
  "pytest-md>=0.2.0",
95
89
  "pytest-workflow==2.1.0",
@@ -1,221 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- from collections import defaultdict, Counter
18
- import logging
19
- import gzip
20
- import os
21
- import pyfastx
22
-
23
- from mgnify_pipelines_toolkit.constants.regex_ambiguous_bases import (
24
- _AMBIGUOUS_BASES_DICT,
25
- _AMBIGUOUS_BASES_DICT_REV,
26
- )
27
-
28
- logging.basicConfig(level=logging.DEBUG)
29
-
30
-
31
- def split_dir_into_sample_paths(dir):
32
- file_list = os.listdir(dir)
33
- file_list = [
34
- file
35
- for file in file_list
36
- if ".fastq" in file and ("_1" in file or "_2" in file)
37
- ]
38
- sample_set = set()
39
- [sample_set.add(f"{dir}/{file.split('_')[0]}") for file in file_list]
40
- sample_list = sorted(list(sample_set))
41
-
42
- return sample_list
43
-
44
-
45
- def get_read_count(read_path: str, file_type: str = "fastq") -> int:
46
- """
47
- Get the read count of a FASTQ or FASTA file.
48
-
49
- :param read_path: The path to the FASTQ or FASTA file.
50
- :type read_path: str
51
- :param file_type: The type of the file, either "fastq" or "fasta". Defaults to "fastq".
52
- :type file_type: str
53
- :return: The number of reads in the file.
54
- :rtype: int
55
- :raises ValueError: If the file type is not supported or the read count is not a positive integer.
56
- """
57
- read_count = 0
58
-
59
- if file_type == "fasta":
60
- fasta = pyfastx.Fasta(read_path, build_index=False)
61
- read_count = sum(1 for _ in fasta)
62
- elif file_type == "fastq":
63
- fastq = pyfastx.Fastq(read_path, build_index=False)
64
- read_count = sum(1 for _ in fastq)
65
- else:
66
- raise ValueError(
67
- f"Invalid file_type {file_type}, it needs to be either 'fasta' or 'fastq'"
68
- )
69
-
70
- if read_count <= 0:
71
- raise ValueError(f"Read count is not a positive integer: {read_count}")
72
-
73
- return read_count
74
-
75
-
76
- def build_cons_seq(
77
- cons_list,
78
- read_count,
79
- cons_threshold=0.80,
80
- do_not_include=None,
81
- counter=1,
82
- max_line_count=None,
83
- ):
84
- """
85
- Generate consensus sequence using a list of base conservation dictionaries most likely
86
- generated by the `build_mcp_cons_dict_list()` function.
87
- Also returns a list containing the conservation value of the most conserved base at every
88
- position in the list of base conservation dictionaries.
89
- """
90
-
91
- cons_seq = ""
92
- cons_confs = []
93
-
94
- if do_not_include is None:
95
- do_not_include = []
96
-
97
- for count_dict in cons_list:
98
- max_count = 0
99
- cons_dict = defaultdict(float)
100
-
101
- if counter in do_not_include:
102
- counter += 1
103
- cons_seq += "N"
104
- continue
105
-
106
- for base, count in count_dict.items():
107
- if base not in ("A", "T", "C", "G"):
108
- continue
109
-
110
- if max_line_count is None:
111
- cons_dict[base] = count / read_count
112
- else:
113
- cons_dict[base] = count / max_line_count
114
-
115
- if count > max_count:
116
- max_count = count
117
-
118
- counter += 1
119
-
120
- try:
121
- if max_line_count is None:
122
- max_prop = max_count / read_count
123
- else:
124
- max_prop = max_count / max_line_count
125
-
126
- cons_bases = []
127
- curr_prop = 0.0
128
- sorted_cons_dict = dict(
129
- sorted(cons_dict.items(), key=lambda x: x[1], reverse=True)
130
- )
131
-
132
- for base, prop in sorted_cons_dict.items():
133
- cons_bases.append(base)
134
- curr_prop += prop
135
- if curr_prop >= cons_threshold:
136
- break
137
-
138
- cons_bases = sorted(cons_bases)
139
-
140
- if len(cons_bases) == 1:
141
- cons_seq += cons_bases[0]
142
- else:
143
- amb_string = ",".join(cons_bases)
144
- amb_base = _AMBIGUOUS_BASES_DICT_REV[amb_string]
145
- cons_seq += amb_base
146
-
147
- except ZeroDivisionError:
148
- max_prop = 0.0
149
-
150
- cons_confs.append(max_prop)
151
-
152
- return cons_seq, cons_confs
153
-
154
-
155
- def primer_regex_query_builder(primer):
156
- """
157
- Takes an input nucleotide sequence that can contain IUPAC ambiguous codes
158
- Returns a string formatted as a regex query that considers the different
159
- potential bases valid at a position with an ambiguity code.
160
- """
161
-
162
- query = ""
163
-
164
- for char in primer:
165
- if char in ("A", "C", "T", "G"):
166
- query += char
167
- else:
168
- query += str(_AMBIGUOUS_BASES_DICT[char])
169
-
170
- query = f"(.*{query}){{e<=1}}"
171
-
172
- return query
173
-
174
-
175
- def build_mcp_cons_dict_list(mcp_count_dict, mcp_len):
176
- """
177
- Generate list of dictionaries of base conservation for mcp output (mcp_cons_list)
178
- e.g. [{'A':0.9, 'C':0.1}, {'T':1.0}, ....] for every base position
179
- """
180
-
181
- mcp_cons_list = []
182
-
183
- for i in range(mcp_len):
184
- index_base_dict = defaultdict(int)
185
- for mcp in mcp_count_dict.keys():
186
- if len(mcp) < mcp_len:
187
- continue
188
- base = mcp[i]
189
- index_base_dict[base] += mcp_count_dict[mcp]
190
- mcp_cons_list.append(index_base_dict)
191
-
192
- return mcp_cons_list
193
-
194
-
195
- def fetch_mcp(fastq, prefix_len, start=1, rev=False, max_line_count=None):
196
- """
197
- Generates the most common prefix sequences along with their counts in a fastq file.
198
- Outputs dictionary containing counts for each generated MCP in the fastq.
199
- """
200
-
201
- selected_lines = []
202
-
203
- with gzip.open(fastq, "rt") as file:
204
- for i, line in enumerate(file):
205
- line = line.strip()
206
- if i % 4 == 1:
207
- if not rev:
208
- selected_lines.append(line[start - 1 : start + prefix_len - 1])
209
- else:
210
- rev_line = line[::-1]
211
- selected_lines.append(rev_line[start - 1 : start + prefix_len - 1])
212
- if max_line_count is not None:
213
- if len(selected_lines) > max_line_count:
214
- break
215
-
216
- sequence_counts = Counter(selected_lines)
217
- mcp_count_dict = dict(
218
- sorted(sequence_counts.items(), key=lambda x: x[1], reverse=True)
219
- )
220
-
221
- return mcp_count_dict
@@ -1,164 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
-
4
- # Copyright 2024-2025 EMBL - European Bioinformatics Institute
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- import argparse
18
-
19
- import numpy as np
20
-
21
- from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
22
- get_read_count,
23
- build_cons_seq,
24
- build_mcp_cons_dict_list,
25
- fetch_mcp,
26
- )
27
-
28
-
29
- def parse_args(argv=None):
30
- parser = argparse.ArgumentParser()
31
-
32
- parser.add_argument(
33
- "-i",
34
- "--input",
35
- required=True,
36
- type=str,
37
- help="Path to fastq file to check for primers",
38
- )
39
- parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
40
- parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
41
- args = parser.parse_args(argv)
42
-
43
- path = args.input
44
- sample = args.sample
45
- output = args.output
46
-
47
- return path, sample, output
48
-
49
-
50
- def are_there_primers_in_this_sample(path, rev=False):
51
- """
52
- Predict the presence of primers based on windows of base conservation.
53
-
54
- Takes a fastq file as input. Extracts proportion of most common base for the first 100 bases.
55
- Computes a threshold (Q3 - 0.15) based on this proportion and counts the number of bases below
56
- it in windows of 10 bases.
57
- If at least one of the first two windows contains at most one such base, then the presence of a primer is flagged as true.
58
- A primer is also flagged as true if the combined count of bases below the threshold in the first two windows is at most 4.
59
-
60
- The output of this function is a boolean flag:
61
- True if a primer was identified
62
- False if a primer was not identified
63
- """
64
-
65
- read_count = get_read_count(
66
- path, file_type="fastq"
67
- ) # Get read count for fastq file
68
- mcp_len = 100 # Script will look at first 100 base mcps (for rev=True, it will look at first 100 from 3' to 5')
69
-
70
- mcp_count_dict = fetch_mcp(
71
- path, mcp_len, rev=rev
72
- ) # mcp dict where key is the mcp and value is the count
73
- mcp_cons_list = build_mcp_cons_dict_list(
74
- mcp_count_dict, mcp_len
75
- ) # list of base conservation dicts for mcps
76
- cons_seq, cons_confs = build_cons_seq(
77
- mcp_cons_list, read_count
78
- ) # get list of max base conservations for each index
79
-
80
- window_size = 10
81
- # Counter that will reset to 0 every 10 bases
82
- window_count = 0
83
- # Will append the window count to this list every 10 bases
84
- window_count_list = []
85
- # Compute Q3-based threshold
86
- max_cons = np.quantile(cons_confs, 0.75)
87
- threshold = max_cons - 0.15
88
-
89
- if max_cons < 0.75:
90
- threshold = 0.75
91
- # Immediately return false (no primer) if the max conservation is less than 0.6
92
- if max_cons < 0.6:
93
- return False
94
-
95
- # Loop through every base
96
- for i, val in enumerate(cons_confs):
97
- if i % window_size == 0 and i != 0: # After looping through a window..
98
- window_count_list.append(window_count) # ..append window count
99
- window_count = 0 # ..reset window count
100
-
101
- if (
102
- val < threshold
103
- ): # If the conservation at i is less than threshold, increment count for the window
104
- window_count += 1
105
-
106
- primer_flag = False # Initialise primer flag as false
107
-
108
- if (
109
- 1 in window_count_list[:2] or 0 in window_count_list[:2]
110
- ): # If window count is at most 1 of first two windows...
111
- primer_flag = True # ..primer flag is true
112
- elif (
113
- sum(window_count_list[:2]) <= 4
114
- ): # If sum of window counts of the first two windows is at most 4..
115
- primer_flag = True # ..primer flag is true
116
-
117
- return primer_flag
118
-
119
-
120
- def save_out(results, sample_id, output):
121
- """
122
- Save primer presence flags into output .txt file.
123
-
124
- 1: primer exists
125
- 0: primer doesn't exist
126
-
127
- First line will be the forward strand
128
- Second line will be the reverse strand
129
- """
130
-
131
- with open(f"{output}/{sample_id}_general_primer_out.txt", "w") as fw:
132
- fw.write(f"{results[0]}\n")
133
- fw.write(f"{results[1]}\n")
134
-
135
-
136
- def main(argv=None):
137
- path, sample, output = parse_args(argv)
138
-
139
- fwd_primer_flag = are_there_primers_in_this_sample(
140
- path
141
- ) # Check for general primers in fwd
142
- rev_primer_flag = are_there_primers_in_this_sample(
143
- path, rev=True
144
- ) # Check for general primers in rev
145
-
146
- fwd_status = "0"
147
- rev_status = "0"
148
- # Flag for primer presence: 1 for yes 0 for no
149
- if fwd_primer_flag:
150
- print("Forward primer detected!")
151
- fwd_status = 1
152
- else:
153
- print("No forward primer detected")
154
- if rev_primer_flag:
155
- print("Reverse primer detected!")
156
- rev_status = 1
157
- else:
158
- print("No reverse primer detected")
159
-
160
- save_out((fwd_status, rev_status), sample, output) # Save primer flags to .txt file
161
-
162
-
163
- if __name__ == "__main__":
164
- main()