mgnify-pipelines-toolkit 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (52) hide show
  1. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/PKG-INFO +3 -1
  2. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +30 -37
  3. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +3 -3
  4. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +3 -1
  5. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +1 -1
  6. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +41 -38
  7. mgnify_pipelines_toolkit-0.2.2/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +829 -0
  8. mgnify_pipelines_toolkit-0.2.2/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +82 -0
  9. mgnify_pipelines_toolkit-0.2.2/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +170 -0
  10. mgnify_pipelines_toolkit-0.2.2/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +243 -0
  11. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/constants/thresholds.py +7 -0
  12. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +3 -1
  13. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +4 -0
  14. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +2 -1
  15. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit.egg-info/requires.txt +2 -0
  16. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/pyproject.toml +7 -4
  17. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/LICENSE +0 -0
  18. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/README.md +0 -0
  19. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/__init__.py +0 -0
  20. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  21. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -0
  22. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  23. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
  24. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
  25. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
  26. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
  27. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  28. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  29. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +0 -0
  30. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/assembly/cgc_merge.py +0 -0
  31. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +0 -0
  32. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +0 -0
  33. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +0 -0
  34. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  35. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
  36. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
  37. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  38. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  39. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  40. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +0 -0
  41. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/constants/db_labels.py +0 -0
  42. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
  43. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  44. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/constants/tax_ranks.py +0 -0
  45. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  46. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/schemas/schemas.py +0 -0
  47. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  48. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
  49. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  50. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  51. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  52. {mgnify_pipelines_toolkit-0.2.0 → mgnify_pipelines_toolkit-0.2.2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -18,6 +18,7 @@ Requires-Dist: regex==2023.12.25
18
18
  Requires-Dist: requests==2.32.3
19
19
  Requires-Dist: click==8.1.7
20
20
  Requires-Dist: pandera==0.22.1
21
+ Requires-Dist: pyfastx>=2.2.0
21
22
  Provides-Extra: tests
22
23
  Requires-Dist: pytest==7.4.0; extra == "tests"
23
24
  Requires-Dist: pytest-md==0.2.0; extra == "tests"
@@ -29,6 +30,7 @@ Requires-Dist: regex==2023.12.25; extra == "tests"
29
30
  Requires-Dist: requests==2.32.3; extra == "tests"
30
31
  Requires-Dist: click==8.1.7; extra == "tests"
31
32
  Requires-Dist: pandera==0.22.1; extra == "tests"
33
+ Requires-Dist: pyfastx>=2.2.0; extra == "tests"
32
34
  Provides-Extra: dev
33
35
  Requires-Dist: mgnify_pipelines_toolkit[tests]; extra == "dev"
34
36
  Requires-Dist: pre-commit==3.8.0; extra == "dev"
@@ -18,7 +18,7 @@ from collections import defaultdict, Counter
18
18
  import logging
19
19
  import gzip
20
20
  import os
21
- import subprocess
21
+ import pyfastx
22
22
 
23
23
  from mgnify_pipelines_toolkit.constants.regex_ambiguous_bases import (
24
24
  _AMBIGUOUS_BASES_DICT,
@@ -29,7 +29,6 @@ logging.basicConfig(level=logging.DEBUG)
29
29
 
30
30
 
31
31
  def split_dir_into_sample_paths(dir):
32
-
33
32
  file_list = os.listdir(dir)
34
33
  file_list = [
35
34
  file
@@ -43,42 +42,33 @@ def split_dir_into_sample_paths(dir):
43
42
  return sample_list
44
43
 
45
44
 
46
- def get_read_count(read_path, type="fastq"):
47
-
48
- cmd = []
49
- stdout = ""
50
-
51
- if type == "fastq":
52
- cmd = ["zcat", read_path]
53
- zcat_proc = subprocess.Popen(
54
- cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
55
- )
56
-
57
- cmd = ["wc", "-l"]
58
- wc_proc = subprocess.Popen(
59
- cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE
60
- )
61
- stdout, stderr = wc_proc.communicate()
62
-
63
- elif type == "fasta":
64
- cmd = ["grep", "-c", "^>", read_path]
65
- grep_proc = subprocess.Popen(
66
- cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
67
- )
68
- stdout, stderr = grep_proc.communicate()
69
-
70
- read_count = stdout.strip() if stdout is not None else ""
71
-
72
- if not read_count.isdigit():
73
- logging.error(
74
- f"Read count is not a digit, something is wrong. stdout: '{stdout}', stderr: '{stderr}'"
45
+ def get_read_count(read_path: str, file_type: str = "fastq") -> int:
46
+ """
47
+ Get the read count of a FASTQ or FASTA file.
48
+
49
+ :param read_path: The path to the FASTQ or FASTA file.
50
+ :type read_path: str
51
+ :param file_type: The type of the file, either "fastq" or "fasta". Defaults to "fastq".
51
+ :type file_type: str
53
+ :return: The number of reads in the file.
54
+ :rtype: int
55
+ :raises ValueError: If the file type is not supported or the read count is not a positive integer.
56
+ """
57
+ read_count = 0
58
+
59
+ if file_type == "fasta":
60
+ fasta = pyfastx.Fasta(read_path, build_index=False)
61
+ read_count = sum(1 for _ in fasta)
62
+ elif file_type == "fastq":
63
+ fastq = pyfastx.Fastq(read_path, build_index=False)
64
+ read_count = sum(1 for _ in fastq)
65
+ else:
66
+ raise ValueError(
67
+ f"Invalid file_type {file_type}, it needs to be either 'fasta' or 'fastq'"
75
68
  )
76
- exit(1)
77
69
 
78
- read_count = int(read_count)
79
-
80
- if type == "fastq":
81
- read_count /= 4
70
+ if read_count <= 0:
71
+ raise ValueError(f"Read count is not a positive integer: {read_count}")
82
72
 
83
73
  return read_count
84
74
 
@@ -128,7 +118,10 @@ def build_cons_seq(
128
118
  counter += 1
129
119
 
130
120
  try:
131
- max_prop = max_count / read_count
121
+ if max_line_count is None:
122
+ max_prop = max_count / read_count
123
+ else:
124
+ max_prop = max_count / max_line_count
132
125
 
133
126
  cons_bases = []
134
127
  curr_prop = 0.0
@@ -27,7 +27,6 @@ from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
27
27
 
28
28
 
29
29
  def parse_args(argv=None):
30
-
31
30
  parser = argparse.ArgumentParser()
32
31
 
33
32
  parser.add_argument(
@@ -63,7 +62,9 @@ def are_there_primers_in_this_sample(path, rev=False):
63
62
  False if a primer was not identified
64
63
  """
65
64
 
66
- read_count = get_read_count(path, "fastq") # Get read count for fastq file
65
+ read_count = get_read_count(
66
+ path, file_type="fastq"
67
+ ) # Get read count for fastq file
67
68
  mcp_len = 100 # Script will look at first 100 base mcps (for rev=True, it will look at first 100 from 3' to 5')
68
69
 
69
70
  mcp_count_dict = fetch_mcp(
@@ -133,7 +134,6 @@ def save_out(results, sample_id, output):
133
134
 
134
135
 
135
136
  def main(argv=None):
136
-
137
137
  path, sample, output = parse_args(argv)
138
138
 
139
139
  fwd_primer_flag = are_there_primers_in_this_sample(
@@ -87,7 +87,9 @@ def find_mcp_props_for_sample(path, rev=False):
87
87
  start + mcp_len - 1
88
88
  ) # compute the final index for the mcp (inclusive). Indices are of base 1 not 0.
89
89
 
90
- read_count = get_read_count(path, type="fastq") # get read count for fastq file
90
+ read_count = get_read_count(
91
+ path, file_type="fastq"
92
+ ) # get read count for fastq file
91
93
 
92
94
  max_line_count = None
93
95
  if read_count > MCP_MAX_LINE_COUNT:
@@ -143,7 +143,7 @@ def get_primer_props(std_primer_dict_regex, input_path):
143
143
 
144
144
  threshold = 0.60 # Arbitrary threshold for collecting a matched primer
145
145
  read_count = get_read_count(
146
- input_path, "fastq"
146
+ input_path, file_type="fastq"
147
147
  ) # Get read count of fastq file to calculate proportion with
148
148
  res_dict = defaultdict(defaultdict)
149
149
 
@@ -62,45 +62,13 @@ def process_lines(lines, output_handler, rhea2reaction_dict, protein_hashes):
62
62
  protein_rheas.add(rhea)
63
63
 
64
64
 
65
- def main(input: str, output: Path, proteins: Path, rhea2chebi: Path):
66
- logging.info(
67
- f"Step 1/3: Parse protein fasta and calculating SHA256 hash from {proteins.resolve()}"
68
- )
69
- protein_hashes = {}
70
- with open(proteins, "r") as fasta_file:
71
- for record in SeqIO.parse(fasta_file, "fasta"):
72
- protein_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
73
- protein_hashes[record.id] = protein_hash
74
-
75
- logging.info(f"Step 2/3: Load reactions from provided file {rhea2chebi.resolve()}")
76
- df = pd.read_csv(rhea2chebi, delimiter="\t")
77
- rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
78
-
79
- logging.info(
80
- f"Step 3/3: Read DIAMOND results from {'STDIN' if input == '-' else Path(input).resolve()} and write output"
81
- )
82
- with open(output, "w") as output_handler:
83
- if input == "-":
84
- process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
85
- else:
86
- with open(args.input, "r") as input_file:
87
- process_lines(
88
- input_file, output_handler, rhea2reaction_dict, protein_hashes
89
- )
90
-
91
- logging.info("Processed successfully. Exiting.")
92
-
93
-
94
- if __name__ == "__main__":
65
+ def main():
95
66
  parser = argparse.ArgumentParser(
96
- """
97
- Use diamond output file to create a table with Rhea and CHEBI
98
- reaction annotation for every protein.
99
- """
67
+ "Use diamond output file to create a table with Rhea and CHEBI reaction annotation for every protein."
100
68
  )
101
69
  parser.add_argument(
102
- "-i",
103
- "--input",
70
+ "-d",
71
+ "--diamond_hits",
104
72
  required=True,
105
73
  type=str,
106
74
  help="DIAMOND results file, use '-' for stdin",
@@ -121,10 +89,45 @@ if __name__ == "__main__":
121
89
  )
122
90
  parser.add_argument(
123
91
  "--rhea2chebi",
124
- default=None,
92
+ required=True,
125
93
  type=Path,
126
94
  help="File that maps rhea_ids to CHEBI",
127
95
  )
128
96
 
129
97
  args = parser.parse_args()
130
- main(args.input, args.output, args.proteins, args.rhea2chebi)
98
+
99
+ diamond_hits = args.diamond_hits
100
+ output = args.output
101
+ proteins = args.proteins
102
+ rhea2chebi = args.rhea2chebi
103
+
104
+ logging.info(
105
+ f"Step 1/3: Parse protein fasta and calculating SHA256 hash from {proteins.resolve()}"
106
+ )
107
+ protein_hashes = {}
108
+ with open(proteins, "r") as fasta_file:
109
+ for record in SeqIO.parse(fasta_file, "fasta"):
110
+ protein_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
111
+ protein_hashes[record.id] = protein_hash
112
+
113
+ logging.info(f"Step 2/3: Load reactions from provided file {rhea2chebi.resolve()}")
114
+ df = pd.read_csv(rhea2chebi, delimiter="\t")
115
+ rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
116
+
117
+ logging.info(
118
+ f"Step 3/3: Read DIAMOND results from {'STDIN' if diamond_hits == '-' else Path(diamond_hits).resolve()} and write output"
119
+ )
120
+ with open(output, "w") as output_handler:
121
+ if diamond_hits == "-":
122
+ process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
123
+ else:
124
+ with open(diamond_hits, "r") as input_file:
125
+ process_lines(
126
+ input_file, output_handler, rhea2reaction_dict, protein_hashes
127
+ )
128
+
129
+ logging.info("Processed successfully. Exiting.")
130
+
131
+
132
+ if __name__ == "__main__":
133
+ main()