mgnify-pipelines-toolkit 1.2.10__tar.gz → 1.3.0__tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.

Files changed (58)
  1. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/PKG-INFO +25 -12
  2. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/README.md +23 -10
  3. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +2 -1
  4. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py +30 -69
  5. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +29 -60
  6. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py +33 -81
  7. mgnify_pipelines_toolkit-1.3.0/mgnify_pipelines_toolkit/schemas/dataframes.py +325 -0
  8. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +25 -12
  9. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +1 -1
  10. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit.egg-info/requires.txt +2 -2
  11. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/pyproject.toml +15 -3
  12. mgnify_pipelines_toolkit-1.2.10/mgnify_pipelines_toolkit/schemas/schemas.py +0 -738
  13. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/LICENSE +0 -0
  14. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/__init__.py +0 -0
  15. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  16. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  17. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
  18. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
  19. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py +0 -0
  20. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  21. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  22. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +0 -0
  23. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +0 -0
  24. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +0 -0
  25. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +0 -0
  26. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +0 -0
  27. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +0 -0
  28. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +0 -0
  29. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +0 -0
  30. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +0 -0
  31. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +0 -0
  32. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +0 -0
  33. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +0 -0
  34. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +0 -0
  35. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +0 -0
  36. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/genomes/__init__.py +0 -0
  37. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  38. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +0 -0
  39. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +0 -0
  40. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
  41. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
  42. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  43. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  44. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  45. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +0 -0
  46. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/constants/db_labels.py +0 -0
  47. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/constants/ncrna.py +0 -0
  48. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  49. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/constants/tax_ranks.py +0 -0
  50. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
  51. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  52. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  53. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
  54. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  55. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  56. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +0 -0
  57. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  58. {mgnify_pipelines_toolkit-1.2.10 → mgnify_pipelines_toolkit-1.3.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mgnify_pipelines_toolkit
- Version: 1.2.10
+ Version: 1.3.0
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -19,6 +19,7 @@ Requires-Dist: click<9,>=8.1.8
  Requires-Dist: pandera<0.24,>=0.23.1
  Requires-Dist: pyfastx<3,>=2.2.0
  Requires-Dist: intervaltree<4,>=3.1.0
+ Requires-Dist: isort>=6.1.0
  Provides-Extra: test
  Requires-Dist: pytest<9,>=8.3.5; extra == "test"
  Requires-Dist: pytest-md>=0.2.0; extra == "test"
@@ -26,8 +27,7 @@ Requires-Dist: pytest-workflow==2.1.0; extra == "test"
  Provides-Extra: dev
  Requires-Dist: pre-commit>=4.2.0; extra == "dev"
  Requires-Dist: black>=25.1.0; extra == "dev"
- Requires-Dist: flake8>=7.1.2; extra == "dev"
- Requires-Dist: pep8-naming>=0.14.1; extra == "dev"
+ Requires-Dist: ruff>=0.8.4; extra == "dev"
  Dynamic: license-file

  # mgnify-pipelines-toolkit
@@ -57,16 +57,29 @@ You should then be able to run the packages from the command-line. For example t

  `get_subunits -i ${easel_coords} -n ${meta.id}`

- ## Adding a new script to the package
+ ## Development

- ### Local development requirements
- Before starting any development, you should do these few steps:
- - Clone the repo if you haven't already and create a feature branch from the `dev` branch (NOT `main`).
- - Create a virtual environment with the tool of your choice (i.e. `conda create --name my_new_env`)
- - Activate you new environment (i.e. `conda activate my_new_env`)
- - Install dev dependencies `pip install -e '.[tests,dev]'`
- - Install pre-commit hooks `pre-commit install`
- - Run unit tests `pytest`
+ ### Quick Start with uv and Taskfile
+
+ This project uses [uv](https://docs.astral.sh/uv/) for fast Python environment management and [Task](https://taskfile.dev/) for task automation.
+
+ Prerequisites:
+ - Install [uv](https://docs.astral.sh/uv/getting-started/installation/)
+ - Install [Task](https://taskfile.dev/installation/)
+
+ Common tasks:
+
+ ```bash
+ task: Available tasks for this project:
+ * clean: Clean up generated files and caches
+ * lint: Run linters (ruff check only)
+ * lint-fix: Run linters and fix issues automatically
+ * pre-commit: Install pre-commit hooks
+ * run: Run toolkit scripts with uv (usage: task run -- <script_name> [args])
+ * test: Run tests with uv
+ * testk: Run specific tests from a file (usage: task testk -- test_path)
+ * venv: Create a virtual environment with uv
+ ```

  When doing these steps above, you ensure that the code you add will be linted and formatted properly.

@@ -25,16 +25,29 @@ You should then be able to run the packages from the command-line. For example t

  `get_subunits -i ${easel_coords} -n ${meta.id}`

- ## Adding a new script to the package
-
- ### Local development requirements
- Before starting any development, you should do these few steps:
- - Clone the repo if you haven't already and create a feature branch from the `dev` branch (NOT `main`).
- - Create a virtual environment with the tool of your choice (i.e. `conda create --name my_new_env`)
- - Activate you new environment (i.e. `conda activate my_new_env`)
- - Install dev dependencies `pip install -e '.[tests,dev]'`
- - Install pre-commit hooks `pre-commit install`
- - Run unit tests `pytest`
+ ## Development
+
+ ### Quick Start with uv and Taskfile
+
+ This project uses [uv](https://docs.astral.sh/uv/) for fast Python environment management and [Task](https://taskfile.dev/) for task automation.
+
+ Prerequisites:
+ - Install [uv](https://docs.astral.sh/uv/getting-started/installation/)
+ - Install [Task](https://taskfile.dev/installation/)
+
+ Common tasks:
+
+ ```bash
+ task: Available tasks for this project:
+ * clean: Clean up generated files and caches
+ * lint: Run linters (ruff check only)
+ * lint-fix: Run linters and fix issues automatically
+ * pre-commit: Install pre-commit hooks
+ * run: Run toolkit scripts with uv (usage: task run -- <script_name> [args])
+ * test: Run tests with uv
+ * testk: Run specific tests from a file (usage: task testk -- test_path)
+ * venv: Create a virtual environment with uv
+ ```

  When doing these steps above, you ensure that the code you add will be linted and formatted properly.

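For orientation, the new Development section boils down to a handful of Task targets. A typical session might look like the sketch below; the task names come straight from the listing above, while the test path and the script arguments passed to `task run` are purely illustrative.

```bash
# One-time setup: create the uv-managed virtual environment and install the git hooks
task venv
task pre-commit

# Everyday loop: lint, auto-fix what ruff can, and run the test suite
task lint
task lint-fix
task test

# Run the tests from a single file (path is illustrative)
task testk -- tests/test_study_summary_generator.py

# Invoke a toolkit script through uv (arguments are illustrative)
task run -- get_subunits -i example.easel_coords.txt -n SAMPLE01
```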
@@ -167,7 +167,8 @@ def main():
  matched_primers_list.append(cleaned_primer_name)

  res_df = pd.DataFrame.from_dict(res_dict)
- res_df.to_csv(f"./{sample}_primer_validation.tsv", sep="\t", index=False)
+ res_tsv_name = f"./{sample}_primer_validation.tsv"
+ res_df.to_csv(res_tsv_name, sep="\t", index=False) if not res_df.empty else open(res_tsv_name, "w").close()

  fwd_primers_fw.close()
  rev_primers_fw.close()
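The reworked write above keeps the output contract intact when nothing validates: a non-empty DataFrame is written as a TSV, otherwise an empty placeholder file is created so downstream steps still find the expected output. A more explicit, standalone sketch of the same logic (the function wrapper is hypothetical):

```python
import pandas as pd


def write_primer_validation(res_df: pd.DataFrame, sample: str) -> str:
    """Write the primer validation table, or touch an empty file when there is nothing to report."""
    res_tsv_name = f"./{sample}_primer_validation.tsv"
    if not res_df.empty:
        res_df.to_csv(res_tsv_name, sep="\t", index=False)
    else:
        # Touch an empty file so downstream steps that expect this output do not fail
        open(res_tsv_name, "w").close()
    return res_tsv_name
```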
@@ -1,7 +1,5 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-
- import shutil
- from shutil import SameFileError

  # Copyright 2024-2025 EMBL - European Bioinformatics Institute
  #
@@ -16,25 +14,27 @@ from shutil import SameFileError
  # See the License for the specific language governing permissions and
  # limitations under the License.

- import click
- from collections import defaultdict
  import glob
  import logging
+ import shutil
+ from collections import defaultdict
  from pathlib import Path
- from typing import Union, List
+ from shutil import SameFileError
+ from typing import List, Union

+ import click
  import pandas as pd

- from mgnify_pipelines_toolkit.constants.db_labels import TAXDB_LABELS, ASV_TAXDB_LABELS
+ from mgnify_pipelines_toolkit.constants.db_labels import ASV_TAXDB_LABELS, TAXDB_LABELS
  from mgnify_pipelines_toolkit.constants.tax_ranks import (
- _SILVA_TAX_RANKS,
  _PR2_TAX_RANKS,
+ _SILVA_TAX_RANKS,
  )
- from mgnify_pipelines_toolkit.schemas.schemas import (
- AmpliconPassedRunsSchema,
+ from mgnify_pipelines_toolkit.schemas.dataframes import (
  AmpliconNonINSDCPassedRunsSchema,
- TaxonSchema,
+ AmpliconPassedRunsSchema,
  PR2TaxonSchema,
+ TaxonSchema,
  validate_dataframe,
  )

@@ -46,9 +46,7 @@ def cli():
  pass


- def get_tax_file(
- run_acc: str, analyses_dir: Path, db_label: str
- ) -> Union[Path, List[Path]]:
+ def get_tax_file(run_acc: str, analyses_dir: Path, db_label: str) -> Union[Path, List[Path]]:
  """Takes path information for a particular analysis and db_label combo, and returns any existing files.

  :param run_acc: Run accession for the tax file that should be retrieved.
@@ -69,48 +67,32 @@ def get_tax_file(
  db_path = Path(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}")

  if not db_path.exists():
- logging.debug(
- f"DB {db_path} doesn't exist for {run_acc}. Skipping"
- ) # or error?
+ logging.debug(f"DB {db_path} doesn't exist for {run_acc}. Skipping") # or error?
  return

  if db_label in TAXDB_LABELS:
- tax_file = Path(
- f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/{run_acc}_{db_label}.txt"
- )
+ tax_file = Path(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/{run_acc}_{db_label}.txt")
  if not tax_file.exists():
- logging.error(
- f"DB path exists but file doesn't - exiting. Path: {tax_file}"
- )
+ logging.error(f"DB path exists but file doesn't - exiting. Path: {tax_file}")
  exit(1)

  file_size = tax_file.stat().st_size
- if (
- file_size == 0
- ): # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
+ if file_size == 0: # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
  # so need to skip those. Should probably fix that at some point
- logging.debug(
- f"File {tax_file} exists but is empty, so will be skipping it."
- )
+ logging.debug(f"File {tax_file} exists but is empty, so will be skipping it.")
  tax_file = None
  elif db_label in ASV_TAXDB_LABELS:
  # ASV tax files could have up to two files, one for each amplified region (maximum two from the pipeline).
  # So will need to handle this differently to closed-reference files
- asv_tax_files = glob.glob(
- f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/*.txt"
- )
- asv_tax_files = [
- Path(file) for file in asv_tax_files if "concat" not in file
- ] # Have to filter out concatenated file if it exists
+ asv_tax_files = glob.glob(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/*.txt")
+ asv_tax_files = [Path(file) for file in asv_tax_files if "concat" not in file] # Have to filter out concatenated file if it exists

  tax_file = asv_tax_files

  return tax_file


- def parse_one_tax_file(
- run_acc: str, tax_file: Path, long_tax_ranks: list
- ) -> pd.DataFrame:
+ def parse_one_tax_file(run_acc: str, tax_file: Path, long_tax_ranks: list) -> pd.DataFrame:
  """Parses a taxonomy file, and returns it as a pandas DataFrame object.

  :param run_acc: Run accession of the taxonomy file that will be parsed.
@@ -134,9 +116,7 @@ def parse_one_tax_file(
  elif len(long_tax_ranks) == 9:
  validate_dataframe(res_df, PR2TaxonSchema, str(tax_file))

- res_df["full_taxon"] = res_df.iloc[:, 1:].apply(
- lambda x: ";".join(x).strip(";"), axis=1
- )
+ res_df["full_taxon"] = res_df.iloc[:, 1:].apply(lambda x: ";".join(x).strip(";"), axis=1)
  final_df = res_df.iloc[:, [0, -1]]
  final_df = final_df.set_index("full_taxon")
  final_df.columns = [run_acc]
@@ -144,9 +124,7 @@ def parse_one_tax_file(
  return final_df


- def generate_db_summary(
- db_label: str, tax_dfs: defaultdict[Path], output_prefix: str
- ) -> None:
+ def generate_db_summary(db_label: str, tax_dfs: defaultdict[Path], output_prefix: str) -> None:
  """Takes paired run accessions taxonomy dataframes in the form of a dictionary,
  and respective db_label, joins them together, and generates a study-wide summary
  in the form of a .tsv file.
@@ -185,7 +163,6 @@ def generate_db_summary(
  )

  elif db_label in ASV_TAXDB_LABELS:
-
  if "PR2" in db_label:
  long_tax_ranks = _PR2_TAX_RANKS
  else:
@@ -196,13 +173,9 @@ def generate_db_summary(
  for (
  run_acc,
  tax_df_asv_lst,
- ) in (
- tax_dfs.items()
- ): # each `tax_file` will be a list containing at most two files (one for each amp_region)
+ ) in tax_dfs.items(): # each `tax_file` will be a list containing at most two files (one for each amp_region)
  for tax_df in tax_df_asv_lst:
- amp_region = str(tax_df).split("_")[
- -5
- ] # there are a lot of underscores in these names... but it is consistent
+ amp_region = str(tax_df).split("_")[-5] # there are a lot of underscores in these names... but it is consistent
  # e.g. ERR4334351_16S-V3-V4_DADA2-SILVA_asv_krona_counts.txt
  amp_region_df = parse_one_tax_file(run_acc, tax_df, long_tax_ranks)
  amp_region_dict[amp_region].append(amp_region_df)
@@ -241,13 +214,9 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List

  temp_lst = summary_filename.split("_")
  if "asv_study_summary" in summary_filename:
- summary_db_label = "_".join(
- temp_lst[1:3]
- ) # For ASVs we need to include the amp_region in the label
+ summary_db_label = "_".join(temp_lst[1:3]) # For ASVs we need to include the amp_region in the label
  else:
- summary_db_label = temp_lst[
- 1
- ] # For closed reference, just the db_label is needed
+ summary_db_label = temp_lst[1] # For closed reference, just the db_label is needed
  summaries_dict[summary_db_label].append(summary_path)

@@ -273,18 +242,14 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List
  help="Input directory to where all the individual analyses subdirectories for summarising",
  type=click.Path(exists=True, path_type=Path, file_okay=False),
  )
- @click.option(
- "-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
- )
+ @click.option("-p", "--output_prefix", required=True, help="Prefix to summary files", type=str)
  @click.option(
  "--non_insdc",
  default=False,
  is_flag=True,
  help="If run accessions aren't INSDC-formatted",
  )
- def summarise_analyses(
- runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool
- ) -> None:
+ def summarise_analyses(runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool) -> None:
  """Function that will take a file of pipeline-successful run accessions
  that should be used for the generation of the relevant db-specific
  study-level summary files. For ASV results, these will also be on a
@@ -302,16 +267,14 @@ def summarise_analyses(
  """
  runs_df = pd.read_csv(runs, names=["run", "status"])

+ # Run validation on the successful_runs .csv file
  if not non_insdc:
- AmpliconPassedRunsSchema(
- runs_df
- ) # Run validation on the successful_runs .csv file
+ AmpliconPassedRunsSchema(runs_df)
  else:
  AmpliconNonINSDCPassedRunsSchema(runs_df)

  all_db_labels = TAXDB_LABELS + ASV_TAXDB_LABELS
  for db_label in all_db_labels:
-
  tax_files = defaultdict(Path)
  for i in range(0, len(runs_df)):
  run_acc = runs_df.loc[i, "run"]
@@ -376,9 +339,7 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
  index_label="taxonomy",
  )
  elif len(summaries) == 1:
- logging.info(
- f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}"
- )
+ logging.info(f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}")
  try:
  shutil.copyfile(summaries[0], merged_summary_name)
  except SameFileError:
@@ -14,39 +14,37 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- import click
- from functools import reduce
  import glob
  import logging
+ from functools import reduce
  from pathlib import Path
  from typing import Literal

+ import click
  import pandas as pd

- from mgnify_pipelines_toolkit.schemas.schemas import (
+ from mgnify_pipelines_toolkit.schemas.dataframes import (
+ AntismashStudySummarySchema,
+ AntismashSummarySchema,
  CompletedAnalysisSchema,
- TaxonSchema,
+ GOStudySummarySchema,
  GOSummarySchema,
+ InterProStudySummarySchema,
  InterProSummarySchema,
- KOSummarySchema,
- SanntisSummarySchema,
- AntismashSummarySchema,
- PFAMSummarySchema,
+ KEGGModulesStudySummarySchema,
  KEGGModulesSummarySchema,
- GOStudySummarySchema,
- InterProStudySummarySchema,
- TaxonomyStudySummarySchema,
  KOStudySummarySchema,
- SanntisStudySummarySchema,
- AntismashStudySummarySchema,
+ KOSummarySchema,
  PFAMStudySummarySchema,
- KEGGModulesStudySummarySchema,
+ PFAMSummarySchema,
+ SanntisStudySummarySchema,
+ SanntisSummarySchema,
+ TaxonomyStudySummarySchema,
+ TaxonSchema,
  validate_dataframe,
  )

- logging.basicConfig(
- level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
- )
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")

  # Keys are the original column names in the input files,
  # values are the standardised column names used in the generated study summary files
@@ -173,9 +171,7 @@ def check_files_exist(file_list: list[Path]) -> None:
  """
  missing_files = [str(path) for path in file_list if not path.is_file()]
  if missing_files:
- raise FileNotFoundError(
- f"The following required files are missing: {', '.join(missing_files)}"
- )
+ raise FileNotFoundError(f"The following required files are missing: {', '.join(missing_files)}")


  def generate_taxonomy_summary(
@@ -206,9 +202,7 @@ def generate_taxonomy_summary(
  df = validate_dataframe(df, TaxonSchema, str(path))

  # Combine all taxonomic ranks in the classification into a single string
- df["full_taxon"] = (
- df[TAXONOMY_COLUMN_NAMES[1:]].agg(";".join, axis=1).str.strip(";")
- )
+ df["full_taxon"] = df[TAXONOMY_COLUMN_NAMES[1:]].agg(";".join, axis=1).str.strip(";")

  # Create a new DataFrame with taxonomy as index and count as the only column
  result = df[["Count", "full_taxon"]].set_index("full_taxon")
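For reference, the collapsed `full_taxon` assignment above concatenates the per-rank taxonomy columns into a single lineage string and strips trailing separators before the counts are re-indexed by that lineage. A self-contained illustration with made-up rank columns standing in for `TAXONOMY_COLUMN_NAMES[1:]`:

```python
import pandas as pd

# Hypothetical taxonomy table: a count column followed by per-rank columns
df = pd.DataFrame(
    {
        "Count": [118, 94],
        "superkingdom": ["sk__Eukaryota", "sk__Bacteria"],
        "kingdom": ["k__Metazoa", ""],
        "phylum": ["p__Chordata", ""],
    }
)

rank_columns = ["superkingdom", "kingdom", "phylum"]
# Join the ranks into one string per row, then trim empty trailing ranks
df["full_taxon"] = df[rank_columns].agg(";".join, axis=1).str.strip(";")
result = df[["Count", "full_taxon"]].set_index("full_taxon")
# Index becomes e.g. "sk__Eukaryota;k__Metazoa;p__Chordata" and "sk__Bacteria"
```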
@@ -229,9 +223,7 @@ def generate_functional_summary(
  file_dict: dict[str, Path],
  column_names: dict[str, str],
  output_prefix: str,
- label: Literal[
- "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
- ],
+ label: Literal["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"],
  outdir: Path = None,
  allow_missing: bool = False,
  ) -> None:
@@ -292,9 +284,7 @@ def generate_functional_summary(
  check_files_exist(list(file_dict.values()))
  except FileNotFoundError as e:
  if allow_missing:
- logging.warning(
- f"One of the expected files is missing, but this is allowed for {label}."
- )
+ logging.warning(f"One of the expected files is missing, but this is allowed for {label}.")
  logging.warning(e)
  return
  raise
@@ -324,9 +314,7 @@ def generate_functional_summary(
  dfs.append(df)

  if not dfs:
- logging.warning(
- f"No valid files with functional annotation summary were found. Skipping creation of {output_file_name}."
- )
+ logging.warning(f"No valid files with functional annotation summary were found. Skipping creation of {output_file_name}.")
  return

  # Merge all dataframes on the renamed metadata columns
@@ -384,9 +372,7 @@ def generate_functional_summary(
  help="Directory for the output files, by default it will use the current working directory.",
  type=click.Path(exists=True, path_type=Path, file_okay=False),
  )
- def summarise_analyses(
- assemblies: Path, study_dir: Path, output_prefix: str, outdir: Path
- ) -> None:
+ def summarise_analyses(assemblies: Path, study_dir: Path, output_prefix: str, outdir: Path) -> None:
  """
  Generate study-level summaries for successfully proccessed assemblies.

@@ -405,16 +391,11 @@ def summarise_analyses(
  Construct file paths for each assembly given a subdirectory and filename template.
  Template must contain {acc} as a placeholder.
  """
- return {
- acc: study_dir / acc / subdir / filename_template.format(acc=acc)
- for acc in assembly_list
- }
+ return {acc: study_dir / acc / subdir / filename_template.format(acc=acc) for acc in assembly_list}

  logging.info("Start processing of assembly-level summaries.")

- logging.info(
- "Generating taxonomy summary from assembly-level summaries <accession>.krona.txt"
- )
+ logging.info("Generating taxonomy summary from assembly-level summaries <accession>.krona.txt")
  generate_taxonomy_summary(
  get_file_paths("taxonomy", "{acc}.krona.txt.gz"),
  f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}",
@@ -422,9 +403,7 @@ def summarise_analyses(
  )

  for summary_type, config in SUMMARY_TYPES_MAP.items():
- logging.info(
- f"Generating study-level {summary_type.capitalize()} summary from file <accession>_{summary_type}_summary.tsv.gz"
- )
+ logging.info(f"Generating study-level {summary_type.capitalize()} summary from file <accession>_{summary_type}_summary.tsv.gz")
  generate_functional_summary(
  get_file_paths(config["folder"], f"{{acc}}_{summary_type}_summary.tsv.gz"),
  config["column_names"],
@@ -469,9 +448,7 @@ def merge_summaries(study_dir: str, output_prefix: str) -> None:

  logging.info("Generating combined assembly-level summaries")
  logging.info("Parsing summary files for taxonomic classification")
- merge_taxonomy_summaries(
- get_file_paths("taxonomy"), f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}"
- )
+ merge_taxonomy_summaries(get_file_paths("taxonomy"), f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}")

  for summary_type, config in SUMMARY_TYPES_MAP.items():
  logging.info(f"Parsing summary files for {summary_type.capitalize()}.")
@@ -500,9 +477,7 @@ def merge_taxonomy_summaries(summary_files: list[str], output_file_name: str) ->
  sk__Eukaryota;k__Metazoa;p__Chordata;c__Mammalia;o__Primates 118 94
  """
  if not summary_files:
- raise FileNotFoundError(
- "The required taxonomic classification summary files are missing. Exiting."
- )
+ raise FileNotFoundError("The required taxonomic classification summary files are missing. Exiting.")

  summary_dfs = []
  for file in summary_files:
@@ -527,9 +502,7 @@ def merge_functional_summaries(
  summary_files: list[str],
  merge_keys: list[str],
  output_prefix: str,
- label: Literal[
- "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
- ],
+ label: Literal["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"],
  ) -> None:
  """
  Merge multiple functional study-level summary files into a single study-level summary.
@@ -580,9 +553,7 @@ def merge_functional_summaries(
  output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"

  if not summary_files:
- logging.warning(
- f"Skipping creation of {output_file_name} because no summaries were found for this type of functional annotation."
- )
+ logging.warning(f"Skipping creation of {output_file_name} because no summaries were found for this type of functional annotation.")
  return

  validation_schema = SUMMARY_TYPES_MAP[label]["study_schema"]
@@ -596,9 +567,7 @@ def merge_functional_summaries(
  if len(dfs) == 1:
  merged_df = dfs[0]
  else:
- merged_df = reduce(
- lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs
- )
+ merged_df = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs)

  # Identify non-key columns (i.e. counts)
  value_columns = [col for col in merged_df.columns if col not in merge_keys]
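As background for the collapsed `reduce` call above: it folds a list of per-assembly summary tables into one study-level table by repeatedly outer-merging on the shared key columns, after which the non-key count columns can be post-processed. A self-contained sketch with hypothetical GO summary data:

```python
from functools import reduce

import pandas as pd

merge_keys = ["go", "description"]

# Hypothetical per-assembly summaries sharing the same key columns
df_a = pd.DataFrame(
    {
        "go": ["GO:0008150", "GO:0003674"],
        "description": ["biological_process", "molecular_function"],
        "ERZ0000001": [10, 5],
    }
)
df_b = pd.DataFrame(
    {
        "go": ["GO:0008150", "GO:0005575"],
        "description": ["biological_process", "cellular_component"],
        "ERZ0000002": [7, 3],
    }
)

dfs = [df_a, df_b]
merged_df = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs)

# Counts missing from one assembly become NaN after the outer merge; fill them with zero
value_columns = [col for col in merged_df.columns if col not in merge_keys]
merged_df[value_columns] = merged_df[value_columns].fillna(0).astype(int)
print(merged_df)
```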