mgnify-pipelines-toolkit 1.2.6__tar.gz → 1.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)
  1. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/PKG-INFO +1 -1
  2. mgnify_pipelines_toolkit-1.2.7/mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py +446 -0
  3. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/constants/db_labels.py +6 -0
  4. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/constants/tax_ranks.py +10 -1
  5. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/schemas/schemas.py +152 -3
  6. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +1 -1
  7. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +1 -0
  8. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +1 -0
  9. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/pyproject.toml +5 -2
  10. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/LICENSE +0 -0
  11. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/README.md +0 -0
  12. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/__init__.py +0 -0
  13. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  14. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  15. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
  16. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
  17. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py +0 -0
  18. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
  19. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  20. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  21. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py +0 -0
  22. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +0 -0
  23. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +0 -0
  24. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +0 -0
  25. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +0 -0
  26. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +0 -0
  27. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +0 -0
  28. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +0 -0
  29. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +0 -0
  30. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +0 -0
  31. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +0 -0
  32. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +0 -0
  33. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +0 -0
  34. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +0 -0
  35. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +0 -0
  36. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +0 -0
  37. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/genomes/__init__.py +0 -0
  38. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  39. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +0 -0
  40. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +0 -0
  41. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
  42. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
  43. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  44. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  45. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  46. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +0 -0
  47. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/constants/ncrna.py +0 -0
  48. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  49. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
  50. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  51. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  52. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
  53. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  54. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  55. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -0
  56. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  57. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.7}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mgnify_pipelines_toolkit
- Version: 1.2.6
+ Version: 1.2.7
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -0,0 +1,446 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import shutil
+ from shutil import SameFileError
+
+ import click
+ from collections import defaultdict
+ import glob
+ import logging
+ from pathlib import Path
+ from typing import Union, List
+
+ import pandas as pd
+
+ from mgnify_pipelines_toolkit.constants.db_labels import (
+     RRAP_TAXDB_LABELS,
+     RRAP_FUNCDB_LABELS,
+ )
+ from mgnify_pipelines_toolkit.constants.tax_ranks import (
+     _SILVA_TAX_RANKS,
+     _MOTUS_TAX_RANKS,
+ )
+ from mgnify_pipelines_toolkit.schemas.schemas import (
+     RawReadsPassedRunsSchema,
+     RawReadsNonINSDCPassedRunsSchema,
+     TaxonSchema,
+     MotusTaxonSchema,
+     FunctionProfileSchema,
+     validate_dataframe,
+ )
+
+ logging.basicConfig(level=logging.DEBUG)
+
+
+ @click.group()
+ def cli():
+     pass
+
+
+ def get_file(
+     run_acc: str, analyses_dir: Path, db_label: str
+ ) -> Union[Path, List[Path]]:
+     """Takes path information for a particular analysis and db_label combo, and returns any existing files.
+
+     :param run_acc: Run accession for the tax file that should be retrieved.
+     :type run_acc: str
+     :param analyses_dir: The path to the directory containing all of the analyses,
+         including the tax file corresponding to :param:`run_acc`.
+     :type analyses_dir: Path
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``RRAP_TAXDB_LABELS`` and ``RRAP_FUNCDB_LABELS``.
+     :type db_label: str
+     :return: A :class:`Path` object if :param:`db_label` comes from ``RRAP_TAXDB_LABELS`` or ``RRAP_FUNCDB_LABELS``.
+     :rtype: Union[Path, List[Path]]
+     """
+
+     if db_label not in RRAP_TAXDB_LABELS + RRAP_FUNCDB_LABELS:
+         return
+
+     if db_label in RRAP_TAXDB_LABELS:
+         db_dir = "taxonomy-summary"
+     else:
+         db_dir = "function-summary"
+     db_path = Path(f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}")
+
+     if not db_path.exists():
+         logging.debug(
+             f"DB {db_path} doesn't exist for {run_acc}. Skipping"
+         )  # or error?
+         return
+
+     analysis_file = Path(
+         f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}/{run_acc}_{db_label}.txt"
+     )
+     if not analysis_file.exists():
+         logging.error(
+             f"DB path exists but file doesn't - exiting. Path: {analysis_file}"
+         )
+         exit(1)
+
+     file_size = analysis_file.stat().st_size
+     if (
+         file_size == 0
+     ):  # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
+         # so need to skip those. Should probably fix that at some point
+         logging.debug(
+             f"File {analysis_file} exists but is empty, so will be skipping it."
+         )
+         analysis_file = None
+
+     return analysis_file
+
+
+ def parse_one_tax_file(run_acc: str, tax_file: Path, db_label: str) -> pd.DataFrame:
+     """Parses a taxonomy file, and returns it as a pandas DataFrame object.
+
+     :param run_acc: Run accession of the taxonomy file that will be parsed.
+     :type run_acc: str
+     :param tax_file: Taxonomy file that will be parsed.
+     :type tax_file: Path
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``RRAP_TAXDB_LABELS` and `RRAP_FUNCDB_LABELS``.
+     :type db_label: str
+     :return: The parsed :param:`tax_file` as a :class:`pd.DataFrame` object
+     :rtype: pd.DataFrame
+     """
+
+     tax_ranks = _MOTUS_TAX_RANKS if db_label == "mOTUs" else _SILVA_TAX_RANKS
+     res_df = pd.read_csv(tax_file, sep="\t", skiprows=1, names=["Count"] + tax_ranks)
+     res_df = res_df.fillna("")
+
+     validate_dataframe(
+         res_df, MotusTaxonSchema if db_label == "mOTUs" else TaxonSchema, str(tax_file)
+     )
+
+     res_df["full_taxon"] = res_df.iloc[:, 1:].apply(
+         lambda x: ";".join(x).strip(";"), axis=1
+     )
+     final_df = res_df.iloc[:, [0, -1]]
+     final_df = final_df.set_index("full_taxon")
+     final_df.columns = [run_acc]
+
+     return final_df
+
+
+ def parse_one_func_file(
+     run_acc: str, func_file: Path, db_label: str
+ ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+     """Parses a functional profile file, and returns it as a pandas DataFrame object.
+
+     :param run_acc: Run accession of the taxonomy file that will be parsed.
+     :type run_acc: str
+     :param func_file: Functional profile file that will be parsed.
+     :type func_file: Path
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``RRAP_TAXDB_LABELS` and `RRAP_FUNCDB_LABELS``.
+     :type db_label: str
+     :return: The parsed :param:`func_file` as a :class:`pd.DataFrame` object
+     :rtype: pd.DataFrame
+     """
+
+     res_df = pd.read_csv(
+         func_file,
+         sep="\t",
+         names=["function", "read_count", "coverage_depth", "coverage_breadth"],
+         skiprows=1,
+         dtype={"read_count": int, "coverage_depth": float, "coverage_breadth": float},
+     ).set_index("function")
+     res_df = res_df.fillna(0)
+
+     validate_dataframe(res_df, FunctionProfileSchema, str(func_file))
+
+     count_df = res_df[["read_count"]]
+     count_df.columns = [run_acc]
+
+     depth_df = res_df[["coverage_depth"]]
+     depth_df.columns = [run_acc]
+
+     breadth_df = res_df[["coverage_breadth"]]
+     breadth_df.columns = [run_acc]
+
+     return count_df, depth_df, breadth_df
+
+
+ def generate_db_summary(
+     db_label: str, analysis_dfs: dict[str, Path], output_prefix: str
+ ) -> None:
+     """Takes paired run accessions taxonomy dataframes in the form of a dictionary,
+     and respective db_label, joins them together, and generates a study-wide summary
+     in the form of a .tsv file.
+
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``RRAP_TAXDB_LABELS` and `RRAP_FUNCDB_LABELS``.
+     :param tax_dfs: Dictionary where the key is a run accession,
+         and values are one parsed taxonomy dataframe if the :param:db_label comes from ``RRAP_TAXDB_LABELS` or `RRAP_FUNCDB_LABELS``.
+         These dataframes are parsed by :func:`parse_one_tax_file` or `parse_one_func_file`.
+     :type tax_dfs: defaultdict[Path]
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+
+     if db_label in RRAP_TAXDB_LABELS:
+         df_list = []
+
+         for run_acc, analysis_df in analysis_dfs.items():
+             res_df = parse_one_tax_file(run_acc, analysis_df, db_label)
+             df_list.append(res_df)
+
+         res_df = pd.concat(df_list, axis=1).fillna(0)
+         res_df = res_df.sort_index()
+         res_df = res_df.astype(int)
+
+         res_df.to_csv(
+             f"{output_prefix}_{db_label}_study_summary.tsv",
+             sep="\t",
+             index_label="taxonomy",
+         )
+
+     if db_label in RRAP_FUNCDB_LABELS:
+         count_df_list = []
+         depth_df_list = []
+         breadth_df_list = []
+
+         for run_acc, analysis_df in analysis_dfs.items():
+             count_df, depth_df, breadth_df = parse_one_func_file(
+                 run_acc, analysis_df, db_label
+             )
+             count_df_list.append(count_df)
+             depth_df_list.append(depth_df)
+             breadth_df_list.append(breadth_df)
+
+         count_df = pd.concat(count_df_list, axis=1).fillna(0)
+         count_df = count_df.sort_index()
+         count_df = count_df.astype(int)
+
+         count_df.to_csv(
+             f"{output_prefix}_{db_label}_read-count_study_summary.tsv",
+             sep="\t",
+             index_label="function",
+         )
+
+         depth_df = pd.concat(depth_df_list, axis=1).fillna(0)
+         depth_df = depth_df.sort_index()
+         depth_df = depth_df.astype(float)
+
+         depth_df.to_csv(
+             f"{output_prefix}_{db_label}_coverage-depth_study_summary.tsv",
+             sep="\t",
+             index_label="function",
+             float_format="%.6g",
+         )
+
+         breadth_df = pd.concat(breadth_df_list, axis=1).fillna(0)
+         breadth_df = breadth_df.sort_index()
+         breadth_df = breadth_df.astype(float)
+
+         breadth_df.to_csv(
+             f"{output_prefix}_{db_label}_coverage-breadth_study_summary.tsv",
+             sep="\t",
+             index_label="function",
+             float_format="%.6g",
+         )
+
+
+ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[str, List]:
+     """Matches different summary files of the same database label and analysis
+     type into a dictionary to help merge
+     the correct summaries.
+
+     :param all_study_summaries: List of file paths to different summary files
+     :type all_study_summaries: List[str]
+     :return: Organised dictionary where each summary is paired to a specific
+         database label key to be merged together.
+     :rtype: defaultdict[List]
+     """
+     summaries_dict = defaultdict(list)
+
+     for summary in all_study_summaries:
+         summary_path = Path(summary)
+         summary_filename = summary_path.stem
+
+         summary_db_label = summary_filename.split("_")[1]
+
+         summaries_dict[summary_db_label].append(summary_path)
+
+     return summaries_dict
+
+
+ @cli.command(
+     "summarise",
+     options_metavar="-r <runs> -a <analyses_dir> -p <output_prefix>",
+     short_help="Generate study-level summaries of raw-read analysis results.",
+ )
+ @click.option(
+     "-r",
+     "--runs",
+     required=True,
+     help="CSV file containing successful analyses generated by the pipeline",
+     type=click.Path(exists=True, path_type=Path, dir_okay=False),
+ )
+ @click.option(
+     "-a",
+     "--analyses_dir",
+     required=True,
+     help="Input directory to where all the individual analyses subdirectories for summarising",
+     type=click.Path(exists=True, path_type=Path, file_okay=False),
+ )
+ @click.option(
+     "-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
+ )
+ @click.option(
+     "--non_insdc",
+     default=False,
+     is_flag=True,
+     help="If run accessions aren't INSDC-formatted",
+ )
+ def summarise_analyses(
+     runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool
+ ) -> None:
+     """Function that will take a file of pipeline-successful run accessions
+     that should be used for the generation of the relevant db-specific
+     study-level summary files.
+     \f
+
+     :param runs: Path to a qc_passed_runs file from the pipeline execution.
+         Contains the accessions of runs that should therefore be included in the generated
+         summaries.
+     :type runs: Path
+     :param analyses_dir: The path to the directory containing all of the analyses.
+     :type analyses_dir: Path
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+     runs_df = pd.read_csv(runs, names=["run", "status"])
+
+     if not non_insdc:
+         RawReadsPassedRunsSchema(
+             runs_df
+         )  # Run validation on the successful_runs .csv file
+     else:
+         RawReadsNonINSDCPassedRunsSchema(runs_df)
+
+     all_db_labels = RRAP_TAXDB_LABELS + RRAP_FUNCDB_LABELS
+     for db_label in all_db_labels:
+
+         analysis_files = {}
+         for run_acc in runs_df["run"]:
+             analysis_file = get_file(run_acc, analyses_dir, db_label)
+
+             if analysis_file:
+                 analysis_files[run_acc] = analysis_file
+
+         if analysis_files:
+             generate_db_summary(db_label, analysis_files, output_prefix)
+
+
+ @cli.command(
+     "merge",
+     options_metavar="-a <analyses_dir> -p <output_prefix>",
+     short_help="Merge multiple study-level summaries of raw-read analysis.",
+ )
+ @click.option(
+     "-a",
+     "--analyses_dir",
+     required=True,
+     help="Input directory to where all the individual analyses subdirectories for merging",
+     type=click.Path(exists=True, file_okay=False),
+ )
+ @click.option(
+     "-p",
+     "--output_prefix",
+     required=True,
+     help="Prefix to merged summary files",
+     type=str,
+ )
+ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
+     """Function that will take a file path containing study-level
+     summaries that should be merged together on a per-db
+     basis.
+     \f
+
+     :param analyses_dir: The filepath to the directory containing all of the analyses.
+     :type analyses_dir: str
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+
+     all_study_summaries = glob.glob(f"{analyses_dir}/*_study_summary.tsv")
+
+     summaries_dict = organise_study_summaries(all_study_summaries)
+
+     for db_label, summaries in summaries_dict.items():
+         if db_label in RRAP_TAXDB_LABELS:
+             merged_summary_name = f"{output_prefix}_{db_label}_study_summary.tsv"
+             if len(summaries) > 1:
+                 res_df = pd.read_csv(summaries[0], sep="\t", index_col=0)
+                 for summary in summaries[1:]:
+                     curr_df = pd.read_csv(summary, sep="\t", index_col=0)
+                     res_df = res_df.join(curr_df, how="outer")
+                 res_df = res_df.fillna(0)
+                 res_df = res_df.astype(int)
+
+                 res_df = res_df.reindex(sorted(res_df.columns), axis=1)
+                 res_df.to_csv(
+                     merged_summary_name,
+                     sep="\t",
+                     index_label="taxonomy",
+                 )
+             elif len(summaries) == 1:
+                 logging.info(
+                     f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}"
+                 )
+                 try:
+                     shutil.copyfile(summaries[0], merged_summary_name)
+                 except SameFileError:
+                     pass
+
+         if db_label in RRAP_FUNCDB_LABELS:
+             for table_type in ["read-count", "coverage-depth", "coverage-breadth"]:
+                 merged_summary_name = (
+                     f"{output_prefix}_{db_label}_{table_type}_study_summary.tsv"
+                 )
+                 summaries_ = [
+                     v for v in summaries if Path(v).stem.split("_")[2] == table_type
+                 ]
+                 if len(summaries_) > 1:
+                     res_df = pd.read_csv(summaries_[0], sep="\t", index_col=0)
+                     for summary in summaries_[1:]:
+                         curr_df = pd.read_csv(summary, sep="\t", index_col=0)
+                         res_df = res_df.join(curr_df, how="outer")
+                     res_df = res_df.fillna(0)
+                     res_df = res_df.astype(int if table_type == "count" else float)
+
+                     res_df = res_df.reindex(sorted(res_df.columns), axis=1)
+                     res_df.to_csv(
+                         merged_summary_name,
+                         sep="\t",
+                         index_label="function",
+                         float_format="%.6g",
+                     )
+                 elif len(summaries_) == 1:
+                     logging.info(
+                         f"Only one summary ({summaries_[0]}) so will use that as {merged_summary_name}"
+                     )
+                     try:
+                         shutil.copyfile(summaries_[0], merged_summary_name)
+                     except SameFileError:
+                         pass
+
+
+ if __name__ == "__main__":
+     cli()
@@ -19,3 +19,9 @@ TAXDB_LABELS = ["SILVA-SSU", "SILVA-LSU", "PR2", "UNITE", "ITSoneDB"]
 
  # taxonomy_summary for ASV method
  ASV_TAXDB_LABELS = ["DADA2-SILVA", "DADA2-PR2"]
+
+ # taxonomy_summary labels for Raw Reads Analysis Pipeline
+ RRAP_TAXDB_LABELS = ['SILVA-SSU', 'SILVA-LSU', 'mOTUs']
+
+ # function_summary labels for Raw Reads Analysis Pipeline
+ RRAP_FUNCDB_LABELS = ['Pfam-A']
@@ -35,7 +35,16 @@ _PR2_TAX_RANKS = [
      "Genus",
      "Species",
  ]
+ _MOTUS_TAX_RANKS = [
+     'Kingdom',
+     'Phylum',
+     'Class',
+     'Order',
+     'Family',
+     'Genus',
+     'Species'
+ ]
 
  SHORT_TAX_RANKS = ["sk", "k", "p", "c", "o", "f", "g", "s"]
-
+ SHORT_MOTUS_TAX_RANKS = ["k", "p", "c", "o", "f", "g", "s"]
  SHORT_PR2_TAX_RANKS = ["d", "sg", "dv", "sdv", "c", "o", "f", "g", "s"]
@@ -16,7 +16,7 @@
  import logging
  import re
 
- from enum import Enum
+ from enum import StrEnum
  from typing import ClassVar, Optional, Type, Literal
 
  import pandas as pd
@@ -35,6 +35,7 @@ from pandera.engines.pandas_engine import PydanticModel
  from mgnify_pipelines_toolkit.constants.tax_ranks import (
      SHORT_TAX_RANKS,
      SHORT_PR2_TAX_RANKS,
+     SHORT_MOTUS_TAX_RANKS,
  )
 
 
@@ -70,7 +71,7 @@ class INSDCRunAccession(RootModel):
          return run
 
 
- class AmpliconResultTypes(str, Enum):
+ class AmpliconResultTypes(StrEnum):
      """Class that models the two allowed statuses for successful amplicon analysis runs.
      Pydantic validates Enums very simply without needing to declare a new function.
      """
@@ -545,7 +546,7 @@ class TaxonRecord(Taxon):
  class PR2TaxonRecord(PR2Taxon):
      """Class for modelling the same thing as the preceding class, but for PR2 ranks."""
 
-     Count: int
+     count: int = Field(alias="Count")
 
 
  # This is the schema for the whole DF
@@ -573,6 +574,154 @@ class PR2TaxonSchema(pa.DataFrameModel):
          coerce = True
 
 
+ class RawReadsStatusTypes(StrEnum):
+     """Class that models the four allowed statuses for successful raw reads analysis runs.
+     Pydantic validates Enums very simply without needing to declare a new function.
+     """
+
+     all_results = "all_results"
+     no_reads = "no_reads"
+     no_results = "no_results"
+     missing_results = "missing_results"
+
+
+ class RawReadsPassedRunsRecord(BaseModel):
+     """Class defining a Pydantic model for a single "row" of a raw-reads pipeline passed runs file.
+     Uses the previous nine classes.
+     """
+
+     run: INSDCRunAccession
+     status: RawReadsStatusTypes
+
+
+ class RawReadsNonINSDCSPassedRunsRecord(RawReadsPassedRunsRecord):
+     """Class modeling a very similar model as the preceding one, but with no INSDC-validation.
+     This is achieved by replacing the type of the runs with just a simple string so no validation
+     happens.
+     """
+
+     run: str
+
+
+ # This is the schema for the whole DF
+ class RawReadsPassedRunsSchema(pa.DataFrameModel):
+     """Class modelling a Pandera dataframe schema that uses the RawReadsPassedRunsRecord class as dtype.
+     This is what actually validates the generated dataframe when read by pandas.read_csv.
+     """
+
+     class Config:
+         """Config with dataframe-level data type."""
+
+         dtype = PydanticModel(RawReadsPassedRunsRecord)
+         coerce = True
+
+
+ class RawReadsNonINSDCPassedRunsSchema(pa.DataFrameModel):
+     """Class modelling the same dataframe schema as the preceding one, except with no INSDC validation.
+     Uses the RawReadsNonINSDCSPassedRunsRecord as a dtype to achieve this.
+     """
+
+     class Config:
+         """Config with dataframe-level data type."""
+
+         dtype = PydanticModel(RawReadsNonINSDCSPassedRunsRecord)
+         coerce = True
+
+
+ class MotusTaxRank(RootModel):
+     """Class for modelling a single Taxonomic Rank in mOTUs output.
+     Essentially is just a special string with validation of the structure:
+     `${rank}__${taxon}`
+     Where `${rank}` is one of the allowed short ranks defined by the imported
+     `SHORT_MOTUS_TAX_RANKS` variables.
+     And `${taxon}` is the actual taxon for that rank (this isn't validated).
+     It will also validate if the whole string is the permitted "unassigned" or "unclassified".
+     """
+
+     valid_tax_ranks: ClassVar = SHORT_MOTUS_TAX_RANKS
+
+     root: str = Field(
+         unique=True,
+         description="A single taxon in a taxonomy record",
+         examples=["sk__Bacteria", "p__Bacillota", "g__Tundrisphaera"],
+     )
+
+     @field_validator("root", mode="after")
+     @classmethod
+     def rank_structure_validity_check(cls, taxrank: str) -> bool:
+         taxrank_list = taxrank.split("__")
+         rank = taxrank_list[0]
+         if (
+             rank != ""
+             and not rank.capitalize() in {"Unclassified", "Unassigned"}
+             and rank not in cls.valid_tax_ranks
+         ):
+             raise ValueError(f"Invalid taxonomy rank {rank}.")
+
+         return taxrank
+
+
+ class MotusTaxon(BaseModel):
+     """Class for modelling an entire MotusTaxon or mOTUs taxonomic assignment.
+     All of the ranks are optional, to model for the taxon being "Unclassified" or "Unassigned".
+     """
+
+     Kingdom: Optional[MotusTaxRank] = None
+     Phylum: Optional[MotusTaxRank] = None
+     Class: Optional[MotusTaxRank] = None
+     Order: Optional[MotusTaxRank] = None
+     Family: Optional[MotusTaxRank] = None
+     Genus: Optional[MotusTaxRank] = None
+     Species: Optional[MotusTaxRank] = None
+
+
+ class MotusTaxonRecord(MotusTaxon):
+     """Class for modelling a single taxon record in a mOTUs taxonomy file.
+     It inherits the MotusTaxon class, and simply adds a Count field, modelling the read counts
+     for that particular MotusTaxon record.
+     """
+
+     count: int = Field(alias="Count")
+
+
+ class MotusTaxonSchema(pa.DataFrameModel):
+     """Class modelling a Pandera dataframe schema that uses the MotusTaxonRecord class as dtype.
+     This is what actually validates the generated dataframe when read by pandas.read_csv.
+     """
+
+     class Config:
+         """Config with dataframe-level data type."""
+
+         dtype = PydanticModel(MotusTaxonRecord)
+         coerce = True
+
+
+ class FunctionProfileRecord(BaseModel):
+     """Class for modelling a single taxon record in a functional profile file.
+     It models the read counts and coverage depth/breadth of each function (gene/protein)
+     for each specific record.
+     """
+
+     read_count: int
+     coverage_depth: float
+     coverage_breadth: float
+
+     class Config:
+         validate_by_name = True
+
+
+ class FunctionProfileSchema(pa.DataFrameModel):
+     """Class modelling a Pandera dataframe schema that uses the FunctionProfileRecord class as dtype.
+     This is what actually validates the generated dataframe when read by pandas.read_csv.
+     """
+
+     class Config:
+         """Config with dataframe-level data type."""
+
+         dtype = PydanticModel(FunctionProfileRecord)
+         coerce = True
+
+
  def validate_dataframe(
      df: pd.DataFrame, schema: Type[pa.DataFrameModel], df_metadata: str
  ) -> DataFrameBase:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mgnify_pipelines_toolkit
- Version: 1.2.6
+ Version: 1.2.7
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -33,6 +33,7 @@ mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py
  mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py
  mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py
  mgnify_pipelines_toolkit/analysis/genomes/__init__.py
+ mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py
  mgnify_pipelines_toolkit/analysis/shared/__init__.py
  mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py
  mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py
@@ -27,6 +27,7 @@ permute_primers = mgnify_pipelines_toolkit.analysis.amplicon.permute_primers:mai
  primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main
  process_dbcan_cazys = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_cazys:main
  process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_clusters:main
+ rawreads_study_summary_generator = mgnify_pipelines_toolkit.analysis.rawreads.study_summary_generator:cli
  remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
  rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
  summarise_antismash_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_antismash_bgcs:main
@@ -1,6 +1,6 @@
  [project]
  name = "mgnify_pipelines_toolkit"
- version = "1.2.6"
+ version = "1.2.7"
  readme = "README.md"
  license = { text = "Apache Software License 2.0" }
  authors = [
@@ -39,7 +39,8 @@ packages = ["mgnify_pipelines_toolkit",
  "mgnify_pipelines_toolkit.analysis.shared",
  "mgnify_pipelines_toolkit.analysis.amplicon",
  "mgnify_pipelines_toolkit.analysis.assembly",
- "mgnify_pipelines_toolkit.analysis.genomes"
+ "mgnify_pipelines_toolkit.analysis.genomes",
+ "mgnify_pipelines_toolkit.analysis.rawreads",
  ]
 
  [project.scripts]
@@ -74,6 +75,8 @@ summarise_antismash_bgcs = "mgnify_pipelines_toolkit.analysis.assembly.summarise
  gff_toolkit = "mgnify_pipelines_toolkit.analysis.assembly.gff_toolkit:main"
  process_dbcan_clusters = "mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_clusters:main"
  process_dbcan_cazys = "mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_cazys:main"
+ # analysis.rawreads #
+ rawreads_study_summary_generator = "mgnify_pipelines_toolkit.analysis.rawreads.study_summary_generator:cli"
  # genomes #
  genomes_extract_bacterial_rrnas_as_tsv = "mgnify_pipelines_toolkit.analysis.genomes.rna.extract_bacterial_rrnas_as_tsv:main"
  genomes_extract_rrnas_as_fasta = "mgnify_pipelines_toolkit.analysis.genomes.rna.extract_rrnas_as_fasta:main"
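
Note: the rawreads_study_summary_generator entry point registered above points at the click group defined in the new study_summary_generator.py, which exposes the summarise and merge subcommands shown in this diff. A minimal sketch of driving that group programmatically via click's test runner follows; the CSV file, directories, and output prefix are hypothetical placeholders, not files shipped with the package.

from click.testing import CliRunner

from mgnify_pipelines_toolkit.analysis.rawreads.study_summary_generator import cli

runner = CliRunner()

# Build per-database study summaries for the runs listed in a passed-runs CSV
# (all paths and the prefix below are placeholders for illustration only).
result = runner.invoke(
    cli,
    ["summarise", "-r", "qc_passed_runs.csv", "-a", "analyses/", "-p", "STUDY1"],
)
print(result.exit_code, result.output)

# Merge previously generated study summaries found in a directory, per database label.
result = runner.invoke(cli, ["merge", "-a", "summaries/", "-p", "STUDY1"])
print(result.exit_code, result.output)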