mgnify-pipelines-toolkit 1.2.6__tar.gz → 1.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.

Files changed (58)
  1. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/PKG-INFO +1 -1
  2. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +1 -1
  3. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +3 -4
  4. mgnify_pipelines_toolkit-1.2.8/mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py +457 -0
  5. mgnify_pipelines_toolkit-1.2.8/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +749 -0
  6. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/constants/db_labels.py +6 -0
  7. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/constants/tax_ranks.py +2 -1
  8. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/schemas/schemas.py +152 -3
  9. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +1 -1
  10. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +1 -0
  11. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +2 -1
  12. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/pyproject.toml +6 -3
  13. mgnify_pipelines_toolkit-1.2.6/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +0 -240
  14. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/LICENSE +0 -0
  15. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/README.md +0 -0
  16. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/__init__.py +0 -0
  17. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  18. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  19. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
  20. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py +0 -0
  21. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
  22. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  23. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  24. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py +0 -0
  25. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +0 -0
  26. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +0 -0
  27. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +0 -0
  28. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +0 -0
  29. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +0 -0
  30. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +0 -0
  31. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +0 -0
  32. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +0 -0
  33. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +0 -0
  34. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +0 -0
  35. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +0 -0
  36. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +0 -0
  37. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +0 -0
  38. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +0 -0
  39. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/genomes/__init__.py +0 -0
  40. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  41. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +0 -0
  42. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
  43. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
  44. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  45. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  46. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  47. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +0 -0
  48. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/constants/ncrna.py +0 -0
  49. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  50. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
  51. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  52. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  53. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
  54. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  55. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  56. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -0
  57. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  58. {mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/setup.cfg +0 -0

{mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mgnify_pipelines_toolkit
- Version: 1.2.6
+ Version: 1.2.8
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0

{mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
@@ -300,7 +300,7 @@ def main():
      if paired_end:
          rev_fr.close()

-     if asv_dict: # if there are matches between taxonomic and ASV annotations
+     if asv_dict:  # if there are matches between taxonomic and ASV annotations
          ref_db = ""

          if len(taxa_df.columns) == 9:

{mgnify_pipelines_toolkit-1.2.6 → mgnify_pipelines_toolkit-1.2.8}/mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py
@@ -110,10 +110,9 @@ def main():
      df_merged = df_merged[
          ["nearest_mibig", "nearest_mibig_class", "description", "count"]
      ]
-     df_merged = df_merged.rename(columns={
-         "Description": "description",
-         "Count": "count"
-     })
+     df_merged = df_merged.rename(
+         columns={"Description": "description", "Count": "count"}
+     )
      df_merged.to_csv(output_filename, sep="\t", index=False)



mgnify_pipelines_toolkit-1.2.8/mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py
@@ -0,0 +1,457 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024-2025 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import shutil
+ from shutil import SameFileError
+
+ import click
+ from collections import defaultdict
+ import glob
+ import logging
+ from pathlib import Path
+ from typing import Union, List
+
+ import pandas as pd
+
+ from mgnify_pipelines_toolkit.constants.db_labels import (
+     RRAP_TAXDB_LABELS,
+     RRAP_FUNCDB_LABELS,
+ )
+ from mgnify_pipelines_toolkit.constants.tax_ranks import (
+     _SILVA_TAX_RANKS,
+     _MOTUS_TAX_RANKS,
+ )
+ from mgnify_pipelines_toolkit.schemas.schemas import (
+     RawReadsPassedRunsSchema,
+     RawReadsNonINSDCPassedRunsSchema,
+     TaxonSchema,
+     MotusTaxonSchema,
+     FunctionProfileSchema,
+     validate_dataframe,
+ )
+
+ logging.basicConfig(level=logging.DEBUG)
+
+
+ @click.group()
+ def cli():
+     pass
+
+
+ def get_file(
+     run_acc: str, analyses_dir: Path, db_label: str
+ ) -> Union[Path, List[Path], None]:
+     """Takes path information for a particular analysis and db_label combo, and returns any existing files.
+
+     :param run_acc: Run accession for the tax file that should be retrieved.
+     :type run_acc: str
+     :param analyses_dir: The path to the directory containing all of the analyses,
+         including the tax file corresponding to :param:`run_acc`.
+     :type analyses_dir: Path
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``RRAP_TAXDB_LABELS`` and ``RRAP_FUNCDB_LABELS``.
+     :type db_label: str
+     :return: A :class:`Path` object if :param:`db_label` comes from ``RRAP_TAXDB_LABELS`` or ``RRAP_FUNCDB_LABELS``.
+     :rtype: Union[Path, List[Path]]
+     """
+
+     if db_label not in RRAP_TAXDB_LABELS + RRAP_FUNCDB_LABELS:
+         return
+
+     if db_label in RRAP_TAXDB_LABELS:
+         db_dir = "taxonomy-summary"
+     else:
+         db_dir = "function-summary"
+     db_path = Path(f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}")
+
+     if not db_path.exists():
+         logging.debug(
+             f"DB {db_path} doesn't exist for {run_acc}. Skipping"
+         ) # or error?
+         return
+
+     analysis_file = Path(
+         f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}/{run_acc}_{db_label}.txt.gz"
+     )
+     if not analysis_file.exists():
+         logging.error(
+             f"DB path exists but file doesn't - exiting. Path: {analysis_file}"
+         )
+         exit(1)
+
+     file_size = analysis_file.stat().st_size
+     if (
+         file_size == 0
+     ): # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
+         # so need to skip those. Should probably fix that at some point
+         logging.debug(
+             f"File {analysis_file} exists but is empty, so will be skipping it."
+         )
+         analysis_file = None
+
+     return analysis_file
+
+
+ def parse_one_tax_file(run_acc: str, tax_file: Path, db_label: str) -> pd.DataFrame:
+     """Parses a taxonomy file, and returns it as a pandas DataFrame object.
+
+     :param run_acc: Run accession of the taxonomy file that will be parsed.
+     :type run_acc: str
+     :param tax_file: Taxonomy file that will be parsed.
+     :type tax_file: Path
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``RRAP_TAXDB_LABELS` and `RRAP_FUNCDB_LABELS``.
+     :type db_label: str
+     :return: The parsed :param:`tax_file` as a :class:`pd.DataFrame` object
+     :rtype: pd.DataFrame
+     """
+
+     tax_ranks = _MOTUS_TAX_RANKS if db_label == "motus" else _SILVA_TAX_RANKS
+     res_df = pd.read_csv(tax_file, sep="\t", skiprows=1, names=["Count"] + tax_ranks)
+     res_df = res_df.fillna("")
+
+     if res_df.shape[0] > 0:
+         validate_dataframe(
+             res_df,
+             MotusTaxonSchema if db_label == "motus" else TaxonSchema,
+             str(tax_file),
+         )
+
+     res_df["full_taxon"] = [
+         ";".join(r[tax_ranks]).strip(";") for _, r in res_df.iterrows()
+     ]
+     final_df = (
+         res_df[["Count", "full_taxon"]]
+         .set_index("full_taxon")
+         .rename(columns={"Count": run_acc})
+     )
+
+     return final_df
+
+
+ def parse_one_func_file(
+     run_acc: str, func_file: Path, db_label: str
+ ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+     """Parses a functional profile file, and returns it as a pandas DataFrame object.
+
+     :param run_acc: Run accession of the taxonomy file that will be parsed.
+     :type run_acc: str
+     :param func_file: Functional profile file that will be parsed.
+     :type func_file: Path
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``RRAP_TAXDB_LABELS` and `RRAP_FUNCDB_LABELS``.
+     :type db_label: str
+     :return: The parsed :param:`func_file` as a :class:`pd.DataFrame` object
+     :rtype: pd.DataFrame
+     """
+
+     res_df = pd.read_csv(
+         func_file,
+         sep="\t",
+         names=["function", "read_count", "coverage_depth", "coverage_breadth"],
+         skiprows=1,
+         dtype={"read_count": int, "coverage_depth": float, "coverage_breadth": float},
+     ).set_index("function")
+     res_df = res_df.fillna(0)
+
+     if res_df.shape[0] > 0:
+         validate_dataframe(res_df, FunctionProfileSchema, str(func_file))
+
+     count_df = pd.DataFrame(res_df[["read_count"]]).rename(
+         columns={"read_count": run_acc}
+     )
+
+     depth_df = pd.DataFrame(res_df[["coverage_depth"]]).rename(
+         columns={"coverage_depth": run_acc}
+     )
+
+     breadth_df = pd.DataFrame(res_df[["coverage_breadth"]]).rename(
+         columns={"coverage_breadth": run_acc}
+     )
+
+     return count_df, depth_df, breadth_df
+
+
+ def generate_db_summary(
+     db_label: str, analysis_dfs: dict[str, Path], output_prefix: str
+ ) -> None:
+     """Takes paired run accessions taxonomy dataframes in the form of a dictionary,
+     and respective db_label, joins them together, and generates a study-wide summary
+     in the form of a .tsv file.
+
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``RRAP_TAXDB_LABELS` and `RRAP_FUNCDB_LABELS``.
+     :param tax_dfs: Dictionary where the key is a run accession,
+         and values are one parsed taxonomy dataframe if the :param:db_label comes from ``RRAP_TAXDB_LABELS` or `RRAP_FUNCDB_LABELS``.
+         These dataframes are parsed by :func:`parse_one_tax_file` or `parse_one_func_file`.
+     :type tax_dfs: defaultdict[Path]
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+
+     if db_label in RRAP_TAXDB_LABELS:
+         df_list = []
+
+         for run_acc, analysis_df in analysis_dfs.items():
+             res_df = parse_one_tax_file(run_acc, analysis_df, db_label)
+             df_list.append(res_df)
+
+         res_df = pd.concat(df_list, axis=1).fillna(0)
+         res_df = res_df.sort_index()
+         res_df = res_df.astype(int)
+
+         res_df.to_csv(
+             f"{output_prefix}_{db_label}_study_summary.tsv",
+             sep="\t",
+             index_label="taxonomy",
+         )
+
+     if db_label in RRAP_FUNCDB_LABELS:
+         count_df_list = []
+         depth_df_list = []
+         breadth_df_list = []
+
+         for run_acc, analysis_df in analysis_dfs.items():
+             count_df, depth_df, breadth_df = parse_one_func_file(
+                 run_acc, analysis_df, db_label
+             )
+             count_df_list.append(count_df)
+             depth_df_list.append(depth_df)
+             breadth_df_list.append(breadth_df)
+
+         count_df = pd.concat(count_df_list, axis=1).fillna(0)
+         count_df = count_df.sort_index()
+         count_df = count_df.astype(int)
+
+         count_df.to_csv(
+             f"{output_prefix}_{db_label}_read-count_study_summary.tsv",
+             sep="\t",
+             index_label="function",
+         )
+
+         depth_df = pd.concat(depth_df_list, axis=1).fillna(0)
+         depth_df = depth_df.sort_index()
+         depth_df = depth_df.astype(float)
+
+         depth_df.to_csv(
+             f"{output_prefix}_{db_label}_coverage-depth_study_summary.tsv",
+             sep="\t",
+             index_label="function",
+             float_format="%.6g",
+         )
+
+         breadth_df = pd.concat(breadth_df_list, axis=1).fillna(0)
+         breadth_df = breadth_df.sort_index()
+         breadth_df = breadth_df.astype(float)
+
+         breadth_df.to_csv(
+             f"{output_prefix}_{db_label}_coverage-breadth_study_summary.tsv",
+             sep="\t",
+             index_label="function",
+             float_format="%.6g",
+         )
+
+
+ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[str, List]:
+     """Matches different summary files of the same database label and analysis
+     type into a dictionary to help merge
+     the correct summaries.
+
+     :param all_study_summaries: List of file paths to different summary files
+     :type all_study_summaries: List[str]
+     :return: Organised dictionary where each summary is paired to a specific
+         database label key to be merged together.
+     :rtype: defaultdict[List]
+     """
+     summaries_dict = defaultdict(list)
+
+     for summary in all_study_summaries:
+         summary_path = Path(summary)
+         summary_filename = summary_path.stem
+
+         summary_db_label = summary_filename.split("_")[1]
+
+         summaries_dict[summary_db_label].append(summary_path)
+
+     return summaries_dict
+
+
+ @cli.command(
+     "summarise",
+     options_metavar="-r <runs> -a <analyses_dir> -p <output_prefix>",
+     short_help="Generate study-level summaries of raw-read analysis results.",
+ )
+ @click.option(
+     "-r",
+     "--runs",
+     required=True,
+     help="CSV file containing successful analyses generated by the pipeline",
+     type=click.Path(exists=True, path_type=Path, dir_okay=False),
+ )
+ @click.option(
+     "-a",
+     "--analyses_dir",
+     required=True,
+     help="Input directory to where all the individual analyses subdirectories for summarising",
+     type=click.Path(exists=True, path_type=Path, file_okay=False),
+ )
+ @click.option(
+     "-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
+ )
+ @click.option(
+     "--non_insdc",
+     default=False,
+     is_flag=True,
+     help="If run accessions aren't INSDC-formatted",
+ )
+ def summarise_analyses(
+     runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool
+ ) -> None:
+     """Function that will take a file of pipeline-successful run accessions
+     that should be used for the generation of the relevant db-specific
+     study-level summary files.
+     \f
+
+     :param runs: Path to a qc_passed_runs file from the pipeline execution.
+         Contains the accessions of runs that should therefore be included in the generated
+         summaries.
+     :type runs: Path
+     :param analyses_dir: The path to the directory containing all of the analyses.
+     :type analyses_dir: Path
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+     runs_df = pd.read_csv(runs, names=["run", "status"])
+
+     if not non_insdc:
+         RawReadsPassedRunsSchema(
+             runs_df
+         ) # Run validation on the successful_runs .csv file
+     else:
+         RawReadsNonINSDCPassedRunsSchema(runs_df)
+
+     all_db_labels = RRAP_TAXDB_LABELS + RRAP_FUNCDB_LABELS
+     for db_label in all_db_labels:
+
+         analysis_files = {}
+         for run_acc in runs_df["run"]:
+             analysis_file = get_file(run_acc, analyses_dir, db_label)
+
+             if analysis_file:
+                 analysis_files[run_acc] = analysis_file
+
+         if analysis_files:
+             generate_db_summary(db_label, analysis_files, output_prefix)
+
+
+ @cli.command(
+     "merge",
+     options_metavar="-a <analyses_dir> -p <output_prefix>",
+     short_help="Merge multiple study-level summaries of raw-read analysis.",
+ )
+ @click.option(
+     "-a",
+     "--analyses_dir",
+     required=True,
+     help="Input directory to where all the individual analyses subdirectories for merging",
+     type=click.Path(exists=True, file_okay=False),
+ )
+ @click.option(
+     "-p",
+     "--output_prefix",
+     required=True,
+     help="Prefix to merged summary files",
+     type=str,
+ )
+ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
+     """Function that will take a file path containing study-level
+     summaries that should be merged together on a per-db
+     basis.
+     \f
+
+     :param analyses_dir: The filepath to the directory containing all of the analyses.
+     :type analyses_dir: str
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+
+     all_study_summaries = glob.glob(f"{analyses_dir}/*_study_summary.tsv")
+
+     summaries_dict = organise_study_summaries(all_study_summaries)
+
+     for db_label, summaries in summaries_dict.items():
+         if db_label in RRAP_TAXDB_LABELS:
+             merged_summary_name = f"{output_prefix}_{db_label}_study_summary.tsv"
+             if len(summaries) > 1:
+                 res_df = pd.read_csv(summaries[0], sep="\t", index_col=0)
+                 for summary in summaries[1:]:
+                     curr_df = pd.read_csv(summary, sep="\t", index_col=0)
+                     res_df = res_df.join(curr_df, how="outer")
+                 res_df = res_df.fillna(0)
+                 res_df = res_df.astype(int)
+
+                 res_df = res_df.reindex(sorted(res_df.columns), axis=1)
+                 res_df.to_csv(
+                     merged_summary_name,
+                     sep="\t",
+                     index_label="taxonomy",
+                 )
+             elif len(summaries) == 1:
+                 logging.info(
+                     f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}"
+                 )
+                 try:
+                     shutil.copyfile(summaries[0], merged_summary_name)
+                 except SameFileError:
+                     pass
+
+         if db_label in RRAP_FUNCDB_LABELS:
+             for table_type in ["read-count", "coverage-depth", "coverage-breadth"]:
+                 merged_summary_name = (
+                     f"{output_prefix}_{db_label}_{table_type}_study_summary.tsv"
+                 )
+                 summaries_ = [
+                     v for v in summaries if Path(v).stem.split("_")[2] == table_type
+                 ]
+                 if len(summaries_) > 1:
+                     res_df = pd.read_csv(summaries_[0], sep="\t", index_col=0)
+                     for summary in summaries_[1:]:
+                         curr_df = pd.read_csv(summary, sep="\t", index_col=0)
+                         res_df = res_df.join(curr_df, how="outer")
+                     res_df = res_df.fillna(0)
+                     res_df = res_df.astype(
+                         int if table_type == "read-count" else float
+                     )
+
+                     res_df = res_df.reindex(sorted(res_df.columns), axis=1)
+                     res_df.to_csv(
+                         merged_summary_name,
+                         sep="\t",
+                         index_label="function",
+                         float_format="%.6g",
+                     )
+                 elif len(summaries_) == 1:
+                     logging.info(
+                         f"Only one summary ({summaries_[0]}) so will use that as {merged_summary_name}"
+                     )
+                     try:
+                         shutil.copyfile(summaries_[0], merged_summary_name)
+                     except SameFileError:
+                         pass
+
+
+ if __name__ == "__main__":
+     cli()
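
For orientation, below is a minimal sketch (not part of the release) of how the new raw-reads study summary CLI added in this version might be driven programmatically through click's test runner. The module path and the `summarise` and `merge` commands with their options come from study_summary_generator.py above; the input paths, output prefix, and directory layout are hypothetical placeholders.

# Hedged usage sketch: module path and commands are taken from the diff above;
# every concrete path and prefix below is a hypothetical placeholder.
from click.testing import CliRunner

from mgnify_pipelines_toolkit.analysis.rawreads.study_summary_generator import cli

runner = CliRunner()

# Generate per-study summaries. The runs CSV is the pipeline's passed-runs file
# (headerless "run,status" rows); the analyses directory is expected to hold one
# subdirectory per run accession containing taxonomy-summary/ and function-summary/.
# Both -r and -a must point at existing paths, since the options use click.Path(exists=True).
result = runner.invoke(
    cli,
    ["summarise", "-r", "qc_passed_runs.csv", "-a", "analyses/", "-p", "STUDY1"],
)
print(result.exit_code, result.output)

# Merge previously generated study summaries on a per-database-label basis.
result = runner.invoke(cli, ["merge", "-a", "summaries/", "-p", "STUDY1_merged"])
print(result.exit_code, result.output)

On an installed copy of the package, the same commands should also be reachable through whichever console script the updated entry_points.txt registers for this module; the exact script name is not visible in this section of the diff.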