mgnify-pipelines-toolkit 1.2.10__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
This version of mgnify-pipelines-toolkit is flagged as a potentially problematic release.
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +2 -1
- mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py +30 -69
- mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +29 -60
- mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py +33 -81
- mgnify_pipelines_toolkit/schemas/dataframes.py +325 -0
- {mgnify_pipelines_toolkit-1.2.10.dist-info → mgnify_pipelines_toolkit-1.3.0.dist-info}/METADATA +25 -12
- {mgnify_pipelines_toolkit-1.2.10.dist-info → mgnify_pipelines_toolkit-1.3.0.dist-info}/RECORD +11 -11
- mgnify_pipelines_toolkit/schemas/schemas.py +0 -738
- {mgnify_pipelines_toolkit-1.2.10.dist-info → mgnify_pipelines_toolkit-1.3.0.dist-info}/WHEEL +0 -0
- {mgnify_pipelines_toolkit-1.2.10.dist-info → mgnify_pipelines_toolkit-1.3.0.dist-info}/entry_points.txt +0 -0
- {mgnify_pipelines_toolkit-1.2.10.dist-info → mgnify_pipelines_toolkit-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.2.10.dist-info → mgnify_pipelines_toolkit-1.3.0.dist-info}/top_level.txt +0 -0
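Most of the churn in the three study summary generators below is mechanical: the dataframe schemas now live in the new mgnify_pipelines_toolkit/schemas/dataframes.py (the old schemas/schemas.py is deleted), imports are regrouped and sorted, and many statements that were previously wrapped across several lines are collapsed onto one. As a representative example, the schema import in the amplicon generator ends up as the block below (taken from the diff further down; the old module path is not fully captured there, but the deleted file suggests it was mgnify_pipelines_toolkit.schemas.schemas):

    from mgnify_pipelines_toolkit.schemas.dataframes import (
        AmpliconNonINSDCPassedRunsSchema,
        AmpliconPassedRunsSchema,
        PR2TaxonSchema,
        TaxonSchema,
        validate_dataframe,
    )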
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py

@@ -167,7 +167,8 @@ def main():
 matched_primers_list.append(cleaned_primer_name)

 res_df = pd.DataFrame.from_dict(res_dict)
-
+res_tsv_name = f"./{sample}_primer_validation.tsv"
+res_df.to_csv(res_tsv_name, sep="\t", index=False) if not res_df.empty else open(res_tsv_name, "w").close()

 fwd_primers_fw.close()
 rev_primers_fw.close()
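The two added lines change what happens when no primers were classified: the TSV is written only if res_df has rows, and otherwise an empty file is created at the same path. A minimal standalone sketch of that pattern (the sample name and columns here are placeholders, not taken from the pipeline):

    import pandas as pd

    sample = "ERR0000001"  # placeholder accession
    res_df = pd.DataFrame(columns=["primer_name", "strand"])  # may or may not be empty

    res_tsv_name = f"./{sample}_primer_validation.tsv"
    if not res_df.empty:
        res_df.to_csv(res_tsv_name, sep="\t", index=False)
    else:
        open(res_tsv_name, "w").close()  # create an empty file in place of the table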
mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py

@@ -1,7 +1,5 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-import shutil
-from shutil import SameFileError

 # Copyright 2024-2025 EMBL - European Bioinformatics Institute
 #
@@ -16,25 +14,27 @@ from shutil import SameFileError
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import click
-from collections import defaultdict
 import glob
 import logging
+import shutil
+from collections import defaultdict
 from pathlib import Path
-from
+from shutil import SameFileError
+from typing import List, Union

+import click
 import pandas as pd

-from mgnify_pipelines_toolkit.constants.db_labels import
+from mgnify_pipelines_toolkit.constants.db_labels import ASV_TAXDB_LABELS, TAXDB_LABELS
 from mgnify_pipelines_toolkit.constants.tax_ranks import (
-_SILVA_TAX_RANKS,
 _PR2_TAX_RANKS,
+_SILVA_TAX_RANKS,
 )
-from mgnify_pipelines_toolkit.schemas.
-AmpliconPassedRunsSchema,
+from mgnify_pipelines_toolkit.schemas.dataframes import (
 AmpliconNonINSDCPassedRunsSchema,
-
+AmpliconPassedRunsSchema,
 PR2TaxonSchema,
+TaxonSchema,
 validate_dataframe,
 )

@@ -46,9 +46,7 @@ def cli():
 pass


-def get_tax_file(
-run_acc: str, analyses_dir: Path, db_label: str
-) -> Union[Path, List[Path]]:
+def get_tax_file(run_acc: str, analyses_dir: Path, db_label: str) -> Union[Path, List[Path]]:
 """Takes path information for a particular analysis and db_label combo, and returns any existing files.

 :param run_acc: Run accession for the tax file that should be retrieved.
@@ -69,48 +67,32 @@ def get_tax_file(
 db_path = Path(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}")

 if not db_path.exists():
-logging.debug(
-f"DB {db_path} doesn't exist for {run_acc}. Skipping"
-) # or error?
+logging.debug(f"DB {db_path} doesn't exist for {run_acc}. Skipping") # or error?
 return

 if db_label in TAXDB_LABELS:
-tax_file = Path(
-f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/{run_acc}_{db_label}.txt"
-)
+tax_file = Path(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/{run_acc}_{db_label}.txt")
 if not tax_file.exists():
-logging.error(
-f"DB path exists but file doesn't - exiting. Path: {tax_file}"
-)
+logging.error(f"DB path exists but file doesn't - exiting. Path: {tax_file}")
 exit(1)

 file_size = tax_file.stat().st_size
-if (
-file_size == 0
-): # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
+if file_size == 0: # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
 # so need to skip those. Should probably fix that at some point
-logging.debug(
-f"File {tax_file} exists but is empty, so will be skipping it."
-)
+logging.debug(f"File {tax_file} exists but is empty, so will be skipping it.")
 tax_file = None
 elif db_label in ASV_TAXDB_LABELS:
 # ASV tax files could have up to two files, one for each amplified region (maximum two from the pipeline).
 # So will need to handle this differently to closed-reference files
-asv_tax_files = glob.glob(
-
-)
-asv_tax_files = [
-Path(file) for file in asv_tax_files if "concat" not in file
-] # Have to filter out concatenated file if it exists
+asv_tax_files = glob.glob(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/*.txt")
+asv_tax_files = [Path(file) for file in asv_tax_files if "concat" not in file] # Have to filter out concatenated file if it exists

 tax_file = asv_tax_files

 return tax_file


-def parse_one_tax_file(
-run_acc: str, tax_file: Path, long_tax_ranks: list
-) -> pd.DataFrame:
+def parse_one_tax_file(run_acc: str, tax_file: Path, long_tax_ranks: list) -> pd.DataFrame:
 """Parses a taxonomy file, and returns it as a pandas DataFrame object.

 :param run_acc: Run accession of the taxonomy file that will be parsed.
@@ -134,9 +116,7 @@ def parse_one_tax_file(
 elif len(long_tax_ranks) == 9:
 validate_dataframe(res_df, PR2TaxonSchema, str(tax_file))

-res_df["full_taxon"] = res_df.iloc[:, 1:].apply(
-lambda x: ";".join(x).strip(";"), axis=1
-)
+res_df["full_taxon"] = res_df.iloc[:, 1:].apply(lambda x: ";".join(x).strip(";"), axis=1)
 final_df = res_df.iloc[:, [0, -1]]
 final_df = final_df.set_index("full_taxon")
 final_df.columns = [run_acc]
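The collapsed line above builds the full_taxon string by joining every column after the first with ";" and trimming any trailing separators. A small self-contained illustration of that idiom on made-up data (the column names are illustrative, not the pipeline's):

    import pandas as pd

    res_df = pd.DataFrame(
        {
            "Count": [10, 3],
            "Kingdom": ["Bacteria", "Eukaryota"],
            "Phylum": ["Bacteroidota", ""],
            "Class": ["Bacteroidia", ""],
        }
    )
    # Join all rank columns into one ;-separated lineage, dropping trailing ;
    res_df["full_taxon"] = res_df.iloc[:, 1:].apply(lambda x: ";".join(x).strip(";"), axis=1)
    print(res_df["full_taxon"].tolist())
    # ['Bacteria;Bacteroidota;Bacteroidia', 'Eukaryota']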
@@ -144,9 +124,7 @@ def parse_one_tax_file(
 return final_df


-def generate_db_summary(
-db_label: str, tax_dfs: defaultdict[Path], output_prefix: str
-) -> None:
+def generate_db_summary(db_label: str, tax_dfs: defaultdict[Path], output_prefix: str) -> None:
 """Takes paired run accessions taxonomy dataframes in the form of a dictionary,
 and respective db_label, joins them together, and generates a study-wide summary
 in the form of a .tsv file.
@@ -185,7 +163,6 @@ def generate_db_summary(
 )

 elif db_label in ASV_TAXDB_LABELS:
-
 if "PR2" in db_label:
 long_tax_ranks = _PR2_TAX_RANKS
 else:
@@ -196,13 +173,9 @@ def generate_db_summary(
 for (
 run_acc,
 tax_df_asv_lst,
-) in (
-tax_dfs.items()
-): # each `tax_file` will be a list containing at most two files (one for each amp_region)
+) in tax_dfs.items(): # each `tax_file` will be a list containing at most two files (one for each amp_region)
 for tax_df in tax_df_asv_lst:
-amp_region = str(tax_df).split("_")[
--5
-] # there are a lot of underscores in these names... but it is consistent
+amp_region = str(tax_df).split("_")[-5] # there are a lot of underscores in these names... but it is consistent
 # e.g. ERR4334351_16S-V3-V4_DADA2-SILVA_asv_krona_counts.txt
 amp_region_df = parse_one_tax_file(run_acc, tax_df, long_tax_ranks)
 amp_region_dict[amp_region].append(amp_region_df)
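The split("_")[-5] indexing relies on the fixed shape of the ASV Krona file names, as the in-code comment notes. Checking that indexing against the example name given in the comment:

    name = "ERR4334351_16S-V3-V4_DADA2-SILVA_asv_krona_counts.txt"
    parts = name.split("_")
    # ['ERR4334351', '16S-V3-V4', 'DADA2-SILVA', 'asv', 'krona', 'counts.txt']
    amp_region = parts[-5]
    print(amp_region)  # 16S-V3-V4

In the function itself the split runs on the whole file path, so counting from the end of the name keeps the index stable even if parent directories contain underscores.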
@@ -241,13 +214,9 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List

 temp_lst = summary_filename.split("_")
 if "asv_study_summary" in summary_filename:
-summary_db_label = "_".join(
-temp_lst[1:3]
-) # For ASVs we need to include the amp_region in the label
+summary_db_label = "_".join(temp_lst[1:3]) # For ASVs we need to include the amp_region in the label
 else:
-summary_db_label = temp_lst[
-1
-] # For closed reference, just the db_label is needed
+summary_db_label = temp_lst[1] # For closed reference, just the db_label is needed

 summaries_dict[summary_db_label].append(summary_path)

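The summary filenames themselves carry the database label (and, for ASVs, the amplified region). Purely for illustration, assuming per-database summary names shaped like PREFIX_SILVA-SSU_study_summary.tsv and PREFIX_DADA2-SILVA_16S-V3-V4_asv_study_summary.tsv (hypothetical layouts; the real names come from generate_db_summary and are not captured here), the label derivation works like this:

    for summary_filename in [
        "PREFIX_SILVA-SSU_study_summary.tsv",                   # hypothetical closed-reference summary
        "PREFIX_DADA2-SILVA_16S-V3-V4_asv_study_summary.tsv",   # hypothetical ASV summary
    ]:
        temp_lst = summary_filename.split("_")
        if "asv_study_summary" in summary_filename:
            summary_db_label = "_".join(temp_lst[1:3])  # here: db label plus amplified region
        else:
            summary_db_label = temp_lst[1]
        print(summary_db_label)
    # SILVA-SSU
    # DADA2-SILVA_16S-V3-V4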
@@ -273,18 +242,14 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List
 help="Input directory to where all the individual analyses subdirectories for summarising",
 type=click.Path(exists=True, path_type=Path, file_okay=False),
 )
-@click.option(
-"-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
-)
+@click.option("-p", "--output_prefix", required=True, help="Prefix to summary files", type=str)
 @click.option(
 "--non_insdc",
 default=False,
 is_flag=True,
 help="If run accessions aren't INSDC-formatted",
 )
-def summarise_analyses(
-runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool
-) -> None:
+def summarise_analyses(runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool) -> None:
 """Function that will take a file of pipeline-successful run accessions
 that should be used for the generation of the relevant db-specific
 study-level summary files. For ASV results, these will also be on a
@@ -302,16 +267,14 @@ def summarise_analyses(
 """
 runs_df = pd.read_csv(runs, names=["run", "status"])

+# Run validation on the successful_runs .csv file
 if not non_insdc:
-AmpliconPassedRunsSchema(
-runs_df
-) # Run validation on the successful_runs .csv file
+AmpliconPassedRunsSchema(runs_df)
 else:
 AmpliconNonINSDCPassedRunsSchema(runs_df)

 all_db_labels = TAXDB_LABELS + ASV_TAXDB_LABELS
 for db_label in all_db_labels:
-
 tax_files = defaultdict(Path)
 for i in range(0, len(runs_df)):
 run_acc = runs_df.loc[i, "run"]
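The --non_insdc flag only switches which schema class validates the two-column run list (run, status); the schemas themselves are defined in the new schemas/dataframes.py, which is not shown in this diff. Purely as an illustration of the kind of check the INSDC variant might perform, here is a standalone sketch that flags run accessions not matching the common INSDC run-accession pattern (the regex is an assumption for this example, not the package's actual rule):

    import pandas as pd

    # Made-up run list with one deliberately malformed accession
    runs_df = pd.DataFrame({"run": ["ERR0000001", "not-an-accession"], "status": ["ok", "ok"]})

    bad = runs_df.loc[~runs_df["run"].astype(str).str.match(r"^[EDS]RR\d+$"), "run"]
    if not bad.empty:
        raise ValueError(f"Non-INSDC run accessions found: {', '.join(bad.astype(str))}")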
@@ -376,9 +339,7 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
 index_label="taxonomy",
 )
 elif len(summaries) == 1:
-logging.info(
-f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}"
-)
+logging.info(f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}")
 try:
 shutil.copyfile(summaries[0], merged_summary_name)
 except SameFileError:
mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py

@@ -14,39 +14,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import click
-from functools import reduce
 import glob
 import logging
+from functools import reduce
 from pathlib import Path
 from typing import Literal

+import click
 import pandas as pd

-from mgnify_pipelines_toolkit.schemas.
+from mgnify_pipelines_toolkit.schemas.dataframes import (
+AntismashStudySummarySchema,
+AntismashSummarySchema,
 CompletedAnalysisSchema,
-
+GOStudySummarySchema,
 GOSummarySchema,
+InterProStudySummarySchema,
 InterProSummarySchema,
-
-SanntisSummarySchema,
-AntismashSummarySchema,
-PFAMSummarySchema,
+KEGGModulesStudySummarySchema,
 KEGGModulesSummarySchema,
-GOStudySummarySchema,
-InterProStudySummarySchema,
-TaxonomyStudySummarySchema,
 KOStudySummarySchema,
-
-AntismashStudySummarySchema,
+KOSummarySchema,
 PFAMStudySummarySchema,
-
+PFAMSummarySchema,
+SanntisStudySummarySchema,
+SanntisSummarySchema,
+TaxonomyStudySummarySchema,
+TaxonSchema,
 validate_dataframe,
 )

-logging.basicConfig(
-level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
-)
+logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")

 # Keys are the original column names in the input files,
 # values are the standardised column names used in the generated study summary files
@@ -173,9 +171,7 @@ def check_files_exist(file_list: list[Path]) -> None:
 """
 missing_files = [str(path) for path in file_list if not path.is_file()]
 if missing_files:
-raise FileNotFoundError(
-f"The following required files are missing: {', '.join(missing_files)}"
-)
+raise FileNotFoundError(f"The following required files are missing: {', '.join(missing_files)}")


 def generate_taxonomy_summary(
@@ -206,9 +202,7 @@ def generate_taxonomy_summary(
 df = validate_dataframe(df, TaxonSchema, str(path))

 # Combine all taxonomic ranks in the classification into a single string
-df["full_taxon"] = (
-df[TAXONOMY_COLUMN_NAMES[1:]].agg(";".join, axis=1).str.strip(";")
-)
+df["full_taxon"] = df[TAXONOMY_COLUMN_NAMES[1:]].agg(";".join, axis=1).str.strip(";")

 # Create a new DataFrame with taxonomy as index and count as the only column
 result = df[["Count", "full_taxon"]].set_index("full_taxon")
@@ -229,9 +223,7 @@ def generate_functional_summary(
 file_dict: dict[str, Path],
 column_names: dict[str, str],
 output_prefix: str,
-label: Literal[
-"go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
-],
+label: Literal["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"],
 outdir: Path = None,
 allow_missing: bool = False,
 ) -> None:
@@ -292,9 +284,7 @@ def generate_functional_summary(
 check_files_exist(list(file_dict.values()))
 except FileNotFoundError as e:
 if allow_missing:
-logging.warning(
-f"One of the expected files is missing, but this is allowed for {label}."
-)
+logging.warning(f"One of the expected files is missing, but this is allowed for {label}.")
 logging.warning(e)
 return
 raise
@@ -324,9 +314,7 @@ def generate_functional_summary(
 dfs.append(df)

 if not dfs:
-logging.warning(
-f"No valid files with functional annotation summary were found. Skipping creation of {output_file_name}."
-)
+logging.warning(f"No valid files with functional annotation summary were found. Skipping creation of {output_file_name}.")
 return

 # Merge all dataframes on the renamed metadata columns
@@ -384,9 +372,7 @@ def generate_functional_summary(
 help="Directory for the output files, by default it will use the current working directory.",
 type=click.Path(exists=True, path_type=Path, file_okay=False),
 )
-def summarise_analyses(
-assemblies: Path, study_dir: Path, output_prefix: str, outdir: Path
-) -> None:
+def summarise_analyses(assemblies: Path, study_dir: Path, output_prefix: str, outdir: Path) -> None:
 """
 Generate study-level summaries for successfully proccessed assemblies.

@@ -405,16 +391,11 @@ def summarise_analyses(
 Construct file paths for each assembly given a subdirectory and filename template.
 Template must contain {acc} as a placeholder.
 """
-return {
-acc: study_dir / acc / subdir / filename_template.format(acc=acc)
-for acc in assembly_list
-}
+return {acc: study_dir / acc / subdir / filename_template.format(acc=acc) for acc in assembly_list}

 logging.info("Start processing of assembly-level summaries.")

-logging.info(
-"Generating taxonomy summary from assembly-level summaries <accession>.krona.txt"
-)
+logging.info("Generating taxonomy summary from assembly-level summaries <accession>.krona.txt")
 generate_taxonomy_summary(
 get_file_paths("taxonomy", "{acc}.krona.txt.gz"),
 f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}",
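The collapsed get_file_paths helper is a plain dict comprehension mapping each assembly accession to the expected file path under its analysis subdirectory. A standalone sketch with placeholder accessions:

    from pathlib import Path

    study_dir = Path("study_dir")                  # placeholder
    assembly_list = ["ERZ100001", "ERZ100002"]     # placeholder accessions
    subdir, filename_template = "taxonomy", "{acc}.krona.txt.gz"

    file_paths = {
        acc: study_dir / acc / subdir / filename_template.format(acc=acc)
        for acc in assembly_list
    }
    # {'ERZ100001': PosixPath('study_dir/ERZ100001/taxonomy/ERZ100001.krona.txt.gz'), ...}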
@@ -422,9 +403,7 @@ def summarise_analyses(
 )

 for summary_type, config in SUMMARY_TYPES_MAP.items():
-logging.info(
-f"Generating study-level {summary_type.capitalize()} summary from file <accession>_{summary_type}_summary.tsv.gz"
-)
+logging.info(f"Generating study-level {summary_type.capitalize()} summary from file <accession>_{summary_type}_summary.tsv.gz")
 generate_functional_summary(
 get_file_paths(config["folder"], f"{{acc}}_{summary_type}_summary.tsv.gz"),
 config["column_names"],
@@ -469,9 +448,7 @@ def merge_summaries(study_dir: str, output_prefix: str) -> None:

 logging.info("Generating combined assembly-level summaries")
 logging.info("Parsing summary files for taxonomic classification")
-merge_taxonomy_summaries(
-get_file_paths("taxonomy"), f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}"
-)
+merge_taxonomy_summaries(get_file_paths("taxonomy"), f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}")

 for summary_type, config in SUMMARY_TYPES_MAP.items():
 logging.info(f"Parsing summary files for {summary_type.capitalize()}.")
@@ -500,9 +477,7 @@ def merge_taxonomy_summaries(summary_files: list[str], output_file_name: str) ->
 sk__Eukaryota;k__Metazoa;p__Chordata;c__Mammalia;o__Primates 118 94
 """
 if not summary_files:
-raise FileNotFoundError(
-"The required taxonomic classification summary files are missing. Exiting."
-)
+raise FileNotFoundError("The required taxonomic classification summary files are missing. Exiting.")

 summary_dfs = []
 for file in summary_files:
@@ -527,9 +502,7 @@ def merge_functional_summaries(
 summary_files: list[str],
 merge_keys: list[str],
 output_prefix: str,
-label: Literal[
-"go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
-],
+label: Literal["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"],
 ) -> None:
 """
 Merge multiple functional study-level summary files into a single study-level summary.
@@ -580,9 +553,7 @@ def merge_functional_summaries(
 output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"

 if not summary_files:
-logging.warning(
-f"Skipping creation of {output_file_name} because no summaries were found for this type of functional annotation."
-)
+logging.warning(f"Skipping creation of {output_file_name} because no summaries were found for this type of functional annotation.")
 return

 validation_schema = SUMMARY_TYPES_MAP[label]["study_schema"]
@@ -596,9 +567,7 @@
 if len(dfs) == 1:
 merged_df = dfs[0]
 else:
-merged_df = reduce(
-lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs
-)
+merged_df = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs)

 # Identify non-key columns (i.e. counts)
 value_columns = [col for col in merged_df.columns if col not in merge_keys]
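The single-line reduce above outer-merges the loaded summary tables pairwise on the shared key columns, so annotation entries present in only some tables are kept rather than dropped. A small self-contained example of the same idiom (toy GO-style keys and made-up counts; the real merge keys are passed in by the caller):

    from functools import reduce

    import pandas as pd

    merge_keys = ["GO", "description"]
    dfs = [
        pd.DataFrame({"GO": ["GO:0008150"], "description": ["biological_process"], "ERZ100001": [5]}),
        pd.DataFrame(
            {
                "GO": ["GO:0008150", "GO:0003674"],
                "description": ["biological_process", "molecular_function"],
                "ERZ100002": [2, 7],
            }
        ),
    ]
    merged_df = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs)
    # GO:0003674 appears only in the second frame, so its ERZ100001 value is NaN after the merge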
mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py

@@ -14,32 +14,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import shutil
-from shutil import SameFileError
-
-import click
-from collections import defaultdict
 import glob
 import logging
+import shutil
+from collections import defaultdict
 from pathlib import Path
-from
+from shutil import SameFileError
+from typing import List, Union

+import click
 import pandas as pd

 from mgnify_pipelines_toolkit.constants.db_labels import (
-RRAP_TAXDB_LABELS,
 RRAP_FUNCDB_LABELS,
+RRAP_TAXDB_LABELS,
 )
 from mgnify_pipelines_toolkit.constants.tax_ranks import (
-_SILVA_TAX_RANKS,
 _MOTUS_TAX_RANKS,
+_SILVA_TAX_RANKS,
 )
-from mgnify_pipelines_toolkit.schemas.
-
+from mgnify_pipelines_toolkit.schemas.dataframes import (
+FunctionProfileSchema,
+MotusTaxonSchema,
 RawReadsNonINSDCPassedRunsSchema,
+RawReadsPassedRunsSchema,
 TaxonSchema,
-MotusTaxonSchema,
-FunctionProfileSchema,
 validate_dataframe,
 )

@@ -51,9 +50,7 @@ def cli():
 pass


-def get_file(
-run_acc: str, analyses_dir: Path, db_label: str
-) -> Union[Path, List[Path], None]:
+def get_file(run_acc: str, analyses_dir: Path, db_label: str) -> Union[Path, List[Path], None]:
 """Takes path information for a particular analysis and db_label combo, and returns any existing files.

 :param run_acc: Run accession for the tax file that should be retrieved.
@@ -78,28 +75,18 @@ def get_file(
 db_path = Path(f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}")

 if not db_path.exists():
-logging.debug(
-f"DB {db_path} doesn't exist for {run_acc}. Skipping"
-) # or error?
+logging.debug(f"DB {db_path} doesn't exist for {run_acc}. Skipping") # or error?
 return

-analysis_file = Path(
-f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}/{run_acc}_{db_label}.txt.gz"
-)
+analysis_file = Path(f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}/{run_acc}_{db_label}.txt.gz")
 if not analysis_file.exists():
-logging.error(
-f"DB path exists but file doesn't - exiting. Path: {analysis_file}"
-)
+logging.error(f"DB path exists but file doesn't - exiting. Path: {analysis_file}")
 exit(1)

 file_size = analysis_file.stat().st_size
-if (
-file_size == 0
-): # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
+if file_size == 0: # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
 # so need to skip those. Should probably fix that at some point
-logging.debug(
-f"File {analysis_file} exists but is empty, so will be skipping it."
-)
+logging.debug(f"File {analysis_file} exists but is empty, so will be skipping it.")
 analysis_file = None

 return analysis_file
@@ -130,21 +117,13 @@ def parse_one_tax_file(run_acc: str, tax_file: Path, db_label: str) -> pd.DataFr
 str(tax_file),
 )

-res_df["full_taxon"] = [
-
-]
-final_df = (
-res_df[["Count", "full_taxon"]]
-.set_index("full_taxon")
-.rename(columns={"Count": run_acc})
-)
+res_df["full_taxon"] = [";".join(r[tax_ranks]).strip(";") for _, r in res_df.iterrows()]
+final_df = res_df[["Count", "full_taxon"]].set_index("full_taxon").rename(columns={"Count": run_acc})

 return final_df


-def parse_one_func_file(
-run_acc: str, func_file: Path, db_label: str
-) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+def parse_one_func_file(run_acc: str, func_file: Path, db_label: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
 """Parses a functional profile file, and returns it as a pandas DataFrame object.

 :param run_acc: Run accession of the taxonomy file that will be parsed.
@@ -170,24 +149,16 @@ def parse_one_func_file(
 if res_df.shape[0] > 0:
 validate_dataframe(res_df, FunctionProfileSchema, str(func_file))

-count_df = pd.DataFrame(res_df[["read_count"]]).rename(
-columns={"read_count": run_acc}
-)
+count_df = pd.DataFrame(res_df[["read_count"]]).rename(columns={"read_count": run_acc})

-depth_df = pd.DataFrame(res_df[["coverage_depth"]]).rename(
-columns={"coverage_depth": run_acc}
-)
+depth_df = pd.DataFrame(res_df[["coverage_depth"]]).rename(columns={"coverage_depth": run_acc})

-breadth_df = pd.DataFrame(res_df[["coverage_breadth"]]).rename(
-columns={"coverage_breadth": run_acc}
-)
+breadth_df = pd.DataFrame(res_df[["coverage_breadth"]]).rename(columns={"coverage_breadth": run_acc})

 return count_df, depth_df, breadth_df


-def generate_db_summary(
-db_label: str, analysis_dfs: dict[str, Path], output_prefix: str
-) -> None:
+def generate_db_summary(db_label: str, analysis_dfs: dict[str, Path], output_prefix: str) -> None:
 """Takes paired run accessions taxonomy dataframes in the form of a dictionary,
 and respective db_label, joins them together, and generates a study-wide summary
 in the form of a .tsv file.
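Each collapsed line above slices one metric column out of the per-run functional profile and renames it to the run accession, producing three single-column frames (read counts, coverage depth, coverage breadth) that can later be joined across runs. A standalone sketch on made-up data:

    import pandas as pd

    run_acc = "ERR0000001"  # placeholder
    res_df = pd.DataFrame(
        {"read_count": [12, 4], "coverage_depth": [1.5, 0.2], "coverage_breadth": [0.8, 0.1]},
        index=["K00001", "K00002"],  # hypothetical function identifiers
    )

    count_df = pd.DataFrame(res_df[["read_count"]]).rename(columns={"read_count": run_acc})
    depth_df = pd.DataFrame(res_df[["coverage_depth"]]).rename(columns={"coverage_depth": run_acc})
    breadth_df = pd.DataFrame(res_df[["coverage_breadth"]]).rename(columns={"coverage_breadth": run_acc})
    # Each frame now has a single column named after the run, indexed by function identifier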
@@ -225,9 +196,7 @@ def generate_db_summary(
 breadth_df_list = []

 for run_acc, analysis_df in analysis_dfs.items():
-count_df, depth_df, breadth_df = parse_one_func_file(
-run_acc, analysis_df, db_label
-)
+count_df, depth_df, breadth_df = parse_one_func_file(run_acc, analysis_df, db_label)
 count_df_list.append(count_df)
 depth_df_list.append(depth_df)
 breadth_df_list.append(breadth_df)
@@ -308,18 +277,14 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[str,
 help="Input directory to where all the individual analyses subdirectories for summarising",
 type=click.Path(exists=True, path_type=Path, file_okay=False),
 )
-@click.option(
-"-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
-)
+@click.option("-p", "--output_prefix", required=True, help="Prefix to summary files", type=str)
 @click.option(
 "--non_insdc",
 default=False,
 is_flag=True,
 help="If run accessions aren't INSDC-formatted",
 )
-def summarise_analyses(
-runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool
-) -> None:
+def summarise_analyses(runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool) -> None:
 """Function that will take a file of pipeline-successful run accessions
 that should be used for the generation of the relevant db-specific
 study-level summary files.
@@ -337,15 +302,12 @@ def summarise_analyses(
 runs_df = pd.read_csv(runs, names=["run", "status"])

 if not non_insdc:
-RawReadsPassedRunsSchema(
-runs_df
-) # Run validation on the successful_runs .csv file
+RawReadsPassedRunsSchema(runs_df) # Run validation on the successful_runs .csv file
 else:
 RawReadsNonINSDCPassedRunsSchema(runs_df)

 all_db_labels = RRAP_TAXDB_LABELS + RRAP_FUNCDB_LABELS
 for db_label in all_db_labels:
-
 analysis_files = {}
 for run_acc in runs_df["run"]:
 analysis_file = get_file(run_acc, analyses_dir, db_label)
@@ -410,9 +372,7 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
 index_label="taxonomy",
 )
 elif len(summaries) == 1:
-logging.info(
-f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}"
-)
+logging.info(f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}")
 try:
 shutil.copyfile(summaries[0], merged_summary_name)
 except SameFileError:
@@ -420,21 +380,15 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:

 if db_label in RRAP_FUNCDB_LABELS:
 for table_type in ["read-count", "coverage-depth", "coverage-breadth"]:
-merged_summary_name =
-
-)
-summaries_ = [
-v for v in summaries if Path(v).stem.split("_")[2] == table_type
-]
+merged_summary_name = f"{output_prefix}_{db_label}_{table_type}_study_summary.tsv"
+summaries_ = [v for v in summaries if Path(v).stem.split("_")[2] == table_type]
 if len(summaries_) > 1:
 res_df = pd.read_csv(summaries_[0], sep="\t", index_col=0)
 for summary in summaries_[1:]:
 curr_df = pd.read_csv(summary, sep="\t", index_col=0)
 res_df = res_df.join(curr_df, how="outer")
 res_df = res_df.fillna(0)
-res_df = res_df.astype(
-int if table_type == "read-count" else float
-)
+res_df = res_df.astype(int if table_type == "read-count" else float)

 res_df = res_df.reindex(sorted(res_df.columns), axis=1)
 res_df.to_csv(
@@ -444,9 +398,7 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
 float_format="%.6g",
 )
 elif len(summaries_) == 1:
-logging.info(
-f"Only one summary ({summaries_[0]}) so will use that as {merged_summary_name}"
-)
+logging.info(f"Only one summary ({summaries_[0]}) so will use that as {merged_summary_name}")
 try:
 shutil.copyfile(summaries_[0], merged_summary_name)
 except SameFileError:
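For the functional databases, merge_summaries combines several study summaries of the same table type by outer-joining them on the index, filling gaps with 0 and casting read counts back to int (the coverage tables stay float). A minimal sketch of that merge step with two made-up read-count summaries:

    import pandas as pd

    table_type = "read-count"
    a = pd.DataFrame({"ERR0000001": [3, 0]}, index=["K00001", "K00002"])
    b = pd.DataFrame({"ERR0000002": [5]}, index=["K00002"])

    res_df = a.join(b, how="outer")
    res_df = res_df.fillna(0)
    res_df = res_df.astype(int if table_type == "read-count" else float)
    res_df = res_df.reindex(sorted(res_df.columns), axis=1)
    # K00001 gets 0 for ERR0000002, and every value is an int because this is a read-count table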