mgnify-pipelines-toolkit 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mgnify_pipelines_toolkit/analysis/{shared → amplicon}/study_summary_generator.py +2 -2
- mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +58 -65
- mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +618 -0
- mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +5 -9
- mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +18 -16
- mgnify_pipelines_toolkit/schemas/schemas.py +355 -2
- {mgnify_pipelines_toolkit-1.1.0.dist-info → mgnify_pipelines_toolkit-1.1.2.dist-info}/METADATA +2 -2
- {mgnify_pipelines_toolkit-1.1.0.dist-info → mgnify_pipelines_toolkit-1.1.2.dist-info}/RECORD +12 -11
- {mgnify_pipelines_toolkit-1.1.0.dist-info → mgnify_pipelines_toolkit-1.1.2.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-1.1.0.dist-info → mgnify_pipelines_toolkit-1.1.2.dist-info}/entry_points.txt +2 -1
- {mgnify_pipelines_toolkit-1.1.0.dist-info → mgnify_pipelines_toolkit-1.1.2.dist-info}/licenses/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.1.0.dist-info → mgnify_pipelines_toolkit-1.1.2.dist-info}/top_level.txt +0 -0
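
The headline change is a new module, mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py, shown in full below. It defines a Click CLI with two subcommands: summarise, which aggregates per-assembly taxonomy and functional annotation files into study-level TSV summaries, and merge, which combines several such study-level summaries. The console-script name registered in entry_points.txt is not visible in this diff, so the sketch below drives the CLI through click.testing.CliRunner instead; the file names, directory, and prefix are hypothetical:

    from click.testing import CliRunner

    from mgnify_pipelines_toolkit.analysis.assembly.study_summary_generator import cli

    runner = CliRunner()
    # "summarise" expects a CSV of completed analyses (columns: assembly, status)
    # and a study directory laid out as, e.g.:
    #   study_dir/<acc>/taxonomy/<acc>.krona.txt.gz
    #   study_dir/<acc>/functional-annotation/go/<acc>_go_summary.tsv.gz
    result = runner.invoke(
        cli,
        [
            "summarise",
            "-a", "completed_analyses.csv",  # hypothetical input CSV
            "-s", "study_dir",               # hypothetical study directory
            "-p", "PRJEB00000",              # hypothetical output prefix
        ],
    )
    print(result.exit_code, result.output)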
mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py

@@ -0,0 +1,618 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import click
+from functools import reduce
+import glob
+import logging
+from pathlib import Path
+from typing import Literal
+
+import pandas as pd
+
+from mgnify_pipelines_toolkit.schemas.schemas import (
+    CompletedAnalysisSchema,
+    TaxonSchema,
+    GOSummarySchema,
+    InterProSummarySchema,
+    KOSummarySchema,
+    SanntisSummarySchema,
+    AntismashSummarySchema,
+    PFAMSummarySchema,
+    KEGGModulesSummarySchema,
+    GOStudySummarySchema,
+    InterProStudySummarySchema,
+    TaxonomyStudySummarySchema,
+    KOStudySummarySchema,
+    SanntisStudySummarySchema,
+    AntismashStudySummarySchema,
+    PFAMStudySummarySchema,
+    KEGGModulesStudySummarySchema,
+    validate_dataframe,
+)
+
+logging.basicConfig(
+    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+# Keys are the original column names in the input files,
+# values are the standardised column names used in the generated study summary files.
+# Note: the "Count"/"count" column is deliberately excluded from these mappings.
+GO_COLUMN_NAMES = {
+    "go": "GO",
+    "term": "description",
+    "category": "category",
+}
+
+INTERPRO_COLUMN_NAMES = {
+    "interpro_accession": "IPR",
+    "description": "description",
+}
+
+SANNTIS_COLUMN_NAMES = {
+    "nearest_mibig": "nearest_mibig",
+    "nearest_mibig_class": "nearest_mibig_class",
+    "description": "description",
+}
+
+ANTISMASH_COLUMN_NAMES = {
+    "label": "label",
+    "description": "description",
+}
+
+KEGG_COLUMN_NAMES = {
+    "ko": "KO",
+    "description": "description",
+}
+
+PFAM_COLUMN_NAMES = {
+    "pfam": "PFAM",
+    "description": "description",
+}
+
+KEGG_MODULES_COLUMN_NAMES = {
+    "module_accession": "module_accession",
+    "pathway_name": "pathway_name",
+    "pathway_class": "pathway_class",
+}
+
+# This mapping lets a single loop process every summary type in the same way
+SUMMARY_TYPES_MAP = {
+    "go": {
+        "folder": "functional-annotation/go",
+        "column_names": GO_COLUMN_NAMES,
+        "schema": GOSummarySchema,
+        "study_schema": GOStudySummarySchema,
+    },
+    "goslim": {
+        "folder": "functional-annotation/go",
+        "column_names": GO_COLUMN_NAMES,
+        "schema": GOSummarySchema,
+        "study_schema": GOStudySummarySchema,
+    },
+    "interpro": {
+        "folder": "functional-annotation/interpro",
+        "column_names": INTERPRO_COLUMN_NAMES,
+        "schema": InterProSummarySchema,
+        "study_schema": InterProStudySummarySchema,
+    },
+    "ko": {
+        "folder": "functional-annotation/kegg",
+        "column_names": KEGG_COLUMN_NAMES,
+        "schema": KOSummarySchema,
+        "study_schema": KOStudySummarySchema,
+    },
+    "sanntis": {
+        "folder": "pathways-and-systems/sanntis",
+        "allow_missing": True,
+        "column_names": SANNTIS_COLUMN_NAMES,
+        "schema": SanntisSummarySchema,
+        "study_schema": SanntisStudySummarySchema,
+    },
+    "antismash": {
+        "folder": "pathways-and-systems/antismash",
+        "column_names": ANTISMASH_COLUMN_NAMES,
+        "schema": AntismashSummarySchema,
+        "study_schema": AntismashStudySummarySchema,
+    },
+    "pfam": {
+        "folder": "functional-annotation/pfam",
+        "column_names": PFAM_COLUMN_NAMES,
+        "schema": PFAMSummarySchema,
+        "study_schema": PFAMStudySummarySchema,
+    },
+    "kegg_modules": {
+        "folder": "pathways-and-systems/kegg-modules",
+        "column_names": KEGG_MODULES_COLUMN_NAMES,
+        "schema": KEGGModulesSummarySchema,
+        "study_schema": KEGGModulesStudySummarySchema,
+    },
+}
+
+# The taxonomy file is a tab-separated file without a header,
+# containing the following columns:
+TAXONOMY_COLUMN_NAMES = [
+    "Count",
+    "Superkingdom",
+    "Kingdom",
+    "Phylum",
+    "Class",
+    "Order",
+    "Family",
+    "Genus",
+    "Species",
+]
+
+OUTPUT_SUFFIX = "summary.tsv"
+
+
+@click.group()
+def cli():
+    pass
+
+
+def check_files_exist(file_list: list[Path]) -> None:
+    """
+    Check that all files in the given list exist on disk.
+
+    :param file_list: List of file paths to check.
+    :raises FileNotFoundError: If any file does not exist.
+    """
+    missing_files = [str(path) for path in file_list if not path.is_file()]
+    if missing_files:
+        raise FileNotFoundError(
+            f"The following required files are missing: {', '.join(missing_files)}"
+        )
+
+
+def generate_taxonomy_summary(
+    file_dict: dict[str, Path],
+    output_file_name: str,
+    outdir: Path = None,
+) -> None:
+    """
+    Generate a combined study-level taxonomic classification summary from multiple input
+    assembly-level summary files.
+
+    :param file_dict: Dictionary mapping assembly accession to its taxonomy file.
+    :param output_file_name: Name of the output summary file.
+    :param outdir: Optional output directory for the results.
+
+    Example of the taxonomy file:
+    23651  sk__Bacteria
+    4985   sk__Archaea  k__Thermoproteati  p__Nitrososphaerota
+    882    sk__Archaea  k__Nanobdellati  p__  c__  o__  f__  g__  s__Candidatus Pacearchaeota archaeon
+    """
+    check_files_exist(list(file_dict.values()))
+
+    tax_dfs = []
+    for assembly_acc, path in file_dict.items():
+        df = pd.read_csv(path, sep="\t", names=TAXONOMY_COLUMN_NAMES).fillna("")
+
+        # Note: schema validation will fail if the taxonomy file is empty
+        df = validate_dataframe(df, TaxonSchema, str(path))
+
+        # Combine all taxonomic ranks in the classification into a single string
+        df["full_taxon"] = (
+            df[TAXONOMY_COLUMN_NAMES[1:]].agg(";".join, axis=1).str.strip(";")
+        )
+
+        # Create a new DataFrame with taxonomy as index and count as the only column
+        result = df[["Count", "full_taxon"]].set_index("full_taxon")
+        result.columns = [assembly_acc]
+        tax_dfs.append(result)
+
+    summary_df = pd.concat(tax_dfs, axis=1)
+    summary_df = summary_df.fillna(0).astype(int).sort_index()
+
+    outfile = output_file_name
+    if outdir:
+        outfile = outdir / output_file_name
+
+    summary_df.to_csv(outfile, sep="\t", index_label="taxonomy")
+
+
+def generate_functional_summary(
+    file_dict: dict[str, Path],
+    column_names: dict[str, str],
+    output_prefix: str,
+    label: Literal[
+        "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
+    ],
+    outdir: Path = None,
+    allow_missing: bool = False,
+) -> None:
+    """
+    Generate a combined study-level functional annotation summary from multiple input
+    assembly-level summary files.
+
+    :param file_dict: Dictionary mapping assembly accession to its summary file path.
+    :param column_names: Dictionary mapping original column names to standard column names.
+    :param output_prefix: Prefix for the output summary file.
+    :param label: Label for the functional annotation type
+        (expected one of ["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"]).
+    :param outdir: Optional output directory for the results.
+    :param allow_missing: Whether the summary files are allowed to be missing
+        (e.g. the pipeline does not emit them when they would be empty).
+
+    In the input files, column order may vary, but the following columns are expected:
+    GO summary input file:
+    go          term         category            count
+    GO:0016020  membrane     cellular_component  30626
+    GO:0005524  ATP binding  molecular_function  30524
+
+    InterPro summary input file:
+    interpro_accession  description                        count
+    IPR036291           NAD(P)-binding domain superfamily  16503
+    IPR019734           Tetratricopeptide repeat           14694
+
+    KEGG summary input file:
+    ko      description                                                                     count
+    K01552  energy-coupling factor transport system ATP-binding protein [EC:7.-.-.-]       562
+    K18889  ATP-binding cassette, subfamily B, multidrug efflux pump                       537
+    K15497  molybdate/tungstate transport system ATP-binding protein [EC:7.3.2.5 7.3.2.6]  517
+
+    Sanntis summary input file:
+    nearest_mibig  nearest_mibig_class  description                                                                  count
+    BGC0000787     Saccharide           Carbohydrate-based natural products (e.g., aminoglycoside antibiotics)      1
+    BGC0000248     Polyketide           Built from iterative condensation of acetate units derived from acetyl-CoA  3
+    BGC0001327     NRP Polyketide       Nonribosomal Peptide Polyketide                                              2
+
+    Antismash summary input file:
+    label        description                                 count
+    terpene      Terpene                                     16
+    betalactone  Beta-lactone containing protease inhibitor  8
+    T1PKS        Type I PKS (Polyketide synthase)            3
+
+    PFAM summary input file:
+    pfam     description              count
+    PF00265  Thymidine kinase         457
+    PF01852  START domain             368
+    PF13756  Stimulus-sensing domain  397
+
+    KEGG modules summary input file:
+    module_accession  completeness  pathway_name                         pathway_class                                          matching_ko                         missing_ko
+    M00986            100.0         Sulfur reduction, sulfur => sulfide  Pathway modules; Energy metabolism; Sulfur metabolism  K18367
+    M00163            83.33         Photosystem I                        Pathway modules; Energy metabolism; Photosynthesis     K02689,K02690,K02691,K02692,K02694  K02693
+    M00615            50.0          Nitrate assimilation                 Signature modules; Module set; Metabolic capacity      K02575                              M00531
+    """
+    try:
+        check_files_exist(list(file_dict.values()))
+    except FileNotFoundError as e:
+        if allow_missing:
+            logging.warning(
+                f"One of the expected files is missing, but this is allowed for {label}."
+            )
+            logging.warning(e)
+            return
+        raise
+
+    output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"
+
+    original_col_names = list(column_names.keys())
+    renamed_col_names = list(column_names.values())
+    value_col_name = "completeness" if label == "kegg_modules" else "count"
+
+    dfs = []
+    for assembly_acc, filepath in file_dict.items():
+        try:
+            df = pd.read_csv(filepath, sep="\t")
+        except pd.errors.EmptyDataError:
+            logging.warning(f"File {filepath.resolve()} is empty. Skipping.")
+            continue
+
+        schema = SUMMARY_TYPES_MAP[label]["schema"]
+        df = validate_dataframe(df, schema, str(filepath))
+
+        # Extract only the relevant columns
+        df = df[original_col_names + [value_col_name]].copy()
+
+        # Rename columns: metadata columns are renamed according to the column_names dict, "count"/"completeness" -> assembly acc
+        df.rename(columns={**column_names, value_col_name: assembly_acc}, inplace=True)
+        dfs.append(df)
+
+    if not dfs:
+        logging.warning(
+            f"No valid files with functional annotation summary were found. Skipping creation of {output_file_name}."
+        )
+        return
+
+    # Merge all dataframes on the renamed metadata columns
+    merged_df = reduce(
+        lambda left, right: pd.merge(left, right, on=renamed_col_names, how="outer"),
+        dfs,
+    )
+
+    # Fill missing values appropriately: completeness percentages become floats, counts become integers
+    value_columns = [col for col in merged_df.columns if col not in renamed_col_names]
+    fill_value = 0.0 if label == "kegg_modules" else 0
+    dtype = float if label == "kegg_modules" else int
+    merged_df[value_columns] = merged_df[value_columns].fillna(fill_value).astype(dtype)
+
+    # Reorder columns: merge keys first, then sorted assembly accessions
+    merged_df = merged_df[renamed_col_names + sorted(value_columns)]
+
+    outfile = output_file_name
+    if outdir:
+        outfile = outdir / output_file_name
+
+    merged_df.to_csv(outfile, sep="\t", index=False)
+
+
+@cli.command(
+    "summarise",
+    options_metavar="-a <assemblies> -s <study_dir> -p <output_prefix>",
+    short_help="Generate study-level summaries for assembly analysis results.",
+)
+@click.option(
+    "-a",
+    "--assemblies",
+    required=True,
+    help="CSV file listing the successful analyses generated by the pipeline",
+    type=click.Path(exists=True, path_type=Path, dir_okay=False),
+)
+@click.option(
+    "-s",
+    "--study_dir",
+    required=True,
+    help="Input directory containing the individual analysis subdirectories to summarise",
+    type=click.Path(exists=True, path_type=Path, file_okay=False),
+)
+@click.option(
+    "-p",
+    "--output_prefix",
+    required=True,
+    help="Prefix for generated summary files",
+    type=str,
+)
+@click.option(
+    "-o",
+    "--outdir",
+    required=False,
+    help="Directory for the output files; defaults to the current working directory.",
+    type=click.Path(exists=True, path_type=Path, file_okay=False),
+)
+def summarise_analyses(
+    assemblies: Path, study_dir: Path, output_prefix: str, outdir: Path
+) -> None:
+    """
+    Generate study-level summaries for successfully processed assemblies.
+
+    :param assemblies: Path to a file listing completed assembly accessions and their status.
+    :param study_dir: Path to the directory containing analysis results for each assembly.
+    :param output_prefix: Prefix for the generated summary files.
+    """
+    logging.info(f"Reading assembly list from {assemblies.resolve()}")
+    assemblies_df = pd.read_csv(assemblies, names=["assembly", "status"])
+    CompletedAnalysisSchema(assemblies_df)
+    assembly_list = assemblies_df["assembly"].tolist()
+    logging.info("Assembly list was read successfully.")
+
+    def get_file_paths(subdir: str, filename_template: str) -> dict[str, Path]:
+        """
+        Construct file paths for each assembly given a subdirectory and filename template.
+        The template must contain {acc} as a placeholder.
+        """
+        return {
+            acc: study_dir / acc / subdir / filename_template.format(acc=acc)
+            for acc in assembly_list
+        }
+
+    logging.info("Starting to process assembly-level summaries.")
+
+    logging.info(
+        "Generating taxonomy summary from assembly-level summaries <accession>.krona.txt"
+    )
+    generate_taxonomy_summary(
+        get_file_paths("taxonomy", "{acc}.krona.txt.gz"),
+        f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}",
+        outdir=outdir,
+    )
+
+    for summary_type, config in SUMMARY_TYPES_MAP.items():
+        logging.info(
+            f"Generating study-level {summary_type.capitalize()} summary from file <accession>_{summary_type}_summary.tsv.gz"
+        )
+        generate_functional_summary(
+            get_file_paths(config["folder"], f"{{acc}}_{summary_type}_summary.tsv.gz"),
+            config["column_names"],
+            output_prefix,
+            summary_type,
+            outdir=outdir,
+            allow_missing=config.get("allow_missing", False),
+        )
+    logging.info("Assembly-level summaries were generated successfully.")
+    logging.info("Done.")
+
+
+@cli.command(
+    "merge",
+    options_metavar="-s <study_dir> -p <output_prefix>",
+    short_help="Merge multiple study-level summaries of assembly analysis.",
+)
+@click.option(
+    "-s",
+    "--study_dir",
+    required=True,
+    help="Input directory containing the individual study-level summaries to merge",
+    type=click.Path(exists=True, file_okay=False),
+)
+@click.option(
+    "-p",
+    "--output_prefix",
+    required=True,
+    help="Prefix for generated merged summary files",
+    type=str,
+)
+def merge_summaries(study_dir: str, output_prefix: str) -> None:
+    """
+    Merge multiple study-level summary files into combined summary files.
+
+    :param study_dir: Path to the directory containing study-level summary files.
+    :param output_prefix: Prefix for the output merged summary files.
+    """
+
+    def get_file_paths(summary_type: str) -> list[str]:
+        return glob.glob(f"{study_dir}/*_{summary_type}_{OUTPUT_SUFFIX}")
+
+    logging.info("Generating combined assembly-level summaries")
+    logging.info("Parsing summary files for taxonomic classification")
+    merge_taxonomy_summaries(
+        get_file_paths("taxonomy"), f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}"
+    )
+
+    for summary_type, config in SUMMARY_TYPES_MAP.items():
+        logging.info(f"Parsing summary files for {summary_type.capitalize()}.")
+        column_names = config["column_names"]
+        merge_functional_summaries(
+            get_file_paths(summary_type),
+            list(column_names.values()),
+            output_prefix,
+            summary_type,
+        )
+    logging.info("Merged assembly-level summaries were generated successfully.")
+    logging.info("Done.")
+
+
+def merge_taxonomy_summaries(summary_files: list[str], output_file_name: str) -> None:
+    """
+    Merge multiple taxonomy study-level summary files into a single study-level summary.
+
+    :param summary_files: List of paths to taxonomy summary files, each containing
+        taxonomic classifications and counts for an individual analysis.
+    :param output_file_name: Output path for the merged taxonomy summary.
+
+    Example of input taxonomy summary file:
+    taxonomy                                                      ERZ1049444  ERZ1049446
+    sk__Eukaryota;k__Metazoa;p__Chordata                          2           10
+    sk__Eukaryota;k__Metazoa;p__Chordata;c__Mammalia;o__Primates  118         94
+    """
+    if not summary_files:
+        raise FileNotFoundError(
+            "The required taxonomic classification summary files are missing. Exiting."
+        )
+
+    summary_dfs = []
+    for file in summary_files:
+        df = pd.read_csv(file, sep="\t", index_col=0)
+        df = validate_dataframe(df, TaxonomyStudySummarySchema, file)
+        summary_dfs.append(df)
+    merged_df = pd.concat(summary_dfs, axis=1)
+    merged_df = merged_df.fillna(0).astype(int)
+
+    # Reorder columns: taxonomy first, then sorted assembly accessions
+    merged_df = merged_df[sorted(merged_df.columns)]
+    merged_df = merged_df.sort_index()
+
+    merged_df.to_csv(
+        output_file_name,
+        sep="\t",
+        index_label="taxonomy",
+    )
+
+
+def merge_functional_summaries(
+    summary_files: list[str],
+    merge_keys: list[str],
+    output_prefix: str,
+    label: Literal[
+        "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
+    ],
+) -> None:
+    """
+    Merge multiple functional study-level summary files into a single study-level summary.
+
+    :param summary_files: List of paths to functional summary files, each containing
+        annotation terms and counts for an individual analysis.
+    :param merge_keys: List of column names to merge on (e.g. term ID, description).
+    :param output_prefix: Prefix for the generated output file.
+    :param label: Label describing the functional annotation type
+        (expected one of ["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"]).
+
+    In the input files, column order may vary, but the following columns are expected:
+    GO summary input:
+    GO          description  category            ERZ1049444  ERZ1049446
+    GO:0016020  membrane     cellular_component  30626       673
+    GO:0005524  ATP binding  molecular_function  30524       2873
+
+    Example of InterPro summary input:
+    IPR        description                        ERZ1049444  ERZ1049446
+    IPR036291  NAD(P)-binding domain superfamily  16503       13450
+    IPR019734  Tetratricopeptide repeat           14694       11021
+
+    KEGG summary input:
+    GO          description              category            ERZ1049440  ERZ1049443
+    GO:0003677  DNA binding              molecular_function  6125        16417
+    GO:0055085  transmembrane transport  biological_process  144         13926
+
+    Sanntis summary input:
+    nearest_mibig  nearest_mibig_class  description                                                         ERZ1049440  ERZ1049443
+    BGC0001356     RiPP                 Ribosomally synthesised and Post-translationally modified Peptide  230         185
+    BGC0001432     NRP Polyketide       Nonribosomal Peptide Polyketide                                     0           8
+
+    Antismash summary input:
+    label        description                       ERZ1049440  ERZ1049443
+    NRPS         Non-ribosomal peptide synthetase  368         0
+    arylpolyene  Aryl polyene                      149         447
+
+    PFAM summary input:
+    PFAM     description                          ERZ1049440  ERZ1049443
+    PF24718  HTH-like domain                      468         1
+    PF06039  Malate:quinone oxidoreductase (Mqo)  490         21
+
+    KEGG modules summary input:
+    module_accession  pathway_name                                                           pathway_class                                           ERZ1049440  ERZ1049443
+    M00109            C21-Steroid hormone biosynthesis, progesterone => cortisol/cortisone  Pathway modules; Lipid metabolism; Sterol biosynthesis  38.9        0.0
+    M00153            Cytochrome bd ubiquinol oxidase                                       Pathway modules; Energy metabolism; ATP synthesis       44.7        84.4
+    """
+    output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"
+
+    if not summary_files:
+        logging.warning(
+            f"Skipping creation of {output_file_name} because no summaries were found for this type of functional annotation."
+        )
+        return
+
+    validation_schema = SUMMARY_TYPES_MAP[label]["study_schema"]
+
+    dfs = []
+    for filepath in summary_files:
+        df = pd.read_csv(filepath, sep="\t")
+        df = validate_dataframe(df, validation_schema, filepath)
+        dfs.append(df)
+
+    if len(dfs) == 1:
+        merged_df = dfs[0]
+    else:
+        merged_df = reduce(
+            lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs
+        )
+
+    # Identify non-key columns (i.e. the per-assembly value columns)
+    value_columns = [col for col in merged_df.columns if col not in merge_keys]
+
+    # Fill NaNs and set the dtype accordingly
+    fill_value = 0.0 if label == "kegg_modules" else 0
+    dtype = float if label == "kegg_modules" else int
+    merged_df[value_columns] = merged_df[value_columns].fillna(fill_value).astype(dtype)
+
+    # Reorder columns
+    merged_df = merged_df[merge_keys + sorted(value_columns)]
+
+    merged_df.to_csv(output_file_name, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    cli()
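
To make the wide-table construction in generate_taxonomy_summary concrete, here is a self-contained sketch with toy counts and made-up accessions; pd.concat aligns the per-assembly columns on the joined lineage string, and absent lineages are zero-filled:

    import pandas as pd

    erz1 = pd.DataFrame(
        {"ERZ0000001": [23651, 4985]},
        index=["sk__Bacteria", "sk__Archaea;k__Thermoproteati;p__Nitrososphaerota"],
    )
    erz2 = pd.DataFrame({"ERZ0000002": [100]}, index=["sk__Bacteria"])

    summary = pd.concat([erz1, erz2], axis=1).fillna(0).astype(int).sort_index()
    print(summary)
    #                                                    ERZ0000001  ERZ0000002
    # sk__Archaea;k__Thermoproteati;p__Nitrososphaerota        4985           0
    # sk__Bacteria                                            23651         100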
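
The functional summaries use the same idea via functools.reduce and an outer merge; a minimal sketch with assumed GO terms and accessions shows how generate_functional_summary and merge_functional_summaries zero-fill annotations that only some assemblies carry:

    from functools import reduce

    import pandas as pd

    dfs = [
        pd.DataFrame({"GO": ["GO:0016020", "GO:0005524"], "ERZ1": [30626, 30524]}),
        pd.DataFrame({"GO": ["GO:0016020", "GO:0003677"], "ERZ2": [673, 6125]}),
    ]
    merged = reduce(lambda left, right: pd.merge(left, right, on=["GO"], how="outer"), dfs)
    value_cols = [c for c in merged.columns if c != "GO"]
    merged[value_cols] = merged[value_cols].fillna(0).astype(int)
    print(merged)
    #            GO   ERZ1  ERZ2
    # 0  GO:0016020  30626   673
    # 1  GO:0005524  30524     0
    # 2  GO:0003677      0  6125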
mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py

@@ -155,7 +155,7 @@ def parse_args():
     description = (
         "antiSMASH output summary generator. "
         "Script takes regions from GFF and counts its appearance in annotation. "
-        "Output columns contain
+        "Output columns contain label, descriptions and count. "
         f"Descriptions were taken from pre-parsed glossary provided on antiSMASH website. "
         f"Current script supports antiSMASH results for version {ANTISMASH_VERSION} and older."
     )
@@ -202,15 +202,15 @@ def main():
     df = pd.DataFrame(dict_list)
     df = df[df["product"].notna()]
     df_grouped = (
-        df.groupby(["product"]).size().reset_index(name="Count")
-    ).sort_values(by="Count", ascending=False)
+        df.groupby(["product"]).size().reset_index(name="count")
+    ).sort_values(by="count", ascending=False)
 
     df_grouped = df_grouped.rename(
         columns={
             "product": "label",
         }
     )
-    df_grouped["Description"] = df_grouped["label"].apply(
+    df_grouped["description"] = df_grouped["label"].apply(
         lambda x: ",".join(
             [
                 DESCRIPTIONS.get(cls.strip().lower(), cls.strip())
@@ -218,11 +218,7 @@ def main():
             ]
         )
     )
-    df_grouped = df_grouped[["label", "Description", "Count"]]
-    df_grouped = df_grouped.rename(columns={
-        "Description": "description",
-        "Count": "count"
-    })
+    df_grouped = df_grouped[["label", "description", "count"]]
     df_grouped.to_csv(output_filename, sep="\t", index=False)
 
 
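
A toy illustration (hypothetical products) of what the grouped frame looks like after this cleanup, with the lowercase label/count columns produced directly instead of renaming Count/Description afterwards:

    import pandas as pd

    df = pd.DataFrame({"product": ["terpene", "terpene", "T1PKS"]})
    df_grouped = (
        df.groupby(["product"]).size().reset_index(name="count")
    ).sort_values(by="count", ascending=False)
    print(df_grouped)
    #    product  count
    # 1  terpene      2
    # 0    T1PKS      1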