mgnify-pipelines-toolkit 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mgnify-pipelines-toolkit might be problematic.
- mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +24 -27
- mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +58 -65
- mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +14 -1
- {mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/METADATA +6 -6
- {mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/RECORD +9 -9
- {mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/entry_points.txt +0 -0
- {mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py

@@ -22,7 +22,6 @@ import pandas as pd
 
 
 def parse_args():
-
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "-i", "--input", required=True, type=str, help="Input JSON from antiSMASH"
@@ -64,7 +63,6 @@ def main():
         region_name = None
 
         for feature in record["features"]:
-
             if feature["type"] == "region":
                 # Annotate region features
                 region_name = (
@@ -129,35 +127,34 @@ def main():
         cds_by_protocluster = record["modules"][
             "antismash.detection.hmm_detection"
         ]["rule_results"]["cds_by_protocluster"]
-
-
-
-
-
-
+
+        if not cds_by_protocluster:
+            continue
+
+        for feature in cds_by_protocluster[0][1]:
+            if locus_tag := feature.get("cds_name"):
+                as_clusters = ",".join(list(feature["definition_domains"].keys()))
+                if locus_tag in attributes_dict:
+                    attributes_dict[locus_tag].update(
+                        {"as_gene_clusters": as_clusters}
                     )
-                if locus_tag in attributes_dict.keys():
-                    attributes_dict[locus_tag].update(
-                        {"as_gene_clusters": as_clusters}
-                    )
 
         if "antismash.detection.genefunctions" in record["modules"].keys():
-
-
-
-
-
-
-
-
-
-
-
+            gene_function_tools = record["modules"][
+                "antismash.detection.genefunctions"
+            ]["tools"]
+            if tool_data := gene_function_tools.get("smcogs"):
+
+                for locus_tag in tool_data["best_hits"]:
+                    smcog_id = tool_data["best_hits"][locus_tag]["reference_id"]
+                    smcog_description = tool_data["best_hits"][locus_tag]["description"]
+
+                    score = tool_data["best_hits"][locus_tag]["bitscore"]
+                    e_value = tool_data["best_hits"][locus_tag]["evalue"]
 
-
-
-
-                    break
+                    smcog_note = f"smCOG:{smcog_id}:{smcog_description.replace(' ', '_')}(Score:{score}%3BE-value:{e_value})"
+                    if locus_tag in attributes_dict.keys():
+                        attributes_dict[locus_tag].update({"as_notes": smcog_note})
 
     attributes = [
         ";".join(f"{k}={v}" for k, v in attrib_data.items() if v)
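The rewritten block above walks antiSMASH's JSON layout (record → "modules" → "antismash.detection.hmm_detection" → "rule_results" → "cds_by_protocluster") and tags known locus tags with their gene-cluster domains. The following minimal sketch only illustrates that lookup pattern; the record dictionary is hand-made from the keys visible in this diff, not a real antiSMASH output.

# Illustrative only: mock record shaped like the keys used in the diff above.
record = {
    "modules": {
        "antismash.detection.hmm_detection": {
            "rule_results": {
                "cds_by_protocluster": [
                    # (protocluster, [CDS feature dicts]) pairs, per the [0][1] access above
                    (None, [{"cds_name": "gene_1", "definition_domains": {"T1PKS": []}}]),
                ]
            }
        }
    }
}

attributes_dict = {"gene_1": {"ID": "gene_1"}}

cds_by_protocluster = record["modules"]["antismash.detection.hmm_detection"][
    "rule_results"
]["cds_by_protocluster"]

if cds_by_protocluster:
    for feature in cds_by_protocluster[0][1]:
        # Walrus operator: only proceed when the CDS has a name
        if locus_tag := feature.get("cds_name"):
            as_clusters = ",".join(list(feature["definition_domains"].keys()))
            if locus_tag in attributes_dict:
                attributes_dict[locus_tag]["as_gene_clusters"] = as_clusters

print(attributes_dict)  # {'gene_1': {'ID': 'gene_1', 'as_gene_clusters': 'T1PKS'}}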
mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py

@@ -83,70 +83,63 @@ def print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_line
         file_out.write("##gff-version 3\n")
         with fileinput.hook_compressed(overview_file, "r", encoding="utf-8") as file_in:
             for line in file_in:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                else:
-                    continue
-                cleaned_substrates = ",".join(
-                    sorted(
-                        {
-                            subsrate.strip()
-                            for subsrate in substrates.get(transcript, "N/A").split(
-                                ","
-                            )
-                        }
+
+                if not line.startswith("MGYG") and not line.startswith("ERZ"):
+                    continue
+
+                line = line.strip()
+                temp_list = line.split("\t")
+                transcript = temp_list[0]
+                ec_number_raw = temp_list[1]
+                num_of_tools = temp_list[5]
+                recc_subfamily = temp_list[6]
+
+                # EC is reported as 2.4.99.-:5 with :5 meaning 5 proteins in the subfamily have EC 2.4.99.-
+
+                ec_number = ""
+                ec_list = ec_number_raw.split("|")
+                for ec in ec_list:
+                    if ec != "-":
+                        ec_number += ec.split(":")[0] + "|"
+
+                ec_number = ec_number.strip("|")
+                cleaned_substrates = ",".join(
+                    sorted(
+                        {
+                            subsrate.strip()
+                            for subsrate in substrates.get(transcript, "N/A").split(",")
+                        }
                     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                )
+                # Assemble information to add to the 9th column
+                if recc_subfamily == "-":
+                    continue
+
+                col9_parts = [
+                    f"protein_family={recc_subfamily}",
+                    f"substrate_dbcan-sub={cleaned_substrates}",
+                ]
+
+                if ec_number:
+                    col9_parts.append(f"eC_number={ec_number}")
+
+                col9_parts.append(f"num_tools={num_of_tools}")
+                col9_text = ";".join(col9_parts)
+
+                for gff_line in genome_gff_lines[transcript]:
+                    fields = gff_line.strip().split("\t")
+                    # Replace the tool
+                    fields[1] = f"dbCAN:{dbcan_version}"
+                    # Replace the feature
+                    fields[2] = "CAZyme"
+                    # Replace the confidence value
+                    fields[5] = "."
+                    # Keep only the ID in the 9th column
+                    attributes = fields[8].split(";")[0]
+                    # Add dbcan information to the 9th column
+                    attributes = f"{attributes};{col9_text};"
+                    fields[8] = attributes
+                    file_out.write("\t".join(fields) + "\n")
 
 
 def load_substrates(hmm_path):
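For reference, the new print_gff logic above reduces each dbCAN overview row to a semicolon-separated attribute string and splices it into the matching GFF line. The sketch below replays that transformation on one mocked row and one mocked GFF line; every value (dbCAN version, transcript ID, EC string, substrates, GFF fields) is invented for illustration.

# Illustrative sketch of the 9th-column assembly introduced above (mock values only).
dbcan_version = "4.1.4"
transcript = "MGYG000000001_00001"
ec_number_raw = "2.4.99.-:5|-"
num_of_tools = "3"
recc_subfamily = "GT2_e123"
substrates = {transcript: "cellulose, chitin"}

# Strip the ":<count>" suffix from each EC entry and drop "-" placeholders
ec_number = "|".join(ec.split(":")[0] for ec in ec_number_raw.split("|") if ec != "-")

cleaned_substrates = ",".join(
    sorted({s.strip() for s in substrates.get(transcript, "N/A").split(",")})
)

col9_parts = [
    f"protein_family={recc_subfamily}",
    f"substrate_dbcan-sub={cleaned_substrates}",
]
if ec_number:
    col9_parts.append(f"eC_number={ec_number}")
col9_parts.append(f"num_tools={num_of_tools}")
col9_text = ";".join(col9_parts)

gff_line = "contig1\tProdigal\tCDS\t1\t300\t0.9\t+\t0\tID=MGYG000000001_00001;foo=bar"
fields = gff_line.split("\t")
fields[1] = f"dbCAN:{dbcan_version}"   # tool
fields[2] = "CAZyme"                   # feature type
fields[5] = "."                        # confidence value
fields[8] = f"{fields[8].split(';')[0]};{col9_text};"  # keep ID, append dbCAN info
print("\t".join(fields))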
@@ -155,8 +148,8 @@ def load_substrates(hmm_path):
         header = next(file_in)
         header_fields = header.strip().split("\t")
         substrate_idx = header_fields.index("Substrate")
-        gene_idx = header_fields.index("
-        evalue_idx = header_fields.index("
+        gene_idx = header_fields.index("Target Name")
+        evalue_idx = header_fields.index("i-Evalue")
         for line in file_in:
             fields = line.strip().split("\t")
             if float(fields[evalue_idx]) < 1e-15:  # evalue is the default from dbcan
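The load_substrates change above switches to looking columns up by header name ("Target Name", "i-Evalue") rather than relying on truncated names. A small sketch of that header-index pattern, run against an in-memory tab-separated table instead of a real dbCAN-sub HMM file (the table contents and the simple gene-to-substrate mapping are invented; the real file has more columns):

import io

hmm_table = (
    "Substrate\tTarget Name\ti-Evalue\n"
    "cellulose\tMGYG000000001_00001\t1e-30\n"
    "chitin\tMGYG000000001_00002\t1e-05\n"
)

substrates = {}
with io.StringIO(hmm_table) as file_in:
    header_fields = next(file_in).strip().split("\t")
    substrate_idx = header_fields.index("Substrate")
    gene_idx = header_fields.index("Target Name")
    evalue_idx = header_fields.index("i-Evalue")
    for line in file_in:
        fields = line.strip().split("\t")
        if float(fields[evalue_idx]) < 1e-15:  # dbCAN's default e-value cutoff
            substrates[fields[gene_idx]] = fields[substrate_idx]

print(substrates)  # {'MGYG000000001_00001': 'cellulose'}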
mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py

@@ -117,6 +117,7 @@ SUMMARY_TYPES_MAP = {
     },
     "sanntis": {
         "folder": "pathways-and-systems/sanntis",
+        "allow_missing": True,
         "column_names": SANNTIS_COLUMN_NAMES,
         "schema": SanntisSummarySchema,
         "study_schema": SanntisStudySummarySchema,
@@ -232,6 +233,7 @@ def generate_functional_summary(
         "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
     ],
     outdir: Path = None,
+    allow_missing: bool = False,
 ) -> None:
     """
     Generate a combined study-level functional annotation summary from multiple input
@@ -243,6 +245,7 @@ def generate_functional_summary(
     :param label: Label for the functional annotation type
         (expected one of ["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"]).
     :param outdir: Optional output directory for the results.
+    :param allow_missing: Whether to allow the summary files to be missing (e.g. because the pipeline doesn't emit them if acceptably empty).
 
     In the input files, column orders may vary, but the following columns are expected:
     GO summary input file:
@@ -285,7 +288,16 @@ def generate_functional_summary(
     M00163 83.33 Photosystem I Pathway modules; Energy metabolism; Photosynthesis K02689,K02690,K02691,K02692,K02694 K02693
     M00615 50.0 Nitrate assimilation Signature modules; Module set; Metabolic capacity K02575 M00531
     """
-
+    try:
+        check_files_exist(list(file_dict.values()))
+    except FileNotFoundError as e:
+        if allow_missing:
+            logging.warning(
+                f"One of the expected files is missing, but this is allowed for {label}."
+            )
+            logging.warning(e)
+            return
+        raise
 
     output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"
 
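The try/except added above is the core of the new allow_missing behaviour: a missing input is downgraded to a warning and an early return when the summary type permits it, and re-raised otherwise. The sketch below shows that guard in isolation; the check_files_exist helper here is a stand-in written for the example (the toolkit's own helper is assumed, its exact signature is not shown in this diff), and summarise is a hypothetical wrapper, not the package's function.

import logging
from pathlib import Path


def check_files_exist(files: list[Path]) -> None:
    # Stand-in for the toolkit helper assumed by the diff
    for f in files:
        if not Path(f).exists():
            raise FileNotFoundError(f)


def summarise(file_dict: dict, label: str, allow_missing: bool = False) -> None:
    try:
        check_files_exist(list(file_dict.values()))
    except FileNotFoundError as e:
        if allow_missing:
            logging.warning(
                f"One of the expected files is missing, but this is allowed for {label}."
            )
            logging.warning(e)
            return
        raise
    # ... continue with summary generation ...


# Missing file + allow_missing=True: warns and returns instead of raising
summarise({"a": Path("definitely_missing.tsv")}, "sanntis", allow_missing=True)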
@@ -419,6 +431,7 @@ def summarise_analyses(
             output_prefix,
             summary_type,
             outdir=outdir,
+            allow_missing=config.get("allow_missing", False),
         )
     logging.info("Assembly-level summaries were generated successfully.")
     logging.info("Done.")
{mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.1.1
+Version: 1.2.0
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0
@@ -8,7 +8,7 @@ Keywords: bioinformatics,pipelines,metagenomics
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: biopython>=1.85
@@ -20,10 +20,10 @@ Requires-Dist: click<9,>=8.1.8
 Requires-Dist: pandera<0.24,>=0.23.1
 Requires-Dist: pyfastx<3,>=2.2.0
 Requires-Dist: intervaltree<4,>=3.1.0
-Provides-Extra:
-Requires-Dist: pytest<9,>=8.3.5; extra == "
-Requires-Dist: pytest-md>=0.2.0; extra == "
-Requires-Dist: pytest-workflow==2.1.0; extra == "
+Provides-Extra: test
+Requires-Dist: pytest<9,>=8.3.5; extra == "test"
+Requires-Dist: pytest-md>=0.2.0; extra == "test"
+Requires-Dist: pytest-workflow==2.1.0; extra == "test"
 Provides-Extra: dev
 Requires-Dist: pre-commit>=4.2.0; extra == "dev"
 Requires-Dist: black>=25.1.0; extra == "dev"
{mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/RECORD
RENAMED
@@ -14,7 +14,7 @@ mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=yLpzkRJ
 mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=K6gniytuItq5WzHLi1BsaUCOdP4Zm0_ZzW2_ns7-BTI,11114
 mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py,sha256=epVClL10QcllL8yu7YGjx0rXNVHL2GxHi-Ek0MOjsjo,13859
 mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py,sha256=NZSNY2bqs_TQyz8riDqiEFPLKcwTgzh1C7DeVHT6V8Q,4366
-mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=
+mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=vZdDIcG09hulgCp0FylwHXVSGSlwl2RsDU4_xvsrUC0,6732
 mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py,sha256=Pq-9RSt3RCxzDMQVW1VHlHF4NtpVwCWFbg2CMkvpZZc,19089
 mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py,sha256=2T4T7aXMGPac-LZUXJF3lOUzZZF50dAKkKTSaO-4idQ,3587
 mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py,sha256=6gbCRlEX1eBqzFYjOt3og-961dZ--QsCJL-7l5nzg1k,33992
@@ -22,9 +22,9 @@ mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py,sha256=_4J31wAjK5B1
 mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py,sha256=_iaTBvMKbQDi_02_QuSPqLJ_rC37ruxiPHv5lLQmI-w,5480
 mgnify_pipelines_toolkit/analysis/assembly/go_utils.py,sha256=eay9e3Xdc8XxnlC_4SHHjN89k-M9i_cFMc2lI_ZFxqY,5596
 mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py,sha256=uex2T6GagtYFBIc39-Xm4SFHL06KAQ5v0_loOmY_eaw,4289
-mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py,sha256=
+mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py,sha256=5m5AwWEKidJx1FI0y93AFka7z0zEE8dBf1ofgP8TV_Y,7108
 mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py,sha256=DYZhChGD49M-zAtGkCmNHXDoVTnd5Qy6amG-oePO8Ek,5981
-mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py,sha256=
+mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py,sha256=eNichqFFmfPsa2J10IUm_PemVs9fBhbKa2vpDqEvJNU,21791
 mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=jUeA7I12YrtIqnm3hUxpdgsWfa2pP1ALGjb9OMKPcgY,10643
 mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=TPaKlYkoy37_XgYNOskWCCoXtPNku_k5ygSeK4fT1VQ,6689
 mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=1wblLbZl521digIUWoqneAu15gErzvN_oC--5T_xUdw,4582
@@ -49,9 +49,9 @@ mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pyDZvCuWbwccQF0D7c5BN1vv36wQd
 mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
 mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
-mgnify_pipelines_toolkit-1.
-mgnify_pipelines_toolkit-1.
-mgnify_pipelines_toolkit-1.
-mgnify_pipelines_toolkit-1.
-mgnify_pipelines_toolkit-1.
-mgnify_pipelines_toolkit-1.
+mgnify_pipelines_toolkit-1.2.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mgnify_pipelines_toolkit-1.2.0.dist-info/METADATA,sha256=uTDvoF0oYy-1ApjeygcGbjipM9ZLt1tLArKA6xDNyl4,5807
+mgnify_pipelines_toolkit-1.2.0.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+mgnify_pipelines_toolkit-1.2.0.dist-info/entry_points.txt,sha256=JSjuxAr71MTeSUPPpno22wmZYgVO-gbsXfDkgWKkF7A,3533
+mgnify_pipelines_toolkit-1.2.0.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+mgnify_pipelines_toolkit-1.2.0.dist-info/RECORD,,
{mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/entry_points.txt
RENAMED
File without changes

{mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/licenses/LICENSE
RENAMED
File without changes

{mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/top_level.txt
RENAMED
File without changes