mgnify-pipelines-toolkit 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.
- mgnify_pipelines_toolkit/analysis/{shared → amplicon}/study_summary_generator.py +2 -2
- mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +58 -65
- mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +618 -0
- mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +5 -9
- mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +18 -16
- mgnify_pipelines_toolkit/schemas/schemas.py +355 -2
- {mgnify_pipelines_toolkit-1.1.0.dist-info → mgnify_pipelines_toolkit-1.1.2.dist-info}/METADATA +2 -2
- {mgnify_pipelines_toolkit-1.1.0.dist-info → mgnify_pipelines_toolkit-1.1.2.dist-info}/RECORD +12 -11
- {mgnify_pipelines_toolkit-1.1.0.dist-info → mgnify_pipelines_toolkit-1.1.2.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-1.1.0.dist-info → mgnify_pipelines_toolkit-1.1.2.dist-info}/entry_points.txt +2 -1
- {mgnify_pipelines_toolkit-1.1.0.dist-info → mgnify_pipelines_toolkit-1.1.2.dist-info}/licenses/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.1.0.dist-info → mgnify_pipelines_toolkit-1.1.2.dist-info}/top_level.txt +0 -0
|
@@ -257,7 +257,7 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List
|
|
|
257
257
|
@cli.command(
|
|
258
258
|
"summarise",
|
|
259
259
|
options_metavar="-r <runs> -a <analyses_dir> -p <output_prefix>",
|
|
260
|
-
short_help="Generate study-level analysis
|
|
260
|
+
short_help="Generate study-level summaries of amplicon analysis results.",
|
|
261
261
|
)
|
|
262
262
|
@click.option(
|
|
263
263
|
"-r",
|
|
@@ -327,7 +327,7 @@ def summarise_analyses(
|
|
|
327
327
|
@cli.command(
|
|
328
328
|
"merge",
|
|
329
329
|
options_metavar="-a <analyses_dir> -p <output_prefix>",
|
|
330
|
-
short_help="Merge multiple study-level analysis
|
|
330
|
+
short_help="Merge multiple study-level summaries of amplicon analysis.",
|
|
331
331
|
)
|
|
332
332
|
@click.option(
|
|
333
333
|
"-a",
|
|
@@ -83,70 +83,63 @@ def print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_line
|
|
|
83
83
|
file_out.write("##gff-version 3\n")
|
|
84
84
|
with fileinput.hook_compressed(overview_file, "r", encoding="utf-8") as file_in:
|
|
85
85
|
for line in file_in:
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
else:
|
|
113
|
-
continue
|
|
114
|
-
cleaned_substrates = ",".join(
|
|
115
|
-
sorted(
|
|
116
|
-
{
|
|
117
|
-
subsrate.strip()
|
|
118
|
-
for subsrate in substrates.get(transcript, "N/A").split(
|
|
119
|
-
","
|
|
120
|
-
)
|
|
121
|
-
}
|
|
122
|
-
)
|
|
86
|
+
|
|
87
|
+
if not line.startswith("MGYG") and not line.startswith("ERZ"):
|
|
88
|
+
continue
|
|
89
|
+
|
|
90
|
+
line = line.strip()
|
|
91
|
+
temp_list = line.split("\t")
|
|
92
|
+
transcript = temp_list[0]
|
|
93
|
+
ec_number_raw = temp_list[1]
|
|
94
|
+
num_of_tools = temp_list[5]
|
|
95
|
+
recc_subfamily = temp_list[6]
|
|
96
|
+
|
|
97
|
+
# EC is reported as 2.4.99.-:5 with :5 meaning 5 proteins in the subfamily have EC 2.4.99.-
|
|
98
|
+
|
|
99
|
+
ec_number = ""
|
|
100
|
+
ec_list = ec_number_raw.split("|")
|
|
101
|
+
for ec in ec_list:
|
|
102
|
+
if ec != "-":
|
|
103
|
+
ec_number += ec.split(":")[0] + "|"
|
|
104
|
+
|
|
105
|
+
ec_number = ec_number.strip("|")
|
|
106
|
+
cleaned_substrates = ",".join(
|
|
107
|
+
sorted(
|
|
108
|
+
{
|
|
109
|
+
subsrate.strip()
|
|
110
|
+
for subsrate in substrates.get(transcript, "N/A").split(",")
|
|
111
|
+
}
|
|
123
112
|
)
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
113
|
+
)
|
|
114
|
+
# Assemble information to add to the 9th column
|
|
115
|
+
if recc_subfamily == "-":
|
|
116
|
+
continue
|
|
117
|
+
|
|
118
|
+
col9_parts = [
|
|
119
|
+
f"protein_family={recc_subfamily}",
|
|
120
|
+
f"substrate_dbcan-sub={cleaned_substrates}",
|
|
121
|
+
]
|
|
122
|
+
|
|
123
|
+
if ec_number:
|
|
124
|
+
col9_parts.append(f"eC_number={ec_number}")
|
|
125
|
+
|
|
126
|
+
col9_parts.append(f"num_tools={num_of_tools}")
|
|
127
|
+
col9_text = ";".join(col9_parts)
|
|
128
|
+
|
|
129
|
+
for gff_line in genome_gff_lines[transcript]:
|
|
130
|
+
fields = gff_line.strip().split("\t")
|
|
131
|
+
# Replace the tool
|
|
132
|
+
fields[1] = f"dbCAN:{dbcan_version}"
|
|
133
|
+
# Replace the feature
|
|
134
|
+
fields[2] = "CAZyme"
|
|
135
|
+
# Replace the confidence value
|
|
136
|
+
fields[5] = "."
|
|
137
|
+
# Keep only the ID in the 9th column
|
|
138
|
+
attributes = fields[8].split(";")[0]
|
|
139
|
+
# Add dbcan information to the 9th column
|
|
140
|
+
attributes = f"{attributes};{col9_text};"
|
|
141
|
+
fields[8] = attributes
|
|
142
|
+
file_out.write("\t".join(fields) + "\n")
|
|
150
143
|
|
|
151
144
|
|
|
152
145
|
def load_substrates(hmm_path):
|
|
@@ -155,8 +148,8 @@ def load_substrates(hmm_path):
|
|
|
155
148
|
header = next(file_in)
|
|
156
149
|
header_fields = header.strip().split("\t")
|
|
157
150
|
substrate_idx = header_fields.index("Substrate")
|
|
158
|
-
gene_idx = header_fields.index("
|
|
159
|
-
evalue_idx = header_fields.index("
|
|
151
|
+
gene_idx = header_fields.index("Target Name")
|
|
152
|
+
evalue_idx = header_fields.index("i-Evalue")
|
|
160
153
|
for line in file_in:
|
|
161
154
|
fields = line.strip().split("\t")
|
|
162
155
|
if float(fields[evalue_idx]) < 1e-15: # evalue is the default from dbcan
|