mgnify-pipelines-toolkit 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

@@ -257,7 +257,7 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List
257
257
  @cli.command(
258
258
  "summarise",
259
259
  options_metavar="-r <runs> -a <analyses_dir> -p <output_prefix>",
260
- short_help="Generate study-level analysis summaries.",
260
+ short_help="Generate study-level summaries of amplicon analysis results.",
261
261
  )
262
262
  @click.option(
263
263
  "-r",
@@ -327,7 +327,7 @@ def summarise_analyses(
327
327
  @cli.command(
328
328
  "merge",
329
329
  options_metavar="-a <analyses_dir> -p <output_prefix>",
330
- short_help="Merge multiple study-level analysis summaries.",
330
+ short_help="Merge multiple study-level summaries of amplicon analysis.",
331
331
  )
332
332
  @click.option(
333
333
  "-a",
@@ -83,70 +83,63 @@ def print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_line
83
83
  file_out.write("##gff-version 3\n")
84
84
  with fileinput.hook_compressed(overview_file, "r", encoding="utf-8") as file_in:
85
85
  for line in file_in:
86
- if line.startswith("MGYG") or line.startswith("ERZ"):
87
- (
88
- transcript,
89
- ec_number_raw,
90
- dbcan_hmmer,
91
- dbcan_sub_ecami,
92
- diamond,
93
- num_of_tools,
94
- ) = line.strip().split("\t")
95
- # EC is reported as 2.4.99.-:5 with :5 meaning 5 proteins in the subfamily have EC 2.4.99.-
96
-
97
- ec_number = ""
98
- ec_list = ec_number_raw.split("|")
99
- for ec in ec_list:
100
- if ec != "-":
101
- ec_number += ec.split(":")[0] + "|"
102
-
103
- ec_number = ec_number.strip("|")
104
-
105
- # Dbcan recommends to use subfamily preference as dbcan_hmmer > dbcan_sub_ecami > diamond
106
- # diamond is messier, so we don't report it here
107
- if dbcan_hmmer != "-":
108
- # the field dbcan_hmmer reports match positions in parentheses, clear them out first:
109
- subfamily = dbcan_hmmer.split("(")[0]
110
- elif dbcan_sub_ecami != "-":
111
- subfamily = dbcan_sub_ecami
112
- else:
113
- continue
114
- cleaned_substrates = ",".join(
115
- sorted(
116
- {
117
- subsrate.strip()
118
- for subsrate in substrates.get(transcript, "N/A").split(
119
- ","
120
- )
121
- }
122
- )
86
+
87
+ if not line.startswith("MGYG") and not line.startswith("ERZ"):
88
+ continue
89
+
90
+ line = line.strip()
91
+ temp_list = line.split("\t")
92
+ transcript = temp_list[0]
93
+ ec_number_raw = temp_list[1]
94
+ num_of_tools = temp_list[5]
95
+ recc_subfamily = temp_list[6]
96
+
97
+ # EC is reported as 2.4.99.-:5 with :5 meaning 5 proteins in the subfamily have EC 2.4.99.-
98
+
99
+ ec_number = ""
100
+ ec_list = ec_number_raw.split("|")
101
+ for ec in ec_list:
102
+ if ec != "-":
103
+ ec_number += ec.split(":")[0] + "|"
104
+
105
+ ec_number = ec_number.strip("|")
106
+ cleaned_substrates = ",".join(
107
+ sorted(
108
+ {
109
+ subsrate.strip()
110
+ for subsrate in substrates.get(transcript, "N/A").split(",")
111
+ }
123
112
  )
124
- # Assemble information to add to the 9th column
125
- col9_parts = [
126
- f"protein_family={subfamily}",
127
- f"substrate_dbcan-sub={cleaned_substrates}",
128
- ]
129
-
130
- if ec_number:
131
- col9_parts.append(f"eC_number={ec_number}")
132
-
133
- col9_parts.append(f"num_tools={num_of_tools}")
134
- col9_text = ";".join(col9_parts)
135
-
136
- for gff_line in genome_gff_lines[transcript]:
137
- fields = gff_line.strip().split("\t")
138
- # Replace the tool
139
- fields[1] = f"dbCAN:{dbcan_version}"
140
- # Replace the feature
141
- fields[2] = "CAZyme"
142
- # Replace the confidence value
143
- fields[5] = "."
144
- # Keep only the ID in the 9th column
145
- attributes = fields[8].split(";")[0]
146
- # Add dbcan information to the 9th column
147
- attributes = f"{attributes};{col9_text};"
148
- fields[8] = attributes
149
- file_out.write("\t".join(fields) + "\n")
113
+ )
114
+ # Assemble information to add to the 9th column
115
+ if recc_subfamily == "-":
116
+ continue
117
+
118
+ col9_parts = [
119
+ f"protein_family={recc_subfamily}",
120
+ f"substrate_dbcan-sub={cleaned_substrates}",
121
+ ]
122
+
123
+ if ec_number:
124
+ col9_parts.append(f"eC_number={ec_number}")
125
+
126
+ col9_parts.append(f"num_tools={num_of_tools}")
127
+ col9_text = ";".join(col9_parts)
128
+
129
+ for gff_line in genome_gff_lines[transcript]:
130
+ fields = gff_line.strip().split("\t")
131
+ # Replace the tool
132
+ fields[1] = f"dbCAN:{dbcan_version}"
133
+ # Replace the feature
134
+ fields[2] = "CAZyme"
135
+ # Replace the confidence value
136
+ fields[5] = "."
137
+ # Keep only the ID in the 9th column
138
+ attributes = fields[8].split(";")[0]
139
+ # Add dbcan information to the 9th column
140
+ attributes = f"{attributes};{col9_text};"
141
+ fields[8] = attributes
142
+ file_out.write("\t".join(fields) + "\n")
150
143
 
151
144
 
152
145
  def load_substrates(hmm_path):
@@ -155,8 +148,8 @@ def load_substrates(hmm_path):
155
148
  header = next(file_in)
156
149
  header_fields = header.strip().split("\t")
157
150
  substrate_idx = header_fields.index("Substrate")
158
- gene_idx = header_fields.index("Gene ID")
159
- evalue_idx = header_fields.index("E Value")
151
+ gene_idx = header_fields.index("Target Name")
152
+ evalue_idx = header_fields.index("i-Evalue")
160
153
  for line in file_in:
161
154
  fields = line.strip().split("\t")
162
155
  if float(fields[evalue_idx]) < 1e-15: # evalue is the default from dbcan