PyPI - mgnify-pipelines-toolkit - Versions diffs - 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl - Mend - Supply Chain Defender

mgnify-pipelines-toolkit 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (12) hide show

mgnify_pipelines_toolkit/analysis/{shared → amplicon}/study_summary_generator.py RENAMED Viewed

@@ -257,7 +257,7 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List
 @cli.command(
     "summarise",
     options_metavar="-r <runs> -a <analyses_dir> -p <output_prefix>",
-    short_help="Generate study-level analysis summaries.",
+    short_help="Generate study-level summaries of amplicon analysis results.",
 )
 @click.option(
     "-r",
@@ -327,7 +327,7 @@ def summarise_analyses(
 @cli.command(
     "merge",
     options_metavar="-a <analyses_dir> -p <output_prefix>",
-    short_help="Merge multiple study-level analysis summaries.",
+    short_help="Merge multiple study-level summaries of amplicon analysis.",
 )
 @click.option(
     "-a",

mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py CHANGED Viewed

@@ -83,70 +83,63 @@ def print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_line
         file_out.write("##gff-version 3\n")
         with fileinput.hook_compressed(overview_file, "r", encoding="utf-8") as file_in:
             for line in file_in:
-                if line.startswith("MGYG") or line.startswith("ERZ"):
-                    (
-                        transcript,
-                        ec_number_raw,
-                        dbcan_hmmer,
-                        dbcan_sub_ecami,
-                        diamond,
-                        num_of_tools,
-                    ) = line.strip().split("\t")
-                    # EC is reported as 2.4.99.-:5 with :5 meaning 5 proteins in the subfamily have EC 2.4.99.-
-                    ec_number = ""
-                    ec_list = ec_number_raw.split("|")
-                    for ec in ec_list:
-                        if ec != "-":
-                            ec_number += ec.split(":")[0] + "|"
-                    ec_number = ec_number.strip("|")
-                    # Dbcan recommends to use subfamily preference as dbcan_hmmer > dbcan_sub_ecami > diamond
-                    # diamond is messier, so we don't report it here
-                    if dbcan_hmmer != "-":
-                        # the field dbcan_hmmer reports match positions in parentheses, clear them out first:
-                        subfamily = dbcan_hmmer.split("(")[0]
-                    elif dbcan_sub_ecami != "-":
-                        subfamily = dbcan_sub_ecami
-                    else:
-                        continue
-                    cleaned_substrates = ",".join(
-                        sorted(
-                            {
-                                subsrate.strip()
-                                for subsrate in substrates.get(transcript, "N/A").split(
-                                    ","
-                                )
-                            }
-                        )
+                if not line.startswith("MGYG") and not line.startswith("ERZ"):
+                    continue
+                line = line.strip()
+                temp_list = line.split("\t")
+                transcript = temp_list[0]
+                ec_number_raw = temp_list[1]
+                num_of_tools = temp_list[5]
+                recc_subfamily = temp_list[6]
+                # EC is reported as 2.4.99.-:5 with :5 meaning 5 proteins in the subfamily have EC 2.4.99.-
+                ec_number = ""
+                ec_list = ec_number_raw.split("|")
+                for ec in ec_list:
+                    if ec != "-":
+                        ec_number += ec.split(":")[0] + "|"
+                ec_number = ec_number.strip("|")
+                cleaned_substrates = ",".join(
+                    sorted(
+                        {
+                            subsrate.strip()
+                            for subsrate in substrates.get(transcript, "N/A").split(",")
+                        }
                     )
-                    # Assemble information to add to the 9th column
-                    col9_parts = [
-                        f"protein_family={subfamily}",
-                        f"substrate_dbcan-sub={cleaned_substrates}",
-                    ]
-                    if ec_number:
-                        col9_parts.append(f"eC_number={ec_number}")
-                    col9_parts.append(f"num_tools={num_of_tools}")
-                    col9_text = ";".join(col9_parts)
-                    for gff_line in genome_gff_lines[transcript]:
-                        fields = gff_line.strip().split("\t")
-                        # Replace the tool
-                        fields[1] = f"dbCAN:{dbcan_version}"
-                        # Replace the feature
-                        fields[2] = "CAZyme"
-                        # Replace the confidence value
-                        fields[5] = "."
-                        # Keep only the ID in the 9th column
-                        attributes = fields[8].split(";")[0]
-                        # Add dbcan information to the 9th column
-                        attributes = f"{attributes};{col9_text};"
-                        fields[8] = attributes
-                        file_out.write("\t".join(fields) + "\n")
+                )
+                # Assemble information to add to the 9th column
+                if recc_subfamily == "-":
+                    continue
+                col9_parts = [
+                    f"protein_family={recc_subfamily}",
+                    f"substrate_dbcan-sub={cleaned_substrates}",
+                ]
+                if ec_number:
+                    col9_parts.append(f"eC_number={ec_number}")
+                col9_parts.append(f"num_tools={num_of_tools}")
+                col9_text = ";".join(col9_parts)
+                for gff_line in genome_gff_lines[transcript]:
+                    fields = gff_line.strip().split("\t")
+                    # Replace the tool
+                    fields[1] = f"dbCAN:{dbcan_version}"
+                    # Replace the feature
+                    fields[2] = "CAZyme"
+                    # Replace the confidence value
+                    fields[5] = "."
+                    # Keep only the ID in the 9th column
+                    attributes = fields[8].split(";")[0]
+                    # Add dbcan information to the 9th column
+                    attributes = f"{attributes};{col9_text};"
+                    fields[8] = attributes
+                    file_out.write("\t".join(fields) + "\n")
 def load_substrates(hmm_path):
@@ -155,8 +148,8 @@ def load_substrates(hmm_path):
         header = next(file_in)
         header_fields = header.strip().split("\t")
         substrate_idx = header_fields.index("Substrate")
-        gene_idx = header_fields.index("Gene ID")
-        evalue_idx = header_fields.index("E Value")
+        gene_idx = header_fields.index("Target Name")
+        evalue_idx = header_fields.index("i-Evalue")
         for line in file_in:
             fields = line.strip().split("\t")
             if float(fields[evalue_idx]) < 1e-15:  # evalue is the default from dbcan