mgnify-pipelines-toolkit 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mgnify-pipelines-toolkit might be problematic.
- mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +24 -27
- mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +58 -65
- mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +14 -1
- {mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/METADATA +6 -6
- {mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/RECORD +9 -9
- {mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/entry_points.txt +0 -0
- {mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py

@@ -22,7 +22,6 @@ import pandas as pd
 
 
 def parse_args():
-
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "-i", "--input", required=True, type=str, help="Input JSON from antiSMASH"
@@ -64,7 +63,6 @@ def main():
         region_name = None
 
         for feature in record["features"]:
-
             if feature["type"] == "region":
                 # Annotate region features
                 region_name = (
@@ -129,35 +127,34 @@ def main():
         cds_by_protocluster = record["modules"][
             "antismash.detection.hmm_detection"
         ]["rule_results"]["cds_by_protocluster"]
-
-
-
-
-
-
+
+        if not cds_by_protocluster:
+            continue
+
+        for feature in cds_by_protocluster[0][1]:
+            if locus_tag := feature.get("cds_name"):
+                as_clusters = ",".join(list(feature["definition_domains"].keys()))
+                if locus_tag in attributes_dict:
+                    attributes_dict[locus_tag].update(
+                        {"as_gene_clusters": as_clusters}
                     )
-                if locus_tag in attributes_dict.keys():
-                    attributes_dict[locus_tag].update(
-                        {"as_gene_clusters": as_clusters}
-                    )
 
         if "antismash.detection.genefunctions" in record["modules"].keys():
-
-
-
-
-
-
-
-
-
-
-
+            gene_function_tools = record["modules"][
+                "antismash.detection.genefunctions"
+            ]["tools"]
+            if tool_data := gene_function_tools.get("smcogs"):
+
+                for locus_tag in tool_data["best_hits"]:
+                    smcog_id = tool_data["best_hits"][locus_tag]["reference_id"]
+                    smcog_description = tool_data["best_hits"][locus_tag]["description"]
+
+                    score = tool_data["best_hits"][locus_tag]["bitscore"]
+                    e_value = tool_data["best_hits"][locus_tag]["evalue"]
 
-
-
-
-                    break
+                    smcog_note = f"smCOG:{smcog_id}:{smcog_description.replace(' ', '_')}(Score:{score}%3BE-value:{e_value})"
+                    if locus_tag in attributes_dict.keys():
+                        attributes_dict[locus_tag].update({"as_notes": smcog_note})
 
     attributes = [
         ";".join(f"{k}={v}" for k, v in attrib_data.items() if v)
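The rewritten block above walks antiSMASH's JSON layout (record → "modules" → "antismash.detection.hmm_detection" → "rule_results" → "cds_by_protocluster") and tags known locus tags with their gene-cluster domains. The following minimal sketch only illustrates that lookup pattern; the record dictionary is hand-made from the keys visible in this diff, not a real antiSMASH output.

# Illustrative only: mock record shaped like the keys used in the diff above.
record = {
    "modules": {
        "antismash.detection.hmm_detection": {
            "rule_results": {
                "cds_by_protocluster": [
                    # (protocluster, [CDS feature dicts]) pairs, per the [0][1] access above
                    (None, [{"cds_name": "gene_1", "definition_domains": {"T1PKS": []}}]),
                ]
            }
        }
    }
}

attributes_dict = {"gene_1": {"ID": "gene_1"}}

cds_by_protocluster = record["modules"]["antismash.detection.hmm_detection"][
    "rule_results"
]["cds_by_protocluster"]

if cds_by_protocluster:
    for feature in cds_by_protocluster[0][1]:
        # Walrus operator: only proceed when the CDS has a name
        if locus_tag := feature.get("cds_name"):
            as_clusters = ",".join(list(feature["definition_domains"].keys()))
            if locus_tag in attributes_dict:
                attributes_dict[locus_tag]["as_gene_clusters"] = as_clusters

print(attributes_dict)  # {'gene_1': {'ID': 'gene_1', 'as_gene_clusters': 'T1PKS'}}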
mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py

@@ -83,70 +83,63 @@ def print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_line
         file_out.write("##gff-version 3\n")
         with fileinput.hook_compressed(overview_file, "r", encoding="utf-8") as file_in:
             for line in file_in:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                else:
-                    continue
-                cleaned_substrates = ",".join(
-                    sorted(
-                        {
-                            subsrate.strip()
-                            for subsrate in substrates.get(transcript, "N/A").split(
-                                ","
-                            )
-                        }
+
+                if not line.startswith("MGYG") and not line.startswith("ERZ"):
+                    continue
+
+                line = line.strip()
+                temp_list = line.split("\t")
+                transcript = temp_list[0]
+                ec_number_raw = temp_list[1]
+                num_of_tools = temp_list[5]
+                recc_subfamily = temp_list[6]
+
+                # EC is reported as 2.4.99.-:5 with :5 meaning 5 proteins in the subfamily have EC 2.4.99.-
+
+                ec_number = ""
+                ec_list = ec_number_raw.split("|")
+                for ec in ec_list:
+                    if ec != "-":
+                        ec_number += ec.split(":")[0] + "|"
+
+                ec_number = ec_number.strip("|")
+                cleaned_substrates = ",".join(
+                    sorted(
+                        {
+                            subsrate.strip()
+                            for subsrate in substrates.get(transcript, "N/A").split(",")
+                        }
                     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                )
+                # Assemble information to add to the 9th column
+                if recc_subfamily == "-":
+                    continue
+
+                col9_parts = [
+                    f"protein_family={recc_subfamily}",
+                    f"substrate_dbcan-sub={cleaned_substrates}",
+                ]
+
+                if ec_number:
+                    col9_parts.append(f"eC_number={ec_number}")
+
+                col9_parts.append(f"num_tools={num_of_tools}")
+                col9_text = ";".join(col9_parts)
+
+                for gff_line in genome_gff_lines[transcript]:
+                    fields = gff_line.strip().split("\t")
+                    # Replace the tool
+                    fields[1] = f"dbCAN:{dbcan_version}"
+                    # Replace the feature
+                    fields[2] = "CAZyme"
+                    # Replace the confidence value
+                    fields[5] = "."
+                    # Keep only the ID in the 9th column
+                    attributes = fields[8].split(";")[0]
+                    # Add dbcan information to the 9th column
+                    attributes = f"{attributes};{col9_text};"
+                    fields[8] = attributes
+                    file_out.write("\t".join(fields) + "\n")
 
 
 def load_substrates(hmm_path):
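For reference, the new print_gff logic above reduces each dbCAN overview row to a semicolon-separated attribute string and splices it into the matching GFF line. The sketch below replays that transformation on one mocked row and one mocked GFF line; every value (dbCAN version, transcript ID, EC string, substrates, GFF fields) is invented for illustration.

# Illustrative sketch of the 9th-column assembly introduced above (mock values only).
dbcan_version = "4.1.4"
transcript = "MGYG000000001_00001"
ec_number_raw = "2.4.99.-:5|-"
num_of_tools = "3"
recc_subfamily = "GT2_e123"
substrates = {transcript: "cellulose, chitin"}

# Strip the ":<count>" suffix from each EC entry and drop "-" placeholders
ec_number = "|".join(ec.split(":")[0] for ec in ec_number_raw.split("|") if ec != "-")

cleaned_substrates = ",".join(
    sorted({s.strip() for s in substrates.get(transcript, "N/A").split(",")})
)

col9_parts = [
    f"protein_family={recc_subfamily}",
    f"substrate_dbcan-sub={cleaned_substrates}",
]
if ec_number:
    col9_parts.append(f"eC_number={ec_number}")
col9_parts.append(f"num_tools={num_of_tools}")
col9_text = ";".join(col9_parts)

gff_line = "contig1\tProdigal\tCDS\t1\t300\t0.9\t+\t0\tID=MGYG000000001_00001;foo=bar"
fields = gff_line.split("\t")
fields[1] = f"dbCAN:{dbcan_version}"   # tool
fields[2] = "CAZyme"                   # feature type
fields[5] = "."                        # confidence value
fields[8] = f"{fields[8].split(';')[0]};{col9_text};"  # keep ID, append dbCAN info
print("\t".join(fields))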
@@ -155,8 +148,8 @@ def load_substrates(hmm_path):
         header = next(file_in)
         header_fields = header.strip().split("\t")
         substrate_idx = header_fields.index("Substrate")
-        gene_idx = header_fields.index("
-        evalue_idx = header_fields.index("
+        gene_idx = header_fields.index("Target Name")
+        evalue_idx = header_fields.index("i-Evalue")
         for line in file_in:
             fields = line.strip().split("\t")
             if float(fields[evalue_idx]) < 1e-15:  # evalue is the default from dbcan
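The load_substrates change above switches to looking columns up by header name ("Target Name", "i-Evalue") rather than relying on truncated names. A small sketch of that header-index pattern, run against an in-memory tab-separated table instead of a real dbCAN-sub HMM file (the table contents and the simple gene-to-substrate mapping are invented; the real file has more columns):

import io

hmm_table = (
    "Substrate\tTarget Name\ti-Evalue\n"
    "cellulose\tMGYG000000001_00001\t1e-30\n"
    "chitin\tMGYG000000001_00002\t1e-05\n"
)

substrates = {}
with io.StringIO(hmm_table) as file_in:
    header_fields = next(file_in).strip().split("\t")
    substrate_idx = header_fields.index("Substrate")
    gene_idx = header_fields.index("Target Name")
    evalue_idx = header_fields.index("i-Evalue")
    for line in file_in:
        fields = line.strip().split("\t")
        if float(fields[evalue_idx]) < 1e-15:  # dbCAN's default e-value cutoff
            substrates[fields[gene_idx]] = fields[substrate_idx]

print(substrates)  # {'MGYG000000001_00001': 'cellulose'}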
mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py

@@ -117,6 +117,7 @@ SUMMARY_TYPES_MAP = {
     },
     "sanntis": {
         "folder": "pathways-and-systems/sanntis",
+        "allow_missing": True,
         "column_names": SANNTIS_COLUMN_NAMES,
         "schema": SanntisSummarySchema,
         "study_schema": SanntisStudySummarySchema,
@@ -232,6 +233,7 @@ def generate_functional_summary(
         "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
     ],
     outdir: Path = None,
+    allow_missing: bool = False,
 ) -> None:
     """
     Generate a combined study-level functional annotation summary from multiple input
@@ -243,6 +245,7 @@ def generate_functional_summary(
     :param label: Label for the functional annotation type
         (expected one of ["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"]).
     :param outdir: Optional output directory for the results.
+    :param allow_missing: Whether to allow the summary files to be missing (e.g. because the pipeline doesn't emit them if acceptably empty).
 
     In the input files, column orders may vary, but the following columns are expected:
     GO summary input file:
@@ -285,7 +288,16 @@ def generate_functional_summary(
     M00163 83.33 Photosystem I Pathway modules; Energy metabolism; Photosynthesis K02689,K02690,K02691,K02692,K02694 K02693
     M00615 50.0 Nitrate assimilation Signature modules; Module set; Metabolic capacity K02575 M00531
     """
-
+    try:
+        check_files_exist(list(file_dict.values()))
+    except FileNotFoundError as e:
+        if allow_missing:
+            logging.warning(
+                f"One of the expected files is missing, but this is allowed for {label}."
+            )
+            logging.warning(e)
+            return
+        raise
 
     output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"
 
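The try/except added above is the core of the new allow_missing behaviour: a missing input is downgraded to a warning and an early return when the summary type permits it, and re-raised otherwise. The sketch below shows that guard in isolation; the check_files_exist helper here is a stand-in written for the example (the toolkit's own helper is assumed, its exact signature is not shown in this diff), and summarise is a hypothetical wrapper, not the package's function.

import logging
from pathlib import Path


def check_files_exist(files: list[Path]) -> None:
    # Stand-in for the toolkit helper assumed by the diff
    for f in files:
        if not Path(f).exists():
            raise FileNotFoundError(f)


def summarise(file_dict: dict, label: str, allow_missing: bool = False) -> None:
    try:
        check_files_exist(list(file_dict.values()))
    except FileNotFoundError as e:
        if allow_missing:
            logging.warning(
                f"One of the expected files is missing, but this is allowed for {label}."
            )
            logging.warning(e)
            return
        raise
    # ... continue with summary generation ...


# Missing file + allow_missing=True: warns and returns instead of raising
summarise({"a": Path("definitely_missing.tsv")}, "sanntis", allow_missing=True)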
@@ -419,6 +431,7 @@ def summarise_analyses(
             output_prefix,
             summary_type,
             outdir=outdir,
+            allow_missing=config.get("allow_missing", False),
         )
     logging.info("Assembly-level summaries were generated successfully.")
     logging.info("Done.")
{mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.1.1
+Version: 1.2.0
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0
@@ -8,7 +8,7 @@ Keywords: bioinformatics,pipelines,metagenomics
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: biopython>=1.85
@@ -20,10 +20,10 @@ Requires-Dist: click<9,>=8.1.8
 Requires-Dist: pandera<0.24,>=0.23.1
 Requires-Dist: pyfastx<3,>=2.2.0
 Requires-Dist: intervaltree<4,>=3.1.0
-Provides-Extra:
-Requires-Dist: pytest<9,>=8.3.5; extra == "
-Requires-Dist: pytest-md>=0.2.0; extra == "
-Requires-Dist: pytest-workflow==2.1.0; extra == "
+Provides-Extra: test
+Requires-Dist: pytest<9,>=8.3.5; extra == "test"
+Requires-Dist: pytest-md>=0.2.0; extra == "test"
+Requires-Dist: pytest-workflow==2.1.0; extra == "test"
 Provides-Extra: dev
 Requires-Dist: pre-commit>=4.2.0; extra == "dev"
 Requires-Dist: black>=25.1.0; extra == "dev"
{mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/RECORD
RENAMED
@@ -14,7 +14,7 @@ mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=yLpzkRJ
 mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=K6gniytuItq5WzHLi1BsaUCOdP4Zm0_ZzW2_ns7-BTI,11114
 mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py,sha256=epVClL10QcllL8yu7YGjx0rXNVHL2GxHi-Ek0MOjsjo,13859
 mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py,sha256=NZSNY2bqs_TQyz8riDqiEFPLKcwTgzh1C7DeVHT6V8Q,4366
-mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=
+mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=vZdDIcG09hulgCp0FylwHXVSGSlwl2RsDU4_xvsrUC0,6732
 mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py,sha256=Pq-9RSt3RCxzDMQVW1VHlHF4NtpVwCWFbg2CMkvpZZc,19089
 mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py,sha256=2T4T7aXMGPac-LZUXJF3lOUzZZF50dAKkKTSaO-4idQ,3587
 mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py,sha256=6gbCRlEX1eBqzFYjOt3og-961dZ--QsCJL-7l5nzg1k,33992
@@ -22,9 +22,9 @@ mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py,sha256=_4J31wAjK5B1
 mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py,sha256=_iaTBvMKbQDi_02_QuSPqLJ_rC37ruxiPHv5lLQmI-w,5480
 mgnify_pipelines_toolkit/analysis/assembly/go_utils.py,sha256=eay9e3Xdc8XxnlC_4SHHjN89k-M9i_cFMc2lI_ZFxqY,5596
 mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py,sha256=uex2T6GagtYFBIc39-Xm4SFHL06KAQ5v0_loOmY_eaw,4289
-mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py,sha256=
+mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py,sha256=5m5AwWEKidJx1FI0y93AFka7z0zEE8dBf1ofgP8TV_Y,7108
 mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py,sha256=DYZhChGD49M-zAtGkCmNHXDoVTnd5Qy6amG-oePO8Ek,5981
-mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py,sha256=
+mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py,sha256=eNichqFFmfPsa2J10IUm_PemVs9fBhbKa2vpDqEvJNU,21791
 mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=jUeA7I12YrtIqnm3hUxpdgsWfa2pP1ALGjb9OMKPcgY,10643
 mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=TPaKlYkoy37_XgYNOskWCCoXtPNku_k5ygSeK4fT1VQ,6689
 mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=1wblLbZl521digIUWoqneAu15gErzvN_oC--5T_xUdw,4582
@@ -49,9 +49,9 @@ mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pyDZvCuWbwccQF0D7c5BN1vv36wQd
 mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
 mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
-mgnify_pipelines_toolkit-1.
-mgnify_pipelines_toolkit-1.
-mgnify_pipelines_toolkit-1.
-mgnify_pipelines_toolkit-1.
-mgnify_pipelines_toolkit-1.
-mgnify_pipelines_toolkit-1.
+mgnify_pipelines_toolkit-1.2.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mgnify_pipelines_toolkit-1.2.0.dist-info/METADATA,sha256=uTDvoF0oYy-1ApjeygcGbjipM9ZLt1tLArKA6xDNyl4,5807
+mgnify_pipelines_toolkit-1.2.0.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+mgnify_pipelines_toolkit-1.2.0.dist-info/entry_points.txt,sha256=JSjuxAr71MTeSUPPpno22wmZYgVO-gbsXfDkgWKkF7A,3533
+mgnify_pipelines_toolkit-1.2.0.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+mgnify_pipelines_toolkit-1.2.0.dist-info/RECORD,,
{mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/entry_points.txt
RENAMED
File without changes

{mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/licenses/LICENSE
RENAMED
File without changes

{mgnify_pipelines_toolkit-1.1.1.dist-info → mgnify_pipelines_toolkit-1.2.0.dist-info}/top_level.txt
RENAMED
File without changes