PyPI - mgnify-pipelines-toolkit - Versions diffs - 1.0.6.tar.gz → 1.0.8.tar.gz - Mend

@@ -49,7 +49,7 @@ def main():
 def load_cgcs(standard_path):
     cgc_locations = dict()
-    with fileinput.hook_compressed(standard_path, "rt") as file_in:
+    with fileinput.hook_compressed(standard_path, "r", encoding="utf-8") as file_in:
         for line in file_in:
             if not line.startswith("CGC#"):
                 cgc, _, contig, _, start, end, _, _ = line.strip().split("\t")
@@ -72,7 +72,7 @@ def print_gff(standard_path, outfile, dbcan_version, substrates, cgc_locations):
     with open(outfile, "w") as file_out:
         file_out.write("##gff-version 3\n")
         cgcs_printed = list()
-        with fileinput.hook_compressed(standard_path, "rt") as file_in:
+        with fileinput.hook_compressed(standard_path, "r", encoding="utf-8") as file_in:
             for line in file_in:
                 if not line.startswith("CGC#"):
                     cgc, gene_type, contig, prot_id, start, end, strand, protein_fam = (
@@ -107,7 +107,7 @@ def print_gff(standard_path, outfile, dbcan_version, substrates, cgc_locations):
 def load_substrates(substrate_path):
     substrates = dict()
-    with fileinput.hook_compressed(substrate_path, "rt") as file_in:
+    with fileinput.hook_compressed(substrate_path, "r", encoding="utf-8") as file_in:
         for line in file_in:
             if not line.startswith("#"):
                 parts = line.strip().split("\t")

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.0.6
+Version: 1.0.8
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0

@@ -401,12 +401,10 @@ def retrieve_regions(
         region_counter = defaultdict(int)
         regions_to_remove = []
         for model, value in multiregion_matches.items():
             marker_gene = determine_marker_gene(determine_domain(model))
             for region in value:
                 region_counter[f"{marker_gene}.{region}"] += 1
         for region, count in region_counter.items():
             if count < MIN_SEQ_COUNT:
                 regions_to_remove.append(region)
@@ -421,6 +419,8 @@ def retrieve_regions(
         for model, value in multiregion_matches.items():
             new_value = []
             for region in value:
+                if region == "":
+                    continue
                 marker_gene = determine_marker_gene(determine_domain(model))
                 full_region = f"{marker_gene}.{region}"
                 if full_region not in regions_to_remove:
@@ -463,6 +463,9 @@ def retrieve_regions(
             for key, value in temp_seq_counter.items():
                 seq_per_variable_region_count.setdefault(key, 0)
                 seq_per_variable_region_count[key] += value
+        else:
+            logging.info("No output will be produced - the run is ambiguous.")
+            continue
     json_outfile = "{}.json".format(outfile_prefix)
     tsv_outfile = "{}.tsv".format(outfile_prefix)

@@ -52,7 +52,7 @@ def main():
 def load_gff(gff):
     genome_gff_lines = dict()
-    with fileinput.hook_compressed(gff, "rt") as gff:
+    with fileinput.hook_compressed(gff, "r", encoding="utf-8") as gff:
         for line in gff:
             if line.startswith("##FASTA"):
                 return genome_gff_lines
@@ -81,7 +81,7 @@ def load_gff(gff):
 def print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_lines):
     with open(outfile, "w") as file_out:
         file_out.write("##gff-version 3\n")
-        with fileinput.hook_compressed(overview_file, "rt") as file_in:
+        with fileinput.hook_compressed(overview_file, "r", encoding="utf-8") as file_in:
             for line in file_in:
                 if line.startswith("MGYG") or line.startswith("ERZ"):
                     (
@@ -151,7 +151,7 @@ def print_gff(overview_file, outfile, dbcan_version, substrates, genome_gff_line
 def load_substrates(hmm_path):
     substrates = dict()
-    with fileinput.hook_compressed(hmm_path, "rt") as file_in:
+    with fileinput.hook_compressed(hmm_path, "r", encoding="utf-8") as file_in:
         header = next(file_in)
         header_fields = header.strip().split("\t")
         substrate_idx = header_fields.index("Substrate")

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.0.6
+Version: 1.0.8
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0

@@ -1,6 +1,6 @@
 [project]
 name = "mgnify_pipelines_toolkit"
-version = "1.0.6"
+version = "1.0.8"
 readme = "README.md"
 license = {text = "Apache Software License 2.0"}
 authors = [

mgnify-pipelines-toolkit 1.0.6.tar.gz → 1.0.8.tar.gz