mgnify-pipelines-toolkit 1.4.1__tar.gz → 1.4.4__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of mgnify-pipelines-toolkit has been flagged as potentially problematic.
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/PKG-INFO +1 -1
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +23 -80
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +4 -12
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +61 -21
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +1 -3
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +1 -3
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +4 -12
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +12 -37
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py +12 -37
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +2 -6
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py +33 -91
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py +6 -18
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py +3 -9
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +1 -3
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py +5 -15
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py +3 -16
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py +5 -19
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +1 -1
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py +11 -22
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +5 -15
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py +9 -29
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py +14 -24
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py +19 -72
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +4 -12
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +4 -12
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +2 -6
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +3 -10
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py +22 -85
- mgnify_pipelines_toolkit-1.4.4/mgnify_pipelines_toolkit/ena/webin_cli_handler.py +741 -0
- mgnify_pipelines_toolkit-1.4.4/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +5 -15
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +1 -1
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +2 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +1 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/pyproject.toml +5 -1
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/README.md +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/genomes/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/amrintegrator.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/db_labels.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/ncrna.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/tax_ranks.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1/mgnify_pipelines_toolkit/utils → mgnify_pipelines_toolkit-1.4.4/mgnify_pipelines_toolkit/ena}/__init__.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/schemas/dataframes.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/requires.txt +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
- {mgnify_pipelines_toolkit-1.4.1 → mgnify_pipelines_toolkit-1.4.4}/setup.cfg +0 -0
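
The per-file hunks are reproduced below. To spot-check them against the registry yourself, you can fetch both sdists and compare them locally; a minimal sketch in Python (assuming `pip` is on `PATH`, network access, and that both versions are still published on PyPI):

```python
import subprocess

# Fetch both source distributions from the registry
# (source only, no dependency resolution).
for version in ("1.4.1", "1.4.4"):
    subprocess.run(
        [
            "pip", "download", "--no-deps", "--no-binary", ":all:",
            f"mgnify-pipelines-toolkit=={version}",
        ],
        check=True,
    )

# Then unpack the two tarballs and diff the trees, e.g.:
#   tar xf mgnify_pipelines_toolkit-1.4.1.tar.gz
#   tar xf mgnify_pipelines_toolkit-1.4.4.tar.gz
#   diff -ruN mgnify_pipelines_toolkit-1.4.1 mgnify_pipelines_toolkit-1.4.4
```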
mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py (+23 -80):

```diff
@@ -73,11 +73,7 @@ def get_multiregion(raw_sequence_coords, regions):
         region_coverages[region] = overlap
 
     # check if any of the coords are inside the region
-    matched_regions = [
-        region
-        for region, limits in regions.items()
-        if calc_overlap(raw_sequence_coords, limits) >= MIN_OVERLAP
-    ]
+    matched_regions = [region for region, limits in regions.items() if calc_overlap(raw_sequence_coords, limits) >= MIN_OVERLAP]
     if len(matched_regions) > 1:
         amplified_region = "{}-{}".format(min(matched_regions), max(matched_regions))
     elif len(matched_regions) == 1:
@@ -121,13 +117,8 @@ def unsplit_region(long_region):
 
 
 def check_inclusiveness(more_frequent, less_frequent):
-    unsplit_more_frequent, unsplit_less_frequent = [
-        unsplit_region(region) for region in [more_frequent, less_frequent]
-    ]
-    return (
-        unsplit_more_frequent[0] <= unsplit_less_frequent[0]
-        and unsplit_more_frequent[1] >= unsplit_less_frequent[1]
-    )
+    unsplit_more_frequent, unsplit_less_frequent = [unsplit_region(region) for region in [more_frequent, less_frequent]]
+    return unsplit_more_frequent[0] <= unsplit_less_frequent[0] and unsplit_more_frequent[1] >= unsplit_less_frequent[1]
 
 
 def normalise_results(region_matches):
@@ -150,9 +141,7 @@ def normalise_results(region_matches):
         if count / len(region_matches) >= MAX_ERROR_PROPORTION and region != ""
     ]
     # sort by frequency in reverse order
-    var_region_proportions = sorted(
-        var_region_proportions, key=lambda x: x[1], reverse=True
-    )
+    var_region_proportions = sorted(var_region_proportions, key=lambda x: x[1], reverse=True)
 
     if len(var_region_proportions) == 1:
         return dict(var_region_proportions)
@@ -165,9 +154,7 @@ def normalise_results(region_matches):
         else:
             return None
     else:
-        if min(
-            more_frequent[1], less_frequent[1]
-        ) > 0.1 and not check_inclusiveness(less_frequent[0], more_frequent[0]):
+        if min(more_frequent[1], less_frequent[1]) > 0.1 and not check_inclusiveness(less_frequent[0], more_frequent[0]):
             return dict(var_region_proportions)
         else:
             return None
@@ -221,9 +208,7 @@ def determine_marker_gene(domain):
         return "18S"
 
 
-def print_stats(
-    run_id, num_sequences, num_unsupported, num_inside_vr, run_result, stats_out
-):
+def print_stats(run_id, num_sequences, num_unsupported, num_inside_vr, run_result, stats_out):
     summary_num = dict()
     for cm in run_result:
         summary_num[cm] = dict()
@@ -233,14 +218,7 @@ def print_stats(
             del stats[""]
         summary_num[cm]["regions"] = ", ".join(stats.keys())
         summary_num[cm]["freqs"] = ", ".join(
-            [
-                (
-                    "{0:.4f}".format(val / len(run_result[cm]))
-                    if len(run_result[cm]) > 0
-                    else "0"
-                )
-                for val in stats.values()
-            ]
+            [("{0:.4f}".format(val / len(run_result[cm])) if len(run_result[cm]) > 0 else "0") for val in stats.values()]
         )
 
     print_str = ""
@@ -291,9 +269,7 @@ def print_to_table(tsv_out, results, per_read_info):
         marker_gene = determine_marker_gene(domain)
         for vr in amplified_regions.keys():
             if not vr == "":
-                record = "{}\tECO_0000363\tautomatic assertion\t{}\t{}\n".format(
-                    run, determine_marker_gene(domain), vr
-                )
+                record = "{}\tECO_0000363\tautomatic assertion\t{}\t{}\n".format(run, determine_marker_gene(domain), vr)
                 records.add(record)
                 records_regions.add(f"{marker_gene}.{vr}\n")
                 gene_hv_to_write.append(f"{marker_gene}.{vr}")
@@ -325,9 +301,7 @@ def retrieve_regions(
     sequence_counter_total = 0  # count how many sequences in total were analyzed
     sequence_counter_useful = 0  # count how many sequences an output was generated for
     normalised_matches = dict()  # dictionary that will contain results for all runs
-    failed_run_counter = (
-        0  # total number of excluded runs for any reason (except non-existing files)
-    )
+    failed_run_counter = 0  # total number of excluded runs for any reason (except non-existing files)
     run_counters = {k: 0 for k in ["one", "two", "ambiguous"]}  # counters
     seq_per_variable_region_count = dict()
 
@@ -343,13 +317,9 @@ def retrieve_regions(
         data = load_data(tblout_file)
         run_id = identify_run(tblout_file)
         multiregion_matches = dict()
-        unsupported_matches = (
-            0  # tracks the number of sequences that map to unsupported models
-        )
+        unsupported_matches = 0  # tracks the number of sequences that map to unsupported models
         primer_inside_vr = 0  # tracks the number of sequences that start and/or end inside a variable region
-        per_read_info = (
-            dict()
-        )  # dictionary will contain read names for each variable region
+        per_read_info = dict()  # dictionary will contain read names for each variable region
         all_region_coverages = defaultdict(lambda: defaultdict(list))
         for read in data:
             # Example structure of `read`
@@ -362,18 +332,13 @@ def retrieve_regions(
             if not regions == "unsupported":
                 matches, coverages = get_multiregion(limits, regions)
 
-                [
-                    all_region_coverages[domain][region].append(coverage)
-                    for region, coverage in coverages.items()
-                ]
+                [all_region_coverages[domain][region].append(coverage) for region, coverage in coverages.items()]
 
                 multiregion_matches.setdefault(read[2], []).append(matches)
                 if check_primer_position(limits, regions):
                     primer_inside_vr += 1
                 sequence_counter_useful += 1
-                per_read_info.setdefault(marker_gene + "." + matches, []).append(
-                    read[0]
-                )
+                per_read_info.setdefault(marker_gene + "." + matches, []).append(read[0])
             else:
                 unsupported_matches += 1
 
@@ -394,11 +359,7 @@ def retrieve_regions(
         if unsupported_fract >= MAX_ERROR_PROPORTION:
             failed_run_counter += 1
             logging.info("No output will be produced - too many unsupported models")
-            logging.info(
-                "Excluded\t{}\t{}\t{}\n".format(
-                    tblout_file, "{0:.2f}".format(unsupported_fract), len(data)
-                )
-            )
+            logging.info("Excluded\t{}\t{}\t{}\n".format(tblout_file, "{0:.2f}".format(unsupported_fract), len(data)))
            continue
 
         normalised_matches[run_id] = dict()
@@ -451,9 +412,7 @@ def retrieve_regions(
             run_result[determine_domain(model)] = result
             for reg, freq in result.items():
                 total_useful_sequences += len(model_regions) * freq
-                temp_seq_counter[determine_domain(model) + " " + reg] = (
-                    len(model_regions) * freq
-                )
+                temp_seq_counter[determine_domain(model) + " " + reg] = len(model_regions) * freq
         if total_useful_sequences / len(data) < 0.75 and run_status != "ambiguous":
             failed_run_counter += 1
             logging.info("No output will be produced - too few useful sequences")
@@ -511,16 +470,12 @@ def retrieve_regions(
             seq_count_out.write("{}\t{}\n".format(key, int(value)))
 
     logging.info(
-        "Analyzed {} files and {} sequences. Output generated for {} sequences".format(
-            file_counter, sequence_counter_total, sequence_counter_useful
-        )
+        "Analyzed {} files and {} sequences. Output generated for {} sequences".format(file_counter, sequence_counter_total, sequence_counter_useful)
    )
 
 
 def parse_args(argv):
-    parser = argparse.ArgumentParser(
-        description="Tool to determine which regions were amplified in 16S data"
-    )
+    parser = argparse.ArgumentParser(description="Tool to determine which regions were amplified in 16S data")
     parser.add_argument("files", nargs="+", help="A list of overlapped tblout files")
     parser.add_argument(
         "-d",
@@ -534,9 +489,7 @@ def parse_args(argv):
         default="amplified_regions",
         help="Prefix for all outputs",
     )
-    parser.add_argument(
-        "--statistics", action="store_true", help="Print statistics files"
-    )
+    parser.add_argument("--statistics", action="store_true", help="Print statistics files")
     return parser.parse_args(argv)
 
 
@@ -546,18 +499,10 @@ def main(argv=None):
     if not os.path.isdir(args.output_dir):
         os.mkdir(args.output_dir)
     prefix = os.path.join(args.output_dir, args.output_prefix)
-    stats_file = "{}.stats".format(
-        prefix
-    )  # detailed stats for each run before filtration steps
-    condensed_stats_file = "{}.condensed_stats".format(
-        prefix
-    )  # basic stats for the batch of runs
-    missing_files_log = "{}.missing_files.txt".format(
-        prefix
-    )  # the names of non-existent files
-    seq_count_log = "{}.seq_count.txt".format(
-        prefix
-    )  # the number of sequences per domain/VR in the batch
+    stats_file = "{}.stats".format(prefix)  # detailed stats for each run before filtration steps
+    condensed_stats_file = "{}.condensed_stats".format(prefix)  # basic stats for the batch of runs
+    missing_files_log = "{}.missing_files.txt".format(prefix)  # the names of non-existent files
+    seq_count_log = "{}.seq_count.txt".format(prefix)  # the number of sequences per domain/VR in the batch
     stats_out = open(stats_file, "w")
     condensed_out = open(condensed_stats_file, "w")
     missing_out = open(missing_files_log, "w")
@@ -568,9 +513,7 @@ def main(argv=None):
         "Fraction archaea\tFraction eukaryotes\tUnidentified bact\tRegions bact\tFreqs bact\t"
         "Unidentified arch\tRegions arch\tFreqs arch\tUnidentified euk\tRegions euk\tFreqs euk\n"
     )
-    retrieve_regions(
-        args.files, prefix, stats_out, condensed_out, missing_out, seq_count_out
-    )
+    retrieve_regions(args.files, prefix, stats_out, condensed_out, missing_out, seq_count_out)
     stats_out.close()
     condensed_out.close()
     missing_out.close()
```
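
Almost all of the classify_var_regions.py hunks above are pure reformatting: statements previously wrapped across several lines are collapsed onto one line with no change in behaviour. The one piece of logic worth understanding is the matched_regions comprehension in the first hunk, which assigns a read to every variable region it covers sufficiently. A minimal standalone sketch, assuming calc_overlap returns the covered fraction of a region (its real definition, the real MIN_OVERLAP value, and the real region coordinates are not part of this diff):

```python
MIN_OVERLAP = 0.95  # assumed threshold; the real value lives in the toolkit's constants


def calc_overlap(read_coords, region_limits):
    # Fraction of the variable region covered by the read (assumed semantics).
    read_beg, read_end = read_coords
    region_beg, region_end = region_limits
    overlap = min(read_end, region_end) - max(read_beg, region_beg)
    return max(overlap, 0) / (region_end - region_beg)


regions = {"V3": (338, 533), "V4": (576, 682)}  # hypothetical 16S coordinates
read = (330, 700)
matched_regions = [region for region, limits in regions.items() if calc_overlap(read, limits) >= MIN_OVERLAP]
print(matched_regions)  # ['V3', 'V4'] -> reported as "V3-V4" via min/max
```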
mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py (+4 -12):

```diff
@@ -25,9 +25,7 @@ logging.basicConfig(level=logging.DEBUG)
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-i", "--input", required=True, type=str, help="Input from MAPseq output"
-    )
+    parser.add_argument("-i", "--input", required=True, type=str, help="Input from MAPseq output")
     parser.add_argument(
         "-l",
         "--label",
@@ -135,19 +133,13 @@ def process_blank_tax_ends(res_df, ranks):
     for i in range(len(res_df)):
         last_empty_rank = ""
         currently_empty = False
-        for j in reversed(
-            range(len(ranks))
-        ):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
+        for j in reversed(range(len(ranks))):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
             curr_rank = res_df.iloc[i, j + 1]
             if curr_rank in ranks:
-                if (
-                    last_empty_rank == ""
-                ):  # Last rank is empty, start window of consecutive blanks
+                if last_empty_rank == "":  # Last rank is empty, start window of consecutive blanks
                     last_empty_rank = j + 1
                     currently_empty = True
-                elif (
-                    currently_empty
-                ):  # If we're in a window of consecutive blank assignments that started at the beginning
+                elif currently_empty:  # If we're in a window of consecutive blank assignments that started at the beginning
                     last_empty_rank = j + 1
             else:
                 break
```
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py (+61 -21; a few removed lines were lost in extraction and are shown as bare `-` markers):

```diff
@@ -15,22 +15,25 @@
 # limitations under the License.
 
 import argparse
-
+import logging
 import re
+from collections import defaultdict
 
+import pandas as pd
 from Bio import SeqIO
 from Bio.Seq import Seq
-import pandas as pd
 
 from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
-    REGIONS_16S_BACTERIA,
     REGIONS_16S_ARCHAEA,
+    REGIONS_16S_BACTERIA,
     REGIONS_18S,
 )
 
 STRAND_FWD = "fwd"
 STRAND_REV = "rev"
 
+logging.basicConfig(level=logging.INFO)
+
 
 def parse_args():
     parser = argparse.ArgumentParser()
@@ -65,23 +68,44 @@ def parse_args():
     return input, fasta, sample, single_end
 
 
-def get_amp_region(beg, end, strand, model):
+def get_amp_region(primer_beg: float, primer_end: float, strand: str, model: dict) -> str:
     prev_region = ""
 
+    # some valid primers go inside HV regions a little bit, this margin is to account for that
     margin = -10
 
     for region, region_coords in model.items():
-
+        # get current region start and end coordinates
         region_beg = region_coords[0]
-
-
-
-
-
+        region_end = region_coords[1]
+
+        # compute where primer beginning is in relation to current region
+        region_beg_primer_beg_diff = region_beg - primer_beg
+        region_beg_primer_end_diff = region_beg - primer_end
+        primer_beg_near_region_start = region_beg_primer_beg_diff >= margin
+        primer_end_near_region_start = region_beg_primer_end_diff >= margin
+
+        # compute where primer end is in relation to current region
+        region_end_primer_beg_diff = region_end - primer_beg
+        region_end_primer_end_diff = region_end - primer_end
+        primer_beg_before_region_end = region_end_primer_beg_diff >= margin
+        primer_end_before_region_end = region_end_primer_end_diff >= margin
+
+        if primer_beg_near_region_start and primer_end_near_region_start:
+            # if both these statements are true then primer is before a HV region
+            # i.e. validation = true
+            if strand == STRAND_FWD:
                 return region
-
-
+            else:
+                # if primer strand is REV then we return the previous region
                 return prev_region
+        elif primer_beg_before_region_end and primer_end_before_region_end:
+            # if the previous if statement is FALSE
+            # AND if both these statements are true then primer is within a HV region
+            # i.e. validation = false
+            logging.warning(f"This primer is within HV region {region}: {str(int(primer_beg))}-{str(int(primer_end))} vs {region_beg}-{region_end}")
+            return ""
+        # keep iterating through HV regions otherwise
 
         prev_region = region
 
@@ -89,10 +113,11 @@ def get_amp_region(beg, end, strand, model):
 
 
 def main():
-
     input, fasta, sample, single_end = parse_args()
     res_dict = defaultdict(list)
+
     fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
+    logging.info(f"Total primers read (including permutations): {len(fasta_dict)}")
 
     fwd_primers_fw = open("./fwd_primers.fasta", "w")
     rev_primers_fw = open("./rev_primers.fasta", "w")
@@ -100,6 +125,7 @@ def main():
     matched_primers_list = []
 
     with open(input, "r") as fr:
+        logging.info(f"Reading deoverlap file: {input}")
         for line in fr:
             line = line.strip()
             line = re.sub("[ \t]+", "\t", line)
@@ -133,10 +159,6 @@ def main():
             amp_region = "Unknown"
             model = ""
 
-            res_dict["Run"].append(sample)
-            res_dict["AssertionEvidence"].append("ECO_0000363")
-            res_dict["AssertionMethod"].append("automatic assertion")
-
             strand = ""
 
             if primer_name[-1] == "F":
@@ -144,18 +166,26 @@ def main():
             elif primer_name[-1] == "R":
                 strand = STRAND_REV
             else:
-
+                logging.warning(f"Not sure what strand this is, skipping: {primer_name}")
+                continue
 
             if model:
+                logging.info(f"Checking match coordinates for primer {primer_name}")
                 amp_region = get_amp_region(beg, end, strand, model)
 
+                if not amp_region:
+                    logging.warning(f"Primer validation failed for {primer_name}, skipping")
+                    continue
+
             primer_seq = str(fasta_dict[cleaned_primer_name].seq)
 
+            res_dict["Run"].append(sample)
+            res_dict["AssertionEvidence"].append("ECO_0000363")
+            res_dict["AssertionMethod"].append("automatic assertion")
             res_dict["Gene"].append(gene)
             res_dict["VariableRegion"].append(amp_region)
             res_dict["PrimerName"].append(cleaned_primer_name)
             res_dict["PrimerStrand"].append(strand)
-            res_dict["PrimerSeq"].append(primer_seq)
 
             if strand == STRAND_FWD:
                 fwd_primers_fw.write(f">{cleaned_primer_name}\n{primer_seq}\n")
@@ -164,11 +194,21 @@ def main():
                 primer_seq = Seq(primer_seq).reverse_complement()
                 rev_primers_fw.write(f">{cleaned_primer_name}\n{primer_seq}\n")
 
+            res_dict["PrimerSeq"].append(primer_seq)
+
             matched_primers_list.append(cleaned_primer_name)
+            logging.info(f"Added {cleaned_primer_name} to list of matched primers")
 
-    res_df = pd.DataFrame.from_dict(res_dict)
     res_tsv_name = f"./{sample}_primer_validation.tsv"
-
+    if res_dict:
+        res_df = pd.DataFrame.from_dict(res_dict)
+        res_df.to_csv(res_tsv_name, sep="\t", index=False) if not res_df.empty else open(res_tsv_name, "w").close()
+        logging.info(f"{len(res_df)} primers validated, generating output")
+
+    else:
+        logging.warning("No primers were successfully validated, generating empty outputs")
+        primer_val_fw = open(res_tsv_name, "w")
+        primer_val_fw.close()
 
     fwd_primers_fw.close()
     rev_primers_fw.close()
```
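
primer_val_classification.py is the one file here with substantive behavioural changes: get_amp_region gains an annotated signature and a commented margin check, primers whose strand cannot be determined or whose validation fails are now skipped with a logged warning instead of producing partially filled rows, the PrimerSeq column is appended only after any reverse-complementing, and an empty TSV is written when nothing validates. The margin logic can be illustrated in isolation; the function below is a simplified re-derivation for a forward primer only, and the coordinates are hypothetical rather than the real REGIONS_16S_BACTERIA values:

```python
# Hypothetical V-region model; the toolkit's real coordinates live in
# mgnify_pipelines_toolkit.constants.var_region_coordinates.
MODEL = {"V1": (69, 92), "V2": (131, 239)}
MARGIN = -10  # tolerate primers that reach slightly into a region


def classify_fwd_primer(primer_beg, primer_end):
    # Simplified sketch of the diffed logic, forward strand only.
    for region, (region_beg, region_end) in MODEL.items():
        # Primer entirely upstream of this region (within the margin)?
        if region_beg - primer_beg >= MARGIN and region_beg - primer_end >= MARGIN:
            return region  # valid: amplification starts at this region
        # Otherwise, a primer still ahead of the region's end sits inside it.
        if region_end - primer_beg >= MARGIN and region_end - primer_end >= MARGIN:
            return ""  # invalid: mirrors the new warn-and-skip path
    return ""


print(classify_fwd_primer(100, 118))  # -> "V2": primer sits between V1 and V2
print(classify_fwd_primer(150, 168))  # -> "": primer lands inside V2
```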
mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py (+1 -3):

```diff
@@ -33,9 +33,7 @@ def parse_args():
         type=str,
         help="Path to forward (or single-end) fastq file",
     )
-    parser.add_argument(
-        "-r", "--rev", required=False, type=str, help="Path to reverse fastq file"
-    )
+    parser.add_argument("-r", "--rev", required=False, type=str, help="Path to reverse fastq file")
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     args = parser.parse_args()
 
```
mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py (+1 -3):

```diff
@@ -55,9 +55,7 @@ def main():
         if "R" in primer_name:
             primers_dict[primer_key].seq = primer.seq.reverse_complement()
 
-    SeqIO.write(
-        primers_dict.values(), f"{output}/{sample}_rev_comp_se_primers.fasta", "fasta"
-    )
+    SeqIO.write(primers_dict.values(), f"{output}/{sample}_rev_comp_se_primers.fasta", "fasta")
 
 
 if __name__ == "__main__":
```
mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py (+4 -12):

```diff
@@ -63,9 +63,7 @@ def process_lines(lines, output_handler, rhea2reaction_dict, protein_hashes):
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        "Use diamond output file to create a table with Rhea and CHEBI reaction annotation for every protein."
-    )
+    parser = argparse.ArgumentParser("Use diamond output file to create a table with Rhea and CHEBI reaction annotation for every protein.")
     parser.add_argument(
         "-d",
         "--diamond_hits",
@@ -105,9 +103,7 @@ def main():
     proteins = args.proteins
     rhea2chebi = args.rhea2chebi
 
-    logging.info(
-        f"Step 1/3: Parse protein fasta and calculating SHA256 hash from {proteins.resolve()}"
-    )
+    logging.info(f"Step 1/3: Parse protein fasta and calculating SHA256 hash from {proteins.resolve()}")
     protein_hashes = {}
     with open(proteins, "r") as fasta_file:
         for record in SeqIO.parse(fasta_file, "fasta"):
@@ -118,17 +114,13 @@ def main():
     df = pd.read_csv(rhea2chebi, delimiter="\t")
     rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
 
-    logging.info(
-        f"Step 3/3: Read DIAMOND results from {'STDIN' if diamond_hits == '-' else Path(diamond_hits).resolve()} and write output"
-    )
+    logging.info(f"Step 3/3: Read DIAMOND results from {'STDIN' if diamond_hits == '-' else Path(diamond_hits).resolve()} and write output")
     with open(output, "w") as output_handler:
         if diamond_hits == "-":
             process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
         else:
             with open(diamond_hits, "r") as input_file:
-                process_lines(
-                    input_file, output_handler, rhea2reaction_dict, protein_hashes
-                )
+                process_lines(input_file, output_handler, rhea2reaction_dict, protein_hashes)
 
     logging.info("Processed successfully. Exiting.")
 
```
mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py (+12 -37):

```diff
@@ -23,12 +23,8 @@ import pandas as pd
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-i", "--input", required=True, type=str, help="Input JSON from antiSMASH"
-    )
-    parser.add_argument(
-        "-o", "--output", required=True, type=str, help="Output GFF3 file name"
-    )
+    parser.add_argument("-i", "--input", required=True, type=str, help="Input JSON from antiSMASH")
+    parser.add_argument("-o", "--output", required=True, type=str, help="Output GFF3 file name")
     parser.add_argument(
         "--cds_tag",
         default="ID",
@@ -57,17 +53,13 @@ def main():
     for record in antismash_analysis["records"]:
         record_id = record["id"]
 
-        iter_cds = (
-            "antismash.detection.genefunctions" in record["modules"].keys()
-        )  # Flag to iterate CDS
+        iter_cds = "antismash.detection.genefunctions" in record["modules"].keys()  # Flag to iterate CDS
         region_name = None
 
         for feature in record["features"]:
             if feature["type"] == "region":
                 # Annotate region features
-                region_name = (
-                    f"{record_id}_region{feature['qualifiers']['region_number'][0]}"
-                )
+                region_name = f"{record_id}_region{feature['qualifiers']['region_number'][0]}"
                 region_start = int(feature["location"].split(":")[0].split("[")[1])
                 region_end = int(feature["location"].split(":")[1].split("]")[0])
 
@@ -82,9 +74,7 @@ def main():
 
                 product = ",".join(feature["qualifiers"].get("product", []))
 
-                attributes_dict[region_name].update(
-                    {"ID": region_name, "product": product}
-                )
+                attributes_dict[region_name].update({"ID": region_name, "product": product})
 
             if iter_cds and feature["type"] == "CDS":
                 # Annotate CDS features
@@ -111,12 +101,8 @@ def main():
                 attributes_dict[locus_tag].update(
                     {
                         "ID": locus_tag,
-                        "as_type": ",".join(
-                            feature["qualifiers"].get("gene_kind", ["other"])
-                        ),
-                        "gene_functions": ",".join(
-                            feature["qualifiers"].get("gene_functions", [])
-                        )
+                        "as_type": ",".join(feature["qualifiers"].get("gene_kind", ["other"])),
+                        "gene_functions": ",".join(feature["qualifiers"].get("gene_functions", []))
                         .replace(" ", "_")
                         .replace(":_", ":")
                         .replace(";_", "%3B"),
@@ -126,9 +112,7 @@ def main():
 
         # Extended CDS attributes
         if "antismash.detection.hmm_detection" in record["modules"].keys():
-            cds_by_protocluster = record["modules"][
-                "antismash.detection.hmm_detection"
-            ]["rule_results"]["cds_by_protocluster"]
+            cds_by_protocluster = record["modules"]["antismash.detection.hmm_detection"]["rule_results"]["cds_by_protocluster"]
 
             if not cds_by_protocluster:
                 continue
@@ -137,14 +121,10 @@ def main():
                 if locus_tag := feature.get("cds_name"):
                     as_clusters = ",".join(list(feature["definition_domains"].keys()))
                     if locus_tag in attributes_dict:
-                        attributes_dict[locus_tag].update(
-                            {"as_gene_clusters": as_clusters}
-                        )
+                        attributes_dict[locus_tag].update({"as_gene_clusters": as_clusters})
 
         if "antismash.detection.genefunctions" in record["modules"].keys():
-            gene_function_tools = record["modules"][
-                "antismash.detection.genefunctions"
-            ]["tools"]
+            gene_function_tools = record["modules"]["antismash.detection.genefunctions"]["tools"]
             if tool_data := gene_function_tools.get("smcogs"):
 
                 for locus_tag in tool_data["best_hits"]:
@@ -158,18 +138,13 @@ def main():
                     if locus_tag in attributes_dict.keys():
                         attributes_dict[locus_tag].update({"as_notes": smcog_note})
 
-    attributes = [
-        ";".join(f"{k}={v}" for k, v in attrib_data.items() if v)
-        for attrib_data in attributes_dict.values()
-    ]
+    attributes = [";".join(f"{k}={v}" for k, v in attrib_data.items() if v) for attrib_data in attributes_dict.values()]
     res_dict["attributes"] = attributes
 
     res_df = pd.DataFrame.from_dict(res_dict)
 
     with open(output_file, "w") as f_out:
-        f_out.write(
-            "##gff-version 3\n"
-        )  # Save data to the GFF3 file with the proper header
+        f_out.write("##gff-version 3\n")  # Save data to the GFF3 file with the proper header
         res_df.to_csv(f_out, header=False, index=False, sep="\t")
 
```