PyPI - mgnify-pipelines-toolkit - Versions diffs - 1.1.2__tar.gz → 1.2.1__tar.gz - Mend

mgnify-pipelines-toolkit 1.1.2.tar.gz → 1.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (62)

{mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.1.2
+Version: 1.2.1
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0
@@ -8,22 +8,21 @@ Keywords: bioinformatics,pipelines,metagenomics
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.10
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: biopython>=1.85
 Requires-Dist: numpy<3,>=2.2.4
 Requires-Dist: pandas<3,>=2.2.3
-Requires-Dist: regex>=2024.11.6
 Requires-Dist: requests<3,>=2.32.3
 Requires-Dist: click<9,>=8.1.8
 Requires-Dist: pandera<0.24,>=0.23.1
 Requires-Dist: pyfastx<3,>=2.2.0
 Requires-Dist: intervaltree<4,>=3.1.0
-Provides-Extra: tests
-Requires-Dist: pytest<9,>=8.3.5; extra == "tests"
-Requires-Dist: pytest-md>=0.2.0; extra == "tests"
-Requires-Dist: pytest-workflow==2.1.0; extra == "tests"
+Provides-Extra: test
+Requires-Dist: pytest<9,>=8.3.5; extra == "test"
+Requires-Dist: pytest-md>=0.2.0; extra == "test"
+Requires-Dist: pytest-workflow==2.1.0; extra == "test"
 Provides-Extra: dev
 Requires-Dist: pre-commit>=4.2.0; extra == "dev"
 Requires-Dist: black>=25.1.0; extra == "dev"

{mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py RENAMED Viewed

@@ -22,7 +22,6 @@ import pandas as pd
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "-i", "--input", required=True, type=str, help="Input JSON from antiSMASH"
@@ -64,7 +63,6 @@ def main():
         region_name = None
         for feature in record["features"]:
             if feature["type"] == "region":
                 # Annotate region features
                 region_name = (
@@ -129,35 +127,34 @@ def main():
             cds_by_protocluster = record["modules"][
                 "antismash.detection.hmm_detection"
             ]["rule_results"]["cds_by_protocluster"]
-            if len(cds_by_protocluster) > 0:
-                for feature in cds_by_protocluster[0][1]:
-                    if "cds_name" in feature.keys():
-                        locus_tag = feature["cds_name"]
-                        as_clusters = ",".join(
-                            list(feature["definition_domains"].keys())
+            if not cds_by_protocluster:
+                continue
+            for feature in cds_by_protocluster[0][1]:
+                if locus_tag := feature.get("cds_name"):
+                    as_clusters = ",".join(list(feature["definition_domains"].keys()))
+                    if locus_tag in attributes_dict:
+                        attributes_dict[locus_tag].update(
+                            {"as_gene_clusters": as_clusters}
                         )
-                        if locus_tag in attributes_dict.keys():
-                            attributes_dict[locus_tag].update(
-                                {"as_gene_clusters": as_clusters}
-                            )
         if "antismash.detection.genefunctions" in record["modules"].keys():
-            for tool in record["modules"]["antismash.detection.genefunctions"]["tools"]:
-                if tool["tool"] == "smcogs":
-                    for locus_tag in tool["best_hits"]:
-                        hit_id = tool["best_hits"][locus_tag]["hit_id"].split(":")[0]
-                        hit_desc = (
-                            tool["best_hits"][locus_tag]["hit_id"]
-                            .split(":")[1]
-                            .replace(" ", "_")
-                        )
-                        score = tool["best_hits"][locus_tag]["bitscore"]
-                        e_value = tool["best_hits"][locus_tag]["evalue"]
+            gene_function_tools = record["modules"][
+                "antismash.detection.genefunctions"
+            ]["tools"]
+            if tool_data := gene_function_tools.get("smcogs"):
+                for locus_tag in tool_data["best_hits"]:
+                    smcog_id = tool_data["best_hits"][locus_tag]["reference_id"]
+                    smcog_description = tool_data["best_hits"][locus_tag]["description"]
+                    score = tool_data["best_hits"][locus_tag]["bitscore"]
+                    e_value = tool_data["best_hits"][locus_tag]["evalue"]
-                        smcog_note = f"smCOG:{hit_id}:{hit_desc.replace(' ', '_')}(Score:{score}%3BE-value:{e_value})"
-                        if locus_tag in attributes_dict.keys():
-                            attributes_dict[locus_tag].update({"as_notes": smcog_note})
-                        break
+                    smcog_note = f"smCOG:{smcog_id}:{smcog_description.replace(' ', '_')}(Score:{score}%3BE-value:{e_value})"
+                    if locus_tag in attributes_dict.keys():
+                        attributes_dict[locus_tag].update({"as_notes": smcog_note})
     attributes = [
         ";".join(f"{k}={v}" for k, v in attrib_data.items() if v)

{mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit/constants/thresholds.py RENAMED Viewed

@@ -14,9 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# used by fetch_mcp in analysis.amplicon
-MCP_MAX_LINE_COUNT = 300_000
 # used by classify_var_regions in analysis.amplicon
 MIN_OVERLAP = 0.95
 MIN_SEQ_COUNT = 5000
@@ -26,7 +23,6 @@ MAX_INTERNAL_PRIMER_PROPORTION = 0.2
 # used by library_strategy_checker in analysis.shared
 MIN_AMPLICON_STRATEGY_CHECK = 0.30
 # used by markergene_study_summary in analysis.shared
 MAJORITY_MARKER_PROPORTION = 0.45
 # used by gff_toolkit in analysis.assembly

{mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.1.2
+Version: 1.2.1
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0
@@ -8,22 +8,21 @@ Keywords: bioinformatics,pipelines,metagenomics
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.10
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: biopython>=1.85
 Requires-Dist: numpy<3,>=2.2.4
 Requires-Dist: pandas<3,>=2.2.3
-Requires-Dist: regex>=2024.11.6
 Requires-Dist: requests<3,>=2.32.3
 Requires-Dist: click<9,>=8.1.8
 Requires-Dist: pandera<0.24,>=0.23.1
 Requires-Dist: pyfastx<3,>=2.2.0
 Requires-Dist: intervaltree<4,>=3.1.0
-Provides-Extra: tests
-Requires-Dist: pytest<9,>=8.3.5; extra == "tests"
-Requires-Dist: pytest-md>=0.2.0; extra == "tests"
-Requires-Dist: pytest-workflow==2.1.0; extra == "tests"
+Provides-Extra: test
+Requires-Dist: pytest<9,>=8.3.5; extra == "test"
+Requires-Dist: pytest-md>=0.2.0; extra == "test"
+Requires-Dist: pytest-workflow==2.1.0; extra == "test"
 Provides-Extra: dev
 Requires-Dist: pre-commit>=4.2.0; extra == "dev"
 Requires-Dist: black>=25.1.0; extra == "dev"

{mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt RENAMED Viewed

@@ -9,18 +9,12 @@ mgnify_pipelines_toolkit.egg-info/entry_points.txt
 mgnify_pipelines_toolkit.egg-info/requires.txt
 mgnify_pipelines_toolkit.egg-info/top_level.txt
 mgnify_pipelines_toolkit/analysis/__init__.py
-mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py
-mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py
-mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py
-mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py
 mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py
-mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py
 mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
 mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
 mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py
 mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
 mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
-mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py
 mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py
 mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py
 mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py
@@ -49,7 +43,6 @@ mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py
 mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py
 mgnify_pipelines_toolkit/constants/db_labels.py
 mgnify_pipelines_toolkit/constants/ncrna.py
-mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py
 mgnify_pipelines_toolkit/constants/regex_fasta_header.py
 mgnify_pipelines_toolkit/constants/tax_ranks.py
 mgnify_pipelines_toolkit/constants/thresholds.py

{mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit.egg-info/entry_points.txt RENAMED Viewed

@@ -2,17 +2,13 @@
 add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main
 amplicon_study_summary_generator = mgnify_pipelines_toolkit.analysis.amplicon.study_summary_generator:cli
 antismash_gff_builder = mgnify_pipelines_toolkit.analysis.assembly.antismash_gff_builder:main
-are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
 assembly_study_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.study_summary_generator:cli
-assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
-assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
 classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
 combined_gene_caller_merge = mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main
 convert_cmscan_to_cmsearch_tblout = mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main
 dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:main
 fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
 fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
-find_mcp_inflection_points = mgnify_pipelines_toolkit.analysis.amplicon.find_mcp_inflection_points:main
 generate_gaf = mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main
 genomes_extract_bacterial_rrnas_as_tsv = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_bacterial_rrnas_as_tsv:main
 genomes_extract_rrnas_as_fasta = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_rrnas_as_fasta:main
@@ -32,7 +28,6 @@ process_dbcan_cazys = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_r
 process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_clusters:main
 remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
 rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
-standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
 summarise_antismash_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_antismash_bgcs:main
 summarise_goslims = mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims:main
 summarise_sanntis_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_sanntis_bgcs:main

{mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/mgnify_pipelines_toolkit.egg-info/requires.txt RENAMED Viewed

@@ -1,7 +1,6 @@
 biopython>=1.85
 numpy<3,>=2.2.4
 pandas<3,>=2.2.3
-regex>=2024.11.6
 requests<3,>=2.32.3
 click<9,>=8.1.8
 pandera<0.24,>=0.23.1
@@ -14,7 +13,7 @@ black>=25.1.0
 flake8>=7.1.2
 pep8-naming>=0.14.1
-[tests]
+[test]
 pytest<9,>=8.3.5
 pytest-md>=0.2.0
 pytest-workflow==2.1.0

{mgnify_pipelines_toolkit-1.1.2 → mgnify_pipelines_toolkit-1.2.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "mgnify_pipelines_toolkit"
-version = "1.1.2"
+version = "1.2.1"
 readme = "README.md"
 license = { text = "Apache Software License 2.0" }
 authors = [
@@ -8,7 +8,7 @@ authors = [
 ]
 keywords = ["bioinformatics", "pipelines", "metagenomics"]
 description = "Collection of scripts and tools for MGnify pipelines"
-requires-python = ">=3.10"
+requires-python = ">=3.11"
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: Apache Software License",
@@ -19,7 +19,6 @@ dependencies = [
     "biopython>=1.85",
     "numpy>=2.2.4,<3",
     "pandas>=2.2.3,<3",
-    "regex>=2024.11.6",
     "requests>=2.32.3,<3",
     "click>=8.1.8,<9",
     "pandera>=0.23.1,<0.24",
@@ -54,15 +53,10 @@ markergene_study_summary = "mgnify_pipelines_toolkit.analysis.shared.markergene_
 convert_cmscan_to_cmsearch_tblout = "mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main"
 dwc_summary_generator = "mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:main"
 # analysis.amplicon #
-are_there_primers = "mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main"
-assess_inflection_point_mcp = "mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main"
-assess_mcp_proportions = "mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main"
 classify_var_regions = "mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main"
-find_mcp_inflection_points = "mgnify_pipelines_toolkit.analysis.amplicon.find_mcp_inflection_points:main"
 make_asv_count_table = "mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main"
 remove_ambiguous_reads = "mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main"
 rev_comp_se_primers = "mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main"
-standard_primer_matching = "mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main"
 mapseq_to_asv_table = "mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main"
 primer_val_classification = "mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main"
 amplicon_study_summary_generator = "mgnify_pipelines_toolkit.analysis.amplicon.study_summary_generator:cli"
@@ -89,7 +83,7 @@ fasta_to_delimited = "mgnify_pipelines_toolkit.utils.fasta_to_delimited:main"
 get_mpt_version = "mgnify_pipelines_toolkit.utils.get_mpt_version:main"
 [project.optional-dependencies]
-tests = [
+test = [
     "pytest>=8.3.5,<9",
     "pytest-md>=0.2.0",
     "pytest-workflow==2.1.0",

mgnify_pipelines_toolkit-1.1.2/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py DELETED Viewed

@@ -1,221 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Copyright 2024-2025 EMBL - European Bioinformatics Institute
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from collections import defaultdict, Counter
-import logging
-import gzip
-import os
-import pyfastx
-from mgnify_pipelines_toolkit.constants.regex_ambiguous_bases import (
-    _AMBIGUOUS_BASES_DICT,
-    _AMBIGUOUS_BASES_DICT_REV,
-)
-logging.basicConfig(level=logging.DEBUG)
-def split_dir_into_sample_paths(dir):
-    file_list = os.listdir(dir)
-    file_list = [
-        file
-        for file in file_list
-        if ".fastq" in file and ("_1" in file or "_2" in file)
-    ]
-    sample_set = set()
-    [sample_set.add(f"{dir}/{file.split('_')[0]}") for file in file_list]
-    sample_list = sorted(list(sample_set))
-    return sample_list
-def get_read_count(read_path: str, file_type: str = "fastq") -> int:
-    """
-    Get the read count of a FASTQ or FASTA file.
-    :param read_path: The path to the FASTQ or FASTA file.
-    :type read_path: str
-    :param fasta_type: The type of the file, either "fastq" or "fasta". Defaults to "fastq".
-    :type fasta_type: str
-    :return: The number of reads in the file.
-    :rtype: int
-    :raises ValueError: If the file type is not supported or the read count is not a positive integer.
-    """
-    read_count = 0
-    if file_type == "fasta":
-        fasta = pyfastx.Fasta(read_path, build_index=False)
-        read_count = sum(1 for _ in fasta)
-    elif file_type == "fastq":
-        fastq = pyfastx.Fastq(read_path, build_index=False)
-        read_count = sum(1 for _ in fastq)
-    else:
-        raise ValueError(
-            f"Invalid file_type {file_type}, it needs to be either 'fasta' or 'fastq'"
-        )
-    if read_count <= 0:
-        raise ValueError(f"Read count is not a positive integer: {read_count}")
-    return read_count
-def build_cons_seq(
-    cons_list,
-    read_count,
-    cons_threshold=0.80,
-    do_not_include=None,
-    counter=1,
-    max_line_count=None,
-):
-    """
-    Generate consensus sequence using a list of base conservation dictionaries most likely
-    generated by the `build_mcp_cons_dict_list()` function.
-    Also returns a list containing the conservation value of the most conserved base at every
-    position in the list of base conservation dictionaries.
-    """
-    cons_seq = ""
-    cons_confs = []
-    if do_not_include is None:
-        do_not_include = []
-    for count_dict in cons_list:
-        max_count = 0
-        cons_dict = defaultdict(float)
-        if counter in do_not_include:
-            counter += 1
-            cons_seq += "N"
-            continue
-        for base, count in count_dict.items():
-            if base not in ("A", "T", "C", "G"):
-                continue
-            if max_line_count is None:
-                cons_dict[base] = count / read_count
-            else:
-                cons_dict[base] = count / max_line_count
-            if count > max_count:
-                max_count = count
-        counter += 1
-        try:
-            if max_line_count is None:
-                max_prop = max_count / read_count
-            else:
-                max_prop = max_count / max_line_count
-            cons_bases = []
-            curr_prop = 0.0
-            sorted_cons_dict = dict(
-                sorted(cons_dict.items(), key=lambda x: x[1], reverse=True)
-            )
-            for base, prop in sorted_cons_dict.items():
-                cons_bases.append(base)
-                curr_prop += prop
-                if curr_prop >= cons_threshold:
-                    break
-            cons_bases = sorted(cons_bases)
-            if len(cons_bases) == 1:
-                cons_seq += cons_bases[0]
-            else:
-                amb_string = ",".join(cons_bases)
-                amb_base = _AMBIGUOUS_BASES_DICT_REV[amb_string]
-                cons_seq += amb_base
-        except ZeroDivisionError:
-            max_prop = 0.0
-        cons_confs.append(max_prop)
-    return cons_seq, cons_confs
-def primer_regex_query_builder(primer):
-    """
-    Takes an input nucleotide sequence that can contain IUPAC ambiguous codes
-    Returns a string formatted as a regex query that considers the different
-    potential bases valid at a position with am abiguity code.
-    """
-    query = ""
-    for char in primer:
-        if char in ("A", "C", "T", "G"):
-            query += char
-        else:
-            query += str(_AMBIGUOUS_BASES_DICT[char])
-    query = f"(.*{query}){{e<=1}}"
-    return query
-def build_mcp_cons_dict_list(mcp_count_dict, mcp_len):
-    """
-    Generate list of dictionaries of base conservation for mcp output (mcp_cons_list)
-    e.g. [{'A':0.9, 'C':0.1}, {'T':1.0}, ....] for every base position
-    """
-    mcp_cons_list = []
-    for i in range(mcp_len):
-        index_base_dict = defaultdict(int)
-        for mcp in mcp_count_dict.keys():
-            if len(mcp) < mcp_len:
-                continue
-            base = mcp[i]
-            index_base_dict[base] += mcp_count_dict[mcp]
-        mcp_cons_list.append(index_base_dict)
-    return mcp_cons_list
-def fetch_mcp(fastq, prefix_len, start=1, rev=False, max_line_count=None):
-    """
-    Generates the most common prefix sequences along with their counts in a fastq file.
-    Outputs dictionary containing counts for each generated MCP in the fastq.
-    """
-    selected_lines = []
-    with gzip.open(fastq, "rt") as file:
-        for i, line in enumerate(file):
-            line = line.strip()
-            if i % 4 == 1:
-                if not rev:
-                    selected_lines.append(line[start - 1 : start + prefix_len - 1])
-                else:
-                    rev_line = line[::-1]
-                    selected_lines.append(rev_line[start - 1 : start + prefix_len - 1])
-            if max_line_count is not None:
-                if len(selected_lines) > max_line_count:
-                    break
-    sequence_counts = Counter(selected_lines)
-    mcp_count_dict = dict(
-        sorted(sequence_counts.items(), key=lambda x: x[1], reverse=True)
-    )
-    return mcp_count_dict

mgnify_pipelines_toolkit-1.1.2/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py DELETED Viewed

@@ -1,164 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Copyright 2024-2025 EMBL - European Bioinformatics Institute
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import numpy as np
-from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
-    get_read_count,
-    build_cons_seq,
-    build_mcp_cons_dict_list,
-    fetch_mcp,
-)
-def parse_args(argv=None):
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-i",
-        "--input",
-        required=True,
-        type=str,
-        help="Path to fastq file to check for primers",
-    )
-    parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
-    parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
-    args = parser.parse_args(argv)
-    path = args.input
-    sample = args.sample
-    output = args.output
-    return path, sample, output
-def are_there_primers_in_this_sample(path, rev=False):
-    """
-    Predict the presence of primers based on windows of base conservation.
-    Takes a fastq file as input. Extracts proportion of most common base for the first 100 bases.
-    Computes the a threshold (Q3 - 0.15) based on this proportion and counts the number of bases below
-    it in windows of 10 bases.
-    If at least one of the first two windows contains at most one such a base, then the presence of a primer is flagged as true.
-    A primer is also flagged as true if the combined count of bases below Q3 is at most 4.
-    The output of this function is a boolean flag:
-        True if a primer was identified
-        False if a primer was not identified
-    """
-    read_count = get_read_count(
-        path, file_type="fastq"
-    )  # Get read count for fastq file
-    mcp_len = 100  # Script will look at first 100 base mcps (for rev=True, it will look at first 100 from 3' to 5')
-    mcp_count_dict = fetch_mcp(
-        path, mcp_len, rev=rev
-    )  # mcp dict where key is the mcp and value is the count
-    mcp_cons_list = build_mcp_cons_dict_list(
-        mcp_count_dict, mcp_len
-    )  # list of base conservation dicts for mcps
-    cons_seq, cons_confs = build_cons_seq(
-        mcp_cons_list, read_count
-    )  # get list of max base conservations for each index
-    window_size = 10
-    # Counter that will reset to 0 every 10 bases
-    window_count = 0
-    # Will append the window count to this list every 10 bases
-    window_count_list = []
-    # Compute Q3-based threshold
-    max_cons = np.quantile(cons_confs, 0.75)
-    threshold = max_cons - 0.15
-    if max_cons < 0.75:
-        threshold = 0.75
-    # Immediately return false (no primer) if the max conservation is less than 0.6
-    if max_cons < 0.6:
-        return False
-    # Loop through every base
-    for i, val in enumerate(cons_confs):
-        if i % window_size == 0 and i != 0:  # After looping through a window..
-            window_count_list.append(window_count)  # ..append window count
-            window_count = 0  # ..reset window count
-        if (
-            val < threshold
-        ):  # If the conservation at i is less than threshold, increment count for the window
-            window_count += 1
-    primer_flag = False  # Initialise primer flag as false
-    if (
-        1 in window_count_list[:2] or 0 in window_count_list[:2]
-    ):  # If window count is at most 1 of first two windows...
-        primer_flag = True  # ..primer flag is true
-    elif (
-        sum(window_count_list[:2]) <= 4
-    ):  # If sum of window counts of the first two windows is at most 4..
-        primer_flag = True  # ..primer flag is true
-    return primer_flag
-def save_out(results, sample_id, output):
-    """
-    Save primer presence flags into output .txt file.
-    1: primer exists
-    0: primer doesn't exist
-    First line will be the forward strand
-    Second line will be the reverse strand
-    """
-    with open(f"{output}/{sample_id}_general_primer_out.txt", "w") as fw:
-        fw.write(f"{results[0]}\n")
-        fw.write(f"{results[1]}\n")
-def main(argv=None):
-    path, sample, output = parse_args(argv)
-    fwd_primer_flag = are_there_primers_in_this_sample(
-        path
-    )  # Check for general primers in fwd
-    rev_primer_flag = are_there_primers_in_this_sample(
-        path, rev=True
-    )  # Check for general primers in rev
-    fwd_status = "0"
-    rev_status = "0"
-    # Flag for primer presence: 1 for yes 0 for no
-    if fwd_primer_flag:
-        print("Forward primer detected!")
-        fwd_status = 1
-    else:
-        print("No forward primer detected")
-    if rev_primer_flag:
-        print("Reverse primer detected!")
-        rev_status = 1
-    else:
-        print("No reverse primer detected")
-    save_out((fwd_status, rev_status), sample, output)  # Save primer flags to .txt file
-if __name__ == "__main__":
-    main()

mgnify-pipelines-toolkit 1.1.2__tar.gz → 1.2.1__tar.gz

Potentially problematic release.

mgnify-pipelines-toolkit 1.1.2.tar.gz → 1.2.1.tar.gz