PyPI - mgnify-pipelines-toolkit - Versions diffs - 0.1.8__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

mgnify-pipelines-toolkit 0.1.8__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (16) hide show

mgnify_pipelines_toolkit/schemas/schemas.py ADDED Viewed

@@ -0,0 +1,217 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright 2024 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+from enum import Enum
+from typing import ClassVar, Optional
+import pandera as pa
+from pydantic import (
+    Field,
+    BaseModel,
+    field_validator,
+    RootModel,
+)
+from pandera.engines.pandas_engine import PydanticModel
+from mgnify_pipelines_toolkit.constants.tax_ranks import (
+    SHORT_TAX_RANKS,
+    SHORT_PR2_TAX_RANKS,
+)
+class INSDCRunAccession(RootModel):
+    """Class for modelling for INSDC-specific run accessions.
+    Essentially is just a special string with regex-based validation of the accession.
+    """
+    # RootModel example:
+    # https://stackoverflow.com/questions/78393675/how-to-make-a-custom-type-inheriting-from-uuid-work-as-a-pydantic-model
+    root: str = Field(
+        unique=True,
+        description="The run needs to be a valid ENA accession",
+        examples=["ERR123456", "DRR789012", "SRR345678"],
+    )
+    @field_validator("root", mode="after")
+    @classmethod
+    def run_validity_check(cls, run: str) -> bool:
+        """Checks that the run string matches the regex code of an INSDC run accession.
+        Throws a `ValueError` exception if not, which is what Pydantic prefers for validation errors.
+        """
+        run_accession_regex = "(E|D|S)RR[0-9]{6,}"
+        regex_res = re.match(run_accession_regex, run)
+        if regex_res is None:
+            raise ValueError(
+                f"Accession `{run}` does not fit INSDC format [ERR*,SRR*,DRR*]."
+            )
+        return run
+class AmpliconResultTypes(str, Enum):
+    """Enumerate the two statuses allowed for a successful amplicon analysis run.
+
+    Members are also `str` instances, so each one compares equal to its plain
+    string value. No custom validator is declared: Pydantic checks Enum-typed
+    fields against the declared members.
+    """
+    # NOTE(review): presumably "all_results" means every output was produced and
+    # "no_asvs" means the run succeeded without ASVs - confirm with the pipeline.
+    all_results = "all_results"
+    no_asvs = "no_asvs"
+class AmpliconPassedRunsRecord(BaseModel):
+    """Pydantic model for a single "row" of an amplicon passed runs file.
+
+    Pairs an INSDC-validated run accession with the status of that run.
+    """
+    # Run accession, validated by INSDCRunAccession.
+    run: INSDCRunAccession
+    # Must be one of the AmpliconResultTypes members.
+    status: AmpliconResultTypes
+class AmpliconNonINSDCSPassedRunsRecord(BaseModel):
+    """Pydantic model for a single row of a passed runs file without INSDC validation.
+
+    Same fields as AmpliconPassedRunsRecord, but `run` is typed as a plain
+    string so no accession-format check is applied to it.
+    """
+    # Any string is accepted as the run identifier.
+    run: str
+    # Must be one of the AmpliconResultTypes members.
+    status: AmpliconResultTypes
+# This is the schema for the whole DF
+class AmpliconPassedRunsSchema(pa.DataFrameModel):
+    """Pandera dataframe schema whose dtype is the AmpliconPassedRunsRecord model.
+
+    Intended for validating the dataframe produced by reading a passed runs
+    file (e.g. with pandas.read_csv).
+    """
+    class Config:
+        """Config with dataframe-level data type."""
+        dtype = PydanticModel(AmpliconPassedRunsRecord)
+        # NOTE(review): pandera's pydantic integration is documented as needing
+        # coercion enabled for PydanticModel dtypes - confirm before changing.
+        coerce = True
+class AmpliconNonINSDCPassedRunsSchema(pa.DataFrameModel):
+    """Pandera dataframe schema for passed runs files without INSDC validation.
+
+    Uses AmpliconNonINSDCSPassedRunsRecord as the dataframe-level dtype, so the
+    run column is accepted as any string.
+    """
+    class Config:
+        """Config with dataframe-level data type."""
+        dtype = PydanticModel(AmpliconNonINSDCSPassedRunsRecord)
+        # NOTE(review): pandera's pydantic integration is documented as needing
+        # coercion enabled for PydanticModel dtypes - confirm before changing.
+        coerce = True
+class TaxRank(RootModel):
+    """Class for modelling a single Taxonomic Rank.
+    Essentially is just a special string with validation of the structure:
+    `${rank}__${taxon}`
+    Where `${rank}` is one of the allowed short ranks defined by the imported
+    `SHORT_TAX_RANKS` and `SHORT_PR2_TAX_RANKS` variables.
+    And `${taxon}` is the actual taxon for that rank (this isn't validated).
+    It will also validate if the whole string is the permitted "Unclassified".
+    """
+    valid_tax_ranks: ClassVar = SHORT_TAX_RANKS + SHORT_PR2_TAX_RANKS
+    root: str = Field(
+        unique=True,
+        description="A single taxon in a taxonomy record",
+        examples=["sk__Bacteria", "p__Bacillota", "g__Tundrisphaera"],
+    )
+    @field_validator("root", mode="after")
+    @classmethod
+    def rank_structure_validity_check(cls, taxrank: str) -> bool:
+        taxrank_list = taxrank.split("__")
+        rank = taxrank_list[0]
+        if rank != "" and rank != "Unclassified" and rank not in cls.valid_tax_ranks:
+            raise ValueError(f"Invalid taxonomy rank {rank}.")
+        return taxrank
+# TODO: see if we can simplify the declaration of two Taxon classes by using one of these solutions
+# None of the solutions have a model-only way of doing it, but worth considering maybe
+# https://stackoverflow.com/questions/76537360/initialize-one-of-two-pydantic-models-depending-on-an-init-parameter
+class Taxon(BaseModel):
+    """Pydantic model for an entire taxon, i.e. one full taxonomic assignment.
+
+    Every rank is optional and defaults to None, which lets the model
+    represent an "Unclassified" or partially resolved taxon.
+    """
+    # One optional TaxRank per rank.
+    # NOTE(review): the capitalised field names presumably mirror the column
+    # headers of the taxonomy file - confirm against the file writer.
+    Superkingdom: Optional[TaxRank] = None
+    Kingdom: Optional[TaxRank] = None
+    Phylum: Optional[TaxRank] = None
+    Class: Optional[TaxRank] = None
+    Order: Optional[TaxRank] = None
+    Family: Optional[TaxRank] = None
+    Genus: Optional[TaxRank] = None
+    Species: Optional[TaxRank] = None
+class PR2Taxon(Taxon):
+    """Pydantic model for an entire taxon that also carries the PR2 ranks.
+
+    Inherits every optional rank of Taxon and adds the four optional ranks below.
+    """
+    Domain: Optional[TaxRank] = None
+    Supergroup: Optional[TaxRank] = None
+    Division: Optional[TaxRank] = None
+    Subdivision: Optional[TaxRank] = None
+class TaxonRecord(Taxon):
+    """Pydantic model for a single taxon record in a taxonomy file.
+
+    Inherits the optional ranks of Taxon and adds a required Count field
+    holding the read count for that record.
+    """
+    # Required: unlike the rank fields it has no default.
+    Count: int
+class PR2TaxonRecord(PR2Taxon):
+    """Pydantic model for a single taxon record in a PR2 taxonomy file.
+
+    Inherits the optional ranks of PR2Taxon and adds a required Count field
+    holding the read count for that record.
+    """
+    # Required: unlike the rank fields it has no default.
+    Count: int
+# This is the schema for the whole DF
+class TaxonSchema(pa.DataFrameModel):
+    """Pandera dataframe schema whose dtype is the TaxonRecord model.
+
+    Intended for validating the dataframe produced by reading a taxonomy file
+    (e.g. with pandas.read_csv).
+    """
+    class Config:
+        """Config with dataframe-level data type."""
+        dtype = PydanticModel(TaxonRecord)
+        # NOTE(review): pandera's pydantic integration is documented as needing
+        # coercion enabled for PydanticModel dtypes - confirm before changing.
+        coerce = True
+class PR2TaxonSchema(pa.DataFrameModel):
+    """Pandera dataframe schema for PR2 taxonomy files.
+
+    Uses the PR2TaxonRecord model as the dataframe-level dtype.
+    """
+    class Config:
+        """Config with dataframe-level data type."""
+        dtype = PydanticModel(PR2TaxonRecord)
+        # NOTE(review): pandera's pydantic integration is documented as needing
+        # coercion enabled for PydanticModel dtypes - confirm before changing.
+        coerce = True

{mgnify_pipelines_toolkit-0.1.8.dist-info → mgnify_pipelines_toolkit-0.2.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: mgnify_pipelines_toolkit
-Version: 0.1.8
+Version: 0.2.0
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0
@@ -11,24 +11,30 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: biopython ==1.82
-Requires-Dist: numpy ==1.26.0
-Requires-Dist: pandas ==2.0.2
-Requires-Dist: regex ==2023.12.25
-Provides-Extra: dev
-Requires-Dist: mgnify-pipelines-toolkit[tests] ; extra == 'dev'
-Requires-Dist: pre-commit ==3.8.0 ; extra == 'dev'
-Requires-Dist: black ==24.8.0 ; extra == 'dev'
-Requires-Dist: flake8 ==7.1.1 ; extra == 'dev'
-Requires-Dist: pep8-naming ==0.14.1 ; extra == 'dev'
+Requires-Dist: biopython==1.82
+Requires-Dist: numpy==1.26.0
+Requires-Dist: pandas==2.0.2
+Requires-Dist: regex==2023.12.25
+Requires-Dist: requests==2.32.3
+Requires-Dist: click==8.1.7
+Requires-Dist: pandera==0.22.1
 Provides-Extra: tests
-Requires-Dist: pytest ==7.4.0 ; extra == 'tests'
-Requires-Dist: pytest-md ==0.2.0 ; extra == 'tests'
-Requires-Dist: pytest-workflow ==2.0.1 ; extra == 'tests'
-Requires-Dist: biopython ==1.82 ; extra == 'tests'
-Requires-Dist: pandas ==2.0.2 ; extra == 'tests'
-Requires-Dist: numpy ==1.26.0 ; extra == 'tests'
-Requires-Dist: regex ==2023.12.25 ; extra == 'tests'
+Requires-Dist: pytest==7.4.0; extra == "tests"
+Requires-Dist: pytest-md==0.2.0; extra == "tests"
+Requires-Dist: pytest-workflow==2.0.1; extra == "tests"
+Requires-Dist: biopython==1.82; extra == "tests"
+Requires-Dist: pandas==2.0.2; extra == "tests"
+Requires-Dist: numpy==1.26.0; extra == "tests"
+Requires-Dist: regex==2023.12.25; extra == "tests"
+Requires-Dist: requests==2.32.3; extra == "tests"
+Requires-Dist: click==8.1.7; extra == "tests"
+Requires-Dist: pandera==0.22.1; extra == "tests"
+Provides-Extra: dev
+Requires-Dist: mgnify_pipelines_toolkit[tests]; extra == "dev"
+Requires-Dist: pre-commit==3.8.0; extra == "dev"
+Requires-Dist: black==24.8.0; extra == "dev"
+Requires-Dist: flake8==7.1.1; extra == "dev"
+Requires-Dist: pep8-naming==0.14.1; extra == "dev"
 # mgnify-pipelines-toolkit

{mgnify_pipelines_toolkit-0.1.8.dist-info → mgnify_pipelines_toolkit-0.2.0.dist-info}/RECORD RENAMED Viewed

@@ -12,23 +12,32 @@ mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=d
 mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=8vwH6PY-XwMZhaUo08tOwdFsoREfNumvvDawTb9Y98U,3168
 mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=19NgCYE12bEvRBVibhZtZywwRiMdiBUBJjzL4by3_qo,1717
 mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=RDPsaWKf0wIDwvCHXyRCh2zSJf3y9E7uOhHjaAeX8bY,11099
+mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py,sha256=69iK8vtG5xFgYQ-KJiSQlaxuhSoxzcO59eNLyDS3nm0,4323
+mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=OODl3XhLvksvG5RZn1iHZlg9L3DXiWIkyxJ6o-y6oeg,6949
+mgnify_pipelines_toolkit/analysis/assembly/cgc_merge.py,sha256=u6r_1GRGgBAJQvU_t5Rtl3ZYjTtGJGd5yHCobtL9ob0,15405
+mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py,sha256=U1Ls3O0CQmukmoyUwEAEN11jHUKuCdS-qVkr5ai243I,3582
+mgnify_pipelines_toolkit/analysis/assembly/go_utils.py,sha256=vsYaFJ_cmbo6DXlWs_X8wpZJfMQOq1CrLX4-3owmYjI,5447
+mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=RthgLO3YTO_JGMC7Nx2JDrowXRimnOtVUDkM1l31rt4,5834
 mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py,sha256=H5ccd1e_e5dk8vhVOvHLK1lknYbRPbnqPjULCYnU0FQ,4021
 mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=xl5HduWtGPWiI9yqsjQ3itIzwHSxF2ig5KgjLXmj9EE,4772
 mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=DTX7S1P_BkGPEeDkbmUn1YoB247hpdNIe5rdFdRYDdA,1929
 mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py,sha256=XV1vjkjIHhzouM1k5hu_51XK_mgC_EOOGDN3mx4LOvc,1991
 mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=exzWyuK0YxDiVSu4WX2H7g-uT5Y00w_EmrFqSHjRObU,5554
+mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py,sha256=aWD-1B_fJg4rYZj2p8t8CUZdG1lDSo-oeFtLvjLgsak,13680
+mgnify_pipelines_toolkit/constants/db_labels.py,sha256=_2sGzTlfX7unGqkLylQFEUWNPQ8NZnQMtzlfVFuWtyU,853
 mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=dCP3u_Qo-JMk3aqVapkqEbVUGE06jBQmUH6bB3bT8k0,1088
 mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=_2UTWfHKJyyFkIRQIPM2wDf-QkRTdLJ4xsA6gAkY9f4,1188
-mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=VaHL4mbof_9Gl7Ca3b2UkqjRqjAAvBYqprfbchae480,942
+mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=63dQlW7jAjLPOSCT670QCS5WhTp13vwaHqfmFYbKMyg,1076
 mgnify_pipelines_toolkit/constants/thresholds.py,sha256=zz8paGQfZAU8tT-RbSGpzZ1Aopf77yEs97BAblHH5fk,964
 mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=jbOB_bTnW2TRjmdF7IS1A7nNOLt-lGnGyVXUHu0TmvQ,1307
+mgnify_pipelines_toolkit/schemas/schemas.py,sha256=fd2xCoA1Ty-XaMG9U_gxNcBokHiYENbA85n9YTsqbpU,7098
 mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=GbNT7clHso21w_1PbPpWKVRd5bNs_MDbGXt8XVIGl2o,3991
 mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=zsQ4TuR4vpqYa67MgIdopdscsS0DVJdy4enRe1nCjSs,793
-mgnify_pipelines_toolkit-0.1.8.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mgnify_pipelines_toolkit-0.1.8.dist-info/METADATA,sha256=Uf-ukd_8rWTprPyipZQXTy4ZKdpxNezdwwPNwtNFbRk,5859
-mgnify_pipelines_toolkit-0.1.8.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
-mgnify_pipelines_toolkit-0.1.8.dist-info/entry_points.txt,sha256=tCZ7ijAgfIn47xXGxNtoZbHTDyUfHjUzjXg-NBRlj6g,1646
-mgnify_pipelines_toolkit-0.1.8.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
-mgnify_pipelines_toolkit-0.1.8.dist-info/RECORD,,
+mgnify_pipelines_toolkit-0.2.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mgnify_pipelines_toolkit-0.2.0.dist-info/METADATA,sha256=TR0FyKtC0Xyj0zvDCPiYsI6bGbZI9GkQ8fiC1WWomEk,6068
+mgnify_pipelines_toolkit-0.2.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+mgnify_pipelines_toolkit-0.2.0.dist-info/entry_points.txt,sha256=60Nov738JAon-uZXUqqjOGy4TXxgS4xtxqYhAi12HY0,2084
+mgnify_pipelines_toolkit-0.2.0.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+mgnify_pipelines_toolkit-0.2.0.dist-info/RECORD,,

{mgnify_pipelines_toolkit-0.1.8.dist-info → mgnify_pipelines_toolkit-0.2.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.2.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{mgnify_pipelines_toolkit-0.1.8.dist-info → mgnify_pipelines_toolkit-0.2.0.dist-info}/entry_points.txt RENAMED Viewed

@@ -1,11 +1,14 @@
 [console_scripts]
+add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main
 are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
 assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
 assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
+cgc_merge = mgnify_pipelines_toolkit.analysis.assembly.cgc_merge:combine_main
 classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
 fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
 fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
 find_mcp_inflection_points = mgnify_pipelines_toolkit.analysis.amplicon.find_mcp_inflection_points:main
+generate_gaf = mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main
 get_mpt_version = mgnify_pipelines_toolkit.utils.get_mpt_version:main
 get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
 get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
@@ -17,3 +20,5 @@ primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_va
 remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
 rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
 standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
+study_summary_generator = mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:main
+summarise_goslims = mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims:main

{mgnify_pipelines_toolkit-0.1.8.dist-info → mgnify_pipelines_toolkit-0.2.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{mgnify_pipelines_toolkit-0.1.8.dist-info → mgnify_pipelines_toolkit-0.2.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

mgnify-pipelines-toolkit 0.1.8__py3-none-any.whl → 0.2.0__py3-none-any.whl

Potentially problematic release.

mgnify-pipelines-toolkit 0.1.8__py3-none-any.whl → 0.2.0__py3-none-any.whl