PyPI - mgnify-pipelines-toolkit - Versions diffs - 1.2.6__py3-none-any.whl → 1.2.8__py3-none-any.whl - Mend

mgnify-pipelines-toolkit 1.2.6__py3-none-any.whl → 1.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (13)

mgnify_pipelines_toolkit/constants/db_labels.py CHANGED Viewed

@@ -19,3 +19,9 @@ TAXDB_LABELS = ["SILVA-SSU", "SILVA-LSU", "PR2", "UNITE", "ITSoneDB"]
 # taxonomy_summary for ASV method
 ASV_TAXDB_LABELS = ["DADA2-SILVA", "DADA2-PR2"]
+# taxonomy_summary labels for Raw Reads Analysis Pipeline
+RRAP_TAXDB_LABELS = ["silva-ssu", "silva-lsu", "motus"]
+# function_summary labels for Raw Reads Analysis Pipeline
+RRAP_FUNCDB_LABELS = ["pfam"]

mgnify_pipelines_toolkit/constants/tax_ranks.py CHANGED Viewed

@@ -35,7 +35,8 @@ _PR2_TAX_RANKS = [
     "Genus",
     "Species",
 ]
+_MOTUS_TAX_RANKS = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
 SHORT_TAX_RANKS = ["sk", "k", "p", "c", "o", "f", "g", "s"]
+SHORT_MOTUS_TAX_RANKS = ["k", "p", "c", "o", "f", "g", "s"]
 SHORT_PR2_TAX_RANKS = ["d", "sg", "dv", "sdv", "c", "o", "f", "g", "s"]

mgnify_pipelines_toolkit/schemas/schemas.py CHANGED Viewed

@@ -16,7 +16,7 @@
 import logging
 import re
-from enum import Enum
+from enum import StrEnum
 from typing import ClassVar, Optional, Type, Literal
 import pandas as pd
@@ -35,6 +35,7 @@ from pandera.engines.pandas_engine import PydanticModel
 from mgnify_pipelines_toolkit.constants.tax_ranks import (
     SHORT_TAX_RANKS,
     SHORT_PR2_TAX_RANKS,
+    SHORT_MOTUS_TAX_RANKS,
 )
@@ -70,7 +71,7 @@ class INSDCRunAccession(RootModel):
         return run
-class AmpliconResultTypes(str, Enum):
+class AmpliconResultTypes(StrEnum):
     """Class that models the two allowed statuses for successful amplicon analysis runs.
     Pydantic validates Enums very simply without needing to declare a new function.
     """
@@ -545,7 +546,7 @@ class TaxonRecord(Taxon):
 class PR2TaxonRecord(PR2Taxon):
     """Class for modelling the same thing as the preceding class, but for PR2 ranks."""
-    Count: int
+    count: int = Field(alias="Count")
 # This is the schema for the whole DF
@@ -573,6 +574,154 @@ class PR2TaxonSchema(pa.DataFrameModel):
         coerce = True
+class RawReadsStatusTypes(StrEnum):
+    """Class that models the four allowed statuses for successful raw reads analysis runs.
+    Pydantic validates Enums very simply without needing to declare a new function.
+    """
+    all_results = "all_results"
+    no_reads = "no_reads"
+    all_empty_results = "all_empty_results"
+    some_empty_results = "some_empty_results"
+class RawReadsPassedRunsRecord(BaseModel):
+    """Class defining a Pydantic model for a single "row" of a raw-reads pipeline passed runs file.
+    Uses the previous nine classes.
+    """
+    run: INSDCRunAccession
+    status: RawReadsStatusTypes
+class RawReadsNonINSDCSPassedRunsRecord(RawReadsPassedRunsRecord):
+    """Class modeling a very similar model as the preceding one, but with no INSDC-validation.
+    This is achieved by replacing the type of the runs with just a simple string so no validation
+    happens.
+    """
+    run: str
+# This is the schema for the whole DF
+class RawReadsPassedRunsSchema(pa.DataFrameModel):
+    """Class modelling a Pandera dataframe schema that uses the RawReadsPassedRunsRecord class as dtype.
+    This is what actually validates the generated dataframe when read by pandas.read_csv.
+    """
+    class Config:
+        """Config with dataframe-level data type."""
+        dtype = PydanticModel(RawReadsPassedRunsRecord)
+        coerce = True
+class RawReadsNonINSDCPassedRunsSchema(pa.DataFrameModel):
+    """Class modelling the same dataframe schema as the preceding one, except with no INSDC validation.
+    Uses the RawReadsNonINSDCSPassedRunsRecord as a dtype to achieve this.
+    """
+    class Config:
+        """Config with dataframe-level data type."""
+        dtype = PydanticModel(RawReadsNonINSDCSPassedRunsRecord)
+        coerce = True
+class MotusTaxRank(RootModel):
+    """Class for modelling a single Taxonomic Rank in mOTUs output.
+    Essentially is just a special string with validation of the structure:
+    `${rank}__${taxon}`
+    Where `${rank}` is one of the allowed short ranks defined by the imported
+    `SHORT_MOTUS_TAX_RANKS` variables.
+    And `${taxon}` is the actual taxon for that rank (this isn't validated).
+    It will also validate if the whole string is the permitted "unassigned" or "unclassified".
+    """
+    valid_tax_ranks: ClassVar = SHORT_MOTUS_TAX_RANKS
+    root: str = Field(
+        unique=True,
+        description="A single taxon in a taxonomy record",
+        examples=["sk__Bacteria", "p__Bacillota", "g__Tundrisphaera"],
+    )
+    @field_validator("root", mode="after")
+    @classmethod
+    def rank_structure_validity_check(cls, taxrank: str) -> bool:
+        taxrank_list = taxrank.split("__")
+        rank = taxrank_list[0]
+        if (
+            rank != ""
+            and not rank.capitalize() in {"Unclassified", "Unassigned"}
+            and rank not in cls.valid_tax_ranks
+        ):
+            raise ValueError(f"Invalid taxonomy rank {rank}.")
+        return taxrank
+class MotusTaxon(BaseModel):
+    """Class for modelling an entire MotusTaxon or mOTUs taxonomic assignment.
+    All of the ranks are optional, to model for the taxon being "Unclassified" or "Unassigned".
+    """
+    Kingdom: Optional[MotusTaxRank] = None
+    Phylum: Optional[MotusTaxRank] = None
+    Class: Optional[MotusTaxRank] = None
+    Order: Optional[MotusTaxRank] = None
+    Family: Optional[MotusTaxRank] = None
+    Genus: Optional[MotusTaxRank] = None
+    Species: Optional[MotusTaxRank] = None
+class MotusTaxonRecord(MotusTaxon):
+    """Class for modelling a single taxon record in a mOTUs taxonomy file.
+    It inherits the MotusTaxon class, and simply adds a Count field, modelling the read counts
+    for that particular MotusTaxon record.
+    """
+    count: int = Field(alias="Count")
+class MotusTaxonSchema(pa.DataFrameModel):
+    """Class modelling a Pandera dataframe schema that uses the MotusTaxonRecord class as dtype.
+    This is what actually validates the generated dataframe when read by pandas.read_csv.
+    """
+    class Config:
+        """Config with dataframe-level data type."""
+        dtype = PydanticModel(MotusTaxonRecord)
+        coerce = True
+class FunctionProfileRecord(BaseModel):
+    """Class for modelling a single taxon record in a functional profile file.
+    It models the read counts and coverage depth/breadth of each function (gene/protein)
+    for each specific record.
+    """
+    read_count: int
+    coverage_depth: float
+    coverage_breadth: float
+    class Config:
+        validate_by_name = True
+class FunctionProfileSchema(pa.DataFrameModel):
+    """Class modelling a Pandera dataframe schema that uses the FunctionProfileRecord class as dtype.
+    This is what actually validates the generated dataframe when read by pandas.read_csv.
+    """
+    class Config:
+        """Config with dataframe-level data type."""
+        dtype = PydanticModel(FunctionProfileRecord)
+        coerce = True
 def validate_dataframe(
     df: pd.DataFrame, schema: Type[pa.DataFrameModel], df_metadata: str
 ) -> DataFrameBase:

{mgnify_pipelines_toolkit-1.2.6.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.2.6
+Version: 1.2.8
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0

{mgnify_pipelines_toolkit-1.2.6.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/RECORD RENAMED Viewed

@@ -1,7 +1,7 @@
 mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=8yFhmHQXVDPXvRX8oWSANV3VMu0X-zNnz12u1fcGwTE,20649
-mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=ohguvrMSg7GuiiZ5aHj1DvCnfThKFUG4s13LUSMM0mo,8892
+mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=-g1FDwdEndWH9VvYLmc_NEs2l204kKjMHk65wag8T_s,8891
 mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=BLqhflblUegCvuQic16PrFXfIXlFWmGkmWJyl4wJoLQ,5040
 mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py,sha256=1aGOJX9tC7M1rnd0U2PeJ681sUo02wxk7_ycJqeVt6s,2216
 mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=-W_QmdmKAIqVC5n-RS8LX11hEQM4xdp5r1jkITB1CI8,5256
@@ -22,30 +22,31 @@ mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py,sha2
 mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py,sha256=eNichqFFmfPsa2J10IUm_PemVs9fBhbKa2vpDqEvJNU,21791
 mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=jUeA7I12YrtIqnm3hUxpdgsWfa2pP1ALGjb9OMKPcgY,10643
 mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=TPaKlYkoy37_XgYNOskWCCoXtPNku_k5ygSeK4fT1VQ,6689
-mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=1wblLbZl521digIUWoqneAu15gErzvN_oC--5T_xUdw,4582
+mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=lxe7R2RQFyNCzEm6YuNRrqKZLZOUPq5W1P23Pt2sKBU,4570
 mgnify_pipelines_toolkit/analysis/genomes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py,sha256=ltyNHwzaZZkK1ScH2vV2QV1eUXTHQUMYyadJwO-zSQY,16028
 mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py,sha256=kAGU5kQyj-Hlcdx32i-xOJSuHYYUDj-kqnyYHMohHGc,4477
-mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py,sha256=hggPqv9QawWAccm5tmru4VF9VnQAHF5LCXnqyLw_BWI,6727
+mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py,sha256=RaFopUjJI4UO1ttnSEHj7iUXpAL5-2FTbDXlhOmNy0s,25534
 mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py,sha256=ye0Jka6_lNn4dQGb2QG3YT46y7QK0QvyaIitIaS8JVQ,4026
 mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=UrU0CpZj3pfHZWI7Uuhv2a_C0JsO8pnVErY0sWGgNdw,4920
 mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=EH5RyzesLqsonnTQbSDs7kAOV6IskS4oyqZYlex1tAY,1934
 mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py,sha256=6Ck2NhwRWw66GctUtKDdPT5fwJhWFR_YOZq-Vxwoa8A,1996
 mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=7-U0DN1joVu0ifLOoDUK2Pfqy8rb1RDKT6khVg3jky0,5559
 mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py,sha256=sKAo_rKEyVAZXSaIFMkpSoYZxiWwXMA3XDA6Z-hbHgg,7904
-mgnify_pipelines_toolkit/constants/db_labels.py,sha256=omPINMylAjO2PxeFhSk2MbYNcGZH3P82optSlMey3dw,858
+mgnify_pipelines_toolkit/constants/db_labels.py,sha256=12mksTtAwTE1smLnemdoItxGw1AmtJPOzbnW2aGj0u0,1062
 mgnify_pipelines_toolkit/constants/ncrna.py,sha256=a_5hWp446S7BhRbe_JcydFgZM7sgPLuMlaiBvKWN_XM,1928
 mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=G-xrc9b8zdmPTaOICD2b3RCVeFAEOVkfRkIfotQ7gek,1193
-mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=kMq__kOJcbiwsgolkdvb-XLo3WMnJdEXgedjUyMOYjI,1081
+mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=ekZN5OcMBhDRcj7XB_27wQ8fEnmAqMJc4aQ3pv4BRmI,1229
 mgnify_pipelines_toolkit/constants/thresholds.py,sha256=1AMBmoHBR0WjXZpkwJ7_Q-gfJtHXuCA4tZ-uvPhF0Xc,1085
 mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=0bM4MwarFiM5yTcp5AbAmQ0o-q-gWy7kknir9zJ9R0A,1312
-mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pyDZvCuWbwccQF0D7c5BN1vv36wQdgcAUXU43_zAu74,18164
+mgnify_pipelines_toolkit/schemas/schemas.py,sha256=he9igC80YTR32v1e5NslwTgtdVySmnXwK9iY9IBPNBg,23133
 mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
 mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
-mgnify_pipelines_toolkit-1.2.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-mgnify_pipelines_toolkit-1.2.6.dist-info/METADATA,sha256=5XvhGHG2Zc5p8PNmR5ZYHUuRG1G6IIWT67t_FxkXkus,5775
-mgnify_pipelines_toolkit-1.2.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mgnify_pipelines_toolkit-1.2.6.dist-info/entry_points.txt,sha256=sHDxlHizt_iZPtkNp0EDuohDGvC4O12B57JtpUmHwYk,3123
-mgnify_pipelines_toolkit-1.2.6.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
-mgnify_pipelines_toolkit-1.2.6.dist-info/RECORD,,
+mgnify_pipelines_toolkit-1.2.8.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+mgnify_pipelines_toolkit-1.2.8.dist-info/METADATA,sha256=RkF31O7GjADzb2k96oZxbyWOmDvN1bKzIThNTb0e7Qg,5775
+mgnify_pipelines_toolkit-1.2.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mgnify_pipelines_toolkit-1.2.8.dist-info/entry_points.txt,sha256=7TJ8GgbKoX1xnQsOdWwMvwhIv4uuHCx7pMxKmZabPOs,3228
+mgnify_pipelines_toolkit-1.2.8.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
+mgnify_pipelines_toolkit-1.2.8.dist-info/RECORD,,

{mgnify_pipelines_toolkit-1.2.6.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/entry_points.txt RENAMED Viewed

@@ -6,7 +6,7 @@ assembly_study_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.st
 classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
 combined_gene_caller_merge = mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main
 convert_cmscan_to_cmsearch_tblout = mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main
-dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:main
+dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:cli
 fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
 fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
 generate_gaf = mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main
@@ -27,6 +27,7 @@ permute_primers = mgnify_pipelines_toolkit.analysis.amplicon.permute_primers:mai
 primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main
 process_dbcan_cazys = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_cazys:main
 process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_clusters:main
+rawreads_study_summary_generator = mgnify_pipelines_toolkit.analysis.rawreads.study_summary_generator:cli
 remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
 rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
 summarise_antismash_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_antismash_bgcs:main

{mgnify_pipelines_toolkit-1.2.6.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/WHEEL RENAMED Viewed

File without changes

{mgnify_pipelines_toolkit-1.2.6.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{mgnify_pipelines_toolkit-1.2.6.dist-info → mgnify_pipelines_toolkit-1.2.8.dist-info}/top_level.txt RENAMED Viewed

File without changes

mgnify-pipelines-toolkit 1.2.6__py3-none-any.whl → 1.2.8__py3-none-any.whl

Potentially problematic release.

mgnify-pipelines-toolkit 1.2.6__py3-none-any.whl → 1.2.8__py3-none-any.whl