mgnify-pipelines-toolkit 1.2.6__py3-none-any.whl → 1.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

@@ -19,3 +19,9 @@ TAXDB_LABELS = ["SILVA-SSU", "SILVA-LSU", "PR2", "UNITE", "ITSoneDB"]
19
19
 
20
20
  # taxonomy_summary for ASV method
21
21
  ASV_TAXDB_LABELS = ["DADA2-SILVA", "DADA2-PR2"]
22
+
23
+ # taxonomy_summary labels for Raw Reads Analysis Pipeline
24
+ RRAP_TAXDB_LABELS = ["silva-ssu", "silva-lsu", "motus"]
25
+
26
+ # function_summary labels for Raw Reads Analysis Pipeline
27
+ RRAP_FUNCDB_LABELS = ["pfam"]
@@ -35,7 +35,8 @@ _PR2_TAX_RANKS = [
35
35
  "Genus",
36
36
  "Species",
37
37
  ]
38
+ _MOTUS_TAX_RANKS = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
38
39
 
39
40
  SHORT_TAX_RANKS = ["sk", "k", "p", "c", "o", "f", "g", "s"]
40
-
41
+ SHORT_MOTUS_TAX_RANKS = ["k", "p", "c", "o", "f", "g", "s"]
41
42
  SHORT_PR2_TAX_RANKS = ["d", "sg", "dv", "sdv", "c", "o", "f", "g", "s"]
@@ -16,7 +16,7 @@
16
16
  import logging
17
17
  import re
18
18
 
19
- from enum import Enum
19
+ from enum import StrEnum
20
20
  from typing import ClassVar, Optional, Type, Literal
21
21
 
22
22
  import pandas as pd
@@ -35,6 +35,7 @@ from pandera.engines.pandas_engine import PydanticModel
35
35
  from mgnify_pipelines_toolkit.constants.tax_ranks import (
36
36
  SHORT_TAX_RANKS,
37
37
  SHORT_PR2_TAX_RANKS,
38
+ SHORT_MOTUS_TAX_RANKS,
38
39
  )
39
40
 
40
41
 
@@ -70,7 +71,7 @@ class INSDCRunAccession(RootModel):
70
71
  return run
71
72
 
72
73
 
73
- class AmpliconResultTypes(str, Enum):
74
+ class AmpliconResultTypes(StrEnum):
74
75
  """Class that models the two allowed statuses for successful amplicon analysis runs.
75
76
  Pydantic validates Enums very simply without needing to declare a new function.
76
77
  """
@@ -545,7 +546,7 @@ class TaxonRecord(Taxon):
545
546
  class PR2TaxonRecord(PR2Taxon):
546
547
  """Class for modelling the same thing as the preceding class, but for PR2 ranks."""
547
548
 
548
- Count: int
549
+ count: int = Field(alias="Count")
549
550
 
550
551
 
551
552
  # This is the schema for the whole DF
@@ -573,6 +574,154 @@ class PR2TaxonSchema(pa.DataFrameModel):
573
574
  coerce = True
574
575
 
575
576
 
577
+ class RawReadsStatusTypes(StrEnum):
578
+ """Class that models the four allowed statuses for successful raw reads analysis runs.
579
+ Pydantic validates Enums very simply without needing to declare a new function.
580
+ """
581
+
582
+ all_results = "all_results"
583
+ no_reads = "no_reads"
584
+ all_empty_results = "all_empty_results"
585
+ some_empty_results = "some_empty_results"
586
+
587
+
588
+ class RawReadsPassedRunsRecord(BaseModel):
589
+ """Class defining a Pydantic model for a single "row" of a raw-reads pipeline passed runs file.
590
+ Uses the INSDCRunAccession and RawReadsStatusTypes classes for its two fields.
591
+ """
592
+
593
+ run: INSDCRunAccession
594
+ status: RawReadsStatusTypes
595
+
596
+
597
+ class RawReadsNonINSDCSPassedRunsRecord(RawReadsPassedRunsRecord):
598
+ Class modelling a record very similar to the preceding one, but with no INSDC validation.
599
+ This is achieved by replacing the type of the runs with just a simple string so no validation
600
+ happens.
601
+ """
602
+
603
+ run: str
604
+
605
+
606
+ # This is the schema for the whole DF
607
+ class RawReadsPassedRunsSchema(pa.DataFrameModel):
608
+ """Class modelling a Pandera dataframe schema that uses the RawReadsPassedRunsRecord class as dtype.
609
+ This is what actually validates the generated dataframe when read by pandas.read_csv.
610
+ """
611
+
612
+ class Config:
613
+ """Config with dataframe-level data type."""
614
+
615
+ dtype = PydanticModel(RawReadsPassedRunsRecord)
616
+ coerce = True
617
+
618
+
619
+ class RawReadsNonINSDCPassedRunsSchema(pa.DataFrameModel):
620
+ """Class modelling the same dataframe schema as the preceding one, except with no INSDC validation.
621
+ Uses the RawReadsNonINSDCSPassedRunsRecord as a dtype to achieve this.
622
+ """
623
+
624
+ class Config:
625
+ """Config with dataframe-level data type."""
626
+
627
+ dtype = PydanticModel(RawReadsNonINSDCSPassedRunsRecord)
628
+ coerce = True
629
+
630
+
631
+ class MotusTaxRank(RootModel):
632
+ """Class for modelling a single Taxonomic Rank in mOTUs output.
633
+ Essentially is just a special string with validation of the structure:
634
+ `${rank}__${taxon}`
635
+ Where `${rank}` is one of the allowed short ranks defined by the imported
636
+ `SHORT_MOTUS_TAX_RANKS` variable.
637
+ And `${taxon}` is the actual taxon for that rank (this isn't validated).
638
+ It also accepts a whole string that is the permitted "unassigned" or "unclassified".
639
+ """
640
+
641
+ valid_tax_ranks: ClassVar = SHORT_MOTUS_TAX_RANKS
642
+
643
+ root: str = Field(
644
+ unique=True,
645
+ description="A single taxon in a taxonomy record",
646
+ examples=["sk__Bacteria", "p__Bacillota", "g__Tundrisphaera"],
647
+ )
648
+
649
+ @field_validator("root", mode="after")
650
+ @classmethod
651
+ def rank_structure_validity_check(cls, taxrank: str) -> bool:
652
+ taxrank_list = taxrank.split("__")
653
+ rank = taxrank_list[0]
654
+ if (
655
+ rank != ""
656
+ and not rank.capitalize() in {"Unclassified", "Unassigned"}
657
+ and rank not in cls.valid_tax_ranks
658
+ ):
659
+ raise ValueError(f"Invalid taxonomy rank {rank}.")
660
+
661
+ return taxrank
662
+
663
+
664
+ class MotusTaxon(BaseModel):
665
+ """Class for modelling an entire MotusTaxon or mOTUs taxonomic assignment.
666
+ All of the ranks are optional, to model for the taxon being "Unclassified" or "Unassigned".
667
+ """
668
+
669
+ Kingdom: Optional[MotusTaxRank] = None
670
+ Phylum: Optional[MotusTaxRank] = None
671
+ Class: Optional[MotusTaxRank] = None
672
+ Order: Optional[MotusTaxRank] = None
673
+ Family: Optional[MotusTaxRank] = None
674
+ Genus: Optional[MotusTaxRank] = None
675
+ Species: Optional[MotusTaxRank] = None
676
+
677
+
678
+ class MotusTaxonRecord(MotusTaxon):
679
+ """Class for modelling a single taxon record in a mOTUs taxonomy file.
680
+ It inherits the MotusTaxon class, and simply adds a Count field, modelling the read counts
681
+ for that particular MotusTaxon record.
682
+ """
683
+
684
+ count: int = Field(alias="Count")
685
+
686
+
687
+ class MotusTaxonSchema(pa.DataFrameModel):
688
+ """Class modelling a Pandera dataframe schema that uses the MotusTaxonRecord class as dtype.
689
+ This is what actually validates the generated dataframe when read by pandas.read_csv.
690
+ """
691
+
692
+ class Config:
693
+ """Config with dataframe-level data type."""
694
+
695
+ dtype = PydanticModel(MotusTaxonRecord)
696
+ coerce = True
697
+
698
+
699
+ class FunctionProfileRecord(BaseModel):
700
+ """Class for modelling a single record in a functional profile file.
701
+ It models the read counts and coverage depth/breadth of each function (gene/protein)
702
+ for each specific record.
703
+ """
704
+
705
+ read_count: int
706
+ coverage_depth: float
707
+ coverage_breadth: float
708
+
709
+ class Config:
710
+ validate_by_name = True
711
+
712
+
713
+ class FunctionProfileSchema(pa.DataFrameModel):
714
+ """Class modelling a Pandera dataframe schema that uses the FunctionProfileRecord class as dtype.
715
+ This is what actually validates the generated dataframe when read by pandas.read_csv.
716
+ """
717
+
718
+ class Config:
719
+ """Config with dataframe-level data type."""
720
+
721
+ dtype = PydanticModel(FunctionProfileRecord)
722
+ coerce = True
723
+
724
+
576
725
  def validate_dataframe(
577
726
  df: pd.DataFrame, schema: Type[pa.DataFrameModel], df_metadata: str
578
727
  ) -> DataFrameBase:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 1.2.6
3
+ Version: 1.2.8
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -1,7 +1,7 @@
1
1
  mgnify_pipelines_toolkit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  mgnify_pipelines_toolkit/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py,sha256=8yFhmHQXVDPXvRX8oWSANV3VMu0X-zNnz12u1fcGwTE,20649
4
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=ohguvrMSg7GuiiZ5aHj1DvCnfThKFUG4s13LUSMM0mo,8892
4
+ mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py,sha256=-g1FDwdEndWH9VvYLmc_NEs2l204kKjMHk65wag8T_s,8891
5
5
  mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py,sha256=BLqhflblUegCvuQic16PrFXfIXlFWmGkmWJyl4wJoLQ,5040
6
6
  mgnify_pipelines_toolkit/analysis/amplicon/permute_primers.py,sha256=1aGOJX9tC7M1rnd0U2PeJ681sUo02wxk7_ycJqeVt6s,2216
7
7
  mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=-W_QmdmKAIqVC5n-RS8LX11hEQM4xdp5r1jkITB1CI8,5256
@@ -22,30 +22,31 @@ mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py,sha2
22
22
  mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py,sha256=eNichqFFmfPsa2J10IUm_PemVs9fBhbKa2vpDqEvJNU,21791
23
23
  mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=jUeA7I12YrtIqnm3hUxpdgsWfa2pP1ALGjb9OMKPcgY,10643
24
24
  mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=TPaKlYkoy37_XgYNOskWCCoXtPNku_k5ygSeK4fT1VQ,6689
25
- mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=1wblLbZl521digIUWoqneAu15gErzvN_oC--5T_xUdw,4582
25
+ mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=lxe7R2RQFyNCzEm6YuNRrqKZLZOUPq5W1P23Pt2sKBU,4570
26
26
  mgnify_pipelines_toolkit/analysis/genomes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
+ mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py,sha256=ltyNHwzaZZkK1ScH2vV2QV1eUXTHQUMYyadJwO-zSQY,16028
27
28
  mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
29
  mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py,sha256=kAGU5kQyj-Hlcdx32i-xOJSuHYYUDj-kqnyYHMohHGc,4477
29
- mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py,sha256=hggPqv9QawWAccm5tmru4VF9VnQAHF5LCXnqyLw_BWI,6727
30
+ mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py,sha256=RaFopUjJI4UO1ttnSEHj7iUXpAL5-2FTbDXlhOmNy0s,25534
30
31
  mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py,sha256=ye0Jka6_lNn4dQGb2QG3YT46y7QK0QvyaIitIaS8JVQ,4026
31
32
  mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=UrU0CpZj3pfHZWI7Uuhv2a_C0JsO8pnVErY0sWGgNdw,4920
32
33
  mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=EH5RyzesLqsonnTQbSDs7kAOV6IskS4oyqZYlex1tAY,1934
33
34
  mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py,sha256=6Ck2NhwRWw66GctUtKDdPT5fwJhWFR_YOZq-Vxwoa8A,1996
34
35
  mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=7-U0DN1joVu0ifLOoDUK2Pfqy8rb1RDKT6khVg3jky0,5559
35
36
  mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py,sha256=sKAo_rKEyVAZXSaIFMkpSoYZxiWwXMA3XDA6Z-hbHgg,7904
36
- mgnify_pipelines_toolkit/constants/db_labels.py,sha256=omPINMylAjO2PxeFhSk2MbYNcGZH3P82optSlMey3dw,858
37
+ mgnify_pipelines_toolkit/constants/db_labels.py,sha256=12mksTtAwTE1smLnemdoItxGw1AmtJPOzbnW2aGj0u0,1062
37
38
  mgnify_pipelines_toolkit/constants/ncrna.py,sha256=a_5hWp446S7BhRbe_JcydFgZM7sgPLuMlaiBvKWN_XM,1928
38
39
  mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=G-xrc9b8zdmPTaOICD2b3RCVeFAEOVkfRkIfotQ7gek,1193
39
- mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=kMq__kOJcbiwsgolkdvb-XLo3WMnJdEXgedjUyMOYjI,1081
40
+ mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=ekZN5OcMBhDRcj7XB_27wQ8fEnmAqMJc4aQ3pv4BRmI,1229
40
41
  mgnify_pipelines_toolkit/constants/thresholds.py,sha256=1AMBmoHBR0WjXZpkwJ7_Q-gfJtHXuCA4tZ-uvPhF0Xc,1085
41
42
  mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=0bM4MwarFiM5yTcp5AbAmQ0o-q-gWy7kknir9zJ9R0A,1312
42
- mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pyDZvCuWbwccQF0D7c5BN1vv36wQdgcAUXU43_zAu74,18164
43
+ mgnify_pipelines_toolkit/schemas/schemas.py,sha256=he9igC80YTR32v1e5NslwTgtdVySmnXwK9iY9IBPNBg,23133
43
44
  mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
45
  mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
45
46
  mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
46
- mgnify_pipelines_toolkit-1.2.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
47
- mgnify_pipelines_toolkit-1.2.6.dist-info/METADATA,sha256=5XvhGHG2Zc5p8PNmR5ZYHUuRG1G6IIWT67t_FxkXkus,5775
48
- mgnify_pipelines_toolkit-1.2.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
49
- mgnify_pipelines_toolkit-1.2.6.dist-info/entry_points.txt,sha256=sHDxlHizt_iZPtkNp0EDuohDGvC4O12B57JtpUmHwYk,3123
50
- mgnify_pipelines_toolkit-1.2.6.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
51
- mgnify_pipelines_toolkit-1.2.6.dist-info/RECORD,,
47
+ mgnify_pipelines_toolkit-1.2.8.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
48
+ mgnify_pipelines_toolkit-1.2.8.dist-info/METADATA,sha256=RkF31O7GjADzb2k96oZxbyWOmDvN1bKzIThNTb0e7Qg,5775
49
+ mgnify_pipelines_toolkit-1.2.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
50
+ mgnify_pipelines_toolkit-1.2.8.dist-info/entry_points.txt,sha256=7TJ8GgbKoX1xnQsOdWwMvwhIv4uuHCx7pMxKmZabPOs,3228
51
+ mgnify_pipelines_toolkit-1.2.8.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
52
+ mgnify_pipelines_toolkit-1.2.8.dist-info/RECORD,,
@@ -6,7 +6,7 @@ assembly_study_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.st
6
6
  classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
7
7
  combined_gene_caller_merge = mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main
8
8
  convert_cmscan_to_cmsearch_tblout = mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main
9
- dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:main
9
+ dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:cli
10
10
  fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
11
11
  fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
12
12
  generate_gaf = mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main
@@ -27,6 +27,7 @@ permute_primers = mgnify_pipelines_toolkit.analysis.amplicon.permute_primers:mai
27
27
  primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main
28
28
  process_dbcan_cazys = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_cazys:main
29
29
  process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbcan_result_clusters:main
30
+ rawreads_study_summary_generator = mgnify_pipelines_toolkit.analysis.rawreads.study_summary_generator:cli
30
31
  remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
31
32
  rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
32
33
  summarise_antismash_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_antismash_bgcs:main