mgnify-pipelines-toolkit 0.1.8__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

@@ -0,0 +1,217 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2024 EMBL - European Bioinformatics Institute
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import re
18
+
19
+ from enum import Enum
20
+ from typing import ClassVar, Optional
21
+ import pandera as pa
22
+
23
+ from pydantic import (
24
+ Field,
25
+ BaseModel,
26
+ field_validator,
27
+ RootModel,
28
+ )
29
+ from pandera.engines.pandas_engine import PydanticModel
30
+
31
+ from mgnify_pipelines_toolkit.constants.tax_ranks import (
32
+ SHORT_TAX_RANKS,
33
+ SHORT_PR2_TAX_RANKS,
34
+ )
35
+
36
+
37
class INSDCRunAccession(RootModel):
    """Model an INSDC run accession as a regex-validated string.

    The wrapped ``root`` value must be a complete accession: one of the
    prefixes ``ERR``, ``DRR`` or ``SRR`` followed by at least six digits.
    """

    # RootModel example:
    # https://stackoverflow.com/questions/78393675/how-to-make-a-custom-type-inheriting-from-uuid-work-as-a-pydantic-model

    root: str = Field(
        # Arbitrary keyword arguments on Field are deprecated in pydantic v2;
        # json_schema_extra carries the same schema metadata without a warning.
        json_schema_extra={"unique": True},
        description="The run needs to be a valid ENA accession",
        examples=["ERR123456", "DRR789012", "SRR345678"],
    )

    @field_validator("root", mode="after")
    @classmethod
    def run_validity_check(cls, run: str) -> str:
        """Validate that ``run`` is an INSDC run accession and return it unchanged.

        Args:
            run: Candidate accession string.

        Returns:
            The same string, once it has been accepted.

        Raises:
            ValueError: If ``run`` is not exactly ``ERR``/``DRR``/``SRR`` followed
                by six or more digits. Pydantic turns this into a validation error.
        """
        # fullmatch (rather than match) rejects trailing characters such as
        # "ERR123456.fastq", which a start-anchored match would let through.
        if re.fullmatch(r"(E|D|S)RR[0-9]{6,}", run) is None:
            raise ValueError(
                f"Accession `{run}` does not fit INSDC format [ERR*,SRR*,DRR*]."
            )

        return run
67
+
68
+
69
class AmpliconResultTypes(str, Enum):
    """Closed set of statuses allowed for a successful amplicon analysis run.

    The ``str`` mix-in makes every member compare equal to its plain string
    value, so no custom validation function is needed for this field.
    """

    # Member name and value are deliberately identical.
    all_results = "all_results"
    no_asvs = "no_asvs"
76
+
77
+
78
class AmpliconPassedRunsRecord(BaseModel):
    """Pydantic model for one row of an amplicon passed-runs file.

    Combines an INSDC-validated run accession with the run's result status.
    """

    # Run accession; validated by INSDCRunAccession.
    run: INSDCRunAccession
    # Must be one of the AmpliconResultTypes members.
    status: AmpliconResultTypes
85
+
86
+
87
class AmpliconNonINSDCSPassedRunsRecord(BaseModel):
    """Pydantic model for one passed-runs row without INSDC accession checks.

    Same shape as the INSDC-validated record, except that ``run`` is typed as a
    plain string, so no accession-format validation is applied to it.
    """

    # Free-form run identifier (no regex validation).
    run: str
    # Must be one of the AmpliconResultTypes members.
    status: AmpliconResultTypes
95
+
96
+
97
+ # This is the schema for the whole DF
98
class AmpliconPassedRunsSchema(pa.DataFrameModel):
    """Pandera dataframe schema built on ``AmpliconPassedRunsRecord``.

    Intended to validate the dataframe produced by reading a passed-runs file
    (e.g. with ``pandas.read_csv``).
    """

    class Config:
        """Dataframe-level settings: coercion on, record model as dtype."""

        coerce = True
        dtype = PydanticModel(AmpliconPassedRunsRecord)
108
+
109
+
110
class AmpliconNonINSDCPassedRunsSchema(pa.DataFrameModel):
    """Pandera dataframe schema for passed-runs files without INSDC validation.

    Identical in structure to the INSDC-validated schema, but uses
    ``AmpliconNonINSDCSPassedRunsRecord`` as the dataframe-level dtype.
    """

    class Config:
        """Dataframe-level settings: coercion on, record model as dtype."""

        coerce = True
        dtype = PydanticModel(AmpliconNonINSDCSPassedRunsRecord)
120
+
121
+
122
class TaxRank(RootModel):
    """Model a single taxonomic rank entry as a validated string.

    The expected structure is ``${rank}__${taxon}`` where ``${rank}`` is one of
    the short ranks listed in ``SHORT_TAX_RANKS`` or ``SHORT_PR2_TAX_RANKS``.
    The ``${taxon}`` part is not validated. A value whose rank part is empty or
    is ``Unclassified`` is also accepted.
    """

    # Every short rank prefix accepted by the validator below.
    valid_tax_ranks: ClassVar = SHORT_TAX_RANKS + SHORT_PR2_TAX_RANKS

    root: str = Field(
        # Arbitrary keyword arguments on Field are deprecated in pydantic v2;
        # json_schema_extra carries the same schema metadata without a warning.
        json_schema_extra={"unique": True},
        description="A single taxon in a taxonomy record",
        examples=["sk__Bacteria", "p__Bacillota", "g__Tundrisphaera"],
    )

    @field_validator("root", mode="after")
    @classmethod
    def rank_structure_validity_check(cls, taxrank: str) -> str:
        """Validate the rank prefix of ``taxrank`` and return the string unchanged.

        Args:
            taxrank: Candidate ``${rank}__${taxon}`` string.

        Returns:
            The same string, once it has been accepted.

        Raises:
            ValueError: If the text before the first ``__`` is non-empty, is not
                ``Unclassified``, and is not in ``valid_tax_ranks``.
        """
        # Everything before the first "__" is the rank; a string without "__"
        # is treated as a rank on its own (this is how "Unclassified" passes).
        rank, _, _ = taxrank.partition("__")
        if rank not in ("", "Unclassified") and rank not in cls.valid_tax_ranks:
            raise ValueError(f"Invalid taxonomy rank {rank}.")

        return taxrank
149
+
150
+
151
+ # TODO: see if we can simplify the declaration of two Taxon classes by using one of these solutions
152
+ # None of the solutions have a model-only way of doing it, but worth considering maybe
153
+ # https://stackoverflow.com/questions/76537360/initialize-one-of-two-pydantic-models-depending-on-an-init-parameter
154
+
155
+
156
class Taxon(BaseModel):
    """Pydantic model of a complete taxonomic assignment.

    Each rank is an optional ``TaxRank`` defaulting to ``None``, so that a
    partially or fully unclassified taxon can still be represented.
    """

    # Ranks are declared from broadest to most specific.
    Superkingdom: Optional[TaxRank] = None
    Kingdom: Optional[TaxRank] = None
    Phylum: Optional[TaxRank] = None
    Class: Optional[TaxRank] = None
    Order: Optional[TaxRank] = None
    Family: Optional[TaxRank] = None
    Genus: Optional[TaxRank] = None
    Species: Optional[TaxRank] = None
169
+
170
+
171
class PR2Taxon(Taxon):
    """Taxonomic assignment model extended with the PR2-specific ranks.

    Inherits every rank field from ``Taxon`` and adds four more optional ones.
    """

    # Additional PR2 ranks; all optional, like the inherited ones.
    Domain: Optional[TaxRank] = None
    Supergroup: Optional[TaxRank] = None
    Division: Optional[TaxRank] = None
    Subdivision: Optional[TaxRank] = None
178
+
179
+
180
class TaxonRecord(Taxon):
    """Pydantic model for a single taxon record of a taxonomy file.

    Extends ``Taxon`` with a required ``Count`` field holding the read count
    for that taxon.
    """

    # Read count for this record; required (no default).
    Count: int
187
+
188
+
189
class PR2TaxonRecord(PR2Taxon):
    """Pydantic model for a single PR2 taxon record of a taxonomy file.

    Extends ``PR2Taxon`` with a required ``Count`` field holding the read count.
    """

    # Read count for this record; required (no default).
    Count: int
193
+
194
+
195
+ # This is the schema for the whole DF
196
class TaxonSchema(pa.DataFrameModel):
    """Pandera dataframe schema built on ``TaxonRecord``.

    Intended to validate the dataframe produced by reading a taxonomy file
    (e.g. with ``pandas.read_csv``).
    """

    class Config:
        """Dataframe-level settings: coercion on, record model as dtype."""

        coerce = True
        dtype = PydanticModel(TaxonRecord)
206
+
207
+
208
class PR2TaxonSchema(pa.DataFrameModel):
    """Pandera dataframe schema for PR2 taxonomy files.

    Identical in structure to the standard taxonomy schema, but uses
    ``PR2TaxonRecord`` as the dataframe-level dtype.
    """

    class Config:
        """Dataframe-level settings: coercion on, record model as dtype."""

        coerce = True
        dtype = PydanticModel(PR2TaxonRecord)
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 0.1.8
3
+ Version: 0.2.0
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -11,24 +11,30 @@ Classifier: Operating System :: OS Independent
11
11
  Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
- Requires-Dist: biopython ==1.82
15
- Requires-Dist: numpy ==1.26.0
16
- Requires-Dist: pandas ==2.0.2
17
- Requires-Dist: regex ==2023.12.25
18
- Provides-Extra: dev
19
- Requires-Dist: mgnify-pipelines-toolkit[tests] ; extra == 'dev'
20
- Requires-Dist: pre-commit ==3.8.0 ; extra == 'dev'
21
- Requires-Dist: black ==24.8.0 ; extra == 'dev'
22
- Requires-Dist: flake8 ==7.1.1 ; extra == 'dev'
23
- Requires-Dist: pep8-naming ==0.14.1 ; extra == 'dev'
14
+ Requires-Dist: biopython==1.82
15
+ Requires-Dist: numpy==1.26.0
16
+ Requires-Dist: pandas==2.0.2
17
+ Requires-Dist: regex==2023.12.25
18
+ Requires-Dist: requests==2.32.3
19
+ Requires-Dist: click==8.1.7
20
+ Requires-Dist: pandera==0.22.1
24
21
  Provides-Extra: tests
25
- Requires-Dist: pytest ==7.4.0 ; extra == 'tests'
26
- Requires-Dist: pytest-md ==0.2.0 ; extra == 'tests'
27
- Requires-Dist: pytest-workflow ==2.0.1 ; extra == 'tests'
28
- Requires-Dist: biopython ==1.82 ; extra == 'tests'
29
- Requires-Dist: pandas ==2.0.2 ; extra == 'tests'
30
- Requires-Dist: numpy ==1.26.0 ; extra == 'tests'
31
- Requires-Dist: regex ==2023.12.25 ; extra == 'tests'
22
+ Requires-Dist: pytest==7.4.0; extra == "tests"
23
+ Requires-Dist: pytest-md==0.2.0; extra == "tests"
24
+ Requires-Dist: pytest-workflow==2.0.1; extra == "tests"
25
+ Requires-Dist: biopython==1.82; extra == "tests"
26
+ Requires-Dist: pandas==2.0.2; extra == "tests"
27
+ Requires-Dist: numpy==1.26.0; extra == "tests"
28
+ Requires-Dist: regex==2023.12.25; extra == "tests"
29
+ Requires-Dist: requests==2.32.3; extra == "tests"
30
+ Requires-Dist: click==8.1.7; extra == "tests"
31
+ Requires-Dist: pandera==0.22.1; extra == "tests"
32
+ Provides-Extra: dev
33
+ Requires-Dist: mgnify_pipelines_toolkit[tests]; extra == "dev"
34
+ Requires-Dist: pre-commit==3.8.0; extra == "dev"
35
+ Requires-Dist: black==24.8.0; extra == "dev"
36
+ Requires-Dist: flake8==7.1.1; extra == "dev"
37
+ Requires-Dist: pep8-naming==0.14.1; extra == "dev"
32
38
 
33
39
  # mgnify-pipelines-toolkit
34
40
 
@@ -12,23 +12,32 @@ mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=d
12
12
  mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=8vwH6PY-XwMZhaUo08tOwdFsoREfNumvvDawTb9Y98U,3168
13
13
  mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=19NgCYE12bEvRBVibhZtZywwRiMdiBUBJjzL4by3_qo,1717
14
14
  mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=RDPsaWKf0wIDwvCHXyRCh2zSJf3y9E7uOhHjaAeX8bY,11099
15
+ mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py,sha256=69iK8vtG5xFgYQ-KJiSQlaxuhSoxzcO59eNLyDS3nm0,4323
16
+ mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=OODl3XhLvksvG5RZn1iHZlg9L3DXiWIkyxJ6o-y6oeg,6949
17
+ mgnify_pipelines_toolkit/analysis/assembly/cgc_merge.py,sha256=u6r_1GRGgBAJQvU_t5Rtl3ZYjTtGJGd5yHCobtL9ob0,15405
18
+ mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py,sha256=U1Ls3O0CQmukmoyUwEAEN11jHUKuCdS-qVkr5ai243I,3582
19
+ mgnify_pipelines_toolkit/analysis/assembly/go_utils.py,sha256=vsYaFJ_cmbo6DXlWs_X8wpZJfMQOq1CrLX4-3owmYjI,5447
20
+ mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=RthgLO3YTO_JGMC7Nx2JDrowXRimnOtVUDkM1l31rt4,5834
15
21
  mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
22
  mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py,sha256=H5ccd1e_e5dk8vhVOvHLK1lknYbRPbnqPjULCYnU0FQ,4021
17
23
  mgnify_pipelines_toolkit/analysis/shared/get_subunits.py,sha256=xl5HduWtGPWiI9yqsjQ3itIzwHSxF2ig5KgjLXmj9EE,4772
18
24
  mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=DTX7S1P_BkGPEeDkbmUn1YoB247hpdNIe5rdFdRYDdA,1929
19
25
  mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py,sha256=XV1vjkjIHhzouM1k5hu_51XK_mgC_EOOGDN3mx4LOvc,1991
20
26
  mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=exzWyuK0YxDiVSu4WX2H7g-uT5Y00w_EmrFqSHjRObU,5554
27
+ mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py,sha256=aWD-1B_fJg4rYZj2p8t8CUZdG1lDSo-oeFtLvjLgsak,13680
28
+ mgnify_pipelines_toolkit/constants/db_labels.py,sha256=_2sGzTlfX7unGqkLylQFEUWNPQ8NZnQMtzlfVFuWtyU,853
21
29
  mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=dCP3u_Qo-JMk3aqVapkqEbVUGE06jBQmUH6bB3bT8k0,1088
22
30
  mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=_2UTWfHKJyyFkIRQIPM2wDf-QkRTdLJ4xsA6gAkY9f4,1188
23
- mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=VaHL4mbof_9Gl7Ca3b2UkqjRqjAAvBYqprfbchae480,942
31
+ mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=63dQlW7jAjLPOSCT670QCS5WhTp13vwaHqfmFYbKMyg,1076
24
32
  mgnify_pipelines_toolkit/constants/thresholds.py,sha256=zz8paGQfZAU8tT-RbSGpzZ1Aopf77yEs97BAblHH5fk,964
25
33
  mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=jbOB_bTnW2TRjmdF7IS1A7nNOLt-lGnGyVXUHu0TmvQ,1307
34
+ mgnify_pipelines_toolkit/schemas/schemas.py,sha256=fd2xCoA1Ty-XaMG9U_gxNcBokHiYENbA85n9YTsqbpU,7098
26
35
  mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
36
  mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=GbNT7clHso21w_1PbPpWKVRd5bNs_MDbGXt8XVIGl2o,3991
28
37
  mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=zsQ4TuR4vpqYa67MgIdopdscsS0DVJdy4enRe1nCjSs,793
29
- mgnify_pipelines_toolkit-0.1.8.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
30
- mgnify_pipelines_toolkit-0.1.8.dist-info/METADATA,sha256=Uf-ukd_8rWTprPyipZQXTy4ZKdpxNezdwwPNwtNFbRk,5859
31
- mgnify_pipelines_toolkit-0.1.8.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
32
- mgnify_pipelines_toolkit-0.1.8.dist-info/entry_points.txt,sha256=tCZ7ijAgfIn47xXGxNtoZbHTDyUfHjUzjXg-NBRlj6g,1646
33
- mgnify_pipelines_toolkit-0.1.8.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
34
- mgnify_pipelines_toolkit-0.1.8.dist-info/RECORD,,
38
+ mgnify_pipelines_toolkit-0.2.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
39
+ mgnify_pipelines_toolkit-0.2.0.dist-info/METADATA,sha256=TR0FyKtC0Xyj0zvDCPiYsI6bGbZI9GkQ8fiC1WWomEk,6068
40
+ mgnify_pipelines_toolkit-0.2.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
41
+ mgnify_pipelines_toolkit-0.2.0.dist-info/entry_points.txt,sha256=60Nov738JAon-uZXUqqjOGy4TXxgS4xtxqYhAi12HY0,2084
42
+ mgnify_pipelines_toolkit-0.2.0.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
43
+ mgnify_pipelines_toolkit-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.2.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,11 +1,14 @@
1
1
  [console_scripts]
2
+ add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main
2
3
  are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
3
4
  assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
4
5
  assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
6
+ cgc_merge = mgnify_pipelines_toolkit.analysis.assembly.cgc_merge:combine_main
5
7
  classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
6
8
  fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
7
9
  fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
8
10
  find_mcp_inflection_points = mgnify_pipelines_toolkit.analysis.amplicon.find_mcp_inflection_points:main
11
+ generate_gaf = mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main
9
12
  get_mpt_version = mgnify_pipelines_toolkit.utils.get_mpt_version:main
10
13
  get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
11
14
  get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
@@ -17,3 +20,5 @@ primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_va
17
20
  remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
18
21
  rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
19
22
  standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
23
+ study_summary_generator = mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:main
24
+ summarise_goslims = mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims:main