mgnify-pipelines-toolkit 1.2.10__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.


This version of mgnify-pipelines-toolkit might be problematic; see the registry's advisory page for details.

@@ -1,738 +0,0 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
- # Copyright 2024-2025 EMBL - European Bioinformatics Institute
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import logging
17
- import re
18
-
19
- from enum import StrEnum
20
- from typing import ClassVar, Optional, Type, Literal
21
-
22
- import pandas as pd
23
- import pandera as pa
24
- from pandera.typing import Series
25
- from pandera.typing.common import DataFrameBase
26
-
27
- from pydantic import (
28
- Field,
29
- BaseModel,
30
- field_validator,
31
- RootModel,
32
- )
33
- from pandera.engines.pandas_engine import PydanticModel
34
-
35
- from mgnify_pipelines_toolkit.constants.tax_ranks import (
36
- SHORT_TAX_RANKS,
37
- SHORT_PR2_TAX_RANKS,
38
- SHORT_MOTUS_TAX_RANKS,
39
- )
40
-
41
-
42
class INSDCRunAccession(RootModel):
    """Pydantic RootModel for an INSDC-specific run accession.

    Essentially just a special string with regex-based validation of the
    accession (e.g. ERR123456, DRR789012, SRR345678).
    """

    # RootModel example:
    # https://stackoverflow.com/questions/78393675/how-to-make-a-custom-type-inheriting-from-uuid-work-as-a-pydantic-model

    root: str = Field(
        unique=True,
        description="The run needs to be a valid ENA accession",
        examples=["ERR123456", "DRR789012", "SRR345678"],
    )

    @field_validator("root", mode="after")
    @classmethod
    def run_validity_check(cls, run: str) -> str:
        """Check that the run string matches the regex of an INSDC run accession.

        Raises a `ValueError` if not, which is what Pydantic expects for
        validation errors; otherwise returns the value unchanged.
        """

        run_accession_regex = r"(E|D|S)RR[0-9]{6,}"
        # fullmatch (not match) so trailing junk such as "ERR123456xyz" is
        # rejected too — re.match only anchors at the start of the string.
        regex_res = re.fullmatch(run_accession_regex, run)

        if regex_res is None:
            raise ValueError(
                f"Accession `{run}` does not fit INSDC format [ERR*,SRR*,DRR*]."
            )

        return run
72
-
73
-
74
class AmpliconResultTypes(StrEnum):
    """The three allowed statuses for successful amplicon analysis runs.

    Pydantic validates StrEnum fields directly, so no extra validator
    function is needed.
    """

    all_results = "all_results"  # full results produced
    no_asvs = "no_asvs"  # run completed but no ASVs were called
    dada2_stats_fail = "dada2_stats_fail"  # DADA2 statistics step failed
82
-
83
-
84
class AmpliconPassedRunsRecord(BaseModel):
    """Pydantic model for a single "row" of an amplicon passed-runs file.

    Combines the INSDC-validated run accession with the allowed amplicon
    result statuses.
    """

    run: INSDCRunAccession  # validated INSDC accession (ERR/DRR/SRR)
    status: AmpliconResultTypes  # one of the allowed amplicon statuses
91
-
92
-
93
class AmpliconNonINSDCSPassedRunsRecord(BaseModel):
    """Same model as AmpliconPassedRunsRecord, but with no INSDC validation.

    Achieved by typing the run as a plain string so no accession-format
    validation happens.
    """

    # NOTE(review): class name has a stray "S" (NonINSDCS); kept because
    # renaming would break external references — confirm before changing.
    run: str  # free-form run identifier, not validated
    status: AmpliconResultTypes
101
-
102
-
103
- # This is the schema for the whole DF
104
# This is the schema for the whole DF
class AmpliconPassedRunsSchema(pa.DataFrameModel):
    """Pandera dataframe schema using AmpliconPassedRunsRecord as the dtype.

    This is what actually validates the generated dataframe when read by
    pandas.read_csv: each row is parsed through the pydantic record model.
    """

    class Config:
        """Config with dataframe-level data type."""

        dtype = PydanticModel(AmpliconPassedRunsRecord)
        coerce = True  # coerce is required for PydanticModel dtypes
114
-
115
-
116
class CompletedAnalysisRecord(BaseModel):
    """Pydantic model for a single "row" of a successfully analysed assemblies file."""

    # ERZ accession with at least 6 digits
    assembly: str = Field(
        ...,
        description="Assembly accession",
        examples=["ERZ789012"],
        pattern=r"ERZ\d{6,}",
    )
    # only "success" rows are valid in this file
    status: Literal["success"] = Field(
        ...,
        description="Pipeline output for whether this assembly's analysis succeeded or not",
    )
129
-
130
-
131
class CompletedAnalysisSchema(pa.DataFrameModel):
    """Pandera dataframe schema using CompletedAnalysisRecord as the dtype.

    This is what actually validates the generated dataframe when read by
    pandas.read_csv. Additionally enforces that assembly accessions are
    unique across the dataframe.
    """

    assembly: Series[str]

    # pandera registers @pa.check methods as classmethods, so `self` here is
    # effectively the schema class, not an instance
    @pa.check("assembly")
    def accessions_unique(self, series: Series[str]) -> Series[bool]:
        # False for any accession that already appeared earlier in the column
        return ~series.duplicated()

    class Config:
        """Config with dataframe-level data type."""

        dtype = PydanticModel(CompletedAnalysisRecord)
        coerce = True
147
-
148
-
149
class InterProSummaryRecord(BaseModel):
    """Model of a row in the InterPro summary file."""

    # non-negative hit count
    count: int = Field(
        ..., ge=0, description="Number of hits for the InterPro accession"
    )
    # IPR followed by exactly 6 digits
    interpro_accession: str = Field(
        ...,
        description="InterPro accession ID",
        examples=["IPR123456"],
        pattern=r"IPR\d{6}",
    )
    description: str = Field(..., description="Description of the InterPro domain")
162
-
163
-
164
class GOSummaryRecord(BaseModel):
    """Model of a row in the GO summary file."""

    # "GO:" followed by exactly 7 digits
    go: str = Field(
        ...,
        description="GO term identifier",
        examples=["GO:1234567"],
        pattern=r"GO:\d{7}",
    )
    term: str = Field(..., description="GO term name")
    # one of the three GO ontologies (not enforced by pattern, free text)
    category: str = Field(
        ...,
        description="GO category",
        examples=["biological_process", "molecular_function", "cellular_component"],
    )
    count: int = Field(..., ge=0, description="Number of times the GO term is observed")
180
-
181
-
182
class BaseSummarySchema(pa.DataFrameModel):
    """Shared base for the summary-file schemas.

    Exposes a reusable uniqueness helper that concrete subclasses call
    from their column-level checks.
    """

    @staticmethod
    def is_unique(series: Series[str]) -> Series[bool]:
        """Return a mask that is False for every repeated value in *series*."""
        duplicate_mask = series.duplicated()
        return ~duplicate_mask
188
-
189
-
190
class InterProSummarySchema(BaseSummarySchema):
    """Schema for InterPro summary file validation."""

    interpro_accession: Series[str]

    @pa.check("interpro_accession")
    def interpro_ids_unique(self, series: Series[str]) -> Series[bool]:
        # each InterPro accession may appear at most once
        return self.is_unique(series)

    class Config:
        """Row-level validation via the pydantic record model."""

        dtype = PydanticModel(InterProSummaryRecord)
        coerce = True
202
-
203
-
204
class GOSummarySchema(BaseSummarySchema):
    """Schema for GO or GOslim summary file validation."""

    go: Series[str]

    @pa.check("go")
    def go_ids_unique(self, series: Series[str]) -> Series[bool]:
        # each GO term may appear at most once
        return self.is_unique(series)

    class Config:
        """Row-level validation via the pydantic record model."""

        dtype = PydanticModel(GOSummaryRecord)
        coerce = True
216
-
217
-
218
class SanntisSummaryRecord(BaseModel):
    """Model of a row in the Sanntis assembly-level summary file."""

    # "BGC" followed by exactly 7 digits (MIBiG accession format)
    nearest_mibig: str = Field(
        ...,
        description="The accession ID of the closest matching biosynthetic gene cluster (BGC) in the MIBiG database",
        examples=["BGC0000073"],
        pattern=r"BGC\d{7}",
    )
    nearest_mibig_class: str = Field(
        ...,
        description="The biosynthetic class of the nearest MIBiG BGC",
        examples=["Polyketide"],
    )
    description: str = Field(
        ...,
        description="A brief summary of the biosynthetic process or type of metabolite associated with the nearest MIBiG cluster",
    )

    count: int = Field(
        ..., ge=0, description="Number of times the MIBiG entry is observed"
    )
240
-
241
-
242
class AntismashSummaryRecord(BaseModel):
    """Model of a row in the Antismash summary file."""

    label: str = Field(
        ...,
        description="Biosynthetic class or label assigned by Antismash based on sequence similarity to known biosynthetic gene clusters.",
        examples=["RiPP-like", "T1PKS", "terpene"],
    )
    description: str = Field(
        ...,
        description="Brief explanation of the biosynthetic class, often indicating compound type or functional characteristics.",
        examples=["Type I PKS (Polyketide synthase)", "Redox-cofactors such as PQQ"],
    )
    count: int = Field(
        ...,
        ge=0,
        description="Number of BGCs (biosynthetic gene clusters) in the dataset assigned to this label.",
    )
260
-
261
-
262
class KOSummaryRecord(BaseModel):
    """Model of a row in the KEGG summary file."""

    # "K" followed by at least 5 digits
    ko: str = Field(
        ...,
        description="KEGG Orthology (KO) identifier representing a functional gene or pathway component.",
        examples=["K07547", "K04874", "K19946"],
        pattern=r"K\d{5,}",
    )
    description: str = Field(
        ...,
        description="Name or function of the KO, sometimes including EC numbers and protein families.",
        examples=["optineurin", "MFS transporter, POT/PTR family"],
    )
    count: int = Field(
        ...,
        ge=0,
        description="Number of times this KO identifier is observed in the dataset.",
    )
281
-
282
-
283
class PFAMSummaryRecord(BaseModel):
    """Model of a row in the PFAM summary file."""

    # "PF" followed by exactly 5 digits
    pfam: str = Field(
        ...,
        description="PFAM accession identifier representing a protein domain or family.",
        examples=["PF00265", "PF01956", "PF00673"],
        pattern=r"PF\d{5}",
    )
    description: str = Field(
        ...,
        description="Description of the protein domain or family associated with the PFAM ID.",
        examples=["Thymidine kinase", "Integral membrane protein EMC3/TMCO1-like"],
    )
    count: int = Field(
        ...,
        ge=0,
        description="Number of times the PFAM domain is observed in the dataset.",
    )
302
-
303
-
304
class KEGGModulesSummaryRecord(BaseModel):
    """Model of a row in the KEGG Modules summary file."""

    # "M" followed by exactly 5 digits
    module_accession: str = Field(
        ...,
        description="KEGG Module identifier representing a specific metabolic pathway or module.",
        examples=["M00123", "M00234"],
        pattern=r"M\d{5}",
    )
    # lower-bounded at 0; no upper bound is enforced here
    completeness: float = Field(
        ...,
        ge=0,
        description="Completeness score of the KEGG Module, indicating the extent to which the module is present in the metagenome.",
    )
    pathway_name: str = Field(
        ...,
        description="Name of the metabolic pathway associated with the KEGG Module.",
        examples=["Sulfur reduction, sulfur => sulfide"],
    )
    pathway_class: str = Field(
        ...,
        description="Biosynthetic class or category associated with the KEGG Module, semi colon separated.",
        examples=["Pathway modules; Energy metabolism; Photosynthesis"],
    )
328
-
329
-
330
class SanntisSummarySchema(BaseSummarySchema):
    """Schema for Sanntis summary file validation."""

    nearest_mibig: Series[str]

    @pa.check("nearest_mibig")
    def mibig_ids_unique(self, series: Series[str]) -> Series[bool]:
        # each MIBiG accession may appear at most once
        return self.is_unique(series)

    class Config:
        """Row-level validation via the pydantic record model."""

        dtype = PydanticModel(SanntisSummaryRecord)
        coerce = True
340
-
341
-
342
class AntismashSummarySchema(BaseSummarySchema):
    """Schema for Antismash summary file validation."""

    label: Series[str]

    @pa.check("label")
    def class_names_unique(self, series: Series[str]) -> Series[bool]:
        # each biosynthetic class label may appear at most once
        return self.is_unique(series)

    class Config:
        """Row-level validation via the pydantic record model."""

        dtype = PydanticModel(AntismashSummaryRecord)
        coerce = True
352
-
353
-
354
class KOSummarySchema(BaseSummarySchema):
    """Schema for KEGG Orthology summary file validation."""

    ko: Series[str]

    @pa.check("ko")
    def ko_ids_unique(self, series: Series[str]) -> Series[bool]:
        # each KO identifier may appear at most once
        return self.is_unique(series)

    class Config:
        """Row-level validation via the pydantic record model."""

        dtype = PydanticModel(KOSummaryRecord)
        coerce = True
364
-
365
-
366
class PFAMSummarySchema(BaseSummarySchema):
    """Schema for PFAM summary file validation."""

    pfam: Series[str]

    @pa.check("pfam")
    def pfam_ids_unique(self, series: Series[str]) -> Series[bool]:
        # each PFAM accession may appear at most once
        return self.is_unique(series)

    class Config:
        """Row-level validation via the pydantic record model."""

        dtype = PydanticModel(PFAMSummaryRecord)
        coerce = True
376
-
377
-
378
class KEGGModulesSummarySchema(BaseSummarySchema):
    """Schema for KEGG Modules summary file validation."""

    module_accession: Series[str]

    @pa.check("module_accession")
    def module_ids_unique(self, series: Series[str]) -> Series[bool]:
        # each KEGG Module accession may appear at most once
        return self.is_unique(series)

    class Config:
        """Row-level validation via the pydantic record model."""

        dtype = PydanticModel(KEGGModulesSummaryRecord)
        coerce = True
388
-
389
-
390
class BaseStudySummarySchema(BaseSummarySchema):
    """Base schema for study summary files with ERZ* columns and count checks."""

    # applies to every column whose name matches an ERZ assembly accession
    @pa.check(regex=r"^ERZ\d+")
    def count_columns_are_non_negative(self, s: Series[int]) -> Series[bool]:
        return s >= 0

    class Config:
        strict = False  # allow extra ERZ* columns not declared above
399
-
400
-
401
class GOStudySummarySchema(BaseStudySummarySchema):
    """Study-level GO summary schema: GO term rows, one count column per ERZ assembly."""

    GO: Series[str] = pa.Field(str_matches=r"^GO:\d{7}$")
    description: Series[str]
    category: Series[str]

    @pa.check("GO")
    def go_ids_unique(self, series: Series[str]) -> Series[bool]:
        # each GO term may appear at most once
        return self.is_unique(series)
409
-
410
-
411
class InterProStudySummarySchema(BaseStudySummarySchema):
    """Study-level InterPro summary schema: IPR rows, one count column per ERZ assembly."""

    IPR: Series[str] = pa.Field(str_matches=r"^IPR\d{6}$")
    description: Series[str]

    @pa.check("IPR")
    def interpro_ids_unique(self, series: Series[str]) -> Series[bool]:
        # each InterPro accession may appear at most once
        return self.is_unique(series)
418
-
419
-
420
class AntismashStudySummarySchema(BaseStudySummarySchema):
    """Study-level Antismash summary schema: label rows, one count column per ERZ assembly."""

    label: Series[str]

    @pa.check("label")
    def class_names_unique(self, series: Series[str]) -> Series[bool]:
        # each label may appear at most once
        return self.is_unique(series)
426
-
427
-
428
class SanntisStudySummarySchema(BaseStudySummarySchema):
    """Study-level Sanntis summary schema: MIBiG rows, one count column per ERZ assembly."""

    nearest_mibig: Series[str]

    @pa.check("nearest_mibig")
    def mibig_ids_unique(self, series: Series[str]) -> Series[bool]:
        # each MIBiG accession may appear at most once
        return self.is_unique(series)
434
-
435
-
436
class KOStudySummarySchema(BaseStudySummarySchema):
    """Study-level KO summary schema: KO rows, one count column per ERZ assembly."""

    KO: Series[str]

    @pa.check("KO")
    def ko_ids_unique(self, series: Series[str]) -> Series[bool]:
        # each KO identifier may appear at most once
        return self.is_unique(series)
442
-
443
-
444
- class PFAMStudySummarySchema(BaseStudySummarySchema):
445
- PFAM: Series[str]
446
-
447
- @pa.check("PFAM")
448
- def pfam_ids_unique(self, series: Series[str]) -> Series[bool]:
449
- return self.is_unique(series)
450
-
451
-
452
class KEGGModulesStudySummarySchema(BaseStudySummarySchema):
    """Study-level KEGG Modules summary schema: module rows, one count column per ERZ assembly."""

    module_accession: Series[str]

    @pa.check("module_accession")
    def module_ids_unique(self, series: Series[str]) -> Series[bool]:
        # each KEGG Module accession may appear at most once
        return self.is_unique(series)
458
-
459
-
460
class TaxonomyStudySummarySchema(BaseStudySummarySchema):
    """Study-level taxonomy summary schema.

    Inherits the ERZ* count-column checks unchanged; no extra columns are
    declared here (strict=False in the base Config allows the taxonomy
    columns through).
    """

    pass
462
-
463
-
464
class AmpliconNonINSDCPassedRunsSchema(pa.DataFrameModel):
    """Same dataframe schema as AmpliconPassedRunsSchema, except with no INSDC validation.

    Uses the AmpliconNonINSDCSPassedRunsRecord as a dtype to achieve this.
    """

    class Config:
        """Config with dataframe-level data type."""

        dtype = PydanticModel(AmpliconNonINSDCSPassedRunsRecord)
        coerce = True
474
-
475
-
476
class TaxRank(RootModel):
    """Class for modelling a single Taxonomic Rank.

    Essentially is just a special string with validation of the structure:
    `${rank}__${taxon}`
    Where `${rank}` is one of the allowed short ranks defined by the imported
    `SHORT_TAX_RANKS` and `SHORT_PR2_TAX_RANKS` variables.
    And `${taxon}` is the actual taxon for that rank (this isn't validated).
    It will also validate if the whole string is the permitted "Unclassified".
    """

    # combined allowed rank prefixes (UNITE/SILVA-style + PR2)
    valid_tax_ranks: ClassVar = SHORT_TAX_RANKS + SHORT_PR2_TAX_RANKS

    root: str = Field(
        unique=True,
        description="A single taxon in a taxonomy record",
        examples=["sk__Bacteria", "p__Bacillota", "g__Tundrisphaera"],
    )

    @field_validator("root", mode="after")
    @classmethod
    def rank_structure_validity_check(cls, taxrank: str) -> str:
        """Reject any rank prefix that is not empty, "Unclassified", or an allowed short rank."""
        taxrank_list = taxrank.split("__")
        # everything before the first "__" is the rank prefix
        rank = taxrank_list[0]
        if (
            rank != ""
            and rank.capitalize() != "Unclassified"
            and rank not in cls.valid_tax_ranks
        ):
            raise ValueError(f"Invalid taxonomy rank {rank}.")

        return taxrank
507
-
508
-
509
- # TODO: see if we can simplify the declaration of two Taxon classes by using one of these solutions
510
- # None of the solutions have a model-only way of doing it, but worth considering maybe
511
- # https://stackoverflow.com/questions/76537360/initialize-one-of-two-pydantic-models-depending-on-an-init-parameter
512
-
513
-
514
class Taxon(BaseModel):
    """Class for modelling an entire Taxon or taxonomic assignment.

    All of the ranks are optional, to model for the taxon being "Unclassified".
    Field names are capitalised to match the column headers of the taxonomy
    files this model is parsed from.
    """

    Superkingdom: Optional[TaxRank] = None
    Kingdom: Optional[TaxRank] = None
    Phylum: Optional[TaxRank] = None
    Class: Optional[TaxRank] = None
    Order: Optional[TaxRank] = None
    Family: Optional[TaxRank] = None
    Genus: Optional[TaxRank] = None
    Species: Optional[TaxRank] = None
527
-
528
-
529
class PR2Taxon(Taxon):
    """Taxon variant extended with the additional PR2 taxonomy ranks.

    Inherits all the standard ranks from Taxon and adds the PR2-specific
    ones, all optional for the same "Unclassified" reason.
    """

    Domain: Optional[TaxRank] = None
    Supergroup: Optional[TaxRank] = None
    Division: Optional[TaxRank] = None
    Subdivision: Optional[TaxRank] = None
536
-
537
-
538
class TaxonRecord(Taxon):
    """Class for modelling a single taxon record in a taxonomy file.

    It inherits the Taxon class, and simply adds a Count field, modelling
    the read counts for that particular Taxon record.
    """

    # NOTE(review): capitalised attribute, whereas PR2TaxonRecord uses
    # `count: int = Field(alias="Count")` — consider aligning, but renaming
    # would change the attribute callers access; confirm first.
    Count: int
545
-
546
-
547
class PR2TaxonRecord(PR2Taxon):
    """Single taxon record in a PR2 taxonomy file: PR2 ranks plus a read count."""

    # parsed from the "Count" column, exposed as snake_case `count`
    count: int = Field(alias="Count")
551
-
552
-
553
- # This is the schema for the whole DF
554
# This is the schema for the whole DF
class TaxonSchema(pa.DataFrameModel):
    """Pandera dataframe schema using TaxonRecord as the dtype.

    This is what actually validates the generated dataframe when read by
    pandas.read_csv.
    """

    class Config:
        """Config with dataframe-level data type."""

        dtype = PydanticModel(TaxonRecord)
        coerce = True
564
-
565
-
566
class PR2TaxonSchema(pa.DataFrameModel):
    """Same dataframe schema as TaxonSchema, except for the PR2 taxonomy.

    Uses the PR2TaxonRecord as a dtype to achieve this.
    """

    class Config:
        """Config with dataframe-level data type."""

        dtype = PydanticModel(PR2TaxonRecord)
        coerce = True
576
-
577
-
578
class RawReadsStatusTypes(StrEnum):
    """The four allowed statuses for successful raw-reads analysis runs.

    Pydantic validates StrEnum fields directly, so no extra validator
    function is needed.
    """

    all_results = "all_results"  # full results produced
    no_reads = "no_reads"  # run had no reads to analyse
    all_empty_results = "all_empty_results"  # every output was empty
    some_empty_results = "some_empty_results"  # a subset of outputs was empty
587
-
588
-
589
class RawReadsPassedRunsRecord(BaseModel):
    """Pydantic model for a single "row" of a raw-reads pipeline passed-runs file.

    Combines the INSDC-validated run accession with the allowed raw-reads
    statuses.
    """

    run: INSDCRunAccession  # validated INSDC accession (ERR/DRR/SRR)
    status: RawReadsStatusTypes
596
-
597
-
598
class RawReadsNonINSDCSPassedRunsRecord(RawReadsPassedRunsRecord):
    """Same model as RawReadsPassedRunsRecord, but with no INSDC validation.

    This is achieved by overriding the run field with a plain string so no
    accession-format validation happens.
    """

    run: str  # free-form run identifier, not validated
605
-
606
-
607
- # This is the schema for the whole DF
608
# This is the schema for the whole DF
class RawReadsPassedRunsSchema(pa.DataFrameModel):
    """Pandera dataframe schema using RawReadsPassedRunsRecord as the dtype.

    This is what actually validates the generated dataframe when read by
    pandas.read_csv.
    """

    class Config:
        """Config with dataframe-level data type."""

        dtype = PydanticModel(RawReadsPassedRunsRecord)
        coerce = True
618
-
619
-
620
class RawReadsNonINSDCPassedRunsSchema(pa.DataFrameModel):
    """Same dataframe schema as RawReadsPassedRunsSchema, except with no INSDC validation.

    Uses the RawReadsNonINSDCSPassedRunsRecord as a dtype to achieve this.
    """

    class Config:
        """Config with dataframe-level data type."""

        dtype = PydanticModel(RawReadsNonINSDCSPassedRunsRecord)
        coerce = True
630
-
631
-
632
class MotusTaxRank(RootModel):
    """Class for modelling a single Taxonomic Rank in mOTUs output.

    Essentially is just a special string with validation of the structure:
    `${rank}__${taxon}`
    Where `${rank}` is one of the allowed short ranks defined by the imported
    `SHORT_MOTUS_TAX_RANKS` variables.
    And `${taxon}` is the actual taxon for that rank (this isn't validated).
    It will also validate if the whole string is the permitted "unassigned"
    or "unclassified".
    """

    valid_tax_ranks: ClassVar = SHORT_MOTUS_TAX_RANKS

    root: str = Field(
        unique=True,
        description="A single taxon in a taxonomy record",
        examples=["sk__Bacteria", "p__Bacillota", "g__Tundrisphaera"],
    )

    @field_validator("root", mode="after")
    @classmethod
    def rank_structure_validity_check(cls, taxrank: str) -> str:
        """Reject any rank prefix that is not empty, unclassified/unassigned, or an allowed mOTUs short rank."""
        taxrank_list = taxrank.split("__")
        # everything before the first "__" is the rank prefix
        rank = taxrank_list[0]
        if (
            rank != ""
            # `x not in y` rather than `not x in y` (PEP 8 idiom; same result)
            and rank.capitalize() not in {"Unclassified", "Unassigned"}
            and rank not in cls.valid_tax_ranks
        ):
            raise ValueError(f"Invalid taxonomy rank {rank}.")

        return taxrank
663
-
664
-
665
class MotusTaxon(BaseModel):
    """Class for modelling an entire MotusTaxon or mOTUs taxonomic assignment.

    All of the ranks are optional, to model for the taxon being
    "Unclassified" or "Unassigned". Field names are capitalised to match
    the column headers of the mOTUs taxonomy files.
    """

    Kingdom: Optional[MotusTaxRank] = None
    Phylum: Optional[MotusTaxRank] = None
    Class: Optional[MotusTaxRank] = None
    Order: Optional[MotusTaxRank] = None
    Family: Optional[MotusTaxRank] = None
    Genus: Optional[MotusTaxRank] = None
    Species: Optional[MotusTaxRank] = None
677
-
678
-
679
class MotusTaxonRecord(MotusTaxon):
    """Class for modelling a single taxon record in a mOTUs taxonomy file.

    It inherits the MotusTaxon class, and simply adds a count field,
    modelling the read counts for that particular MotusTaxon record.
    """

    # parsed from the "Count" column, exposed as snake_case `count`
    count: int = Field(alias="Count")
686
-
687
-
688
class MotusTaxonSchema(pa.DataFrameModel):
    """Pandera dataframe schema using MotusTaxonRecord as the dtype.

    This is what actually validates the generated dataframe when read by
    pandas.read_csv.
    """

    class Config:
        """Config with dataframe-level data type."""

        dtype = PydanticModel(MotusTaxonRecord)
        coerce = True
698
-
699
-
700
class FunctionProfileRecord(BaseModel):
    """Class for modelling a single record in a functional profile file.

    It models the read counts and coverage depth/breadth of each function
    (gene/protein) for each specific record.
    """

    read_count: int
    coverage_depth: float
    coverage_breadth: float

    class Config:
        # NOTE(review): `validate_by_name` only exists in recent pydantic v2
        # releases (older versions use `populate_by_name`), and class-based
        # Config is deprecated in v2 in favour of model_config — confirm the
        # pinned pydantic version supports this.
        validate_by_name = True
712
-
713
-
714
class FunctionProfileSchema(pa.DataFrameModel):
    """Pandera dataframe schema using FunctionProfileRecord as the dtype.

    This is what actually validates the generated dataframe when read by
    pandas.read_csv.
    """

    class Config:
        """Config with dataframe-level data type."""

        dtype = PydanticModel(FunctionProfileRecord)
        coerce = True
724
-
725
-
726
def validate_dataframe(
    df: pd.DataFrame, schema: Type[pa.DataFrameModel], df_metadata: str
) -> DataFrameBase:
    """
    Validate a pandas dataframe using a pandera schema.

    :param df: dataframe to validate.
    :param schema: pandera DataFrameModel subclass to validate against.
    :param df_metadata: shown in logs on failure, e.g. the TSV filename
        from which the df was read.
    :raises pa.errors.SchemaErrors: if validation fails.
    :return: the validated (possibly coerced) dataframe.
    """
    try:
        # lazy=True collects every schema error before raising, instead of
        # stopping at the first failure
        validated_df = schema.validate(df, lazy=True)
    except pa.errors.SchemaErrors:
        # Lazy %-style args (message only built if the record is emitted) and
        # a bare `raise` to re-raise with the original traceback intact.
        logging.error("%s validation failure for %s", schema.__name__, df_metadata)
        raise
    return validated_df