tol-sdk 1.7.4__py3-none-any.whl → 1.7.5b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tol/api_base/__init__.py +1 -0
- tol/api_base/blueprint.py +19 -8
- tol/{s3/data_upload/blueprint.py → api_base/data_upload.py} +21 -6
- tol/api_base/pipeline_steps.py +4 -4
- tol/api_client/api_datasource.py +8 -8
- tol/api_client/converter.py +38 -52
- tol/api_client/factory.py +21 -19
- tol/api_client/parser.py +138 -98
- tol/api_client/view.py +118 -43
- tol/core/__init__.py +2 -1
- tol/core/data_object.py +27 -9
- tol/core/data_object_converter.py +37 -2
- tol/core/factory.py +51 -62
- tol/core/validate.py +1 -0
- tol/ena/client.py +61 -10
- tol/ena/ena_datasource.py +16 -10
- tol/ena/ena_methods.py +33 -32
- tol/ena/parser.py +15 -2
- tol/flows/converters/__init__.py +2 -0
- tol/flows/converters/incoming_sample_to_ena_sample_converter.py +130 -0
- tol/flows/converters/incoming_sample_to_incoming_sample_with_lists_converter.py +46 -0
- tol/s3/__init__.py +0 -1
- tol/sql/model.py +1 -1
- tol/sql/pipeline_step/factory.py +1 -1
- tol/sql/sql_converter.py +7 -1
- tol/validators/__init__.py +12 -1
- tol/validators/allowed_keys.py +17 -12
- tol/validators/allowed_values.py +21 -63
- tol/validators/allowed_values_from_datasource.py +89 -0
- tol/validators/assert_on_condition.py +56 -0
- tol/validators/ena_checklist.py +73 -0
- tol/validators/ena_submittable.py +61 -0
- tol/validators/interfaces/__init__.py +5 -0
- tol/validators/interfaces/condition_evaluator.py +102 -0
- tol/validators/min_one_valid_value.py +55 -0
- tol/validators/mutually_exclusive.py +111 -0
- tol/validators/regex.py +30 -23
- tol/validators/regex_by_value.py +33 -33
- tol/validators/specimens_have_same_taxon.py +60 -0
- tol/validators/sts_fields.py +88 -0
- tol/validators/tolid.py +110 -0
- tol/validators/unique_values.py +25 -17
- tol/validators/unique_whole_organisms.py +109 -0
- {tol_sdk-1.7.4.dist-info → tol_sdk-1.7.5b2.dist-info}/METADATA +1 -1
- {tol_sdk-1.7.4.dist-info → tol_sdk-1.7.5b2.dist-info}/RECORD +49 -36
- tol/s3/data_upload/__init__.py +0 -3
- {tol_sdk-1.7.4.dist-info → tol_sdk-1.7.5b2.dist-info}/WHEEL +0 -0
- {tol_sdk-1.7.4.dist-info → tol_sdk-1.7.5b2.dist-info}/entry_points.txt +0 -0
- {tol_sdk-1.7.4.dist-info → tol_sdk-1.7.5b2.dist-info}/licenses/LICENSE +0 -0
- {tol_sdk-1.7.4.dist-info → tol_sdk-1.7.5b2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, Dict
|
|
7
|
+
|
|
8
|
+
from tol.core import DataObject, DataSource
|
|
9
|
+
from tol.core.validate import Validator
|
|
10
|
+
from tol.sources.ena import ena
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class EnaSubmittableValidator(Validator):
    """
    Validates that, for each `DataObject` in a stream, the taxon ID held
    in the configured field is known to ENA and is submittable there.

    An error is added when the ENA datasource returns nothing for the
    taxon ID, and a different error when the returned taxon is not
    flagged as submittable.
    """
    @dataclass(slots=True, frozen=True, kw_only=True)
    class Config:
        # Name of the DataObject field that holds the taxon ID
        field_name: str

    __slots__ = ['__config', '__ena_datasource', '__cached_species']
    __config: Config
    __ena_datasource: DataSource
    # Lookup results keyed by taxon ID; `None` records a failed lookup
    __cached_species: Dict[str, Any]

    def __init__(
        self,
        config: Config,
        ena_datasource: DataSource | None = None,  # For testing
    ) -> None:
        """
        :param config: which field of each object holds the taxon ID
        :param ena_datasource: datasource queried for `submittable_taxon`
            objects. When omitted, one is created with `ena()`.
        """

        super().__init__()

        self.__config = config
        # The default is built here, not in the signature: a call in a
        # default argument runs once at import time and is then shared
        # by every instance.
        self.__ena_datasource = ena() if ena_datasource is None else ena_datasource
        self.__cached_species = {}

    def _validate_data_object(
        self,
        obj: DataObject
    ) -> None:
        """
        Adds an error to this validator if the taxon ID of `obj` is
        unknown to ENA or is not submittable.
        """
        field_name = self.__config.field_name
        taxon_id = obj.get_field_by_name(field_name)

        if taxon_id not in self.__cached_species:
            # Misses are cached too (normalised to None), so an unknown
            # taxon ID is only looked up once per validator instance
            self.__cached_species[taxon_id] = (
                self.__ena_datasource.get_one('submittable_taxon', taxon_id) or None
            )
        ena_taxon = self.__cached_species[taxon_id]

        if ena_taxon is None:
            self.add_error(
                object_id=obj.id,
                detail=f'Field {field_name} value '
                       f'"{taxon_id}" not found in ENA',
                field=field_name,
            )
        elif not ena_taxon.submittable:
            self.add_error(
                object_id=obj.id,
                detail=f'Field {field_name} value '
                       f'"{taxon_id}" is not submittable in ENA',
                field=field_name,
            )
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
from abc import ABC
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Any, Dict, Tuple, cast
|
|
8
|
+
|
|
9
|
+
from tol.core import DataObject
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
ConditionDict = Dict[str, str | Any | bool]
|
|
13
|
+
"""
|
|
14
|
+
The dict representation of a Condition. Conditions can be constructed
|
|
15
|
+
from such a dict through Condition.from_dict(condition_dict)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(slots=True)
|
|
20
|
+
class Condition:
|
|
21
|
+
field: str
|
|
22
|
+
operator: str
|
|
23
|
+
value: Any
|
|
24
|
+
# If this condition fails, should it be an error or a warning?
|
|
25
|
+
# The reporting of this error or warning is done in the calling validator
|
|
26
|
+
is_error: bool = True
|
|
27
|
+
|
|
28
|
+
def __repr__(self) -> str:
|
|
29
|
+
return f'{self.field} {self.operator} {self.value}'
|
|
30
|
+
|
|
31
|
+
@staticmethod
|
|
32
|
+
def from_dict(condition_dict: ConditionDict) -> 'Condition':
|
|
33
|
+
"""
|
|
34
|
+
A means of instantiating a Condition from a dictionary.
|
|
35
|
+
This is a separate method (rather than constructing with kwargs
|
|
36
|
+
like `Condition(**condition_dict))`) to allow for both precense
|
|
37
|
+
and type checking for each field.
|
|
38
|
+
"""
|
|
39
|
+
try:
|
|
40
|
+
# Extract fields
|
|
41
|
+
field = condition_dict['field']
|
|
42
|
+
operator = condition_dict['operator']
|
|
43
|
+
value = condition_dict['value']
|
|
44
|
+
is_error = condition_dict.get('is_error', True)
|
|
45
|
+
|
|
46
|
+
# Ensure fields are the correct type
|
|
47
|
+
if not isinstance(field, str) and not isinstance(operator, str):
|
|
48
|
+
raise Exception(
|
|
49
|
+
f'Dictionary {condition_dict} not in valid format '
|
|
50
|
+
f'to convert to Condition (type of condition dict incorrect)'
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
return Condition(
|
|
54
|
+
cast(str, field),
|
|
55
|
+
cast(str, operator),
|
|
56
|
+
value,
|
|
57
|
+
cast(bool, is_error),
|
|
58
|
+
)
|
|
59
|
+
except IndexError as e:
|
|
60
|
+
raise Exception(
|
|
61
|
+
f'Dictionary {condition_dict} not in valid format '
|
|
62
|
+
f'to convert to Condition (key "{e.args[0]}" not found)'
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class ConditionEvaluator(ABC):
    """
    Mix-in for validators that need to test a `Condition` against a
    `DataObject`.
    """
    def _evaluate_condition(self, condition: Condition, obj: DataObject) -> Tuple[bool, Any]:
        """
        Reads `condition.field` from `obj` and compares it with
        `condition.value` using `condition.operator`.

        Returns a tuple of the comparison result and the value that was
        read from the object. An unsupported operator raises an exception.
        """
        actual = obj.get_field_by_name(condition.field)
        op = condition.operator

        if op == '==':
            outcome = actual == condition.value
        elif op == '!=':
            outcome = actual != condition.value
        elif op == '<':
            outcome = actual < condition.value
        elif op == '<=':
            outcome = actual <= condition.value
        elif op == '>':
            outcome = actual > condition.value
        elif op == '>=':
            outcome = actual >= condition.value
        elif op == 'in':
            outcome = actual in condition.value
        else:
            raise Exception(f'VALIDATOR SETUP ERROR: `{condition.operator}` is not '
                            f'a supported operator for {type(self).__name__}')

        return (outcome, actual)

    def _does_condition_pass(self, condition: Condition, obj: DataObject) -> bool:
        """
        Shortcut returning only the comparison result of
        `_evaluate_condition`, discarding the value that was read.
        """
        passed, _ = self._evaluate_condition(condition, obj)
        return passed
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from tol.core import DataObject
|
|
8
|
+
from tol.core.validate import Validator
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MinOneValidValueValidator(Validator):
    """
    Validates that a stream of `DataObject` instances
    have at least one valid value present in a list of specified keys.

    A value counts as valid when it is not `None` and is not one of the
    configured `non_valid_values`. A key that is absent from the
    object's attributes is treated the same as a `None` value.
    """
    @dataclass(slots=True, frozen=True, kw_only=True)
    class Config:
        # Values that do not count as valid
        non_valid_values: list[str]
        # Attribute keys, at least one of which must hold a valid value
        keys: list[str]

    __slots__ = ['__config']
    __config: Config

    def __init__(
        self,
        config: Config,
    ) -> None:

        super().__init__()
        self.__config = config

    def _validate_data_object(
        self,
        obj: DataObject
    ) -> None:
        """
        Adds an error if none of the configured keys of `obj` holds a
        valid value.
        """
        non_valid_values = self.__config.non_valid_values

        for key in self.__config.keys:
            # `.get` rather than indexing: an object lacking one of the
            # keys must not abort validation with a KeyError
            value = obj.attributes.get(key)
            if value is not None and value not in non_valid_values:
                return

        self.add_error(
            object_id=obj.id,
            detail=(
                f'At least one of: {self.__config.keys} '
                'must not be: ' + ', '.join(non_valid_values)
                + ' or empty.'
            ),
            field=', '.join(self.__config.keys),
        )
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, List
|
|
7
|
+
|
|
8
|
+
from tol.core import DataObject, Validator
|
|
9
|
+
|
|
10
|
+
from .interfaces import Condition, ConditionDict, ConditionEvaluator
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MutuallyExclusiveValidator(Validator, ConditionEvaluator):
    """
    Validates an incoming stream of `DataObject` instances.

    Objects passing `first_field_where` and objects passing
    `second_field_where` must not share the same combination of values
    for `target_fields`. An object passing the first condition is never
    tested against the second.
    """
    @dataclass(slots=True, frozen=True, kw_only=True)
    class Config:
        first_field_where: ConditionDict
        second_field_where: ConditionDict
        target_fields: List[str]
        # Custom error text; a generated message is used when omitted
        detail: str | None = None

        def _get_error_message(self) -> str:
            """
            Returns `detail` if one was configured, otherwise a message
            built from the target fields and the two conditions.
            """
            # Both conditions are parsed up front, so a malformed
            # condition dict raises even when `detail` is set
            condition_one = Condition.from_dict(self.first_field_where)
            condition_two = Condition.from_dict(self.second_field_where)

            if self.detail is not None:
                return self.detail

            names = self.target_fields
            if len(names) > 1:
                # "a, b and c"
                plural = 's'
                leading = ', '.join(f'{name}' for name in names[:-1])
                listed = f'{leading} and {names[-1]}'
            else:
                plural = ''
                listed = names[0]

            return (
                f'The field{plural} {listed} cannot have the same '
                f'value{plural} both when {condition_one} and when '
                f'{condition_two}'
            )

    __slots__ = ['__config', '__first_list', '__second_list']
    __config: Config
    # Target-field values seen so far on objects passing each condition
    __first_list: List[Any]
    __second_list: List[Any]

    def __init__(self, config: Config) -> None:
        super().__init__()

        self.__config = config
        self.__first_list = []
        self.__second_list = []

    def _validate_data_object(self, obj: DataObject) -> None:
        """
        Records the target-field values of `obj` against whichever
        condition it passes, adding an error if the same values were
        already recorded against the other condition.
        """
        config = self.__config

        if self._does_condition_pass(Condition.from_dict(config.first_field_where), obj):
            same_side, other_side = self.__first_list, self.__second_list
        elif self._does_condition_pass(Condition.from_dict(config.second_field_where), obj):
            same_side, other_side = self.__second_list, self.__first_list
        else:
            # If neither condition passes, this validator has nothing to check
            return

        values = [obj.get_field_by_name(name) for name in config.target_fields]

        # Same values already seen on the opposite side: a clash
        if values in other_side:
            self.add_error(
                object_id=obj.id,
                detail=config._get_error_message()
            )

        same_side.append(values)
|
tol/validators/regex.py
CHANGED
|
@@ -4,12 +4,18 @@
|
|
|
4
4
|
|
|
5
5
|
import re
|
|
6
6
|
from dataclasses import dataclass
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, List
|
|
8
8
|
|
|
9
9
|
from tol.core import DataObject
|
|
10
10
|
from tol.core.validate import Validator
|
|
11
11
|
|
|
12
12
|
|
|
13
|
+
RegexDict = dict[
|
|
14
|
+
str,
|
|
15
|
+
str | bool | list[Any],
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
|
|
13
19
|
@dataclass(frozen=True, kw_only=True)
|
|
14
20
|
class Regex:
|
|
15
21
|
key: str
|
|
@@ -23,23 +29,22 @@ class Regex:
|
|
|
23
29
|
return re.search(self.regex, str(__v or ''))
|
|
24
30
|
|
|
25
31
|
|
|
26
|
-
RegexDict = dict[
|
|
27
|
-
str,
|
|
28
|
-
str | bool | list[Any],
|
|
29
|
-
]
|
|
30
|
-
"""Can also specify `Regex` as a `dict`"""
|
|
31
|
-
|
|
32
|
-
|
|
33
32
|
class RegexValidator(Validator):
|
|
34
33
|
"""
|
|
35
34
|
Validates an incoming stream of `DataObject` instances
|
|
36
35
|
according to the specified allowed values for a given
|
|
37
36
|
key.
|
|
38
37
|
"""
|
|
38
|
+
@dataclass(slots=True, frozen=True, kw_only=True)
|
|
39
|
+
class Config:
|
|
40
|
+
regexes: List[Regex | RegexDict]
|
|
41
|
+
|
|
42
|
+
__slots__ = ['__config']
|
|
43
|
+
__config: Config
|
|
39
44
|
|
|
40
45
|
def __init__(
|
|
41
46
|
self,
|
|
42
|
-
config:
|
|
47
|
+
config: Config
|
|
43
48
|
) -> None:
|
|
44
49
|
|
|
45
50
|
super().__init__()
|
|
@@ -54,19 +59,6 @@ class RegexValidator(Validator):
|
|
|
54
59
|
for k, v in obj.attributes.items():
|
|
55
60
|
self.__validate_attribute(obj, k, v)
|
|
56
61
|
|
|
57
|
-
def __get_config(
|
|
58
|
-
self,
|
|
59
|
-
config: list[Regex | RegexDict],
|
|
60
|
-
) -> list[Regex]:
|
|
61
|
-
|
|
62
|
-
# Ensure config is in Regex format
|
|
63
|
-
# (as you can either pass in a list of Regex or a RegexDict,
|
|
64
|
-
# which can be used to initialize a Regex)
|
|
65
|
-
return [
|
|
66
|
-
c if isinstance(c, Regex) else Regex(**c)
|
|
67
|
-
for c in config
|
|
68
|
-
]
|
|
69
|
-
|
|
70
62
|
def __validate_attribute(
|
|
71
63
|
self,
|
|
72
64
|
obj: DataObject,
|
|
@@ -85,7 +77,7 @@ class RegexValidator(Validator):
|
|
|
85
77
|
key: str,
|
|
86
78
|
) -> list[Regex]:
|
|
87
79
|
return [
|
|
88
|
-
a for a in self.__config
|
|
80
|
+
a for a in self.__config.regexes
|
|
89
81
|
if a.key == key
|
|
90
82
|
]
|
|
91
83
|
|
|
@@ -107,3 +99,18 @@ class RegexValidator(Validator):
|
|
|
107
99
|
detail=c.detail,
|
|
108
100
|
field=c.key,
|
|
109
101
|
)
|
|
102
|
+
|
|
103
|
+
def __get_config(
|
|
104
|
+
self,
|
|
105
|
+
config: Config,
|
|
106
|
+
) -> Config:
|
|
107
|
+
|
|
108
|
+
# Ensure config is in Regex format
|
|
109
|
+
# (as you can either pass in a list of Regex or a RegexDict,
|
|
110
|
+
# which can be used to initialize a Regex)
|
|
111
|
+
return self.Config(
|
|
112
|
+
regexes=[
|
|
113
|
+
c if isinstance(c, Regex) else Regex(**c)
|
|
114
|
+
for c in config.regexes
|
|
115
|
+
]
|
|
116
|
+
)
|
tol/validators/regex_by_value.py
CHANGED
|
@@ -2,20 +2,13 @@
|
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MIT
|
|
4
4
|
|
|
5
|
-
from
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Dict, List
|
|
6
7
|
|
|
7
8
|
from tol.core import DataObject
|
|
8
9
|
from tol.core.validate import Validator
|
|
9
10
|
|
|
10
|
-
from .regex import Regex
|
|
11
|
-
|
|
12
|
-
RegexDict = dict[
|
|
13
|
-
str,
|
|
14
|
-
str | bool | list[Any],
|
|
15
|
-
]
|
|
16
|
-
Config = dict[str, str | dict[str, list[Regex | RegexDict]]]
|
|
17
|
-
|
|
18
|
-
"""Can also specify `Regex` as a `dict`"""
|
|
11
|
+
from .regex import Regex, RegexDict
|
|
19
12
|
|
|
20
13
|
|
|
21
14
|
class RegexByValueValidator(Validator):
|
|
@@ -24,46 +17,34 @@ class RegexByValueValidator(Validator):
|
|
|
24
17
|
according to the specified allowed values for a given
|
|
25
18
|
key.
|
|
26
19
|
"""
|
|
20
|
+
@dataclass(slots=True, frozen=True, kw_only=True)
|
|
21
|
+
class Config:
|
|
22
|
+
key_column: str
|
|
23
|
+
regexes: Dict[str, List[Regex | RegexDict]]
|
|
24
|
+
|
|
25
|
+
__slots__ = ['__config']
|
|
26
|
+
config: Config
|
|
27
27
|
|
|
28
28
|
def __init__(
|
|
29
29
|
self,
|
|
30
|
-
config:
|
|
30
|
+
config: Config
|
|
31
31
|
) -> None:
|
|
32
32
|
|
|
33
33
|
super().__init__()
|
|
34
34
|
|
|
35
35
|
self.__config = self.__get_config(config)
|
|
36
36
|
|
|
37
|
-
def __get_config(
|
|
38
|
-
self,
|
|
39
|
-
config: Config,
|
|
40
|
-
) -> Config:
|
|
41
|
-
|
|
42
|
-
return {
|
|
43
|
-
'key_column': config['key_column'],
|
|
44
|
-
'regexes': {
|
|
45
|
-
k: [
|
|
46
|
-
# Ensure they're all in Regex format
|
|
47
|
-
# (as you can either pass in a list of Regex or a RegexDict,
|
|
48
|
-
# which can be used to initialize a Regex)
|
|
49
|
-
c if isinstance(c, Regex) else Regex(**c)
|
|
50
|
-
for c in v
|
|
51
|
-
]
|
|
52
|
-
for k, v in config['regexes'].items()
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
|
|
56
37
|
def _validate_data_object(
|
|
57
38
|
self,
|
|
58
39
|
obj: DataObject
|
|
59
40
|
) -> None:
|
|
60
41
|
# Pull out value of the 'key_column' attribute
|
|
61
|
-
key_column_value = obj.attributes.get(self.__config
|
|
42
|
+
key_column_value = obj.attributes.get(self.__config.key_column)
|
|
62
43
|
if not key_column_value:
|
|
63
44
|
return
|
|
64
45
|
|
|
65
46
|
# Pull out relevant regex list based on this value: {[{'name': 'regex'}]}
|
|
66
|
-
regex_list = self.__config
|
|
47
|
+
regex_list = self.__config.regexes.get(key_column_value)
|
|
67
48
|
if not regex_list:
|
|
68
49
|
return
|
|
69
50
|
self.__validate_attribute(obj, regex_list)
|
|
@@ -71,7 +52,7 @@ class RegexByValueValidator(Validator):
|
|
|
71
52
|
def __validate_attribute(
|
|
72
53
|
self,
|
|
73
54
|
obj: DataObject,
|
|
74
|
-
regexes:
|
|
55
|
+
regexes: List[Regex],
|
|
75
56
|
) -> None:
|
|
76
57
|
for r in regexes:
|
|
77
58
|
attribute_name = r.key
|
|
@@ -97,3 +78,22 @@ class RegexByValueValidator(Validator):
|
|
|
97
78
|
detail=c.detail,
|
|
98
79
|
field=c.key,
|
|
99
80
|
)
|
|
81
|
+
|
|
82
|
+
def __get_config(
|
|
83
|
+
self,
|
|
84
|
+
config: Config,
|
|
85
|
+
) -> Config:
|
|
86
|
+
|
|
87
|
+
# Ensure config is in Regex format
|
|
88
|
+
# (as you can either pass in a list of Regex or a RegexDict,
|
|
89
|
+
# which can be used to initialize a Regex)
|
|
90
|
+
return self.Config(
|
|
91
|
+
key_column=config.key_column,
|
|
92
|
+
regexes={
|
|
93
|
+
k: [
|
|
94
|
+
c if isinstance(c, Regex) else Regex(**c)
|
|
95
|
+
for c in v
|
|
96
|
+
]
|
|
97
|
+
for k, v in config.regexes.items()
|
|
98
|
+
}
|
|
99
|
+
)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Dict
|
|
7
|
+
|
|
8
|
+
from tol.core import Validator
|
|
9
|
+
from tol.core.data_object import DataObject
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SpecimensHaveSameTaxonValidator(Validator):
    """
    Validates an incoming stream of `DataObject` instances.

    Among objects that are not marked as SYMBIONT, every object sharing
    a specimen ID must carry the taxon ID of the first object seen with
    that specimen ID.
    """
    @dataclass(slots=True, frozen=True, kw_only=True)
    class Config:
        taxon_id_field: str
        symbiont_field: str
        specimen_id_field: str

    __slots__ = ['__config', '__seen']
    __config: Config
    # Specimen ID -> taxon ID of the first object seen with that specimen
    __seen: Dict[str, str]

    def __init__(self, config: Config) -> None:
        super().__init__()
        self.__seen = {}
        self.__config = config

    def _validate_data_object(self, obj: DataObject) -> None:
        """
        Adds an error if `obj` is a non-symbiont whose taxon ID differs
        from the one first recorded for its specimen ID.
        """
        # Worked example:
        #   1st object: specimen_id = A, taxon_id = AA  ->  seen = {A: AA}
        #   2nd object: specimen_id = A, taxon_id = AB  ->  AB != AA, error
        config = self.__config
        attributes = obj.attributes

        # Symbionts are exempt from this check
        if attributes.get(config.symbiont_field) == 'SYMBIONT':
            return

        specimen_id = attributes.get(config.specimen_id_field)
        if specimen_id is None:
            return

        taxon_id = attributes.get(config.taxon_id_field)
        if taxon_id is None:
            return

        # The first taxon seen for a specimen is recorded and never replaced
        recorded_taxon_id = self.__seen.setdefault(specimen_id, taxon_id)
        if taxon_id != recorded_taxon_id:
            self.add_error(
                object_id=obj.id,
                detail='A non-symbiont must have a matching Specimen ID and Taxon ID',
                field=config.specimen_id_field,
            )
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from tol.core import DataObject, DataSource
|
|
9
|
+
from tol.core.validate import Validator
|
|
10
|
+
from tol.sources.sts import sts
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class StsFieldsValidator(Validator):
    """
    Validates that a stream of `DataObject` instances
    contains fields that observe the validations in STS.

    The field definitions are read once, at construction, from the
    `data_fields` of the template of the configured STS project. Only
    definitions flagged `in_manifest` are used. For each one, at most
    one error is reported per object, checked in this order: mandatory,
    allowed values, minimum, maximum.
    """
    @dataclass(slots=True, frozen=True, kw_only=True)
    class Config:
        # Code of the project whose template supplies the field definitions
        project_code: str

    __slots__ = ['__config', '__datasource', '__fields']
    __config: Config
    __datasource: DataSource
    # `data_input_key` -> field definition
    __fields: dict

    def __init__(
        self,
        config: Config,
        datasource: DataSource | None = None,  # For testing
    ) -> None:
        """
        :param config: the project whose field definitions are enforced
        :param datasource: datasource queried for the `project` object.
            When omitted, one is created with `sts()`.
        """

        super().__init__()

        self.__config = config
        # The default is built here, not in the signature: a call in a
        # default argument runs once at import time and is then shared
        # by every instance.
        self.__datasource = sts() if datasource is None else datasource
        self.__fields = self.__initialize_fields_from_datasource()

    def __initialize_fields_from_datasource(self) -> dict:
        """
        Returns the in-manifest field definitions of the project
        template, keyed by their `data_input_key`.
        """
        project = self.__datasource.get_one('project', self.__config.project_code)
        return {
            field.get('data_input_key'): field
            for field in project.template.get('data_fields', [])
            if field.get('in_manifest')
        }

    def _validate_data_object(
        self,
        obj: DataObject
    ) -> None:
        """
        Checks every known field definition against `obj`, adding an
        error for the first rule each field breaks.
        """
        project_code = self.__config.project_code

        for field in self.__fields.values():
            key = field.get('data_input_key')
            allowed_values = field.get('allowed_values')
            minimum = field.get('min')
            maximum = field.get('max')

            # Get the value from the data object
            field_value = obj.get_field_by_name(key)
            is_empty = field_value is None or field_value == ''

            # A bound of 0 is a real bound, so test for "unset" rather
            # than truthiness. Empty values are never range-checked:
            # comparing None with a bound raises TypeError.
            check_min = minimum not in (None, '') and not is_empty
            check_max = maximum not in (None, '') and not is_empty

            if field.get('mandatory_input') and is_empty:
                self.add_error(
                    object_id=obj.id,
                    detail=f'Field {key} is required '
                           f'for project {project_code}',
                    field=key,
                )
            # NOTE(review): an empty value of an optional field is still
            # reported here when allowed values are defined - confirm
            # that this is intended.
            elif allowed_values and field_value not in allowed_values:
                self.add_error(
                    object_id=obj.id,
                    detail=f'Field {key} value '
                           f'"{field_value}" not found in allowed values '
                           f'{allowed_values} for project '
                           f'{project_code}',
                    field=key,
                )
            elif check_min and field_value < minimum:
                self.add_error(
                    object_id=obj.id,
                    detail=f'Field {key} value '
                           f'"{field_value}" is less than minimum value '
                           f'"{minimum}" for project '
                           f'{project_code}',
                    field=key,
                )
            elif check_max and field_value > maximum:
                self.add_error(
                    object_id=obj.id,
                    detail=f'Field {key} value '
                           f'"{field_value}" is greater than maximum value '
                           f'"{maximum}" for project '
                           f'{project_code}',
                    field=key,
                )
|