tol-sdk 1.7.3__py3-none-any.whl → 1.7.5b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tol/api_base/__init__.py +1 -0
- tol/api_base/blueprint.py +19 -8
- tol/api_base/data_upload.py +98 -0
- tol/api_base/pipeline_steps.py +12 -9
- tol/api_client/api_datasource.py +8 -8
- tol/api_client/converter.py +38 -52
- tol/api_client/factory.py +21 -19
- tol/api_client/parser.py +138 -98
- tol/api_client/view.py +118 -43
- tol/core/__init__.py +2 -1
- tol/core/data_object.py +27 -9
- tol/core/data_object_converter.py +37 -2
- tol/core/factory.py +51 -62
- tol/core/validate.py +1 -0
- tol/ena/client.py +60 -10
- tol/ena/ena_datasource.py +16 -10
- tol/ena/ena_methods.py +33 -32
- tol/ena/parser.py +15 -2
- tol/services/s3_client.py +5 -3
- tol/sql/model.py +1 -1
- tol/sql/pipeline_step/factory.py +2 -2
- tol/sql/sql_converter.py +7 -1
- tol/validators/__init__.py +12 -1
- tol/validators/allowed_keys.py +17 -12
- tol/validators/allowed_values.py +21 -63
- tol/validators/allowed_values_from_datasource.py +91 -0
- tol/validators/assert_on_condition.py +56 -0
- tol/validators/ena_submittable.py +61 -0
- tol/validators/interfaces/__init__.py +5 -0
- tol/validators/interfaces/condition_evaluator.py +61 -0
- tol/validators/min_one_valid_value.py +55 -0
- tol/validators/mutually_exclusive.py +107 -0
- tol/validators/regex.py +10 -24
- tol/validators/regex_by_value.py +14 -33
- tol/validators/specimens_have_same_taxon.py +60 -0
- tol/validators/sts_fields.py +88 -0
- tol/validators/tolid.py +110 -0
- tol/validators/unique_values.py +55 -19
- tol/validators/unique_whole_organisms.py +109 -0
- {tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/METADATA +1 -1
- {tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/RECORD +45 -33
- {tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/WHEEL +0 -0
- {tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/entry_points.txt +0 -0
- {tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/licenses/LICENSE +0 -0
- {tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/top_level.txt +0 -0
tol/validators/regex.py
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
import re
|
|
6
6
|
from dataclasses import dataclass
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, List
|
|
8
8
|
|
|
9
9
|
from tol.core import DataObject
|
|
10
10
|
from tol.core.validate import Validator
|
|
@@ -23,28 +23,27 @@ class Regex:
|
|
|
23
23
|
return re.search(self.regex, str(__v or ''))
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
RegexDict = dict[
|
|
27
|
-
str,
|
|
28
|
-
str | bool | list[Any],
|
|
29
|
-
]
|
|
30
|
-
"""Can also specify `Regex` as a `dict`"""
|
|
31
|
-
|
|
32
|
-
|
|
33
26
|
class RegexValidator(Validator):
|
|
34
27
|
"""
|
|
35
28
|
Validates an incoming stream of `DataObject` instances
|
|
36
29
|
according to the specified allowed values for a given
|
|
37
30
|
key.
|
|
38
31
|
"""
|
|
32
|
+
@dataclass(slots=True, frozen=True, kw_only=True)
|
|
33
|
+
class Config:
|
|
34
|
+
regexes: List[Regex]
|
|
35
|
+
|
|
36
|
+
__slots__ = ['__config']
|
|
37
|
+
__config: Config
|
|
39
38
|
|
|
40
39
|
def __init__(
|
|
41
40
|
self,
|
|
42
|
-
config:
|
|
41
|
+
config: Config
|
|
43
42
|
) -> None:
|
|
44
43
|
|
|
45
44
|
super().__init__()
|
|
46
45
|
|
|
47
|
-
self.__config =
|
|
46
|
+
self.__config = config
|
|
48
47
|
|
|
49
48
|
def _validate_data_object(
|
|
50
49
|
self,
|
|
@@ -54,19 +53,6 @@ class RegexValidator(Validator):
|
|
|
54
53
|
for k, v in obj.attributes.items():
|
|
55
54
|
self.__validate_attribute(obj, k, v)
|
|
56
55
|
|
|
57
|
-
def __get_config(
|
|
58
|
-
self,
|
|
59
|
-
config: list[Regex | RegexDict],
|
|
60
|
-
) -> list[Regex]:
|
|
61
|
-
|
|
62
|
-
# Ensure config is in Regex format
|
|
63
|
-
# (as you can either pass in a list of Regex or a RegexDict,
|
|
64
|
-
# which can be used to initialize a Regex)
|
|
65
|
-
return [
|
|
66
|
-
c if isinstance(c, Regex) else Regex(**c)
|
|
67
|
-
for c in config
|
|
68
|
-
]
|
|
69
|
-
|
|
70
56
|
def __validate_attribute(
|
|
71
57
|
self,
|
|
72
58
|
obj: DataObject,
|
|
@@ -85,7 +71,7 @@ class RegexValidator(Validator):
|
|
|
85
71
|
key: str,
|
|
86
72
|
) -> list[Regex]:
|
|
87
73
|
return [
|
|
88
|
-
a for a in self.__config
|
|
74
|
+
a for a in self.__config.regexes
|
|
89
75
|
if a.key == key
|
|
90
76
|
]
|
|
91
77
|
|
tol/validators/regex_by_value.py
CHANGED
|
@@ -2,21 +2,14 @@
|
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MIT
|
|
4
4
|
|
|
5
|
-
from
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Dict, List
|
|
6
7
|
|
|
7
8
|
from tol.core import DataObject
|
|
8
9
|
from tol.core.validate import Validator
|
|
9
10
|
|
|
10
11
|
from .regex import Regex
|
|
11
12
|
|
|
12
|
-
RegexDict = dict[
|
|
13
|
-
str,
|
|
14
|
-
str | bool | list[Any],
|
|
15
|
-
]
|
|
16
|
-
Config = dict[str, str | dict[str, list[Regex | RegexDict]]]
|
|
17
|
-
|
|
18
|
-
"""Can also specify `Regex` as a `dict`"""
|
|
19
|
-
|
|
20
13
|
|
|
21
14
|
class RegexByValueValidator(Validator):
|
|
22
15
|
"""
|
|
@@ -24,46 +17,34 @@ class RegexByValueValidator(Validator):
|
|
|
24
17
|
according to the specified allowed values for a given
|
|
25
18
|
key.
|
|
26
19
|
"""
|
|
20
|
+
@dataclass(slots=True, frozen=True, kw_only=True)
|
|
21
|
+
class Config:
|
|
22
|
+
key_column: str
|
|
23
|
+
regexes: Dict[str, List[Regex]]
|
|
24
|
+
|
|
25
|
+
__slots__ = ['__config']
|
|
26
|
+
config: Config
|
|
27
27
|
|
|
28
28
|
def __init__(
|
|
29
29
|
self,
|
|
30
|
-
config:
|
|
30
|
+
config: Config
|
|
31
31
|
) -> None:
|
|
32
32
|
|
|
33
33
|
super().__init__()
|
|
34
34
|
|
|
35
|
-
self.__config =
|
|
36
|
-
|
|
37
|
-
def __get_config(
|
|
38
|
-
self,
|
|
39
|
-
config: Config,
|
|
40
|
-
) -> Config:
|
|
41
|
-
|
|
42
|
-
return {
|
|
43
|
-
'key_column': config['key_column'],
|
|
44
|
-
'regexes': {
|
|
45
|
-
k: [
|
|
46
|
-
# Ensure they're all in Regex format
|
|
47
|
-
# (as you can either pass in a list of Regex or a RegexDict,
|
|
48
|
-
# which can be used to initialize a Regex)
|
|
49
|
-
c if isinstance(c, Regex) else Regex(**c)
|
|
50
|
-
for c in v
|
|
51
|
-
]
|
|
52
|
-
for k, v in config['regexes'].items()
|
|
53
|
-
}
|
|
54
|
-
}
|
|
35
|
+
self.__config = config
|
|
55
36
|
|
|
56
37
|
def _validate_data_object(
|
|
57
38
|
self,
|
|
58
39
|
obj: DataObject
|
|
59
40
|
) -> None:
|
|
60
41
|
# Pull out value of the 'key_column' attribute
|
|
61
|
-
key_column_value = obj.attributes.get(self.__config
|
|
42
|
+
key_column_value = obj.attributes.get(self.__config.key_column)
|
|
62
43
|
if not key_column_value:
|
|
63
44
|
return
|
|
64
45
|
|
|
65
46
|
# Pull out relevant regex list based on this value: {[{'name': 'regex'}]}
|
|
66
|
-
regex_list = self.__config
|
|
47
|
+
regex_list = self.__config.regexes.get(key_column_value)
|
|
67
48
|
if not regex_list:
|
|
68
49
|
return
|
|
69
50
|
self.__validate_attribute(obj, regex_list)
|
|
@@ -71,7 +52,7 @@ class RegexByValueValidator(Validator):
|
|
|
71
52
|
def __validate_attribute(
|
|
72
53
|
self,
|
|
73
54
|
obj: DataObject,
|
|
74
|
-
regexes:
|
|
55
|
+
regexes: List[Regex],
|
|
75
56
|
) -> None:
|
|
76
57
|
for r in regexes:
|
|
77
58
|
attribute_name = r.key
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Dict
|
|
7
|
+
|
|
8
|
+
from tol.core import Validator
|
|
9
|
+
from tol.core.data_object import DataObject
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SpecimensHaveSameTaxonValidator(Validator):
    """
    Validates an incoming stream of `DataObject` instances.

    Every data object (sample) that is not a SYMBIONT must agree on
    its taxon ID with any earlier non-SYMBIONT sample that carries the
    same specimen ID. The first taxon ID seen for a specimen ID is the
    reference that later samples are compared against.
    """
    @dataclass(slots=True, frozen=True, kw_only=True)
    class Config:
        # Names of the attributes that are read from each data object
        taxon_id_field: str
        symbiont_field: str
        specimen_id_field: str

    __slots__ = ['__config', '__seen']
    __config: Config
    # Maps a specimen ID to the first taxon ID recorded for it
    __seen: Dict[str, str]

    def __init__(self, config: Config) -> None:
        super().__init__()
        self.__config = config
        self.__seen = {}

    def _validate_data_object(self, obj: DataObject) -> None:
        """
        Record, or compare against, the taxon ID stored for the
        specimen ID of `obj`, adding an error when the two disagree.
        """
        cfg = self.__config
        attributes = obj.attributes

        # SYMBIONT samples are exempt from this check
        if attributes.get(cfg.symbiont_field) == 'SYMBIONT':
            return

        # Nothing can be compared unless both IDs are present
        specimen_id = attributes.get(cfg.specimen_id_field)
        taxon_id = attributes.get(cfg.taxon_id_field)
        if specimen_id is None or taxon_id is None:
            return

        # Worked example:
        #   1st object: specimen A, taxon AA  ->  seen == {A: AA}
        #   2nd object: specimen A, taxon AB  ->  AB != AA, so flag it
        # `setdefault` stores the taxon ID only for a new specimen ID
        # and always hands back the one that was recorded first.
        recorded_taxon_id = self.__seen.setdefault(specimen_id, taxon_id)
        if taxon_id != recorded_taxon_id:
            self.add_error(
                object_id=obj.id,
                detail='A non-symbiont must have a matching Specimen ID and Taxon ID',
                field=cfg.specimen_id_field,
            )
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from tol.core import DataObject, DataSource
|
|
9
|
+
from tol.core.validate import Validator
|
|
10
|
+
from tol.sources.sts import sts
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class StsFieldsValidator(Validator):
    """
    Validates that a stream of `DataObject` instances
    contains fields that observe the validations in STS.

    The field definitions are read once, at construction time, from
    the `data_fields` of the template of the configured STS project.
    Only definitions flagged `in_manifest` are used. For each of them
    at most one error is added per data object, checked in this order:
    mandatory input missing, value not in the allowed values, value
    below the minimum, value above the maximum.
    """
    @dataclass(slots=True, frozen=True, kw_only=True)
    class Config:
        # Code of the STS project whose template defines the fields
        project_code: str

    __slots__ = ['__config', '__datasource', '__fields']
    __config: Config
    __datasource: DataSource
    # Maps each `data_input_key` to its field definition
    __fields: dict[str, dict]

    def __init__(
        self,
        config: Config,
        datasource: DataSource | None = None,  # For testing
    ) -> None:
        """
        :param config: the project to validate against
        :param datasource: the source to read the project from; when
            omitted, `sts()` is called to create one
        """

        super().__init__()

        self.__config = config
        # Create the default lazily: a default argument of `sts()` would
        # be evaluated once, when this module is imported, and shared.
        self.__datasource = sts() if datasource is None else datasource
        self.__fields = self.__initialize_fields_from_datasource()

    def __initialize_fields_from_datasource(self) -> dict[str, dict]:
        """
        Return the `in_manifest` field definitions of the configured
        project, keyed by their `data_input_key`.
        """
        project = self.__datasource.get_one('project', self.__config.project_code)
        return {
            field.get('data_input_key'): field
            for field in project.template.get('data_fields', [])
            if field.get('in_manifest')
        }

    def _validate_data_object(
        self,
        obj: DataObject
    ) -> None:
        """
        Check every known field of `obj` against its STS definition.
        """
        project_code = self.__config.project_code

        for key, field in self.__fields.items():
            # Get the value from the data object
            field_value = obj.get_field_by_name(key)
            is_empty = field_value is None or field_value == ''

            allowed_values = field.get('allowed_values')
            minimum = field.get('min')
            maximum = field.get('max')

            if field.get('mandatory_input') and is_empty:
                self.add_error(
                    object_id=obj.id,
                    detail=f'Field {key} is required '
                           f'for project {project_code}',
                    field=key,
                )
            # NOTE(review): an empty optional value is still reported here
            # unless it is itself listed as allowed - confirm this is intended
            elif allowed_values and field_value not in allowed_values:
                self.add_error(
                    object_id=obj.id,
                    detail=f'Field {key} value '
                           f'"{field_value}" not found in allowed values '
                           f'{allowed_values} for project '
                           f'{project_code}',
                    field=key,
                )
            # An empty value has no magnitude, and comparing `None` with a
            # number raises TypeError, so range checks skip empty values
            elif minimum and not is_empty and field_value < minimum:
                self.add_error(
                    object_id=obj.id,
                    detail=f'Field {key} value '
                           f'"{field_value}" is less than minimum value '
                           f'"{minimum}" for project '
                           f'{project_code}',
                    field=key,
                )
            elif maximum and not is_empty and field_value > maximum:
                self.add_error(
                    object_id=obj.id,
                    detail=f'Field {key} value '
                           f'"{field_value}" is greater than maximum value '
                           f'"{maximum}" for project '
                           f'{project_code}',
                    field=key,
                )
|
tol/validators/tolid.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, Dict
|
|
7
|
+
|
|
8
|
+
from tol.core import DataObject, DataSource
|
|
9
|
+
from tol.core import DataSourceError, DataSourceFilter
|
|
10
|
+
from tol.core.validate import Validator
|
|
11
|
+
from tol.sources.tolid import tolid
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TolidValidator(Validator):
    """
    Validates a stream of `DataObject` instances against the
    Tol ID source.

    Two checks are made for each data object:

    - a warning is added if its species ID is not found in the
      Tol ID source
    - an error is added if its specimen ID is known to the Tol ID
      source, but not for the species ID of the data object. This
      check is skipped for data objects whose `error_ignore_field`
      equals `error_ignore_value`.

    Lookups are cached for the lifetime of the validator.
    """
    @dataclass(slots=True, frozen=True, kw_only=True)
    class Config:
        species_id_field: str
        specimen_id_field: str
        error_ignore_field: str
        error_ignore_value: str
        warning_detail: str = 'Species not found in Tol ID source'

    __slots__ = [
        '__config',
        '__datasource',
        '__cached_species_ids',
        '__cached_tolids',
    ]
    __config: Config
    __datasource: DataSource
    # Maps a species ID to whether the Tol ID source knows it
    __cached_species_ids: Dict[str, Any]
    # Maps a specimen ID to the list of specimens fetched for it
    __cached_tolids: Dict[str, Any]

    def __init__(
        self,
        config: Config,
        datasource=None,
    ) -> None:
        """
        :param config: the fields to read and the messages to emit
        :param datasource: the source to query; when omitted,
            `tolid()` is called to create one
        """

        super().__init__()

        self.__config = config
        # Create the default lazily: a default argument of `tolid()` would
        # be evaluated once, when this module is imported, and shared.
        self.__datasource = tolid() if datasource is None else datasource
        self.__cached_species_ids = {}
        self.__cached_tolids = {}

    def _validate_data_object(
        self,
        obj: DataObject
    ) -> None:
        self.__warning_on_species_not_in_tolid(obj=obj)
        self.__error_on_specimen_id_and_taxon_not_matching_tolid(obj=obj)

    def __warning_on_species_not_in_tolid(
        self,
        obj: DataObject,
    ) -> None:
        """
        Add a warning if the species ID of `obj` is not in the source.
        """

        obj_species_id = obj.get_field_by_name(self.__config.species_id_field)
        if self.__config.species_id_field not in obj.attributes:
            return

        if obj_species_id not in self.__cached_species_ids:
            try:
                found = self.__datasource.get_one('species', obj_species_id) is not None
            except DataSourceError as e:
                # Only "not found" means the species is absent. Any other
                # failure is propagated, rather than leaving the cache
                # unset and failing with a KeyError below.
                if e.status_code != 404:
                    raise
                found = False
            self.__cached_species_ids[obj_species_id] = found

        if not self.__cached_species_ids[obj_species_id]:
            self.add_warning(
                object_id=obj.id,
                detail=self.__config.warning_detail,
                field=self.__config.species_id_field,
            )

    def __error_on_specimen_id_and_taxon_not_matching_tolid(
        self,
        obj: DataObject,
    ) -> None:
        """
        Add an error if the source knows the specimen ID of `obj`, but
        only for species other than the species ID of `obj`.
        """

        # Compare by value: `is` tests identity, which two equal strings
        # created separately are not guaranteed to share
        if (obj.get_field_by_name(self.__config.error_ignore_field)
                == self.__config.error_ignore_value):
            return

        if self.__config.specimen_id_field not in obj.attributes:
            return

        specimen_id = obj.get_field_by_name(self.__config.specimen_id_field)
        if specimen_id not in self.__cached_tolids:
            f = DataSourceFilter()
            f.and_ = {'specimen_id': {'eq': {'value': specimen_id}}}
            self.__cached_tolids[specimen_id] = list(self.__datasource.get_list(
                object_type='specimen',
                filters=f
            ))

        tolids = self.__cached_tolids[specimen_id]
        if not tolids:
            # The specimen ID is not in the source, so nothing can conflict
            return

        taxons = {str(tolid_.species.id) for tolid_ in tolids}
        species_id = obj.get_field_by_name(self.__config.species_id_field)
        if str(species_id) not in taxons:
            self.add_error(
                object_id=obj.id,
                detail=f'Specimen ID {specimen_id} does not match Taxon ID '
                       f'{species_id} in TolID source',
                field=[self.__config.specimen_id_field, self.__config.species_id_field]
            )
|
tol/validators/unique_values.py
CHANGED
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MIT
|
|
4
4
|
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Dict, List, Set
|
|
7
|
+
|
|
5
8
|
from tol.core import DataObject
|
|
6
9
|
from tol.core.validate import Validator
|
|
7
10
|
|
|
@@ -11,36 +14,69 @@ class UniqueValuesValidator(Validator):
|
|
|
11
14
|
Validates that a stream of `DataObject` instances
|
|
12
15
|
contains unique values for specified keys.
|
|
13
16
|
"""
|
|
17
|
+
@dataclass(slots=True, frozen=True, kw_only=True)
|
|
18
|
+
class Config:
|
|
19
|
+
unique_keys: List[List[str] | str]
|
|
20
|
+
detail: str = 'Value is not unique'
|
|
21
|
+
is_error: bool = True
|
|
22
|
+
|
|
23
|
+
__slots__ = ['__config', '__duplicates', '__existing_values']
|
|
24
|
+
__config: Config
|
|
25
|
+
__duplicates: Dict[str, List[str]]
|
|
26
|
+
__existing_values: Dict[str, Set]
|
|
14
27
|
|
|
15
28
|
def __init__(
|
|
16
29
|
self,
|
|
17
|
-
|
|
18
|
-
*,
|
|
19
|
-
detail: str = 'Value is not unique',
|
|
20
|
-
is_error: bool = True,
|
|
30
|
+
config: Config
|
|
21
31
|
) -> None:
|
|
22
32
|
|
|
23
33
|
super().__init__()
|
|
24
34
|
|
|
25
|
-
self.
|
|
26
|
-
self.
|
|
27
|
-
self.
|
|
28
|
-
self.
|
|
29
|
-
|
|
35
|
+
self.__config = config
|
|
36
|
+
self.__duplicates = {}
|
|
37
|
+
self.__existing_values = {}
|
|
38
|
+
for key in self.__config.unique_keys:
|
|
39
|
+
if isinstance(key, str):
|
|
40
|
+
self.__existing_values[key] = set()
|
|
41
|
+
elif isinstance(key, list):
|
|
42
|
+
concat_key = '/'.join(key)
|
|
43
|
+
self.__existing_values[concat_key] = set()
|
|
30
44
|
|
|
31
45
|
def _validate_data_object(
|
|
32
46
|
self,
|
|
33
47
|
obj: DataObject
|
|
34
48
|
) -> None:
|
|
35
49
|
|
|
36
|
-
for
|
|
37
|
-
if
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
50
|
+
for unique_key in self.__config.unique_keys:
|
|
51
|
+
if isinstance(unique_key, list):
|
|
52
|
+
concat = ''
|
|
53
|
+
for key in unique_key:
|
|
54
|
+
concat = concat + '/' + (str(obj.attributes[key]))
|
|
55
|
+
if concat in self.__existing_values['/'.join(unique_key)]:
|
|
56
|
+
self.__duplicate_checks(
|
|
57
|
+
key=key,
|
|
58
|
+
value=concat
|
|
59
|
+
)
|
|
60
|
+
else:
|
|
61
|
+
self.__existing_values['/'.join(unique_key)].add(concat)
|
|
62
|
+
|
|
63
|
+
else:
|
|
64
|
+
if obj.attributes[unique_key] in self.__existing_values[unique_key]:
|
|
65
|
+
self.__duplicate_checks(
|
|
66
|
+
key=unique_key,
|
|
67
|
+
value=obj.attributes[unique_key]
|
|
68
|
+
)
|
|
42
69
|
else:
|
|
43
|
-
self.__existing_values[
|
|
70
|
+
self.__existing_values[unique_key].add(obj.attributes[unique_key])
|
|
71
|
+
|
|
72
|
+
def __duplicate_checks(
|
|
73
|
+
self,
|
|
74
|
+
key: str,
|
|
75
|
+
value: str
|
|
76
|
+
):
|
|
77
|
+
if key not in self.__duplicates:
|
|
78
|
+
self.__duplicates[key] = []
|
|
79
|
+
self.__duplicates[key].append(value)
|
|
44
80
|
|
|
45
81
|
def _post_validation(
|
|
46
82
|
self,
|
|
@@ -58,15 +94,15 @@ class UniqueValuesValidator(Validator):
|
|
|
58
94
|
key: str,
|
|
59
95
|
) -> None:
|
|
60
96
|
|
|
61
|
-
if self.
|
|
97
|
+
if self.__config.is_error:
|
|
62
98
|
self.add_error(
|
|
63
99
|
object_id=obj.id,
|
|
64
|
-
detail=self.
|
|
100
|
+
detail=self.__config.detail,
|
|
65
101
|
field=key,
|
|
66
102
|
)
|
|
67
103
|
else:
|
|
68
104
|
self.add_warning(
|
|
69
105
|
object_id=obj.id,
|
|
70
|
-
detail=self.
|
|
106
|
+
detail=self.__config.detail,
|
|
71
107
|
field=key,
|
|
72
108
|
)
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from tol.core import Validator
|
|
9
|
+
from tol.core.data_object import DataObject
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class UniqueWholeOrganismsValidator(Validator):
    """
    Validates an incoming stream of `DataObject` instances.
    For each data object (sample) not a SYMBIONT, it checks:
    1. There are no two samples with organism part WHOLE_ORGANISM with the same SPECIMEN_ID
    2. There are no samples with organism part *not* WHOLE_ORGANISM that have a SPECIMEN_ID
    the same as a WHOLE_ORGANISM in the manifest.
    """
    @dataclass(slots=True, frozen=True, kw_only=True)
    class Config:
        # Names of the attributes that are read from each data object
        symbiont_field: str
        organism_part_field: str
        specimen_id_field: str

    __slots__ = ['__config', '__whole_organisms', '__part_organisms']
    __config: Config
    # Specimen IDs seen so far on WHOLE_ORGANISM samples
    __whole_organisms: List[str]
    # Specimen IDs seen so far on all other (part) samples
    __part_organisms: List[str]

    def __init__(self, config: Config) -> None:
        super().__init__()
        self.__whole_organisms = []
        self.__part_organisms = []
        self.__config = config

    def _validate_data_object(self, obj: DataObject) -> None:
        """
        Check the specimen ID of `obj` against those seen so far,
        then record it.
        """
        # How this works, for anyone who needs to modify it in the future.
        #
        # The code this was adapted from used two loops. The first went over
        # the data objects whose ORGANISM_PART was 'WHOLE_ORGANISM', checking
        # that each SPECIMEN_ID was not already in a list before adding it
        # (so that whole organisms have unique specimen IDs). The second went
        # over the remaining data objects, checking that none of their
        # SPECIMEN_IDs was in that list (so that no part organism shares the
        # specimen ID of a whole organism).
        #
        # A Validator only takes in one data object at a time, via a
        # generator (to save loading many into memory at once), so two
        # passes over the data are not feasible.
        #
        # Instead, this validator stores the SPECIMEN_IDs of both the whole
        # organisms *and* the part organisms. Detecting duplicate whole
        # organisms is unchanged. Detecting a SPECIMEN_ID shared between a
        # part organism and a whole organism depends on which arrives second:
        # a part organism is checked against self.__whole_organisms, and a
        # whole organism is checked against self.__part_organisms. This
        # covers all cases:
        #   1. There are no duplicates, so the same SPECIMEN_ID is never in
        #      both lists.
        #   2. A whole organism is checked, then a part organism with the
        #      same SPECIMEN_ID. self.__whole_organisms contains the
        #      SPECIMEN_ID, so the duplicate is detected.
        #   3. A part organism is checked, then a whole organism with the
        #      same SPECIMEN_ID. self.__part_organisms contains the
        #      SPECIMEN_ID, so the duplicate is detected.

        # Ensure the data object is not a SYMBIONT, because organism part checks do not apply
        if obj.attributes.get(self.__config.symbiont_field) == 'SYMBIONT':
            return

        specimen_id = obj.attributes.get(self.__config.specimen_id_field)
        if specimen_id is None:
            return

        organism_part = obj.attributes.get(self.__config.organism_part_field)
        if organism_part == 'WHOLE_ORGANISM':
            if specimen_id in self.__whole_organisms:
                self.add_error(
                    object_id=obj.id,
                    detail='No two whole organisms can have the same Specimen ID',
                    field=self.__config.specimen_id_field,
                )
            if specimen_id in self.__part_organisms:
                self.add_error(
                    object_id=obj.id,
                    detail='A whole organism cannot have a Specimen ID already used for '
                           'a non-whole organism',
                    field=self.__config.specimen_id_field,
                )

            self.__whole_organisms.append(specimen_id)
        else:
            if specimen_id in self.__whole_organisms:
                self.add_error(
                    object_id=obj.id,
                    detail='A non-whole organism cannot have a Specimen ID already used for '
                           'a whole organism',
                    field=self.__config.specimen_id_field,
                )

            self.__part_organisms.append(specimen_id)
|