tol-sdk 1.7.3__py3-none-any.whl → 1.7.5b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. tol/api_base/__init__.py +1 -0
  2. tol/api_base/blueprint.py +19 -8
  3. tol/api_base/data_upload.py +98 -0
  4. tol/api_base/pipeline_steps.py +12 -9
  5. tol/api_client/api_datasource.py +8 -8
  6. tol/api_client/converter.py +38 -52
  7. tol/api_client/factory.py +21 -19
  8. tol/api_client/parser.py +138 -98
  9. tol/api_client/view.py +118 -43
  10. tol/core/__init__.py +2 -1
  11. tol/core/data_object.py +27 -9
  12. tol/core/data_object_converter.py +37 -2
  13. tol/core/factory.py +51 -62
  14. tol/core/validate.py +1 -0
  15. tol/ena/client.py +60 -10
  16. tol/ena/ena_datasource.py +16 -10
  17. tol/ena/ena_methods.py +33 -32
  18. tol/ena/parser.py +15 -2
  19. tol/services/s3_client.py +5 -3
  20. tol/sql/model.py +1 -1
  21. tol/sql/pipeline_step/factory.py +2 -2
  22. tol/sql/sql_converter.py +7 -1
  23. tol/validators/__init__.py +12 -1
  24. tol/validators/allowed_keys.py +17 -12
  25. tol/validators/allowed_values.py +21 -63
  26. tol/validators/allowed_values_from_datasource.py +91 -0
  27. tol/validators/assert_on_condition.py +56 -0
  28. tol/validators/ena_submittable.py +61 -0
  29. tol/validators/interfaces/__init__.py +5 -0
  30. tol/validators/interfaces/condition_evaluator.py +61 -0
  31. tol/validators/min_one_valid_value.py +55 -0
  32. tol/validators/mutually_exclusive.py +107 -0
  33. tol/validators/regex.py +10 -24
  34. tol/validators/regex_by_value.py +14 -33
  35. tol/validators/specimens_have_same_taxon.py +60 -0
  36. tol/validators/sts_fields.py +88 -0
  37. tol/validators/tolid.py +110 -0
  38. tol/validators/unique_values.py +55 -19
  39. tol/validators/unique_whole_organisms.py +109 -0
  40. {tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/METADATA +1 -1
  41. {tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/RECORD +45 -33
  42. {tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/WHEEL +0 -0
  43. {tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/entry_points.txt +0 -0
  44. {tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/licenses/LICENSE +0 -0
  45. {tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/top_level.txt +0 -0
tol/validators/regex.py CHANGED
@@ -4,7 +4,7 @@
 
 import re
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, List
 
 from tol.core import DataObject
 from tol.core.validate import Validator
@@ -23,28 +23,27 @@ class Regex:
         return re.search(self.regex, str(__v or ''))
 
 
-RegexDict = dict[
-    str,
-    str | bool | list[Any],
-]
-"""Can also specify `Regex` as a `dict`"""
-
-
 class RegexValidator(Validator):
     """
     Validates an incoming stream of `DataObject` instances
    according to the specified allowed values for a given
     key.
     """
+    @dataclass(slots=True, frozen=True, kw_only=True)
+    class Config:
+        regexes: List[Regex]
+
+    __slots__ = ['__config']
+    __config: Config
 
     def __init__(
         self,
-        config: list[Regex | RegexDict]
+        config: Config
     ) -> None:
 
        super().__init__()
 
-        self.__config = self.__get_config(config)
+        self.__config = config
 
     def _validate_data_object(
         self,
@@ -54,19 +53,6 @@ class RegexValidator(Validator):
         for k, v in obj.attributes.items():
             self.__validate_attribute(obj, k, v)
 
-    def __get_config(
-        self,
-        config: list[Regex | RegexDict],
-    ) -> list[Regex]:
-
-        # Ensure config is in Regex format
-        # (as you can either pass in a list of Regex or a RegexDict,
-        # which can be used to initialize a Regex)
-        return [
-            c if isinstance(c, Regex) else Regex(**c)
-            for c in config
-        ]
-
     def __validate_attribute(
         self,
         obj: DataObject,
@@ -85,7 +71,7 @@
         key: str,
     ) -> list[Regex]:
         return [
-            a for a in self.__config
+            a for a in self.__config.regexes
             if a.key == key
         ]
 
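The constructor above now takes a nested `Config` dataclass instead of a list of `Regex` objects or dicts. A minimal, hypothetical call-site sketch under that assumption follows; the `key` and `regex` field names on `Regex` are visible in the diff, the import path follows the file layout, and the sample pattern is invented.

# Hypothetical call-site sketch for the new Config-based API; values are illustrative only.
from tol.validators.regex import Regex, RegexValidator

validator = RegexValidator(
    RegexValidator.Config(
        regexes=[
            # Any further fields that Regex defines would be supplied here as well.
            Regex(key='tolid', regex=r'^[a-z]+[0-9]+$'),
        ],
    ),
)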
tol/validators/regex_by_value.py CHANGED
@@ -2,21 +2,14 @@
 #
 # SPDX-License-Identifier: MIT
 
-from typing import Any
+from dataclasses import dataclass
+from typing import Dict, List
 
 from tol.core import DataObject
 from tol.core.validate import Validator
 
 from .regex import Regex
 
-RegexDict = dict[
-    str,
-    str | bool | list[Any],
-]
-Config = dict[str, str | dict[str, list[Regex | RegexDict]]]
-
-"""Can also specify `Regex` as a `dict`"""
-
 
 class RegexByValueValidator(Validator):
     """
@@ -24,46 +17,34 @@ class RegexByValueValidator(Validator):
     according to the specified allowed values for a given
     key.
     """
+    @dataclass(slots=True, frozen=True, kw_only=True)
+    class Config:
+        key_column: str
+        regexes: Dict[str, List[Regex]]
+
+    __slots__ = ['__config']
+    config: Config
 
     def __init__(
         self,
-        config: dict[str, str | list[str]]
+        config: Config
     ) -> None:
 
         super().__init__()
 
-        self.__config = self.__get_config(config)
-
-    def __get_config(
-        self,
-        config: Config,
-    ) -> Config:
-
-        return {
-            'key_column': config['key_column'],
-            'regexes': {
-                k: [
-                    # Ensure they're all in Regex format
-                    # (as you can either pass in a list of Regex or a RegexDict,
-                    # which can be used to initialize a Regex)
-                    c if isinstance(c, Regex) else Regex(**c)
-                    for c in v
-                ]
-                for k, v in config['regexes'].items()
-            }
-        }
+        self.__config = config
 
     def _validate_data_object(
         self,
         obj: DataObject
     ) -> None:
         # Pull out value of the 'key_column' attribute
-        key_column_value = obj.attributes.get(self.__config['key_column'])
+        key_column_value = obj.attributes.get(self.__config.key_column)
         if not key_column_value:
             return
 
         # Pull out relevant regex list based on this value: {[{'name': 'regex'}]}
-        regex_list = self.__config['regexes'].get(key_column_value)
+        regex_list = self.__config.regexes.get(key_column_value)
         if not regex_list:
             return
         self.__validate_attribute(obj, regex_list)
@@ -71,7 +52,7 @@ class RegexByValueValidator(Validator):
     def __validate_attribute(
         self,
         obj: DataObject,
-        regexes: list[Regex],
+        regexes: List[Regex],
     ) -> None:
         for r in regexes:
             attribute_name = r.key
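As with `RegexValidator`, callers now build a `Config` whose `regexes` dict is keyed by the value found in `key_column`. A hypothetical sketch follows; the column names and pattern are illustrative, not taken from the package.

# Hypothetical call-site sketch; 'GAL', 'RACK_OR_PLATE_ID' and the pattern are made-up examples.
from tol.validators.regex import Regex
from tol.validators.regex_by_value import RegexByValueValidator

validator = RegexByValueValidator(
    RegexByValueValidator.Config(
        key_column='GAL',
        regexes={
            # Applied only to objects whose GAL attribute equals this key.
            'SANGER INSTITUTE': [Regex(key='RACK_OR_PLATE_ID', regex=r'^RA\d+$')],
        },
    ),
)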
@@ -0,0 +1,60 @@
1
+ # SPDX-FileCopyrightText: 2025 Genome Research Ltd.
2
+ #
3
+ # SPDX-License-Identifier: MIT
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Dict
7
+
8
+ from tol.core import Validator
9
+ from tol.core.data_object import DataObject
10
+
11
+
12
+ class SpecimensHaveSameTaxonValidator(Validator):
13
+ """
14
+ Validates an incoming stream of `DataObject` instances.
15
+ For each data object (sample) not a SYMBIONT, it checks that
16
+ there are no samples with SPECIMEN_ID which has different TAXON_ID
17
+ """
18
+ @dataclass(slots=True, frozen=True, kw_only=True)
19
+ class Config:
20
+ taxon_id_field: str
21
+ symbiont_field: str
22
+ specimen_id_field: str
23
+
24
+ __slots__ = ['__config', '__seen']
25
+ __config: Config
26
+ __seen: Dict[str, str]
27
+
28
+ def __init__(self, config: Config) -> None:
29
+ super().__init__()
30
+ self.__seen = {}
31
+ self.__config = config
32
+
33
+ def _validate_data_object(self, obj: DataObject) -> None:
34
+ # Explaining the code concept using a standard example
35
+ # seen{}
36
+ # 1st Pass=> element['specimen_id'] = A
37
+ # element['taxon_id'] = AA
38
+ # seen{ A:AA }
39
+ # 2nd pass=> element['specimen_id'] = A
40
+ # element['taxon_id'] = AB
41
+ # AB != AA
42
+ # Flag error
43
+ # From Nithin :)
44
+
45
+ # Ensure the data object is not a SYMBIONT
46
+ if obj.attributes.get(self.__config.symbiont_field) != 'SYMBIONT':
47
+ specimen_id = obj.attributes.get(self.__config.specimen_id_field)
48
+ if specimen_id is None:
49
+ return
50
+ taxon_id = obj.attributes.get(self.__config.taxon_id_field)
51
+ if taxon_id is None:
52
+ return
53
+ if specimen_id in self.__seen and taxon_id != self.__seen[specimen_id]:
54
+ self.add_error(
55
+ object_id=obj.id,
56
+ detail='A non-symbiont must have a matching Specimen ID and Taxon ID',
57
+ field=self.__config.specimen_id_field,
58
+ )
59
+ if specimen_id not in self.__seen:
60
+ self.__seen[specimen_id] = taxon_id
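The validator keys its `__seen` cache on the configured specimen-ID column, so wiring it up only needs the three field names. A hypothetical configuration sketch follows; the manifest column names are assumptions, not taken from the package.

# Hypothetical configuration sketch; field names are illustrative.
from tol.validators.specimens_have_same_taxon import SpecimensHaveSameTaxonValidator

validator = SpecimensHaveSameTaxonValidator(
    SpecimensHaveSameTaxonValidator.Config(
        taxon_id_field='TAXON_ID',
        symbiont_field='SYMBIONT',
        specimen_id_field='SPECIMEN_ID',
    ),
)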
tol/validators/sts_fields.py ADDED
@@ -0,0 +1,88 @@
+# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
+#
+# SPDX-License-Identifier: MIT
+
+from dataclasses import dataclass
+from typing import List
+
+from tol.core import DataObject, DataSource
+from tol.core.validate import Validator
+from tol.sources.sts import sts
+
+
+class StsFieldsValidator(Validator):
+    """
+    Validates that a stream of `DataObject` instances
+    contains fields that observe the validations in STS
+    """
+    @dataclass(slots=True, frozen=True, kw_only=True)
+    class Config:
+        project_code: str
+
+    __slots__ = ['__config', '__datasource', '__fields']
+    __config: Config
+    __datasource: DataSource
+    __fields: List[str | int | float]
+
+    def __init__(
+        self,
+        config: Config,
+        datasource: DataSource = sts(),  # For testing
+    ) -> None:
+
+        super().__init__()
+
+        self.__config = config
+        self.__datasource = datasource
+        self.__fields = self.__initialize_fields_from_datasource()
+
+    def __initialize_fields_from_datasource(self) -> List[str | int | float]:
+        return {
+            field.get('data_input_key'): field
+            for field in self.__datasource.get_one(
+                'project', self.__config.project_code
+            ).template.get('data_fields', [])
+            if field.get('in_manifest')
+        }
+
+    def _validate_data_object(
+        self,
+        obj: DataObject
+    ) -> None:
+        for field in self.__fields.values():
+            # Get the value from the data object
+            field_value = obj.get_field_by_name(field.get('data_input_key'))
+            if field.get('mandatory_input') and (field_value is None or field_value == ''):
+                self.add_error(
+                    object_id=obj.id,
+                    detail=f'Field {field.get("data_input_key")} is required '
+                           f'for project {self.__config.project_code}',
+                    field=field.get('data_input_key'),
+                )
+            elif field.get('allowed_values') and field_value not in field.get('allowed_values'):
+                self.add_error(
+                    object_id=obj.id,
+                    detail=f'Field {field.get("data_input_key")} value '
+                           f'"{field_value}" not found in allowed values '
+                           f'{field.get("allowed_values")} for project '
+                           f'{self.__config.project_code}',
+                    field=field.get('data_input_key'),
+                )
+            elif field.get('min') and field_value < field.get('min'):
+                self.add_error(
+                    object_id=obj.id,
+                    detail=f'Field {field.get("data_input_key")} value '
+                           f'"{field_value}" is less than minimum value '
+                           f'"{field.get("min")}" for project '
+                           f'{self.__config.project_code}',
+                    field=field.get('data_input_key'),
+                )
+            elif field.get('max') and field_value > field.get('max'):
+                self.add_error(
+                    object_id=obj.id,
+                    detail=f'Field {field.get("data_input_key")} value '
+                           f'"{field_value}" is greater than maximum value '
+                           f'"{field.get("max")}" for project '
+                           f'{self.__config.project_code}',
+                    field=field.get('data_input_key'),
+                )
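Only the project code is configurable here; the field rules themselves are pulled from the STS project template at construction time, so instantiating with the default datasource contacts STS. A hypothetical sketch follows, with an invented project code.

# Hypothetical sketch; 'ToL' is an invented project code, and the default
# sts() datasource is used, so constructing this would query STS.
from tol.validators.sts_fields import StsFieldsValidator

validator = StsFieldsValidator(
    StsFieldsValidator.Config(project_code='ToL'),
)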
tol/validators/tolid.py ADDED
@@ -0,0 +1,110 @@
+# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
+#
+# SPDX-License-Identifier: MIT
+
+from dataclasses import dataclass
+from typing import Any, Dict
+
+from tol.core import DataObject, DataSource
+from tol.core import DataSourceError, DataSourceFilter
+from tol.core.validate import Validator
+from tol.sources.tolid import tolid
+
+
+class TolidValidator(Validator):
+    """
+    Validates that a stream of `DataObject` instances
+    contains unique Tol IDs.
+    """
+    @dataclass(slots=True, frozen=True, kw_only=True)
+    class Config:
+        species_id_field: str
+        specimen_id_field: str
+        error_ignore_field: str
+        error_ignore_value: str
+        warning_detail: str = 'Species not found in Tol ID source'
+
+    __slots__ = ['__config', '__datasource', '__cached_species_id', '__cached_tolids']
+    __config: Config
+    __datasource: DataSource
+    __cached_species_ids: Dict[str, Any]
+    __cached_tolids: Dict[str, Any]
+
+    def __init__(
+        self,
+        config: Config,
+        datasource=tolid(),
+    ) -> None:
+
+        super().__init__()
+
+        self.__config = config
+        self.__datasource = datasource
+        self.__cached_species_ids = {}
+        self.__cached_tolids = {}
+
+    def _validate_data_object(
+        self,
+        obj: DataObject
+    ) -> None:
+        self.__warning_on_species_not_in_tolid(obj=obj)
+        self.__error_on_specimen_id_and_taxon_not_matching_tolid(obj=obj)
+
+    def __warning_on_species_not_in_tolid(
+        self,
+        obj: DataObject,
+    ) -> None:
+
+        obj_species_id = obj.get_field_by_name(self.__config.species_id_field)
+        if self.__config.species_id_field in obj.attributes:
+            try:
+                if obj_species_id not in self.__cached_species_ids:
+                    self.__cached_species_ids[obj_species_id] = (
+                        self.__datasource.get_one('species', obj_species_id) is not None)
+
+            except DataSourceError as e:
+                if e.status_code == 404:
+                    self.__cached_species_ids[obj_species_id] = False
+
+            species_in_tolid = self.__cached_species_ids[obj_species_id]
+            if species_in_tolid is False:
+                self.add_warning(
+                    object_id=obj.id,
+                    detail=self.__config.warning_detail,
+                    field=self.__config.species_id_field,
+                )
+
+    def __error_on_specimen_id_and_taxon_not_matching_tolid(
+        self,
+        obj: DataObject,
+    ) -> None:
+
+        if (obj.get_field_by_name(self.__config.error_ignore_field) is
+                self.__config.error_ignore_value):
+            return
+
+        if self.__config.specimen_id_field in obj.attributes:
+            specimen_id = obj.get_field_by_name(self.__config.specimen_id_field)
+            if specimen_id not in self.__cached_tolids:
+                f = DataSourceFilter()
+                f.and_ = {'specimen_id': {'eq': {'value': specimen_id}}}
+                self.__cached_tolids[specimen_id] = list(self.__datasource.get_list(
+                    object_type='specimen',
+                    filters=f
+                ))
+
+            if (len(self.__cached_tolids[specimen_id]) == 0):
+                return
+            else:
+                taxons = set()
+                for tolid_ in self.__cached_tolids[specimen_id]:
+                    taxons.add(str(tolid_.species.id))
+
+                if str(obj.get_field_by_name(self.__config.species_id_field)) not in taxons:
+                    self.add_error(
+                        object_id=obj.id,
+                        detail=f'Specimen ID {specimen_id} does not match Taxon ID '
+                               f'{obj.get_field_by_name(self.__config.species_id_field)}'
+                               'in TolID source',
+                        field=[self.__config.specimen_id_field, self.__config.species_id_field]
+                    )
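Configuration names the manifest columns to check against the TolID service, plus a field/value pair that suppresses the specimen/taxon mismatch error. A hypothetical sketch follows; all field names and the ignore value are assumptions rather than values taken from the package.

# Hypothetical sketch; column names and the ignore value are illustrative,
# and the default tolid() datasource would be queried during validation.
from tol.validators.tolid import TolidValidator

validator = TolidValidator(
    TolidValidator.Config(
        species_id_field='TAXON_ID',
        specimen_id_field='SPECIMEN_ID',
        error_ignore_field='SYMBIONT',
        error_ignore_value='SYMBIONT',
    ),
)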
tol/validators/unique_values.py CHANGED
@@ -2,6 +2,9 @@
 #
 # SPDX-License-Identifier: MIT
 
+from dataclasses import dataclass
+from typing import Dict, List, Set
+
 from tol.core import DataObject
 from tol.core.validate import Validator
 
@@ -11,36 +14,69 @@ class UniqueValuesValidator(Validator):
     Validates that a stream of `DataObject` instances
     contains unique values for specified keys.
     """
+    @dataclass(slots=True, frozen=True, kw_only=True)
+    class Config:
+        unique_keys: List[List[str] | str]
+        detail: str = 'Value is not unique'
+        is_error: bool = True
+
+    __slots__ = ['__config', '__duplicates', '__existing_values']
+    __config: Config
+    __duplicates: Dict[str, List[str]]
+    __existing_values: Dict[str, Set]
 
     def __init__(
         self,
-        unique_keys: list[str],
-        *,
-        detail: str = 'Value is not unique',
-        is_error: bool = True,
+        config: Config
     ) -> None:
 
         super().__init__()
 
-        self.__keys = unique_keys
-        self.__detail = detail
-        self.__is_error = is_error
-        self.__duplicates: dict[str, list[str]] = {}
-        self.__existing_values: dict[str, set] = {key: set() for key in unique_keys}
+        self.__config = config
+        self.__duplicates = {}
+        self.__existing_values = {}
+        for key in self.__config.unique_keys:
+            if isinstance(key, str):
+                self.__existing_values[key] = set()
+            elif isinstance(key, list):
+                concat_key = '/'.join(key)
+                self.__existing_values[concat_key] = set()
 
     def _validate_data_object(
         self,
         obj: DataObject
     ) -> None:
 
-        for key in obj.attributes:
-            if key in self.__keys:
-                if obj.attributes[key] in self.__existing_values[key]:
-                    if key not in self.__duplicates:
-                        self.__duplicates[key] = []
-                    self.__duplicates[key].append(obj.attributes[key])
+        for unique_key in self.__config.unique_keys:
+            if isinstance(unique_key, list):
+                concat = ''
+                for key in unique_key:
+                    concat = concat + '/' + (str(obj.attributes[key]))
+                if concat in self.__existing_values['/'.join(unique_key)]:
+                    self.__duplicate_checks(
+                        key=key,
+                        value=concat
+                    )
+                else:
+                    self.__existing_values['/'.join(unique_key)].add(concat)
+
+            else:
+                if obj.attributes[unique_key] in self.__existing_values[unique_key]:
+                    self.__duplicate_checks(
+                        key=unique_key,
+                        value=obj.attributes[unique_key]
+                    )
                 else:
-                    self.__existing_values[key].add(obj.attributes[key])
+                    self.__existing_values[unique_key].add(obj.attributes[unique_key])
+
+    def __duplicate_checks(
+        self,
+        key: str,
+        value: str
+    ):
+        if key not in self.__duplicates:
+            self.__duplicates[key] = []
+        self.__duplicates[key].append(value)
 
     def _post_validation(
         self,
@@ -58,15 +94,15 @@ class UniqueValuesValidator(Validator):
         key: str,
     ) -> None:
 
-        if self.__is_error:
+        if self.__config.is_error:
             self.add_error(
                 object_id=obj.id,
-                detail=self.__detail,
+                detail=self.__config.detail,
                 field=key,
             )
         else:
             self.add_warning(
                 object_id=obj.id,
-                detail=self.__detail,
+                detail=self.__config.detail,
                 field=key,
            )
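`unique_keys` now accepts either a single column name or a list of column names; a list is treated as one composite key whose values are joined with '/'. A hypothetical sketch follows; the column names are illustrative, not taken from the package.

# Hypothetical sketch showing a single-column key and a composite key.
from tol.validators.unique_values import UniqueValuesValidator

validator = UniqueValuesValidator(
    UniqueValuesValidator.Config(
        unique_keys=[
            'TUBE_OR_WELL_ID',                         # each value must be unique on its own
            ['RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID'],   # the pair must be unique together
        ],
    ),
)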
tol/validators/unique_whole_organisms.py ADDED
@@ -0,0 +1,109 @@
+# SPDX-FileCopyrightText: 2025 Genome Research Ltd.
+#
+# SPDX-License-Identifier: MIT
+
+from dataclasses import dataclass
+from typing import List
+
+from tol.core import Validator
+from tol.core.data_object import DataObject
+
+
+class UniqueWholeOrganismsValidator(Validator):
+    """
+    Validates an incoming stream of `DataObject` instances.
+    For each data object (sample) not a SYMBIONT, it checks:
+    1. There are no two samples with organism part WHOLE_ORGANISM with the same SPECIMEN_ID
+    2. There are no samples with organism part *not* WHOLE_ORGANISM that have a SPECIMEN_ID
+       the same as a WHOLE_ORGANISM in the manifest.
+    """
+    @dataclass(slots=True, frozen=True, kw_only=True)
+    class Config:
+        symbiont_field: str
+        organism_part_field: str
+        specimen_id_field: str
+
+    __slots__ = ['__config', '__whole_organisms', '__part_organisms']
+    __config: Config
+    __whole_organisms: List[str]
+    __part_organisms: List[str]
+
+    def __init__(self, config: Config) -> None:
+        super().__init__()
+        self.__whole_organisms = []
+        self.__part_organisms = []
+        self.__config = config
+
+    def _validate_data_object(self, obj: DataObject) -> None:
+        # This function uses a bit of a confusing method for its validation, so I'm going to
+        # leave an explanation here as to how it works for anyone who needs to modify it
+        # in the future!
+        #
+        # In the original code to be adapted, two loops were used. The first looped over each
+        # data object whose ORGANISM_PART was 'WHOLE_ORGANISM', adding them to a list. Before it
+        # did this though, it would check to see if the SPECIMEN_ID of this data object was already
+        # contained in said list (to ensure the specimen IDs were unique). In the second loop, the
+        # rest of the data objects (those whose ORGANISM PART was *not* 'WHOLE_ORGANISM') were
+        # looped over, each being checked to see if their SPECIMEN_ID was contained in the list
+        # (the one containing the whole organisms). In all, this ensures that all whole organisms
+        # have unique specimen IDs, and all part organisms do not share the specimen IDs of any of
+        # the whole organisms.
+        #
+        # The issue when adapting this to a Validator to be used in a pipeline, is that this
+        # function only takes in one data object at a time, via a generator (to save needing to
+        # load many into memory at once). The problem this left us with is that we could no longer
+        # achieve the same result by using two passes of the data, as only one pass was feasible.
+        #
+        # So here's what I ended up with. This validator stores the SPECIMEN_IDs of both all of the
+        # whole organisms *and* part organisms. From this, detecting duplicate whole organisms is
+        # the same, but detecting whether a part organism shared the SPECIMEN_ID of a whole
+        # organism now has two separate cases: when the data object passed into this function is
+        # a whole organism, or a part organism. In the case of it being a part organism, a very
+        # similar solution to before can be used: we simply check whether self.__whole_organisms
+        # conatins the same SPECIMEN_ID. However, for the case where the data object is a whole
+        # organism, effectively the inverse is done; it is the self.__part_organisms list that is
+        # checked. This covers all cases:
+        # 1. There are no duplicates, in which case there will never be a time when the same
+        #    SPECIMEN_ID will be in both lists.
+        # 2. A whole organism is checked, then a part organism with the same SPECIMEN_ID is
+        #    checked. In this case, self.__whole_organisms will contain the same SPECIMEN_ID,
+        #    so the duplicate is detected.
+        # 3. A part organism is checked, then a whole organism with the same SPECIMEN_ID is
+        #    checked. In this case, self.__part_organisms will contain the same SPECIMEN_ID,
+        #    so the duplicate is detected.
+        #
+        # From Thomas :)
+
+        # Ensure the data object is not a SYMBIONT, because organism part checks do not apply
+        if obj.attributes.get(self.__config.symbiont_field) != 'SYMBIONT':
+            specimen_id = obj.attributes.get(self.__config.specimen_id_field)
+            if specimen_id is None:
+                return
+
+            organism_part = obj.attributes.get(self.__config.organism_part_field)
+            if organism_part == 'WHOLE_ORGANISM':
+                if specimen_id in self.__whole_organisms:
+                    self.add_error(
+                        object_id=obj.id,
+                        detail='No two whole organisms can have the same Specimen ID',
+                        field=self.__config.specimen_id_field,
+                    )
+                if specimen_id in self.__part_organisms:
+                    self.add_error(
+                        object_id=obj.id,
+                        detail='A whole organism cannot have a Specimen ID already used for'
+                               'a non-whole organism',
+                        field=self.__config.specimen_id_field,
+                    )
+
+                self.__whole_organisms.append(specimen_id)
+            else:
+                if specimen_id in self.__whole_organisms:
+                    self.add_error(
+                        object_id=obj.id,
+                        detail='A non-whole organism cannot have a Specimen ID already used for'
+                               'a whole organism',
+                        field=self.__config.specimen_id_field,
+                    )
+
+                self.__part_organisms.append(specimen_id)
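The single-pass duplicate detection described in the comment above only needs the three column names. A hypothetical configuration sketch follows; the field names are illustrative, not taken from the package.

# Hypothetical configuration sketch; field names are illustrative.
from tol.validators.unique_whole_organisms import UniqueWholeOrganismsValidator

validator = UniqueWholeOrganismsValidator(
    UniqueWholeOrganismsValidator.Config(
        symbiont_field='SYMBIONT',
        organism_part_field='ORGANISM_PART',
        specimen_id_field='SPECIMEN_ID',
    ),
)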
{tol_sdk-1.7.3.dist-info → tol_sdk-1.7.5b0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tol-sdk
-Version: 1.7.3
+Version: 1.7.5b0
 Summary: SDK for interaction with ToL, Sanger and external services
 Author-email: ToL Platforms Team <tol-platforms@sanger.ac.uk>
 License: MIT