esgvoc 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of esgvoc might be problematic.
- esgvoc/api/__init__.py +15 -4
- esgvoc/api/data_descriptors/__init__.py +3 -0
- esgvoc/api/data_descriptors/directory_date.py +48 -0
- esgvoc/api/project_specs.py +82 -0
- esgvoc/api/projects.py +160 -130
- esgvoc/api/report.py +78 -50
- esgvoc/api/search.py +28 -10
- esgvoc/api/universe.py +17 -18
- esgvoc/apps/__init__.py +7 -0
- esgvoc/apps/drs/__init__.py +0 -16
- esgvoc/apps/drs/constants.py +2 -0
- esgvoc/apps/drs/generator.py +424 -0
- esgvoc/apps/drs/report.py +401 -0
- esgvoc/apps/drs/validator.py +332 -0
- esgvoc/cli/config.py +3 -0
- esgvoc/cli/drs.py +238 -0
- esgvoc/cli/get.py +1 -1
- esgvoc/cli/main.py +4 -3
- esgvoc/cli/status.py +13 -1
- esgvoc/cli/valid.py +1 -5
- esgvoc/core/db/models/mixins.py +7 -0
- esgvoc/core/db/models/project.py +3 -8
- esgvoc/core/db/project_ingestion.py +4 -1
- esgvoc/core/db/universe_ingestion.py +3 -3
- esgvoc/core/service/settings.py +17 -8
- esgvoc/core/service/settings.toml +11 -6
- esgvoc/core/service/settings_default.toml +11 -14
- esgvoc/core/service/state.py +19 -12
- esgvoc-0.2.1.dist-info/METADATA +58 -0
- {esgvoc-0.1.2.dist-info → esgvoc-0.2.1.dist-info}/RECORD +33 -26
- esgvoc-0.2.1.dist-info/licenses/LICENSE.txt +519 -0
- esgvoc/apps/drs/models.py +0 -43
- esgvoc/apps/drs/parser.py +0 -27
- esgvoc-0.1.2.dist-info/METADATA +0 -54
- {esgvoc-0.1.2.dist-info → esgvoc-0.2.1.dist-info}/WHEEL +0 -0
- {esgvoc-0.1.2.dist-info → esgvoc-0.2.1.dist-info}/entry_points.txt +0 -0
esgvoc/api/search.py
CHANGED
@@ -1,4 +1,3 @@
-from dataclasses import dataclass
 from enum import Enum
 
 from pydantic import BaseModel
@@ -6,30 +5,49 @@ from sqlalchemy import ColumnElement, func
 from sqlmodel import col
 
 
-
-
+class MatchingTerm(BaseModel):
+    """
+    Place holder for a term that matches a value (term validation).
+    """
     project_id: str
+    """The project id to which the term belongs."""
     collection_id: str
+    """The collection id to which the term belongs."""
     term_id: str
+    """The term id."""
 
 
 class SearchType(Enum):
-
-
-
+    """
+    The search types used for to find terms.
+    """
+    EXACT = "exact"
+    """Performs exact match."""
+    LIKE = "like" # can interpret %
+    """As SQL operator, it can interpret % as a wildcard."""
+    STARTS_WITH = "starts_with" # can interpret %
+    """Prefix based search."""
     ENDS_WITH = "ends_with" # can interpret %
-
+    """Suffix based search."""
+    REGEX = "regex"
+    """Search based on regex."""
 
 
 class SearchSettings(BaseModel):
+    """
+    Search configuration.
+    """
     type: SearchType = SearchType.EXACT
+    """The type of search."""
     case_sensitive: bool = True
+    """Enable case sensitivity or not."""
     not_operator: bool = False
+    """Give the opposite result like the NOT SQL operator."""
 
 
-def
-
-
+def _create_str_comparison_expression(field: str,
+                                      value: str,
+                                      settings: SearchSettings|None) -> ColumnElement:
     '''
     SQLite LIKE is case insensitive (and so STARTS/ENDS_WITH which are implemented with LIKE).
     So the case sensitive LIKE is implemented with REGEX.
esgvoc/api/universe.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Sequence
 
 from esgvoc.api._utils import (get_universe_session,
                                instantiate_pydantic_terms)
-from esgvoc.api.search import SearchSettings,
+from esgvoc.api.search import SearchSettings, _create_str_comparison_expression
 from esgvoc.core.db.models.universe import DataDescriptor, UTerm
 from pydantic import BaseModel
 from sqlmodel import Session, select
@@ -13,9 +13,9 @@ def _find_terms_in_data_descriptor(data_descriptor_id: str,
                                    session: Session,
                                    settings: SearchSettings|None) -> Sequence[UTerm]:
     """Settings only apply on the term_id comparison."""
-    where_expression =
-
-
+    where_expression = _create_str_comparison_expression(field=UTerm.id,
+                                                         value=term_id,
+                                                         settings=settings)
     statement = select(UTerm).join(DataDescriptor).where(DataDescriptor.id==data_descriptor_id,
                                                          where_expression)
     results = session.exec(statement)
@@ -39,9 +39,9 @@ def find_terms_in_data_descriptor(data_descriptor_id: str,
     returns an empty list.
 
     Behavior based on search type:
-
-
-
+    - `EXACT` and absence of `settings`: returns zero or one Pydantic term instance in the list.
+    - `REGEX`, `LIKE`, `STARTS_WITH` and `ENDS_WITH`: returns zero, one or more Pydantic term \
+    instances in the list.
 
     :param data_descriptor_id: A data descriptor id
     :type data_descriptor_id: str
@@ -49,8 +49,7 @@ def find_terms_in_data_descriptor(data_descriptor_id: str,
     :type term_id: str
     :param settings: The search settings
     :type settings: SearchSettings|None
-    :returns: A list of Pydantic model term instances.
-    Returns an empty list if no matches are found.
+    :returns: A list of Pydantic model term instances. Returns an empty list if no matches are found.
     :rtype: list[BaseModel]
     """
     result: list[BaseModel] = list()
@@ -63,9 +62,9 @@ def find_terms_in_data_descriptor(data_descriptor_id: str,
 def _find_terms_in_universe(term_id: str,
                             session: Session,
                             settings: SearchSettings|None) -> Sequence[UTerm]:
-    where_expression =
-
-
+    where_expression = _create_str_comparison_expression(field=UTerm.id,
+                                                         value=term_id,
+                                                         settings=settings)
     statement = select(UTerm).where(where_expression)
     results = session.exec(statement).all()
     return results
@@ -106,9 +105,9 @@ def _get_all_terms_in_data_descriptor(data_descriptor: DataDescriptor) -> list[B
 def _find_data_descriptors_in_universe(data_descriptor_id: str,
                                        session: Session,
                                        settings: SearchSettings|None) -> Sequence[DataDescriptor]:
-    where_expression =
-
-
+    where_expression = _create_str_comparison_expression(field=DataDescriptor.id,
+                                                         value=data_descriptor_id,
+                                                         settings=settings)
    statement = select(DataDescriptor).where(where_expression)
     results = session.exec(statement)
     result = results.all()
@@ -153,9 +152,9 @@ def find_data_descriptors_in_universe(data_descriptor_id: str,
     If the provided `data_descriptor_id` is not found, the function returns an empty list.
 
     Behavior based on search type:
-
-
-
+    - `EXACT` and absence of `settings`: returns zero or one data descriptor context in the list.
+    - `REGEX`, `LIKE`, `STARTS_WITH` and `ENDS_WITH`: returns zero, one or more \
+    data descriptor contexts in the list.
 
     :param data_descriptor_id: A data descriptor id to be found
     :type data_descriptor_id: str
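
The search settings drive both term and data descriptor lookups in this module. A hedged sketch of the two public entry points touched by this diff; the descriptor id, term pattern and regex are illustrative and a locally synchronized universe vocabulary is assumed:

# Hedged sketch of the public universe search functions shown in this diff.
# The ids and regexes are illustrative; a local copy of the universe vocabulary is assumed.
from esgvoc.api.search import SearchSettings, SearchType
from esgvoc.api.universe import (find_terms_in_data_descriptor,
                                 find_data_descriptors_in_universe)

settings = SearchSettings(type=SearchType.REGEX)
# Zero, one or more Pydantic term instances, per the docstring above.
terms = find_terms_in_data_descriptor("institution", term_id="^ips.*", settings=settings)
# Zero, one or more data descriptor contexts.
descriptors = find_data_descriptors_in_universe("^experi.*", settings=settings)
print(len(terms), len(descriptors))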
esgvoc/apps/__init__.py
ADDED
@@ -0,0 +1,7 @@
+
+from esgvoc.apps.drs.validator import DrsValidator
+from esgvoc.apps.drs.report import DrsValidationReport
+from esgvoc.apps.drs.generator import DrsGenerator
+from esgvoc.apps.drs.report import DrsGeneratorReport
+
+__all__ = ["DrsValidator", "DrsValidationReport", "DrsGenerator", "DrsGeneratorReport"]
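
The new package initializer re-exports the DRS application classes at esgvoc.apps. A hedged sketch of the generator through this shorter import path; only the DrsGenerator constructor and the report fields are taken from the generator diff below, and an installed 'cmip6plus' vocabulary is assumed:

# Hedged sketch of the new re-exports; assumes the 'cmip6plus' vocabulary is installed.
from esgvoc.apps import DrsGenerator, DrsGeneratorReport

generator = DrsGenerator('cmip6plus')
# An incomplete mapping: required collections that are missing end up in report.errors.
report: DrsGeneratorReport = generator.generate_dataset_id_from_mapping({'variable_id': 'od550aer'})
print(report.generated_drs_expression)
print(report.errors)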
esgvoc/apps/drs/__init__.py
CHANGED
@@ -1,16 +0,0 @@
-from esgvoc.apps.drs.models import (DrsType,
-                                    DrsPartType,
-                                    DrsConstant,
-                                    DrsCollection,
-                                    DrsPart,
-                                    DrsSpecification,
-                                    ProjectSpecs)
-
-
-__all__ = ["DrsType",
-           "DrsPartType",
-           "DrsConstant",
-           "DrsCollection",
-           "DrsPart",
-           "DrsSpecification",
-           "ProjectSpecs"]
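
The DRS specification models no longer ship in esgvoc.apps.drs.models; the generator added below imports them from the new esgvoc.api.project_specs module (also added in this release). A sketch of the relocated imports, limited to the names the generator actually uses:

# Sketch of the relocated DRS specification models, as imported by generator.py below.
from esgvoc.api.project_specs import (DrsSpecification,
                                      DrsPartKind,
                                      DrsCollection,
                                      DrsConstant,
                                      DrsType)

print(DrsType.FILE_NAME)       # Used to decide whether to append the file name extension.
print(DrsPartKind.COLLECTION)  # Used to tell collection parts from constant parts.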
esgvoc/apps/drs/generator.py
ADDED
@@ -0,0 +1,424 @@
+from typing import cast, Iterable, Mapping, Any
+
+import esgvoc.api.projects as projects
+
+from esgvoc.api.project_specs import (DrsSpecification,
+                                      DrsPartKind,
+                                      DrsCollection,
+                                      DrsConstant,
+                                      DrsType)
+
+from esgvoc.apps.drs.validator import DrsApplication
+from esgvoc.apps.drs.report import (DrsGeneratorReport,
+                                    DrsIssue,
+                                    GeneratorIssue,
+                                    TooManyTokensCollection,
+                                    InvalidToken,
+                                    MissingToken,
+                                    ConflictingCollections,
+                                    AssignedToken)
+
+
+def _get_first_item(items: set[Any]) -> Any:
+    result = None
+    for result in items:
+        break
+    return result
+
+
+def _transform_set_and_sort(_set: set[Any]) -> list[Any]:
+    result = list(_set)
+    result.sort()
+    return result
+
+
+class DrsGenerator(DrsApplication):
+    """
+    Generate a directory, dataset id and file name expression specified by the given project from
+    a mapping of collection ids and tokens or an unordered bag of tokens.
+    """
+
+    def generate_directory_from_mapping(self, mapping: Mapping[str, str]) -> DrsGeneratorReport:
+        """
+        Generate a directory DRS expression from a mapping of collection ids and tokens.
+
+        :param mapping: A mapping of collection ids (keys) and tokens (values).
+        :type mapping: Mapping[str, str]
+        :returns: A generation report.
+        :rtype: DrsGeneratorReport
+        """
+        return self._generate_from_mapping(mapping, self.directory_specs)
+
+    def generate_directory_from_bag_of_tokens(self, tokens: Iterable[str]) -> DrsGeneratorReport:
+        """
+        Generate a directory DRS expression from an unordered bag of tokens.
+
+        :param tokens: An unordered bag of tokens.
+        :type tokens: Iterable[str]
+        :returns: A generation report.
+        :rtype: DrsGeneratorReport
+        """
+        return self._generate_from_bag_of_tokens(tokens, self.directory_specs)
+
+    def generate_dataset_id_from_mapping(self, mapping: Mapping[str, str]) -> DrsGeneratorReport:
+        """
+        Generate a dataset id DRS expression from a mapping of collection ids and tokens.
+
+        :param mapping: A mapping of collection ids (keys) and tokens (values).
+        :type mapping: Mapping[str, str]
+        :returns: A generation report.
+        :rtype: DrsGeneratorReport
+        """
+        return self._generate_from_mapping(mapping, self.dataset_id_specs)
+
+    def generate_dataset_id_from_bag_of_tokens(self, tokens: Iterable[str]) -> DrsGeneratorReport:
+        """
+        Generate a dataset id DRS expression from an unordered bag of tokens.
+
+        :param tokens: An unordered bag of tokens.
+        :type tokens: Iterable[str]
+        :returns: A generation report.
+        :rtype: DrsGeneratorReport
+        """
+        return self._generate_from_bag_of_tokens(tokens, self.dataset_id_specs)
+
+
+    def generate_file_name_from_mapping(self, mapping: Mapping[str, str]) -> DrsGeneratorReport:
+        """
+        Generate a file name DRS expression from a mapping of collection ids and tokens.
+        The file name extension is append automatically, according to the DRS specification,
+        so none of the tokens given must include the extension.
+
+        :param mapping: A mapping of collection ids (keys) and tokens (values).
+        :type mapping: Mapping[str, str]
+        :returns: A generation report.
+        :rtype: DrsGeneratorReport
+        """
+        report = self._generate_from_mapping(mapping, self.file_name_specs)
+        report.generated_drs_expression = report.generated_drs_expression + self._get_full_file_name_extension()
+        return report
+
+    def generate_file_name_from_bag_of_tokens(self, tokens: Iterable[str]) -> DrsGeneratorReport:
+        """
+        Generate a file name DRS expression from an unordered bag of tokens.
+        The file name extension is append automatically, according to the DRS specification,
+        so none of the tokens given must include the extension.
+
+        :param tokens: An unordered bag of tokens.
+        :type tokens: Iterable[str]
+        :returns: A generation report.
+        :rtype: DrsGeneratorReport
+        """
+        report = self._generate_from_bag_of_tokens(tokens, self.file_name_specs)
+        report.generated_drs_expression = report.generated_drs_expression + self._get_full_file_name_extension()
+        return report
+
+    def generate_from_mapping(self, mapping: Mapping[str, str],
+                              drs_type: DrsType|str) -> DrsGeneratorReport:
+        """
+        Generate a DRS expression from a mapping of collection ids and tokens.
+
+        :param mapping: A mapping of collection ids (keys) and tokens (values).
+        :type mapping: Mapping[str, str]
+        :param drs_type: The type of the given DRS expression (directory, file_name or dataset_id)
+        :type drs_type: DrsType|str
+        :returns: A generation report.
+        :rtype: DrsGeneratorReport
+        """
+        specs = self._get_specs(drs_type)
+        report = self._generate_from_mapping(mapping, specs)
+        if DrsType.FILE_NAME == drs_type:
+            report.generated_drs_expression = report.generated_drs_expression + self._get_full_file_name_extension()
+        return report
+
+    def generate_from_bag_of_tokens(self, tokens: Iterable[str], drs_type: DrsType|str) \
+            -> DrsGeneratorReport:
+        """
+        Generate a DRS expression from an unordered bag of tokens.
+
+        :param tokens: An unordered bag of tokens.
+        :type tokens: Iterable[str]
+        :param drs_type: The type of the given DRS expression (directory, file_name or dataset_id)
+        :type drs_type: DrsType|str
+        :returns: A generation report.
+        :rtype: DrsGeneratorReport
+        """
+        specs = self._get_specs(drs_type)
+        return self._generate_from_bag_of_tokens(tokens, specs)
+
+
+    def _generate_from_mapping(self, mapping: Mapping[str, str], specs: DrsSpecification) \
+            -> DrsGeneratorReport:
+        drs_expression, errors, warnings = self.__generate_from_mapping(mapping, specs, True)
+        if self.pedantic:
+            errors.extend(warnings)
+            warnings.clear()
+        return DrsGeneratorReport(project_id=self.project_id, type=specs.type,
+                                  given_mapping_or_bag_of_tokens=mapping,
+                                  mapping_used=mapping,
+                                  generated_drs_expression=drs_expression,
+                                  errors=cast(list[DrsIssue], errors),
+                                  warnings=cast(list[DrsIssue], warnings))
+
+    def __generate_from_mapping(self, mapping: Mapping[str, str],
+                                specs: DrsSpecification,
+                                has_to_valid_terms: bool)\
+            -> tuple[str, list[GeneratorIssue], list[GeneratorIssue]]:
+        errors: list[GeneratorIssue] = list()
+        warnings: list[GeneratorIssue] = list()
+        drs_expression = ""
+        part_position: int = 0
+        for part in specs.parts:
+            part_position += 1
+            if part.kind == DrsPartKind.COLLECTION:
+                collection_part = cast(DrsCollection, part)
+                collection_id = collection_part.collection_id
+                if collection_id in mapping:
+                    part_value = mapping[collection_id]
+                    if has_to_valid_terms:
+                        matching_terms = projects.valid_term_in_collection(part_value,
+                                                                           self.project_id,
+                                                                           collection_id)
+                        if not matching_terms:
+                            issue = InvalidToken(token=part_value,
+                                                 token_position=part_position,
+                                                 collection_id_or_constant_value=collection_id)
+                            errors.append(issue)
+                            part_value = DrsGeneratorReport.INVALID_TAG
+                else:
+                    other_issue = MissingToken(collection_id=collection_id,
+                                               collection_position=part_position)
+                    if collection_part.is_required:
+                        errors.append(other_issue)
+                        part_value = DrsGeneratorReport.MISSING_TAG
+                    else:
+                        warnings.append(other_issue)
+                        continue # The for loop.
+            else:
+                constant_part = cast(DrsConstant, part)
+                part_value = constant_part.value
+
+            drs_expression += part_value + specs.separator
+
+        drs_expression = drs_expression[0:len(drs_expression)-len(specs.separator)]
+        return drs_expression, errors, warnings
+
+    def _generate_from_bag_of_tokens(self, tokens: Iterable[str], specs: DrsSpecification) \
+            -> DrsGeneratorReport:
+        collection_tokens_mapping: dict[str, set[str]] = dict()
+        for token in tokens:
+            matching_terms = projects.valid_term_in_project(token, self.project_id)
+            for matching_term in matching_terms:
+                if matching_term.collection_id not in collection_tokens_mapping:
+                    collection_tokens_mapping[matching_term.collection_id] = set()
+                collection_tokens_mapping[matching_term.collection_id].add(token)
+        collection_tokens_mapping, warnings = DrsGenerator._resolve_conflicts(collection_tokens_mapping)
+        mapping, errors = DrsGenerator._check_collection_tokens_mapping(collection_tokens_mapping)
+        drs_expression, errs, warns = self.__generate_from_mapping(mapping, specs, False)
+        errors.extend(errs)
+        warnings.extend(warns)
+        if self.pedantic:
+            errors.extend(warnings)
+            warnings.clear()
+        return DrsGeneratorReport(project_id=self.project_id, type=specs.type,
+                                  given_mapping_or_bag_of_tokens=tokens,
+                                  mapping_used=mapping,generated_drs_expression=drs_expression,
+                                  errors=cast(list[DrsIssue], errors),
+                                  warnings=cast(list[DrsIssue], warnings))
+
+    @staticmethod
+    def _resolve_conflicts(collection_tokens_mapping: dict[str, set[str]]) \
+            -> tuple[dict[str, set[str]], list[GeneratorIssue]]:
+        warnings: list[GeneratorIssue] = list()
+        conflicting_collection_ids_list: list[list[str]] = list()
+        collection_ids: list[str] = list(collection_tokens_mapping.keys())
+        len_collection_ids: int = len(collection_ids)
+
+        for l_collection_index in range(0, len_collection_ids - 1):
+            conflicting_collection_ids: list[str] = list()
+            for r_collection_index in range(l_collection_index + 1, len_collection_ids):
+                if collection_tokens_mapping[collection_ids[l_collection_index]].isdisjoint \
+                   (collection_tokens_mapping[collection_ids[r_collection_index]]):
+                    continue
+                else:
+                    not_registered = True
+                    for cc_ids in conflicting_collection_ids_list:
+                        if collection_ids[l_collection_index] in cc_ids and \
+                           collection_ids[r_collection_index] in cc_ids:
+                            not_registered = False
+                            break
+                    if not_registered:
+                        conflicting_collection_ids.append(collection_ids[r_collection_index])
+            if conflicting_collection_ids:
+                conflicting_collection_ids.append(collection_ids[l_collection_index])
+                conflicting_collection_ids_list.append(conflicting_collection_ids)
+
+        # Each time a collection is resolved, we must restart the loop so as to check if others can be,
+        # until no progress is made.
+        while True:
+            # 1. Non-conflicting collections with only one token are assigned.
+            # Non-conflicting collections with more than one token will be raise an error
+            # in the _check method.
+
+            # Nothing to do.
+
+            # 2a. Collections with one token that are conflicting to each other will raise an error.
+            # We don't search for collection with more than one token which token sets are exactly
+            # the same, because we cannot choose which token will be removed in 2b.
+            # So stick with one token collections: those collection will be detected in method _check.
+            collection_ids_with_len_eq_1_list: list[list[str]] = list()
+            for collection_ids in conflicting_collection_ids_list:
+                tmp_conflicting_collection_ids: list[str] = list()
+                for collection_id in collection_ids:
+                    if len(collection_tokens_mapping[collection_id]) == 1:
+                        tmp_conflicting_collection_ids.append(collection_id)
+                if len(tmp_conflicting_collection_ids) > 1:
+                    collection_ids_with_len_eq_1_list.append(tmp_conflicting_collection_ids)
+            # 2b. As it is not possible to resolve collections sharing the same unique token:
+            # raise errors, remove the faulty collections and their token.
+            if collection_ids_with_len_eq_1_list:
+                for collection_ids_to_be_removed in collection_ids_with_len_eq_1_list:
+                    DrsGenerator._remove_ids_from_conflicts(conflicting_collection_ids_list,
+                                                            collection_ids_to_be_removed)
+                    DrsGenerator._remove_token_from_other_token_sets(collection_tokens_mapping,
+                                                                     collection_ids_to_be_removed)
+                # Every time conflicting_collection_ids_list is modified, we must restart the loop,
+                # as conflicting collections may be resolved.
+                continue
+
+            # 3.a For each collections with only one token, assign their token to the detriment of
+            # collections with more than one token.
+            wining_collection_ids: list[str] = list()
+            for collection_ids in conflicting_collection_ids_list:
+                for collection_id in collection_ids:
+                    if len(collection_tokens_mapping[collection_id]) == 1:
+                        wining_collection_ids.append(collection_id)
+                        token = _get_first_item(collection_tokens_mapping[collection_id])
+                        issue = AssignedToken(collection_id=collection_id, token=token)
+                        warnings.append(issue)
+            # 3.b Update conflicting collections.
+            if wining_collection_ids:
+                DrsGenerator._remove_ids_from_conflicts(conflicting_collection_ids_list,
+                                                        wining_collection_ids)
+                DrsGenerator._remove_token_from_other_token_sets(collection_tokens_mapping,
+                                                                 wining_collection_ids)
+                # Every time conflicting_collection_ids_list is modified, we must restart the loop,
+                # as conflicting collections may be resolved.
+                continue
+
+            # 4.a For each token set of the remaining conflicting collections, compute their difference.
+            # If the difference is one token, this token is assigned to the collection that owns it.
+            wining_id_and_token_pairs: list[tuple[str, str]] = list()
+            for collection_ids in conflicting_collection_ids_list:
+                for collection_index in range(0, len(collection_ids)):
+                    diff: set[str] = collection_tokens_mapping[collection_ids[collection_index]]\
+                        .difference(
+                            *[collection_tokens_mapping[index]
+                              for index in collection_ids[collection_index + 1 :] +\
+                              collection_ids[:collection_index]
+                              ]
+                        )
+                    if len(diff) == 1:
+                        wining_id_and_token_pairs.append((collection_ids[collection_index],
+                                                          _get_first_item(diff)))
+            # 4.b Update conflicting collections.
+            if wining_id_and_token_pairs:
+                wining_collection_ids = list()
+                for collection_id, token in wining_id_and_token_pairs:
+                    wining_collection_ids.append(collection_id)
+                    collection_tokens_mapping[collection_id].clear()
+                    collection_tokens_mapping[collection_id].add(token)
+                    issue = AssignedToken(collection_id=collection_id, token=token)
+                    warnings.append(issue)
+                DrsGenerator._remove_ids_from_conflicts(conflicting_collection_ids_list,
+                                                        wining_collection_ids)
+                DrsGenerator._remove_token_from_other_token_sets(collection_tokens_mapping,
+                                                                 wining_collection_ids)
+                continue
+            else:
+                break # Stop the loop when no progress is made.
+        return collection_tokens_mapping, warnings
+
+    @staticmethod
+    def _check_collection_tokens_mapping(collection_tokens_mapping: dict[str, set[str]]) \
+            -> tuple[dict[str, str], list[GeneratorIssue]]:
+        errors: list[GeneratorIssue] = list()
+        # 1. Looking for collections that share strictly the same token(s).
+        collection_ids: list[str] = list(collection_tokens_mapping.keys())
+        len_collection_ids: int = len(collection_ids)
+        faulty_collections_list: list[set[str]] = list()
+        for l_collection_index in range(0, len_collection_ids - 1):
+            l_collection_id = collection_ids[l_collection_index]
+            l_token_set = collection_tokens_mapping[l_collection_id]
+            for r_collection_index in range(l_collection_index + 1, len_collection_ids):
+                r_collection_id = collection_ids[r_collection_index]
+                r_token_set = collection_tokens_mapping[r_collection_id]
+                # check if the set is empty because the difference will always be an empty set!
+                if l_token_set and (not l_token_set.difference(r_token_set)):
+                    not_registered = True
+                    for faulty_collections in faulty_collections_list:
+                        if l_collection_id in faulty_collections or \
+                           r_collection_id in faulty_collections:
+                            faulty_collections.add(l_collection_id)
+                            faulty_collections.add(r_collection_id)
+                            not_registered = False
+                            break
+                    if not_registered:
+                        faulty_collections_list.append({l_collection_id, r_collection_id})
+        for faulty_collections in faulty_collections_list:
+            tokens = collection_tokens_mapping[_get_first_item(faulty_collections)]
+            issue = ConflictingCollections(collection_ids=_transform_set_and_sort(faulty_collections),
+                                           tokens=_transform_set_and_sort(tokens))
+            errors.append(issue)
+            for collection_id in faulty_collections:
+                del collection_tokens_mapping[collection_id]
+
+        # 2. Looking for collections with more than one token.
+        result: dict[str, str] = dict()
+        for collection_id, token_set in collection_tokens_mapping.items():
+            len_token_set = len(token_set)
+            if len_token_set == 1:
+                result[collection_id] = _get_first_item(token_set)
+            elif len_token_set > 1:
+                other_issue = TooManyTokensCollection(collection_id=collection_id,
+                                                      tokens=_transform_set_and_sort(token_set))
+                errors.append(other_issue)
+            #else: Don't add emptied collection to the result.
+        return result, errors
+
+    @staticmethod
+    def _remove_token_from_other_token_sets(collection_tokens_mapping: dict[str, set[str]],
+                                            collection_ids_to_be_removed: list[str]) -> None:
+        for collection_id_to_be_removed in collection_ids_to_be_removed:
+            # Should only be one token.
+            token_to_be_removed: str = _get_first_item(collection_tokens_mapping[collection_id_to_be_removed])
+            for collection_id in collection_tokens_mapping.keys():
+                if (collection_id not in collection_ids_to_be_removed):
+                    collection_tokens_mapping[collection_id].discard(token_to_be_removed)
+
+    @staticmethod
+    def _remove_ids_from_conflicts(conflicting_collection_ids_list: list[list[str]],
+                                   collection_ids_to_be_removed: list[str]) -> None:
+        for collection_id_to_be_removed in collection_ids_to_be_removed:
+            for conflicting_collection_ids in conflicting_collection_ids_list:
+                if collection_id_to_be_removed in conflicting_collection_ids:
+                    conflicting_collection_ids.remove(collection_id_to_be_removed)
+
+
+if __name__ == "__main__":
+    project_id = 'cmip6plus'
+    generator = DrsGenerator(project_id)
+    mapping = \
+        {
+            'member_id': 'r2i2p1f2',
+            'activity_id': 'CMIP',
+            'source_id': 'MIROC6',
+            'mip_era': 'CMIP6Plus',
+            'experiment_id': 'amip',
+            'variable_id': 'od550aer',
+            'table_id': 'ACmon',
+            'grid_label': 'gn',
+            'institution_id': 'IPSL',
+        }
+    report = generator.generate_file_name_from_mapping(mapping)
+    print(report.warnings)