esgvoc 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of esgvoc might be problematic. Click here for more details.
- esgvoc/__init__.py +1 -1
- esgvoc/api/__init__.py +0 -6
- esgvoc/api/data_descriptors/__init__.py +8 -0
- esgvoc/api/data_descriptors/archive.py +5 -0
- esgvoc/api/data_descriptors/citation_url.py +5 -0
- esgvoc/api/data_descriptors/experiment.py +2 -2
- esgvoc/api/data_descriptors/known_branded_variable.py +58 -5
- esgvoc/api/data_descriptors/member_id.py +9 -0
- esgvoc/api/data_descriptors/regex.py +5 -0
- esgvoc/api/data_descriptors/vertical_label.py +2 -2
- esgvoc/api/project_specs.py +48 -130
- esgvoc/api/projects.py +185 -66
- esgvoc/apps/drs/generator.py +103 -85
- esgvoc/apps/drs/validator.py +22 -38
- esgvoc/apps/jsg/json_schema_generator.py +255 -130
- esgvoc/apps/jsg/templates/template.jinja +249 -0
- esgvoc/apps/test_cv/README.md +214 -0
- esgvoc/apps/test_cv/cv_tester.py +1368 -0
- esgvoc/apps/test_cv/example_usage.py +216 -0
- esgvoc/apps/vr/__init__.py +12 -0
- esgvoc/apps/vr/build_variable_registry.py +71 -0
- esgvoc/apps/vr/example_usage.py +60 -0
- esgvoc/apps/vr/vr_app.py +333 -0
- esgvoc/cli/config.py +671 -86
- esgvoc/cli/drs.py +39 -21
- esgvoc/cli/main.py +2 -0
- esgvoc/cli/test_cv.py +257 -0
- esgvoc/core/constants.py +10 -7
- esgvoc/core/data_handler.py +24 -22
- esgvoc/core/db/connection.py +7 -0
- esgvoc/core/db/project_ingestion.py +34 -9
- esgvoc/core/db/universe_ingestion.py +1 -2
- esgvoc/core/service/configuration/setting.py +192 -21
- esgvoc/core/service/data_merger.py +1 -1
- esgvoc/core/service/state.py +18 -2
- {esgvoc-1.0.0.dist-info → esgvoc-1.1.1.dist-info}/METADATA +2 -3
- {esgvoc-1.0.0.dist-info → esgvoc-1.1.1.dist-info}/RECORD +41 -30
- esgvoc/apps/jsg/cmip6_template.json +0 -74
- esgvoc/apps/jsg/cmip6plus_template.json +0 -74
- /esgvoc/apps/{py.typed → test_cv/__init__.py} +0 -0
- {esgvoc-1.0.0.dist-info → esgvoc-1.1.1.dist-info}/WHEEL +0 -0
- {esgvoc-1.0.0.dist-info → esgvoc-1.1.1.dist-info}/entry_points.txt +0 -0
- {esgvoc-1.0.0.dist-info → esgvoc-1.1.1.dist-info}/licenses/LICENSE.txt +0 -0
esgvoc/api/projects.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
|
+
import itertools
|
|
1
2
|
import re
|
|
2
|
-
from typing import Iterable, Sequence
|
|
3
|
+
from typing import Iterable, Sequence, cast
|
|
3
4
|
|
|
4
5
|
from sqlalchemy import text
|
|
5
6
|
from sqlmodel import Session, and_, col, select
|
|
@@ -48,22 +49,36 @@ def _get_project_session_with_exception(project_id: str) -> Session:
|
|
|
48
49
|
raise EsgvocNotFoundError(f"unable to find project '{project_id}'")
|
|
49
50
|
|
|
50
51
|
|
|
51
|
-
def
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
52
|
+
def _resolve_composite_term_part(composite_term_part: dict,
                                 universe_session: Session,
                                 project_session: Session) -> UTerm | PTerm | Sequence[UTerm | PTerm]:
    """
    Resolve one part of a composite term specification.

    If the part carries a term id, the matching term is looked up first in the
    universe and then in the current project, and that single term is returned.
    If the part carries only a type, all the terms of the universe data descriptor
    of that type are returned or, failing that, all the terms of the project
    collection of that type.

    :param composite_term_part: The specification of the part (type and, optionally, term id).
    :type composite_term_part: dict
    :param universe_session: The session used for the universe lookups.
    :type universe_session: Session
    :param project_session: The session used for the project lookups.
    :type project_session: Session
    :returns: A single term when the part has a term id, otherwise a sequence of terms.
    :rtype: UTerm | PTerm | Sequence[UTerm | PTerm]
    :raises EsgvocNotFoundError: If nothing matches in the universe nor in the project.
    """
    part_type = composite_term_part[constants.TERM_TYPE_JSON_KEY]

    if constants.TERM_ID_JSON_KEY not in composite_term_part:
        # No term id: the part stands for every term of its type.
        data_descriptor = universe._get_data_descriptor_in_universe(part_type, universe_session)
        if data_descriptor is not None:
            return data_descriptor.terms
        collection = _get_collection_in_project(part_type, project_session)
        if collection is not None:
            return collection.terms
        msg = f"unable to find the terms of '{part_type}'"
        raise EsgvocNotFoundError(msg)

    # A term id is given: the universe takes precedence over the project.
    part_id = composite_term_part[constants.TERM_ID_JSON_KEY]
    found_term = universe._get_term_in_data_descriptor(data_descriptor_id=part_type,
                                                       term_id=part_id, session=universe_session)
    if not found_term:
        found_term = _get_term_in_collection(collection_id=part_type, term_id=part_id, session=project_session)
    if found_term:
        return found_term
    msg = f"unable to find the term '{part_id}' in '{part_type}'"
    raise EsgvocNotFoundError(msg)
|
|
67
82
|
|
|
68
83
|
|
|
69
84
|
def _get_composite_term_separator_parts(term: UTerm | PTerm) -> tuple[str, list]:
|
|
@@ -72,42 +87,82 @@ def _get_composite_term_separator_parts(term: UTerm | PTerm) -> tuple[str, list]
|
|
|
72
87
|
return separator, parts
|
|
73
88
|
|
|
74
89
|
|
|
75
|
-
# TODO: support optionality of parts of composite.
|
|
76
|
-
# It is backtrack possible for more than one missing parts.
|
|
77
90
|
def _get_composite_part_term_ids(part: dict) -> list:
    """
    Return the candidate term ids of a composite part, leaving the part untouched.

    :param part: The specification of the part.
    :type part: dict
    :returns: The ids listed by the part (a single id is wrapped in a list) or, when the part \
    lists none, the ids of all the terms of the universe data descriptor named by its type.
    :rtype: list
    """
    if "id" not in part:
        terms = universe.get_all_terms_in_data_descriptor(part["type"], None)
        return [term.id for term in terms]
    term_ids = part["id"]
    if isinstance(term_ids, str):
        return [term_ids]
    return term_ids


def _valid_value_composite_term_with_separator(
    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
) -> list[UniverseTermError | ProjectTermError]:
    """
    Validate a value against a composite term whose parts are joined by a separator.

    The value is split on the separator and the resulting pieces are assigned, in order,
    to the parts of the composite term. Parts not flagged ``is_required`` may be left out,
    so every in-order assignment that fills all the required parts is tried. The value is
    valid as soon as one assignment has all its pieces validated by their part.

    The specification of the parts is only read: the candidate term ids resolved for a part
    are kept in a local cache instead of being written back into the part.

    :param value: The value to validate.
    :type value: str
    :param term: The composite term.
    :type term: UTerm | PTerm
    :param universe_session: The session used for the universe lookups.
    :type universe_session: Session
    :param project_session: The session used for the project lookups.
    :type project_session: Session
    :returns: An empty list if the value is valid, otherwise a list with one term error.
    :rtype: list[UniverseTermError | ProjectTermError]
    """
    separator, parts = _get_composite_term_separator_parts(term)
    nb_parts = len(parts)
    required_indices = {i for i, p in enumerate(parts) if p.get("is_required", False)}

    splits = value.split(separator)
    nb_splits = len(splits)
    if nb_splits > nb_parts:
        return [_create_term_error(value, term)]

    # These only depend on the value: compute them once, not once per assignment.
    starts_with_separator = value.startswith(separator)
    ends_with_separator = value.endswith(separator)
    has_double_separator = separator * 2 in value

    # Candidate term ids per part index, resolved at most once per call.
    term_ids_by_part: dict[int, list] = dict()

    # Each combination is an increasing tuple of part indices: the i-th split goes to the i-th index.
    for positions in itertools.combinations(range(nb_parts), nb_splits):
        if not required_indices.issubset(positions):
            continue  # A required part would be left without a value.

        candidate: list[str | None] = [None] * nb_parts
        for split_value, position in zip(splits, positions):
            candidate[position] = split_value

        # Separator structure validation:
        # - No leading separator if the first part is None
        # - No trailing separator if the last part is None
        # - No double separators where two adjacent optional parts are missing
        if candidate[0] is None and starts_with_separator:
            continue
        if candidate[-1] is None and ends_with_separator:
            continue
        if has_double_separator and any(
            left is None and right is None for left, right in itertools.pairwise(candidate)
        ):
            continue

        # Validate each filled part value. A missing part is necessarily optional here,
        # as the combinations that skip a required part have been discarded above.
        for index, given_value in enumerate(candidate):
            if given_value is None:
                continue
            part = parts[index]
            if index not in term_ids_by_part:
                term_ids_by_part[index] = _get_composite_part_term_ids(part)

            # Try all possible term ids to find a valid match.
            valid_for_this_part = False
            for term_id in term_ids_by_part[index]:
                part_copy = dict(part)
                part_copy["id"] = term_id
                resolved_term = _resolve_composite_term_part(part_copy, universe_session, project_session)
                # resolved_term can't be a list of terms here (the part copy has a term id).
                resolved_term = cast(UTerm | PTerm, resolved_term)
                if not _valid_value(given_value, resolved_term, universe_session, project_session):
                    valid_for_this_part = True
                    break
            if not valid_for_this_part:
                break  # This assignment fails, try the next one.
        else:
            # The loop over the parts ended without a failure.
            return []  # At least one valid combination found

    return [_create_term_error(value, term)]  # No valid combination found
|
|
111
166
|
|
|
112
167
|
|
|
113
168
|
def _transform_to_pattern(term: UTerm | PTerm, universe_session: Session, project_session: Session) -> str:
|
|
@@ -123,8 +178,13 @@ def _transform_to_pattern(term: UTerm | PTerm, universe_session: Session, projec
|
|
|
123
178
|
separator, parts = _get_composite_term_separator_parts(term)
|
|
124
179
|
result = ""
|
|
125
180
|
for part in parts:
|
|
126
|
-
resolved_term =
|
|
127
|
-
|
|
181
|
+
resolved_term = _resolve_composite_term_part(part, universe_session, project_session)
|
|
182
|
+
if isinstance(resolved_term, Sequence):
|
|
183
|
+
pattern = ""
|
|
184
|
+
for r_term in resolved_term:
|
|
185
|
+
pattern += _transform_to_pattern(r_term, universe_session, project_session)
|
|
186
|
+
else:
|
|
187
|
+
pattern = _transform_to_pattern(resolved_term, universe_session, project_session)
|
|
128
188
|
result = f"{result}{pattern}{separator}"
|
|
129
189
|
result = result.rstrip(separator)
|
|
130
190
|
case _:
|
|
@@ -452,7 +512,52 @@ def get_all_terms_in_collection(
|
|
|
452
512
|
def _get_all_collections_in_project(session: Session) -> list[PCollection]:
    """
    Return all the collections of the project bound to the given session.

    If loading the collections fails, the ``pcollections`` table is inspected with raw SQL
    to report the collections whose ``term_kind`` is empty or NULL.

    :param session: The session of the project database.
    :type session: Session
    :returns: The collections of the project.
    :rtype: list[PCollection]
    :raises ValueError: If loading the collections failed and collections with an empty or \
    NULL term_kind were found (chained to the original error).
    :raises Exception: The original error, when the inspection finds nothing or fails itself.
    """
    project = session.get(Project, constants.SQLITE_FIRST_PK)
    # Project can't be missing if session exists.
    try:
        return project.collections  # type: ignore
    except Exception as e:
        # Enhanced error context for collection retrieval failures
        import logging
        logger = logging.getLogger(__name__)
        # getattr: the handler must not fail on its own if project is unexpectedly None.
        project_id = getattr(project, "id", None)
        logger.error(f"Failed to retrieve collections for project '{project_id}': {str(e)}")

        # Use raw SQL to inspect collections without Pydantic validation
        problematic_collections = []
        try:
            # Query raw data to identify problematic collections
            raw_query = text("""
                SELECT id, term_kind, data_descriptor_id
                FROM pcollections
                WHERE project_pk = :project_pk
            """)
            result = session.execute(raw_query, {"project_pk": project.pk})

            for collection_id, term_kind_value, data_descriptor_id in result:
                # An empty or NULL term_kind is invalid - indicates ingestion couldn't determine termkind
                if term_kind_value == '' or term_kind_value is None:
                    problematic_collections.append((collection_id, term_kind_value, data_descriptor_id))
                    msg = f"Collection '{collection_id}' has empty term_kind (data_descriptor: " + \
                          f"{data_descriptor_id}) - CV ingestion failed to determine termkind"
                    logger.error(msg)
        except Exception as inner_e:
            logger.error(f"Failed to analyze problematic collections using raw SQL: {inner_e}")
            raise e

        # Raised outside of the try block above, otherwise its broad except clause
        # would swallow this diagnostic error.
        if problematic_collections:
            error_details = [
                f"  • Collection '{col_id}' (data_descriptor: {data_desc}): EMPTY termkind"
                for col_id, _, data_desc in problematic_collections
            ]
            error_msg = (
                f"Found {len(problematic_collections)} collections with empty term_kind:\n" +
                "\n".join(error_details)
            )
            raise ValueError(error_msg) from e

        raise e
|
|
456
561
|
|
|
457
562
|
|
|
458
563
|
def get_all_collections_in_project(project_id: str) -> list[str]:
|
|
@@ -469,10 +574,24 @@ def get_all_collections_in_project(project_id: str) -> list[str]:
|
|
|
469
574
|
"""
|
|
470
575
|
result = list()
|
|
471
576
|
if connection := _get_project_connection(project_id):
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
577
|
+
try:
|
|
578
|
+
with connection.create_session() as session:
|
|
579
|
+
collections = _get_all_collections_in_project(session)
|
|
580
|
+
for collection in collections:
|
|
581
|
+
result.append(collection.id)
|
|
582
|
+
except Exception as e:
|
|
583
|
+
# Enhanced error context for project collection retrieval
|
|
584
|
+
import logging
|
|
585
|
+
logger = logging.getLogger(__name__)
|
|
586
|
+
logger.error(f"Failed to get collections for project '{project_id}': {str(e)}")
|
|
587
|
+
|
|
588
|
+
# Re-raise with enhanced context
|
|
589
|
+
raise ValueError(
|
|
590
|
+
f"Failed to retrieve collections for project '{project_id}'. "
|
|
591
|
+
f"This may be due to invalid termkind values in the database. "
|
|
592
|
+
f"Check the project database for collections with empty or invalid termkind values. "
|
|
593
|
+
f"Original error: {str(e)}"
|
|
594
|
+
) from e
|
|
476
595
|
return result
|
|
477
596
|
|
|
478
597
|
|
|
@@ -1113,16 +1232,16 @@ def find_items_in_project(
|
|
|
1113
1232
|
collection_column = col(PCollectionFTS5.id) # TODO: use specs when implemented!
|
|
1114
1233
|
term_column = col(PTermFTS5.specs) # type: ignore
|
|
1115
1234
|
collection_where_condition = collection_column.match(processed_expression)
|
|
1116
|
-
collection_statement = select(
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
text('rank')).where(collection_where_condition)
|
|
1235
|
+
collection_statement = select(
|
|
1236
|
+
PCollectionFTS5.id, text("'collection' AS TYPE"), text(f"'{project_id}' AS TYPE"), text("rank")
|
|
1237
|
+
).where(collection_where_condition)
|
|
1120
1238
|
term_where_condition = term_column.match(processed_expression)
|
|
1121
|
-
term_statement =
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
result = execute_find_item_statements(
|
|
1127
|
-
|
|
1239
|
+
term_statement = (
|
|
1240
|
+
select(PTermFTS5.id, text("'term' AS TYPE"), PCollection.id, text("rank"))
|
|
1241
|
+
.join(PCollection)
|
|
1242
|
+
.where(term_where_condition)
|
|
1243
|
+
)
|
|
1244
|
+
result = execute_find_item_statements(
|
|
1245
|
+
session, processed_expression, collection_statement, term_statement, limit, offset
|
|
1246
|
+
)
|
|
1128
1247
|
return result
|
esgvoc/apps/drs/generator.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from typing import Any, Iterable, Mapping, cast
|
|
2
2
|
|
|
3
3
|
import esgvoc.api.projects as projects
|
|
4
|
-
from esgvoc.api.project_specs import
|
|
4
|
+
from esgvoc.api.project_specs import DrsSpecification, DrsType
|
|
5
|
+
from esgvoc.api.search import MatchingTerm
|
|
5
6
|
from esgvoc.apps.drs.report import (
|
|
6
7
|
AssignedTerm,
|
|
7
8
|
ConflictingCollections,
|
|
@@ -92,8 +93,7 @@ class DrsGenerator(DrsApplication):
|
|
|
92
93
|
:rtype: DrsGeneratorReport
|
|
93
94
|
"""
|
|
94
95
|
report = self._generate_from_mapping(mapping, self.file_name_specs)
|
|
95
|
-
report.generated_drs_expression = report.generated_drs_expression +
|
|
96
|
-
self._get_full_file_name_extension() # noqa E127
|
|
96
|
+
report.generated_drs_expression = report.generated_drs_expression + self._get_full_file_name_extension() # noqa E127
|
|
97
97
|
return report
|
|
98
98
|
|
|
99
99
|
def generate_file_name_from_bag_of_terms(self, terms: Iterable[str]) -> DrsGenerationReport:
|
|
@@ -108,12 +108,10 @@ class DrsGenerator(DrsApplication):
|
|
|
108
108
|
:rtype: DrsGeneratorReport
|
|
109
109
|
"""
|
|
110
110
|
report = self._generate_from_bag_of_terms(terms, self.file_name_specs)
|
|
111
|
-
report.generated_drs_expression = report.generated_drs_expression +
|
|
112
|
-
self._get_full_file_name_extension() # noqa E127
|
|
111
|
+
report.generated_drs_expression = report.generated_drs_expression + self._get_full_file_name_extension() # noqa E127
|
|
113
112
|
return report
|
|
114
113
|
|
|
115
|
-
def generate_from_mapping(self, mapping: Mapping[str, str],
|
|
116
|
-
drs_type: DrsType | str) -> DrsGenerationReport:
|
|
114
|
+
def generate_from_mapping(self, mapping: Mapping[str, str], drs_type: DrsType | str) -> DrsGenerationReport:
|
|
117
115
|
"""
|
|
118
116
|
Generate a DRS expression from a mapping of collection ids and terms.
|
|
119
117
|
|
|
@@ -134,8 +132,7 @@ class DrsGenerator(DrsApplication):
|
|
|
134
132
|
case _:
|
|
135
133
|
raise EsgvocDbError(f"unsupported drs type '{drs_type}'")
|
|
136
134
|
|
|
137
|
-
def generate_from_bag_of_terms(self, terms: Iterable[str], drs_type: DrsType | str)
|
|
138
|
-
-> DrsGenerationReport: # noqa E127
|
|
135
|
+
def generate_from_bag_of_terms(self, terms: Iterable[str], drs_type: DrsType | str) -> DrsGenerationReport: # noqa E127
|
|
139
136
|
"""
|
|
140
137
|
Generate a DRS expression from an unordered bag of terms.
|
|
141
138
|
|
|
@@ -156,67 +153,78 @@ class DrsGenerator(DrsApplication):
|
|
|
156
153
|
case _:
|
|
157
154
|
raise EsgvocDbError(f"unsupported drs type '{drs_type}'")
|
|
158
155
|
|
|
159
|
-
def _generate_from_mapping(self, mapping: Mapping[str, str], specs: DrsSpecification)
|
|
160
|
-
-> DrsGenerationReport: # noqa E127
|
|
156
|
+
def _generate_from_mapping(self, mapping: Mapping[str, str], specs: DrsSpecification) -> DrsGenerationReport:  # noqa E127
    """
    Generate a DRS expression from a mapping and wrap the outcome into a report.

    The terms are validated during the generation. In pedantic mode, the warnings
    are reported as errors.

    :param mapping: The values to be used, keyed by collection id.
    :type mapping: Mapping[str, str]
    :param specs: The DRS specification to generate the expression for.
    :type specs: DrsSpecification
    :returns: The generation report.
    :rtype: DrsGenerationReport
    """
    expression, found_errors, found_warnings = self.__generate_from_mapping(mapping, specs, True)
    if self.pedantic:
        # Promote every warning to an error.
        found_errors += found_warnings
        del found_warnings[:]
    report = DrsGenerationReport(
        project_id=self.project_id,
        type=specs.type,
        given_mapping_or_bag_of_terms=mapping,
        mapping_used=mapping,
        generated_drs_expression=expression,
        errors=cast(list[GenerationError], found_errors),
        warnings=cast(list[GenerationWarning], found_warnings),
    )
    return report
|
|
170
|
+
|
|
171
|
+
def __generate_from_mapping(
    self, mapping: Mapping[str, str], specs: DrsSpecification, has_to_valid_terms: bool
) -> tuple[str, list[GenerationIssue], list[GenerationIssue]]:  # noqa E127
    """
    Build a DRS expression by walking through the parts of the given specification.

    For each part, the value is taken from the mapping under the source collection
    of the part. A part without a value is reported as a missing term: an error plus the
    missing tag in the expression if the part is required, otherwise a warning and the
    part is left out of the expression. A value that fails the validation is reported
    as an invalid term and is replaced by the invalid tag in the expression.

    :param mapping: The values to be used, keyed by collection id.
    :type mapping: Mapping[str, str]
    :param specs: The DRS specification to generate the expression for.
    :type specs: DrsSpecification
    :param has_to_valid_terms: Whether the values must be validated.
    :type has_to_valid_terms: bool
    :returns: The DRS expression, the errors and the warnings.
    :rtype: tuple[str, list[GenerationIssue], list[GenerationIssue]]
    """
    errors: list[GenerationIssue] = list()
    warnings: list[GenerationIssue] = list()
    expression_parts: list[str] = list()
    for part_position, part in enumerate(specs.parts, start=1):
        collection_id = part.source_collection

        if collection_id not in mapping:
            missing_term = MissingTerm(collection_id=collection_id, collection_position=part_position)
            if not part.is_required:
                # An optional part without a value is skipped.
                warnings.append(missing_term)
                continue
            errors.append(missing_term)
            expression_parts.append(DrsGenerationReport.MISSING_TAG)
            continue

        part_value = mapping[collection_id]
        if has_to_valid_terms:
            if part.source_collection_term is None:
                # Validate against the whole collection.
                matching_terms = projects.valid_term_in_collection(part_value,
                                                                   self.project_id,
                                                                   collection_id)
            else:
                # Validate against the specific term of the collection.
                matching_terms = projects.valid_term(part_value,
                                                     self.project_id,
                                                     collection_id,
                                                     part.source_collection_term).validated
            if not matching_terms:
                invalid_term = InvalidTerm(term=part_value,
                                           term_position=part_position,
                                           collection_id_or_constant_value=collection_id)
                errors.append(invalid_term)
                part_value = DrsGenerationReport.INVALID_TAG
        expression_parts.append(part_value)

    return specs.separator.join(expression_parts), errors, warnings
|
|
214
213
|
|
|
215
|
-
def _generate_from_bag_of_terms(self, terms: Iterable[str], specs: DrsSpecification)
|
|
216
|
-
-> DrsGenerationReport: # noqa E127
|
|
214
|
+
def _generate_from_bag_of_terms(self, terms: Iterable[str], specs: DrsSpecification) -> DrsGenerationReport: # noqa E127
|
|
217
215
|
collection_terms_mapping: dict[str, set[str]] = dict()
|
|
218
216
|
for term in terms:
|
|
219
|
-
matching_terms =
|
|
217
|
+
matching_terms: list[MatchingTerm] = list()
|
|
218
|
+
for part in specs.parts:
|
|
219
|
+
if part.source_collection_term is None:
|
|
220
|
+
matching_terms.extend(projects.valid_term_in_collection(term, self.project_id,
|
|
221
|
+
part.source_collection))
|
|
222
|
+
else:
|
|
223
|
+
if projects.valid_term(term, self.project_id, part.source_collection,
|
|
224
|
+
part.source_collection_term).validated:
|
|
225
|
+
matching_terms.append(MatchingTerm(project_id=self.project_id,
|
|
226
|
+
collection_id=part.source_collection,
|
|
227
|
+
term_id=part.source_collection_term))
|
|
220
228
|
for matching_term in matching_terms:
|
|
221
229
|
if matching_term.collection_id not in collection_terms_mapping:
|
|
222
230
|
collection_terms_mapping[matching_term.collection_id] = set()
|
|
@@ -229,15 +237,18 @@ class DrsGenerator(DrsApplication):
|
|
|
229
237
|
if self.pedantic:
|
|
230
238
|
errors.extend(warnings)
|
|
231
239
|
warnings.clear()
|
|
232
|
-
return DrsGenerationReport(project_id=self.project_id,
|
|
240
|
+
return DrsGenerationReport(project_id=self.project_id,
|
|
241
|
+
type=specs.type,
|
|
233
242
|
given_mapping_or_bag_of_terms=terms,
|
|
234
|
-
mapping_used=mapping,
|
|
243
|
+
mapping_used=mapping,
|
|
244
|
+
generated_drs_expression=drs_expression,
|
|
235
245
|
errors=cast(list[GenerationError], errors),
|
|
236
246
|
warnings=cast(list[GenerationWarning], warnings))
|
|
237
247
|
|
|
238
248
|
@staticmethod
|
|
239
|
-
def _resolve_conflicts(
|
|
240
|
-
|
|
249
|
+
def _resolve_conflicts(
|
|
250
|
+
collection_terms_mapping: dict[str, set[str]],
|
|
251
|
+
) -> tuple[dict[str, set[str]], list[GenerationIssue]]: # noqa E127
|
|
241
252
|
warnings: list[GenerationIssue] = list()
|
|
242
253
|
conflicting_collection_ids_list: list[list[str]] = list()
|
|
243
254
|
collection_ids: list[str] = list(collection_terms_mapping.keys())
|
|
@@ -247,13 +258,16 @@ class DrsGenerator(DrsApplication):
|
|
|
247
258
|
conflicting_collection_ids: list[str] = list()
|
|
248
259
|
for r_collection_index in range(l_collection_index + 1, len_collection_ids):
|
|
249
260
|
if collection_terms_mapping[collection_ids[l_collection_index]].isdisjoint(
|
|
250
|
-
|
|
261
|
+
collection_terms_mapping[collection_ids[r_collection_index]]
|
|
262
|
+
):
|
|
251
263
|
continue
|
|
252
264
|
else:
|
|
253
265
|
not_registered = True
|
|
254
266
|
for cc_ids in conflicting_collection_ids_list:
|
|
255
|
-
if
|
|
256
|
-
|
|
267
|
+
if (
|
|
268
|
+
collection_ids[l_collection_index] in cc_ids
|
|
269
|
+
and collection_ids[r_collection_index] in cc_ids
|
|
270
|
+
):
|
|
257
271
|
not_registered = False
|
|
258
272
|
break
|
|
259
273
|
if not_registered:
|
|
@@ -287,10 +301,12 @@ class DrsGenerator(DrsApplication):
|
|
|
287
301
|
# raise errors, remove the faulty collections and their term.
|
|
288
302
|
if collection_ids_with_len_eq_1_list:
|
|
289
303
|
for collection_ids_to_be_removed in collection_ids_with_len_eq_1_list:
|
|
290
|
-
DrsGenerator._remove_ids_from_conflicts(
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
304
|
+
DrsGenerator._remove_ids_from_conflicts(
|
|
305
|
+
conflicting_collection_ids_list, collection_ids_to_be_removed
|
|
306
|
+
)
|
|
307
|
+
DrsGenerator._remove_term_from_other_term_sets(
|
|
308
|
+
collection_terms_mapping, collection_ids_to_be_removed
|
|
309
|
+
)
|
|
294
310
|
# Every time conflicting_collection_ids_list is modified, we must restart the loop,
|
|
295
311
|
# as conflicting collections may be resolved.
|
|
296
312
|
continue
|
|
@@ -307,10 +323,8 @@ class DrsGenerator(DrsApplication):
|
|
|
307
323
|
warnings.append(issue)
|
|
308
324
|
# 3.b Update conflicting collections.
|
|
309
325
|
if wining_collection_ids:
|
|
310
|
-
DrsGenerator._remove_ids_from_conflicts(conflicting_collection_ids_list,
|
|
311
|
-
|
|
312
|
-
DrsGenerator._remove_term_from_other_term_sets(collection_terms_mapping,
|
|
313
|
-
wining_collection_ids)
|
|
326
|
+
DrsGenerator._remove_ids_from_conflicts(conflicting_collection_ids_list, wining_collection_ids)
|
|
327
|
+
DrsGenerator._remove_term_from_other_term_sets(collection_terms_mapping, wining_collection_ids)
|
|
314
328
|
# Every time conflicting_collection_ids_list is modified, we must restart the loop,
|
|
315
329
|
# as conflicting collections may be resolved.
|
|
316
330
|
continue
|
|
@@ -321,12 +335,14 @@ class DrsGenerator(DrsApplication):
|
|
|
321
335
|
for collection_ids in conflicting_collection_ids_list:
|
|
322
336
|
for collection_index in range(0, len(collection_ids)):
|
|
323
337
|
collection_set = collection_ids[collection_index + 1:] + collection_ids[:collection_index]
|
|
324
|
-
diff: set[str] = collection_terms_mapping[collection_ids[collection_index]]
|
|
325
|
-
|
|
326
|
-
|
|
338
|
+
diff: set[str] = collection_terms_mapping[collection_ids[collection_index]].difference(
|
|
339
|
+
*[
|
|
340
|
+
collection_terms_mapping[index] # noqa E127
|
|
341
|
+
for index in collection_set
|
|
342
|
+
]
|
|
343
|
+
)
|
|
327
344
|
if len(diff) == 1:
|
|
328
|
-
wining_id_and_term_pairs.append((collection_ids[collection_index],
|
|
329
|
-
_get_first_item(diff)))
|
|
345
|
+
wining_id_and_term_pairs.append((collection_ids[collection_index], _get_first_item(diff)))
|
|
330
346
|
# 4.b Update conflicting collections.
|
|
331
347
|
if wining_id_and_term_pairs:
|
|
332
348
|
wining_collection_ids = list()
|
|
@@ -336,18 +352,17 @@ class DrsGenerator(DrsApplication):
|
|
|
336
352
|
collection_terms_mapping[collection_id].add(term)
|
|
337
353
|
issue = AssignedTerm(collection_id=collection_id, term=term)
|
|
338
354
|
warnings.append(issue)
|
|
339
|
-
DrsGenerator._remove_ids_from_conflicts(conflicting_collection_ids_list,
|
|
340
|
-
|
|
341
|
-
DrsGenerator._remove_term_from_other_term_sets(collection_terms_mapping,
|
|
342
|
-
wining_collection_ids)
|
|
355
|
+
DrsGenerator._remove_ids_from_conflicts(conflicting_collection_ids_list, wining_collection_ids)
|
|
356
|
+
DrsGenerator._remove_term_from_other_term_sets(collection_terms_mapping, wining_collection_ids)
|
|
343
357
|
continue
|
|
344
358
|
else:
|
|
345
359
|
break # Stop the loop when no progress is made.
|
|
346
360
|
return collection_terms_mapping, warnings
|
|
347
361
|
|
|
348
362
|
@staticmethod
|
|
349
|
-
def _check_collection_terms_mapping(
|
|
350
|
-
|
|
363
|
+
def _check_collection_terms_mapping(
|
|
364
|
+
collection_terms_mapping: dict[str, set[str]],
|
|
365
|
+
) -> tuple[dict[str, str], list[GenerationIssue]]: # noqa E127
|
|
351
366
|
errors: list[GenerationIssue] = list()
|
|
352
367
|
# 1. Looking for collections that share strictly the same term(s).
|
|
353
368
|
collection_ids: list[str] = list(collection_terms_mapping.keys())
|
|
@@ -363,8 +378,7 @@ class DrsGenerator(DrsApplication):
|
|
|
363
378
|
if l_term_set and (not l_term_set.difference(r_term_set)):
|
|
364
379
|
not_registered = True
|
|
365
380
|
for faulty_collections in faulty_collections_list:
|
|
366
|
-
if l_collection_id in faulty_collections or
|
|
367
|
-
r_collection_id in faulty_collections:
|
|
381
|
+
if l_collection_id in faulty_collections or r_collection_id in faulty_collections:
|
|
368
382
|
faulty_collections.add(l_collection_id)
|
|
369
383
|
faulty_collections.add(r_collection_id)
|
|
370
384
|
not_registered = False
|
|
@@ -373,8 +387,9 @@ class DrsGenerator(DrsApplication):
|
|
|
373
387
|
faulty_collections_list.append({l_collection_id, r_collection_id})
|
|
374
388
|
for faulty_collections in faulty_collections_list:
|
|
375
389
|
terms = collection_terms_mapping[_get_first_item(faulty_collections)]
|
|
376
|
-
issue = ConflictingCollections(
|
|
377
|
-
|
|
390
|
+
issue = ConflictingCollections(
|
|
391
|
+
collection_ids=_transform_set_and_sort(faulty_collections), terms=_transform_set_and_sort(terms)
|
|
392
|
+
)
|
|
378
393
|
errors.append(issue)
|
|
379
394
|
for collection_id in faulty_collections:
|
|
380
395
|
del collection_terms_mapping[collection_id]
|
|
@@ -386,25 +401,28 @@ class DrsGenerator(DrsApplication):
|
|
|
386
401
|
if len_term_set == 1:
|
|
387
402
|
result[collection_id] = _get_first_item(term_set)
|
|
388
403
|
elif len_term_set > 1:
|
|
389
|
-
other_issue = TooManyTermCollection(
|
|
390
|
-
|
|
404
|
+
other_issue = TooManyTermCollection(
|
|
405
|
+
collection_id=collection_id, terms=_transform_set_and_sort(term_set)
|
|
406
|
+
)
|
|
391
407
|
errors.append(other_issue)
|
|
392
408
|
# else: Don't add emptied collection to the result.
|
|
393
409
|
return result, errors
|
|
394
410
|
|
|
395
411
|
@staticmethod
def _remove_term_from_other_term_sets(
    collection_terms_mapping: dict[str, set[str]], collection_ids_to_be_removed: list[str]
) -> None:
    """
    Discard the term of each given collection from the term sets of the other collections.

    The term sets of the collections listed in ``collection_ids_to_be_removed`` are left
    untouched. The term sets of the mapping are modified in place.

    :param collection_terms_mapping: The term sets, keyed by collection id.
    :type collection_terms_mapping: dict[str, set[str]]
    :param collection_ids_to_be_removed: The ids of the collections whose term is discarded \
    from the other term sets.
    :type collection_ids_to_be_removed: list[str]
    """
    for removed_id in collection_ids_to_be_removed:
        # Should only be one term.
        removed_term: str = _get_first_item(collection_terms_mapping[removed_id])
        for other_id, other_terms in collection_terms_mapping.items():
            if other_id in collection_ids_to_be_removed:
                continue
            other_terms.discard(removed_term)
|
|
404
421
|
|
|
405
422
|
@staticmethod
|
|
406
|
-
def _remove_ids_from_conflicts(
|
|
407
|
-
|
|
423
|
+
def _remove_ids_from_conflicts(
|
|
424
|
+
conflicting_collection_ids_list: list[list[str]], collection_ids_to_be_removed: list[str]
|
|
425
|
+
) -> None:
|
|
408
426
|
for collection_id_to_be_removed in collection_ids_to_be_removed:
|
|
409
427
|
for conflicting_collection_ids in conflicting_collection_ids_list:
|
|
410
428
|
if collection_id_to_be_removed in conflicting_collection_ids:
|