esgvoc 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esgvoc/__init__.py +3 -0
- esgvoc/api/__init__.py +91 -0
- esgvoc/api/data_descriptors/EMD_models/__init__.py +66 -0
- esgvoc/api/data_descriptors/EMD_models/arrangement.py +21 -0
- esgvoc/api/data_descriptors/EMD_models/calendar.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/cell_variable_type.py +20 -0
- esgvoc/api/data_descriptors/EMD_models/component_type.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/coordinate.py +52 -0
- esgvoc/api/data_descriptors/EMD_models/grid_mapping.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/grid_region.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/grid_type.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_computational_grid.py +56 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_grid_cells.py +230 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_subgrid.py +41 -0
- esgvoc/api/data_descriptors/EMD_models/horizontal_units.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/model.py +139 -0
- esgvoc/api/data_descriptors/EMD_models/model_component.py +115 -0
- esgvoc/api/data_descriptors/EMD_models/reference.py +61 -0
- esgvoc/api/data_descriptors/EMD_models/resolution.py +48 -0
- esgvoc/api/data_descriptors/EMD_models/temporal_refinement.py +19 -0
- esgvoc/api/data_descriptors/EMD_models/truncation_method.py +17 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_computational_grid.py +91 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_coordinate.py +5 -0
- esgvoc/api/data_descriptors/EMD_models/vertical_units.py +19 -0
- esgvoc/api/data_descriptors/__init__.py +159 -0
- esgvoc/api/data_descriptors/activity.py +72 -0
- esgvoc/api/data_descriptors/archive.py +5 -0
- esgvoc/api/data_descriptors/area_label.py +30 -0
- esgvoc/api/data_descriptors/branded_suffix.py +30 -0
- esgvoc/api/data_descriptors/branded_variable.py +21 -0
- esgvoc/api/data_descriptors/citation_url.py +5 -0
- esgvoc/api/data_descriptors/contact.py +5 -0
- esgvoc/api/data_descriptors/conventions.py +28 -0
- esgvoc/api/data_descriptors/creation_date.py +18 -0
- esgvoc/api/data_descriptors/data_descriptor.py +127 -0
- esgvoc/api/data_descriptors/data_specs_version.py +25 -0
- esgvoc/api/data_descriptors/date.py +5 -0
- esgvoc/api/data_descriptors/directory_date.py +22 -0
- esgvoc/api/data_descriptors/drs_specs.py +38 -0
- esgvoc/api/data_descriptors/experiment.py +215 -0
- esgvoc/api/data_descriptors/forcing_index.py +21 -0
- esgvoc/api/data_descriptors/frequency.py +48 -0
- esgvoc/api/data_descriptors/further_info_url.py +5 -0
- esgvoc/api/data_descriptors/grid.py +43 -0
- esgvoc/api/data_descriptors/horizontal_label.py +20 -0
- esgvoc/api/data_descriptors/initialization_index.py +27 -0
- esgvoc/api/data_descriptors/institution.py +80 -0
- esgvoc/api/data_descriptors/known_branded_variable.py +75 -0
- esgvoc/api/data_descriptors/license.py +31 -0
- esgvoc/api/data_descriptors/member_id.py +9 -0
- esgvoc/api/data_descriptors/mip_era.py +26 -0
- esgvoc/api/data_descriptors/model_component.py +32 -0
- esgvoc/api/data_descriptors/models_test/models.py +17 -0
- esgvoc/api/data_descriptors/nominal_resolution.py +50 -0
- esgvoc/api/data_descriptors/obs_type.py +5 -0
- esgvoc/api/data_descriptors/organisation.py +22 -0
- esgvoc/api/data_descriptors/physics_index.py +21 -0
- esgvoc/api/data_descriptors/product.py +16 -0
- esgvoc/api/data_descriptors/publication_status.py +5 -0
- esgvoc/api/data_descriptors/realization_index.py +24 -0
- esgvoc/api/data_descriptors/realm.py +16 -0
- esgvoc/api/data_descriptors/regex.py +5 -0
- esgvoc/api/data_descriptors/region.py +35 -0
- esgvoc/api/data_descriptors/resolution.py +7 -0
- esgvoc/api/data_descriptors/source.py +120 -0
- esgvoc/api/data_descriptors/source_type.py +5 -0
- esgvoc/api/data_descriptors/sub_experiment.py +5 -0
- esgvoc/api/data_descriptors/table.py +28 -0
- esgvoc/api/data_descriptors/temporal_label.py +20 -0
- esgvoc/api/data_descriptors/time_range.py +17 -0
- esgvoc/api/data_descriptors/title.py +5 -0
- esgvoc/api/data_descriptors/tracking_id.py +67 -0
- esgvoc/api/data_descriptors/variable.py +56 -0
- esgvoc/api/data_descriptors/variant_label.py +25 -0
- esgvoc/api/data_descriptors/vertical_label.py +20 -0
- esgvoc/api/project_specs.py +143 -0
- esgvoc/api/projects.py +1253 -0
- esgvoc/api/py.typed +0 -0
- esgvoc/api/pydantic_handler.py +146 -0
- esgvoc/api/report.py +127 -0
- esgvoc/api/search.py +171 -0
- esgvoc/api/universe.py +434 -0
- esgvoc/apps/__init__.py +6 -0
- esgvoc/apps/cmor_tables/__init__.py +7 -0
- esgvoc/apps/cmor_tables/cvs_table.py +948 -0
- esgvoc/apps/drs/__init__.py +0 -0
- esgvoc/apps/drs/constants.py +2 -0
- esgvoc/apps/drs/generator.py +429 -0
- esgvoc/apps/drs/report.py +540 -0
- esgvoc/apps/drs/validator.py +312 -0
- esgvoc/apps/ga/__init__.py +104 -0
- esgvoc/apps/ga/example_usage.py +315 -0
- esgvoc/apps/ga/models/__init__.py +47 -0
- esgvoc/apps/ga/models/netcdf_header.py +306 -0
- esgvoc/apps/ga/models/validator.py +491 -0
- esgvoc/apps/ga/test_ga.py +161 -0
- esgvoc/apps/ga/validator.py +277 -0
- esgvoc/apps/jsg/json_schema_generator.py +341 -0
- esgvoc/apps/jsg/templates/template.jinja +241 -0
- esgvoc/apps/test_cv/README.md +214 -0
- esgvoc/apps/test_cv/__init__.py +0 -0
- esgvoc/apps/test_cv/cv_tester.py +1611 -0
- esgvoc/apps/test_cv/example_usage.py +216 -0
- esgvoc/apps/vr/__init__.py +12 -0
- esgvoc/apps/vr/build_variable_registry.py +71 -0
- esgvoc/apps/vr/example_usage.py +60 -0
- esgvoc/apps/vr/vr_app.py +333 -0
- esgvoc/cli/clean.py +304 -0
- esgvoc/cli/cmor.py +46 -0
- esgvoc/cli/config.py +1300 -0
- esgvoc/cli/drs.py +267 -0
- esgvoc/cli/find.py +138 -0
- esgvoc/cli/get.py +155 -0
- esgvoc/cli/install.py +41 -0
- esgvoc/cli/main.py +60 -0
- esgvoc/cli/offline.py +269 -0
- esgvoc/cli/status.py +79 -0
- esgvoc/cli/test_cv.py +258 -0
- esgvoc/cli/valid.py +147 -0
- esgvoc/core/constants.py +17 -0
- esgvoc/core/convert.py +0 -0
- esgvoc/core/data_handler.py +206 -0
- esgvoc/core/db/__init__.py +3 -0
- esgvoc/core/db/connection.py +40 -0
- esgvoc/core/db/models/mixins.py +25 -0
- esgvoc/core/db/models/project.py +102 -0
- esgvoc/core/db/models/universe.py +98 -0
- esgvoc/core/db/project_ingestion.py +231 -0
- esgvoc/core/db/universe_ingestion.py +172 -0
- esgvoc/core/exceptions.py +33 -0
- esgvoc/core/logging_handler.py +26 -0
- esgvoc/core/repo_fetcher.py +345 -0
- esgvoc/core/service/__init__.py +41 -0
- esgvoc/core/service/configuration/config_manager.py +196 -0
- esgvoc/core/service/configuration/setting.py +363 -0
- esgvoc/core/service/data_merger.py +634 -0
- esgvoc/core/service/esg_voc.py +77 -0
- esgvoc/core/service/resolver_config.py +56 -0
- esgvoc/core/service/state.py +324 -0
- esgvoc/core/service/string_heuristics.py +98 -0
- esgvoc/core/service/term_cache.py +108 -0
- esgvoc/core/service/uri_resolver.py +133 -0
- esgvoc-2.0.2.dist-info/METADATA +82 -0
- esgvoc-2.0.2.dist-info/RECORD +147 -0
- esgvoc-2.0.2.dist-info/WHEEL +4 -0
- esgvoc-2.0.2.dist-info/entry_points.txt +2 -0
- esgvoc-2.0.2.dist-info/licenses/LICENSE.txt +519 -0
esgvoc/api/projects.py
ADDED
|
@@ -0,0 +1,1253 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
import re
|
|
3
|
+
from typing import Iterable, Sequence, cast
|
|
4
|
+
|
|
5
|
+
from sqlalchemy import text
|
|
6
|
+
from sqlmodel import Session, and_, col, select
|
|
7
|
+
|
|
8
|
+
import esgvoc.api.universe as universe
|
|
9
|
+
import esgvoc.core.constants as constants
|
|
10
|
+
import esgvoc.core.service as service
|
|
11
|
+
from esgvoc.api.data_descriptors.data_descriptor import DataDescriptor
|
|
12
|
+
from esgvoc.api.project_specs import ProjectSpecs
|
|
13
|
+
from esgvoc.api.report import ProjectTermError, UniverseTermError, ValidationReport
|
|
14
|
+
from esgvoc.api.pydantic_handler import instantiate_pydantic_term
|
|
15
|
+
from esgvoc.api.search import (
|
|
16
|
+
Item,
|
|
17
|
+
MatchingTerm,
|
|
18
|
+
execute_find_item_statements,
|
|
19
|
+
execute_match_statement,
|
|
20
|
+
generate_matching_condition,
|
|
21
|
+
get_universe_session,
|
|
22
|
+
handle_rank_limit_offset,
|
|
23
|
+
instantiate_pydantic_terms,
|
|
24
|
+
process_expression,
|
|
25
|
+
)
|
|
26
|
+
from esgvoc.core.db.connection import DBConnection
|
|
27
|
+
from esgvoc.core.db.models.mixins import TermKind
|
|
28
|
+
from esgvoc.core.db.models.project import PCollection, PCollectionFTS5, Project, PTerm, PTermFTS5
|
|
29
|
+
from esgvoc.core.db.models.universe import UTerm
|
|
30
|
+
from esgvoc.core.exceptions import EsgvocDbError, EsgvocNotFoundError, EsgvocNotImplementedError, EsgvocValueError
|
|
31
|
+
|
|
32
|
+
# [OPTIMIZATION]
|
|
33
|
+
_VALID_TERM_IN_COLLECTION_CACHE: dict[str, list[MatchingTerm]] = dict()
|
|
34
|
+
_VALID_VALUE_AGAINST_GIVEN_TERM_CACHE: dict[str, list[UniverseTermError | ProjectTermError]] = dict()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _get_project_connection(project_id: str) -> DBConnection | None:
    """
    Look up the database connection registered for a project.

    :param project_id: A project id
    :type project_id: str
    :returns: The `db_connection` of the matching entry of `service.current_state.projects`,
        or `None` when the project id is not registered there.
    :rtype: DBConnection | None
    """
    known_projects = service.current_state.projects
    if project_id not in known_projects:
        return None
    return known_projects[project_id].db_connection
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _get_project_session_with_exception(project_id: str) -> Session:
    """
    Create a session on the database of the given project.

    :param project_id: A project id
    :type project_id: str
    :returns: A new session created from the connection of the project
    :rtype: Session
    :raises EsgvocNotFoundError: If no connection is available for the project id
    """
    connection = _get_project_connection(project_id)
    if not connection:
        raise EsgvocNotFoundError(f"unable to find project '{project_id}'")
    return connection.create_session()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _resolve_composite_term_part(
    composite_term_part: dict, universe_session: Session, project_session: Session
) -> UTerm | PTerm | Sequence[UTerm | PTerm]:
    """
    Resolve a part of a composite term into the term(s) it refers to.

    When the part carries a term id, a single term is returned. Otherwise, all the terms of
    the data descriptor (or of the collection) named by the type of the part are returned.
    In both cases the universe is searched first, then the current project.

    :param composite_term_part: The specs of one part of a composite term
    :type composite_term_part: dict
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The term, or the sequence of terms, designated by the part
    :rtype: UTerm | PTerm | Sequence[UTerm | PTerm]
    :raises EsgvocNotFoundError: If nothing matches in the universe nor in the project
    """
    part_type = composite_term_part[constants.TERM_TYPE_JSON_KEY]

    if constants.TERM_ID_JSON_KEY not in composite_term_part:
        # No id given: the part designates every term of its type.
        data_descriptor = universe._get_data_descriptor_in_universe(part_type, universe_session)
        if data_descriptor is not None:
            return data_descriptor.terms
        collection = _get_collection_in_project(part_type, project_session)
        if collection is not None:
            return collection.terms
        raise EsgvocNotFoundError(f"unable to find the terms of '{part_type}'")

    # First look for the term in the universe, then in the current project.
    part_id = composite_term_part[constants.TERM_ID_JSON_KEY]
    found = universe._get_term_in_data_descriptor(
        data_descriptor_id=part_type, term_id=part_id, session=universe_session
    )
    if found:
        return found
    found = _get_term_in_collection(collection_id=part_type, term_id=part_id, session=project_session)
    if found:
        return found
    raise EsgvocNotFoundError(f"unable to find the term '{part_id}' in '{part_type}'")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _get_composite_term_separator_parts(term: UTerm | PTerm) -> tuple[str, list]:
    """
    Read the separator and the parts from the specs of a composite term.

    :param term: A composite term
    :type term: UTerm | PTerm
    :returns: The separator followed by the list of parts, both taken from `term.specs`
    :rtype: tuple[str, list]
    """
    specs = term.specs
    return specs[constants.COMPOSITE_SEPARATOR_JSON_KEY], specs[constants.COMPOSITE_PARTS_JSON_KEY]
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _valid_value_composite_term_with_separator(
    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
) -> list[UniverseTermError | ProjectTermError]:
    """
    Validate a value against a composite term whose parts are joined by a separator.

    The value is split on the separator and the splits are assigned, in order, to the parts
    of the composite. Every part flagged as required must receive a split, the other parts
    may be left out. The value is valid as soon as one assignment validates each split
    against the part it is assigned to.

    The specs of the term are only read: the candidate term ids of each part are kept in a
    local table and are never written back into the part dictionaries of `term.specs`.

    :param value: A value to be validated
    :type value: str
    :param term: A composite term that has a non empty separator
    :type term: UTerm | PTerm
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: An empty list if the value is valid, otherwise a list that contains one error
    :rtype: list[UniverseTermError | ProjectTermError]
    """
    separator, parts = _get_composite_term_separator_parts(term)
    required_indices = {i for i, p in enumerate(parts) if p.get(constants.COMPOSITE_REQUIRED_KEY, False)}

    splits = value.split(separator)
    nb_splits = len(splits)
    nb_parts = len(parts)

    if nb_splits > nb_parts:
        return [_create_term_error(value, term)]

    # This does not depend on the assignment under test, so it is computed once.
    has_double_separator = separator * 2 in value
    # Candidate term ids per part index, resolved lazily and at most once per call.
    candidate_ids_by_part: dict[int, list] = {}

    # Try every in-order assignment of the splits to the parts, keeping only
    # those that fill all the required parts.
    for positions in itertools.combinations(range(nb_parts), nb_splits):
        if not required_indices.issubset(positions):
            continue

        candidate: list[str | None] = [None] * nb_parts
        for split, position in zip(splits, positions):
            candidate[position] = split

        # Separator structure validation:
        # - no leading separator if the first part is missing
        # - no trailing separator if the last part is missing
        # - no double separator where two adjacent parts are missing
        if candidate[0] is None and value.startswith(separator):
            continue
        if candidate[-1] is None and value.endswith(separator):
            continue
        if has_double_separator and any(
            left is None and right is None for left, right in itertools.pairwise(candidate)
        ):
            continue

        all_valid = True
        for i, given_value in enumerate(candidate):
            if given_value is None:
                # Only optional parts can be missing here: the required ones are
                # guaranteed to be filled by the filter on the positions above.
                continue

            part = parts[i]
            if i not in candidate_ids_by_part:
                if "id" in part:
                    declared_ids = part["id"]
                    candidate_ids_by_part[i] = [declared_ids] if isinstance(declared_ids, str) else declared_ids
                else:
                    # No id declared: any term of the data descriptor is a candidate.
                    terms = universe.get_all_terms_in_data_descriptor(part["type"], None)
                    candidate_ids_by_part[i] = [t.id for t in terms]

            # The split is accepted as soon as one candidate term validates it.
            valid_for_this_part = False
            for candidate_id in candidate_ids_by_part[i]:
                single_term_part = {**part, "id": candidate_id}
                resolved_term = _resolve_composite_term_part(single_term_part, universe_session, project_session)
                # An id is always provided, so resolved_term can't be a list of terms here.
                resolved_term = cast(UTerm | PTerm, resolved_term)
                if not _valid_value(given_value, resolved_term, universe_session, project_session):
                    valid_for_this_part = True
                    break
            if not valid_for_this_part:
                all_valid = False
                break

        if all_valid:
            return []  # At least one valid assignment found

    return [_create_term_error(value, term)]  # No valid assignment found
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _transform_to_pattern(term: UTerm | PTerm, universe_session: Session, project_session: Session) -> str:
|
|
170
|
+
match term.kind:
|
|
171
|
+
case TermKind.PLAIN:
|
|
172
|
+
if constants.DRS_SPECS_JSON_KEY in term.specs:
|
|
173
|
+
result = term.specs[constants.DRS_SPECS_JSON_KEY]
|
|
174
|
+
else:
|
|
175
|
+
raise EsgvocValueError(f"the term '{term.id}' doesn't have drs name. " + "Can't validate it.")
|
|
176
|
+
case TermKind.PATTERN:
|
|
177
|
+
result = term.specs[constants.PATTERN_JSON_KEY]
|
|
178
|
+
case TermKind.COMPOSITE:
|
|
179
|
+
separator, parts = _get_composite_term_separator_parts(term)
|
|
180
|
+
result = ""
|
|
181
|
+
for part in parts:
|
|
182
|
+
resolved_term = _resolve_composite_term_part(part, universe_session, project_session)
|
|
183
|
+
if isinstance(resolved_term, Sequence):
|
|
184
|
+
pattern = ""
|
|
185
|
+
for r_term in resolved_term:
|
|
186
|
+
pattern += _transform_to_pattern(r_term, universe_session, project_session)
|
|
187
|
+
else:
|
|
188
|
+
pattern = _transform_to_pattern(resolved_term, universe_session, project_session)
|
|
189
|
+
result = f"{result}{pattern}{separator}"
|
|
190
|
+
result = result.rstrip(separator)
|
|
191
|
+
case _:
|
|
192
|
+
raise EsgvocDbError(f"unsupported term kind '{term.kind}'")
|
|
193
|
+
return result
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _strip_regex_anchors(pattern: str) -> str:
    """
    Remove the `^` and `$` anchors of a regex.

    Escaped characters (`\\^`, `\\$`) and the content of character classes (`[^abc]`, `[$]`)
    are left untouched, as they are not anchors.
    """
    kept: list[str] = []
    escaped = False  # The previous character is an unescaped backslash.
    in_class = False  # The scan is between a '[' and its closing ']'.
    class_is_empty = False  # No member seen yet: a ']' met here is a literal member.
    negation_allowed = False  # Right after '[': a '^' met here negates the class.
    for char in pattern:
        if escaped:
            kept.append(char)
            escaped = False
            class_is_empty = negation_allowed = False
        elif char == "\\":
            kept.append(char)
            escaped = True
        elif in_class:
            kept.append(char)
            if char == "^" and negation_allowed:
                negation_allowed = False
            elif char == "]" and not class_is_empty:
                in_class = False
            else:
                class_is_empty = negation_allowed = False
        elif char == "[":
            kept.append(char)
            in_class = True
            class_is_empty = negation_allowed = True
        elif char not in "^$":
            kept.append(char)
    return "".join(kept)


# TODO: support optionality of parts of composite.
# Backtracking would be needed when more than one part can be missing.
def _valid_value_composite_term_separator_less(
    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
) -> list[UniverseTermError | ProjectTermError]:
    """
    Validate a value against a composite term that has no separator.

    The parts of the composite are aggregated into a single regex, which is then matched
    against the whole value.

    :param value: A value to be validated
    :type value: str
    :param term: A composite term without separator
    :type term: UTerm | PTerm
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: An empty list if the value is valid, otherwise a list that contains one error
    :rtype: list[UniverseTermError | ProjectTermError]
    :raises EsgvocNotImplementedError: If the validation fails for any reason, including
        a regex that does not compile
    """
    result = list()
    try:
        pattern = _transform_to_pattern(term, universe_session, project_session)
        try:
            # Pattern terms are meant to be validated individually, so their regexes
            # are defined as a whole (begin with a ^, end with a $).
            # As the pattern is a concatenation of plain names and regexes, several ^ and $
            # can exist. They are removed, then the aggregated pattern is anchored once.
            # Only real anchors are removed: a blind replace would also drop the
            # negation of [^...] classes and the escaped literals.
            pattern = f"^{_strip_regex_anchors(pattern)}$"
            regex = re.compile(pattern)
        except Exception as e:
            msg = f"regex compilation error while processing term '{term.id}'':\n{e}"
            raise EsgvocDbError(msg) from e
        if regex.match(value) is None:
            result.append(_create_term_error(value, term))
        return result
    except Exception as e:
        msg = f"cannot validate separator less composite term '{term.id}':\n{e}"
        raise EsgvocNotImplementedError(msg) from e
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _valid_value_for_composite_term(
    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
) -> list[UniverseTermError | ProjectTermError]:
    """
    Validate a value against a composite term.

    The validation is delegated according to the separator of the composite: the split based
    validation when the separator is set, the regex based validation otherwise.

    :param value: A value to be validated
    :type value: str
    :param term: A composite term
    :type term: UTerm | PTerm
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The errors returned by the selected validation
    :rtype: list[UniverseTermError | ProjectTermError]
    """
    separator, _ = _get_composite_term_separator_parts(term)
    validate = (
        _valid_value_composite_term_with_separator if separator else _valid_value_composite_term_separator_less
    )
    return validate(value, term, universe_session, project_session)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _create_term_error(value: str, term: UTerm | PTerm) -> UniverseTermError | ProjectTermError:
    """
    Build the validation error that corresponds to the origin of the term.

    :param value: The value that failed the validation
    :type value: str
    :param term: The term the value has been validated against
    :type term: UTerm | PTerm
    :returns: A `UniverseTermError` for a universe term (carries the data descriptor id),
        a `ProjectTermError` otherwise (carries the collection id)
    :rtype: UniverseTermError | ProjectTermError
    """
    if not isinstance(term, UTerm):
        return ProjectTermError(value=value, term=term.specs, term_kind=term.kind, collection_id=term.collection.id)
    return UniverseTermError(
        value=value, term=term.specs, term_kind=term.kind, data_descriptor_id=term.data_descriptor.id
    )
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _valid_value(
    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
) -> list[UniverseTermError | ProjectTermError]:
    """
    Validate a value against a term, according to the kind of the term.

    - plain term: the value must be equal to the DRS name of the term.
    - pattern term: the value must match the pattern of the term (`re.match`).
    - composite term: the validation is delegated to the composite term validation.

    :param value: A value to be validated
    :type value: str
    :param term: The term to validate the value against
    :type term: UTerm | PTerm
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: An empty list if the value is valid, otherwise the list of errors
    :rtype: list[UniverseTermError | ProjectTermError]
    :raises EsgvocValueError: If a plain term has no DRS name
    :raises EsgvocDbError: If the kind of the term is not supported
    """
    kind = term.kind
    if kind == TermKind.PLAIN:
        if constants.DRS_SPECS_JSON_KEY not in term.specs:
            raise EsgvocValueError(f"the term '{term.id}' doesn't have drs name. " + "Can't validate it.")
        mismatch = term.specs[constants.DRS_SPECS_JSON_KEY] != value
    elif kind == TermKind.PATTERN:
        # TODO: Pattern can be compiled and stored for further matching.
        mismatch = re.match(term.specs[constants.PATTERN_JSON_KEY], value) is None
    elif kind == TermKind.COMPOSITE:
        return list(_valid_value_for_composite_term(value, term, universe_session, project_session))
    else:
        raise EsgvocDbError(f"unsupported term kind '{term.kind}'")
    return [_create_term_error(value, term)] if mismatch else []
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _check_value(value: str) -> str:
|
|
269
|
+
if not value or value.isspace():
|
|
270
|
+
raise EsgvocValueError("value should be set")
|
|
271
|
+
else:
|
|
272
|
+
return value
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _search_plain_term_and_valid_value(value: str, collection_id: str, project_session: Session) -> str | None:
    """
    Search, in a collection, the plain term whose DRS name is the given value.

    :param value: The value to look for
    :type value: str
    :param collection_id: A collection id
    :type collection_id: str
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The id of the matching term, or `None` if no term matches
    :rtype: str | None
    """
    in_collection = PCollection.id == collection_id
    # The DRS name is compared to the value wrapped in double quotes
    # (presumably its JSON encoded form, as specs is queried by key - TODO confirm).
    has_drs_name = PTerm.specs[constants.DRS_SPECS_JSON_KEY] == f'"{value}"'
    statement = select(PTerm).join(PCollection).where(and_(in_collection, has_drs_name))
    found = project_session.exec(statement).one_or_none()
    if found:
        return found.id
    return None
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _valid_value_against_all_terms_of_collection(
    value: str, collection: PCollection, universe_session: Session, project_session: Session
) -> list[str]:
    """
    Validate a value against each term of a collection.

    :param value: A value to be validated
    :type value: str
    :param collection: The collection whose terms are tested
    :type collection: PCollection
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The ids of the terms that the value matches
    :rtype: list[str]
    :raises EsgvocDbError: If the collection has no term
    """
    if not collection.terms:
        raise EsgvocDbError(f"collection '{collection.id}' has no term")
    return [
        pterm.id
        for pterm in collection.terms
        if not _valid_value(value, pterm, universe_session, project_session)
    ]
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _valid_value_against_given_term(
    value: str, project_id: str, collection_id: str, term_id: str, universe_session: Session, project_session: Session
) -> list[UniverseTermError | ProjectTermError]:
    """
    Validate a value against one term of a collection, with memoization.

    The result is stored in `_VALID_VALUE_AGAINST_GIVEN_TERM_CACHE` and reused for any
    further call with the same value, project id, collection id and term id.

    :param value: A value to be validated
    :type value: str
    :param project_id: A project id
    :type project_id: str
    :param collection_id: A collection id
    :type collection_id: str
    :param term_id: A term id
    :type term_id: str
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: An empty list if the value is valid, otherwise the list of errors
    :rtype: list[UniverseTermError | ProjectTermError]
    :raises EsgvocNotFoundError: If the term is not found in the collection
    """
    # [OPTIMIZATION]
    # The key is the repr of the tuple, not the concatenation of the strings: the
    # concatenation is ambiguous ("ab" + "c" and "a" + "bc" would share the same entry).
    key = repr((value, project_id, collection_id, term_id))
    if key in _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE:
        return _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE[key]

    term = _get_term_in_collection(collection_id, term_id, project_session)
    if not term:
        raise EsgvocNotFoundError(f"unable to find term '{term_id}' " + f"in collection '{collection_id}'")
    result = _valid_value(value, term, universe_session, project_session)
    _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE[key] = result
    return result
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def valid_term(value: str, project_id: str, collection_id: str, term_id: str) -> ValidationReport:
    """
    Check whether the given value represents the given term. The function returns
    a report that contains the possible errors.

    Behavior based on the nature of the term:
        - plain term: the function tries to match the value on the drs_name field.
        - pattern term: the function tries to match the value on the pattern field (regex).
        - composite term:
            - if the composite has a separator, the function splits the value according to
              the separator of the term, then it tries to match every part of the composite
              with every split of the value.
            - if the composite has no separator, the function aggregates the parts of the
              composite so as to compare it as a regex to the value.

    If any of the provided ids (`project_id`, `collection_id` or `term_id`) is not found,
    the function raises a EsgvocNotFoundError.

    :param value: A value to be validated
    :type value: str
    :param project_id: A project id
    :type project_id: str
    :param collection_id: A collection id
    :type collection_id: str
    :param term_id: A term id
    :type term_id: str
    :returns: A validation report that contains the possible errors
    :rtype: ValidationReport
    :raises EsgvocNotFoundError: If any of the provided ids is not found
    :raises EsgvocValueError: If the value is empty or only made of whitespace
    """
    checked_value = _check_value(value)
    with get_universe_session() as universe_session:
        with _get_project_session_with_exception(project_id) as project_session:
            return ValidationReport(
                expression=checked_value,
                errors=_valid_value_against_given_term(
                    checked_value, project_id, collection_id, term_id, universe_session, project_session
                ),
            )
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def _valid_term_in_collection(
    value: str, project_id: str, collection_id: str, universe_session: Session, project_session: Session
) -> list[MatchingTerm]:
    """
    Find the terms of a collection that the value matches, with memoization.

    For a collection of plain terms, the term is searched by its DRS name with a query.
    For the other kinds, the value is validated against each term of the collection.
    The result is stored in `_VALID_TERM_IN_COLLECTION_CACHE` and reused for any further
    call with the same value, project id and collection id.

    :param value: A value to be validated
    :type value: str
    :param project_id: A project id
    :type project_id: str
    :param collection_id: A collection id
    :type collection_id: str
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The list of terms that the value matches
    :rtype: list[MatchingTerm]
    :raises EsgvocValueError: If the value is empty or only made of whitespace
    :raises EsgvocNotFoundError: If the collection is not found
    """
    # [OPTIMIZATION]
    # The key is the repr of the tuple, not the concatenation of the strings: the
    # concatenation is ambiguous ("ab" + "c" and "a" + "bc" would share the same entry).
    key = repr((value, project_id, collection_id))
    if key in _VALID_TERM_IN_COLLECTION_CACHE:
        return _VALID_TERM_IN_COLLECTION_CACHE[key]

    value = _check_value(value)
    collection = _get_collection_in_project(collection_id, project_session)
    if not collection:
        msg = f"unable to find collection '{collection_id}'"
        raise EsgvocNotFoundError(msg)

    if collection.term_kind == TermKind.PLAIN:
        term_id_found = _search_plain_term_and_valid_value(value, collection_id, project_session)
        term_ids_found = [term_id_found] if term_id_found else []
    else:
        term_ids_found = _valid_value_against_all_terms_of_collection(
            value, collection, universe_session, project_session
        )
    result = [
        MatchingTerm(project_id=project_id, collection_id=collection_id, term_id=term_id_found)
        for term_id_found in term_ids_found
    ]
    _VALID_TERM_IN_COLLECTION_CACHE[key] = result
    return result
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def valid_term_in_collection(value: str, project_id: str, collection_id: str) -> list[MatchingTerm]:
    """
    Check whether the given value represents a term in the given collection. The function
    returns the terms that the value matches.

    Behavior based on the nature of the term:
        - plain term: the function tries to match the value on the drs_name field.
        - pattern term: the function tries to match the value on the pattern field (regex).
        - composite term:
            - if the composite has a separator, the function splits the value according to
              the separator of the term, then it tries to match every part of the composite
              with every split of the value.
            - if the composite has no separator, the function aggregates the parts of the
              composite so as to compare it as a regex to the value.

    If any of the provided ids (`project_id` or `collection_id`) is not found,
    the function raises a EsgvocNotFoundError.

    :param value: A value to be validated
    :type value: str
    :param project_id: A project id
    :type project_id: str
    :param collection_id: A collection id
    :type collection_id: str
    :returns: The list of terms that the value matches.
    :rtype: list[MatchingTerm]
    :raises EsgvocNotFoundError: If any of the provided ids is not found
    """
    with get_universe_session() as universe_session:
        with _get_project_session_with_exception(project_id) as project_session:
            matching_terms = _valid_term_in_collection(
                value, project_id, collection_id, universe_session, project_session
            )
            return matching_terms
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def _valid_term_in_project(
    value: str, project_id: str, universe_session: Session, project_session: Session
) -> list[MatchingTerm]:
    """
    Find the terms that the value matches, across all the collections of a project.

    :param value: A value to be validated
    :type value: str
    :param project_id: A project id
    :type project_id: str
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The matching terms of every collection, in the order of the collections
    :rtype: list[MatchingTerm]
    """
    return [
        matching_term
        for collection in _get_all_collections_in_project(project_session)
        for matching_term in _valid_term_in_collection(
            value, project_id, collection.id, universe_session, project_session
        )
    ]
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def valid_term_in_project(value: str, project_id: str) -> list[MatchingTerm]:
    """
    Check whether the given value represents a term in the given project. The function
    returns the terms that the value matches.

    Behavior based on the nature of the term:
        - plain term: the function tries to match the value on the drs_name field.
        - pattern term: the function tries to match the value on the pattern field (regex).
        - composite term:
            - if the composite has a separator, the function splits the value according to
              the separator of the term, then it tries to match every part of the composite
              with every split of the value.
            - if the composite has no separator, the function aggregates the parts of the
              composite so as to compare it as a regex to the value.

    If the `project_id` is not found, the function raises a EsgvocNotFoundError.

    :param value: A value to be validated
    :type value: str
    :param project_id: A project id
    :type project_id: str
    :returns: The list of terms that the value matches.
    :rtype: list[MatchingTerm]
    :raises EsgvocNotFoundError: If the `project_id` is not found
    """
    with get_universe_session() as universe_session:
        with _get_project_session_with_exception(project_id) as project_session:
            matching_terms = _valid_term_in_project(value, project_id, universe_session, project_session)
            return matching_terms
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def valid_term_in_all_projects(value: str) -> list[MatchingTerm]:
    """
    Check whether the given value represents a term in any of the projects. The function
    returns the terms that the value matches.

    Behavior based on the nature of the term:
        - plain term: the function tries to match the value on the drs_name field.
        - pattern term: the function tries to match the value on the pattern field (regex).
        - composite term:
            - if the composite has a separator, the function splits the value according to
              the separator of the term, then it tries to match every part of the composite
              with every split of the value.
            - if the composite has no separator, the function aggregates the parts of the
              composite so as to compare it as a regex to the value.

    :param value: A value to be validated
    :type value: str
    :returns: The list of terms that the value matches.
    :rtype: list[MatchingTerm]
    """
    matching_terms: list[MatchingTerm] = []
    # The universe session is shared by all the projects, each project has its own session.
    with get_universe_session() as universe_session:
        for project_id in get_all_projects():
            with _get_project_session_with_exception(project_id) as project_session:
                matching_terms += _valid_term_in_project(value, project_id, universe_session, project_session)
    return matching_terms
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def get_all_terms_in_collection(
    project_id: str, collection_id: str, selected_term_fields: Iterable[str] | None = None
) -> list[DataDescriptor]:
    """
    Return every term of one collection of a project.

    Both `project_id` and `collection_id` are matched exactly; similar or related
    projects and collections are not considered. When either id is unknown, an
    empty list is returned.

    :param project_id: A project id
    :type project_id: str
    :param collection_id: A collection id
    :type collection_id: str
    :param selected_term_fields: A list of term fields to select or `None`. If `None`,
      all the fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of term instances. Returns an empty list if no matches are found.
    :rtype: list[DataDescriptor]
    """
    connection = _get_project_connection(project_id)
    if not connection:
        # Unknown project.
        return []
    with connection.create_session() as session:
        collection = _get_collection_in_project(collection_id, session)
        if not collection:
            # Unknown collection.
            return []
        return _get_all_terms_in_collection(collection, selected_term_fields)
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def _get_all_collections_in_project(session: Session) -> list[PCollection]:
    """
    Return the collections of the project stored in the database bound to `session`.

    If reading `project.collections` fails, the failure is logged and the
    `pcollections` table is inspected through raw SQL, looking for collections
    whose `term_kind` is empty or NULL. Each such collection is logged.

    :param session: An open session on a project database.
    :type session: Session
    :returns: The collections of the project.
    :rtype: list[PCollection]
    :raises ValueError: If reading the collections fails and the raw SQL inspection
      finds at least one collection with an empty `term_kind`. The original error
      is chained as the cause.
    :raises Exception: The original error, re-raised when the inspection finds
      nothing or cannot be carried out.
    """
    project = session.get(Project, constants.SQLITE_FIRST_PK)
    # Project can't be missing if session exists.
    try:
        return project.collections  # type: ignore
    except Exception as e:
        import logging

        from sqlalchemy import text

        logger = logging.getLogger(__name__)
        logger.error(f"Failed to retrieve collections for project '{project.id}': {str(e)}")

        # Best-effort diagnosis: read the raw rows, bypassing the ORM mapping that
        # just failed, to identify the problematic collections.
        problematic_collections = []
        try:
            raw_query = text("""
                SELECT id, term_kind, data_descriptor_id
                FROM pcollections
                WHERE project_pk = :project_pk
            """)
            rows = session.execute(raw_query, {"project_pk": project.pk})
            for collection_id, term_kind_value, data_descriptor_id in rows:
                # An empty (or NULL) term_kind means that the ingestion couldn't
                # determine the termkind.
                if term_kind_value == "" or term_kind_value is None:
                    problematic_collections.append((collection_id, term_kind_value, data_descriptor_id))
                    msg = (
                        f"Collection '{collection_id}' has empty term_kind (data_descriptor: "
                        + f"{data_descriptor_id}) - CV ingestion failed to determine termkind"
                    )
                    logger.error(msg)
        except Exception as inner_e:
            logger.error(f"Failed to analyze problematic collections using raw SQL: {inner_e}")
            # The inspection did not complete: fall back on the original error.
            problematic_collections = []

        # The diagnostic error is raised outside of the inspection `try` block.
        # Raised inside it, the error would be caught by the `except` clause above
        # and would never reach the caller.
        if problematic_collections:
            error_details = [
                f" • Collection '{col_id}' (data_descriptor: {data_desc}): EMPTY termkind"
                for col_id, _, data_desc in problematic_collections
            ]
            error_msg = f"Found {len(problematic_collections)} collections with empty term_kind:\n" + "\n".join(
                error_details
            )
            raise ValueError(error_msg) from e

        raise e
|
|
565
|
+
|
|
566
|
+
|
|
567
|
+
def get_all_collections_in_project(project_id: str) -> list[str]:
    """
    Return the ids of all the collections of the given project.

    The `project_id` is matched exactly; similar or related projects are not
    considered. When the `project_id` is unknown, an empty list is returned.

    :param project_id: A project id
    :type project_id: str
    :returns: A list of collection ids. Returns an empty list if no matches are found.
    :rtype: list[str]
    :raises ValueError: If the collections of the project cannot be retrieved. The
      original error is chained as the cause.
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return []
    try:
        with connection.create_session() as session:
            return [collection.id for collection in _get_all_collections_in_project(session)]
    except Exception as e:
        import logging

        # Log first, then re-raise with a message that points at the likely cause.
        logging.getLogger(__name__).error(f"Failed to get collections for project '{project_id}': {str(e)}")
        raise ValueError(
            f"Failed to retrieve collections for project '{project_id}'. "
            f"This may be due to invalid termkind values in the database. "
            f"Check the project database for collections with empty or invalid termkind values. "
            f"Original error: {str(e)}"
        ) from e
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
def _get_all_terms_in_collection(
    collection: PCollection, selected_term_fields: Iterable[str] | None
) -> list[DataDescriptor]:
    """
    Build the list of term instances of the given collection.

    The terms of the collection are handed over to `instantiate_pydantic_terms`,
    which fills the returned list according to `selected_term_fields`.
    """
    terms: list[DataDescriptor] = []
    instantiate_pydantic_terms(collection.terms, terms, selected_term_fields)
    return terms
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
def get_all_terms_in_project(
    project_id: str, selected_term_fields: Iterable[str] | None = None
) -> list[DataDescriptor]:
    """
    Return every term of the given project, collection after collection.

    The `project_id` is matched exactly; similar or related projects are not
    considered. Terms are unique within a collection but may have some synonyms
    in a project. When the `project_id` is unknown, an empty list is returned.

    :param project_id: A project id
    :type project_id: str
    :param selected_term_fields: A list of term fields to select or `None`. If `None`,
      all the fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of term instances. Returns an empty list if no matches are found.
    :rtype: list[DataDescriptor]
    """
    terms: list[DataDescriptor] = []
    connection = _get_project_connection(project_id)
    if connection:
        with connection.create_session() as session:
            for collection in _get_all_collections_in_project(session):
                # No de-duplication: a term may have some synonyms in a project.
                terms += _get_all_terms_in_collection(collection, selected_term_fields)
    return terms
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
def get_all_terms_in_all_projects(
    selected_term_fields: Iterable[str] | None = None,
) -> list[tuple[str, list[DataDescriptor]]]:
    """
    Return the terms of every project, grouped by project.

    :param selected_term_fields: A list of term fields to select or `None`. If `None`,
      all the fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of tuples made of a project id and the term instances of that project.
    :rtype: list[tuple[str, list[DataDescriptor]]]
    """
    return [
        (project_id, get_all_terms_in_project(project_id, selected_term_fields))
        for project_id in get_all_projects()
    ]
|
|
657
|
+
|
|
658
|
+
|
|
659
|
+
def get_all_projects() -> list[str]:
    """
    Return the ids of all the projects.

    The ids are the keys of the projects held by the current state of the service.

    :returns: A list of project ids.
    :rtype: list[str]
    """
    project_ids = service.current_state.projects.keys()
    return list(project_ids)
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
def _get_term_in_project(term_id: str, session: Session) -> PTerm | None:
    """
    Return the first term of the project whose id equals `term_id`, or `None`
    when there is no such term.
    """
    query = select(PTerm).where(PTerm.id == term_id)
    # Term ids are not supposed to be unique within a project: keep the first hit.
    return session.exec(query).first()
|
|
675
|
+
|
|
676
|
+
|
|
677
|
+
def get_term_in_project(
    project_id: str, term_id: str, selected_term_fields: Iterable[str] | None = None
) -> DataDescriptor | None:
    """
    Return the first term of the given project whose id is exactly the given term id.

    Terms are unique within a collection but may have some synonyms in a project,
    hence only the first occurrence is returned. Both `project_id` and `term_id`
    are matched exactly; similar or related projects and terms are not considered.
    When either id is unknown, `None` is returned.

    :param project_id: The id of the given project.
    :type project_id: str
    :param term_id: The id of a term to be found.
    :type term_id: str
    :param selected_term_fields: A list of term fields to select or `None`. If `None`,
      all the fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A term instance. Returns `None` if no match is found.
    :rtype: DataDescriptor | None
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return None
    with connection.create_session() as session:
        pterm = _get_term_in_project(term_id, session)
        if not pterm:
            return None
        return instantiate_pydantic_term(pterm, selected_term_fields)
|
|
706
|
+
|
|
707
|
+
|
|
708
|
+
def _get_term_in_collection(collection_id: str, term_id: str, session: Session) -> PTerm | None:
    """
    Return the term whose id equals `term_id` within the collection whose id
    equals `collection_id`, or `None` when there is no such term.
    """
    query = (
        select(PTerm)
        .join(PCollection)
        .where(PCollection.id == collection_id, PTerm.id == term_id)
    )
    return session.exec(query).one_or_none()
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
def get_term_in_collection(
    project_id: str, collection_id: str, term_id: str, selected_term_fields: Iterable[str] | None = None
) -> DataDescriptor | None:
    """
    Return the term of the given project and collection whose id is exactly the
    given term id.

    `project_id`, `collection_id` and `term_id` are matched exactly; similar or
    related projects, collections and terms are not considered. When any of these
    ids is unknown, `None` is returned.

    :param project_id: The id of the given project.
    :type project_id: str
    :param collection_id: The id of the given collection.
    :type collection_id: str
    :param term_id: The id of a term to be found.
    :type term_id: str
    :param selected_term_fields: A list of term fields to select or `None`. If `None`,
      all the fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A term instance. Returns `None` if no match is found.
    :rtype: DataDescriptor | None
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return None
    with connection.create_session() as session:
        pterm = _get_term_in_collection(collection_id, term_id, session)
        if not pterm:
            return None
        return instantiate_pydantic_term(pterm, selected_term_fields)
|
|
745
|
+
|
|
746
|
+
|
|
747
|
+
def _get_collection_in_project(collection_id: str, session: Session) -> PCollection | None:
    """
    Return the collection of the project whose id equals `collection_id`, or
    `None` when there is no such collection.
    """
    query = select(PCollection).where(PCollection.id == collection_id)
    return session.exec(query).one_or_none()
|
|
752
|
+
|
|
753
|
+
|
|
754
|
+
def get_collection_in_project(project_id: str, collection_id: str) -> tuple[str, dict] | None:
    """
    Return the collection of the given project whose id is exactly the given
    collection id.

    Both `project_id` and `collection_id` are matched exactly; similar or related
    projects and collections are not considered. When either id is unknown,
    `None` is returned.

    :param project_id: The id of the given project.
    :type project_id: str
    :param collection_id: The id of a collection to be found.
    :type collection_id: str
    :returns: A collection id and context. Returns `None` if no match is found.
    :rtype: tuple[str, dict] | None
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return None
    with connection.create_session() as session:
        pcollection = _get_collection_in_project(collection_id, session)
        if not pcollection:
            return None
        return (pcollection.id, pcollection.context)
|
|
777
|
+
|
|
778
|
+
|
|
779
|
+
def get_project(project_id: str) -> ProjectSpecs | None:
    """
    Return the specifications of the given project.

    The `project_id` is matched exactly; similar or related projects are not
    considered. When the `project_id` is unknown, `None` is returned.

    :param project_id: A project id to be found
    :type project_id: str
    :returns: The specs of the project found. Returns `None` if no matches are found.
    :rtype: ProjectSpecs | None
    :raises EsgvocDbError: If the specs of the project cannot be read.
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return None
    with connection.create_session() as session:
        # Project can't be missing if session exists.
        project = session.get(Project, constants.SQLITE_FIRST_PK)
        try:
            # The version of the specs is the git hash recorded for the project.
            return ProjectSpecs(**project.specs, version=project.git_hash)  # type: ignore
        except Exception as e:
            raise EsgvocDbError(f"unable to read specs in project '{project_id}'") from e
|
|
802
|
+
|
|
803
|
+
|
|
804
|
+
def _get_collection_from_data_descriptor_in_project(data_descriptor_id: str, session: Session) -> list[PCollection]:
    """
    Return all the collections of the project whose data descriptor id equals
    `data_descriptor_id`.
    """
    query = select(PCollection).where(PCollection.data_descriptor_id == data_descriptor_id)
    return session.exec(query).all()
|
|
808
|
+
|
|
809
|
+
|
|
810
|
+
def get_collection_from_data_descriptor_in_project(project_id: str, data_descriptor_id: str) -> list[tuple[str, dict]]:
    """
    Return the collections of the given project that correspond to the given data
    descriptor of the universe.

    Both `project_id` and `data_descriptor_id` are matched exactly; similar or
    related projects and data descriptors are not considered. When either id is
    unknown, or when no collection corresponds to the data descriptor, an empty
    list is returned.

    :param project_id: The id of the given project.
    :type project_id: str
    :param data_descriptor_id: The id of the given data descriptor.
    :type data_descriptor_id: str
    :returns: A list of collection ids and contexts. Returns an empty list if no matches are found.
    :rtype: list[tuple[str, dict]]
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return []
    with connection.create_session() as session:
        pcollections = _get_collection_from_data_descriptor_in_project(data_descriptor_id, session)
        return [(pcollection.id, pcollection.context) for pcollection in pcollections]
|
|
832
|
+
|
|
833
|
+
|
|
834
|
+
def get_collection_from_data_descriptor_in_all_projects(data_descriptor_id: str) -> list[tuple[str, str, dict]]:
    """
    Return the collections, across all projects, that correspond to the given data
    descriptor of the universe.

    The `data_descriptor_id` is matched exactly; similar or related data
    descriptors are not considered. When the `data_descriptor_id` is unknown, or
    when no collection corresponds to it, an empty list is returned.

    :param data_descriptor_id: The id of the given data descriptor.
    :type data_descriptor_id: str
    :returns: A list of tuples, each made of a project id, a collection id and the
      context of that collection, in that order. Returns an empty list if no
      matches are found.
    :rtype: list[tuple[str, str, dict]]
    """
    matches: list[tuple[str, str, dict]] = []
    for project_id in get_all_projects():
        per_project = get_collection_from_data_descriptor_in_project(project_id, data_descriptor_id)
        # Prefix each (collection id, context) pair with the id of its project.
        matches.extend((project_id, collection_id, context) for collection_id, context in per_project)
    return matches
|
|
857
|
+
|
|
858
|
+
|
|
859
|
+
def _get_term_from_universe_term_id_in_project(
    data_descriptor_id: str, universe_term_id: str, project_session: Session
) -> PTerm | None:
    """
    Return the project term whose id equals `universe_term_id` and whose collection
    has `data_descriptor_id` as data descriptor id, or `None` when there is no such term.
    """
    belongs_to_data_descriptor = PCollection.data_descriptor_id == data_descriptor_id
    has_universe_term_id = PTerm.id == universe_term_id
    query = select(PTerm).join(PCollection).where(belongs_to_data_descriptor, has_universe_term_id)
    return project_session.exec(query).one_or_none()
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
def get_term_from_universe_term_id_in_project(
    project_id: str, data_descriptor_id: str, universe_term_id: str, selected_term_fields: Iterable[str] | None = None
) -> tuple[str, DataDescriptor] | None:
    """
    Return the term of the given project that corresponds to the given term of the universe.

    `project_id`, `data_descriptor_id` and `universe_term_id` are matched exactly;
    similar or related projects, data descriptors and terms are not considered.
    When any of these ids is unknown, or when no project term corresponds to the
    universe term, `None` is returned.

    :param project_id: The id of the given project.
    :type project_id: str
    :param data_descriptor_id: The id of the data descriptor that contains the given universe term.
    :type data_descriptor_id: str
    :param universe_term_id: The id of the given universe term.
    :type universe_term_id: str
    :param selected_term_fields: A list of term fields to select or `None`. If `None`,
      all the fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A collection id and the project term instance. Returns `None` if no matches are found.
    :rtype: tuple[str, DataDescriptor] | None
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return None
    with connection.create_session() as session:
        pterm = _get_term_from_universe_term_id_in_project(data_descriptor_id, universe_term_id, session)
        if not pterm:
            return None
        term = instantiate_pydantic_term(pterm, selected_term_fields)
        # The id of the collection is read while the session is still open.
        return (pterm.collection.id, term)
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
def get_term_from_universe_term_id_in_all_projects(
    data_descriptor_id: str, universe_term_id: str, selected_term_fields: Iterable[str] | None = None
) -> list[tuple[str, str, DataDescriptor]]:
    """
    Return the terms, across all projects, that correspond to the given term of the universe.

    Both `data_descriptor_id` and `universe_term_id` are matched exactly; similar
    or related data descriptors and terms are not considered. When either id is
    unknown, or when no project term corresponds to the universe term, an empty
    list is returned.

    :param data_descriptor_id: The id of the data descriptor that contains the given universe term.
    :type data_descriptor_id: str
    :param universe_term_id: The id of the given universe term.
    :type universe_term_id: str
    :param selected_term_fields: A list of term fields to select or `None`. If `None`,
      all the fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of tuples, each made of a project id, a collection id and the
      project term instance. Returns an empty list if no matches are found.
    :rtype: list[tuple[str, str, DataDescriptor]]
    """
    matches: list[tuple[str, str, DataDescriptor]] = []
    for project_id in get_all_projects():
        match = get_term_from_universe_term_id_in_project(
            project_id, data_descriptor_id, universe_term_id, selected_term_fields
        )
        if not match:
            # No corresponding term in this project.
            continue
        collection_id, term = match
        matches.append((project_id, collection_id, term))
    return matches
|
|
936
|
+
|
|
937
|
+
|
|
938
|
+
def _find_collections_in_project(
    expression: str, session: Session, only_id: bool = False, limit: int | None = None, offset: int | None = None
) -> Sequence[PCollection]:
    """
    Run a full text search for `expression` over the collections of a project.

    The matching condition is built on the `PCollectionFTS5` table; rank, `limit`
    and `offset` handling is delegated to `handle_rank_limit_offset`, and the
    resulting rows are loaded as `PCollection` objects.
    """
    fts_condition = generate_matching_condition(PCollectionFTS5, expression, only_id)
    fts_query = handle_rank_limit_offset(select(PCollectionFTS5).where(fts_condition), limit, offset)
    query = select(PCollection).from_statement(fts_query)
    return execute_match_statement(expression, query, session)
|
|
945
|
+
|
|
946
|
+
|
|
947
|
+
def find_collections_in_project(
    expression: str, project_id: str, only_id: bool = False, limit: int | None = None, offset: int | None = None
) -> list[tuple[str, dict]]:
    """
    Find collections in the given project with a full text search defined by `expression`.

    The `expression` is made of one or several keywords separated by whitespaces.
    Keywords can be combined with the boolean operators `AND`, `OR` and `NOT`
    (case sensitive). When no boolean operator is provided, whitespaces act as an
    implicit `AND` between each pair of keywords. There is no priority operator
    (parenthesis). A keyword ending with `*` defines a prefix, and an expression
    made of a single keyword is automatically handled as a prefix.

    The collection ids and contexts are returned sorted according to the bm25
    ranking metric (list index `0` has the highest rank).

    The `project_id` is matched exactly; similar or related projects are not
    considered. When the `expression` does not hit any collection, or when the
    `project_id` is unknown, an empty list is returned.

    The search is carried out on the collection specifications, unless `only_id`
    is `True` (default is `False`), in which case it is restricted to the id of the
    collections. **At the moment, `only_id` is set to `True` as the collections
    haven't got any description.**

    :param expression: The full text search expression.
    :type expression: str
    :param project_id: The id of the given project.
    :type project_id: str
    :param only_id: Performs the search only on ids, otherwise on all the specifications.
    :type only_id: bool
    :param limit: Limit the number of returned items found. Returns all the items found
      if `limit` is either `None`, zero or negative.
    :type limit: int | None
    :param offset: Skips `offset` number of items found. Ignored if `offset` is
      either `None`, zero or negative.
    :type offset: int | None
    :returns: A list of collection ids and contexts. Returns an empty list if no matches are found.
    :rtype: list[tuple[str, dict]]
    :raises EsgvocValueError: If the `expression` cannot be interpreted.
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return []
    with connection.create_session() as session:
        pcollections = _find_collections_in_project(expression, session, only_id, limit, offset)
        return [(pcollection.id, pcollection.context) for pcollection in pcollections]
|
|
995
|
+
|
|
996
|
+
|
|
997
|
+
def _find_terms_in_collection(
    expression: str,
    collection_id: str,
    session: Session,
    only_id: bool = False,
    limit: int | None = None,
    offset: int | None = None,
) -> Sequence[PTerm]:
    """
    Run a full text search for `expression` over the terms of one collection of a project.

    The matching condition is built on the `PTermFTS5` table and combined with a
    filter on the id of the collection; rank, `limit` and `offset` handling is
    delegated to `handle_rank_limit_offset`, and the resulting rows are loaded as
    `PTerm` objects.
    """
    fts_condition = generate_matching_condition(PTermFTS5, expression, only_id)
    in_collection = PCollection.id == collection_id
    fts_query = select(PTermFTS5).join(PCollection).where(in_collection, fts_condition)
    query = select(PTerm).from_statement(handle_rank_limit_offset(fts_query, limit, offset))
    return execute_match_statement(expression, query, session)
|
|
1010
|
+
|
|
1011
|
+
|
|
1012
|
+
def _find_terms_in_project(
    expression: str, session: Session, only_id: bool = False, limit: int | None = None, offset: int | None = None
) -> Sequence[PTerm]:
    """
    Run a full text search for `expression` over all the terms of a project.

    The matching condition is built on the `PTermFTS5` table; rank, `limit` and
    `offset` handling is delegated to `handle_rank_limit_offset`, and the
    resulting rows are loaded as `PTerm` objects.
    """
    fts_condition = generate_matching_condition(PTermFTS5, expression, only_id)
    fts_query = handle_rank_limit_offset(select(PTermFTS5).where(fts_condition), limit, offset)
    query = select(PTerm).from_statement(fts_query)
    return execute_match_statement(expression, query, session)
|
|
1019
|
+
|
|
1020
|
+
|
|
1021
|
+
def find_terms_in_collection(
    expression: str,
    project_id: str,
    collection_id: str,
    only_id: bool = False,
    limit: int | None = None,
    offset: int | None = None,
    selected_term_fields: Iterable[str] | None = None,
) -> list[DataDescriptor]:
    """
    Find terms in the given project and collection with a full text search defined
    by `expression`.

    The `expression` is made of one or several keywords separated by whitespaces.
    Keywords can be combined with the boolean operators `AND`, `OR` and `NOT`
    (case sensitive). When no boolean operator is provided, whitespaces act as an
    implicit `AND` between each pair of keywords. There is no priority operator
    (parenthesis). A keyword ending with `*` defines a prefix, and an expression
    made of a single keyword is automatically handled as a prefix.

    The term instances are returned sorted according to the bm25 ranking metric
    (list index `0` has the highest rank).

    Both `project_id` and `collection_id` are matched exactly; similar or related
    projects and collections are not considered. When the `expression` does not
    hit any term, or when either id is unknown, an empty list is returned.

    The search is carried out on the term specifications, unless `only_id` is
    `True` (default is `False`), in which case it is restricted to the id of the terms.

    :param expression: The full text search expression.
    :type expression: str
    :param project_id: The id of the given project.
    :type project_id: str
    :param collection_id: The id of the given collection.
    :type collection_id: str
    :param only_id: Performs the search only on ids, otherwise on all the specifications.
    :type only_id: bool
    :param limit: Limit the number of returned items found. Returns all the items found
      if `limit` is either `None`, zero or negative.
    :type limit: int | None
    :param offset: Skips `offset` number of items found. Ignored if `offset` is
      either `None`, zero or negative.
    :type offset: int | None
    :param selected_term_fields: A list of term fields to select or `None`. If `None`,
      all the fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of term instances. Returns an empty list if no matches are found.
    :rtype: list[DataDescriptor]
    :raises EsgvocValueError: If the `expression` cannot be interpreted.
    """
    terms: list[DataDescriptor] = []
    connection = _get_project_connection(project_id)
    if not connection:
        return terms
    with connection.create_session() as session:
        pterms = _find_terms_in_collection(expression, collection_id, session, only_id, limit, offset)
        # The term instances are appended to `terms` by the helper.
        instantiate_pydantic_terms(pterms, terms, selected_term_fields)
    return terms
|
|
1079
|
+
|
|
1080
|
+
|
|
1081
|
+
def find_terms_in_project(
    expression: str,
    project_id: str,
    only_id: bool = False,
    limit: int | None = None,
    offset: int | None = None,
    selected_term_fields: Iterable[str] | None = None,
) -> list[DataDescriptor]:
    """
    Run a full text search for terms inside a single project.

    The `expression` is made of one or several keywords separated by whitespaces.
    Keywords can be combined with the boolean operators `AND`, `OR` and `NOT`
    (case sensitive). When no boolean operator is provided, whitespaces behave as an
    implicit `AND` between each pair of keywords. There is no priority operator
    (parenthesis).
    A keyword ending with `*` is a prefix. An expression made of a single keyword is
    automatically treated as a prefix.

    The terms found are returned sorted according to the bm25 ranking metric
    (list index `0` has the highest rank).
    The `project_id` is matched exactly: similar or related projects are not searched.
    An empty list is returned when the `expression` does not hit any term or when the
    `project_id` is not found.

    By default the search is performed on the term specifications. When `only_id` is
    `True`, the search is restricted to the id of the terms.

    :param expression: The full text search expression.
    :type expression: str
    :param project_id: The id of the given project.
    :type project_id: str
    :param only_id: Performs the search only on ids, otherwise on all the specifications.
    :type only_id: bool
    :param limit: Limit the number of returned items found. Returns all the items found
                  if `limit` is either `None`, zero or negative.
    :type limit: int | None
    :param offset: Skips `offset` number of items found. Ignored if `offset` is
                   either `None`, zero or negative.
    :type offset: int | None
    :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the
                                 fields of the terms are returned. If empty, selects the id and
                                 type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of term instances. Returns an empty list if no matches are found.
    :rtype: list[DataDescriptor]
    :raises EsgvocValueError: If the `expression` cannot be interpreted.
    """
    terms: list[DataDescriptor] = list()
    connection = _get_project_connection(project_id)
    # No usable connection for this project id: nothing to search.
    if not connection:
        return terms
    with connection.create_session() as session:
        matches = _find_terms_in_project(expression, session, only_id, limit, offset)
        # The instances are appended to `terms` while the session is still open.
        instantiate_pydantic_terms(matches, terms, selected_term_fields)
    return terms
|
|
1135
|
+
|
|
1136
|
+
|
|
1137
|
+
def find_terms_in_all_projects(
    expression: str,
    only_id: bool = False,
    limit: int | None = None,
    offset: int | None = None,
    selected_term_fields: Iterable[str] | None = None,
) -> list[tuple[str, list[DataDescriptor]]]:
    """
    Run a full text search for terms across every project.

    The `expression` is made of one or several keywords separated by whitespaces.
    Keywords can be combined with the boolean operators `AND`, `OR` and `NOT`
    (case sensitive). When no boolean operator is provided, whitespaces behave as an
    implicit `AND` between each pair of keywords. There is no priority operator
    (parenthesis).
    A keyword ending with `*` is a prefix. An expression made of a single keyword is
    automatically treated as a prefix.

    Each project is searched independently with `find_terms_in_project`, so the
    bm25 ranking (list index `0` has the highest rank), `limit` and `offset` apply to
    the terms of each project separately. Projects without any hit are left out of the
    result. An empty list is returned when the `expression` does not hit any term.

    By default the search is performed on the term specifications. When `only_id` is
    `True`, the search is restricted to the id of the terms.

    :param expression: The full text search expression.
    :type expression: str
    :param only_id: Performs the search only on ids, otherwise on all the specifications.
    :type only_id: bool
    :param limit: Limit the number of returned items found. Returns all the items found
                  if `limit` is either `None`, zero or negative.
    :type limit: int | None
    :param offset: Skips `offset` number of items found. Ignored if `offset` is
                   either `None`, zero or negative.
    :type offset: int | None
    :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the
                                 fields of the terms are returned. If empty, selects the id and
                                 type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of project ids and term instances. Returns an empty list if no matches
              are found.
    :rtype: list[tuple[str, list[DataDescriptor]]]
    :raises EsgvocValueError: If the `expression` cannot be interpreted.
    """
    # Keep only the projects whose search returned at least one term.
    return [
        (project_id, terms_found)
        for project_id in get_all_projects()
        if (
            terms_found := find_terms_in_project(
                expression, project_id, only_id, limit, offset, selected_term_fields
            )
        )
    ]
|
|
1186
|
+
|
|
1187
|
+
|
|
1188
|
+
def find_items_in_project(
    expression: str, project_id: str, only_id: bool = False, limit: int | None = None, offset: int | None = None
) -> list[Item]:
    """
    Run a full text search for items (at the moment terms and collections) inside a
    single project.

    The `expression` is made of one or several keywords separated by whitespaces.
    Keywords can be combined with the boolean operators `AND`, `OR` and `NOT`
    (case sensitive). When no boolean operator is provided, whitespaces behave as an
    implicit `AND` between each pair of keywords. There is no priority operator
    (parenthesis).
    A keyword ending with `*` is a prefix. An expression made of a single keyword is
    automatically treated as a prefix.

    The items found are returned sorted according to the bm25 ranking metric
    (list index `0` has the highest rank).
    The `project_id` is matched exactly: similar or related projects are not searched.
    An empty list is returned when the `expression` does not hit any item or when the
    `project_id` is not found.

    By default the search is performed on the term specifications. When `only_id` is
    `True`, the search is restricted to the id of the terms.
    **At the moment, collections are always searched by id, whatever the value of
    `only_id`, because they haven't got any description.**

    :param expression: The full text search expression.
    :type expression: str
    :param project_id: The id of the given project.
    :type project_id: str
    :param only_id: Performs the search only on ids, otherwise on all the specifications.
    :type only_id: bool
    :param limit: Limit the number of returned items found. Returns all the items found
                  if `limit` is either `None`, zero or negative.
    :type limit: int | None
    :param offset: Skips `offset` number of items found. Ignored if `offset` is
                   either `None`, zero or negative.
    :type offset: int | None
    :returns: A list of item instances. Returns an empty list if no matches are found.
    :rtype: list[Item]
    :raises EsgvocValueError: If the `expression` cannot be interpreted.
    """
    # TODO: execute union query when it will be possible to compute parent of terms and collections.
    connection = _get_project_connection(project_id)
    # No usable connection for this project id: nothing to search.
    if not connection:
        return list()
    with connection.create_session() as session:
        fts_query = process_expression(expression)
        # Collections are matched on their id in every case.
        # TODO: use specs when implemented!
        collection_column = col(PCollectionFTS5.id)
        term_column = col(PTermFTS5.id) if only_id else col(PTermFTS5.specs)  # type: ignore
        # Both statements select four columns: id, item kind, a third column and rank.
        # The third column is the project id for collections and the collection id for terms.
        collection_statement = select(
            PCollectionFTS5.id,
            text("'collection' AS TYPE"),
            text(f"'{project_id}' AS TYPE"),
            text("rank"),
        ).where(collection_column.match(fts_query))
        term_statement = (
            select(PTermFTS5.id, text("'term' AS TYPE"), PCollection.id, text("rank"))
            .join(PCollection)
            .where(term_column.match(fts_query))
        )
        return execute_find_item_statements(
            session, fts_query, collection_statement, term_statement, limit, offset
        )
|