esgvoc 0.4.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of esgvoc might be problematic; see the registry page for details.
- esgvoc/__init__.py +1 -1
- esgvoc/api/data_descriptors/__init__.py +52 -28
- esgvoc/api/data_descriptors/activity.py +3 -3
- esgvoc/api/data_descriptors/area_label.py +16 -1
- esgvoc/api/data_descriptors/branded_suffix.py +20 -0
- esgvoc/api/data_descriptors/branded_variable.py +12 -0
- esgvoc/api/data_descriptors/consortium.py +14 -13
- esgvoc/api/data_descriptors/contact.py +5 -0
- esgvoc/api/data_descriptors/conventions.py +6 -0
- esgvoc/api/data_descriptors/creation_date.py +5 -0
- esgvoc/api/data_descriptors/data_descriptor.py +14 -9
- esgvoc/api/data_descriptors/data_specs_version.py +5 -0
- esgvoc/api/data_descriptors/date.py +1 -1
- esgvoc/api/data_descriptors/directory_date.py +1 -1
- esgvoc/api/data_descriptors/experiment.py +13 -11
- esgvoc/api/data_descriptors/forcing_index.py +1 -1
- esgvoc/api/data_descriptors/frequency.py +3 -3
- esgvoc/api/data_descriptors/further_info_url.py +5 -0
- esgvoc/api/data_descriptors/grid_label.py +2 -2
- esgvoc/api/data_descriptors/horizontal_label.py +15 -1
- esgvoc/api/data_descriptors/initialisation_index.py +1 -1
- esgvoc/api/data_descriptors/institution.py +8 -5
- esgvoc/api/data_descriptors/known_branded_variable.py +23 -0
- esgvoc/api/data_descriptors/license.py +3 -3
- esgvoc/api/data_descriptors/member_id.py +9 -0
- esgvoc/api/data_descriptors/mip_era.py +1 -1
- esgvoc/api/data_descriptors/model_component.py +1 -1
- esgvoc/api/data_descriptors/obs_type.py +5 -0
- esgvoc/api/data_descriptors/organisation.py +1 -1
- esgvoc/api/data_descriptors/physic_index.py +1 -1
- esgvoc/api/data_descriptors/product.py +2 -2
- esgvoc/api/data_descriptors/publication_status.py +5 -0
- esgvoc/api/data_descriptors/realisation_index.py +1 -1
- esgvoc/api/data_descriptors/realm.py +1 -1
- esgvoc/api/data_descriptors/region.py +5 -0
- esgvoc/api/data_descriptors/resolution.py +3 -3
- esgvoc/api/data_descriptors/source.py +9 -5
- esgvoc/api/data_descriptors/source_type.py +1 -1
- esgvoc/api/data_descriptors/table.py +3 -2
- esgvoc/api/data_descriptors/temporal_label.py +15 -1
- esgvoc/api/data_descriptors/time_range.py +4 -3
- esgvoc/api/data_descriptors/title.py +5 -0
- esgvoc/api/data_descriptors/tracking_id.py +5 -0
- esgvoc/api/data_descriptors/variable.py +25 -12
- esgvoc/api/data_descriptors/variant_label.py +3 -3
- esgvoc/api/data_descriptors/vertical_label.py +14 -0
- esgvoc/api/project_specs.py +117 -2
- esgvoc/api/projects.py +328 -287
- esgvoc/api/search.py +30 -3
- esgvoc/api/universe.py +42 -27
- esgvoc/apps/drs/generator.py +87 -74
- esgvoc/apps/jsg/cmip6_template.json +74 -0
- esgvoc/apps/jsg/json_schema_generator.py +194 -0
- esgvoc/cli/config.py +500 -0
- esgvoc/cli/find.py +138 -0
- esgvoc/cli/get.py +43 -38
- esgvoc/cli/main.py +10 -3
- esgvoc/cli/status.py +27 -18
- esgvoc/cli/valid.py +10 -15
- esgvoc/core/db/models/project.py +11 -11
- esgvoc/core/db/models/universe.py +3 -3
- esgvoc/core/db/project_ingestion.py +40 -40
- esgvoc/core/db/universe_ingestion.py +36 -33
- esgvoc/core/logging_handler.py +24 -2
- esgvoc/core/repo_fetcher.py +61 -59
- esgvoc/core/service/data_merger.py +47 -34
- esgvoc/core/service/state.py +107 -83
- {esgvoc-0.4.0.dist-info → esgvoc-1.0.1.dist-info}/METADATA +5 -20
- esgvoc-1.0.1.dist-info/RECORD +95 -0
- esgvoc/core/logging.conf +0 -21
- esgvoc-0.4.0.dist-info/RECORD +0 -80
- {esgvoc-0.4.0.dist-info → esgvoc-1.0.1.dist-info}/WHEEL +0 -0
- {esgvoc-0.4.0.dist-info → esgvoc-1.0.1.dist-info}/entry_points.txt +0 -0
- {esgvoc-0.4.0.dist-info → esgvoc-1.0.1.dist-info}/licenses/LICENSE.txt +0 -0
esgvoc/api/projects.py
CHANGED
@@ -1,3 +1,4 @@
+import itertools
 import re
 from typing import Iterable, Sequence
 
@@ -20,16 +21,11 @@ from esgvoc.api.search import (
     handle_rank_limit_offset,
     instantiate_pydantic_term,
     instantiate_pydantic_terms,
+    process_expression,
 )
 from esgvoc.core.db.connection import DBConnection
 from esgvoc.core.db.models.mixins import TermKind
-from esgvoc.core.db.models.project import (
-    Collection,
-    PCollectionFTS5,
-    Project,
-    PTerm,
-    PTermFTS5,
-)
+from esgvoc.core.db.models.project import PCollection, PCollectionFTS5, Project, PTerm, PTermFTS5
 from esgvoc.core.db.models.universe import UTerm
 from esgvoc.core.exceptions import EsgvocDbError, EsgvocNotFoundError, EsgvocNotImplementedError, EsgvocValueError
 
@@ -53,21 +49,17 @@ def _get_project_session_with_exception(project_id: str) -> Session:
     raise EsgvocNotFoundError(f"unable to find project '{project_id}'")
 
 
-def _resolve_term(composite_term_part: dict,
-                  universe_session: Session,
-                  project_session: Session) -> UTerm | PTerm:
+def _resolve_term(composite_term_part: dict, universe_session: Session, project_session: Session) -> UTerm | PTerm:
     # First find the term in the universe than in the current project
     term_id = composite_term_part[constants.TERM_ID_JSON_KEY]
     term_type = composite_term_part[constants.TERM_TYPE_JSON_KEY]
-    uterm = universe._get_term_in_data_descriptor(…
-…
-…
+    uterm = universe._get_term_in_data_descriptor(
+        data_descriptor_id=term_type, term_id=term_id, session=universe_session
+    )
     if uterm:
         return uterm
     else:
-        pterm = _get_term_in_collection(collection_id=term_type,
-                                        term_id=term_id,
-                                        session=project_session)
+        pterm = _get_term_in_collection(collection_id=term_type, term_id=term_id, session=project_session)
         if pterm:
             return pterm
         else:
@@ -81,13 +73,88 @@ def _get_composite_term_separator_parts(term: UTerm | PTerm) -> tuple[str, list]
     return separator, parts
 
 
+def _valid_value_composite_term_with_separator(
+    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
+) -> list[UniverseTermError | ProjectTermError]:
+    result = []
+    separator, parts = _get_composite_term_separator_parts(term)
+    required_indices = {i for i, p in enumerate(parts) if p.get("is_required", False)}
+
+    splits = value.split(separator)
+    nb_splits = len(splits)
+    nb_parts = len(parts)
+
+    if nb_splits > nb_parts:
+        return [_create_term_error(value, term)]
+
+    # Generate all possible assignments of split values into parts
+    # Only keep those that include all required parts
+    all_positions = [i for i in range(nb_parts)]
+    valid_combinations = [
+        comb for comb in itertools.combinations(all_positions, nb_splits) if required_indices.issubset(comb)
+    ]
+
+    for positions in valid_combinations:
+        candidate = [None] * nb_parts
+        for idx, pos in enumerate(positions):
+            candidate[pos] = splits[idx]
+
+        # Separator structure validation:
+        # - No leading separator if the first part is None
+        # - No trailing separator if the last part is None
+        # - No double separators where two adjacent optional parts are missing
+        if candidate[0] is None and value.startswith(separator):
+            continue
+        if candidate[-1] is None and value.endswith(separator):
+            continue
+        if any(
+            candidate[i] is None and candidate[i + 1] is None and separator * 2 in value for i in range(nb_parts - 1)
+        ):
+            continue  # invalid double separator between two missing parts
+
+        # Validate each filled part value
+        all_valid = True
+        for i, given_value in enumerate(candidate):
+            if given_value is None:
+                if parts[i].get("is_required", False):
+                    all_valid = False
+                    break
+                continue  # optional and missing part is allowed
+
+            part = parts[i]
+
+            # Resolve term ID list if not present
+            if "id" not in part:
+                terms = universe.get_all_terms_in_data_descriptor(part["type"], None)
+                part["id"] = [term.id for term in terms]
+            if isinstance(part["id"], str):
+                part["id"] = [part["id"]]
+
+            # Try all possible term IDs to find a valid match
+            valid_for_this_part = False
+            for id in part["id"]:
+                part_copy = dict(part)
+                part_copy["id"] = id
+                resolved_term = _resolve_term(part_copy, universe_session, project_session)
+                errors = _valid_value(given_value, resolved_term, universe_session, project_session)
+                if not errors:
+                    valid_for_this_part = True
+                    break
+            if not valid_for_this_part:
+                all_valid = False
+                break
+
+        if all_valid:
+            return []  # At least one valid combination found
+
+    return [_create_term_error(value, term)]  # No valid combination found
+
+
 # TODO: support optionality of parts of composite.
 # It is backtrack possible for more than one missing parts.
-def _valid_value_composite_term_with_separator(value: str,
-…
-…
-                                               project_session: Session)\
-        -> list[UniverseTermError | ProjectTermError]:
+def _valid_value_composite_term_with_separator2(
+    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
+) -> list[UniverseTermError | ProjectTermError]:
     result = list()
     separator, parts = _get_composite_term_separator_parts(term)
     if separator in value:
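Note: the new `_valid_value_composite_term_with_separator` above enumerates every placement of the separator-split tokens onto the composite's parts and keeps only the placements that cover all required parts, then validates each filled part. A minimal standalone sketch of that assignment step (the part descriptors, value and separator below are made up for illustration; only the `is_required` flag and the use of `itertools.combinations` come from the code above):

import itertools

# Hypothetical composite: two required parts surrounding one optional part.
parts = [
    {"type": "variable", "is_required": True},
    {"type": "temporal_label", "is_required": False},
    {"type": "grid_label", "is_required": True},
]
required = {i for i, p in enumerate(parts) if p["is_required"]}

def assignments(value: str, separator: str):
    # Yield every placement of the split tokens onto part slots that keeps
    # all required slots filled; missing optional slots stay None.
    splits = value.split(separator)
    if len(splits) > len(parts):
        return
    for positions in itertools.combinations(range(len(parts)), len(splits)):
        if not required.issubset(positions):
            continue
        candidate = [None] * len(parts)
        for token, pos in zip(splits, positions):
            candidate[pos] = token
        yield candidate

# "tas_gn" can only be read as (variable, grid_label) with the optional
# temporal_label left out; "tas_tavg_gn" fills all three slots.
print(list(assignments("tas_gn", "_")))
print(list(assignments("tas_tavg_gn", "_")))

Each surviving candidate is then checked part by part with `_valid_value`, exactly as in the hunk above.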
@@ -95,14 +162,25 @@ def _valid_value_composite_term_with_separator(value: str,
         if len(splits) == len(parts):
             for index in range(0, len(splits)):
                 given_value = splits[index]
-…
-…
-…
-…
-…
-…
-…
-…
+                if "id" not in parts[index].keys():
+                    terms = universe.get_all_terms_in_data_descriptor(parts[index]["type"], None)
+                    parts[index]["id"] = [term.id for term in terms]
+                if type(parts[index]["id"]) is str:
+                    parts[index]["id"] = [parts[index]["id"]]
+
+                errors_list = list()
+                for id in parts[index]["id"]:
+                    part_parts = dict(parts[index])
+                    part_parts["id"] = id
+                    resolved_term = _resolve_term(part_parts, universe_session, project_session)
+                    errors = _valid_value(given_value, resolved_term, universe_session, project_session)
+                    if len(errors) == 0:
+                        errors_list = errors
+                        break
+                    else:
+                        errors_list.extend(errors)
+                else:
+                    result.append(_create_term_error(value, term))
         else:
             result.append(_create_term_error(value, term))
     else:
@@ -110,16 +188,13 @@ def _valid_value_composite_term_with_separator(value: str,
     return result
 
 
-def _transform_to_pattern(term: UTerm | PTerm,
-                          universe_session: Session,
-                          project_session: Session) -> str:
+def _transform_to_pattern(term: UTerm | PTerm, universe_session: Session, project_session: Session) -> str:
     match term.kind:
         case TermKind.PLAIN:
             if constants.DRS_SPECS_JSON_KEY in term.specs:
                 result = term.specs[constants.DRS_SPECS_JSON_KEY]
             else:
-                raise EsgvocValueError(f"the term '{term.id}' doesn't have drs name. " +
-                                       "Can't validate it.")
+                raise EsgvocValueError(f"the term '{term.id}' doesn't have drs name. " + "Can't validate it.")
         case TermKind.PATTERN:
             result = term.specs[constants.PATTERN_JSON_KEY]
         case TermKind.COMPOSITE:
@@ -128,7 +203,7 @@ def _transform_to_pattern(term: UTerm | PTerm,
             for part in parts:
                 resolved_term = _resolve_term(part, universe_session, project_session)
                 pattern = _transform_to_pattern(resolved_term, universe_session, project_session)
-                result = f…
+                result = f"{result}{pattern}{separator}"
             result = result.rstrip(separator)
         case _:
             raise EsgvocDbError(f"unsupported term kind '{term.kind}'")
@@ -137,11 +212,9 @@ def _transform_to_pattern(term: UTerm | PTerm,
 
 # TODO: support optionality of parts of composite.
 # It is backtrack possible for more than one missing parts.
-def _valid_value_composite_term_separator_less(value: str,
-…
-…
-                                               project_session: Session)\
-        -> list[UniverseTermError | ProjectTermError]:
+def _valid_value_composite_term_separator_less(
+    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
+) -> list[UniverseTermError | ProjectTermError]:
     result = list()
     try:
         pattern = _transform_to_pattern(term, universe_session, project_session)
@@ -150,8 +223,8 @@ def _valid_value_composite_term_separator_less(value: str,
         # So their regex are defined as a whole (begins by a ^, ends by a $).
         # As the pattern is a concatenation of plain or regex, multiple ^ and $ can exist.
         # The later, must be removed.
-        pattern = pattern.replace(…
-        pattern = f…
+        pattern = pattern.replace("^", "").replace("$", "")
+        pattern = f"^{pattern}$"
         regex = re.compile(pattern)
     except Exception as e:
         msg = f"regex compilation error while processing term '{term.id}'':\n{e}"
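Note: for separator-less composites, the pattern built by `_transform_to_pattern` is a concatenation of the sub-terms' regexes, each of which is anchored on its own; the two replaced lines above strip those inner anchors and re-anchor the whole pattern. A toy illustration of why that is needed (the sub-patterns are invented, not taken from the vocabulary):

import re

# Two sub-patterns, each anchored as a whole (begins with ^, ends with $).
part_patterns = ["^[a-z]+$", "^[0-9]{4}$"]

# Naive concatenation keeps the inner anchors and can never match.
naive = "".join(part_patterns)
print(re.match(naive, "tas2024"))  # None

# Same approach as the diff: drop every ^ and $, then re-anchor the whole pattern.
merged = "".join(p.replace("^", "").replace("$", "") for p in part_patterns)
merged = f"^{merged}$"
print(bool(re.match(merged, "tas2024")))  # True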
@@ -165,35 +238,30 @@ def _valid_value_composite_term_separator_less(value: str,
         raise EsgvocNotImplementedError(msg) from e
 
 
-def _valid_value_for_composite_term(…
-…
-…
-                                    project_session: Session)\
-        -> list[UniverseTermError | ProjectTermError]:
+def _valid_value_for_composite_term(
+    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
+) -> list[UniverseTermError | ProjectTermError]:
     result = list()
     separator, _ = _get_composite_term_separator_parts(term)
     if separator:
-        result = _valid_value_composite_term_with_separator(value, term, universe_session,
-                                                            project_session)
+        result = _valid_value_composite_term_with_separator(value, term, universe_session, project_session)
     else:
-        result = _valid_value_composite_term_separator_less(value, term, universe_session,
-                                                            project_session)
+        result = _valid_value_composite_term_separator_less(value, term, universe_session, project_session)
     return result
 
 
 def _create_term_error(value: str, term: UTerm | PTerm) -> UniverseTermError | ProjectTermError:
     if isinstance(term, UTerm):
-        return UniverseTermError(…
-…
+        return UniverseTermError(
+            value=value, term=term.specs, term_kind=term.kind, data_descriptor_id=term.data_descriptor.id
+        )
     else:
-        return ProjectTermError(value=value, term=term.specs, term_kind=term.kind,
-                                collection_id=term.collection.id)
+        return ProjectTermError(value=value, term=term.specs, term_kind=term.kind, collection_id=term.collection.id)
 
 
-def _valid_value(value: str,
-…
-…
-                 project_session: Session) -> list[UniverseTermError | ProjectTermError]:
+def _valid_value(
+    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
+) -> list[UniverseTermError | ProjectTermError]:
     result = list()
     match term.kind:
         case TermKind.PLAIN:
@@ -201,17 +269,14 @@ def _valid_value(value: str,
             if term.specs[constants.DRS_SPECS_JSON_KEY] != value:
                 result.append(_create_term_error(value, term))
             else:
-                raise EsgvocValueError(f"the term '{term.id}' doesn't have drs name. " +
-                                       "Can't validate it.")
+                raise EsgvocValueError(f"the term '{term.id}' doesn't have drs name. " + "Can't validate it.")
         case TermKind.PATTERN:
             # TODO: Pattern can be compiled and stored for further matching.
             pattern_match = re.match(term.specs[constants.PATTERN_JSON_KEY], value)
             if pattern_match is None:
                 result.append(_create_term_error(value, term))
         case TermKind.COMPOSITE:
-            result.extend(_valid_value_for_composite_term(value, term,
-                                                          universe_session,
-                                                          project_session))
+            result.extend(_valid_value_for_composite_term(value, term, universe_session, project_session))
         case _:
             raise EsgvocDbError(f"unsupported term kind '{term.kind}'")
     return result
@@ -219,33 +284,25 @@ def _valid_value(value: str,
 
 def _check_value(value: str) -> str:
     if not value or value.isspace():
-        raise EsgvocValueError(…
+        raise EsgvocValueError("value should be set")
     else:
         return value
 
 
-def _search_plain_term_and_valid_value(value: str,
-…
-…
-        -> str | None:
-    where_expression = and_(Collection.id == collection_id,
-                            PTerm.specs[constants.DRS_SPECS_JSON_KEY] == f'"{value}"')
-    statement = select(PTerm).join(Collection).where(where_expression)
+def _search_plain_term_and_valid_value(value: str, collection_id: str, project_session: Session) -> str | None:
+    where_expression = and_(PCollection.id == collection_id, PTerm.specs[constants.DRS_SPECS_JSON_KEY] == f'"{value}"')
+    statement = select(PTerm).join(PCollection).where(where_expression)
     term = project_session.exec(statement).one_or_none()
     return term.id if term else None
 
 
-def _valid_value_against_all_terms_of_collection(value: str,
-…
-…
-                                                 project_session: Session) \
-        -> list[str]:
+def _valid_value_against_all_terms_of_collection(
+    value: str, collection: PCollection, universe_session: Session, project_session: Session
+) -> list[str]:
     if collection.terms:
         result = list()
         for pterm in collection.terms:
-            _errors = _valid_value(value, pterm,
-                                   universe_session,
-                                   project_session)
+            _errors = _valid_value(value, pterm, universe_session, project_session)
             if not _errors:
                 result.append(pterm.id)
         return result
@@ -253,35 +310,24 @@ def _valid_value_against_all_terms_of_collection(value: str,
         raise EsgvocDbError(f"collection '{collection.id}' has no term")
 
 
-def _valid_value_against_given_term(…
-…
-…
-                                    term_id: str,
-                                    universe_session: Session,
-                                    project_session: Session)\
-        -> list[UniverseTermError | ProjectTermError]:
+def _valid_value_against_given_term(
+    value: str, project_id: str, collection_id: str, term_id: str, universe_session: Session, project_session: Session
+) -> list[UniverseTermError | ProjectTermError]:
     # [OPTIMIZATION]
     key = value + project_id + collection_id + term_id
     if key in _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE:
         result = _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE[key]
     else:
-        term = _get_term_in_collection(collection_id,
-                                       term_id,
-                                       project_session)
+        term = _get_term_in_collection(collection_id, term_id, project_session)
         if term:
             result = _valid_value(value, term, universe_session, project_session)
         else:
-            raise EsgvocNotFoundError(f"unable to find term '{term_id}' " +
-                                      f"in collection '{collection_id}'")
+            raise EsgvocNotFoundError(f"unable to find term '{term_id}' " + f"in collection '{collection_id}'")
         _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE[key] = result
     return result
 
 
-def valid_term(value: str,
-               project_id: str,
-               collection_id: str,
-               term_id: str) \
-        -> ValidationReport:
+def valid_term(value: str, project_id: str, collection_id: str, term_id: str) -> ValidationReport:
     """
     Check if the given value may or may not represent the given term. The functions returns
     a report that contains the possible errors.
@@ -312,19 +358,16 @@ def valid_term(value: str,
     :raises EsgvocNotFoundError: If any of the provided ids is not found
     """
     value = _check_value(value)
-    with get_universe_session() as universe_session,…
-…
-…
-…
+    with get_universe_session() as universe_session, _get_project_session_with_exception(project_id) as project_session:
+        errors = _valid_value_against_given_term(
+            value, project_id, collection_id, term_id, universe_session, project_session
+        )
         return ValidationReport(expression=value, errors=errors)
 
 
-def _valid_term_in_collection(value: str,
-…
-…
-                              universe_session: Session,
-                              project_session: Session) \
-        -> list[MatchingTerm]:
+def _valid_term_in_collection(
+    value: str, project_id: str, collection_id: str, universe_session: Session, project_session: Session
+) -> list[MatchingTerm]:
     # [OPTIMIZATION]
     key = value + project_id + collection_id
     if key in _VALID_TERM_IN_COLLECTION_CACHE:
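Note: `valid_term` and `valid_term_in_collection` are the public entry points touched above; the first returns a `ValidationReport` (with `expression` and `errors`), the second a list of `MatchingTerm`. A hedged usage sketch, where the project, collection and term ids are placeholders that must match the vocabularies actually installed:

from esgvoc.api.projects import valid_term, valid_term_in_collection

# Hypothetical ids; adjust to the installed projects and collections.
report = valid_term("IPSL", "cmip6", "institution_id", "ipsl")
if report.errors:
    print("invalid:", report.errors)
else:
    print("valid:", report.expression)

# Or search the whole collection for terms that the value may represent.
matches = valid_term_in_collection("IPSL", "cmip6", "institution_id")
print([m.term_id for m in matches])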
@@ -336,20 +379,19 @@ def _valid_term_in_collection(value: str,
         if collection:
             match collection.term_kind:
                 case TermKind.PLAIN:
-                    term_id_found = _search_plain_term_and_valid_value(value, collection_id,
-                                                                       project_session)
+                    term_id_found = _search_plain_term_and_valid_value(value, collection_id, project_session)
                     if term_id_found:
-                        result.append(…
-…
-…
+                        result.append(
+                            MatchingTerm(project_id=project_id, collection_id=collection_id, term_id=term_id_found)
+                        )
                 case _:
-                    term_ids_found = _valid_value_against_all_terms_of_collection(…
-…
-…
+                    term_ids_found = _valid_value_against_all_terms_of_collection(
+                        value, collection, universe_session, project_session
+                    )
                     for term_id_found in term_ids_found:
-                        result.append(…
-…
-…
+                        result.append(
+                            MatchingTerm(project_id=project_id, collection_id=collection_id, term_id=term_id_found)
+                        )
         else:
             msg = f"unable to find collection '{collection_id}'"
             raise EsgvocNotFoundError(msg)
@@ -357,10 +399,7 @@ def _valid_term_in_collection(value: str,
     return result
 
 
-def valid_term_in_collection(value: str,
-                             project_id: str,
-                             collection_id: str) \
-        -> list[MatchingTerm]:
+def valid_term_in_collection(value: str, project_id: str, collection_id: str) -> list[MatchingTerm]:
     """
     Check if the given value may or may not represent a term in the given collection. The function
     returns the terms that the value matches.
@@ -388,21 +427,17 @@ def valid_term_in_collection(value: str,
     :rtype: list[MatchingTerm]
     :raises EsgvocNotFoundError: If any of the provided ids is not found
     """
-    with get_universe_session() as universe_session,…
-…
-        return _valid_term_in_collection(value, project_id, collection_id,
-                                         universe_session, project_session)
+    with get_universe_session() as universe_session, _get_project_session_with_exception(project_id) as project_session:
+        return _valid_term_in_collection(value, project_id, collection_id, universe_session, project_session)
 
 
-def _valid_term_in_project(…
-…
-…
-                           project_session: Session) -> list[MatchingTerm]:
+def _valid_term_in_project(
+    value: str, project_id: str, universe_session: Session, project_session: Session
+) -> list[MatchingTerm]:
     result = list()
     collections = _get_all_collections_in_project(project_session)
     for collection in collections:
-        result.extend(_valid_term_in_collection(value, project_id, collection.id,
-                                                universe_session, project_session))
+        result.extend(_valid_term_in_collection(value, project_id, collection.id, universe_session, project_session))
     return result
 
 
@@ -431,8 +466,7 @@ def valid_term_in_project(value: str, project_id: str) -> list[MatchingTerm]:
     :rtype: list[MatchingTerm]
    :raises EsgvocNotFoundError: If the `project_id` is not found
     """
-    with get_universe_session() as universe_session,…
-            _get_project_session_with_exception(project_id) as project_session:
+    with get_universe_session() as universe_session, _get_project_session_with_exception(project_id) as project_session:
         return _valid_term_in_project(value, project_id, universe_session, project_session)
 
 
@@ -460,15 +494,13 @@ def valid_term_in_all_projects(value: str) -> list[MatchingTerm]:
     with get_universe_session() as universe_session:
         for project_id in get_all_projects():
             with _get_project_session_with_exception(project_id) as project_session:
-                result.extend(_valid_term_in_project(value, project_id,
-                                                     universe_session, project_session))
+                result.extend(_valid_term_in_project(value, project_id, universe_session, project_session))
     return result
 
 
-def get_all_terms_in_collection(project_id: str,
-…
-…
-        -> list[DataDescriptor]:
+def get_all_terms_in_collection(
+    project_id: str, collection_id: str, selected_term_fields: Iterable[str] | None = None
+) -> list[DataDescriptor]:
     """
     Gets all terms of the given collection of a project.
     This function performs an exact match on the `project_id` and `collection_id`,
@@ -495,7 +527,7 @@ def get_all_terms_in_collection(project_id: str,
     return result
 
 
-def _get_all_collections_in_project(session: Session) -> list[…
+def _get_all_collections_in_project(session: Session) -> list[PCollection]:
     project = session.get(Project, constants.SQLITE_FIRST_PK)
     # Project can't be missing if session exists.
     return project.collections  # type: ignore
@@ -522,15 +554,17 @@ def get_all_collections_in_project(project_id: str) -> list[str]:
     return result
 
 
-def _get_all_terms_in_collection(…
-…
+def _get_all_terms_in_collection(
+    collection: PCollection, selected_term_fields: Iterable[str] | None
+) -> list[DataDescriptor]:
     result: list[DataDescriptor] = list()
     instantiate_pydantic_terms(collection.terms, result, selected_term_fields)
     return result
 
 
-def get_all_terms_in_project(project_id: str,
-…
+def get_all_terms_in_project(
+    project_id: str, selected_term_fields: Iterable[str] | None = None
+) -> list[DataDescriptor]:
     """
     Gets all terms of the given project.
     This function performs an exact match on the `project_id` and
@@ -556,8 +590,9 @@ def get_all_terms_in_project(project_id: str,
     return result
 
 
-def get_all_terms_in_all_projects(…
-…
+def get_all_terms_in_all_projects(
+    selected_term_fields: Iterable[str] | None = None,
+) -> list[tuple[str, list[DataDescriptor]]]:
     """
     Gets all terms of all projects.
 
@@ -592,8 +627,9 @@ def _get_term_in_project(term_id: str, session: Session) -> PTerm | None:
     return result
 
 
-def get_term_in_project(project_id: str, term_id: str,
-…
+def get_term_in_project(
+    project_id: str, term_id: str, selected_term_fields: Iterable[str] | None = None
+) -> DataDescriptor | None:
     """
     Returns the first occurrence of the terms, in the given project, whose id corresponds exactly to
     the given term id.
@@ -623,15 +659,15 @@ def get_term_in_project(project_id: str, term_id: str,
 
 
 def _get_term_in_collection(collection_id: str, term_id: str, session: Session) -> PTerm | None:
-    statement = select(PTerm).join(…
-        PTerm.id == term_id)
+    statement = select(PTerm).join(PCollection).where(PCollection.id == collection_id, PTerm.id == term_id)
     results = session.exec(statement)
     result = results.one_or_none()
     return result
 
 
-def get_term_in_collection(project_id: str, collection_id: str, term_id: str,
-…
+def get_term_in_collection(
+    project_id: str, collection_id: str, term_id: str, selected_term_fields: Iterable[str] | None = None
+) -> DataDescriptor | None:
     """
     Returns the term, in the given project and collection,
     whose id corresponds exactly to the given term id.
@@ -661,8 +697,8 @@ def get_term_in_collection(project_id: str, collection_id: str, term_id: str,
     return result
 
 
-def _get_collection_in_project(collection_id: str, session: Session) ->…
-    statement = select(…
+def _get_collection_in_project(collection_id: str, session: Session) -> PCollection | None:
+    statement = select(PCollection).where(PCollection.id == collection_id)
     results = session.exec(statement)
     result = results.one_or_none()
     return result
@@ -718,16 +754,13 @@ def get_project(project_id: str) -> ProjectSpecs | None:
     return result
 
 
-def _get_collection_from_data_descriptor_in_project(data_descriptor_id: str,
-…
-    statement = select(Collection).where(Collection.data_descriptor_id == data_descriptor_id)
+def _get_collection_from_data_descriptor_in_project(data_descriptor_id: str, session: Session) -> PCollection | None:
+    statement = select(PCollection).where(PCollection.data_descriptor_id == data_descriptor_id)
     result = session.exec(statement).one_or_none()
     return result
 
 
-def get_collection_from_data_descriptor_in_project(project_id: str,
-                                                    data_descriptor_id: str) \
-        -> tuple[str, dict] | None:
+def get_collection_from_data_descriptor_in_project(project_id: str, data_descriptor_id: str) -> tuple[str, dict] | None:
     """
     Returns the collection, in the given project, that corresponds to the given data descriptor
     in the universe.
@@ -746,15 +779,13 @@ def get_collection_from_data_descriptor_in_project(project_id: str,
     result: tuple[str, dict] | None = None
     if connection := _get_project_connection(project_id):
         with connection.create_session() as session:
-            collection_found = _get_collection_from_data_descriptor_in_project(data_descriptor_id,
-                                                                               session)
+            collection_found = _get_collection_from_data_descriptor_in_project(data_descriptor_id, session)
             if collection_found:
                 result = collection_found.id, collection_found.context
     return result
 
 
-def get_collection_from_data_descriptor_in_all_projects(data_descriptor_id: str)…
-        -> list[tuple[str, str, dict]]:
+def get_collection_from_data_descriptor_in_all_projects(data_descriptor_id: str) -> list[tuple[str, str, dict]]:
     """
     Returns the collections, in all projects, that correspond to the given data descriptor
     in the universe.
@@ -773,28 +804,28 @@ def get_collection_from_data_descriptor_in_all_projects(data_descriptor_id: str)
     result = list()
     project_ids = get_all_projects()
     for project_id in project_ids:
-        collection_found = get_collection_from_data_descriptor_in_project(project_id,
-                                                                           data_descriptor_id)
+        collection_found = get_collection_from_data_descriptor_in_project(project_id, data_descriptor_id)
         if collection_found:
             result.append((project_id, collection_found[0], collection_found[1]))
     return result
 
 
-def _get_term_from_universe_term_id_in_project(…
-…
-…
-    statement =…
-…
+def _get_term_from_universe_term_id_in_project(
+    data_descriptor_id: str, universe_term_id: str, project_session: Session
+) -> PTerm | None:
+    statement = (
+        select(PTerm)
+        .join(PCollection)
+        .where(PCollection.data_descriptor_id == data_descriptor_id, PTerm.id == universe_term_id)
+    )
     results = project_session.exec(statement)
     result = results.one_or_none()
     return result
 
 
-def get_term_from_universe_term_id_in_project(project_id: str,
-…
-…
-                                              selected_term_fields: Iterable[str] | None = None) \
-        -> tuple[str, DataDescriptor] | None:
+def get_term_from_universe_term_id_in_project(
+    project_id: str, data_descriptor_id: str, universe_term_id: str, selected_term_fields: Iterable[str] | None = None
+) -> tuple[str, DataDescriptor] | None:
     """
     Returns the term, in the given project, that corresponds to the given term in the universe.
     This function performs an exact match on the `project_id`, `data_descriptor_id`
@@ -818,19 +849,16 @@ def get_term_from_universe_term_id_in_project(project_id: str,
     result: tuple[str, DataDescriptor] | None = None
     if connection := _get_project_connection(project_id):
         with connection.create_session() as session:
-            term_found = _get_term_from_universe_term_id_in_project(data_descriptor_id,
-                                                                    universe_term_id,
-                                                                    session)
+            term_found = _get_term_from_universe_term_id_in_project(data_descriptor_id, universe_term_id, session)
             if term_found:
                 pydantic_term = instantiate_pydantic_term(term_found, selected_term_fields)
                 result = (term_found.collection.id, pydantic_term)
     return result
 
 
-def get_term_from_universe_term_id_in_all_projects(data_descriptor_id: str,
-…
-…
-        -> list[tuple[str, str, DataDescriptor]]:
+def get_term_from_universe_term_id_in_all_projects(
+    data_descriptor_id: str, universe_term_id: str, selected_term_fields: Iterable[str] | None = None
+) -> list[tuple[str, str, DataDescriptor]]:
     """
     Returns the terms, in all projects, that correspond to the given term in the universe.
     This function performs an exact match on the `data_descriptor_id`
@@ -853,38 +881,37 @@ def get_term_from_universe_term_id_in_all_projects(data_descriptor_id: str,
     result: list[tuple[str, str, DataDescriptor]] = list()
     project_ids = get_all_projects()
     for project_id in project_ids:
-        term_found = get_term_from_universe_term_id_in_project(…
-…
-…
-                                                               selected_term_fields)
+        term_found = get_term_from_universe_term_id_in_project(
+            project_id, data_descriptor_id, universe_term_id, selected_term_fields
+        )
         if term_found:
             result.append((project_id, term_found[0], term_found[1]))
     return result
 
 
-def _find_collections_in_project(…
-…
-…
-                                 limit: int | None = None,
-                                 offset: int | None = None) -> Sequence[Collection]:
+def _find_collections_in_project(
+    expression: str, session: Session, only_id: bool = False, limit: int | None = None, offset: int | None = None
+) -> Sequence[PCollection]:
     matching_condition = generate_matching_condition(PCollectionFTS5, expression, only_id)
     tmp_statement = select(PCollectionFTS5).where(matching_condition)
-    statement = select(…
+    statement = select(PCollection).from_statement(handle_rank_limit_offset(tmp_statement, limit, offset))
     return execute_match_statement(expression, statement, session)
 
 
-def find_collections_in_project(expression: str, project_id: str,
-…
-…
-                                offset: int | None = None) -> list[tuple[str, dict]]:
+def find_collections_in_project(
+    expression: str, project_id: str, only_id: bool = False, limit: int | None = None, offset: int | None = None
+) -> list[tuple[str, dict]]:
     """
     Find collections in the given project based on a full text search defined by the given `expression`.
-    The `expression`…
-…
-    and…
-…
-…
-…
+    The `expression` can be composed of one or multiple keywords.
+    The keywords can combined with boolean operators: `AND`,
+    `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
+    if no boolean operators is provided, whitespaces are handled as if there were
+    an implicit AND operator between each pair of keywords. Note that this
+    function does not provide any priority operator (parenthesis).
+    Keywords can define prefixes when adding a `*` at the end of them.
+    If the expression is composed of only one keyword, the function
+    automatically defines it as a prefix.
     The function returns a list of collection ids and contexts, sorted according to the
     bm25 ranking metric (list index `0` has the highest rank).
     This function performs an exact match on the `project_id`,
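Note: the docstring added above describes the FTS5 expression syntax shared by all the `find_*` functions (implicit AND between keywords, case-sensitive `AND`/`OR`/`NOT`, a `*` suffix for prefixes, and a single keyword treated as a prefix). A hedged example of how such expressions might be passed to `find_collections_in_project`; the project id and keywords are illustrative only:

from esgvoc.api.projects import find_collections_in_project

# A single keyword is automatically treated as a prefix search.
print(find_collections_in_project("institution", "cmip6"))

# Whitespace implies AND; operators are case sensitive; '*' marks an explicit prefix.
print(find_collections_in_project("source NOT type", "cmip6"))
print(find_collections_in_project("experiment* OR activity*", "cmip6", only_id=True, limit=10))

Each call returns (collection_id, context) pairs sorted by the bm25 rank.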
@@ -915,52 +942,57 @@ def find_collections_in_project(expression: str, project_id: str,
     result: list[tuple[str, dict]] = list()
     if connection := _get_project_connection(project_id):
         with connection.create_session() as session:
-            collections_found = _find_collections_in_project(expression, session, only_id,
-                                                              limit, offset)
+            collections_found = _find_collections_in_project(expression, session, only_id, limit, offset)
             for collection in collections_found:
                 result.append((collection.id, collection.context))
     return result
 
 
-def _find_terms_in_collection(…
-…
-…
-…
-…
-…
+def _find_terms_in_collection(
+    expression: str,
+    collection_id: str,
+    session: Session,
+    only_id: bool = False,
+    limit: int | None = None,
+    offset: int | None = None,
+) -> Sequence[PTerm]:
     matching_condition = generate_matching_condition(PTermFTS5, expression, only_id)
-    where_condition =…
-    tmp_statement = select(PTermFTS5).join(…
+    where_condition = PCollection.id == collection_id, matching_condition
+    tmp_statement = select(PTermFTS5).join(PCollection).where(*where_condition)
     statement = select(PTerm).from_statement(handle_rank_limit_offset(tmp_statement, limit, offset))
     return execute_match_statement(expression, statement, session)
 
 
-def _find_terms_in_project(…
-…
-…
-                           limit: int | None = None,
-                           offset: int | None = None) -> Sequence[PTerm]:
+def _find_terms_in_project(
+    expression: str, session: Session, only_id: bool = False, limit: int | None = None, offset: int | None = None
+) -> Sequence[PTerm]:
     matching_condition = generate_matching_condition(PTermFTS5, expression, only_id)
     tmp_statement = select(PTermFTS5).where(matching_condition)
     statement = select(PTerm).from_statement(handle_rank_limit_offset(tmp_statement, limit, offset))
     return execute_match_statement(expression, statement, session)
 
 
-def find_terms_in_collection(expression: str, project_id: str,
-…
-…
-…
-…
-…
-…
+def find_terms_in_collection(
+    expression: str,
+    project_id: str,
+    collection_id: str,
+    only_id: bool = False,
+    limit: int | None = None,
+    offset: int | None = None,
+    selected_term_fields: Iterable[str] | None = None,
+) -> list[DataDescriptor]:
     """
     Find terms in the given project and collection based on a full text search defined by the given
-    `expression`.…
-…
-…
-…
-…
-…
+    `expression`.
+    The `expression` can be composed of one or multiple keywords.
+    The keywords can combined with boolean operators: `AND`,
+    `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
+    if no boolean operators is provided, whitespaces are handled as if there were
+    an implicit AND operator between each pair of keywords. Note that this
+    function does not provide any priority operator (parenthesis).
+    Keywords can define prefixes when adding a `*` at the end of them.
+    If the expression is composed of only one keyword, the function
+    automatically defines it as a prefix.
     The function returns a list of term instances, sorted according to the
     bm25 ranking metric (list index `0` has the highest rank).
     This function performs an exact match on the `project_id` and `collection_id`,
@@ -995,27 +1027,30 @@ def find_terms_in_collection(expression: str, project_id: str,
     result: list[DataDescriptor] = list()
     if connection := _get_project_connection(project_id):
         with connection.create_session() as session:
-            pterms_found = _find_terms_in_collection(expression, collection_id, session,
-                                                     only_id, limit, offset)
+            pterms_found = _find_terms_in_collection(expression, collection_id, session, only_id, limit, offset)
             instantiate_pydantic_terms(pterms_found, result, selected_term_fields)
     return result
 
 
-def find_terms_in_project(expression: str,
-…
-…
-…
-…
-…
-…
+def find_terms_in_project(
+    expression: str,
+    project_id: str,
+    only_id: bool = False,
+    limit: int | None = None,
+    offset: int | None = None,
+    selected_term_fields: Iterable[str] | None = None,
+) -> list[DataDescriptor]:
     """
-    Find terms in the given project on a full text search defined by the given…
-…
-…
-    and…
-…
-…
-…
+    Find terms in the given project based on a full text search defined by the given `expression`.
+    The `expression` can be composed of one or multiple keywords.
+    The keywords can combined with boolean operators: `AND`,
+    `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
+    if no boolean operators is provided, whitespaces are handled as if there were
+    an implicit AND operator between each pair of keywords. Note that this
+    function does not provide any priority operator (parenthesis).
+    Keywords can define prefixes when adding a `*` at the end of them.
+    If the expression is composed of only one keyword, the function
+    automatically defines it as a prefix.
     The function returns a list of term instances, sorted according to the
     bm25 ranking metric (list index `0` has the highest rank).
     This function performs an exact match on the `project_id`,
@@ -1053,20 +1088,24 @@ def find_terms_in_project(expression: str,
     return result
 
 
-def find_terms_in_all_projects(expression: str,
-…
-…
-…
-…
-…
+def find_terms_in_all_projects(
+    expression: str,
+    only_id: bool = False,
+    limit: int | None = None,
+    offset: int | None = None,
+    selected_term_fields: Iterable[str] | None = None,
+) -> list[tuple[str, list[DataDescriptor]]]:
     """
-    Find terms in…
-…
-…
-    and…
-…
-…
-…
+    Find terms in all projects based on a full text search defined by the given `expression`.
+    The `expression` can be composed of one or multiple keywords.
+    The keywords can combined with boolean operators: `AND`,
+    `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
+    if no boolean operators is provided, whitespaces are handled as if there were
+    an implicit AND operator between each pair of keywords. Note that this
+    function does not provide any priority operator (parenthesis).
+    Keywords can define prefixes when adding a `*` at the end of them.
+    If the expression is composed of only one keyword, the function
+    automatically defines it as a prefix.
     The function returns a list of project ids and term instances, sorted according to the
     bm25 ranking metric (list index `0` has the highest rank).
     If the provided `expression` does not hit any term, the function returns an empty list.
@@ -1094,26 +1133,27 @@ def find_terms_in_all_projects(expression: str,
     result: list[tuple[str, list[DataDescriptor]]] = list()
     project_ids = get_all_projects()
     for project_id in project_ids:
-        terms_found = find_terms_in_project(expression, project_id, only_id,
-                                            limit, offset, selected_term_fields)
+        terms_found = find_terms_in_project(expression, project_id, only_id, limit, offset, selected_term_fields)
         if terms_found:
             result.append((project_id, terms_found))
     return result
 
 
-def find_items_in_project(expression: str,
-…
-…
-                          limit: int | None = None,
-                          offset: int | None = None) -> list[Item]:
+def find_items_in_project(
+    expression: str, project_id: str, only_id: bool = False, limit: int | None = None, offset: int | None = None
+) -> list[Item]:
     """
     Find items, at the moment terms and collections, in the given project based on a full-text
-    search defined by the given `expression`.…
-…
-…
-…
-…
-…
+    search defined by the given `expression`.
+    The `expression` can be composed of one or multiple keywords.
+    The keywords can combined with boolean operators: `AND`,
+    `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
+    if no boolean operators is provided, whitespaces are handled as if there were
+    an implicit AND operator between each pair of keywords. Note that this
+    function does not provide any priority operator (parenthesis).
+    Keywords can define prefixes when adding a `*` at the end of them.
+    If the expression is composed of only one keyword, the function
+    automatically defines it as a prefix.
     The function returns a list of item instances sorted according to the
     bm25 ranking metric (list index `0` has the highest rank).
     This function performs an exact match on the `project_id`,
@@ -1143,23 +1183,24 @@ def find_items_in_project(expression: str,
     result = list()
     if connection := _get_project_connection(project_id):
         with connection.create_session() as session:
+            processed_expression = process_expression(expression)
             if only_id:
                 collection_column = col(PCollectionFTS5.id)
                 term_column = col(PTermFTS5.id)
             else:
                 collection_column = col(PCollectionFTS5.id)  # TODO: use specs when implemented!
                 term_column = col(PTermFTS5.specs)  # type: ignore
-            collection_where_condition = collection_column.match(…
-            collection_statement = select(…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
+            collection_where_condition = collection_column.match(processed_expression)
+            collection_statement = select(
+                PCollectionFTS5.id, text("'collection' AS TYPE"), text(f"'{project_id}' AS TYPE"), text("rank")
+            ).where(collection_where_condition)
+            term_where_condition = term_column.match(processed_expression)
+            term_statement = (
+                select(PTermFTS5.id, text("'term' AS TYPE"), PCollection.id, text("rank"))
+                .join(PCollection)
+                .where(term_where_condition)
+            )
+            result = execute_find_item_statements(
+                session, processed_expression, collection_statement, term_statement, limit, offset
+            )
     return result
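Note: `find_items_in_project` now runs `process_expression` once, builds one ranked FTS5 statement for collections and one for terms, and merges them through `execute_find_item_statements`. A short usage sketch; the project id and keyword are placeholders, and the exact attributes exposed by `Item` are an assumption based on the columns selected above (id, kind, parent, rank):

from esgvoc.api.projects import find_items_in_project

# Hypothetical project id and keyword; collections and terms can both match.
items = find_items_in_project("label*", "cmip6", limit=10)
for item in items:
    # Each Item is built from the (id, 'collection'/'term', parent, rank) columns
    # selected in the statements above; exact attribute names may differ.
    print(item)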