esgvoc 0.4.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of esgvoc might be problematic. Click here for more details.

Files changed (74) hide show
  1. esgvoc/__init__.py +1 -1
  2. esgvoc/api/data_descriptors/__init__.py +52 -28
  3. esgvoc/api/data_descriptors/activity.py +3 -3
  4. esgvoc/api/data_descriptors/area_label.py +16 -1
  5. esgvoc/api/data_descriptors/branded_suffix.py +20 -0
  6. esgvoc/api/data_descriptors/branded_variable.py +12 -0
  7. esgvoc/api/data_descriptors/consortium.py +14 -13
  8. esgvoc/api/data_descriptors/contact.py +5 -0
  9. esgvoc/api/data_descriptors/conventions.py +6 -0
  10. esgvoc/api/data_descriptors/creation_date.py +5 -0
  11. esgvoc/api/data_descriptors/data_descriptor.py +14 -9
  12. esgvoc/api/data_descriptors/data_specs_version.py +5 -0
  13. esgvoc/api/data_descriptors/date.py +1 -1
  14. esgvoc/api/data_descriptors/directory_date.py +1 -1
  15. esgvoc/api/data_descriptors/experiment.py +13 -11
  16. esgvoc/api/data_descriptors/forcing_index.py +1 -1
  17. esgvoc/api/data_descriptors/frequency.py +3 -3
  18. esgvoc/api/data_descriptors/further_info_url.py +5 -0
  19. esgvoc/api/data_descriptors/grid_label.py +2 -2
  20. esgvoc/api/data_descriptors/horizontal_label.py +15 -1
  21. esgvoc/api/data_descriptors/initialisation_index.py +1 -1
  22. esgvoc/api/data_descriptors/institution.py +8 -5
  23. esgvoc/api/data_descriptors/known_branded_variable.py +23 -0
  24. esgvoc/api/data_descriptors/license.py +3 -3
  25. esgvoc/api/data_descriptors/member_id.py +9 -0
  26. esgvoc/api/data_descriptors/mip_era.py +1 -1
  27. esgvoc/api/data_descriptors/model_component.py +1 -1
  28. esgvoc/api/data_descriptors/obs_type.py +5 -0
  29. esgvoc/api/data_descriptors/organisation.py +1 -1
  30. esgvoc/api/data_descriptors/physic_index.py +1 -1
  31. esgvoc/api/data_descriptors/product.py +2 -2
  32. esgvoc/api/data_descriptors/publication_status.py +5 -0
  33. esgvoc/api/data_descriptors/realisation_index.py +1 -1
  34. esgvoc/api/data_descriptors/realm.py +1 -1
  35. esgvoc/api/data_descriptors/region.py +5 -0
  36. esgvoc/api/data_descriptors/resolution.py +3 -3
  37. esgvoc/api/data_descriptors/source.py +9 -5
  38. esgvoc/api/data_descriptors/source_type.py +1 -1
  39. esgvoc/api/data_descriptors/table.py +3 -2
  40. esgvoc/api/data_descriptors/temporal_label.py +15 -1
  41. esgvoc/api/data_descriptors/time_range.py +4 -3
  42. esgvoc/api/data_descriptors/title.py +5 -0
  43. esgvoc/api/data_descriptors/tracking_id.py +5 -0
  44. esgvoc/api/data_descriptors/variable.py +25 -12
  45. esgvoc/api/data_descriptors/variant_label.py +3 -3
  46. esgvoc/api/data_descriptors/vertical_label.py +14 -0
  47. esgvoc/api/project_specs.py +117 -2
  48. esgvoc/api/projects.py +328 -287
  49. esgvoc/api/search.py +30 -3
  50. esgvoc/api/universe.py +42 -27
  51. esgvoc/apps/drs/generator.py +87 -74
  52. esgvoc/apps/jsg/cmip6_template.json +74 -0
  53. esgvoc/apps/jsg/json_schema_generator.py +194 -0
  54. esgvoc/cli/config.py +500 -0
  55. esgvoc/cli/find.py +138 -0
  56. esgvoc/cli/get.py +43 -38
  57. esgvoc/cli/main.py +10 -3
  58. esgvoc/cli/status.py +27 -18
  59. esgvoc/cli/valid.py +10 -15
  60. esgvoc/core/db/models/project.py +11 -11
  61. esgvoc/core/db/models/universe.py +3 -3
  62. esgvoc/core/db/project_ingestion.py +40 -40
  63. esgvoc/core/db/universe_ingestion.py +36 -33
  64. esgvoc/core/logging_handler.py +24 -2
  65. esgvoc/core/repo_fetcher.py +61 -59
  66. esgvoc/core/service/data_merger.py +47 -34
  67. esgvoc/core/service/state.py +107 -83
  68. {esgvoc-0.4.0.dist-info → esgvoc-1.0.1.dist-info}/METADATA +5 -20
  69. esgvoc-1.0.1.dist-info/RECORD +95 -0
  70. esgvoc/core/logging.conf +0 -21
  71. esgvoc-0.4.0.dist-info/RECORD +0 -80
  72. {esgvoc-0.4.0.dist-info → esgvoc-1.0.1.dist-info}/WHEEL +0 -0
  73. {esgvoc-0.4.0.dist-info → esgvoc-1.0.1.dist-info}/entry_points.txt +0 -0
  74. {esgvoc-0.4.0.dist-info → esgvoc-1.0.1.dist-info}/licenses/LICENSE.txt +0 -0
esgvoc/api/projects.py CHANGED
@@ -1,3 +1,4 @@
1
+ import itertools
1
2
  import re
2
3
  from typing import Iterable, Sequence
3
4
 
@@ -20,16 +21,11 @@ from esgvoc.api.search import (
20
21
  handle_rank_limit_offset,
21
22
  instantiate_pydantic_term,
22
23
  instantiate_pydantic_terms,
24
+ process_expression,
23
25
  )
24
26
  from esgvoc.core.db.connection import DBConnection
25
27
  from esgvoc.core.db.models.mixins import TermKind
26
- from esgvoc.core.db.models.project import (
27
- Collection,
28
- PCollectionFTS5,
29
- Project,
30
- PTerm,
31
- PTermFTS5,
32
- )
28
+ from esgvoc.core.db.models.project import PCollection, PCollectionFTS5, Project, PTerm, PTermFTS5
33
29
  from esgvoc.core.db.models.universe import UTerm
34
30
  from esgvoc.core.exceptions import EsgvocDbError, EsgvocNotFoundError, EsgvocNotImplementedError, EsgvocValueError
35
31
 
@@ -53,21 +49,17 @@ def _get_project_session_with_exception(project_id: str) -> Session:
53
49
  raise EsgvocNotFoundError(f"unable to find project '{project_id}'")
54
50
 
55
51
 
56
- def _resolve_term(composite_term_part: dict,
57
- universe_session: Session,
58
- project_session: Session) -> UTerm | PTerm:
52
+ def _resolve_term(composite_term_part: dict, universe_session: Session, project_session: Session) -> UTerm | PTerm:
59
53
  # First look for the term in the universe, then in the current project
60
54
  term_id = composite_term_part[constants.TERM_ID_JSON_KEY]
61
55
  term_type = composite_term_part[constants.TERM_TYPE_JSON_KEY]
62
- uterm = universe._get_term_in_data_descriptor(data_descriptor_id=term_type,
63
- term_id=term_id,
64
- session=universe_session)
56
+ uterm = universe._get_term_in_data_descriptor(
57
+ data_descriptor_id=term_type, term_id=term_id, session=universe_session
58
+ )
65
59
  if uterm:
66
60
  return uterm
67
61
  else:
68
- pterm = _get_term_in_collection(collection_id=term_type,
69
- term_id=term_id,
70
- session=project_session)
62
+ pterm = _get_term_in_collection(collection_id=term_type, term_id=term_id, session=project_session)
71
63
  if pterm:
72
64
  return pterm
73
65
  else:
@@ -81,13 +73,88 @@ def _get_composite_term_separator_parts(term: UTerm | PTerm) -> tuple[str, list]
81
73
  return separator, parts
82
74
 
83
75
 
76
+ def _valid_value_composite_term_with_separator(
77
+ value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
78
+ ) -> list[UniverseTermError | ProjectTermError]:
79
+ result = []
80
+ separator, parts = _get_composite_term_separator_parts(term)
81
+ required_indices = {i for i, p in enumerate(parts) if p.get("is_required", False)}
82
+
83
+ splits = value.split(separator)
84
+ nb_splits = len(splits)
85
+ nb_parts = len(parts)
86
+
87
+ if nb_splits > nb_parts:
88
+ return [_create_term_error(value, term)]
89
+
90
+ # Generate all possible assignments of split values into parts
91
+ # Only keep those that include all required parts
92
+ all_positions = [i for i in range(nb_parts)]
93
+ valid_combinations = [
94
+ comb for comb in itertools.combinations(all_positions, nb_splits) if required_indices.issubset(comb)
95
+ ]
96
+
97
+ for positions in valid_combinations:
98
+ candidate = [None] * nb_parts
99
+ for idx, pos in enumerate(positions):
100
+ candidate[pos] = splits[idx]
101
+
102
+ # Separator structure validation:
103
+ # - No leading separator if the first part is None
104
+ # - No trailing separator if the last part is None
105
+ # - No double separators where two adjacent optional parts are missing
106
+ if candidate[0] is None and value.startswith(separator):
107
+ continue
108
+ if candidate[-1] is None and value.endswith(separator):
109
+ continue
110
+ if any(
111
+ candidate[i] is None and candidate[i + 1] is None and separator * 2 in value for i in range(nb_parts - 1)
112
+ ):
113
+ continue # invalid double separator between two missing parts
114
+
115
+ # Validate each filled part value
116
+ all_valid = True
117
+ for i, given_value in enumerate(candidate):
118
+ if given_value is None:
119
+ if parts[i].get("is_required", False):
120
+ all_valid = False
121
+ break
122
+ continue # optional and missing part is allowed
123
+
124
+ part = parts[i]
125
+
126
+ # Resolve term ID list if not present
127
+ if "id" not in part:
128
+ terms = universe.get_all_terms_in_data_descriptor(part["type"], None)
129
+ part["id"] = [term.id for term in terms]
130
+ if isinstance(part["id"], str):
131
+ part["id"] = [part["id"]]
132
+
133
+ # Try all possible term IDs to find a valid match
134
+ valid_for_this_part = False
135
+ for id in part["id"]:
136
+ part_copy = dict(part)
137
+ part_copy["id"] = id
138
+ resolved_term = _resolve_term(part_copy, universe_session, project_session)
139
+ errors = _valid_value(given_value, resolved_term, universe_session, project_session)
140
+ if not errors:
141
+ valid_for_this_part = True
142
+ break
143
+ if not valid_for_this_part:
144
+ all_valid = False
145
+ break
146
+
147
+ if all_valid:
148
+ return [] # At least one valid combination found
149
+
150
+ return [_create_term_error(value, term)] # No valid combination found
151
+
152
+
84
153
  # TODO: support optionality of parts of composite.
85
154
  # Backtracking may be possible when more than one part is missing.
86
- def _valid_value_composite_term_with_separator(value: str,
87
- term: UTerm | PTerm,
88
- universe_session: Session,
89
- project_session: Session)\
90
- -> list[UniverseTermError | ProjectTermError]:
155
+ def _valid_value_composite_term_with_separator2(
156
+ value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
157
+ ) -> list[UniverseTermError | ProjectTermError]:
91
158
  result = list()
92
159
  separator, parts = _get_composite_term_separator_parts(term)
93
160
  if separator in value:
@@ -95,14 +162,25 @@ def _valid_value_composite_term_with_separator(value: str,
95
162
  if len(splits) == len(parts):
96
163
  for index in range(0, len(splits)):
97
164
  given_value = splits[index]
98
- resolved_term = _resolve_term(parts[index],
99
- universe_session,
100
- project_session)
101
- errors = _valid_value(given_value,
102
- resolved_term,
103
- universe_session,
104
- project_session)
105
- result.extend(errors)
165
+ if "id" not in parts[index].keys():
166
+ terms = universe.get_all_terms_in_data_descriptor(parts[index]["type"], None)
167
+ parts[index]["id"] = [term.id for term in terms]
168
+ if type(parts[index]["id"]) is str:
169
+ parts[index]["id"] = [parts[index]["id"]]
170
+
171
+ errors_list = list()
172
+ for id in parts[index]["id"]:
173
+ part_parts = dict(parts[index])
174
+ part_parts["id"] = id
175
+ resolved_term = _resolve_term(part_parts, universe_session, project_session)
176
+ errors = _valid_value(given_value, resolved_term, universe_session, project_session)
177
+ if len(errors) == 0:
178
+ errors_list = errors
179
+ break
180
+ else:
181
+ errors_list.extend(errors)
182
+ else:
183
+ result.append(_create_term_error(value, term))
106
184
  else:
107
185
  result.append(_create_term_error(value, term))
108
186
  else:
@@ -110,16 +188,13 @@ def _valid_value_composite_term_with_separator(value: str,
110
188
  return result
111
189
 
112
190
 
113
- def _transform_to_pattern(term: UTerm | PTerm,
114
- universe_session: Session,
115
- project_session: Session) -> str:
191
+ def _transform_to_pattern(term: UTerm | PTerm, universe_session: Session, project_session: Session) -> str:
116
192
  match term.kind:
117
193
  case TermKind.PLAIN:
118
194
  if constants.DRS_SPECS_JSON_KEY in term.specs:
119
195
  result = term.specs[constants.DRS_SPECS_JSON_KEY]
120
196
  else:
121
- raise EsgvocValueError(f"the term '{term.id}' doesn't have drs name. " +
122
- "Can't validate it.")
197
+ raise EsgvocValueError(f"the term '{term.id}' doesn't have drs name. " + "Can't validate it.")
123
198
  case TermKind.PATTERN:
124
199
  result = term.specs[constants.PATTERN_JSON_KEY]
125
200
  case TermKind.COMPOSITE:
@@ -128,7 +203,7 @@ def _transform_to_pattern(term: UTerm | PTerm,
128
203
  for part in parts:
129
204
  resolved_term = _resolve_term(part, universe_session, project_session)
130
205
  pattern = _transform_to_pattern(resolved_term, universe_session, project_session)
131
- result = f'{result}{pattern}{separator}'
206
+ result = f"{result}{pattern}{separator}"
132
207
  result = result.rstrip(separator)
133
208
  case _:
134
209
  raise EsgvocDbError(f"unsupported term kind '{term.kind}'")
@@ -137,11 +212,9 @@ def _transform_to_pattern(term: UTerm | PTerm,
137
212
 
138
213
  # TODO: support optionality of parts of composite.
139
214
  # Backtracking may be possible when more than one part is missing.
140
- def _valid_value_composite_term_separator_less(value: str,
141
- term: UTerm | PTerm,
142
- universe_session: Session,
143
- project_session: Session)\
144
- -> list[UniverseTermError | ProjectTermError]:
215
+ def _valid_value_composite_term_separator_less(
216
+ value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
217
+ ) -> list[UniverseTermError | ProjectTermError]:
145
218
  result = list()
146
219
  try:
147
220
  pattern = _transform_to_pattern(term, universe_session, project_session)
@@ -150,8 +223,8 @@ def _valid_value_composite_term_separator_less(value: str,
150
223
  # So their regex are defined as a whole (begins by a ^, ends by a $).
151
224
  # As the pattern is a concatenation of plain or regex, multiple ^ and $ can exist.
152
225
  # The latter must be removed.
153
- pattern = pattern.replace('^', '').replace('$', '')
154
- pattern = f'^{pattern}$'
226
+ pattern = pattern.replace("^", "").replace("$", "")
227
+ pattern = f"^{pattern}$"
155
228
  regex = re.compile(pattern)
156
229
  except Exception as e:
157
230
  msg = f"regex compilation error while processing term '{term.id}'':\n{e}"
@@ -165,35 +238,30 @@ def _valid_value_composite_term_separator_less(value: str,
165
238
  raise EsgvocNotImplementedError(msg) from e
166
239
 
167
240
 
168
- def _valid_value_for_composite_term(value: str,
169
- term: UTerm | PTerm,
170
- universe_session: Session,
171
- project_session: Session)\
172
- -> list[UniverseTermError | ProjectTermError]:
241
+ def _valid_value_for_composite_term(
242
+ value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
243
+ ) -> list[UniverseTermError | ProjectTermError]:
173
244
  result = list()
174
245
  separator, _ = _get_composite_term_separator_parts(term)
175
246
  if separator:
176
- result = _valid_value_composite_term_with_separator(value, term, universe_session,
177
- project_session)
247
+ result = _valid_value_composite_term_with_separator(value, term, universe_session, project_session)
178
248
  else:
179
- result = _valid_value_composite_term_separator_less(value, term, universe_session,
180
- project_session)
249
+ result = _valid_value_composite_term_separator_less(value, term, universe_session, project_session)
181
250
  return result
182
251
 
183
252
 
184
253
  def _create_term_error(value: str, term: UTerm | PTerm) -> UniverseTermError | ProjectTermError:
185
254
  if isinstance(term, UTerm):
186
- return UniverseTermError(value=value, term=term.specs, term_kind=term.kind,
187
- data_descriptor_id=term.data_descriptor.id)
255
+ return UniverseTermError(
256
+ value=value, term=term.specs, term_kind=term.kind, data_descriptor_id=term.data_descriptor.id
257
+ )
188
258
  else:
189
- return ProjectTermError(value=value, term=term.specs, term_kind=term.kind,
190
- collection_id=term.collection.id)
259
+ return ProjectTermError(value=value, term=term.specs, term_kind=term.kind, collection_id=term.collection.id)
191
260
 
192
261
 
193
- def _valid_value(value: str,
194
- term: UTerm | PTerm,
195
- universe_session: Session,
196
- project_session: Session) -> list[UniverseTermError | ProjectTermError]:
262
+ def _valid_value(
263
+ value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
264
+ ) -> list[UniverseTermError | ProjectTermError]:
197
265
  result = list()
198
266
  match term.kind:
199
267
  case TermKind.PLAIN:
@@ -201,17 +269,14 @@ def _valid_value(value: str,
201
269
  if term.specs[constants.DRS_SPECS_JSON_KEY] != value:
202
270
  result.append(_create_term_error(value, term))
203
271
  else:
204
- raise EsgvocValueError(f"the term '{term.id}' doesn't have drs name. " +
205
- "Can't validate it.")
272
+ raise EsgvocValueError(f"the term '{term.id}' doesn't have drs name. " + "Can't validate it.")
206
273
  case TermKind.PATTERN:
207
274
  # TODO: Pattern can be compiled and stored for further matching.
208
275
  pattern_match = re.match(term.specs[constants.PATTERN_JSON_KEY], value)
209
276
  if pattern_match is None:
210
277
  result.append(_create_term_error(value, term))
211
278
  case TermKind.COMPOSITE:
212
- result.extend(_valid_value_for_composite_term(value, term,
213
- universe_session,
214
- project_session))
279
+ result.extend(_valid_value_for_composite_term(value, term, universe_session, project_session))
215
280
  case _:
216
281
  raise EsgvocDbError(f"unsupported term kind '{term.kind}'")
217
282
  return result
@@ -219,33 +284,25 @@ def _valid_value(value: str,
219
284
 
220
285
  def _check_value(value: str) -> str:
221
286
  if not value or value.isspace():
222
- raise EsgvocValueError('value should be set')
287
+ raise EsgvocValueError("value should be set")
223
288
  else:
224
289
  return value
225
290
 
226
291
 
227
- def _search_plain_term_and_valid_value(value: str,
228
- collection_id: str,
229
- project_session: Session) \
230
- -> str | None:
231
- where_expression = and_(Collection.id == collection_id,
232
- PTerm.specs[constants.DRS_SPECS_JSON_KEY] == f'"{value}"')
233
- statement = select(PTerm).join(Collection).where(where_expression)
292
+ def _search_plain_term_and_valid_value(value: str, collection_id: str, project_session: Session) -> str | None:
293
+ where_expression = and_(PCollection.id == collection_id, PTerm.specs[constants.DRS_SPECS_JSON_KEY] == f'"{value}"')
294
+ statement = select(PTerm).join(PCollection).where(where_expression)
234
295
  term = project_session.exec(statement).one_or_none()
235
296
  return term.id if term else None
236
297
 
237
298
 
238
- def _valid_value_against_all_terms_of_collection(value: str,
239
- collection: Collection,
240
- universe_session: Session,
241
- project_session: Session) \
242
- -> list[str]:
299
+ def _valid_value_against_all_terms_of_collection(
300
+ value: str, collection: PCollection, universe_session: Session, project_session: Session
301
+ ) -> list[str]:
243
302
  if collection.terms:
244
303
  result = list()
245
304
  for pterm in collection.terms:
246
- _errors = _valid_value(value, pterm,
247
- universe_session,
248
- project_session)
305
+ _errors = _valid_value(value, pterm, universe_session, project_session)
249
306
  if not _errors:
250
307
  result.append(pterm.id)
251
308
  return result
@@ -253,35 +310,24 @@ def _valid_value_against_all_terms_of_collection(value: str,
253
310
  raise EsgvocDbError(f"collection '{collection.id}' has no term")
254
311
 
255
312
 
256
- def _valid_value_against_given_term(value: str,
257
- project_id: str,
258
- collection_id: str,
259
- term_id: str,
260
- universe_session: Session,
261
- project_session: Session)\
262
- -> list[UniverseTermError | ProjectTermError]:
313
+ def _valid_value_against_given_term(
314
+ value: str, project_id: str, collection_id: str, term_id: str, universe_session: Session, project_session: Session
315
+ ) -> list[UniverseTermError | ProjectTermError]:
263
316
  # [OPTIMIZATION]
264
317
  key = value + project_id + collection_id + term_id
265
318
  if key in _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE:
266
319
  result = _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE[key]
267
320
  else:
268
- term = _get_term_in_collection(collection_id,
269
- term_id,
270
- project_session)
321
+ term = _get_term_in_collection(collection_id, term_id, project_session)
271
322
  if term:
272
323
  result = _valid_value(value, term, universe_session, project_session)
273
324
  else:
274
- raise EsgvocNotFoundError(f"unable to find term '{term_id}' " +
275
- f"in collection '{collection_id}'")
325
+ raise EsgvocNotFoundError(f"unable to find term '{term_id}' " + f"in collection '{collection_id}'")
276
326
  _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE[key] = result
277
327
  return result
278
328
 
279
329
 
280
- def valid_term(value: str,
281
- project_id: str,
282
- collection_id: str,
283
- term_id: str) \
284
- -> ValidationReport:
330
+ def valid_term(value: str, project_id: str, collection_id: str, term_id: str) -> ValidationReport:
285
331
  """
286
332
  Check if the given value may or may not represent the given term. The functions returns
287
333
  a report that contains the possible errors.
@@ -312,19 +358,16 @@ def valid_term(value: str,
312
358
  :raises EsgvocNotFoundError: If any of the provided ids is not found
313
359
  """
314
360
  value = _check_value(value)
315
- with get_universe_session() as universe_session, \
316
- _get_project_session_with_exception(project_id) as project_session:
317
- errors = _valid_value_against_given_term(value, project_id, collection_id, term_id,
318
- universe_session, project_session)
361
+ with get_universe_session() as universe_session, _get_project_session_with_exception(project_id) as project_session:
362
+ errors = _valid_value_against_given_term(
363
+ value, project_id, collection_id, term_id, universe_session, project_session
364
+ )
319
365
  return ValidationReport(expression=value, errors=errors)
320
366
 
321
367
 
322
- def _valid_term_in_collection(value: str,
323
- project_id: str,
324
- collection_id: str,
325
- universe_session: Session,
326
- project_session: Session) \
327
- -> list[MatchingTerm]:
368
+ def _valid_term_in_collection(
369
+ value: str, project_id: str, collection_id: str, universe_session: Session, project_session: Session
370
+ ) -> list[MatchingTerm]:
328
371
  # [OPTIMIZATION]
329
372
  key = value + project_id + collection_id
330
373
  if key in _VALID_TERM_IN_COLLECTION_CACHE:
@@ -336,20 +379,19 @@ def _valid_term_in_collection(value: str,
336
379
  if collection:
337
380
  match collection.term_kind:
338
381
  case TermKind.PLAIN:
339
- term_id_found = _search_plain_term_and_valid_value(value, collection_id,
340
- project_session)
382
+ term_id_found = _search_plain_term_and_valid_value(value, collection_id, project_session)
341
383
  if term_id_found:
342
- result.append(MatchingTerm(project_id=project_id,
343
- collection_id=collection_id,
344
- term_id=term_id_found))
384
+ result.append(
385
+ MatchingTerm(project_id=project_id, collection_id=collection_id, term_id=term_id_found)
386
+ )
345
387
  case _:
346
- term_ids_found = _valid_value_against_all_terms_of_collection(value, collection,
347
- universe_session,
348
- project_session)
388
+ term_ids_found = _valid_value_against_all_terms_of_collection(
389
+ value, collection, universe_session, project_session
390
+ )
349
391
  for term_id_found in term_ids_found:
350
- result.append(MatchingTerm(project_id=project_id,
351
- collection_id=collection_id,
352
- term_id=term_id_found))
392
+ result.append(
393
+ MatchingTerm(project_id=project_id, collection_id=collection_id, term_id=term_id_found)
394
+ )
353
395
  else:
354
396
  msg = f"unable to find collection '{collection_id}'"
355
397
  raise EsgvocNotFoundError(msg)
@@ -357,10 +399,7 @@ def _valid_term_in_collection(value: str,
357
399
  return result
358
400
 
359
401
 
360
- def valid_term_in_collection(value: str,
361
- project_id: str,
362
- collection_id: str) \
363
- -> list[MatchingTerm]:
402
+ def valid_term_in_collection(value: str, project_id: str, collection_id: str) -> list[MatchingTerm]:
364
403
  """
365
404
  Check if the given value may or may not represent a term in the given collection. The function
366
405
  returns the terms that the value matches.
@@ -388,21 +427,17 @@ def valid_term_in_collection(value: str,
388
427
  :rtype: list[MatchingTerm]
389
428
  :raises EsgvocNotFoundError: If any of the provided ids is not found
390
429
  """
391
- with get_universe_session() as universe_session, \
392
- _get_project_session_with_exception(project_id) as project_session:
393
- return _valid_term_in_collection(value, project_id, collection_id,
394
- universe_session, project_session)
430
+ with get_universe_session() as universe_session, _get_project_session_with_exception(project_id) as project_session:
431
+ return _valid_term_in_collection(value, project_id, collection_id, universe_session, project_session)
395
432
 
396
433
 
397
- def _valid_term_in_project(value: str,
398
- project_id: str,
399
- universe_session: Session,
400
- project_session: Session) -> list[MatchingTerm]:
434
+ def _valid_term_in_project(
435
+ value: str, project_id: str, universe_session: Session, project_session: Session
436
+ ) -> list[MatchingTerm]:
401
437
  result = list()
402
438
  collections = _get_all_collections_in_project(project_session)
403
439
  for collection in collections:
404
- result.extend(_valid_term_in_collection(value, project_id, collection.id,
405
- universe_session, project_session))
440
+ result.extend(_valid_term_in_collection(value, project_id, collection.id, universe_session, project_session))
406
441
  return result
407
442
 
408
443
 
@@ -431,8 +466,7 @@ def valid_term_in_project(value: str, project_id: str) -> list[MatchingTerm]:
431
466
  :rtype: list[MatchingTerm]
432
467
  :raises EsgvocNotFoundError: If the `project_id` is not found
433
468
  """
434
- with get_universe_session() as universe_session, \
435
- _get_project_session_with_exception(project_id) as project_session:
469
+ with get_universe_session() as universe_session, _get_project_session_with_exception(project_id) as project_session:
436
470
  return _valid_term_in_project(value, project_id, universe_session, project_session)
437
471
 
438
472
 
@@ -460,15 +494,13 @@ def valid_term_in_all_projects(value: str) -> list[MatchingTerm]:
460
494
  with get_universe_session() as universe_session:
461
495
  for project_id in get_all_projects():
462
496
  with _get_project_session_with_exception(project_id) as project_session:
463
- result.extend(_valid_term_in_project(value, project_id,
464
- universe_session, project_session))
497
+ result.extend(_valid_term_in_project(value, project_id, universe_session, project_session))
465
498
  return result
466
499
 
467
500
 
468
- def get_all_terms_in_collection(project_id: str,
469
- collection_id: str,
470
- selected_term_fields: Iterable[str] | None = None)\
471
- -> list[DataDescriptor]:
501
+ def get_all_terms_in_collection(
502
+ project_id: str, collection_id: str, selected_term_fields: Iterable[str] | None = None
503
+ ) -> list[DataDescriptor]:
472
504
  """
473
505
  Gets all terms of the given collection of a project.
474
506
  This function performs an exact match on the `project_id` and `collection_id`,
@@ -495,7 +527,7 @@ def get_all_terms_in_collection(project_id: str,
495
527
  return result
496
528
 
497
529
 
498
- def _get_all_collections_in_project(session: Session) -> list[Collection]:
530
+ def _get_all_collections_in_project(session: Session) -> list[PCollection]:
499
531
  project = session.get(Project, constants.SQLITE_FIRST_PK)
500
532
  # Project can't be missing if session exists.
501
533
  return project.collections # type: ignore
@@ -522,15 +554,17 @@ def get_all_collections_in_project(project_id: str) -> list[str]:
522
554
  return result
523
555
 
524
556
 
525
- def _get_all_terms_in_collection(collection: Collection,
526
- selected_term_fields: Iterable[str] | None) -> list[DataDescriptor]:
557
+ def _get_all_terms_in_collection(
558
+ collection: PCollection, selected_term_fields: Iterable[str] | None
559
+ ) -> list[DataDescriptor]:
527
560
  result: list[DataDescriptor] = list()
528
561
  instantiate_pydantic_terms(collection.terms, result, selected_term_fields)
529
562
  return result
530
563
 
531
564
 
532
- def get_all_terms_in_project(project_id: str,
533
- selected_term_fields: Iterable[str] | None = None) -> list[DataDescriptor]:
565
+ def get_all_terms_in_project(
566
+ project_id: str, selected_term_fields: Iterable[str] | None = None
567
+ ) -> list[DataDescriptor]:
534
568
  """
535
569
  Gets all terms of the given project.
536
570
  This function performs an exact match on the `project_id` and
@@ -556,8 +590,9 @@ def get_all_terms_in_project(project_id: str,
556
590
  return result
557
591
 
558
592
 
559
- def get_all_terms_in_all_projects(selected_term_fields: Iterable[str] | None = None) \
560
- -> list[tuple[str, list[DataDescriptor]]]:
593
+ def get_all_terms_in_all_projects(
594
+ selected_term_fields: Iterable[str] | None = None,
595
+ ) -> list[tuple[str, list[DataDescriptor]]]:
561
596
  """
562
597
  Gets all terms of all projects.
563
598
 
@@ -592,8 +627,9 @@ def _get_term_in_project(term_id: str, session: Session) -> PTerm | None:
592
627
  return result
593
628
 
594
629
 
595
- def get_term_in_project(project_id: str, term_id: str,
596
- selected_term_fields: Iterable[str] | None = None) -> DataDescriptor | None:
630
+ def get_term_in_project(
631
+ project_id: str, term_id: str, selected_term_fields: Iterable[str] | None = None
632
+ ) -> DataDescriptor | None:
597
633
  """
598
634
  Returns the first occurrence of the terms, in the given project, whose id corresponds exactly to
599
635
  the given term id.
@@ -623,15 +659,15 @@ def get_term_in_project(project_id: str, term_id: str,
623
659
 
624
660
 
625
661
  def _get_term_in_collection(collection_id: str, term_id: str, session: Session) -> PTerm | None:
626
- statement = select(PTerm).join(Collection).where(Collection.id == collection_id,
627
- PTerm.id == term_id)
662
+ statement = select(PTerm).join(PCollection).where(PCollection.id == collection_id, PTerm.id == term_id)
628
663
  results = session.exec(statement)
629
664
  result = results.one_or_none()
630
665
  return result
631
666
 
632
667
 
633
- def get_term_in_collection(project_id: str, collection_id: str, term_id: str,
634
- selected_term_fields: Iterable[str] | None = None) -> DataDescriptor | None:
668
+ def get_term_in_collection(
669
+ project_id: str, collection_id: str, term_id: str, selected_term_fields: Iterable[str] | None = None
670
+ ) -> DataDescriptor | None:
635
671
  """
636
672
  Returns the term, in the given project and collection,
637
673
  whose id corresponds exactly to the given term id.
@@ -661,8 +697,8 @@ def get_term_in_collection(project_id: str, collection_id: str, term_id: str,
661
697
  return result
662
698
 
663
699
 
664
- def _get_collection_in_project(collection_id: str, session: Session) -> Collection | None:
665
- statement = select(Collection).where(Collection.id == collection_id)
700
+ def _get_collection_in_project(collection_id: str, session: Session) -> PCollection | None:
701
+ statement = select(PCollection).where(PCollection.id == collection_id)
666
702
  results = session.exec(statement)
667
703
  result = results.one_or_none()
668
704
  return result
@@ -718,16 +754,13 @@ def get_project(project_id: str) -> ProjectSpecs | None:
718
754
  return result
719
755
 
720
756
 
721
- def _get_collection_from_data_descriptor_in_project(data_descriptor_id: str,
722
- session: Session) -> Collection | None:
723
- statement = select(Collection).where(Collection.data_descriptor_id == data_descriptor_id)
757
+ def _get_collection_from_data_descriptor_in_project(data_descriptor_id: str, session: Session) -> PCollection | None:
758
+ statement = select(PCollection).where(PCollection.data_descriptor_id == data_descriptor_id)
724
759
  result = session.exec(statement).one_or_none()
725
760
  return result
726
761
 
727
762
 
728
- def get_collection_from_data_descriptor_in_project(project_id: str,
729
- data_descriptor_id: str) \
730
- -> tuple[str, dict] | None:
763
+ def get_collection_from_data_descriptor_in_project(project_id: str, data_descriptor_id: str) -> tuple[str, dict] | None:
731
764
  """
732
765
  Returns the collection, in the given project, that corresponds to the given data descriptor
733
766
  in the universe.
@@ -746,15 +779,13 @@ def get_collection_from_data_descriptor_in_project(project_id: str,
746
779
  result: tuple[str, dict] | None = None
747
780
  if connection := _get_project_connection(project_id):
748
781
  with connection.create_session() as session:
749
- collection_found = _get_collection_from_data_descriptor_in_project(data_descriptor_id,
750
- session)
782
+ collection_found = _get_collection_from_data_descriptor_in_project(data_descriptor_id, session)
751
783
  if collection_found:
752
784
  result = collection_found.id, collection_found.context
753
785
  return result
754
786
 
755
787
 
756
- def get_collection_from_data_descriptor_in_all_projects(data_descriptor_id: str) \
757
- -> list[tuple[str, str, dict]]:
788
+ def get_collection_from_data_descriptor_in_all_projects(data_descriptor_id: str) -> list[tuple[str, str, dict]]:
758
789
  """
759
790
  Returns the collections, in all projects, that correspond to the given data descriptor
760
791
  in the universe.
@@ -773,28 +804,28 @@ def get_collection_from_data_descriptor_in_all_projects(data_descriptor_id: str)
773
804
  result = list()
774
805
  project_ids = get_all_projects()
775
806
  for project_id in project_ids:
776
- collection_found = get_collection_from_data_descriptor_in_project(project_id,
777
- data_descriptor_id)
807
+ collection_found = get_collection_from_data_descriptor_in_project(project_id, data_descriptor_id)
778
808
  if collection_found:
779
809
  result.append((project_id, collection_found[0], collection_found[1]))
780
810
  return result
781
811
 
782
812
 
783
- def _get_term_from_universe_term_id_in_project(data_descriptor_id: str,
784
- universe_term_id: str,
785
- project_session: Session) -> PTerm | None:
786
- statement = select(PTerm).join(Collection).where(Collection.data_descriptor_id == data_descriptor_id,
787
- PTerm.id == universe_term_id)
813
+ def _get_term_from_universe_term_id_in_project(
814
+ data_descriptor_id: str, universe_term_id: str, project_session: Session
815
+ ) -> PTerm | None:
816
+ statement = (
817
+ select(PTerm)
818
+ .join(PCollection)
819
+ .where(PCollection.data_descriptor_id == data_descriptor_id, PTerm.id == universe_term_id)
820
+ )
788
821
  results = project_session.exec(statement)
789
822
  result = results.one_or_none()
790
823
  return result
791
824
 
792
825
 
793
- def get_term_from_universe_term_id_in_project(project_id: str,
794
- data_descriptor_id: str,
795
- universe_term_id: str,
796
- selected_term_fields: Iterable[str] | None = None) \
797
- -> tuple[str, DataDescriptor] | None:
826
+ def get_term_from_universe_term_id_in_project(
827
+ project_id: str, data_descriptor_id: str, universe_term_id: str, selected_term_fields: Iterable[str] | None = None
828
+ ) -> tuple[str, DataDescriptor] | None:
798
829
  """
799
830
  Returns the term, in the given project, that corresponds to the given term in the universe.
800
831
  This function performs an exact match on the `project_id`, `data_descriptor_id`
@@ -818,19 +849,16 @@ def get_term_from_universe_term_id_in_project(project_id: str,
818
849
  result: tuple[str, DataDescriptor] | None = None
819
850
  if connection := _get_project_connection(project_id):
820
851
  with connection.create_session() as session:
821
- term_found = _get_term_from_universe_term_id_in_project(data_descriptor_id,
822
- universe_term_id,
823
- session)
852
+ term_found = _get_term_from_universe_term_id_in_project(data_descriptor_id, universe_term_id, session)
824
853
  if term_found:
825
854
  pydantic_term = instantiate_pydantic_term(term_found, selected_term_fields)
826
855
  result = (term_found.collection.id, pydantic_term)
827
856
  return result
828
857
 
829
858
 
830
- def get_term_from_universe_term_id_in_all_projects(data_descriptor_id: str,
831
- universe_term_id: str,
832
- selected_term_fields: Iterable[str] | None = None) \
833
- -> list[tuple[str, str, DataDescriptor]]:
859
+ def get_term_from_universe_term_id_in_all_projects(
860
+ data_descriptor_id: str, universe_term_id: str, selected_term_fields: Iterable[str] | None = None
861
+ ) -> list[tuple[str, str, DataDescriptor]]:
834
862
  """
835
863
  Returns the terms, in all projects, that correspond to the given term in the universe.
836
864
  This function performs an exact match on the `data_descriptor_id`
@@ -853,38 +881,37 @@ def get_term_from_universe_term_id_in_all_projects(data_descriptor_id: str,
853
881
  result: list[tuple[str, str, DataDescriptor]] = list()
854
882
  project_ids = get_all_projects()
855
883
  for project_id in project_ids:
856
- term_found = get_term_from_universe_term_id_in_project(project_id,
857
- data_descriptor_id,
858
- universe_term_id,
859
- selected_term_fields)
884
+ term_found = get_term_from_universe_term_id_in_project(
885
+ project_id, data_descriptor_id, universe_term_id, selected_term_fields
886
+ )
860
887
  if term_found:
861
888
  result.append((project_id, term_found[0], term_found[1]))
862
889
  return result
863
890
 
864
891
 
865
- def _find_collections_in_project(expression: str,
866
- session: Session,
867
- only_id: bool = False,
868
- limit: int | None = None,
869
- offset: int | None = None) -> Sequence[Collection]:
892
+ def _find_collections_in_project(
893
+ expression: str, session: Session, only_id: bool = False, limit: int | None = None, offset: int | None = None
894
+ ) -> Sequence[PCollection]:
870
895
  matching_condition = generate_matching_condition(PCollectionFTS5, expression, only_id)
871
896
  tmp_statement = select(PCollectionFTS5).where(matching_condition)
872
- statement = select(Collection).from_statement(handle_rank_limit_offset(tmp_statement, limit, offset))
897
+ statement = select(PCollection).from_statement(handle_rank_limit_offset(tmp_statement, limit, offset))
873
898
  return execute_match_statement(expression, statement, session)
874
899
 
875
900
 
876
- def find_collections_in_project(expression: str, project_id: str,
877
- only_id: bool = False,
878
- limit: int | None = None,
879
- offset: int | None = None) -> list[tuple[str, dict]]:
901
+ def find_collections_in_project(
902
+ expression: str, project_id: str, only_id: bool = False, limit: int | None = None, offset: int | None = None
903
+ ) -> list[tuple[str, dict]]:
880
904
  """
881
905
  Find collections in the given project based on a full text search defined by the given `expression`.
882
- The `expression` comes from the powerful
883
- `SQLite FTS extension <https://sqlite.org/fts5.html#full_text_query_syntax>`_
884
- and corresponds to the expression of the `MATCH` operator.
885
- It can be composed of one or multiple keywords combined with boolean
886
- operators (`NOT`, `AND`, `^`, etc. default is `OR`). Keywords can define prefixes or postfixes
887
- with the wildcard `*`.
906
+ The `expression` can be composed of one or multiple keywords.
907
+ The keywords can be combined with boolean operators: `AND`,
908
+ `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
909
+ if no boolean operator is provided, whitespaces are handled as if there were
910
+ an implicit AND operator between each pair of keywords. Note that this
911
+ function does not provide any priority operator (parenthesis).
912
+ Keywords can define prefixes when adding a `*` at the end of them.
913
+ If the expression is composed of only one keyword, the function
914
+ automatically defines it as a prefix.
888
915
  The function returns a list of collection ids and contexts, sorted according to the
889
916
  bm25 ranking metric (list index `0` has the highest rank).
890
917
  This function performs an exact match on the `project_id`,
@@ -915,52 +942,57 @@ def find_collections_in_project(expression: str, project_id: str,
915
942
  result: list[tuple[str, dict]] = list()
916
943
  if connection := _get_project_connection(project_id):
917
944
  with connection.create_session() as session:
918
- collections_found = _find_collections_in_project(expression, session, only_id,
919
- limit, offset)
945
+ collections_found = _find_collections_in_project(expression, session, only_id, limit, offset)
920
946
  for collection in collections_found:
921
947
  result.append((collection.id, collection.context))
922
948
  return result
923
949
 
924
950
 
925
- def _find_terms_in_collection(expression: str,
926
- collection_id: str,
927
- session: Session,
928
- only_id: bool = False,
929
- limit: int | None = None,
930
- offset: int | None = None) -> Sequence[PTerm]:
951
+ def _find_terms_in_collection(
952
+ expression: str,
953
+ collection_id: str,
954
+ session: Session,
955
+ only_id: bool = False,
956
+ limit: int | None = None,
957
+ offset: int | None = None,
958
+ ) -> Sequence[PTerm]:
931
959
  matching_condition = generate_matching_condition(PTermFTS5, expression, only_id)
932
- where_condition = Collection.id == collection_id, matching_condition
933
- tmp_statement = select(PTermFTS5).join(Collection).where(*where_condition)
960
+ where_condition = PCollection.id == collection_id, matching_condition
961
+ tmp_statement = select(PTermFTS5).join(PCollection).where(*where_condition)
934
962
  statement = select(PTerm).from_statement(handle_rank_limit_offset(tmp_statement, limit, offset))
935
963
  return execute_match_statement(expression, statement, session)
936
964
 
937
965
 
938
- def _find_terms_in_project(expression: str,
939
- session: Session,
940
- only_id: bool = False,
941
- limit: int | None = None,
942
- offset: int | None = None) -> Sequence[PTerm]:
966
+ def _find_terms_in_project(
967
+ expression: str, session: Session, only_id: bool = False, limit: int | None = None, offset: int | None = None
968
+ ) -> Sequence[PTerm]:
943
969
  matching_condition = generate_matching_condition(PTermFTS5, expression, only_id)
944
970
  tmp_statement = select(PTermFTS5).where(matching_condition)
945
971
  statement = select(PTerm).from_statement(handle_rank_limit_offset(tmp_statement, limit, offset))
946
972
  return execute_match_statement(expression, statement, session)
947
973
 
948
974
 
949
- def find_terms_in_collection(expression: str, project_id: str,
950
- collection_id: str,
951
- only_id: bool = False,
952
- limit: int | None = None,
953
- offset: int | None = None,
954
- selected_term_fields: Iterable[str] | None = None) \
955
- -> list[DataDescriptor]:
975
+ def find_terms_in_collection(
976
+ expression: str,
977
+ project_id: str,
978
+ collection_id: str,
979
+ only_id: bool = False,
980
+ limit: int | None = None,
981
+ offset: int | None = None,
982
+ selected_term_fields: Iterable[str] | None = None,
983
+ ) -> list[DataDescriptor]:
956
984
  """
957
985
  Find terms in the given project and collection based on a full text search defined by the given
958
- `expression`. The `expression` comes from the powerful
959
- `SQLite FTS extension <https://sqlite.org/fts5.html#full_text_query_syntax>`_
960
- and corresponds to the expression of the `MATCH` operator.
961
- It can be composed of one or multiple keywords combined with boolean
962
- operators (`NOT`, `AND`, `^`, etc. default is `OR`). Keywords can define prefixes or postfixes
963
- with the wildcard `*`.
986
+ `expression`.
987
+ The `expression` can be composed of one or multiple keywords.
988
+ The keywords can be combined with boolean operators: `AND`,
989
+ `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
990
+ if no boolean operator is provided, whitespaces are handled as if there were
991
+ an implicit AND operator between each pair of keywords. Note that this
992
+ function does not provide any priority operator (parenthesis).
993
+ Keywords can define prefixes when adding a `*` at the end of them.
994
+ If the expression is composed of only one keyword, the function
995
+ automatically defines it as a prefix.
964
996
  The function returns a list of term instances, sorted according to the
965
997
  bm25 ranking metric (list index `0` has the highest rank).
966
998
  This function performs an exact match on the `project_id` and `collection_id`,
@@ -995,27 +1027,30 @@ def find_terms_in_collection(expression: str, project_id: str,
995
1027
  result: list[DataDescriptor] = list()
996
1028
  if connection := _get_project_connection(project_id):
997
1029
  with connection.create_session() as session:
998
- pterms_found = _find_terms_in_collection(expression, collection_id, session,
999
- only_id, limit, offset)
1030
+ pterms_found = _find_terms_in_collection(expression, collection_id, session, only_id, limit, offset)
1000
1031
  instantiate_pydantic_terms(pterms_found, result, selected_term_fields)
1001
1032
  return result
1002
1033
 
1003
1034
 
1004
- def find_terms_in_project(expression: str,
1005
- project_id: str,
1006
- only_id: bool = False,
1007
- limit: int | None = None,
1008
- offset: int | None = None,
1009
- selected_term_fields: Iterable[str] | None = None) \
1010
- -> list[DataDescriptor]:
1035
+ def find_terms_in_project(
1036
+ expression: str,
1037
+ project_id: str,
1038
+ only_id: bool = False,
1039
+ limit: int | None = None,
1040
+ offset: int | None = None,
1041
+ selected_term_fields: Iterable[str] | None = None,
1042
+ ) -> list[DataDescriptor]:
1011
1043
  """
1012
- Find terms in the given project on a full text search defined by the given
1013
- `expression`. The `expression` comes from the powerful
1014
- `SQLite FTS extension <https://sqlite.org/fts5.html#full_text_query_syntax>`_
1015
- and corresponds to the expression of the `MATCH` operator.
1016
- It can be composed of one or multiple keywords combined with boolean
1017
- operators (`NOT`, `AND`, `^`, etc. default is `OR`). Keywords can define prefixes or postfixes
1018
- with the wildcard `*`.
1044
+ Find terms in the given project based on a full text search defined by the given `expression`.
1045
+ The `expression` can be composed of one or multiple keywords.
1046
+ The keywords can be combined with boolean operators: `AND`,
1047
+ `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
1048
+ if no boolean operator is provided, whitespaces are handled as if there were
1049
+ an implicit AND operator between each pair of keywords. Note that this
1050
+ function does not provide any priority operator (parenthesis).
1051
+ Keywords can define prefixes when adding a `*` at the end of them.
1052
+ If the expression is composed of only one keyword, the function
1053
+ automatically defines it as a prefix.
1019
1054
  The function returns a list of term instances, sorted according to the
1020
1055
  bm25 ranking metric (list index `0` has the highest rank).
1021
1056
  This function performs an exact match on the `project_id`,
@@ -1053,20 +1088,24 @@ def find_terms_in_project(expression: str,
1053
1088
  return result
1054
1089
 
1055
1090
 
1056
- def find_terms_in_all_projects(expression: str,
1057
- only_id: bool = False,
1058
- limit: int | None = None,
1059
- offset: int | None = None,
1060
- selected_term_fields: Iterable[str] | None = None) \
1061
- -> list[tuple[str, list[DataDescriptor]]]:
1091
+ def find_terms_in_all_projects(
1092
+ expression: str,
1093
+ only_id: bool = False,
1094
+ limit: int | None = None,
1095
+ offset: int | None = None,
1096
+ selected_term_fields: Iterable[str] | None = None,
1097
+ ) -> list[tuple[str, list[DataDescriptor]]]:
1062
1098
  """
1063
- Find terms in the all projects on a full text search defined by the given
1064
- `expression`. The `expression` comes from the powerful
1065
- `SQLite FTS extension <https://sqlite.org/fts5.html#full_text_query_syntax>`_
1066
- and corresponds to the expression of the `MATCH` operator.
1067
- It can be composed of one or multiple keywords combined with boolean
1068
- operators (`NOT`, `AND`, `^`, etc. default is `OR`). Keywords can define prefixes or postfixes
1069
- with the wildcard `*`.
1099
+ Find terms in all projects based on a full text search defined by the given `expression`.
1100
+ The `expression` can be composed of one or multiple keywords.
1101
+ The keywords can be combined with boolean operators: `AND`,
1102
+ `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
1103
+ if no boolean operator is provided, whitespaces are handled as if there were
1104
+ an implicit AND operator between each pair of keywords. Note that this
1105
+ function does not provide any priority operator (parenthesis).
1106
+ Keywords can define prefixes when adding a `*` at the end of them.
1107
+ If the expression is composed of only one keyword, the function
1108
+ automatically defines it as a prefix.
1070
1109
  The function returns a list of project ids and term instances, sorted according to the
1071
1110
  bm25 ranking metric (list index `0` has the highest rank).
1072
1111
  If the provided `expression` does not hit any term, the function returns an empty list.
@@ -1094,26 +1133,27 @@ def find_terms_in_all_projects(expression: str,
1094
1133
  result: list[tuple[str, list[DataDescriptor]]] = list()
1095
1134
  project_ids = get_all_projects()
1096
1135
  for project_id in project_ids:
1097
- terms_found = find_terms_in_project(expression, project_id, only_id,
1098
- limit, offset, selected_term_fields)
1136
+ terms_found = find_terms_in_project(expression, project_id, only_id, limit, offset, selected_term_fields)
1099
1137
  if terms_found:
1100
1138
  result.append((project_id, terms_found))
1101
1139
  return result
1102
1140
 
1103
1141
 
1104
- def find_items_in_project(expression: str,
1105
- project_id: str,
1106
- only_id: bool = False,
1107
- limit: int | None = None,
1108
- offset: int | None = None) -> list[Item]:
1142
+ def find_items_in_project(
1143
+ expression: str, project_id: str, only_id: bool = False, limit: int | None = None, offset: int | None = None
1144
+ ) -> list[Item]:
1109
1145
  """
1110
1146
  Find items, at the moment terms and collections, in the given project based on a full-text
1111
- search defined by the given `expression`. The `expression` comes from the powerful
1112
- `SQLite FTS extension <https://sqlite.org/fts5.html#full_text_query_syntax>`_
1113
- and corresponds to the expression of the `MATCH` operator.
1114
- It can be composed of one or multiple keywords combined with boolean
1115
- operators (`NOT`, `AND`, `^`, etc. default is `OR`). Keywords can define prefixes or postfixes
1116
- with the wildcard `*`.
1147
+ search defined by the given `expression`.
1148
+ The `expression` can be composed of one or multiple keywords.
1149
+ The keywords can be combined with boolean operators: `AND`,
1150
+ `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
1151
+ if no boolean operator is provided, whitespaces are handled as if there were
1152
+ an implicit AND operator between each pair of keywords. Note that this
1153
+ function does not provide any priority operator (parenthesis).
1154
+ Keywords can define prefixes when adding a `*` at the end of them.
1155
+ If the expression is composed of only one keyword, the function
1156
+ automatically defines it as a prefix.
1117
1157
  The function returns a list of item instances sorted according to the
1118
1158
  bm25 ranking metric (list index `0` has the highest rank).
1119
1159
  This function performs an exact match on the `project_id`,
@@ -1143,23 +1183,24 @@ def find_items_in_project(expression: str,
1143
1183
  result = list()
1144
1184
  if connection := _get_project_connection(project_id):
1145
1185
  with connection.create_session() as session:
1186
+ processed_expression = process_expression(expression)
1146
1187
  if only_id:
1147
1188
  collection_column = col(PCollectionFTS5.id)
1148
1189
  term_column = col(PTermFTS5.id)
1149
1190
  else:
1150
1191
  collection_column = col(PCollectionFTS5.id) # TODO: use specs when implemented!
1151
1192
  term_column = col(PTermFTS5.specs) # type: ignore
1152
- collection_where_condition = collection_column.match(expression)
1153
- collection_statement = select(PCollectionFTS5.id,
1154
- text("'collection' AS TYPE"),
1155
- text(f"'{project_id}' AS TYPE"),
1156
- text('rank')).where(collection_where_condition)
1157
- term_where_condition = term_column.match(expression)
1158
- term_statement = select(PTermFTS5.id,
1159
- text("'term' AS TYPE"),
1160
- Collection.id,
1161
- text('rank')).join(Collection) \
1162
- .where(term_where_condition)
1163
- result = execute_find_item_statements(session, expression, collection_statement,
1164
- term_statement, limit, offset)
1193
+ collection_where_condition = collection_column.match(processed_expression)
1194
+ collection_statement = select(
1195
+ PCollectionFTS5.id, text("'collection' AS TYPE"), text(f"'{project_id}' AS TYPE"), text("rank")
1196
+ ).where(collection_where_condition)
1197
+ term_where_condition = term_column.match(processed_expression)
1198
+ term_statement = (
1199
+ select(PTermFTS5.id, text("'term' AS TYPE"), PCollection.id, text("rank"))
1200
+ .join(PCollection)
1201
+ .where(term_where_condition)
1202
+ )
1203
+ result = execute_find_item_statements(
1204
+ session, processed_expression, collection_statement, term_statement, limit, offset
1205
+ )
1165
1206
  return result