esgvoc 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. esgvoc/__init__.py +3 -0
  2. esgvoc/api/__init__.py +91 -0
  3. esgvoc/api/data_descriptors/EMD_models/__init__.py +66 -0
  4. esgvoc/api/data_descriptors/EMD_models/arrangement.py +21 -0
  5. esgvoc/api/data_descriptors/EMD_models/calendar.py +5 -0
  6. esgvoc/api/data_descriptors/EMD_models/cell_variable_type.py +20 -0
  7. esgvoc/api/data_descriptors/EMD_models/component_type.py +5 -0
  8. esgvoc/api/data_descriptors/EMD_models/coordinate.py +52 -0
  9. esgvoc/api/data_descriptors/EMD_models/grid_mapping.py +19 -0
  10. esgvoc/api/data_descriptors/EMD_models/grid_region.py +19 -0
  11. esgvoc/api/data_descriptors/EMD_models/grid_type.py +19 -0
  12. esgvoc/api/data_descriptors/EMD_models/horizontal_computational_grid.py +56 -0
  13. esgvoc/api/data_descriptors/EMD_models/horizontal_grid_cells.py +230 -0
  14. esgvoc/api/data_descriptors/EMD_models/horizontal_subgrid.py +41 -0
  15. esgvoc/api/data_descriptors/EMD_models/horizontal_units.py +5 -0
  16. esgvoc/api/data_descriptors/EMD_models/model.py +139 -0
  17. esgvoc/api/data_descriptors/EMD_models/model_component.py +115 -0
  18. esgvoc/api/data_descriptors/EMD_models/reference.py +61 -0
  19. esgvoc/api/data_descriptors/EMD_models/resolution.py +48 -0
  20. esgvoc/api/data_descriptors/EMD_models/temporal_refinement.py +19 -0
  21. esgvoc/api/data_descriptors/EMD_models/truncation_method.py +17 -0
  22. esgvoc/api/data_descriptors/EMD_models/vertical_computational_grid.py +91 -0
  23. esgvoc/api/data_descriptors/EMD_models/vertical_coordinate.py +5 -0
  24. esgvoc/api/data_descriptors/EMD_models/vertical_units.py +19 -0
  25. esgvoc/api/data_descriptors/__init__.py +159 -0
  26. esgvoc/api/data_descriptors/activity.py +72 -0
  27. esgvoc/api/data_descriptors/archive.py +5 -0
  28. esgvoc/api/data_descriptors/area_label.py +30 -0
  29. esgvoc/api/data_descriptors/branded_suffix.py +30 -0
  30. esgvoc/api/data_descriptors/branded_variable.py +21 -0
  31. esgvoc/api/data_descriptors/citation_url.py +5 -0
  32. esgvoc/api/data_descriptors/contact.py +5 -0
  33. esgvoc/api/data_descriptors/conventions.py +28 -0
  34. esgvoc/api/data_descriptors/creation_date.py +18 -0
  35. esgvoc/api/data_descriptors/data_descriptor.py +127 -0
  36. esgvoc/api/data_descriptors/data_specs_version.py +25 -0
  37. esgvoc/api/data_descriptors/date.py +5 -0
  38. esgvoc/api/data_descriptors/directory_date.py +22 -0
  39. esgvoc/api/data_descriptors/drs_specs.py +38 -0
  40. esgvoc/api/data_descriptors/experiment.py +215 -0
  41. esgvoc/api/data_descriptors/forcing_index.py +21 -0
  42. esgvoc/api/data_descriptors/frequency.py +48 -0
  43. esgvoc/api/data_descriptors/further_info_url.py +5 -0
  44. esgvoc/api/data_descriptors/grid.py +43 -0
  45. esgvoc/api/data_descriptors/horizontal_label.py +20 -0
  46. esgvoc/api/data_descriptors/initialization_index.py +27 -0
  47. esgvoc/api/data_descriptors/institution.py +80 -0
  48. esgvoc/api/data_descriptors/known_branded_variable.py +75 -0
  49. esgvoc/api/data_descriptors/license.py +31 -0
  50. esgvoc/api/data_descriptors/member_id.py +9 -0
  51. esgvoc/api/data_descriptors/mip_era.py +26 -0
  52. esgvoc/api/data_descriptors/model_component.py +32 -0
  53. esgvoc/api/data_descriptors/models_test/models.py +17 -0
  54. esgvoc/api/data_descriptors/nominal_resolution.py +50 -0
  55. esgvoc/api/data_descriptors/obs_type.py +5 -0
  56. esgvoc/api/data_descriptors/organisation.py +22 -0
  57. esgvoc/api/data_descriptors/physics_index.py +21 -0
  58. esgvoc/api/data_descriptors/product.py +16 -0
  59. esgvoc/api/data_descriptors/publication_status.py +5 -0
  60. esgvoc/api/data_descriptors/realization_index.py +24 -0
  61. esgvoc/api/data_descriptors/realm.py +16 -0
  62. esgvoc/api/data_descriptors/regex.py +5 -0
  63. esgvoc/api/data_descriptors/region.py +35 -0
  64. esgvoc/api/data_descriptors/resolution.py +7 -0
  65. esgvoc/api/data_descriptors/source.py +120 -0
  66. esgvoc/api/data_descriptors/source_type.py +5 -0
  67. esgvoc/api/data_descriptors/sub_experiment.py +5 -0
  68. esgvoc/api/data_descriptors/table.py +28 -0
  69. esgvoc/api/data_descriptors/temporal_label.py +20 -0
  70. esgvoc/api/data_descriptors/time_range.py +17 -0
  71. esgvoc/api/data_descriptors/title.py +5 -0
  72. esgvoc/api/data_descriptors/tracking_id.py +67 -0
  73. esgvoc/api/data_descriptors/variable.py +56 -0
  74. esgvoc/api/data_descriptors/variant_label.py +25 -0
  75. esgvoc/api/data_descriptors/vertical_label.py +20 -0
  76. esgvoc/api/project_specs.py +143 -0
  77. esgvoc/api/projects.py +1253 -0
  78. esgvoc/api/py.typed +0 -0
  79. esgvoc/api/pydantic_handler.py +146 -0
  80. esgvoc/api/report.py +127 -0
  81. esgvoc/api/search.py +171 -0
  82. esgvoc/api/universe.py +434 -0
  83. esgvoc/apps/__init__.py +6 -0
  84. esgvoc/apps/cmor_tables/__init__.py +7 -0
  85. esgvoc/apps/cmor_tables/cvs_table.py +948 -0
  86. esgvoc/apps/drs/__init__.py +0 -0
  87. esgvoc/apps/drs/constants.py +2 -0
  88. esgvoc/apps/drs/generator.py +429 -0
  89. esgvoc/apps/drs/report.py +540 -0
  90. esgvoc/apps/drs/validator.py +312 -0
  91. esgvoc/apps/ga/__init__.py +104 -0
  92. esgvoc/apps/ga/example_usage.py +315 -0
  93. esgvoc/apps/ga/models/__init__.py +47 -0
  94. esgvoc/apps/ga/models/netcdf_header.py +306 -0
  95. esgvoc/apps/ga/models/validator.py +491 -0
  96. esgvoc/apps/ga/test_ga.py +161 -0
  97. esgvoc/apps/ga/validator.py +277 -0
  98. esgvoc/apps/jsg/json_schema_generator.py +341 -0
  99. esgvoc/apps/jsg/templates/template.jinja +241 -0
  100. esgvoc/apps/test_cv/README.md +214 -0
  101. esgvoc/apps/test_cv/__init__.py +0 -0
  102. esgvoc/apps/test_cv/cv_tester.py +1611 -0
  103. esgvoc/apps/test_cv/example_usage.py +216 -0
  104. esgvoc/apps/vr/__init__.py +12 -0
  105. esgvoc/apps/vr/build_variable_registry.py +71 -0
  106. esgvoc/apps/vr/example_usage.py +60 -0
  107. esgvoc/apps/vr/vr_app.py +333 -0
  108. esgvoc/cli/clean.py +304 -0
  109. esgvoc/cli/cmor.py +46 -0
  110. esgvoc/cli/config.py +1300 -0
  111. esgvoc/cli/drs.py +267 -0
  112. esgvoc/cli/find.py +138 -0
  113. esgvoc/cli/get.py +155 -0
  114. esgvoc/cli/install.py +41 -0
  115. esgvoc/cli/main.py +60 -0
  116. esgvoc/cli/offline.py +269 -0
  117. esgvoc/cli/status.py +79 -0
  118. esgvoc/cli/test_cv.py +258 -0
  119. esgvoc/cli/valid.py +147 -0
  120. esgvoc/core/constants.py +17 -0
  121. esgvoc/core/convert.py +0 -0
  122. esgvoc/core/data_handler.py +206 -0
  123. esgvoc/core/db/__init__.py +3 -0
  124. esgvoc/core/db/connection.py +40 -0
  125. esgvoc/core/db/models/mixins.py +25 -0
  126. esgvoc/core/db/models/project.py +102 -0
  127. esgvoc/core/db/models/universe.py +98 -0
  128. esgvoc/core/db/project_ingestion.py +231 -0
  129. esgvoc/core/db/universe_ingestion.py +172 -0
  130. esgvoc/core/exceptions.py +33 -0
  131. esgvoc/core/logging_handler.py +26 -0
  132. esgvoc/core/repo_fetcher.py +345 -0
  133. esgvoc/core/service/__init__.py +41 -0
  134. esgvoc/core/service/configuration/config_manager.py +196 -0
  135. esgvoc/core/service/configuration/setting.py +363 -0
  136. esgvoc/core/service/data_merger.py +634 -0
  137. esgvoc/core/service/esg_voc.py +77 -0
  138. esgvoc/core/service/resolver_config.py +56 -0
  139. esgvoc/core/service/state.py +324 -0
  140. esgvoc/core/service/string_heuristics.py +98 -0
  141. esgvoc/core/service/term_cache.py +108 -0
  142. esgvoc/core/service/uri_resolver.py +133 -0
  143. esgvoc-2.0.2.dist-info/METADATA +82 -0
  144. esgvoc-2.0.2.dist-info/RECORD +147 -0
  145. esgvoc-2.0.2.dist-info/WHEEL +4 -0
  146. esgvoc-2.0.2.dist-info/entry_points.txt +2 -0
  147. esgvoc-2.0.2.dist-info/licenses/LICENSE.txt +519 -0
esgvoc/api/projects.py ADDED
@@ -0,0 +1,1253 @@
1
+ import itertools
2
+ import re
3
+ from typing import Iterable, Sequence, cast
4
+
5
+ from sqlalchemy import text
6
+ from sqlmodel import Session, and_, col, select
7
+
8
+ import esgvoc.api.universe as universe
9
+ import esgvoc.core.constants as constants
10
+ import esgvoc.core.service as service
11
+ from esgvoc.api.data_descriptors.data_descriptor import DataDescriptor
12
+ from esgvoc.api.project_specs import ProjectSpecs
13
+ from esgvoc.api.report import ProjectTermError, UniverseTermError, ValidationReport
14
+ from esgvoc.api.pydantic_handler import instantiate_pydantic_term
15
+ from esgvoc.api.search import (
16
+ Item,
17
+ MatchingTerm,
18
+ execute_find_item_statements,
19
+ execute_match_statement,
20
+ generate_matching_condition,
21
+ get_universe_session,
22
+ handle_rank_limit_offset,
23
+ instantiate_pydantic_terms,
24
+ process_expression,
25
+ )
26
+ from esgvoc.core.db.connection import DBConnection
27
+ from esgvoc.core.db.models.mixins import TermKind
28
+ from esgvoc.core.db.models.project import PCollection, PCollectionFTS5, Project, PTerm, PTermFTS5
29
+ from esgvoc.core.db.models.universe import UTerm
30
+ from esgvoc.core.exceptions import EsgvocDbError, EsgvocNotFoundError, EsgvocNotImplementedError, EsgvocValueError
31
+
32
+ # [OPTIMIZATION]
33
+ _VALID_TERM_IN_COLLECTION_CACHE: dict[str, list[MatchingTerm]] = dict()
34
+ _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE: dict[str, list[UniverseTermError | ProjectTermError]] = dict()
35
+
36
+
37
def _get_project_connection(project_id: str) -> DBConnection | None:
    """
    Look up the database connection of a project in the current service state.

    :param project_id: A project id
    :type project_id: str
    :returns: The connection attached to the project, or `None` when the project id \
    is not a key of `service.current_state.projects`.
    :rtype: DBConnection | None
    """
    known_projects = service.current_state.projects
    if project_id not in known_projects:
        return None
    return known_projects[project_id].db_connection
42
+
43
+
44
def _get_project_session_with_exception(project_id: str) -> Session:
    """
    Create a new database session for the given project.

    :param project_id: A project id
    :type project_id: str
    :returns: A session created from the connection of the project.
    :rtype: Session
    :raises EsgvocNotFoundError: If no connection is available for the project id
    """
    connection = _get_project_connection(project_id)
    if not connection:
        raise EsgvocNotFoundError(f"unable to find project '{project_id}'")
    return connection.create_session()
50
+
51
+
52
def _resolve_composite_term_part(
    composite_term_part: dict, universe_session: Session, project_session: Session
) -> UTerm | PTerm | Sequence[UTerm | PTerm]:
    """
    Resolve one part of a composite term into the term(s) it designates.

    When the part carries a term id, that single term is looked up. Otherwise the part
    only carries a type and all the terms of that type are returned. In both cases the
    universe is searched first, then the current project.

    :param composite_term_part: The description of the part (type, and optionally term id)
    :type composite_term_part: dict
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: A single term when the part has a term id, otherwise the terms of the \
    matching data descriptor or collection.
    :rtype: UTerm | PTerm | Sequence[UTerm | PTerm]
    :raises EsgvocNotFoundError: If nothing matches in the universe nor in the project
    """
    type_id = composite_term_part[constants.TERM_TYPE_JSON_KEY]

    if constants.TERM_ID_JSON_KEY not in composite_term_part:
        # No term id: the part stands for every term of the data descriptor or collection.
        descriptor = universe._get_data_descriptor_in_universe(type_id, universe_session)
        if descriptor is not None:
            return descriptor.terms
        project_collection = _get_collection_in_project(type_id, project_session)
        if project_collection is not None:
            return project_collection.terms
        raise EsgvocNotFoundError(f"unable to find the terms of '{type_id}'")

    # First look for the term in the universe, then in the current project.
    wanted_id = composite_term_part[constants.TERM_ID_JSON_KEY]
    found = universe._get_term_in_data_descriptor(
        data_descriptor_id=type_id, term_id=wanted_id, session=universe_session
    )
    if found:
        return found
    found = _get_term_in_collection(collection_id=type_id, term_id=wanted_id, session=project_session)
    if found:
        return found
    raise EsgvocNotFoundError(f"unable to find the term '{wanted_id}' in '{type_id}'")
83
+
84
+
85
def _get_composite_term_separator_parts(term: UTerm | PTerm) -> tuple[str, list]:
    """
    Extract the separator and the parts from the specs of a composite term.

    :param term: A composite term
    :type term: UTerm | PTerm
    :returns: The separator and the list of parts read from `term.specs`.
    :rtype: tuple[str, list]
    """
    specs = term.specs
    return specs[constants.COMPOSITE_SEPARATOR_JSON_KEY], specs[constants.COMPOSITE_PARTS_JSON_KEY]
89
+
90
+
91
def _valid_value_composite_term_with_separator(
    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
) -> list[UniverseTermError | ProjectTermError]:
    """
    Validate a value against a composite term whose parts are joined by a separator.

    The value is split on the separator. The resulting tokens are then assigned, in order,
    to the parts of the composite: every part flagged as required must receive a token,
    the other parts may be skipped. The value is valid as soon as one assignment exists
    where each token is valid for at least one candidate term of the part it is assigned to.

    The specs of the term are only read: the candidate term ids of each part are kept in a
    local cache instead of being written back into the parts.

    :param value: The value to validate
    :type value: str
    :param term: A composite term with a non empty separator
    :type term: UTerm | PTerm
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: An empty list if the value is valid, otherwise a list with a single error.
    :rtype: list[UniverseTermError | ProjectTermError]
    """
    separator, parts = _get_composite_term_separator_parts(term)
    required_indices = {i for i, p in enumerate(parts) if p.get(constants.COMPOSITE_REQUIRED_KEY, False)}

    splits = value.split(separator)
    nb_splits = len(splits)
    nb_parts = len(parts)

    # More tokens than parts: no assignment is possible.
    # (str.split returns at least one token, so this also covers a composite without parts.)
    if nb_splits > nb_parts:
        return [_create_term_error(value, term)]

    # These only depend on the value: compute them once, not for every assignment.
    starts_with_separator = value.startswith(separator)
    ends_with_separator = value.endswith(separator)
    has_double_separator = separator * 2 in value

    # Candidate term ids per part index, resolved lazily and at most once per call.
    candidate_ids_by_part: dict[int, list] = {}

    for positions in itertools.combinations(range(nb_parts), nb_splits):
        # Only consider the assignments that fill every required part.
        if not required_indices.issubset(positions):
            continue

        candidate: list[str | None] = [None] * nb_parts
        for token, position in zip(splits, positions):
            candidate[position] = token

        # Separator structure validation:
        # - No leading separator if the first part is missing
        # - No trailing separator if the last part is missing
        # - No double separator when two adjacent parts are missing
        if candidate[0] is None and starts_with_separator:
            continue
        if candidate[-1] is None and ends_with_separator:
            continue
        if has_double_separator and any(
            left is None and right is None for left, right in itertools.pairwise(candidate)
        ):
            continue

        all_valid = True
        for index, given_value in enumerate(candidate):
            if given_value is None:
                # A missing part is necessarily optional: the required parts are always
                # filled thanks to the filter on the assignments above.
                continue

            part = parts[index]

            if index not in candidate_ids_by_part:
                # NOTE(review): the literal keys "id" and "type" are kept from the original
                # code; presumably they match constants.TERM_ID_JSON_KEY / TERM_TYPE_JSON_KEY.
                if "id" in part:
                    declared_ids = part["id"]
                    candidate_ids_by_part[index] = (
                        [declared_ids] if isinstance(declared_ids, str) else declared_ids
                    )
                else:
                    # No id declared: every term of the data descriptor is a candidate.
                    descriptor_terms = universe.get_all_terms_in_data_descriptor(part["type"], None)
                    candidate_ids_by_part[index] = [
                        descriptor_term.id for descriptor_term in descriptor_terms
                    ]

            # Try all the candidate term ids until one accepts the token.
            valid_for_this_part = False
            for candidate_id in candidate_ids_by_part[index]:
                part_copy = dict(part)
                part_copy["id"] = candidate_id
                resolved_term = _resolve_composite_term_part(part_copy, universe_session, project_session)
                # resolved_term can't be a list of terms here, as the part has an id.
                resolved_term = cast(UTerm | PTerm, resolved_term)
                if not _valid_value(given_value, resolved_term, universe_session, project_session):
                    valid_for_this_part = True
                    break
            if not valid_for_this_part:
                all_valid = False
                break

        if all_valid:
            return []  # At least one valid assignment found

    return [_create_term_error(value, term)]  # No valid assignment found
167
+
168
+
169
+ def _transform_to_pattern(term: UTerm | PTerm, universe_session: Session, project_session: Session) -> str:
170
+ match term.kind:
171
+ case TermKind.PLAIN:
172
+ if constants.DRS_SPECS_JSON_KEY in term.specs:
173
+ result = term.specs[constants.DRS_SPECS_JSON_KEY]
174
+ else:
175
+ raise EsgvocValueError(f"the term '{term.id}' doesn't have drs name. " + "Can't validate it.")
176
+ case TermKind.PATTERN:
177
+ result = term.specs[constants.PATTERN_JSON_KEY]
178
+ case TermKind.COMPOSITE:
179
+ separator, parts = _get_composite_term_separator_parts(term)
180
+ result = ""
181
+ for part in parts:
182
+ resolved_term = _resolve_composite_term_part(part, universe_session, project_session)
183
+ if isinstance(resolved_term, Sequence):
184
+ pattern = ""
185
+ for r_term in resolved_term:
186
+ pattern += _transform_to_pattern(r_term, universe_session, project_session)
187
+ else:
188
+ pattern = _transform_to_pattern(resolved_term, universe_session, project_session)
189
+ result = f"{result}{pattern}{separator}"
190
+ result = result.rstrip(separator)
191
+ case _:
192
+ raise EsgvocDbError(f"unsupported term kind '{term.kind}'")
193
+ return result
194
+
195
+
196
# TODO: support optionality of parts of composite.
# Handling more than one missing part would presumably require backtracking.
def _valid_value_composite_term_separator_less(
    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
) -> list[UniverseTermError | ProjectTermError]:
    """
    Validate a value against a composite term that has no separator.

    The parts of the composite are aggregated into a single regex, which is matched
    against the whole value.

    :param value: The value to validate
    :type value: str
    :param term: A composite term without separator
    :type term: UTerm | PTerm
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: An empty list if the value matches, otherwise a list with a single error.
    :rtype: list[UniverseTermError | ProjectTermError]
    :raises EsgvocNotImplementedError: If anything fails while building or applying the regex
    """
    try:
        aggregated = _transform_to_pattern(term, universe_session, project_session)
        try:
            # Pattern terms are meant to be validated individually, so each regex is
            # defined as a whole (starts with ^, ends with $). The aggregated pattern may
            # then contain several ^ and $: they are all removed and the result is
            # anchored once.
            unanchored = aggregated.replace("^", "").replace("$", "")
            compiled = re.compile(f"^{unanchored}$")
        except Exception as e:
            msg = f"regex compilation error while processing term '{term.id}'':\n{e}"
            raise EsgvocDbError(msg) from e
        if compiled.match(value) is None:
            return [_create_term_error(value, term)]
        return []
    except Exception as e:
        msg = f"cannot validate separator less composite term '{term.id}':\n{e}"
        raise EsgvocNotImplementedError(msg) from e
222
+
223
+
224
def _valid_value_for_composite_term(
    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
) -> list[UniverseTermError | ProjectTermError]:
    """
    Validate a value against a composite term.

    The validation strategy depends on whether the composite declares a separator.

    :param value: The value to validate
    :type value: str
    :param term: A composite term
    :type term: UTerm | PTerm
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The list of errors, empty if the value is valid.
    :rtype: list[UniverseTermError | ProjectTermError]
    """
    separator, _ = _get_composite_term_separator_parts(term)
    validate = (
        _valid_value_composite_term_with_separator if separator else _valid_value_composite_term_separator_less
    )
    return validate(value, term, universe_session, project_session)
234
+
235
+
236
def _create_term_error(value: str, term: UTerm | PTerm) -> UniverseTermError | ProjectTermError:
    """
    Build the error reporting that the value does not match the term.

    :param value: The rejected value
    :type value: str
    :param term: The term the value was validated against
    :type term: UTerm | PTerm
    :returns: A universe error for a universe term, otherwise a project error.
    :rtype: UniverseTermError | ProjectTermError
    """
    shared_fields = {"value": value, "term": term.specs, "term_kind": term.kind}
    if isinstance(term, UTerm):
        return UniverseTermError(**shared_fields, data_descriptor_id=term.data_descriptor.id)
    return ProjectTermError(**shared_fields, collection_id=term.collection.id)
243
+
244
+
245
def _valid_value(
    value: str, term: UTerm | PTerm, universe_session: Session, project_session: Session
) -> list[UniverseTermError | ProjectTermError]:
    """
    Validate a value against a term, according to the kind of the term.

    :param value: The value to validate
    :type value: str
    :param term: The term to validate the value against
    :type term: UTerm | PTerm
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The list of errors, empty if the value is valid.
    :rtype: list[UniverseTermError | ProjectTermError]
    :raises EsgvocValueError: If a plain term has no DRS name
    :raises EsgvocDbError: If the kind of the term is not supported
    """
    kind = term.kind

    if kind == TermKind.PLAIN:
        if constants.DRS_SPECS_JSON_KEY not in term.specs:
            raise EsgvocValueError(f"the term '{term.id}' doesn't have drs name. " + "Can't validate it.")
        if term.specs[constants.DRS_SPECS_JSON_KEY] != value:
            return [_create_term_error(value, term)]
        return []

    if kind == TermKind.PATTERN:
        # TODO: Pattern can be compiled and stored for further matching.
        if re.match(term.specs[constants.PATTERN_JSON_KEY], value) is None:
            return [_create_term_error(value, term)]
        return []

    if kind == TermKind.COMPOSITE:
        return list(_valid_value_for_composite_term(value, term, universe_session, project_session))

    raise EsgvocDbError(f"unsupported term kind '{kind}'")
266
+
267
+
268
def _check_value(value: str) -> str:
    """
    Ensure that the given value is neither empty nor made of whitespace only.

    :param value: The value to check
    :type value: str
    :returns: The value, unchanged.
    :rtype: str
    :raises EsgvocValueError: If the value is empty or blank
    """
    is_blank = (not value) or value.isspace()
    if is_blank:
        raise EsgvocValueError("value should be set")
    return value
273
+
274
+
275
def _search_plain_term_and_valid_value(value: str, collection_id: str, project_session: Session) -> str | None:
    """
    Search, in the given collection, the plain term whose DRS name is the given value.

    :param value: The value to look for
    :type value: str
    :param collection_id: A collection id
    :type collection_id: str
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The id of the matching term, or `None` if no term matches.
    :rtype: str | None
    """
    in_collection = PCollection.id == collection_id
    # The value is wrapped in double quotes for the comparison with the JSON field.
    same_drs_name = PTerm.specs[constants.DRS_SPECS_JSON_KEY] == f'"{value}"'
    statement = select(PTerm).join(PCollection).where(and_(in_collection, same_drs_name))
    found = project_session.exec(statement).one_or_none()
    if found:
        return found.id
    return None
280
+
281
+
282
def _valid_value_against_all_terms_of_collection(
    value: str, collection: PCollection, universe_session: Session, project_session: Session
) -> list[str]:
    """
    Validate a value against every term of a collection.

    :param value: The value to validate
    :type value: str
    :param collection: The collection whose terms are tried
    :type collection: PCollection
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The ids of the terms that accept the value.
    :rtype: list[str]
    :raises EsgvocDbError: If the collection has no term
    """
    if not collection.terms:
        raise EsgvocDbError(f"collection '{collection.id}' has no term")
    return [
        pterm.id
        for pterm in collection.terms
        if not _valid_value(value, pterm, universe_session, project_session)
    ]
294
+
295
+
296
def _valid_value_against_given_term(
    value: str, project_id: str, collection_id: str, term_id: str, universe_session: Session, project_session: Session
) -> list[UniverseTermError | ProjectTermError]:
    """
    Validate a value against one term of a collection, with memoization.

    The result is stored in `_VALID_VALUE_AGAINST_GIVEN_TERM_CACHE`. Nothing is cached
    when the term is not found.

    :param value: The value to validate
    :type value: str
    :param project_id: A project id
    :type project_id: str
    :param collection_id: A collection id
    :type collection_id: str
    :param term_id: A term id
    :type term_id: str
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The list of errors, empty if the value is valid.
    :rtype: list[UniverseTermError | ProjectTermError]
    :raises EsgvocNotFoundError: If the term is not found in the collection
    """
    # [OPTIMIZATION]
    # The components of the key are delimited: a plain concatenation is ambiguous
    # ("ab" + "c" and "a" + "bc" give the same key) and could return the result
    # cached for other arguments.
    key = "\x00".join((value, project_id, collection_id, term_id))
    if key in _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE:
        return _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE[key]

    term = _get_term_in_collection(collection_id, term_id, project_session)
    if not term:
        raise EsgvocNotFoundError(f"unable to find term '{term_id}' " + f"in collection '{collection_id}'")
    result = _valid_value(value, term, universe_session, project_session)
    _VALID_VALUE_AGAINST_GIVEN_TERM_CACHE[key] = result
    return result
311
+
312
+
313
def valid_term(value: str, project_id: str, collection_id: str, term_id: str) -> ValidationReport:
    """
    Check whether the given value represents the given term. The function returns
    a report that contains the possible errors.

    Behavior based on the nature of the term:
        - plain term: the function tries to match the value on the drs_name field.
        - pattern term: the function tries to match the value on the pattern field (regex).
        - composite term:
            - if the composite has a separator, the function splits the value according to
              the separator of the term, then tries to match the parts of the composite
              with the splits of the value.
            - if the composite has no separator, the function aggregates the parts of the
              composite so as to compare it, as a regex, to the value.

    If any of the provided ids (`project_id`, `collection_id` or `term_id`) is not found,
    the function raises an EsgvocNotFoundError.

    :param value: A value to be validated
    :type value: str
    :param project_id: A project id
    :type project_id: str
    :param collection_id: A collection id
    :type collection_id: str
    :param term_id: A term id
    :type term_id: str
    :returns: A validation report that contains the possible errors
    :rtype: ValidationReport
    :raises EsgvocNotFoundError: If any of the provided ids is not found
    """
    checked_value = _check_value(value)
    with get_universe_session() as universe_session, _get_project_session_with_exception(project_id) as project_session:
        found_errors = _valid_value_against_given_term(
            checked_value, project_id, collection_id, term_id, universe_session, project_session
        )
    return ValidationReport(expression=checked_value, errors=found_errors)
349
+
350
+
351
def _valid_term_in_collection(
    value: str, project_id: str, collection_id: str, universe_session: Session, project_session: Session
) -> list[MatchingTerm]:
    """
    Find the terms of a collection that the value matches, with memoization.

    For a collection of plain terms, the matching term is searched directly in the
    database. For the other kinds, the value is validated against each term of the
    collection. The result is stored in `_VALID_TERM_IN_COLLECTION_CACHE`.

    :param value: The value to validate
    :type value: str
    :param project_id: A project id
    :type project_id: str
    :param collection_id: A collection id
    :type collection_id: str
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The list of terms that the value matches.
    :rtype: list[MatchingTerm]
    :raises EsgvocValueError: If the value is empty or blank
    :raises EsgvocNotFoundError: If the collection is not found
    """
    # Check the value before building the key, so an unset value is always reported
    # as an EsgvocValueError.
    value = _check_value(value)

    # [OPTIMIZATION]
    # The components of the key are delimited: a plain concatenation is ambiguous
    # ("ab" + "c" and "a" + "bc" give the same key) and could return the result
    # cached for other arguments.
    key = "\x00".join((value, project_id, collection_id))
    if key in _VALID_TERM_IN_COLLECTION_CACHE:
        return _VALID_TERM_IN_COLLECTION_CACHE[key]

    collection = _get_collection_in_project(collection_id, project_session)
    if not collection:
        msg = f"unable to find collection '{collection_id}'"
        raise EsgvocNotFoundError(msg)

    if collection.term_kind == TermKind.PLAIN:
        # Plain terms: a single query finds the term whose DRS name is the value.
        term_id_found = _search_plain_term_and_valid_value(value, collection_id, project_session)
        term_ids_found = [term_id_found] if term_id_found else []
    else:
        term_ids_found = _valid_value_against_all_terms_of_collection(
            value, collection, universe_session, project_session
        )

    result = [
        MatchingTerm(project_id=project_id, collection_id=collection_id, term_id=term_id_found)
        for term_id_found in term_ids_found
    ]
    _VALID_TERM_IN_COLLECTION_CACHE[key] = result
    return result
383
+
384
+
385
def valid_term_in_collection(value: str, project_id: str, collection_id: str) -> list[MatchingTerm]:
    """
    Check whether the given value represents a term of the given collection. The function
    returns the terms that the value matches.

    Behavior based on the nature of the term:
        - plain term: the function tries to match the value on the drs_name field.
        - pattern term: the function tries to match the value on the pattern field (regex).
        - composite term:
            - if the composite has a separator, the function splits the value according to
              the separator of the term, then tries to match the parts of the composite
              with the splits of the value.
            - if the composite has no separator, the function aggregates the parts of the
              composite so as to compare it, as a regex, to the value.

    If any of the provided ids (`project_id` or `collection_id`) is not found,
    the function raises an EsgvocNotFoundError.

    :param value: A value to be validated
    :type value: str
    :param project_id: A project id
    :type project_id: str
    :param collection_id: A collection id
    :type collection_id: str
    :returns: The list of terms that the value matches.
    :rtype: list[MatchingTerm]
    :raises EsgvocNotFoundError: If any of the provided ids is not found
    """
    with get_universe_session() as universe_session, _get_project_session_with_exception(project_id) as project_session:
        matching_terms = _valid_term_in_collection(
            value, project_id, collection_id, universe_session, project_session
        )
        return matching_terms
415
+
416
+
417
def _valid_term_in_project(
    value: str, project_id: str, universe_session: Session, project_session: Session
) -> list[MatchingTerm]:
    """
    Find the terms that the value matches, across all the collections of a project.

    :param value: The value to validate
    :type value: str
    :param project_id: A project id
    :type project_id: str
    :param universe_session: A session on the universe database
    :type universe_session: Session
    :param project_session: A session on the project database
    :type project_session: Session
    :returns: The list of terms that the value matches.
    :rtype: list[MatchingTerm]
    """
    matching_terms: list[MatchingTerm] = []
    for project_collection in _get_all_collections_in_project(project_session):
        matching_terms += _valid_term_in_collection(
            value, project_id, project_collection.id, universe_session, project_session
        )
    return matching_terms
425
+
426
+
427
def valid_term_in_project(value: str, project_id: str) -> list[MatchingTerm]:
    """
    Check whether the given value represents a term of the given project. The function
    returns the terms that the value matches.

    Behavior based on the nature of the term:
        - plain term: the function tries to match the value on the drs_name field.
        - pattern term: the function tries to match the value on the pattern field (regex).
        - composite term:
            - if the composite has a separator, the function splits the value according to
              the separator of the term, then tries to match the parts of the composite
              with the splits of the value.
            - if the composite has no separator, the function aggregates the parts of the
              composite so as to compare it, as a regex, to the value.

    If the `project_id` is not found, the function raises an EsgvocNotFoundError.

    :param value: A value to be validated
    :type value: str
    :param project_id: A project id
    :type project_id: str
    :returns: The list of terms that the value matches.
    :rtype: list[MatchingTerm]
    :raises EsgvocNotFoundError: If the `project_id` is not found
    """
    with get_universe_session() as universe_session, _get_project_session_with_exception(project_id) as project_session:
        matching_terms = _valid_term_in_project(value, project_id, universe_session, project_session)
        return matching_terms
454
+
455
+
456
def valid_term_in_all_projects(value: str) -> list[MatchingTerm]:
    """
    Check whether the given value represents a term in any of the projects. The function
    returns the terms that the value matches.

    Behavior based on the nature of the term:
        - plain term: the function tries to match the value on the drs_name field.
        - pattern term: the function tries to match the value on the pattern field (regex).
        - composite term:
            - if the composite has a separator, the function splits the value according to
              the separator of the term, then tries to match the parts of the composite
              with the splits of the value.
            - if the composite has no separator, the function aggregates the parts of the
              composite so as to compare it, as a regex, to the value.

    :param value: A value to be validated
    :type value: str
    :returns: The list of terms that the value matches.
    :rtype: list[MatchingTerm]
    """
    matching_terms: list[MatchingTerm] = []
    # The universe session is shared by the validations of all the projects.
    with get_universe_session() as universe_session:
        for current_project_id in get_all_projects():
            with _get_project_session_with_exception(current_project_id) as project_session:
                matching_terms += _valid_term_in_project(
                    value, current_project_id, universe_session, project_session
                )
    return matching_terms
482
+
483
+
484
def get_all_terms_in_collection(
    project_id: str, collection_id: str, selected_term_fields: Iterable[str] | None = None
) -> list[DataDescriptor]:
    """
    Gets all terms of the given collection of a project.
    This function performs an exact match on the `project_id` and `collection_id`,
    and does not search for similar or related projects and collections.
    If any of the provided ids (`project_id` or `collection_id`) is not found, the function
    returns an empty list.

    :param project_id: A project id
    :type project_id: str
    :param collection_id: A collection id
    :type collection_id: str
    :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the
        fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of term instances. Returns an empty list if no matches are found.
    :rtype: list[DataDescriptor]
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return []
    with connection.create_session() as session:
        collection = _get_collection_in_project(collection_id, session)
        if not collection:
            return []
        return _get_all_terms_in_collection(collection, selected_term_fields)
511
+
512
+
513
def _get_all_collections_in_project(session: Session) -> list[PCollection]:
    """
    Return all the collections of the project stored in the database of the session.

    When the collections can't be loaded, the function inspects the `pcollections` table
    with a raw SQL query (no model validation) in order to report the collections whose
    `term_kind` is empty or NULL.

    :param session: A session on the project database
    :type session: Session
    :returns: The collections of the project.
    :rtype: list[PCollection]
    :raises ValueError: If the collections can't be loaded and some of them have an \
    empty `term_kind`. The original exception is set as the cause.
    :raises Exception: The original exception is re-raised when the analysis finds \
    nothing or fails.
    """
    project = session.get(Project, constants.SQLITE_FIRST_PK)
    # Project can't be missing if session exists.
    try:
        return project.collections  # type: ignore
    except Exception as e:
        # Enhanced error context for collection retrieval failures
        import logging

        logger = logging.getLogger(__name__)
        # getattr: the handler must not fail, and hide the original error, if the
        # project happens to be missing.
        project_id = getattr(project, "id", None)
        logger.error(f"Failed to retrieve collections for project '{project_id}': {str(e)}")

        problematic_collections = []
        try:
            # Use raw SQL to inspect the collections without model validation.
            raw_query = text("""
                SELECT id, term_kind, data_descriptor_id
                FROM pcollections
                WHERE project_pk = :project_pk
            """)
            rows = session.execute(raw_query, {"project_pk": project.pk})  # type: ignore

            for collection_id, term_kind_value, data_descriptor_id in rows:
                # An empty or NULL term_kind indicates that the ingestion couldn't
                # determine the term kind.
                if term_kind_value == "" or term_kind_value is None:
                    problematic_collections.append((collection_id, data_descriptor_id))
                    msg = (
                        f"Collection '{collection_id}' has empty term_kind (data_descriptor: "
                        + f"{data_descriptor_id}) - CV ingestion failed to determine termkind"
                    )
                    logger.error(msg)
        except Exception as inner_e:
            logger.error(f"Failed to analyze problematic collections using raw SQL: {inner_e}")

        # Raised outside of the try block above: otherwise the diagnostic would be caught
        # by its own `except Exception` and reported as a failure of the analysis.
        if problematic_collections:
            error_details = [
                f"  • Collection '{col_id}' (data_descriptor: {data_desc}): EMPTY termkind"
                for col_id, data_desc in problematic_collections
            ]
            error_msg = f"Found {len(problematic_collections)} collections with empty term_kind:\n" + "\n".join(
                error_details
            )
            raise ValueError(error_msg) from e

        # Nothing more specific to report: propagate the original error.
        raise
565
+
566
+
567
def get_all_collections_in_project(project_id: str) -> list[str]:
    """
    Lists the ids of every collection of the given project.
    The `project_id` is matched exactly: similar or related projects are not considered.
    An unknown `project_id` yields an empty list.

    :param project_id: A project id
    :type project_id: str
    :returns: The collection ids of the project. Returns an empty list if no matches are found.
    :rtype: list[str]
    :raises ValueError: If the collections cannot be read from the project database. \
    The original error is chained as the cause.
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return list()
    try:
        with connection.create_session() as session:
            return [collection.id for collection in _get_all_collections_in_project(session)]
    except Exception as e:
        # Enhanced error context for project collection retrieval
        import logging

        logging.getLogger(__name__).error(f"Failed to get collections for project '{project_id}': {str(e)}")

        # Re-raise with enhanced context
        details = (
            f"Failed to retrieve collections for project '{project_id}'. "
            f"This may be due to invalid termkind values in the database. "
            f"Check the project database for collections with empty or invalid termkind values. "
            f"Original error: {str(e)}"
        )
        raise ValueError(details) from e
601
+
602
+
603
def _get_all_terms_in_collection(
    collection: PCollection, selected_term_fields: Iterable[str] | None
) -> list[DataDescriptor]:
    """
    Returns the list handed to `instantiate_pydantic_terms` along with the terms of the given
    collection and the selected term fields.
    """
    terms: list[DataDescriptor] = []
    instantiate_pydantic_terms(collection.terms, terms, selected_term_fields)
    return terms
609
+
610
+
611
def get_all_terms_in_project(
    project_id: str, selected_term_fields: Iterable[str] | None = None
) -> list[DataDescriptor]:
    """
    Returns every term of the given project, collection after collection.
    The `project_id` is matched exactly: similar or related projects are not considered.
    A term is unique within its collection, but it may have some synonyms in the project.
    An unknown `project_id` yields an empty list.

    :param project_id: A project id
    :type project_id: str
    :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
    fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of term instances. Returns an empty list if no matches are found.
    :rtype: list[DataDescriptor]
    """
    terms: list = []
    connection = _get_project_connection(project_id)
    if connection:
        with connection.create_session() as session:
            for collection in _get_all_collections_in_project(session):
                # Term may have some synonyms in a project.
                terms += _get_all_terms_in_collection(collection, selected_term_fields)
    return terms
637
+
638
+
639
def get_all_terms_in_all_projects(
    selected_term_fields: Iterable[str] | None = None,
) -> list[tuple[str, list[DataDescriptor]]]:
    """
    Returns the terms of every project, grouped by project.

    :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
    fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of tuples made of a project id and the term instances of that project.
    :rtype: list[tuple[str, list[DataDescriptor]]]
    """
    return [
        (project_id, get_all_terms_in_project(project_id, selected_term_fields))
        for project_id in get_all_projects()
    ]
657
+
658
+
659
def get_all_projects() -> list[str]:
    """
    Lists the ids of the projects registered in the current state of the service.

    :returns: A list of project ids.
    :rtype: list[str]
    """
    projects = service.current_state.projects
    return [*projects.keys()]
667
+
668
+
669
def _get_term_in_project(term_id: str, session: Session) -> PTerm | None:
    """
    Returns the first term whose id equals the given term id, or `None` if there is none.
    """
    by_id = select(PTerm).where(PTerm.id == term_id)
    # Term ids are not supposed to be unique within a project: keep the first one.
    return session.exec(by_id).first()
675
+
676
+
677
def get_term_in_project(
    project_id: str, term_id: str, selected_term_fields: Iterable[str] | None = None
) -> DataDescriptor | None:
    """
    Returns the first term found, in the given project, whose id is exactly the given term id.
    A term is unique within its collection, but it may have some synonyms in the project.
    Both `project_id` and `term_id` are matched exactly: similar or related projects and terms
    are not considered.
    If either `project_id` or `term_id` is not found, the function returns `None`.

    :param project_id: The id of the given project.
    :type project_id: str
    :param term_id: The id of a term to be found.
    :type term_id: str
    :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
    fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A term instance. Returns `None` if no match is found.
    :rtype: DataDescriptor | None
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return None
    with connection.create_session() as session:
        term_found = _get_term_in_project(term_id, session)
        if not term_found:
            return None
        return instantiate_pydantic_term(term_found, selected_term_fields)
706
+
707
+
708
def _get_term_in_collection(collection_id: str, term_id: str, session: Session) -> PTerm | None:
    """
    Returns the term matching both the given collection id and term id,
    or `None` if there is none.
    """
    in_collection = PCollection.id == collection_id
    with_term_id = PTerm.id == term_id
    query = select(PTerm).join(PCollection).where(in_collection, with_term_id)
    return session.exec(query).one_or_none()
713
+
714
+
715
def get_term_in_collection(
    project_id: str, collection_id: str, term_id: str, selected_term_fields: Iterable[str] | None = None
) -> DataDescriptor | None:
    """
    Returns the term of the given project and collection whose id is exactly the given term id.
    The `project_id`, `collection_id` and `term_id` are matched exactly: similar or related
    projects, collections and terms are not considered.
    If any of `project_id`, `collection_id` or `term_id` is not found,
    the function returns `None`.

    :param project_id: The id of the given project.
    :type project_id: str
    :param collection_id: The id of the given collection.
    :type collection_id: str
    :param term_id: The id of a term to be found.
    :type term_id: str
    :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
    fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A term instance. Returns `None` if no match is found.
    :rtype: DataDescriptor | None
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return None
    with connection.create_session() as session:
        term_found = _get_term_in_collection(collection_id, term_id, session)
        if not term_found:
            return None
        return instantiate_pydantic_term(term_found, selected_term_fields)
745
+
746
+
747
def _get_collection_in_project(collection_id: str, session: Session) -> PCollection | None:
    """
    Returns the collection whose id equals the given collection id, or `None` if there is none.
    """
    by_id = select(PCollection).where(PCollection.id == collection_id)
    return session.exec(by_id).one_or_none()
752
+
753
+
754
def get_collection_in_project(project_id: str, collection_id: str) -> tuple[str, dict] | None:
    """
    Returns the collection of the given project whose id is exactly the given collection id.
    Both `project_id` and `collection_id` are matched exactly: similar or related projects
    and collections are not considered.
    If either `project_id` or `collection_id` is not found, the function returns `None`.

    :param project_id: The id of the given project.
    :type project_id: str
    :param collection_id: The id of a collection to be found.
    :type collection_id: str
    :returns: A collection id and context. Returns `None` if no match is found.
    :rtype: tuple[str, dict] | None
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return None
    with connection.create_session() as session:
        collection_found = _get_collection_in_project(collection_id, session)
        if not collection_found:
            return None
        return (collection_found.id, collection_found.context)
777
+
778
+
779
def get_project(project_id: str) -> ProjectSpecs | None:
    """
    Finds a project and returns its specifications.
    The `project_id` is matched exactly: similar or related projects are not considered.
    An unknown `project_id` yields `None`.

    :param project_id: A project id to be found
    :type project_id: str
    :returns: The specs of the project found. Returns `None` if no matches are found.
    :rtype: ProjectSpecs | None
    :raises EsgvocDbError: If the specs of the project cannot be read.
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return None
    with connection.create_session() as session:
        project = session.get(Project, constants.SQLITE_FIRST_PK)
        try:
            # Project can't be missing if session exists.
            return ProjectSpecs(**project.specs, version=project.git_hash)  # type: ignore
        except Exception as e:
            raise EsgvocDbError(f"unable to read specs in project '{project_id}'") from e
802
+
803
+
804
def _get_collection_from_data_descriptor_in_project(data_descriptor_id: str, session: Session) -> list[PCollection]:
    """
    Returns all the collections whose data descriptor id equals the given data descriptor id.
    """
    by_data_descriptor = PCollection.data_descriptor_id == data_descriptor_id
    return session.exec(select(PCollection).where(by_data_descriptor)).all()
808
+
809
+
810
def get_collection_from_data_descriptor_in_project(project_id: str, data_descriptor_id: str) -> list[tuple[str, dict]]:
    """
    Returns the collections of the given project that correspond to the given data descriptor
    in the universe.
    Both `project_id` and `data_descriptor_id` are matched exactly: similar or related projects
    and data descriptors are not considered.
    If either `project_id` or `data_descriptor_id` is not found, or if no collection corresponds
    to the given data descriptor, the function returns an empty list.

    :param project_id: The id of the given project.
    :type project_id: str
    :param data_descriptor_id: The id of the given data descriptor.
    :type data_descriptor_id: str
    :returns: A list of collection ids and contexts. Returns an empty list if no matches are found.
    :rtype: list[tuple[str, dict]]
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return []
    pairs: list[tuple[str, dict]] = []
    with connection.create_session() as session:
        for collection in _get_collection_from_data_descriptor_in_project(data_descriptor_id, session):
            pairs.append((collection.id, collection.context))
    return pairs
832
+
833
+
834
def get_collection_from_data_descriptor_in_all_projects(data_descriptor_id: str) -> list[tuple[str, str, dict]]:
    """
    Returns the collections, across all projects, that correspond to the given data descriptor
    in the universe.
    The `data_descriptor_id` is matched exactly: similar or related data descriptors are not
    considered.
    If the `data_descriptor_id` is not found, or if no collection corresponds to the given
    data descriptor, the function returns an empty list.

    :param data_descriptor_id: The id of the given data descriptor.
    :type data_descriptor_id: str
    :returns: A list of tuples made of a project id, a collection id and the collection context. \
    Returns an empty list if no matches are found.
    :rtype: list[tuple[str, str, dict]]
    """
    matches: list = []
    for project_id in get_all_projects():
        matches.extend(
            (project_id, collection_id, context)
            for collection_id, context in get_collection_from_data_descriptor_in_project(
                project_id, data_descriptor_id
            )
        )
    return matches
857
+
858
+
859
def _get_term_from_universe_term_id_in_project(
    data_descriptor_id: str, universe_term_id: str, project_session: Session
) -> PTerm | None:
    """
    Returns the project term whose id equals the given universe term id and whose collection
    has the given data descriptor id, or `None` if there is none.
    """
    from_data_descriptor = PCollection.data_descriptor_id == data_descriptor_id
    same_id = PTerm.id == universe_term_id
    query = select(PTerm).join(PCollection).where(from_data_descriptor, same_id)
    return project_session.exec(query).one_or_none()
870
+
871
+
872
def get_term_from_universe_term_id_in_project(
    project_id: str, data_descriptor_id: str, universe_term_id: str, selected_term_fields: Iterable[str] | None = None
) -> tuple[str, DataDescriptor] | None:
    """
    Returns the term of the given project that corresponds to the given term in the universe.
    The `project_id`, `data_descriptor_id` and `universe_term_id` are matched exactly: similar
    or related projects, data descriptors and terms are not considered.
    If any of `project_id`, `data_descriptor_id` or `universe_term_id` is not found, or if no
    project term corresponds to the given universe term, the function returns `None`.

    :param project_id: The id of the given project.
    :type project_id: str
    :param data_descriptor_id: The id of the data descriptor that contains the given universe term.
    :type data_descriptor_id: str
    :param universe_term_id: The id of the given universe term.
    :type universe_term_id: str
    :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
    fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A collection id and the project term instance. Returns `None` if no matches are found.
    :rtype: tuple[str, DataDescriptor] | None
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return None
    with connection.create_session() as session:
        term_found = _get_term_from_universe_term_id_in_project(data_descriptor_id, universe_term_id, session)
        if not term_found:
            return None
        pydantic_term = instantiate_pydantic_term(term_found, selected_term_fields)
        return (term_found.collection.id, pydantic_term)
903
+
904
+
905
def get_term_from_universe_term_id_in_all_projects(
    data_descriptor_id: str, universe_term_id: str, selected_term_fields: Iterable[str] | None = None
) -> list[tuple[str, str, DataDescriptor]]:
    """
    Returns the terms, across all projects, that correspond to the given term in the universe.
    Both `data_descriptor_id` and `universe_term_id` are matched exactly: similar or related
    data descriptors and terms are not considered.
    If either `data_descriptor_id` or `universe_term_id` is not found, or if no project term
    corresponds to the given universe term, the function returns an empty list.

    :param data_descriptor_id: The id of the data descriptor that contains the given universe term.
    :type data_descriptor_id: str
    :param universe_term_id: The id of the given universe term.
    :type universe_term_id: str
    :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
    fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of tuples made of a project id, a collection id and the project term instance. \
    Returns an empty list if no matches are found.
    :rtype: list[tuple[str, str, DataDescriptor]]
    """
    matches: list[tuple[str, str, DataDescriptor]] = []
    for project_id in get_all_projects():
        term_found = get_term_from_universe_term_id_in_project(
            project_id, data_descriptor_id, universe_term_id, selected_term_fields
        )
        if not term_found:
            continue
        collection_id, term = term_found[0], term_found[1]
        matches.append((project_id, collection_id, term))
    return matches
936
+
937
+
938
def _find_collections_in_project(
    expression: str, session: Session, only_id: bool = False, limit: int | None = None, offset: int | None = None
) -> Sequence[PCollection]:
    """
    Builds the collection search statement on `PCollectionFTS5` for the given expression
    and runs it through `execute_match_statement`.
    """
    fts_query = select(PCollectionFTS5).where(generate_matching_condition(PCollectionFTS5, expression, only_id))
    ranked_query = handle_rank_limit_offset(fts_query, limit, offset)
    return execute_match_statement(expression, select(PCollection).from_statement(ranked_query), session)
945
+
946
+
947
def find_collections_in_project(
    expression: str, project_id: str, only_id: bool = False, limit: int | None = None, offset: int | None = None
) -> list[tuple[str, dict]]:
    """
    Runs a full text search, defined by `expression`, over the collections of the given project.

    An `expression` is made of one or more keywords separated by whitespaces. Keywords can be
    combined with the boolean operators `AND`, `OR` and `NOT` (case sensitive); when no operator
    is provided, an implicit `AND` is applied between each pair of keywords. There is no
    priority operator (parenthesis). A keyword ending with `*` defines a prefix, and an
    expression made of a single keyword is automatically handled as a prefix.

    The collection ids and contexts are returned sorted according to the bm25 ranking metric
    (list index `0` has the highest rank).
    The `project_id` is matched exactly: similar or related projects are not searched.
    If the `expression` does not hit any collection, or if the `project_id` does not exactly
    match the id of a project, the function returns an empty list.

    The search covers the collection specifications, unless `only_id` is `True`
    (default is `False`), in which case it is restricted to the collection ids.
    **At the moment, `only_id` is set to `True` as the collections haven't got any description.**

    :param expression: The full text search expression.
    :type expression: str
    :param project_id: The id of the given project.
    :type project_id: str
    :param only_id: Performs the search only on ids, otherwise on all the specifications.
    :type only_id: bool
    :param limit: Limit the number of returned items found. Returns all the items found if \
    `limit` is either `None`, zero or negative.
    :type limit: int | None
    :param offset: Skips `offset` number of items found. Ignored if `offset` is \
    either `None`, zero or negative.
    :type offset: int | None
    :returns: A list of collection ids and contexts. Returns an empty list if no matches are found.
    :rtype: list[tuple[str, dict]]
    :raises EsgvocValueError: If the `expression` cannot be interpreted.
    """
    connection = _get_project_connection(project_id)
    if not connection:
        return list()
    with connection.create_session() as session:
        collections_found = _find_collections_in_project(expression, session, only_id, limit, offset)
        return [(collection.id, collection.context) for collection in collections_found]
995
+
996
+
997
def _find_terms_in_collection(
    expression: str,
    collection_id: str,
    session: Session,
    only_id: bool = False,
    limit: int | None = None,
    offset: int | None = None,
) -> Sequence[PTerm]:
    """
    Builds the term search statement on `PTermFTS5`, restricted to the collection with the
    given id, and runs it through `execute_match_statement`.
    """
    matching_condition = generate_matching_condition(PTermFTS5, expression, only_id)
    in_collection = PCollection.id == collection_id
    fts_query = select(PTermFTS5).join(PCollection).where(in_collection, matching_condition)
    ranked_query = handle_rank_limit_offset(fts_query, limit, offset)
    return execute_match_statement(expression, select(PTerm).from_statement(ranked_query), session)
1010
+
1011
+
1012
def _find_terms_in_project(
    expression: str, session: Session, only_id: bool = False, limit: int | None = None, offset: int | None = None
) -> Sequence[PTerm]:
    """
    Builds the term search statement on `PTermFTS5` for the given expression
    and runs it through `execute_match_statement`.
    """
    fts_query = select(PTermFTS5).where(generate_matching_condition(PTermFTS5, expression, only_id))
    ranked_query = handle_rank_limit_offset(fts_query, limit, offset)
    return execute_match_statement(expression, select(PTerm).from_statement(ranked_query), session)
1019
+
1020
+
1021
def find_terms_in_collection(
    expression: str,
    project_id: str,
    collection_id: str,
    only_id: bool = False,
    limit: int | None = None,
    offset: int | None = None,
    selected_term_fields: Iterable[str] | None = None,
) -> list[DataDescriptor]:
    """
    Runs a full text search, defined by `expression`, over the terms of the given project
    and collection.

    An `expression` is made of one or more keywords separated by whitespaces. Keywords can be
    combined with the boolean operators `AND`, `OR` and `NOT` (case sensitive); when no operator
    is provided, an implicit `AND` is applied between each pair of keywords. There is no
    priority operator (parenthesis). A keyword ending with `*` defines a prefix, and an
    expression made of a single keyword is automatically handled as a prefix.

    The term instances are returned sorted according to the bm25 ranking metric
    (list index `0` has the highest rank).
    Both `project_id` and `collection_id` are matched exactly: similar or related projects
    and collections are not searched.
    If the `expression` does not hit any term, or if either `project_id` or `collection_id`
    is not found, the function returns an empty list.

    The search covers the term specifications, unless `only_id` is `True`
    (default is `False`), in which case it is restricted to the term ids.

    :param expression: The full text search expression.
    :type expression: str
    :param project_id: The id of the given project.
    :type project_id: str
    :param collection_id: The id of the given collection.
    :type collection_id: str
    :param only_id: Performs the search only on ids, otherwise on all the specifications.
    :type only_id: bool
    :param limit: Limit the number of returned items found. Returns all the items found if \
    `limit` is either `None`, zero or negative.
    :type limit: int | None
    :param offset: Skips `offset` number of items found. Ignored if `offset` is \
    either `None`, zero or negative.
    :type offset: int | None
    :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
    fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of term instances. Returns an empty list if no matches are found.
    :rtype: list[DataDescriptor]
    :raises EsgvocValueError: If the `expression` cannot be interpreted.
    """
    terms: list[DataDescriptor] = []
    connection = _get_project_connection(project_id)
    if not connection:
        return terms
    with connection.create_session() as session:
        instantiate_pydantic_terms(
            _find_terms_in_collection(expression, collection_id, session, only_id, limit, offset),
            terms,
            selected_term_fields,
        )
    return terms
1079
+
1080
+
1081
def find_terms_in_project(
    expression: str,
    project_id: str,
    only_id: bool = False,
    limit: int | None = None,
    offset: int | None = None,
    selected_term_fields: Iterable[str] | None = None,
) -> list[DataDescriptor]:
    """
    Runs a full text search, defined by `expression`, over the terms of the given project.

    An `expression` is made of one or more keywords separated by whitespaces. Keywords can be
    combined with the boolean operators `AND`, `OR` and `NOT` (case sensitive); when no operator
    is provided, an implicit `AND` is applied between each pair of keywords. There is no
    priority operator (parenthesis). A keyword ending with `*` defines a prefix, and an
    expression made of a single keyword is automatically handled as a prefix.

    The term instances are returned sorted according to the bm25 ranking metric
    (list index `0` has the highest rank).
    The `project_id` is matched exactly: similar or related projects are not searched.
    If the `expression` does not hit any term, or if the `project_id` is not found,
    the function returns an empty list.

    The search covers the term specifications, unless `only_id` is `True`
    (default is `False`), in which case it is restricted to the term ids.

    :param expression: The full text search expression.
    :type expression: str
    :param project_id: The id of the given project.
    :type project_id: str
    :param only_id: Performs the search only on ids, otherwise on all the specifications.
    :type only_id: bool
    :param limit: Limit the number of returned items found. Returns all the items found if \
    `limit` is either `None`, zero or negative.
    :type limit: int | None
    :param offset: Skips `offset` number of items found. Ignored if `offset` is \
    either `None`, zero or negative.
    :type offset: int | None
    :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
    fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of term instances. Returns an empty list if no matches are found.
    :rtype: list[DataDescriptor]
    :raises EsgvocValueError: If the `expression` cannot be interpreted.
    """
    terms: list[DataDescriptor] = []
    connection = _get_project_connection(project_id)
    if not connection:
        return terms
    with connection.create_session() as session:
        instantiate_pydantic_terms(
            _find_terms_in_project(expression, session, only_id, limit, offset),
            terms,
            selected_term_fields,
        )
    return terms
1135
+
1136
+
1137
def find_terms_in_all_projects(
    expression: str,
    only_id: bool = False,
    limit: int | None = None,
    offset: int | None = None,
    selected_term_fields: Iterable[str] | None = None,
) -> list[tuple[str, list[DataDescriptor]]]:
    """
    Runs a full text search, defined by `expression`, over the terms of all the projects.

    An `expression` is made of one or more keywords separated by whitespaces. Keywords can be
    combined with the boolean operators `AND`, `OR` and `NOT` (case sensitive); when no operator
    is provided, an implicit `AND` is applied between each pair of keywords. There is no
    priority operator (parenthesis). A keyword ending with `*` defines a prefix, and an
    expression made of a single keyword is automatically handled as a prefix.

    The function returns a list of project ids and term instances, sorted according to the
    bm25 ranking metric (list index `0` has the highest rank). Each project is searched
    separately with the same `limit` and `offset`, and the projects without any hit are left out.
    If the `expression` does not hit any term, the function returns an empty list.

    The search covers the term specifications, unless `only_id` is `True`
    (default is `False`), in which case it is restricted to the term ids.

    :param expression: The full text search expression.
    :type expression: str
    :param only_id: Performs the search only on ids, otherwise on all the specifications.
    :type only_id: bool
    :param limit: Limit the number of returned items found. Returns all the items found if \
    `limit` is either `None`, zero or negative.
    :type limit: int | None
    :param offset: Skips `offset` number of items found. Ignored if `offset` is \
    either `None`, zero or negative.
    :type offset: int | None
    :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
    fields of the terms are returned. If empty, selects the id and type fields.
    :type selected_term_fields: Iterable[str] | None
    :returns: A list of project ids and term instances. Returns an empty list if no matches are found.
    :rtype: list[tuple[str, list[DataDescriptor]]]
    :raises EsgvocValueError: If the `expression` cannot be interpreted.
    """
    hits: list[tuple[str, list[DataDescriptor]]] = []
    for project_id in get_all_projects():
        terms_found = find_terms_in_project(expression, project_id, only_id, limit, offset, selected_term_fields)
        if not terms_found:
            continue
        hits.append((project_id, terms_found))
    return hits
1186
+
1187
+
1188
def find_items_in_project(
    expression: str, project_id: str, only_id: bool = False, limit: int | None = None, offset: int | None = None
) -> list[Item]:
    """
    Find items, at the moment terms and collections, in the given project based on a full-text
    search defined by the given `expression`.

    The `expression` can be composed of one or multiple keywords.
    The keywords can be combined with boolean operators: `AND`,
    `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces;
    if no boolean operator is provided, whitespaces are handled as if there were
    an implicit AND operator between each pair of keywords. Note that this
    function does not provide any priority operator (parenthesis).
    Keywords can define prefixes when adding a `*` at the end of them.
    If the expression is composed of only one keyword, the function
    automatically defines it as a prefix.

    The function returns a list of item instances sorted according to the
    bm25 ranking metric (list index `0` has the highest rank).
    This function performs an exact match on the `project_id`,
    and does not search for similar or related projects.
    If the provided `expression` does not hit any item, or the provided `project_id` is not found,
    the function returns an empty list.

    The function searches for the `expression` in the term and collection specifications.
    However, if `only_id` is `True` (default is `False`), the search is restricted to the id of the
    terms and collections. **At the moment, `only_id` is set to `True` for the collections because
    they haven't got any description.**

    :param expression: The full text search expression.
    :type expression: str
    :param project_id: The id of the project to search in (exact match).
    :type project_id: str
    :param only_id: Performs the search only on ids, otherwise on all the specifications.
    :type only_id: bool
    :param limit: Limit the number of returned items found. Returns all items found if
                  `limit` is either `None`, zero or negative.
    :type limit: int | None
    :param offset: Skips `offset` number of items found. Ignored if `offset` is
                   either `None`, zero or negative.
    :type offset: int | None
    :returns: A list of item instances. Returns an empty list if no matches are found.
    :rtype: list[Item]
    :raises EsgvocValueError: If the `expression` cannot be interpreted.
    """
    # TODO: execute union query when it will be possible to compute parent of terms and collections.
    result: list[Item] = list()
    if connection := _get_project_connection(project_id):
        with connection.create_session() as session:
            processed_expression = process_expression(expression)

            # TODO: use specs when implemented!
            # Collections are always matched on their id, whatever the value of `only_id`.
            collection_column = col(PCollectionFTS5.id)
            if only_id:
                term_column = col(PTermFTS5.id)
            else:
                term_column = col(PTermFTS5.specs)  # type: ignore

            # The project id is embedded below as a SQL string literal: double any single
            # quote so that an unusual id cannot break out of the literal.
            quoted_project_id = project_id.replace("'", "''")

            # Collection rows: id, the constant kind 'collection', the project id and the rank.
            # NOTE(review): the third column reuses the `TYPE` alias; kept as is because the
            # consumer of these rows is not visible from here.
            collection_statement = select(
                PCollectionFTS5.id,
                text("'collection' AS TYPE"),
                text(f"'{quoted_project_id}' AS TYPE"),
                text("rank"),
            ).where(collection_column.match(processed_expression))

            # Term rows: id, the constant kind 'term', the id of the joined collection and the rank.
            term_statement = (
                select(PTermFTS5.id, text("'term' AS TYPE"), PCollection.id, text("rank"))
                .join(PCollection)
                .where(term_column.match(processed_expression))
            )

            result = execute_find_item_statements(
                session, processed_expression, collection_statement, term_statement, limit, offset
            )
    return result