esgvoc 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of esgvoc might be problematic. Click here for more details.

Files changed (87) hide show
  1. esgvoc/__init__.py +1 -1
  2. esgvoc/api/__init__.py +95 -60
  3. esgvoc/api/data_descriptors/__init__.py +50 -28
  4. esgvoc/api/data_descriptors/activity.py +3 -3
  5. esgvoc/api/data_descriptors/area_label.py +16 -1
  6. esgvoc/api/data_descriptors/branded_suffix.py +20 -0
  7. esgvoc/api/data_descriptors/branded_variable.py +12 -0
  8. esgvoc/api/data_descriptors/consortium.py +14 -13
  9. esgvoc/api/data_descriptors/contact.py +5 -0
  10. esgvoc/api/data_descriptors/conventions.py +6 -0
  11. esgvoc/api/data_descriptors/creation_date.py +5 -0
  12. esgvoc/api/data_descriptors/data_descriptor.py +14 -9
  13. esgvoc/api/data_descriptors/data_specs_version.py +5 -0
  14. esgvoc/api/data_descriptors/date.py +1 -1
  15. esgvoc/api/data_descriptors/directory_date.py +1 -1
  16. esgvoc/api/data_descriptors/experiment.py +13 -11
  17. esgvoc/api/data_descriptors/forcing_index.py +1 -1
  18. esgvoc/api/data_descriptors/frequency.py +3 -3
  19. esgvoc/api/data_descriptors/further_info_url.py +5 -0
  20. esgvoc/api/data_descriptors/grid_label.py +2 -2
  21. esgvoc/api/data_descriptors/horizontal_label.py +15 -1
  22. esgvoc/api/data_descriptors/initialisation_index.py +1 -1
  23. esgvoc/api/data_descriptors/institution.py +8 -5
  24. esgvoc/api/data_descriptors/known_branded_variable.py +23 -0
  25. esgvoc/api/data_descriptors/license.py +3 -3
  26. esgvoc/api/data_descriptors/mip_era.py +1 -1
  27. esgvoc/api/data_descriptors/model_component.py +1 -1
  28. esgvoc/api/data_descriptors/obs_type.py +5 -0
  29. esgvoc/api/data_descriptors/organisation.py +1 -1
  30. esgvoc/api/data_descriptors/physic_index.py +1 -1
  31. esgvoc/api/data_descriptors/product.py +2 -2
  32. esgvoc/api/data_descriptors/publication_status.py +5 -0
  33. esgvoc/api/data_descriptors/realisation_index.py +1 -1
  34. esgvoc/api/data_descriptors/realm.py +1 -1
  35. esgvoc/api/data_descriptors/region.py +5 -0
  36. esgvoc/api/data_descriptors/resolution.py +3 -3
  37. esgvoc/api/data_descriptors/source.py +9 -5
  38. esgvoc/api/data_descriptors/source_type.py +1 -1
  39. esgvoc/api/data_descriptors/table.py +3 -2
  40. esgvoc/api/data_descriptors/temporal_label.py +15 -1
  41. esgvoc/api/data_descriptors/time_range.py +4 -3
  42. esgvoc/api/data_descriptors/title.py +5 -0
  43. esgvoc/api/data_descriptors/tracking_id.py +5 -0
  44. esgvoc/api/data_descriptors/variable.py +25 -12
  45. esgvoc/api/data_descriptors/variant_label.py +3 -3
  46. esgvoc/api/data_descriptors/vertical_label.py +14 -0
  47. esgvoc/api/project_specs.py +120 -4
  48. esgvoc/api/projects.py +733 -505
  49. esgvoc/api/py.typed +0 -0
  50. esgvoc/api/report.py +12 -8
  51. esgvoc/api/search.py +168 -98
  52. esgvoc/api/universe.py +368 -157
  53. esgvoc/apps/drs/constants.py +1 -1
  54. esgvoc/apps/drs/generator.py +51 -69
  55. esgvoc/apps/drs/report.py +60 -15
  56. esgvoc/apps/drs/validator.py +60 -71
  57. esgvoc/apps/jsg/cmip6_template.json +74 -0
  58. esgvoc/apps/jsg/cmip6plus_template.json +74 -0
  59. esgvoc/apps/jsg/json_schema_generator.py +185 -0
  60. esgvoc/apps/py.typed +0 -0
  61. esgvoc/cli/config.py +500 -0
  62. esgvoc/cli/drs.py +3 -2
  63. esgvoc/cli/find.py +138 -0
  64. esgvoc/cli/get.py +46 -38
  65. esgvoc/cli/main.py +10 -3
  66. esgvoc/cli/status.py +27 -18
  67. esgvoc/cli/valid.py +10 -15
  68. esgvoc/core/constants.py +1 -1
  69. esgvoc/core/db/__init__.py +2 -4
  70. esgvoc/core/db/connection.py +5 -3
  71. esgvoc/core/db/models/project.py +57 -15
  72. esgvoc/core/db/models/universe.py +49 -10
  73. esgvoc/core/db/project_ingestion.py +79 -65
  74. esgvoc/core/db/universe_ingestion.py +71 -40
  75. esgvoc/core/exceptions.py +33 -0
  76. esgvoc/core/logging_handler.py +24 -2
  77. esgvoc/core/repo_fetcher.py +61 -59
  78. esgvoc/core/service/data_merger.py +47 -34
  79. esgvoc/core/service/state.py +107 -83
  80. {esgvoc-0.3.0.dist-info → esgvoc-1.0.0.dist-info}/METADATA +7 -20
  81. esgvoc-1.0.0.dist-info/RECORD +95 -0
  82. esgvoc/api/_utils.py +0 -53
  83. esgvoc/core/logging.conf +0 -21
  84. esgvoc-0.3.0.dist-info/RECORD +0 -78
  85. {esgvoc-0.3.0.dist-info → esgvoc-1.0.0.dist-info}/WHEEL +0 -0
  86. {esgvoc-0.3.0.dist-info → esgvoc-1.0.0.dist-info}/entry_points.txt +0 -0
  87. {esgvoc-0.3.0.dist-info → esgvoc-1.0.0.dist-info}/licenses/LICENSE.txt +0 -0
esgvoc/api/universe.py CHANGED
@@ -1,183 +1,56 @@
1
- from typing import Sequence, Iterable
1
+ from typing import Iterable, Sequence
2
2
 
3
- from esgvoc.api._utils import (get_universe_session,
4
- instantiate_pydantic_terms)
5
- from esgvoc.api.search import SearchSettings, _create_str_comparison_expression
6
- from esgvoc.api.data_descriptors.data_descriptor import DataDescriptor
7
- from esgvoc.core.db.models.universe import UDataDescriptor, UTerm
8
- from sqlmodel import Session, select
9
-
10
-
11
- def _find_terms_in_data_descriptor(data_descriptor_id: str,
12
- term_id: str,
13
- session: Session,
14
- settings: SearchSettings|None) -> Sequence[UTerm]:
15
- """Settings only apply on the term_id comparison."""
16
- where_expression = _create_str_comparison_expression(field=UTerm.id,
17
- value=term_id,
18
- settings=settings)
19
- statement = select(UTerm).join(UDataDescriptor).where(UDataDescriptor.id==data_descriptor_id,
20
- where_expression)
21
- results = session.exec(statement)
22
- result = results.all()
23
- return result
24
-
25
-
26
- def find_terms_in_data_descriptor(data_descriptor_id: str,
27
- term_id: str,
28
- settings: SearchSettings|None = None) \
29
- -> list[DataDescriptor]:
30
- """
31
- Finds one or more terms in the given data descriptor based on the specified search settings.
32
- This function performs an exact match on the `data_descriptor_id` and
33
- does **not** search for similar or related descriptors.
34
- The given `term_id` is searched according to the search type specified in
35
- the parameter `settings`,
36
- which allows a flexible matching (e.g., `LIKE` may return multiple results).
37
- If the parameter `settings` is `None`, this function performs an exact match on the `term_id`.
38
- If any of the provided ids (`data_descriptor_id` or `term_id`) is not found, the function
39
- returns an empty list.
3
+ from sqlalchemy import text
4
+ from sqlmodel import Session, col, select
40
5
 
41
- Behavior based on search type:
42
- - `EXACT` and absence of `settings`: returns zero or one term instance in the list.
43
- - `REGEX`, `LIKE`, `STARTS_WITH` and `ENDS_WITH`: returns zero, one or more term \
44
- instances in the list.
45
-
46
- :param data_descriptor_id: A data descriptor id
47
- :type data_descriptor_id: str
48
- :param term_id: A term id to be found
49
- :type term_id: str
50
- :param settings: The search settings
51
- :type settings: SearchSettings|None
52
- :returns: A list of term instances. Returns an empty list if no matches are found.
53
- :rtype: list[DataDescriptor]
54
- """
55
- result: list[DataDescriptor] = list()
56
- with get_universe_session() as session:
57
- terms = _find_terms_in_data_descriptor(data_descriptor_id, term_id, session, settings)
58
- instantiate_pydantic_terms(terms, result, settings.selected_term_fields if settings else None)
59
- return result
60
-
61
-
62
- def _find_terms_in_universe(term_id: str,
63
- session: Session,
64
- settings: SearchSettings|None) -> Sequence[UTerm]:
65
- where_expression = _create_str_comparison_expression(field=UTerm.id,
66
- value=term_id,
67
- settings=settings)
68
- statement = select(UTerm).where(where_expression)
69
- results = session.exec(statement).all()
70
- return results
71
-
72
-
73
- def find_terms_in_universe(term_id: str,
74
- settings: SearchSettings|None = None) \
75
- -> list[DataDescriptor]:
76
- """
77
- Finds one or more terms of the universe.
78
- The given `term_id` is searched according to the search type specified in
79
- the parameter `settings`,
80
- which allows a flexible matching (e.g., `LIKE` may return multiple results).
81
- If the parameter `settings` is `None`, this function performs an exact match on the `term_id`.
82
- Terms are unique within a data descriptor but may have some synonyms in the universe.
83
- If the provided `term_id` is not found, the function returns an empty list.
84
-
85
- :param term_id: A term id to be found
86
- :type term_id: str
87
- :param settings: The search settings
88
- :type settings: SearchSettings|None
89
- :returns: A list of term instances. Returns an empty list if no matches are found.
90
- :rtype: list[DataDescriptor]
91
- """
92
- result: list[DataDescriptor] = list()
93
- with get_universe_session() as session:
94
- terms = _find_terms_in_universe(term_id, session, settings)
95
- instantiate_pydantic_terms(terms, result, settings.selected_term_fields if settings else None)
96
- return result
6
+ from esgvoc.api.data_descriptors.data_descriptor import DataDescriptor
7
+ from esgvoc.api.search import (
8
+ Item,
9
+ execute_find_item_statements,
10
+ execute_match_statement,
11
+ generate_matching_condition,
12
+ get_universe_session,
13
+ handle_rank_limit_offset,
14
+ instantiate_pydantic_term,
15
+ instantiate_pydantic_terms,
16
+ process_expression,
17
+ )
18
+ from esgvoc.core.db.models.universe import UDataDescriptor, UDataDescriptorFTS5, UTerm, UTermFTS5
97
19
 
98
20
 
99
21
  def _get_all_terms_in_data_descriptor(data_descriptor: UDataDescriptor,
100
- selected_term_fields: Iterable[str]|None) -> list[DataDescriptor]:
22
+ selected_term_fields: Iterable[str] | None) -> list[DataDescriptor]:
101
23
  result: list[DataDescriptor] = list()
102
24
  instantiate_pydantic_terms(data_descriptor.terms, result, selected_term_fields)
103
25
  return result
104
26
 
105
27
 
106
- def _find_data_descriptors_in_universe(data_descriptor_id: str,
107
- session: Session,
108
- settings: SearchSettings|None) -> Sequence[UDataDescriptor]:
109
- where_expression = _create_str_comparison_expression(field=UDataDescriptor.id,
110
- value=data_descriptor_id,
111
- settings=settings)
112
- statement = select(UDataDescriptor).where(where_expression)
113
- results = session.exec(statement)
114
- result = results.all()
115
- return result
116
-
117
-
118
28
  def get_all_terms_in_data_descriptor(data_descriptor_id: str,
119
- selected_term_fields: Iterable[str]|None = None) \
29
+ selected_term_fields: Iterable[str] | None = None) \
120
30
  -> list[DataDescriptor]:
121
31
  """
122
32
  Gets all the terms of the given data descriptor.
123
- This function performs an exact match on the `data_descriptor_id` and does **not** search
33
+ This function performs an exact match on the `data_descriptor_id` and does not search
124
34
  for similar or related descriptors.
125
35
  If the provided `data_descriptor_id` is not found, the function returns an empty list.
126
36
 
127
37
  :param data_descriptor_id: A data descriptor id
128
38
  :type data_descriptor_id: str
129
39
  :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
130
- fields of the terms are returned.
131
- :type selected_term_fields: Iterable[str]|None
40
+ fields of the terms are returned. If empty, selects the id and type fields.
41
+ :type selected_term_fields: Iterable[str] | None
132
42
  :returns: a list of term instances. Returns an empty list if no matches are found.
133
43
  :rtype: list[DataDescriptor]
134
44
  """
135
45
  with get_universe_session() as session:
136
- data_descriptors = _find_data_descriptors_in_universe(data_descriptor_id,
137
- session,
138
- None)
139
- if data_descriptors:
140
- data_descriptor = data_descriptors[0]
46
+ data_descriptor = _get_data_descriptor_in_universe(data_descriptor_id, session)
47
+ if data_descriptor:
141
48
  result = _get_all_terms_in_data_descriptor(data_descriptor, selected_term_fields)
142
49
  else:
143
50
  result = list()
144
51
  return result
145
52
 
146
53
 
147
- def find_data_descriptors_in_universe(data_descriptor_id: str,
148
- settings: SearchSettings|None = None) \
149
- -> list[dict]:
150
- """
151
- Finds one or more data descriptor of the universe, based on the specified search settings.
152
- The given `data_descriptor_id` is searched according to the search type specified in
153
- the parameter `settings`,
154
- which allows a flexible matching (e.g., `LIKE` may return multiple results).
155
- If the parameter `settings` is `None`, this function performs an exact match on
156
- the `data_descriptor_id`.
157
- If the provided `data_descriptor_id` is not found, the function returns an empty list.
158
-
159
- Behavior based on search type:
160
- - `EXACT` and absence of `settings`: returns zero or one data descriptor context in the list.
161
- - `REGEX`, `LIKE`, `STARTS_WITH` and `ENDS_WITH`: returns zero, one or more \
162
- data descriptor contexts in the list.
163
-
164
- :param data_descriptor_id: A data descriptor id to be found
165
- :type data_descriptor_id: str
166
- :param settings: The search settings
167
- :type settings: SearchSettings|None
168
- :returns: A list of data descriptor contexts. Returns an empty list if no matches are found.
169
- :rtype: list[dict]
170
- """
171
- result = list()
172
- with get_universe_session() as session:
173
- data_descriptors = _find_data_descriptors_in_universe(data_descriptor_id,
174
- session,
175
- settings)
176
- for data_descriptor in data_descriptors:
177
- result.append(data_descriptor.context)
178
- return result
179
-
180
-
181
54
  def _get_all_data_descriptors_in_universe(session: Session) -> Sequence[UDataDescriptor]:
182
55
  statement = select(UDataDescriptor)
183
56
  data_descriptors = session.exec(statement)
@@ -200,14 +73,14 @@ def get_all_data_descriptors_in_universe() -> list[str]:
200
73
  return result
201
74
 
202
75
 
203
- def get_all_terms_in_universe(selected_term_fields: Iterable[str]|None = None) -> list[DataDescriptor]:
76
+ def get_all_terms_in_universe(selected_term_fields: Iterable[str] | None = None) -> list[DataDescriptor]:
204
77
  """
205
78
  Gets all the terms of the universe.
206
79
  Terms are unique within a data descriptor but may have some synonyms in the universe.
207
80
 
208
81
  :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
209
- fields of the terms are returned.
210
- :type selected_term_fields: Iterable[str]|None
82
+ fields of the terms are returned. If empty, selects the id and type fields.
83
+ :type selected_term_fields: Iterable[str] | None
211
84
  :returns: A list of term instances.
212
85
  :rtype: list[DataDescriptor]
213
86
  """
@@ -221,7 +94,345 @@ def get_all_terms_in_universe(selected_term_fields: Iterable[str]|None = None) -
221
94
  return result
222
95
 
223
96
 
224
- if __name__ == "__main__":
225
- settings = SearchSettings()
226
- settings.selected_term_fields = ('id',)
227
- print(find_terms_in_data_descriptor('institution', 'ipsl', settings))
97
+ def _get_term_in_data_descriptor(data_descriptor_id: str, term_id: str, session: Session) \
98
+ -> UTerm | None:
99
+ statement = select(UTerm).join(UDataDescriptor).where(UDataDescriptor.id == data_descriptor_id,
100
+ UTerm.id == term_id)
101
+ results = session.exec(statement)
102
+ result = results.one_or_none()
103
+ return result
104
+
105
+
106
+ def get_term_in_data_descriptor(data_descriptor_id: str,
107
+ term_id: str,
108
+ selected_term_fields: Iterable[str] | None = None) \
109
+ -> DataDescriptor | None:
110
+ """
111
+ Returns the term, in the given data descriptor, whose id corresponds exactly to the given term id.
112
+ This function performs an exact match on the `term_id` and the `data_descriptor_id` and does
113
+ not search for similar or related terms and data descriptors.
114
+ If the provided `term_id` is not found, the function returns `None`.
115
+
116
+ :param data_descriptor_id: The id of the given data descriptor.
117
+ :type data_descriptor_id: str
118
+ :param term_id: The id of a term to be found.
119
+ :type term_id: str
120
+ :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
121
+ fields of the terms are returned. If empty, selects the id and type fields.
122
+ :type selected_term_fields: Iterable[str] | None
123
+ :returns: A term instance. Returns `None` if no match is found.
124
+ :rtype: DataDescriptor | None
125
+ """
126
+ with get_universe_session() as session:
127
+ term_found = _get_term_in_data_descriptor(data_descriptor_id, term_id, session)
128
+ if term_found:
129
+ result = instantiate_pydantic_term(term_found, selected_term_fields)
130
+ else:
131
+ result = None
132
+ return result
133
+
134
+
135
+ def _get_term_in_universe(term_id: str, session: Session) -> UTerm | None:
136
+ statement = select(UTerm).where(UTerm.id == term_id)
137
+ results = session.exec(statement)
138
+ result = results.first() # Term ids are not supposed to be unique within the universe.
139
+ return result
140
+
141
+
142
+ def get_term_in_universe(term_id: str,
143
+ selected_term_fields: Iterable[str] | None = None) -> DataDescriptor | None:
144
+ """
145
+ Returns the first occurrence of the terms, in the universe, whose id corresponds exactly to
146
+ the given term id.
147
+ Terms are unique within a data descriptor but may have some synonyms in the universe.
148
+ This function performs an exact match on the `term_id` and does not search
149
+ for similar or related terms. If the provided `term_id` is not found, the function returns `None`.
150
+
151
+ :param term_id: The id of a term to be found.
152
+ :type term_id: str
153
+ :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
154
+ fields of the terms are returned. If empty, selects the id and type fields.
155
+ :type selected_term_fields: Iterable[str] | None
156
+ :returns: A term instance. Returns `None` if no match is found.
157
+ :rtype: DataDescriptor | None
158
+ """
159
+ with get_universe_session() as session:
160
+ term_found = _get_term_in_universe(term_id, session)
161
+ if term_found:
162
+ result = instantiate_pydantic_term(term_found, selected_term_fields)
163
+ else:
164
+ result = None
165
+ return result
166
+
167
+
168
+ def _get_data_descriptor_in_universe(data_descriptor_id: str, session: Session) -> UDataDescriptor | None:
169
+ statement = select(UDataDescriptor).where(UDataDescriptor.id == data_descriptor_id)
170
+ results = session.exec(statement)
171
+ result = results.one_or_none()
172
+ return result
173
+
174
+
175
+ def get_data_descriptor_in_universe(data_descriptor_id: str) -> tuple[str, dict] | None:
176
+ """
177
+ Returns the id and the context of the data descriptor, in the universe, whose id corresponds
178
+ exactly to the given data descriptor id.
179
+ This function performs an exact match on the `data_descriptor_id` and does not
180
+ search for similar or related data descriptors.
181
+ If the provided `data_descriptor_id` is not found, the function returns `None`.
182
+
183
+ :param data_descriptor_id: An id of a data descriptor to be found.
184
+ :type data_descriptor_id: str
185
+ :returns: The data descriptor id and context. Returns `None` if no match is found.
186
+ :rtype: tuple[str, dict] | None
187
+ """
188
+ with get_universe_session() as session:
189
+ data_descriptor_found = _get_data_descriptor_in_universe(data_descriptor_id, session)
190
+ if data_descriptor_found:
191
+ result = data_descriptor_found.id, data_descriptor_found.context
192
+ else:
193
+ result = None
194
+ return result
195
+
196
+
197
+ def _find_data_descriptors_in_universe(expression: str,
198
+ session: Session,
199
+ only_id: bool = False,
200
+ limit: int | None = None,
201
+ offset: int | None = None) -> Sequence[UDataDescriptor]:
202
+ matching_condition = generate_matching_condition(UDataDescriptorFTS5, expression, only_id)
203
+ tmp_statement = select(UDataDescriptorFTS5).where(matching_condition)
204
+ statement = select(UDataDescriptor).from_statement(handle_rank_limit_offset(tmp_statement,
205
+ limit, offset))
206
+ return execute_match_statement(expression, statement, session)
207
+
208
+
209
+ def find_data_descriptors_in_universe(expression: str,
210
+ only_id: bool = False,
211
+ limit: int | None = None,
212
+ offset: int | None = None) -> list[tuple[str, dict]]:
213
+ """
214
+ Find data descriptors in the universe based on a full text search defined by the given `expression`.
215
+ The `expression` can be composed of one or multiple keywords.
216
+ The keywords can be combined with boolean operators: `AND`,
217
+ `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
218
+ if no boolean operator is provided, whitespaces are handled as if there were
219
+ an implicit AND operator between each pair of keywords. Note that this
220
+ function does not provide any priority operator (parentheses).
221
+ Keywords can define prefixes when adding a `*` at the end of them.
222
+ If the expression is composed of only one keyword, the function
223
+ automatically defines it as a prefix.
224
+ The function returns a list of data descriptor ids and contexts, sorted according to the
225
+ bm25 ranking metric (list index `0` has the highest rank).
226
+ If the provided `expression` does not hit any data descriptor, the function returns an empty list.
227
+ The function searches for the `expression` in the data descriptor specifications.
228
+ However, if `only_id` is `True` (default is `False`), the search is restricted to the id of the
229
+ data descriptors. **At the moment, `only_id` is set to `True` as the data descriptors
230
+ haven't got any description.**
231
+
232
+ :param expression: The full text search expression.
233
+ :type expression: str
234
+ :param only_id: Performs the search only on ids, otherwise on all the specifications.
235
+ :type only_id: bool
236
+ :param limit: Limit the number of returned items found. Returns all items found if \
237
+ `limit` is either `None`, zero or negative.
238
+ :type limit: int | None
239
+ :param offset: Skips `offset` number of items found. Ignored if `offset` is \
240
+ either `None`, zero or negative.
241
+ :type offset: int | None
242
+ :returns: A list of data descriptor ids and contexts. Returns an empty list if no matches are found.
243
+ :rtype: list[tuple[str, dict]]
244
+ :raises EsgvocValueError: If the `expression` cannot be interpreted.
245
+ """
246
+ result: list[tuple[str, dict]] = list()
247
+ with get_universe_session() as session:
248
+ data_descriptors_found = _find_data_descriptors_in_universe(expression, session, only_id,
249
+ limit, offset)
250
+ if data_descriptors_found:
251
+ for data_descriptor_found in data_descriptors_found:
252
+ result.append((data_descriptor_found.id, data_descriptor_found.context))
253
+ return result
254
+
255
+
256
+ def _find_terms_in_universe(expression: str, session: Session,
257
+ only_id: bool = False,
258
+ limit: int | None = None,
259
+ offset: int | None = None) -> Sequence[UTerm]:
260
+ matching_condition = generate_matching_condition(UTermFTS5, expression, only_id)
261
+ tmp_statement = select(UTermFTS5).where(matching_condition)
262
+ statement = select(UTerm).from_statement(handle_rank_limit_offset(tmp_statement, limit, offset))
263
+ return execute_match_statement(expression, statement, session)
264
+
265
+
266
+ def find_terms_in_universe(expression: str,
267
+ only_id: bool = False,
268
+ limit: int | None = None,
269
+ offset: int | None = None,
270
+ selected_term_fields: Iterable[str] | None = None) -> list[DataDescriptor]:
271
+ """
272
+ Find terms in the universe based on a full-text search defined by the given `expression`.
273
+ The `expression` can be composed of one or multiple keywords.
274
+ The keywords can be combined with boolean operators: `AND`,
274
+ `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
275
+ if no boolean operator is provided, whitespaces are handled as if there were
276
+ an implicit AND operator between each pair of keywords. Note that this
277
+ function does not provide any priority operator (parentheses).
279
+ Keywords can define prefixes when adding a `*` at the end of them.
280
+ If the expression is composed of only one keyword, the function
281
+ automatically defines it as a prefix.
282
+ The function returns a list of term instances sorted according to the
283
+ bm25 ranking metric (list index `0` has the highest rank).
284
+ If the provided `expression` does not hit any term, the function returns an empty list.
285
+ The function searches for the `expression` in the term specifications.
286
+ However, if `only_id` is `True` (default is `False`), the search is restricted to the id of the terms.
287
+
288
+ :param expression: The full text search expression.
289
+ :type expression: str
290
+ :param only_id: Performs the search only on ids, otherwise on all the specifications.
291
+ :type only_id: bool
292
+ :param limit: Limit the number of returned items found. Returns all items found if \
293
+ `limit` is either `None`, zero or negative.
294
+ :type limit: int | None
295
+ :param offset: Skips `offset` number of items found. Ignored if `offset` is \
296
+ either `None`, zero or negative.
297
+ :type offset: int | None
298
+ :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
299
+ fields of the terms are returned. If empty, selects the id and type fields.
300
+ :type selected_term_fields: Iterable[str] | None
301
+ :returns: A list of term instances. Returns an empty list if no matches are found.
302
+ :rtype: list[DataDescriptor]
303
+ :raises EsgvocValueError: If the `expression` cannot be interpreted.
304
+ """
305
+ result: list[DataDescriptor] = list()
306
+ with get_universe_session() as session:
307
+ uterms_found = _find_terms_in_universe(expression, session, only_id, limit, offset)
308
+ if uterms_found:
309
+ instantiate_pydantic_terms(uterms_found, result, selected_term_fields)
310
+ return result
311
+
312
+
313
+ def _find_terms_in_data_descriptor(expression: str, data_descriptor_id: str,
314
+ session: Session,
315
+ only_id: bool = False,
316
+ limit: int | None = None,
317
+ offset: int | None = None) -> Sequence[UTerm]:
318
+ matching_condition = generate_matching_condition(UTermFTS5, expression, only_id)
319
+ where_condition = UDataDescriptor.id == data_descriptor_id, matching_condition
320
+ tmp_statement = select(UTermFTS5).join(UDataDescriptor).where(*where_condition)
321
+ statement = select(UTerm).from_statement(handle_rank_limit_offset(tmp_statement, limit, offset))
322
+ return execute_match_statement(expression, statement, session)
323
+
324
+
325
+ def find_terms_in_data_descriptor(expression: str, data_descriptor_id: str,
326
+ only_id: bool = False,
327
+ limit: int | None = None,
328
+ offset: int | None = None,
329
+ selected_term_fields: Iterable[str] | None = None) \
330
+ -> list[DataDescriptor]:
331
+ """
332
+ Find terms in the given data descriptor based on a full-text search defined by the given `expression`.
333
+ The `expression` can be composed of one or multiple keywords.
334
+ The keywords can be combined with boolean operators: `AND`,
334
+ `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
335
+ if no boolean operator is provided, whitespaces are handled as if there were
336
+ an implicit AND operator between each pair of keywords. Note that this
337
+ function does not provide any priority operator (parentheses).
339
+ Keywords can define prefixes when adding a `*` at the end of them.
340
+ If the expression is composed of only one keyword, the function
341
+ automatically defines it as a prefix.
342
+ The function returns a list of term instances sorted according to the
343
+ bm25 ranking metric (list index `0` has the highest rank).
344
+ This function performs an exact match on the `data_descriptor_id`,
345
+ and does not search for similar or related data descriptors.
346
+ If the provided `expression` does not hit any term or the given `data_descriptor_id` does not
347
+ match exactly to an id of a data descriptor, the function returns an empty list.
348
+ The function searches for the `expression` in the term specifications.
349
+ However, if `only_id` is `True` (default is `False`), the search is restricted to the id of the terms.
350
+
351
+ :param expression: The full text search expression.
352
+ :type expression: str
353
+ :param only_id: Performs the search only on ids, otherwise on all the specifications.
354
+ :type only_id: bool
355
+ :param limit: Limit the number of returned items found. Returns all items found if \
356
+ `limit` is either `None`, zero or negative.
357
+ :type limit: int | None
358
+ :param offset: Skips `offset` number of items found. Ignored if `offset` is \
359
+ either `None`, zero or negative.
360
+ :type offset: int | None
361
+ :param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
362
+ fields of the terms are returned. If empty, selects the id and type fields.
363
+ :type selected_term_fields: Iterable[str] | None
364
+ :returns: A list of term instances. Returns an empty list if no matches are found.
365
+ :rtype: list[DataDescriptor]
366
+ :raises EsgvocValueError: If the `expression` cannot be interpreted.
367
+ """
368
+ result: list[DataDescriptor] = list()
369
+ with get_universe_session() as session:
370
+ uterms_found = _find_terms_in_data_descriptor(expression, data_descriptor_id,
371
+ session, only_id, limit, offset)
372
+ if uterms_found:
373
+ instantiate_pydantic_terms(uterms_found, result, selected_term_fields)
374
+ return result
375
+
376
+
377
+ def find_items_in_universe(expression: str,
378
+ only_id: bool = False,
379
+ limit: int | None = None,
380
+ offset: int | None = None) -> list[Item]:
381
+ """
382
+ Find items, at the moment terms and data descriptors, in the universe based on a full-text
383
+ search defined by the given `expression`.
384
+ The `expression` can be composed of one or multiple keywords.
385
+ The keywords can be combined with boolean operators: `AND`,
385
+ `OR` and `NOT` (case sensitive). The keywords are separated by whitespaces,
386
+ if no boolean operator is provided, whitespaces are handled as if there were
387
+ an implicit AND operator between each pair of keywords. Note that this
388
+ function does not provide any priority operator (parentheses).
390
+ Keywords can define prefixes when adding a `*` at the end of them.
391
+ If the expression is composed of only one keyword, the function
392
+ automatically defines it as a prefix.
393
+ The function returns a list of item instances sorted according to the
394
+ bm25 ranking metric (list index `0` has the highest rank).
395
+ If the provided `expression` does not hit any item, the function returns an empty list.
396
+ The function searches for the `expression` in the term and data descriptor specifications.
397
+ However, if `only_id` is `True` (default is `False`), the search is restricted to the id of the
398
+ terms and data descriptors. **At the moment, `only_id` is set to `True` for the data descriptors
399
+ because they haven't got any description.**
400
+
401
+ :param expression: The full text search expression.
402
+ :type expression: str
403
+ :param only_id: Performs the search only on ids, otherwise on all the specifications.
404
+ :type only_id: bool
405
+ :param limit: Limit the number of returned items found. Returns all items found if \
406
+ `limit` is either `None`, zero or negative.
407
+ :type limit: int | None
408
+ :param offset: Skips `offset` number of items found. Ignored if `offset` is \
409
+ either `None`, zero or negative.
410
+ :type offset: int | None
411
+ :returns: A list of item instances. Returns an empty list if no matches are found.
412
+ :rtype: list[Item]
413
+ :raises EsgvocValueError: If the `expression` cannot be interpreted.
414
+ """
415
+ # TODO: execute union query when it will be possible to compute parent of terms and data descriptors.
416
+ result = list()
417
+ with get_universe_session() as session:
418
+ processed_expression = process_expression(expression)
419
+ if only_id:
420
+ dd_column = col(UDataDescriptorFTS5.id)
421
+ term_column = col(UTermFTS5.id)
422
+ else:
423
+ dd_column = col(UDataDescriptorFTS5.id) # TODO: use specs when implemented!
424
+ term_column = col(UTermFTS5.specs) # type: ignore
425
+ dd_where_condition = dd_column.match(processed_expression)
426
+ dd_statement = select(UDataDescriptorFTS5.id,
427
+ text("'data_descriptor' AS TYPE"),
428
+ text("'universe' AS TYPE"),
429
+ text('rank')).where(dd_where_condition)
430
+ term_where_condition = term_column.match(processed_expression)
431
+ term_statement = select(UTermFTS5.id,
432
+ text("'term' AS TYPE"),
433
+ UDataDescriptor.id,
434
+ text('rank')).join(UDataDescriptor) \
435
+ .where(term_where_condition)
436
+ result = execute_find_item_statements(session, processed_expression, dd_statement,
437
+ term_statement, limit, offset)
438
+ return result
@@ -1,2 +1,2 @@
1
1
  FILE_NAME_EXTENSION_KEY = 'extension'
2
- FILE_NAME_EXTENSION_SEPARATOR_KEY = 'extension_separator'
2
+ FILE_NAME_EXTENSION_SEPARATOR_KEY = 'extension_separator'