esgvoc 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of esgvoc might be problematic. Click here for more details.
- esgvoc/__init__.py +3 -1
- esgvoc/api/__init__.py +96 -72
- esgvoc/api/data_descriptors/__init__.py +18 -12
- esgvoc/api/data_descriptors/activity.py +8 -45
- esgvoc/api/data_descriptors/area_label.py +6 -0
- esgvoc/api/data_descriptors/branded_suffix.py +5 -0
- esgvoc/api/data_descriptors/branded_variable.py +5 -0
- esgvoc/api/data_descriptors/consortium.py +16 -56
- esgvoc/api/data_descriptors/data_descriptor.py +106 -0
- esgvoc/api/data_descriptors/date.py +3 -46
- esgvoc/api/data_descriptors/directory_date.py +3 -46
- esgvoc/api/data_descriptors/experiment.py +19 -54
- esgvoc/api/data_descriptors/forcing_index.py +3 -45
- esgvoc/api/data_descriptors/frequency.py +6 -43
- esgvoc/api/data_descriptors/grid_label.py +6 -44
- esgvoc/api/data_descriptors/horizontal_label.py +6 -0
- esgvoc/api/data_descriptors/initialisation_index.py +3 -44
- esgvoc/api/data_descriptors/institution.py +11 -54
- esgvoc/api/data_descriptors/license.py +4 -44
- esgvoc/api/data_descriptors/mip_era.py +6 -44
- esgvoc/api/data_descriptors/model_component.py +7 -45
- esgvoc/api/data_descriptors/organisation.py +3 -40
- esgvoc/api/data_descriptors/physic_index.py +3 -45
- esgvoc/api/data_descriptors/product.py +4 -43
- esgvoc/api/data_descriptors/realisation_index.py +3 -44
- esgvoc/api/data_descriptors/realm.py +4 -42
- esgvoc/api/data_descriptors/resolution.py +6 -44
- esgvoc/api/data_descriptors/source.py +18 -53
- esgvoc/api/data_descriptors/source_type.py +3 -41
- esgvoc/api/data_descriptors/sub_experiment.py +3 -41
- esgvoc/api/data_descriptors/table.py +6 -48
- esgvoc/api/data_descriptors/temporal_label.py +6 -0
- esgvoc/api/data_descriptors/time_range.py +3 -27
- esgvoc/api/data_descriptors/variable.py +13 -71
- esgvoc/api/data_descriptors/variant_label.py +3 -47
- esgvoc/api/data_descriptors/vertical_label.py +5 -0
- esgvoc/api/project_specs.py +3 -2
- esgvoc/api/projects.py +727 -446
- esgvoc/api/py.typed +0 -0
- esgvoc/api/report.py +29 -16
- esgvoc/api/search.py +140 -95
- esgvoc/api/universe.py +362 -156
- esgvoc/apps/__init__.py +3 -4
- esgvoc/apps/drs/constants.py +1 -1
- esgvoc/apps/drs/generator.py +185 -198
- esgvoc/apps/drs/report.py +272 -136
- esgvoc/apps/drs/validator.py +132 -145
- esgvoc/apps/py.typed +0 -0
- esgvoc/cli/drs.py +32 -21
- esgvoc/cli/get.py +35 -31
- esgvoc/cli/install.py +11 -8
- esgvoc/cli/main.py +0 -2
- esgvoc/cli/status.py +5 -5
- esgvoc/cli/valid.py +40 -40
- esgvoc/core/constants.py +1 -1
- esgvoc/core/db/__init__.py +2 -4
- esgvoc/core/db/connection.py +5 -3
- esgvoc/core/db/models/project.py +50 -8
- esgvoc/core/db/models/universe.py +51 -12
- esgvoc/core/db/project_ingestion.py +60 -46
- esgvoc/core/db/universe_ingestion.py +58 -29
- esgvoc/core/exceptions.py +33 -0
- esgvoc/core/logging_handler.py +1 -1
- esgvoc/core/repo_fetcher.py +4 -3
- esgvoc/core/service/__init__.py +37 -5
- esgvoc/core/service/configuration/config_manager.py +188 -0
- esgvoc/core/service/configuration/setting.py +88 -0
- esgvoc/core/service/state.py +49 -32
- {esgvoc-0.2.1.dist-info → esgvoc-0.4.0.dist-info}/METADATA +34 -3
- esgvoc-0.4.0.dist-info/RECORD +80 -0
- esgvoc/api/_utils.py +0 -39
- esgvoc/cli/config.py +0 -82
- esgvoc/core/service/settings.py +0 -73
- esgvoc/core/service/settings.toml +0 -17
- esgvoc/core/service/settings_default.toml +0 -17
- esgvoc-0.2.1.dist-info/RECORD +0 -73
- {esgvoc-0.2.1.dist-info → esgvoc-0.4.0.dist-info}/WHEEL +0 -0
- {esgvoc-0.2.1.dist-info → esgvoc-0.4.0.dist-info}/entry_points.txt +0 -0
- {esgvoc-0.2.1.dist-info → esgvoc-0.4.0.dist-info}/licenses/LICENSE.txt +0 -0
esgvoc/api/universe.py
CHANGED
|
@@ -1,217 +1,423 @@
|
|
|
1
|
-
from typing import Sequence
|
|
2
|
-
|
|
3
|
-
from
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from esgvoc.
|
|
7
|
-
from
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
result =
|
|
1
|
+
from typing import Iterable, Sequence
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import text
|
|
4
|
+
from sqlmodel import Session, col, select
|
|
5
|
+
|
|
6
|
+
from esgvoc.api.data_descriptors.data_descriptor import DataDescriptor
|
|
7
|
+
from esgvoc.api.search import (
|
|
8
|
+
Item,
|
|
9
|
+
execute_find_item_statements,
|
|
10
|
+
execute_match_statement,
|
|
11
|
+
generate_matching_condition,
|
|
12
|
+
get_universe_session,
|
|
13
|
+
handle_rank_limit_offset,
|
|
14
|
+
instantiate_pydantic_term,
|
|
15
|
+
instantiate_pydantic_terms,
|
|
16
|
+
)
|
|
17
|
+
from esgvoc.core.db.models.universe import UDataDescriptor, UDataDescriptorFTS5, UTerm, UTermFTS5
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _get_all_terms_in_data_descriptor(data_descriptor: UDataDescriptor,
|
|
21
|
+
selected_term_fields: Iterable[str] | None) -> list[DataDescriptor]:
|
|
22
|
+
result: list[DataDescriptor] = list()
|
|
23
|
+
instantiate_pydantic_terms(data_descriptor.terms, result, selected_term_fields)
|
|
23
24
|
return result
|
|
24
25
|
|
|
25
26
|
|
|
26
|
-
def
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
-> list[BaseModel]:
|
|
27
|
+
def get_all_terms_in_data_descriptor(data_descriptor_id: str,
|
|
28
|
+
selected_term_fields: Iterable[str] | None = None) \
|
|
29
|
+
-> list[DataDescriptor]:
|
|
30
30
|
"""
|
|
31
|
-
|
|
32
|
-
This function performs an exact match on the `data_descriptor_id` and
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
the parameter `settings`,
|
|
36
|
-
which allows a flexible matching (e.g., `LIKE` may return multiple results).
|
|
37
|
-
If the parameter `settings` is `None`, this function performs an exact match on the `term_id`.
|
|
38
|
-
If any of the provided ids (`data_descriptor_id` or `term_id`) is not found, the function
|
|
39
|
-
returns an empty list.
|
|
40
|
-
|
|
41
|
-
Behavior based on search type:
|
|
42
|
-
- `EXACT` and absence of `settings`: returns zero or one Pydantic term instance in the list.
|
|
43
|
-
- `REGEX`, `LIKE`, `STARTS_WITH` and `ENDS_WITH`: returns zero, one or more Pydantic term \
|
|
44
|
-
instances in the list.
|
|
31
|
+
Gets all the terms of the given data descriptor.
|
|
32
|
+
This function performs an exact match on the `data_descriptor_id` and does not search
|
|
33
|
+
for similar or related descriptors.
|
|
34
|
+
If the provided `data_descriptor_id` is not found, the function returns an empty list.
|
|
45
35
|
|
|
46
36
|
:param data_descriptor_id: A data descriptor id
|
|
47
37
|
:type data_descriptor_id: str
|
|
48
|
-
:param
|
|
49
|
-
|
|
50
|
-
:
|
|
51
|
-
:
|
|
52
|
-
:
|
|
53
|
-
:rtype: list[BaseModel]
|
|
38
|
+
:param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
|
|
39
|
+
fields of the terms are returned. If empty, selects the id and type fields.
|
|
40
|
+
:type selected_term_fields: Iterable[str] | None
|
|
41
|
+
:returns: a list of term instances. Returns an empty list if no matches are found.
|
|
42
|
+
:rtype: list[DataDescriptor]
|
|
54
43
|
"""
|
|
55
|
-
result: list[BaseModel] = list()
|
|
56
44
|
with get_universe_session() as session:
|
|
57
|
-
|
|
58
|
-
|
|
45
|
+
data_descriptor = _get_data_descriptor_in_universe(data_descriptor_id, session)
|
|
46
|
+
if data_descriptor:
|
|
47
|
+
result = _get_all_terms_in_data_descriptor(data_descriptor, selected_term_fields)
|
|
48
|
+
else:
|
|
49
|
+
result = list()
|
|
59
50
|
return result
|
|
60
51
|
|
|
61
52
|
|
|
62
|
-
def
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
settings=settings)
|
|
68
|
-
statement = select(UTerm).where(where_expression)
|
|
69
|
-
results = session.exec(statement).all()
|
|
70
|
-
return results
|
|
53
|
+
def _get_all_data_descriptors_in_universe(session: Session) -> Sequence[UDataDescriptor]:
|
|
54
|
+
statement = select(UDataDescriptor)
|
|
55
|
+
data_descriptors = session.exec(statement)
|
|
56
|
+
result = data_descriptors.all()
|
|
57
|
+
return result
|
|
71
58
|
|
|
72
59
|
|
|
73
|
-
def
|
|
74
|
-
settings: SearchSettings|None = None) \
|
|
75
|
-
-> list[BaseModel]:
|
|
60
|
+
def get_all_data_descriptors_in_universe() -> list[str]:
|
|
76
61
|
"""
|
|
77
|
-
|
|
78
|
-
The given `term_id` is searched according to the search type specified in
|
|
79
|
-
the parameter `settings`,
|
|
80
|
-
which allows a flexible matching (e.g., `LIKE` may return multiple results).
|
|
81
|
-
If the parameter `settings` is `None`, this function performs an exact match on the `term_id`.
|
|
82
|
-
Terms are unique within a data descriptor but may have some synonyms in the universe.
|
|
83
|
-
If the provided `term_id` is not found, the function returns an empty list.
|
|
62
|
+
Gets all the data descriptors of the universe.
|
|
84
63
|
|
|
85
|
-
:
|
|
86
|
-
:
|
|
87
|
-
:param settings: The search settings
|
|
88
|
-
:type settings: SearchSettings|None
|
|
89
|
-
:returns: A list of Pydantic term instances. Returns an empty list if no matches are found.
|
|
90
|
-
:rtype: list[BaseModel]
|
|
64
|
+
:returns: A list of data descriptor ids.
|
|
65
|
+
:rtype: list[str]
|
|
91
66
|
"""
|
|
92
|
-
result
|
|
67
|
+
result = list()
|
|
93
68
|
with get_universe_session() as session:
|
|
94
|
-
|
|
95
|
-
|
|
69
|
+
data_descriptors = _get_all_data_descriptors_in_universe(session)
|
|
70
|
+
for data_descriptor in data_descriptors:
|
|
71
|
+
result.append(data_descriptor.id)
|
|
96
72
|
return result
|
|
97
73
|
|
|
98
74
|
|
|
99
|
-
def
|
|
100
|
-
|
|
101
|
-
|
|
75
|
+
def get_all_terms_in_universe(selected_term_fields: Iterable[str] | None = None) -> list[DataDescriptor]:
|
|
76
|
+
"""
|
|
77
|
+
Gets all the terms of the universe.
|
|
78
|
+
Terms are unique within a data descriptor but may have some synonyms in the universe.
|
|
79
|
+
|
|
80
|
+
:param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
|
|
81
|
+
fields of the terms are returned. If empty, selects the id and type fields.
|
|
82
|
+
:type selected_term_fields: Iterable[str] | None
|
|
83
|
+
:returns: A list of term instances.
|
|
84
|
+
:rtype: list[DataDescriptor]
|
|
85
|
+
"""
|
|
86
|
+
result = list()
|
|
87
|
+
with get_universe_session() as session:
|
|
88
|
+
data_descriptors = _get_all_data_descriptors_in_universe(session)
|
|
89
|
+
for data_descriptor in data_descriptors:
|
|
90
|
+
# Term may have some synonyms within the whole universe.
|
|
91
|
+
terms = _get_all_terms_in_data_descriptor(data_descriptor, selected_term_fields)
|
|
92
|
+
result.extend(terms)
|
|
102
93
|
return result
|
|
103
94
|
|
|
104
95
|
|
|
105
|
-
def
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
value=data_descriptor_id,
|
|
110
|
-
settings=settings)
|
|
111
|
-
statement = select(DataDescriptor).where(where_expression)
|
|
96
|
+
def _get_term_in_data_descriptor(data_descriptor_id: str, term_id: str, session: Session) \
|
|
97
|
+
-> UTerm | None:
|
|
98
|
+
statement = select(UTerm).join(UDataDescriptor).where(UDataDescriptor.id == data_descriptor_id,
|
|
99
|
+
UTerm.id == term_id)
|
|
112
100
|
results = session.exec(statement)
|
|
113
|
-
result = results.
|
|
101
|
+
result = results.one_or_none()
|
|
114
102
|
return result
|
|
115
103
|
|
|
116
104
|
|
|
117
|
-
def
|
|
118
|
-
|
|
105
|
+
def get_term_in_data_descriptor(data_descriptor_id: str,
|
|
106
|
+
term_id: str,
|
|
107
|
+
selected_term_fields: Iterable[str] | None = None) \
|
|
108
|
+
-> DataDescriptor | None:
|
|
119
109
|
"""
|
|
120
|
-
|
|
121
|
-
This function performs an exact match on the `
|
|
122
|
-
for similar or related descriptors.
|
|
123
|
-
If the provided `
|
|
110
|
+
Returns the term, in the given data descriptor, whose id corresponds exactly to the given term id.
|
|
111
|
+
This function performs an exact match on the `term_id` and the `data_descriptor_id` and does
|
|
112
|
+
not search for similar or related terms and data descriptors.
|
|
113
|
+
If the provided `term_id` is not found, the function returns `None`.
|
|
124
114
|
|
|
125
|
-
:param data_descriptor_id:
|
|
115
|
+
:param data_descriptor_id: The id of the given data descriptor.
|
|
126
116
|
:type data_descriptor_id: str
|
|
127
|
-
:
|
|
128
|
-
:
|
|
117
|
+
:param term_id: The id of a term to be found.
|
|
118
|
+
:type term_id: str
|
|
119
|
+
:param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
|
|
120
|
+
fields of the terms are returned. If empty, selects the id and type fields.
|
|
121
|
+
:type selected_term_fields: Iterable[str] | None
|
|
122
|
+
:returns: A term instance. Returns `None` if no match is found.
|
|
123
|
+
:rtype: DataDescriptor | None
|
|
129
124
|
"""
|
|
130
125
|
with get_universe_session() as session:
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
if data_descriptors:
|
|
135
|
-
data_descriptor = data_descriptors[0]
|
|
136
|
-
result = _get_all_terms_in_data_descriptor(data_descriptor)
|
|
126
|
+
term_found = _get_term_in_data_descriptor(data_descriptor_id, term_id, session)
|
|
127
|
+
if term_found:
|
|
128
|
+
result = instantiate_pydantic_term(term_found, selected_term_fields)
|
|
137
129
|
else:
|
|
138
|
-
result =
|
|
130
|
+
result = None
|
|
131
|
+
return result
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _get_term_in_universe(term_id: str, session: Session) -> UTerm | None:
|
|
135
|
+
statement = select(UTerm).where(UTerm.id == term_id)
|
|
136
|
+
results = session.exec(statement)
|
|
137
|
+
result = results.first() # Term ids are not supposed to be unique within the universe.
|
|
139
138
|
return result
|
|
140
139
|
|
|
141
140
|
|
|
142
|
-
def
|
|
143
|
-
|
|
144
|
-
-> list[dict]:
|
|
141
|
+
def get_term_in_universe(term_id: str,
|
|
142
|
+
selected_term_fields: Iterable[str] | None = None) -> DataDescriptor | None:
|
|
145
143
|
"""
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
the
|
|
149
|
-
|
|
150
|
-
If the
|
|
151
|
-
the `data_descriptor_id`.
|
|
152
|
-
If the provided `data_descriptor_id` is not found, the function returns an empty list.
|
|
153
|
-
|
|
154
|
-
Behavior based on search type:
|
|
155
|
-
- `EXACT` and absence of `settings`: returns zero or one data descriptor context in the list.
|
|
156
|
-
- `REGEX`, `LIKE`, `STARTS_WITH` and `ENDS_WITH`: returns zero, one or more \
|
|
157
|
-
data descriptor contexts in the list.
|
|
144
|
+
Returns the first occurrence of the terms, in the universe, whose id corresponds exactly to
|
|
145
|
+
the given term id.
|
|
146
|
+
Terms are unique within a data descriptor but may have some synonyms in the universe.
|
|
147
|
+
This function performs an exact match on the `term_id` and does not search
|
|
148
|
+
for similar or related terms. If the provided `term_id` is not found, the function returns `None`.
|
|
158
149
|
|
|
159
|
-
:param
|
|
160
|
-
:type
|
|
161
|
-
:param
|
|
162
|
-
|
|
163
|
-
:
|
|
164
|
-
:
|
|
150
|
+
:param term_id: The id of a term to be found.
|
|
151
|
+
:type term_id: str
|
|
152
|
+
:param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
|
|
153
|
+
fields of the terms are returned. If empty, selects the id and type fields.
|
|
154
|
+
:type selected_term_fields: Iterable[str] | None
|
|
155
|
+
:returns: A term instance. Returns `None` if no match is found.
|
|
156
|
+
:rtype: DataDescriptor | None
|
|
165
157
|
"""
|
|
166
|
-
result = list()
|
|
167
158
|
with get_universe_session() as session:
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
result
|
|
159
|
+
term_found = _get_term_in_universe(term_id, session)
|
|
160
|
+
if term_found:
|
|
161
|
+
result = instantiate_pydantic_term(term_found, selected_term_fields)
|
|
162
|
+
else:
|
|
163
|
+
result = None
|
|
173
164
|
return result
|
|
174
165
|
|
|
175
166
|
|
|
176
|
-
def
|
|
177
|
-
statement = select(
|
|
178
|
-
|
|
179
|
-
result =
|
|
167
|
+
def _get_data_descriptor_in_universe(data_descriptor_id: str, session: Session) -> UDataDescriptor | None:
|
|
168
|
+
statement = select(UDataDescriptor).where(UDataDescriptor.id == data_descriptor_id)
|
|
169
|
+
results = session.exec(statement)
|
|
170
|
+
result = results.one_or_none()
|
|
180
171
|
return result
|
|
181
172
|
|
|
182
173
|
|
|
183
|
-
def
|
|
174
|
+
def get_data_descriptor_in_universe(data_descriptor_id: str) -> tuple[str, dict] | None:
|
|
184
175
|
"""
|
|
185
|
-
|
|
176
|
+
Returns the id and the context of the data descriptor, in the universe, whose id corresponds
|
|
177
|
+
exactly to the given data descriptor id.
|
|
178
|
+
This function performs an exact match on the `data_descriptor_id` and does not
|
|
179
|
+
search for similar or related data descriptors.
|
|
180
|
+
If the provided `data_descriptor_id` is not found, the function returns `None`.
|
|
186
181
|
|
|
187
|
-
:
|
|
188
|
-
:
|
|
182
|
+
:param data_descriptor_id: An id of a data descriptor to be found.
|
|
183
|
+
:type data_descriptor_id: str
|
|
184
|
+
:returns: The data descriptor id and context. Returns `None` if no match is found.
|
|
185
|
+
:rtype: tuple[str, dict] | None
|
|
189
186
|
"""
|
|
190
|
-
result = list()
|
|
191
187
|
with get_universe_session() as session:
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
result.
|
|
188
|
+
data_descriptor_found = _get_data_descriptor_in_universe(data_descriptor_id, session)
|
|
189
|
+
if data_descriptor_found:
|
|
190
|
+
result = data_descriptor_found.id, data_descriptor_found.context
|
|
191
|
+
else:
|
|
192
|
+
result = None
|
|
195
193
|
return result
|
|
196
194
|
|
|
197
195
|
|
|
198
|
-
def
|
|
196
|
+
def _find_data_descriptors_in_universe(expression: str,
|
|
197
|
+
session: Session,
|
|
198
|
+
only_id: bool = False,
|
|
199
|
+
limit: int | None = None,
|
|
200
|
+
offset: int | None = None) -> Sequence[UDataDescriptor]:
|
|
201
|
+
matching_condition = generate_matching_condition(UDataDescriptorFTS5, expression, only_id)
|
|
202
|
+
tmp_statement = select(UDataDescriptorFTS5).where(matching_condition)
|
|
203
|
+
statement = select(UDataDescriptor).from_statement(handle_rank_limit_offset(tmp_statement,
|
|
204
|
+
limit, offset))
|
|
205
|
+
return execute_match_statement(expression, statement, session)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def find_data_descriptors_in_universe(expression: str,
|
|
209
|
+
only_id: bool = False,
|
|
210
|
+
limit: int | None = None,
|
|
211
|
+
offset: int | None = None) -> list[tuple[str, dict]]:
|
|
199
212
|
"""
|
|
200
|
-
|
|
201
|
-
|
|
213
|
+
Find data descriptors in the universe based on a full text search defined by the given `expression`.
|
|
214
|
+
The `expression` comes from the powerful
|
|
215
|
+
`SQLite FTS extension <https://sqlite.org/fts5.html#full_text_query_syntax>`_
|
|
216
|
+
and corresponds to the expression of the `MATCH` operator.
|
|
217
|
+
It can be composed of one or multiple keywords combined with boolean
|
|
218
|
+
operators (`NOT`, `AND`, `^`, etc. default is `OR`). Keywords can define prefixes or postfixes
|
|
219
|
+
with the wildcard `*`.
|
|
220
|
+
The function returns a list of data descriptor ids and contexts, sorted according to the
|
|
221
|
+
bm25 ranking metric (list index `0` has the highest rank).
|
|
222
|
+
If the provided `expression` does not hit any data descriptor, the function returns an empty list.
|
|
223
|
+
The function searches for the `expression` in the data descriptor specifications.
|
|
224
|
+
However, if `only_id` is `True` (default is `False`), the search is restricted to the id of the
|
|
225
|
+
data descriptors. **At the moment, `only_id` is set to `True` as the data descriptors
|
|
226
|
+
haven't got any description.**
|
|
227
|
+
|
|
228
|
+
:param expression: The full text search expression.
|
|
229
|
+
:type expression: str
|
|
230
|
+
:param only_id: Performs the search only on ids, otherwise on all the specifications.
|
|
231
|
+
:type only_id: bool
|
|
232
|
+
:param limit: Limit the number of returned items found. Returns all items found if \
|
|
233
|
+
`limit` is either `None`, zero or negative.
|
|
234
|
+
:type limit: int | None
|
|
235
|
+
:param offset: Skips `offset` number of items found. Ignored if `offset` is \
|
|
236
|
+
either `None`, zero or negative.
|
|
237
|
+
:type offset: int | None
|
|
238
|
+
:returns: A list of data descriptor ids and contexts. Returns an empty list if no matches are found.
|
|
239
|
+
:rtype: list[tuple[str, dict]]
|
|
240
|
+
:raises EsgvocValueError: If the `expression` cannot be interpreted.
|
|
241
|
+
"""
|
|
242
|
+
result: list[tuple[str, dict]] = list()
|
|
243
|
+
with get_universe_session() as session:
|
|
244
|
+
data_descriptors_found = _find_data_descriptors_in_universe(expression, session, only_id,
|
|
245
|
+
limit, offset)
|
|
246
|
+
if data_descriptors_found:
|
|
247
|
+
for data_descriptor_found in data_descriptors_found:
|
|
248
|
+
result.append((data_descriptor_found.id, data_descriptor_found.context))
|
|
249
|
+
return result
|
|
250
|
+
|
|
202
251
|
|
|
203
|
-
|
|
204
|
-
|
|
252
|
+
def _find_terms_in_universe(expression: str, session: Session,
|
|
253
|
+
only_id: bool = False,
|
|
254
|
+
limit: int | None = None,
|
|
255
|
+
offset: int | None = None) -> Sequence[UTerm]:
|
|
256
|
+
matching_condition = generate_matching_condition(UTermFTS5, expression, only_id)
|
|
257
|
+
tmp_statement = select(UTermFTS5).where(matching_condition)
|
|
258
|
+
statement = select(UTerm).from_statement(handle_rank_limit_offset(tmp_statement, limit, offset))
|
|
259
|
+
return execute_match_statement(expression, statement, session)
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def find_terms_in_universe(expression: str,
|
|
263
|
+
only_id: bool = False,
|
|
264
|
+
limit: int | None = None,
|
|
265
|
+
offset: int | None = None,
|
|
266
|
+
selected_term_fields: Iterable[str] | None = None) -> list[DataDescriptor]:
|
|
205
267
|
"""
|
|
206
|
-
|
|
268
|
+
Find terms in the universe based on a full-text search defined by the given `expression`.
|
|
269
|
+
The `expression` comes from the powerful
|
|
270
|
+
`SQLite FTS extension <https://sqlite.org/fts5.html#full_text_query_syntax>`_
|
|
271
|
+
and corresponds to the expression of the `MATCH` operator.
|
|
272
|
+
It can be composed of one or multiple keywords combined with boolean
|
|
273
|
+
operators (`NOT`, `AND`, `^`, etc. default is `OR`). Keywords can define prefixes or postfixes
|
|
274
|
+
with the wildcard `*`.
|
|
275
|
+
The function returns a list of term instances sorted according to the
|
|
276
|
+
bm25 ranking metric (list index `0` has the highest rank).
|
|
277
|
+
If the provided `expression` does not hit any term, the function returns an empty list.
|
|
278
|
+
The function searches for the `expression` in the term specifications.
|
|
279
|
+
However, if `only_id` is `True` (default is `False`), the search is restricted to the id of the terms.
|
|
280
|
+
|
|
281
|
+
:param expression: The full text search expression.
|
|
282
|
+
:type expression: str
|
|
283
|
+
:param only_id: Performs the search only on ids, otherwise on all the specifications.
|
|
284
|
+
:type only_id: bool
|
|
285
|
+
:param limit: Limit the number of returned items found. Returns all items found if \
|
|
286
|
+
`limit` is either `None`, zero or negative.
|
|
287
|
+
:type limit: int | None
|
|
288
|
+
:param offset: Skips `offset` number of items found. Ignored if `offset` is \
|
|
289
|
+
either `None`, zero or negative.
|
|
290
|
+
:type offset: int | None
|
|
291
|
+
:param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
|
|
292
|
+
fields of the terms are returned. If empty, selects the id and type fields.
|
|
293
|
+
:type selected_term_fields: Iterable[str] | None
|
|
294
|
+
:returns: A list of term instances. Returns an empty list if no matches are found.
|
|
295
|
+
:rtype: list[DataDescriptor]
|
|
296
|
+
:raises EsgvocValueError: If the `expression` cannot be interpreted.
|
|
297
|
+
"""
|
|
298
|
+
result: list[DataDescriptor] = list()
|
|
207
299
|
with get_universe_session() as session:
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
terms = _get_all_terms_in_data_descriptor(data_descriptor)
|
|
212
|
-
result.extend(terms)
|
|
300
|
+
uterms_found = _find_terms_in_universe(expression, session, only_id, limit, offset)
|
|
301
|
+
if uterms_found:
|
|
302
|
+
instantiate_pydantic_terms(uterms_found, result, selected_term_fields)
|
|
213
303
|
return result
|
|
214
304
|
|
|
215
305
|
|
|
216
|
-
|
|
217
|
-
|
|
306
|
+
def _find_terms_in_data_descriptor(expression: str, data_descriptor_id: str,
|
|
307
|
+
session: Session,
|
|
308
|
+
only_id: bool = False,
|
|
309
|
+
limit: int | None = None,
|
|
310
|
+
offset: int | None = None) -> Sequence[UTerm]:
|
|
311
|
+
matching_condition = generate_matching_condition(UTermFTS5, expression, only_id)
|
|
312
|
+
where_condition = UDataDescriptor.id == data_descriptor_id, matching_condition
|
|
313
|
+
tmp_statement = select(UTermFTS5).join(UDataDescriptor).where(*where_condition)
|
|
314
|
+
statement = select(UTerm).from_statement(handle_rank_limit_offset(tmp_statement, limit, offset))
|
|
315
|
+
return execute_match_statement(expression, statement, session)
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def find_terms_in_data_descriptor(expression: str, data_descriptor_id: str,
|
|
319
|
+
only_id: bool = False,
|
|
320
|
+
limit: int | None = None,
|
|
321
|
+
offset: int | None = None,
|
|
322
|
+
selected_term_fields: Iterable[str] | None = None) \
|
|
323
|
+
-> list[DataDescriptor]:
|
|
324
|
+
"""
|
|
325
|
+
Find terms in the given data descriptor based on a full-text search defined by the given `expression`.
|
|
326
|
+
The `expression` comes from the powerful
|
|
327
|
+
`SQLite FTS extension <https://sqlite.org/fts5.html#full_text_query_syntax>`_
|
|
328
|
+
and corresponds to the expression of the `MATCH` operator.
|
|
329
|
+
It can be composed of one or multiple keywords combined with boolean
|
|
330
|
+
operators (`NOT`, `AND`, `^`, etc. default is `OR`). Keywords can define prefixes or postfixes
|
|
331
|
+
with the wildcard `*`.
|
|
332
|
+
The function returns a list of term instances sorted according to the
|
|
333
|
+
bm25 ranking metric (list index `0` has the highest rank).
|
|
334
|
+
This function performs an exact match on the `data_descriptor_id`,
|
|
335
|
+
and does not search for similar or related data descriptors.
|
|
336
|
+
If the provided `expression` does not hit any term or the given `data_descriptor_id` does not
|
|
337
|
+
match exactly to an id of a data descriptor, the function returns an empty list.
|
|
338
|
+
The function searches for the `expression` in the term specifications.
|
|
339
|
+
However, if `only_id` is `True` (default is `False`), the search is restricted to the id of the terms.
|
|
340
|
+
|
|
341
|
+
:param expression: The full text search expression.
|
|
342
|
+
:type expression: str
|
|
343
|
+
:param only_id: Performs the search only on ids, otherwise on all the specifications.
|
|
344
|
+
:type only_id: bool
|
|
345
|
+
:param limit: Limit the number of returned items found. Returns all items found if \
|
|
346
|
+
`limit` is either `None`, zero or negative.
|
|
347
|
+
:type limit: int | None
|
|
348
|
+
:param offset: Skips `offset` number of items found. Ignored if `offset` is \
|
|
349
|
+
either `None`, zero or negative.
|
|
350
|
+
:type offset: int | None
|
|
351
|
+
:param selected_term_fields: A list of term fields to select or `None`. If `None`, all the \
|
|
352
|
+
fields of the terms are returned. If empty, selects the id and type fields.
|
|
353
|
+
:type selected_term_fields: Iterable[str] | None
|
|
354
|
+
:returns: A list of term instances. Returns an empty list if no matches are found.
|
|
355
|
+
:rtype: list[DataDescriptor]
|
|
356
|
+
:raises EsgvocValueError: If the `expression` cannot be interpreted.
|
|
357
|
+
"""
|
|
358
|
+
result: list[DataDescriptor] = list()
|
|
359
|
+
with get_universe_session() as session:
|
|
360
|
+
uterms_found = _find_terms_in_data_descriptor(expression, data_descriptor_id,
|
|
361
|
+
session, only_id, limit, offset)
|
|
362
|
+
if uterms_found:
|
|
363
|
+
instantiate_pydantic_terms(uterms_found, result, selected_term_fields)
|
|
364
|
+
return result
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def find_items_in_universe(expression: str,
|
|
368
|
+
only_id: bool = False,
|
|
369
|
+
limit: int | None = None,
|
|
370
|
+
offset: int | None = None) -> list[Item]:
|
|
371
|
+
"""
|
|
372
|
+
Find items, at the moment terms and data descriptors, in the universe based on a full-text
|
|
373
|
+
search defined by the given `expression`. The `expression` comes from the powerful
|
|
374
|
+
`SQLite FTS extension <https://sqlite.org/fts5.html#full_text_query_syntax>`_
|
|
375
|
+
and corresponds to the expression of the `MATCH` operator.
|
|
376
|
+
It can be composed of one or multiple keywords combined with boolean
|
|
377
|
+
operators (`NOT`, `AND`, `^`, etc. default is `OR`). Keywords can define prefixes or postfixes
|
|
378
|
+
with the wildcard `*`.
|
|
379
|
+
The function returns a list of item instances sorted according to the
|
|
380
|
+
bm25 ranking metric (list index `0` has the highest rank).
|
|
381
|
+
If the provided `expression` does not hit any item, the function returns an empty list.
|
|
382
|
+
The function searches for the `expression` in the term and data descriptor specifications.
|
|
383
|
+
However, if `only_id` is `True` (default is `False`), the search is restricted to the id of the
|
|
384
|
+
terms and data descriptors. **At the moment, `only_id` is set to `True` for the data descriptors
|
|
385
|
+
because they haven't got any description.**
|
|
386
|
+
|
|
387
|
+
:param expression: The full text search expression.
|
|
388
|
+
:type expression: str
|
|
389
|
+
:param only_id: Performs the search only on ids, otherwise on all the specifications.
|
|
390
|
+
:type only_id: bool
|
|
391
|
+
:param limit: Limit the number of returned items found. Returns all items found if \
|
|
392
|
+
`limit` is either `None`, zero or negative.
|
|
393
|
+
:type limit: int | None
|
|
394
|
+
:param offset: Skips `offset` number of items found. Ignored if `offset` is \
|
|
395
|
+
either `None`, zero or negative.
|
|
396
|
+
:type offset: int | None
|
|
397
|
+
:returns: A list of item instances. Returns an empty list if no matches are found.
|
|
398
|
+
:rtype: list[Item]
|
|
399
|
+
:raises EsgvocValueError: If the `expression` cannot be interpreted.
|
|
400
|
+
"""
|
|
401
|
+
# TODO: execute union query when it will be possible to compute parent of terms and data descriptors.
|
|
402
|
+
result = list()
|
|
403
|
+
with get_universe_session() as session:
|
|
404
|
+
if only_id:
|
|
405
|
+
dd_column = col(UDataDescriptorFTS5.id)
|
|
406
|
+
term_column = col(UTermFTS5.id)
|
|
407
|
+
else:
|
|
408
|
+
dd_column = col(UDataDescriptorFTS5.id) # TODO: use specs when implemented!
|
|
409
|
+
term_column = col(UTermFTS5.specs) # type: ignore
|
|
410
|
+
dd_where_condition = dd_column.match(expression)
|
|
411
|
+
dd_statement = select(UDataDescriptorFTS5.id,
|
|
412
|
+
text("'data_descriptor' AS TYPE"),
|
|
413
|
+
text("'universe' AS TYPE"),
|
|
414
|
+
text('rank')).where(dd_where_condition)
|
|
415
|
+
term_where_condition = term_column.match(expression)
|
|
416
|
+
term_statement = select(UTermFTS5.id,
|
|
417
|
+
text("'term' AS TYPE"),
|
|
418
|
+
UDataDescriptor.id,
|
|
419
|
+
text('rank')).join(UDataDescriptor) \
|
|
420
|
+
.where(term_where_condition)
|
|
421
|
+
result = execute_find_item_statements(session, expression, dd_statement,
|
|
422
|
+
term_statement, limit, offset)
|
|
423
|
+
return result
|
esgvoc/apps/__init__.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
|
|
2
|
-
from esgvoc.apps.drs.validator import DrsValidator
|
|
3
|
-
from esgvoc.apps.drs.report import DrsValidationReport
|
|
4
2
|
from esgvoc.apps.drs.generator import DrsGenerator
|
|
5
|
-
from esgvoc.apps.drs.report import
|
|
3
|
+
from esgvoc.apps.drs.report import DrsGenerationReport, DrsValidationReport
|
|
4
|
+
from esgvoc.apps.drs.validator import DrsValidator
|
|
6
5
|
|
|
7
|
-
__all__ = ["DrsValidator", "DrsValidationReport", "DrsGenerator", "
|
|
6
|
+
__all__ = ["DrsValidator", "DrsValidationReport", "DrsGenerator", "DrsGenerationReport"]
|
esgvoc/apps/drs/constants.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
FILE_NAME_EXTENSION_KEY = 'extension'
|
|
2
|
-
FILE_NAME_EXTENSION_SEPARATOR_KEY = 'extension_separator'
|
|
2
|
+
FILE_NAME_EXTENSION_SEPARATOR_KEY = 'extension_separator'
|