invenio-vocabularies 6.7.0__py2.py3-none-any.whl → 6.9.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of invenio-vocabularies might be problematic. Click here for more details.
- invenio_vocabularies/__init__.py +1 -1
- invenio_vocabularies/cli.py +2 -0
- invenio_vocabularies/config.py +16 -0
- invenio_vocabularies/contrib/affiliations/schema.py +10 -0
- invenio_vocabularies/contrib/names/datastreams.py +182 -57
- invenio_vocabularies/contrib/subjects/bodc/__init__.py +9 -0
- invenio_vocabularies/contrib/subjects/bodc/datastreams.py +111 -0
- invenio_vocabularies/contrib/subjects/config.py +9 -4
- invenio_vocabularies/contrib/subjects/datastreams.py +2 -4
- invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +33 -11
- invenio_vocabularies/contrib/subjects/gemet/datastreams.py +36 -5
- invenio_vocabularies/datastreams/datastreams.py +18 -7
- invenio_vocabularies/datastreams/factories.py +3 -1
- invenio_vocabularies/datastreams/transformers.py +12 -0
- invenio_vocabularies/datastreams/writers.py +6 -2
- invenio_vocabularies/factories.py +41 -0
- invenio_vocabularies/fixtures.py +2 -0
- invenio_vocabularies/services/querystr.py +5 -0
- invenio_vocabularies/services/tasks.py +2 -0
- {invenio_vocabularies-6.7.0.dist-info → invenio_vocabularies-6.9.0.dist-info}/METADATA +12 -1
- {invenio_vocabularies-6.7.0.dist-info → invenio_vocabularies-6.9.0.dist-info}/RECORD +26 -24
- {invenio_vocabularies-6.7.0.dist-info → invenio_vocabularies-6.9.0.dist-info}/AUTHORS.rst +0 -0
- {invenio_vocabularies-6.7.0.dist-info → invenio_vocabularies-6.9.0.dist-info}/LICENSE +0 -0
- {invenio_vocabularies-6.7.0.dist-info → invenio_vocabularies-6.9.0.dist-info}/WHEEL +0 -0
- {invenio_vocabularies-6.7.0.dist-info → invenio_vocabularies-6.9.0.dist-info}/entry_points.txt +0 -0
- {invenio_vocabularies-6.7.0.dist-info → invenio_vocabularies-6.9.0.dist-info}/top_level.txt +0 -0
invenio_vocabularies/__init__.py
CHANGED
invenio_vocabularies/cli.py
CHANGED
|
@@ -29,6 +29,8 @@ def _process_vocab(config, num_samples=None):
|
|
|
29
29
|
readers_config=config["readers"],
|
|
30
30
|
transformers_config=config.get("transformers"),
|
|
31
31
|
writers_config=config["writers"],
|
|
32
|
+
batch_size=config.get("batch_size", 1000),
|
|
33
|
+
write_many=config.get("write_many", False),
|
|
32
34
|
)
|
|
33
35
|
|
|
34
36
|
success, errored, filtered = 0, 0, 0
|
invenio_vocabularies/config.py
CHANGED
|
@@ -196,6 +196,9 @@ VOCABULARIES_SUBJECTS_GEMET_FILE_URL = (
|
|
|
196
196
|
)
|
|
197
197
|
"""Subject GEMET file download link."""
|
|
198
198
|
|
|
199
|
+
VOCABULARIES_SUBJECTS_BODC_PUV_FILE_URL = "http://vocab.nerc.ac.uk/collection/P01/current/?_profile=nvs&_mediatype=application/rdf+xml"
|
|
200
|
+
"""Subject BODC-PUV file download link."""
|
|
201
|
+
|
|
199
202
|
VOCABULARIES_AFFILIATIONS_EDMO_COUNTRY_MAPPING = {
|
|
200
203
|
"Cape Verde": "Cabo Verde",
|
|
201
204
|
}
|
|
@@ -213,3 +216,16 @@ VOCABULARIES_ORCID_SYNC_SINCE = {
|
|
|
213
216
|
"days": 1,
|
|
214
217
|
}
|
|
215
218
|
"""ORCID time shift to sync. Parameters accepted are the ones passed to 'datetime.timedelta'."""
|
|
219
|
+
|
|
220
|
+
VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH = None
|
|
221
|
+
"""Path to the CSV file for mapping ORCiD organization IDs to affiliation IDs.
|
|
222
|
+
|
|
223
|
+
The path can be specified as either an absolute path or a relative path within the
|
|
224
|
+
Flask app instance folder (i.e. ``current_app.instance_path``).
|
|
225
|
+
|
|
226
|
+
The CSV file should have the following columns:
|
|
227
|
+
|
|
228
|
+
- `org_scheme`: The scheme (disambiguation source) of the ORCiD organization ID.
|
|
229
|
+
- `org_id`: The ORCiD organization ID.
|
|
230
|
+
- `aff_id`: The affiliation ID to map to.
|
|
231
|
+
"""
|
|
@@ -59,3 +59,13 @@ class AffiliationRelationSchema(ContribVocabularyRelationSchema):
|
|
|
59
59
|
ftf_name = "name"
|
|
60
60
|
parent_field_name = "affiliations"
|
|
61
61
|
name = SanitizedUnicode()
|
|
62
|
+
identifiers = IdentifierSet(
|
|
63
|
+
fields.Nested(
|
|
64
|
+
partial(
|
|
65
|
+
IdentifierSchema,
|
|
66
|
+
allowed_schemes=affiliation_schemes,
|
|
67
|
+
identifier_required=False,
|
|
68
|
+
)
|
|
69
|
+
),
|
|
70
|
+
dump_only=True,
|
|
71
|
+
)
|
|
@@ -13,12 +13,14 @@ import io
|
|
|
13
13
|
import tarfile
|
|
14
14
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
15
15
|
from datetime import timedelta
|
|
16
|
+
from itertools import islice
|
|
17
|
+
from pathlib import Path
|
|
16
18
|
|
|
17
19
|
import arrow
|
|
18
20
|
import regex as re
|
|
19
21
|
from flask import current_app
|
|
20
22
|
from invenio_access.permissions import system_identity
|
|
21
|
-
from
|
|
23
|
+
from werkzeug.utils import cached_property
|
|
22
24
|
|
|
23
25
|
from invenio_vocabularies.contrib.names.s3client import S3OrcidClient
|
|
24
26
|
|
|
@@ -47,10 +49,11 @@ class OrcidDataSyncReader(BaseReader):
|
|
|
47
49
|
suffix = orcid_to_sync[-3:]
|
|
48
50
|
key = f"{suffix}/{orcid_to_sync}.xml"
|
|
49
51
|
try:
|
|
52
|
+
# Potential improvement: use a streaming (SAX-style) XML parser to avoid loading the whole file in memory
|
|
53
|
+
# and choose the sections we need to read (probably the summary)
|
|
50
54
|
return self.s3_client.read_file(f"s3://{bucket}/{key}")
|
|
51
|
-
except Exception
|
|
52
|
-
|
|
53
|
-
return None
|
|
55
|
+
except Exception:
|
|
56
|
+
current_app.logger.exception("Failed to fetch ORCiD record.")
|
|
54
57
|
|
|
55
58
|
def _process_lambda_file(self, fileobj):
|
|
56
59
|
"""Process the ORCiD lambda file and yield the ORCiDs to sync.
|
|
@@ -67,42 +70,54 @@ class OrcidDataSyncReader(BaseReader):
|
|
|
67
70
|
if self.since:
|
|
68
71
|
time_shift = self.since
|
|
69
72
|
last_sync = arrow.now() - timedelta(**time_shift)
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
73
|
+
try:
|
|
74
|
+
content = io.TextIOWrapper(fileobj, encoding="utf-8")
|
|
75
|
+
csv_reader = csv.DictReader(content)
|
|
76
|
+
|
|
77
|
+
for row in csv_reader: # Skip the header line
|
|
78
|
+
orcid = row["orcid"]
|
|
79
|
+
|
|
80
|
+
# Lambda file is ordered by last modified date
|
|
81
|
+
last_modified_str = row["last_modified"]
|
|
82
|
+
try:
|
|
83
|
+
last_modified_date = arrow.get(last_modified_str, date_format)
|
|
84
|
+
except arrow.parser.ParserError:
|
|
85
|
+
last_modified_date = arrow.get(
|
|
86
|
+
last_modified_str, date_format_no_millis
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
if last_modified_date < last_sync:
|
|
90
|
+
break
|
|
91
|
+
yield orcid
|
|
92
|
+
finally:
|
|
93
|
+
fileobj.close()
|
|
88
94
|
|
|
89
95
|
def _iter(self, orcids):
|
|
90
96
|
"""Iterates over the ORCiD records yielding each one."""
|
|
91
97
|
with ThreadPoolExecutor(
|
|
92
98
|
max_workers=current_app.config["VOCABULARIES_ORCID_SYNC_MAX_WORKERS"]
|
|
93
99
|
) as executor:
|
|
94
|
-
futures
|
|
95
|
-
|
|
100
|
+
# futures is a dictionary where the key is the ORCID value and the item is the Future object
|
|
101
|
+
futures = {
|
|
102
|
+
orcid: executor.submit(
|
|
96
103
|
self._fetch_orcid_data,
|
|
97
104
|
orcid,
|
|
98
105
|
current_app.config["VOCABULARIES_ORCID_SUMMARIES_BUCKET"],
|
|
99
106
|
)
|
|
100
107
|
for orcid in orcids
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
for orcid in list(futures.keys()):
|
|
111
|
+
try:
|
|
112
|
+
result = futures[orcid].result()
|
|
113
|
+
if result:
|
|
114
|
+
yield result
|
|
115
|
+
finally:
|
|
116
|
+
# Explicitly release memory, as we don't need the future anymore.
|
|
117
|
+
# This is mostly required because as long as we keep a reference to the future
|
|
118
|
+
# (in the above futures dict), the garbage collector won't collect it
|
|
119
|
+
# and it will keep the memory allocated.
|
|
120
|
+
del futures[orcid]
|
|
106
121
|
|
|
107
122
|
def read(self, item=None, *args, **kwargs):
|
|
108
123
|
"""Stream the ORCiD lambda file, process it to get the ORCiDs to sync and yield their data."""
|
|
@@ -111,7 +126,6 @@ class OrcidDataSyncReader(BaseReader):
|
|
|
111
126
|
"s3://orcid-lambda-file/last_modified.csv.tar"
|
|
112
127
|
)
|
|
113
128
|
|
|
114
|
-
orcids_to_sync = []
|
|
115
129
|
# Opens tar file and process it
|
|
116
130
|
with tarfile.open(fileobj=io.BytesIO(tar_content)) as tar:
|
|
117
131
|
# Iterate over each member (file or directory) in the tar file
|
|
@@ -119,10 +133,24 @@ class OrcidDataSyncReader(BaseReader):
|
|
|
119
133
|
# Extract the file
|
|
120
134
|
extracted_file = tar.extractfile(member)
|
|
121
135
|
if extracted_file:
|
|
136
|
+
current_app.logger.info(f"[ORCID Reader] Processing lambda file...")
|
|
122
137
|
# Process the file and get the ORCiDs to sync
|
|
123
|
-
orcids_to_sync
|
|
138
|
+
orcids_to_sync = set(self._process_lambda_file(extracted_file))
|
|
139
|
+
|
|
140
|
+
# Close the file explicitly after processing
|
|
141
|
+
extracted_file.close()
|
|
142
|
+
|
|
143
|
+
# Process ORCIDs in smaller batches
|
|
144
|
+
for orcid_batch in self._chunked_iter(
|
|
145
|
+
orcids_to_sync, batch_size=100
|
|
146
|
+
):
|
|
147
|
+
yield from self._iter(orcid_batch)
|
|
124
148
|
|
|
125
|
-
|
|
149
|
+
def _chunked_iter(self, iterable, batch_size):
|
|
150
|
+
"""Yield successive chunks of a given size."""
|
|
151
|
+
it = iter(iterable)
|
|
152
|
+
while chunk := list(islice(it, batch_size)):
|
|
153
|
+
yield chunk
|
|
126
154
|
|
|
127
155
|
|
|
128
156
|
class OrcidHTTPReader(SimpleHTTPReader):
|
|
@@ -139,24 +167,75 @@ class OrcidHTTPReader(SimpleHTTPReader):
|
|
|
139
167
|
|
|
140
168
|
|
|
141
169
|
DEFAULT_NAMES_EXCLUDE_REGEX = r"[\p{P}\p{S}\p{Nd}\p{No}\p{Emoji}--,.()\-']"
|
|
142
|
-
"""Regex to filter out names with
|
|
170
|
+
"""Regex to filter out names with punctuation, symbols, numbers and emojis."""
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class OrcidOrgToAffiliationMapper:
|
|
174
|
+
"""Default ORCiD Org ID to affiliation ID mapper."""
|
|
175
|
+
|
|
176
|
+
def __init__(self, org_ids_mapping=None, org_ids_mapping_file=None):
|
|
177
|
+
"""Constructor."""
|
|
178
|
+
self._org_ids_mapping = org_ids_mapping
|
|
179
|
+
self._org_ids_mapping_file = org_ids_mapping_file
|
|
180
|
+
|
|
181
|
+
@cached_property
|
|
182
|
+
def org_ids_mapping(self):
|
|
183
|
+
"""Mapping of ORCiD org IDs to affiliation IDs."""
|
|
184
|
+
org_ids_mapping_file = self._org_ids_mapping_file or current_app.config.get(
|
|
185
|
+
"VOCABULARIES_ORCID_ORG_IDS_MAPPING_PATH"
|
|
186
|
+
)
|
|
187
|
+
if org_ids_mapping_file:
|
|
188
|
+
org_ids_mapping_file = Path(org_ids_mapping_file)
|
|
189
|
+
# If the path is relative, prepend the instance path
|
|
190
|
+
if not org_ids_mapping_file.is_absolute():
|
|
191
|
+
org_ids_mapping_file = (
|
|
192
|
+
Path(current_app.instance_path) / org_ids_mapping_file
|
|
193
|
+
)
|
|
194
|
+
with open(org_ids_mapping_file) as fin:
|
|
195
|
+
result = {}
|
|
196
|
+
reader = csv.reader(fin)
|
|
197
|
+
|
|
198
|
+
# Check if the first row is a header
|
|
199
|
+
org_scheme, org_id, aff_id = next(reader)
|
|
200
|
+
if org_scheme.lower() != "org_scheme":
|
|
201
|
+
result[(org_scheme, org_id)] = aff_id
|
|
202
|
+
|
|
203
|
+
for org_scheme, org_id, aff_id in reader:
|
|
204
|
+
result[(org_scheme, org_id)] = aff_id
|
|
205
|
+
|
|
206
|
+
return result
|
|
207
|
+
|
|
208
|
+
return self._org_ids_mapping or {}
|
|
209
|
+
|
|
210
|
+
def __call__(self, org_scheme, org_id):
|
|
211
|
+
"""Map an ORCiD org ID to an affiliation ID."""
|
|
212
|
+
# By default we know that ROR IDs are linkable
|
|
213
|
+
if org_scheme == "ROR":
|
|
214
|
+
return org_id.split("/")[-1]
|
|
215
|
+
# Otherwise see if we have a mapping from other schemes to an affiliation ID
|
|
216
|
+
return self.org_ids_mapping.get((org_scheme, org_id))
|
|
143
217
|
|
|
144
218
|
|
|
145
219
|
class OrcidTransformer(BaseTransformer):
|
|
146
220
|
"""Transforms an ORCiD record into a names record."""
|
|
147
221
|
|
|
148
222
|
def __init__(
|
|
149
|
-
self,
|
|
223
|
+
self,
|
|
224
|
+
*args,
|
|
225
|
+
names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX,
|
|
226
|
+
org_id_to_affiliation_id_func=None,
|
|
227
|
+
**kwargs,
|
|
150
228
|
) -> None:
|
|
151
229
|
"""Constructor."""
|
|
152
230
|
self._names_exclude_regex = names_exclude_regex
|
|
231
|
+
self._org_id_to_affiliation_id_func = (
|
|
232
|
+
org_id_to_affiliation_id_func or OrcidOrgToAffiliationMapper()
|
|
233
|
+
)
|
|
153
234
|
super().__init__()
|
|
154
235
|
|
|
155
|
-
def
|
|
156
|
-
"""
|
|
157
|
-
|
|
158
|
-
return True
|
|
159
|
-
return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1))
|
|
236
|
+
def org_id_to_affiliation_id(self, org_scheme, org_id):
|
|
237
|
+
"""Convert an ORCiD org ID to a linkable affiliation ID."""
|
|
238
|
+
return self._org_id_to_affiliation_id_func(org_scheme, org_id)
|
|
160
239
|
|
|
161
240
|
def apply(self, stream_entry, **kwargs):
|
|
162
241
|
"""Applies the transformation to the stream entry."""
|
|
@@ -166,42 +245,88 @@ class OrcidTransformer(BaseTransformer):
|
|
|
166
245
|
|
|
167
246
|
name = person.get("name")
|
|
168
247
|
if name is None:
|
|
169
|
-
raise TransformerError(
|
|
248
|
+
raise TransformerError("Name not found in ORCiD entry.")
|
|
170
249
|
if name.get("family-name") is None:
|
|
171
|
-
raise TransformerError(
|
|
250
|
+
raise TransformerError("Family name not found in ORCiD entry.")
|
|
172
251
|
|
|
173
252
|
if not self._is_valid_name(name["given-names"] + name["family-name"]):
|
|
174
|
-
raise TransformerError(
|
|
253
|
+
raise TransformerError("Invalid characters in name.")
|
|
175
254
|
|
|
176
255
|
entry = {
|
|
177
256
|
"id": orcid_id,
|
|
178
257
|
"given_name": name.get("given-names"),
|
|
179
258
|
"family_name": name.get("family-name"),
|
|
180
259
|
"identifiers": [{"scheme": "orcid", "identifier": orcid_id}],
|
|
181
|
-
"affiliations":
|
|
260
|
+
"affiliations": self._extract_affiliations(record),
|
|
182
261
|
}
|
|
183
262
|
|
|
263
|
+
stream_entry.entry = entry
|
|
264
|
+
return stream_entry
|
|
265
|
+
|
|
266
|
+
def _is_valid_name(self, name):
|
|
267
|
+
"""Check whether the name passes the regex."""
|
|
268
|
+
if not self._names_exclude_regex:
|
|
269
|
+
return True
|
|
270
|
+
return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1))
|
|
271
|
+
|
|
272
|
+
def _extract_affiliations(self, record):
|
|
273
|
+
"""Extract affiliations from the ORCiD record."""
|
|
274
|
+
result = []
|
|
184
275
|
try:
|
|
185
|
-
employments =
|
|
186
|
-
record
|
|
276
|
+
employments = (
|
|
277
|
+
record.get("activities-summary", {})
|
|
278
|
+
.get("employments", {})
|
|
279
|
+
.get("affiliation-group", [])
|
|
187
280
|
)
|
|
281
|
+
|
|
282
|
+
# If there is only a single value, the XML-to-dict conversion doesn't wrap it in a list
|
|
188
283
|
if isinstance(employments, dict):
|
|
189
284
|
employments = [employments]
|
|
190
|
-
|
|
285
|
+
|
|
286
|
+
# Remove the "employment-summary" nesting
|
|
287
|
+
employments = [
|
|
288
|
+
employment.get("employment-summary", {}) for employment in employments
|
|
289
|
+
]
|
|
290
|
+
|
|
191
291
|
for employment in employments:
|
|
192
|
-
terminated = employment
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
292
|
+
terminated = employment.get("end-date")
|
|
293
|
+
if terminated:
|
|
294
|
+
continue
|
|
295
|
+
|
|
296
|
+
org = employment["organization"]
|
|
297
|
+
aff_id = self._extract_affiliation_id(org)
|
|
298
|
+
|
|
299
|
+
# Skip adding if the ID already exists in result
|
|
300
|
+
if aff_id and any(aff.get("id") == aff_id for aff in result):
|
|
301
|
+
continue
|
|
302
|
+
|
|
303
|
+
# Skip adding if the name exists in result with no ID
|
|
304
|
+
if any(
|
|
305
|
+
aff.get("name") == org["name"] and "id" not in aff for aff in result
|
|
306
|
+
):
|
|
307
|
+
continue
|
|
308
|
+
|
|
309
|
+
aff = {"name": org["name"]}
|
|
310
|
+
if aff_id:
|
|
311
|
+
aff["id"] = aff_id
|
|
312
|
+
|
|
313
|
+
result.append(aff)
|
|
200
314
|
except Exception:
|
|
201
315
|
pass
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
316
|
+
return result
|
|
317
|
+
|
|
318
|
+
def _extract_affiliation_id(self, org):
|
|
319
|
+
"""Extract the affiliation ID from an ORCiD organization."""
|
|
320
|
+
dis_org = org.get("disambiguated-organization")
|
|
321
|
+
if not dis_org:
|
|
322
|
+
return
|
|
323
|
+
|
|
324
|
+
aff_id = None
|
|
325
|
+
org_id = dis_org.get("disambiguated-organization-identifier")
|
|
326
|
+
org_scheme = dis_org.get("disambiguation-source")
|
|
327
|
+
if org_id and org_scheme:
|
|
328
|
+
aff_id = self.org_id_to_affiliation_id(org_scheme, org_id)
|
|
329
|
+
return aff_id
|
|
205
330
|
|
|
206
331
|
|
|
207
332
|
class NamesServiceWriter(ServiceWriter):
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#
|
|
3
|
+
# Copyright (C) 2024 CERN.
|
|
4
|
+
#
|
|
5
|
+
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
|
+
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
7
|
+
# details.
|
|
8
|
+
|
|
9
|
+
"""BODC subjects datastreams, readers, transformers, and writers."""
|
|
10
|
+
|
|
11
|
+
from invenio_vocabularies.datastreams.errors import TransformerError
|
|
12
|
+
from invenio_vocabularies.datastreams.readers import RDFReader
|
|
13
|
+
from invenio_vocabularies.datastreams.transformers import RDFTransformer
|
|
14
|
+
|
|
15
|
+
from ..config import bodc_puv_file_url
|
|
16
|
+
|
|
17
|
+
# Available with the "rdf" extra
|
|
18
|
+
try:
|
|
19
|
+
import rdflib
|
|
20
|
+
except ImportError:
|
|
21
|
+
rdflib = None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class BODCPUVSubjectsTransformer(RDFTransformer):
|
|
25
|
+
"""
|
|
26
|
+
Transformer class to convert BODC-PUV RDF data to a dictionary format.
|
|
27
|
+
|
|
28
|
+
Input:
|
|
29
|
+
- Relevant fields:
|
|
30
|
+
- `skos:notation`: Primary identifier for the concept.
|
|
31
|
+
- `skos:prefLabel`: Preferred labels with language codes.
|
|
32
|
+
- `skos:altLabel`: Alternative labels (optional).
|
|
33
|
+
- `skos:definition`: Definitions of the concept.
|
|
34
|
+
- `owl:deprecated`: Boolean flag indicating if the concept is deprecated.
|
|
35
|
+
|
|
36
|
+
Output:
|
|
37
|
+
- A dictionary with the following structure:
|
|
38
|
+
{
|
|
39
|
+
"id": "SDN:P01::SAGEMSFM", # BODC-specific parameter ID (skos:notation).
|
|
40
|
+
"scheme": "BODC-PUV", # The scheme name indicating this is a BODC Parameter Usage Vocabulary concept.
|
|
41
|
+
"subject": "AMSSedAge", # The alternative label (skos:altLabel), if available, or an empty string.
|
|
42
|
+
"title": {
|
|
43
|
+
"en": "14C age of Foraminiferida" # English preferred label (skos:prefLabel).
|
|
44
|
+
},
|
|
45
|
+
"props": {
|
|
46
|
+
"definition": "Accelerated mass spectrometry on picked tests", # Definition of subject (skos:definition).
|
|
47
|
+
},
|
|
48
|
+
"identifiers": [
|
|
49
|
+
{
|
|
50
|
+
"scheme": "url", # Type of identifier (URL).
|
|
51
|
+
"identifier": "http://vocab.nerc.ac.uk/collection/P01/current/SAGEMSFM" # URI of the concept.
|
|
52
|
+
}
|
|
53
|
+
]
|
|
54
|
+
}
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
def _get_subject_data(self, rdf_graph, subject):
|
|
58
|
+
"""Fetch all triples for a subject and organize them into a dictionary."""
|
|
59
|
+
data = {}
|
|
60
|
+
for predicate, obj in rdf_graph.predicate_objects(subject=subject):
|
|
61
|
+
predicate_name = str(predicate)
|
|
62
|
+
if predicate_name not in data:
|
|
63
|
+
data[predicate_name] = []
|
|
64
|
+
data[predicate_name].append(obj)
|
|
65
|
+
return data
|
|
66
|
+
|
|
67
|
+
def _transform_entry(self, subject, rdf_graph):
|
|
68
|
+
"""Transform an entry to the required dictionary format."""
|
|
69
|
+
labels = self._get_labels(subject, rdf_graph)
|
|
70
|
+
subject_data = self._get_subject_data(rdf_graph, subject)
|
|
71
|
+
deprecated = subject_data.get(str(rdflib.namespace.OWL.deprecated), [False])
|
|
72
|
+
if deprecated and str(deprecated[0]).lower() == "true":
|
|
73
|
+
return None # Skip deprecated subjects
|
|
74
|
+
|
|
75
|
+
notation = subject_data.get(str(self.skos_core.notation), [])
|
|
76
|
+
if notation:
|
|
77
|
+
id = str(notation[0])
|
|
78
|
+
else:
|
|
79
|
+
raise TransformerError(f"No id found for: {subject}")
|
|
80
|
+
|
|
81
|
+
alt_labels = [obj for obj in subject_data.get(str(self.skos_core.altLabel), [])]
|
|
82
|
+
subject_text = str(alt_labels[0]) if alt_labels else ""
|
|
83
|
+
definition = str(subject_data.get(str(self.skos_core.definition), [None])[0])
|
|
84
|
+
|
|
85
|
+
return {
|
|
86
|
+
"id": id,
|
|
87
|
+
"scheme": "BODC-PUV",
|
|
88
|
+
"subject": subject_text,
|
|
89
|
+
"title": labels,
|
|
90
|
+
"props": {"definition": definition} if definition else {},
|
|
91
|
+
"identifiers": self._get_identifiers(subject),
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# Configuration for datastream
|
|
96
|
+
|
|
97
|
+
VOCABULARIES_DATASTREAM_TRANSFORMERS = {"bodc-transformer": BODCPUVSubjectsTransformer}
|
|
98
|
+
|
|
99
|
+
DATASTREAM_CONFIG = {
|
|
100
|
+
"readers": [
|
|
101
|
+
{
|
|
102
|
+
"type": "http",
|
|
103
|
+
"args": {
|
|
104
|
+
"origin": bodc_puv_file_url,
|
|
105
|
+
},
|
|
106
|
+
},
|
|
107
|
+
{"type": "rdf"},
|
|
108
|
+
],
|
|
109
|
+
"transformers": [{"type": "bodc-transformer"}],
|
|
110
|
+
"writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
|
|
111
|
+
}
|
|
@@ -15,10 +15,12 @@ from invenio_i18n import get_locale
|
|
|
15
15
|
from invenio_i18n import lazy_gettext as _
|
|
16
16
|
from invenio_records_resources.services import SearchOptions
|
|
17
17
|
from invenio_records_resources.services.records.components import DataComponent
|
|
18
|
+
from invenio_records_resources.services.records.queryparser import (
|
|
19
|
+
CompositeSuggestQueryParser,
|
|
20
|
+
)
|
|
18
21
|
from werkzeug.local import LocalProxy
|
|
19
22
|
|
|
20
23
|
from ...services.components import PIDComponent
|
|
21
|
-
from ...services.querystr import FilteredSuggestQueryParser
|
|
22
24
|
|
|
23
25
|
subject_schemes = LocalProxy(
|
|
24
26
|
lambda: current_app.config["VOCABULARIES_SUBJECTS_SCHEMES"]
|
|
@@ -34,13 +36,16 @@ euroscivoc_file_url = LocalProxy(
|
|
|
34
36
|
lambda: current_app.config["VOCABULARIES_SUBJECTS_EUROSCIVOC_FILE_URL"]
|
|
35
37
|
)
|
|
36
38
|
|
|
39
|
+
bodc_puv_file_url = LocalProxy(
|
|
40
|
+
lambda: current_app.config["VOCABULARIES_SUBJECTS_BODC_PUV_FILE_URL"]
|
|
41
|
+
)
|
|
42
|
+
|
|
37
43
|
|
|
38
44
|
class SubjectsSearchOptions(SearchOptions):
|
|
39
45
|
"""Search options."""
|
|
40
46
|
|
|
41
|
-
suggest_parser_cls =
|
|
42
|
-
|
|
43
|
-
fields=[ # suggest fields
|
|
47
|
+
suggest_parser_cls = CompositeSuggestQueryParser.factory(
|
|
48
|
+
fields=[
|
|
44
49
|
"subject^100",
|
|
45
50
|
localized_title,
|
|
46
51
|
"synonyms^20",
|
|
@@ -12,6 +12,7 @@ from invenio_access.permissions import system_identity
|
|
|
12
12
|
from invenio_i18n import lazy_gettext as _
|
|
13
13
|
|
|
14
14
|
from ...datastreams.writers import ServiceWriter
|
|
15
|
+
from .bodc import datastreams as bodc_datastreams
|
|
15
16
|
from .euroscivoc import datastreams as euroscivoc_datastreams
|
|
16
17
|
from .gemet import datastreams as gemet_datastreams
|
|
17
18
|
from .mesh import datastreams as mesh_datastreams
|
|
@@ -32,8 +33,6 @@ class SubjectsServiceWriter(ServiceWriter):
|
|
|
32
33
|
|
|
33
34
|
VOCABULARIES_DATASTREAM_READERS = {
|
|
34
35
|
**mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
|
|
35
|
-
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
|
|
36
|
-
**gemet_datastreams.VOCABULARIES_DATASTREAM_READERS,
|
|
37
36
|
}
|
|
38
37
|
"""Subjects Data Streams readers."""
|
|
39
38
|
|
|
@@ -41,14 +40,13 @@ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
|
41
40
|
**mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
|
|
42
41
|
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
|
|
43
42
|
**gemet_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
|
|
43
|
+
**bodc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
|
|
44
44
|
}
|
|
45
45
|
"""Subjects Data Streams transformers."""
|
|
46
46
|
|
|
47
47
|
VOCABULARIES_DATASTREAM_WRITERS = {
|
|
48
48
|
"subjects-service": SubjectsServiceWriter,
|
|
49
49
|
**mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
|
|
50
|
-
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
|
|
51
|
-
**gemet_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
|
|
52
50
|
}
|
|
53
51
|
"""Subjects Data Streams writers."""
|
|
54
52
|
|
|
@@ -14,7 +14,36 @@ from ..config import euroscivoc_file_url
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class EuroSciVocSubjectsTransformer(RDFTransformer):
|
|
17
|
-
"""
|
|
17
|
+
"""
|
|
18
|
+
Transformer class to convert EuroSciVoc RDF data to a dictionary format.
|
|
19
|
+
|
|
20
|
+
Input:
|
|
21
|
+
- Relevant fields:
|
|
22
|
+
- `skos:notation`: Primary identifier for the concept.
|
|
23
|
+
- `skos:prefLabel`: Preferred labels with language codes.
|
|
24
|
+
- `skos:altLabel`: Alternative labels.
|
|
25
|
+
- `skos:broader`: Broader concepts that this concept belongs to.
|
|
26
|
+
|
|
27
|
+
Output:
|
|
28
|
+
{
|
|
29
|
+
"id": "euroscivoc:1717", # EuroSciVoc-specific concept ID (skos:notation).
|
|
30
|
+
"scheme": "EuroSciVoc", # The scheme name indicating this is a EuroSciVoc concept.
|
|
31
|
+
"subject": "Satellite radio", # The primary subject label (first preferred label in English, skos:prefLabel).
|
|
32
|
+
"title": {
|
|
33
|
+
"it": "Radio satellitare", # Italian preferred label (skos:prefLabel).
|
|
34
|
+
"en": "Satellite radio", # English preferred label (skos:prefLabel).
|
|
35
|
+
},
|
|
36
|
+
"props": {
|
|
37
|
+
"parents": "euroscivoc:1225", # The broader concept (skos:broader), identified by its EuroSciVoc Concept ID.
|
|
38
|
+
},
|
|
39
|
+
"identifiers": [
|
|
40
|
+
{
|
|
41
|
+
"scheme": "url", # Type of identifier (URL).
|
|
42
|
+
"identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", # URI of the concept (rdf:about).
|
|
43
|
+
}
|
|
44
|
+
],
|
|
45
|
+
}
|
|
46
|
+
"""
|
|
18
47
|
|
|
19
48
|
def _get_notation(self, subject, rdf_graph):
|
|
20
49
|
"""Extract the numeric notation for a subject."""
|
|
@@ -38,7 +67,6 @@ class EuroSciVocSubjectsTransformer(RDFTransformer):
|
|
|
38
67
|
for n in reversed(self._find_parents(subject, rdf_graph))
|
|
39
68
|
if n
|
|
40
69
|
)
|
|
41
|
-
identifiers = [{"scheme": "url", "identifier": str(subject)}]
|
|
42
70
|
|
|
43
71
|
return {
|
|
44
72
|
"id": id,
|
|
@@ -46,13 +74,11 @@ class EuroSciVocSubjectsTransformer(RDFTransformer):
|
|
|
46
74
|
"subject": labels.get("en", "").capitalize(),
|
|
47
75
|
"title": labels,
|
|
48
76
|
"props": {"parents": parents} if parents else {},
|
|
49
|
-
"identifiers":
|
|
77
|
+
"identifiers": self._get_identifiers(subject),
|
|
50
78
|
}
|
|
51
79
|
|
|
52
80
|
|
|
53
|
-
# Configuration for datastream
|
|
54
|
-
VOCABULARIES_DATASTREAM_READERS = {}
|
|
55
|
-
VOCABULARIES_DATASTREAM_WRITERS = {}
|
|
81
|
+
# Configuration for datastream
|
|
56
82
|
|
|
57
83
|
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
58
84
|
"euroscivoc-transformer": EuroSciVocSubjectsTransformer
|
|
@@ -71,9 +97,5 @@ DATASTREAM_CONFIG = {
|
|
|
71
97
|
},
|
|
72
98
|
],
|
|
73
99
|
"transformers": [{"type": "euroscivoc-transformer"}],
|
|
74
|
-
"writers": [
|
|
75
|
-
{
|
|
76
|
-
"type": "subjects-service",
|
|
77
|
-
}
|
|
78
|
-
],
|
|
100
|
+
"writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
|
|
79
101
|
}
|
|
@@ -20,7 +20,40 @@ except ImportError:
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
class GEMETSubjectsTransformer(RDFTransformer):
|
|
23
|
-
"""
|
|
23
|
+
"""
|
|
24
|
+
Transformer class to convert GEMET RDF data to a dictionary format.
|
|
25
|
+
|
|
26
|
+
Input:
|
|
27
|
+
- Relevant fields:
|
|
28
|
+
- `skos:prefLabel`: Preferred labels with language codes.
|
|
29
|
+
- `skos:broader`: References to broader concepts (parent concepts).
|
|
30
|
+
- `skos:memberOf`: References to groups or themes the concept belongs to.
|
|
31
|
+
|
|
32
|
+
Output:
|
|
33
|
+
- A dictionary with the following structure:
|
|
34
|
+
{
|
|
35
|
+
"id": "gemet:concept/10008", # GEMET-specific concept ID (skos:Concept).
|
|
36
|
+
"scheme": "GEMET", # The scheme name indicating this is a GEMET concept.
|
|
37
|
+
"subject": "Consumer product", # The subject label (first preferred label in English, skos:prefLabel).
|
|
38
|
+
"title": {
|
|
39
|
+
"en": "Consumer product", # English label for the concept (skos:prefLabel).
|
|
40
|
+
"ar": "منتج استهلاكي" # Arabic label for the concept (skos:prefLabel).
|
|
41
|
+
},
|
|
42
|
+
"props": {
|
|
43
|
+
"parents": "gemet:concept/6660", # The parent concept (skos:broader), identified by its GEMET Concept ID.
|
|
44
|
+
"groups": ["http://www.eionet.europa.eu/gemet/group/10112"], # Group the concept belongs to (skos:memberOf)(skos:prefLabel).
|
|
45
|
+
"themes": [
|
|
46
|
+
"http://www.eionet.europa.eu/gemet/theme/27", # Theme the concept belongs to (skos:memberOf)(rdfs:label).
|
|
47
|
+
]
|
|
48
|
+
},
|
|
49
|
+
"identifiers": [
|
|
50
|
+
{
|
|
51
|
+
"scheme": "url", # Type of identifier (URL).
|
|
52
|
+
"identifier": "http://www.eionet.europa.eu/gemet/concept/10008" # URI of the concept (rdf:about).
|
|
53
|
+
}
|
|
54
|
+
]
|
|
55
|
+
}
|
|
56
|
+
"""
|
|
24
57
|
|
|
25
58
|
def _get_parent_notation(self, broader, rdf_graph):
|
|
26
59
|
"""Extract parent notation from GEMET URI."""
|
|
@@ -83,13 +116,11 @@ class GEMETSubjectsTransformer(RDFTransformer):
|
|
|
83
116
|
"subject": labels.get("en", "").capitalize(),
|
|
84
117
|
"title": labels,
|
|
85
118
|
"props": props,
|
|
86
|
-
"identifiers":
|
|
119
|
+
"identifiers": self._get_identifiers(subject),
|
|
87
120
|
}
|
|
88
121
|
|
|
89
122
|
|
|
90
|
-
# Configuration for datastream
|
|
91
|
-
VOCABULARIES_DATASTREAM_READERS = {}
|
|
92
|
-
VOCABULARIES_DATASTREAM_WRITERS = {}
|
|
123
|
+
# Configuration for datastream
|
|
93
124
|
|
|
94
125
|
VOCABULARIES_DATASTREAM_TRANSFORMERS = {"gemet-transformer": GEMETSubjectsTransformer}
|
|
95
126
|
|
|
@@ -48,7 +48,16 @@ class StreamEntry:
|
|
|
48
48
|
class DataStream:
|
|
49
49
|
"""Data stream."""
|
|
50
50
|
|
|
51
|
-
def __init__(
|
|
51
|
+
def __init__(
|
|
52
|
+
self,
|
|
53
|
+
readers,
|
|
54
|
+
writers,
|
|
55
|
+
transformers=None,
|
|
56
|
+
batch_size=100,
|
|
57
|
+
write_many=False,
|
|
58
|
+
*args,
|
|
59
|
+
**kwargs,
|
|
60
|
+
):
|
|
52
61
|
"""Constructor.
|
|
53
62
|
|
|
54
63
|
:param readers: an ordered list of readers.
|
|
@@ -58,12 +67,14 @@ class DataStream:
|
|
|
58
67
|
self._readers = readers
|
|
59
68
|
self._transformers = transformers
|
|
60
69
|
self._writers = writers
|
|
70
|
+
self.batch_size = batch_size
|
|
71
|
+
self.write_many = write_many
|
|
61
72
|
|
|
62
73
|
def filter(self, stream_entry, *args, **kwargs):
|
|
63
74
|
"""Checks if a stream_entry should be filtered out (skipped)."""
|
|
64
75
|
return False
|
|
65
76
|
|
|
66
|
-
def process_batch(self, batch
|
|
77
|
+
def process_batch(self, batch):
|
|
67
78
|
"""Process a batch of entries."""
|
|
68
79
|
transformed_entries = []
|
|
69
80
|
for stream_entry in batch:
|
|
@@ -79,12 +90,12 @@ class DataStream:
|
|
|
79
90
|
else:
|
|
80
91
|
transformed_entries.append(transformed_entry)
|
|
81
92
|
if transformed_entries:
|
|
82
|
-
if write_many:
|
|
93
|
+
if self.write_many:
|
|
83
94
|
yield from self.batch_write(transformed_entries)
|
|
84
95
|
else:
|
|
85
96
|
yield from (self.write(entry) for entry in transformed_entries)
|
|
86
97
|
|
|
87
|
-
def process(self,
|
|
98
|
+
def process(self, *args, **kwargs):
|
|
88
99
|
"""Iterates over the entries.
|
|
89
100
|
|
|
90
101
|
Uses the reader to get the raw entries and transforms them.
|
|
@@ -95,13 +106,13 @@ class DataStream:
|
|
|
95
106
|
batch = []
|
|
96
107
|
for stream_entry in self.read():
|
|
97
108
|
batch.append(stream_entry)
|
|
98
|
-
if len(batch) >= batch_size:
|
|
99
|
-
yield from self.process_batch(batch
|
|
109
|
+
if len(batch) >= self.batch_size:
|
|
110
|
+
yield from self.process_batch(batch)
|
|
100
111
|
batch = []
|
|
101
112
|
|
|
102
113
|
# Process any remaining entries in the last batch
|
|
103
114
|
if batch:
|
|
104
|
-
yield from self.process_batch(batch
|
|
115
|
+
yield from self.process_batch(batch)
|
|
105
116
|
|
|
106
117
|
def read(self):
|
|
107
118
|
"""Recursively read the entries."""
|
|
@@ -81,4 +81,6 @@ class DataStreamFactory:
|
|
|
81
81
|
for t_conf in transformers_config:
|
|
82
82
|
transformers.append(TransformerFactory.create(t_conf))
|
|
83
83
|
|
|
84
|
-
return DataStream(
|
|
84
|
+
return DataStream(
|
|
85
|
+
readers=readers, writers=writers, transformers=transformers, **kwargs
|
|
86
|
+
)
|
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
"""Transformers module."""
|
|
10
10
|
|
|
11
11
|
from abc import ABC, abstractmethod
|
|
12
|
+
from urllib.parse import urlparse
|
|
12
13
|
|
|
13
14
|
from lxml import etree
|
|
14
15
|
|
|
@@ -76,6 +77,17 @@ class RDFTransformer(BaseTransformer):
|
|
|
76
77
|
"""Get the SKOS core namespace."""
|
|
77
78
|
return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
|
|
78
79
|
|
|
80
|
+
def _validate_subject_url(self, subject):
|
|
81
|
+
"""Check if the subject is a valid URL."""
|
|
82
|
+
parsed = urlparse(str(subject))
|
|
83
|
+
return bool(parsed.netloc and parsed.scheme)
|
|
84
|
+
|
|
85
|
+
def _get_identifiers(self, subject):
|
|
86
|
+
"""Generate identifiers field for a valid subject URL."""
|
|
87
|
+
if self._validate_subject_url(subject):
|
|
88
|
+
return [{"scheme": "url", "identifier": str(subject)}]
|
|
89
|
+
return []
|
|
90
|
+
|
|
79
91
|
def _get_labels(self, subject, rdf_graph):
|
|
80
92
|
"""Extract labels (prefLabel or altLabel) for a subject."""
|
|
81
93
|
labels = {
|
|
@@ -12,6 +12,7 @@ from abc import ABC, abstractmethod
|
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
|
|
14
14
|
import yaml
|
|
15
|
+
from flask import current_app
|
|
15
16
|
from invenio_access.permissions import system_identity
|
|
16
17
|
from invenio_pidstore.errors import PIDAlreadyExists, PIDDoesNotExistError
|
|
17
18
|
from invenio_records.systemfields.relations.errors import InvalidRelationValue
|
|
@@ -120,11 +121,14 @@ class ServiceWriter(BaseWriter):
|
|
|
120
121
|
|
|
121
122
|
def write_many(self, stream_entries, *args, **kwargs):
|
|
122
123
|
"""Writes the input entries using a given service."""
|
|
124
|
+
current_app.logger.info(f"Writing {len(stream_entries)} entries")
|
|
123
125
|
entries = [entry.entry for entry in stream_entries]
|
|
124
126
|
entries_with_id = [(self._entry_id(entry), entry) for entry in entries]
|
|
125
|
-
|
|
127
|
+
result_list = self._service.create_or_update_many(
|
|
128
|
+
self._identity, entries_with_id
|
|
129
|
+
)
|
|
126
130
|
stream_entries_processed = []
|
|
127
|
-
for entry, result in zip(entries, results):
|
|
131
|
+
for entry, result in zip(entries, result_list.results):
|
|
128
132
|
processed_stream_entry = StreamEntry(
|
|
129
133
|
entry=entry,
|
|
130
134
|
record=result.record,
|
|
@@ -28,7 +28,12 @@ from .contrib.awards.datastreams import (
|
|
|
28
28
|
)
|
|
29
29
|
from .contrib.funders.datastreams import DATASTREAM_CONFIG as funders_ds_config
|
|
30
30
|
from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config
|
|
31
|
+
from .contrib.subjects.bodc.datastreams import DATASTREAM_CONFIG as bodc_ds_config
|
|
31
32
|
from .contrib.subjects.datastreams import DATASTREAM_CONFIG as subjects_ds_config
|
|
33
|
+
from .contrib.subjects.euroscivoc.datastreams import (
|
|
34
|
+
DATASTREAM_CONFIG as euroscivoc_ds_config,
|
|
35
|
+
)
|
|
36
|
+
from .contrib.subjects.gemet.datastreams import DATASTREAM_CONFIG as gemet_ds_config
|
|
32
37
|
|
|
33
38
|
|
|
34
39
|
class VocabularyConfig:
|
|
@@ -137,6 +142,39 @@ class AffiliationsEDMOVocabularyConfig(VocabularyConfig):
|
|
|
137
142
|
raise NotImplementedError("Service not implemented for EDMO Affiliations")
|
|
138
143
|
|
|
139
144
|
|
|
145
|
+
class SubjectsEuroSciVocVocabularyConfig(VocabularyConfig):
|
|
146
|
+
"""EuroSciVoc Subjects Vocabulary Config."""
|
|
147
|
+
|
|
148
|
+
config = euroscivoc_ds_config
|
|
149
|
+
vocabulary_name = "subjects:euroscivoc"
|
|
150
|
+
|
|
151
|
+
def get_service(self):
|
|
152
|
+
"""Get the service for the vocabulary."""
|
|
153
|
+
raise NotImplementedError("Service not implemented for EuroSciVoc Subjects")
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class SubjectsGEMETVocabularyConfig(VocabularyConfig):
|
|
157
|
+
"""GEMET Subjects Vocabulary Config."""
|
|
158
|
+
|
|
159
|
+
config = gemet_ds_config
|
|
160
|
+
vocabulary_name = "subjects:gemet"
|
|
161
|
+
|
|
162
|
+
def get_service(self):
|
|
163
|
+
"""Get the service for the vocabulary."""
|
|
164
|
+
raise NotImplementedError("Service not implemented for GEMET Subjects")
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class SubjectsBODCVocabularyConfig(VocabularyConfig):
|
|
168
|
+
"""BODC Subjects Vocabulary Config."""
|
|
169
|
+
|
|
170
|
+
config = bodc_ds_config
|
|
171
|
+
vocabulary_name = "subjects:bodc-puv"
|
|
172
|
+
|
|
173
|
+
def get_service(self):
|
|
174
|
+
"""Get the service for the vocabulary."""
|
|
175
|
+
raise NotImplementedError("Service not implemented for BODC Subjects")
|
|
176
|
+
|
|
177
|
+
|
|
140
178
|
def get_vocabulary_config(vocabulary):
|
|
141
179
|
"""Factory function to get the appropriate Vocabulary Config."""
|
|
142
180
|
vocab_config = {
|
|
@@ -148,5 +186,8 @@ def get_vocabulary_config(vocabulary):
|
|
|
148
186
|
"affiliations:openaire": AffiliationsOpenAIREVocabularyConfig,
|
|
149
187
|
"affiliations:edmo": AffiliationsEDMOVocabularyConfig,
|
|
150
188
|
"subjects": SubjectsVocabularyConfig,
|
|
189
|
+
"subjects:gemet": SubjectsGEMETVocabularyConfig,
|
|
190
|
+
"subjects:bodc": SubjectsBODCVocabularyConfig,
|
|
191
|
+
"subjects:euroscivoc": SubjectsEuroSciVocVocabularyConfig,
|
|
151
192
|
}
|
|
152
193
|
return vocab_config.get(vocabulary, VocabularyConfig)()
|
invenio_vocabularies/fixtures.py
CHANGED
|
@@ -28,6 +28,8 @@ class VocabularyFixture:
|
|
|
28
28
|
readers_config=config["readers"],
|
|
29
29
|
transformers_config=config.get("transformers"),
|
|
30
30
|
writers_config=config["writers"],
|
|
31
|
+
batch_size=config.get("batch_size", 1000),
|
|
32
|
+
write_many=config.get("write_many", False),
|
|
31
33
|
)
|
|
32
34
|
|
|
33
35
|
errors = []
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
|
|
9
9
|
"""Querystring parsing."""
|
|
10
10
|
|
|
11
|
+
import warnings
|
|
11
12
|
from functools import partial
|
|
12
13
|
|
|
13
14
|
from invenio_records_resources.services.records.params import SuggestQueryParser
|
|
@@ -20,6 +21,10 @@ class FilteredSuggestQueryParser(SuggestQueryParser):
|
|
|
20
21
|
@classmethod
|
|
21
22
|
def factory(cls, filter_field=None, **extra_params):
|
|
22
23
|
"""Create a prepared instance of the query parser."""
|
|
24
|
+
warnings.warn(
|
|
25
|
+
"FilteredSuggestQueryParser is deprecated, use SuggestQueryParser or CompositeSuggestQueryParser instead",
|
|
26
|
+
DeprecationWarning,
|
|
27
|
+
)
|
|
23
28
|
return partial(cls, filter_field=filter_field, extra_params=extra_params)
|
|
24
29
|
|
|
25
30
|
def __init__(self, identity=None, filter_field=None, extra_params=None):
|
|
@@ -20,6 +20,8 @@ def process_datastream(config):
|
|
|
20
20
|
readers_config=config["readers"],
|
|
21
21
|
transformers_config=config.get("transformers"),
|
|
22
22
|
writers_config=config["writers"],
|
|
23
|
+
batch_size=config.get("batch_size", 1000),
|
|
24
|
+
write_many=config.get("write_many", False),
|
|
23
25
|
)
|
|
24
26
|
|
|
25
27
|
for result in ds.process():
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: invenio-vocabularies
|
|
3
|
-
Version: 6.
|
|
3
|
+
Version: 6.9.0
|
|
4
4
|
Summary: Invenio module for managing vocabularies.
|
|
5
5
|
Home-page: https://github.com/inveniosoftware/invenio-vocabularies
|
|
6
6
|
Author: CERN
|
|
@@ -88,6 +88,17 @@ https://invenio-vocabularies.readthedocs.io/
|
|
|
88
88
|
Changes
|
|
89
89
|
=======
|
|
90
90
|
|
|
91
|
+
Version v6.9.0 (released 2024-12-09)
|
|
92
|
+
|
|
93
|
+
- schema: added identifiers in affiliations relation
|
|
94
|
+
|
|
95
|
+
Version v6.8.0 (released 2024-12-09)
|
|
96
|
+
|
|
97
|
+
- names: extract affiliation identifiers from employments
|
|
98
|
+
- names: optimize memory usage on ORCID sync
|
|
99
|
+
- subjects: improve search with CompositeSuggestQueryParser
|
|
100
|
+
- subjects: added datastream for bodc
|
|
101
|
+
|
|
91
102
|
Version v6.7.0 (released 2024-11-27)
|
|
92
103
|
|
|
93
104
|
- contrib: improve search accuracy for names, funders, affiliations
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
invenio_vocabularies/__init__.py,sha256=
|
|
2
|
-
invenio_vocabularies/cli.py,sha256=
|
|
3
|
-
invenio_vocabularies/config.py,sha256=
|
|
1
|
+
invenio_vocabularies/__init__.py,sha256=sQ6j-Dnfro84dUjb1SBr8X7MucAMXh1112UnZK_-tZE,377
|
|
2
|
+
invenio_vocabularies/cli.py,sha256=CpXTTIn2GTpUqNfLEMlRAp3JWst8ZjHVxoGYdhuuv_4,5959
|
|
3
|
+
invenio_vocabularies/config.py,sha256=h9Iied753mmZwZZHe5COMqUYvV-zSQtx763EIkUVb1Q,6973
|
|
4
4
|
invenio_vocabularies/ext.py,sha256=GujJ4UARd4Fxf4z7zznRk9JAgHamZuYCOdrKU5czg00,5987
|
|
5
|
-
invenio_vocabularies/factories.py,sha256=
|
|
6
|
-
invenio_vocabularies/fixtures.py,sha256=
|
|
5
|
+
invenio_vocabularies/factories.py,sha256=lRHPGPos6GdXf0yAhB1d7iMjVfpOeFDZRL9PPZvuWlY,6408
|
|
6
|
+
invenio_vocabularies/fixtures.py,sha256=iEPkWf_ZjdP2D9r2sLdIlPoR8Rq2m5cnoFwywUGHneg,1696
|
|
7
7
|
invenio_vocabularies/jobs.py,sha256=0aTukWooBPCvEgvnjJcQAZuMeS2H_m-RGULNIfJ5Gmc,6800
|
|
8
8
|
invenio_vocabularies/proxies.py,sha256=k7cTUgWfnCoYIuNqAj_VFi1zBN33KNNclRSVnBkObEM,711
|
|
9
9
|
invenio_vocabularies/views.py,sha256=PNJ5nvc3O7ASwNe56xmqy5YaU9n3UYF3W2JwvtE_kYs,1561
|
|
@@ -51,7 +51,7 @@ invenio_vocabularies/contrib/affiliations/datastreams.py,sha256=sMvkt9XOBTV7Q0we
|
|
|
51
51
|
invenio_vocabularies/contrib/affiliations/facets.py,sha256=w316MGvtdyTpRCPOpCEmMxxLraRkbFFb1VvLkFlEc9o,1229
|
|
52
52
|
invenio_vocabularies/contrib/affiliations/models.py,sha256=JUcj-1ydc2Cw2Rsc24JwXE3TFBJ_6fivhUYhGq4rT8A,329
|
|
53
53
|
invenio_vocabularies/contrib/affiliations/resources.py,sha256=DBEbRxQmp-o-PeZlgFG588Q4sGcruuwIL8L9O-SzCes,435
|
|
54
|
-
invenio_vocabularies/contrib/affiliations/schema.py,sha256=
|
|
54
|
+
invenio_vocabularies/contrib/affiliations/schema.py,sha256=geORDYdBIWnv81Txl07qdHhB3U_fo9ObVp7UrSlCLRI,2104
|
|
55
55
|
invenio_vocabularies/contrib/affiliations/services.py,sha256=KJbv46c2LuQOW3xz7KVLtfZjWR8vhMRPHninlUEhrss,395
|
|
56
56
|
invenio_vocabularies/contrib/affiliations/jsonschemas/__init__.py,sha256=ILyZ5kejTr0p50macMBPALQCTJSe4KEE3_cgf2p3zV4,252
|
|
57
57
|
invenio_vocabularies/contrib/affiliations/jsonschemas/affiliations/affiliation-v1.0.0.json,sha256=be-glRNIBtIO87Tcyw8d68OdG4J8-ojjiCj8UJBnckg,1649
|
|
@@ -113,7 +113,7 @@ invenio_vocabularies/contrib/funders/mappings/v7/funders/funder-v1.0.0.json,sha2
|
|
|
113
113
|
invenio_vocabularies/contrib/names/__init__.py,sha256=DBfsM7JMETZGaV5QmXEwE7zhCaAXvc2SZN6uXnW_V-c,451
|
|
114
114
|
invenio_vocabularies/contrib/names/api.py,sha256=sEPn_jFX3gyoxgbdEUSIvOoPCUI8pocI6qCZO6mzCgQ,300
|
|
115
115
|
invenio_vocabularies/contrib/names/config.py,sha256=9sb5novWuQYXg_5Egexn52mjgGd1D_D9UKyQ1fmIuh4,1977
|
|
116
|
-
invenio_vocabularies/contrib/names/datastreams.py,sha256=
|
|
116
|
+
invenio_vocabularies/contrib/names/datastreams.py,sha256=mmhtdrda6b4c83dRjxVF5JTqtkt92GSEMHTU6TzQtHw,14570
|
|
117
117
|
invenio_vocabularies/contrib/names/models.py,sha256=SYdtDDG-y5Wq_d06YhiVO5n8gfxPW_mx-tECsIcv5H8,308
|
|
118
118
|
invenio_vocabularies/contrib/names/names.py,sha256=_kBJBcPuANgUHlZ8RoVkpfJwzR5qaOQCBIyZusjKoCE,2509
|
|
119
119
|
invenio_vocabularies/contrib/names/permissions.py,sha256=5xrpYsA3oQUJ5lJpF7wjRAFiW-pM6_yP1k9zllbRwnQ,844
|
|
@@ -134,18 +134,20 @@ invenio_vocabularies/contrib/names/mappings/v7/__init__.py,sha256=qLGB8C0kPI3xub
|
|
|
134
134
|
invenio_vocabularies/contrib/names/mappings/v7/names/name-v1.0.0.json,sha256=5Ybcq3fUMYx3u1MNKmHh-CWBtATS9MYpdEcwAM8EQ80,1943
|
|
135
135
|
invenio_vocabularies/contrib/subjects/__init__.py,sha256=GtXZKA6VWG1oA1fUX2Wh92nd-1i7RnnQF6RprGhxkD4,591
|
|
136
136
|
invenio_vocabularies/contrib/subjects/api.py,sha256=QH8mxoLsa8qjJT1i1Tj6rRnpbH23plo2IMOJ56rnvbU,347
|
|
137
|
-
invenio_vocabularies/contrib/subjects/config.py,sha256=
|
|
138
|
-
invenio_vocabularies/contrib/subjects/datastreams.py,sha256=
|
|
137
|
+
invenio_vocabularies/contrib/subjects/config.py,sha256=6svsCjiptqWB5x3NlG6wDH_dehdQYRTPKDCkNc9MtNA,2169
|
|
138
|
+
invenio_vocabularies/contrib/subjects/datastreams.py,sha256=YRdUP0saks5LGVuFjDhhyJdsiYzkysLTBja32I4x9eU,1888
|
|
139
139
|
invenio_vocabularies/contrib/subjects/facets.py,sha256=qQ7_rppFBzsmrlZu4-MvOIdUcjeOmDA9gOHAcs0lWwI,695
|
|
140
140
|
invenio_vocabularies/contrib/subjects/models.py,sha256=8XgbVRxDDvhWPjMWsoCriNlOKdmV_113a14yLRtlvM4,363
|
|
141
141
|
invenio_vocabularies/contrib/subjects/resources.py,sha256=0KRfUMizwgIziZybk4HnIjiSsXbrCv_XmguNPwnxoo8,506
|
|
142
142
|
invenio_vocabularies/contrib/subjects/schema.py,sha256=VOW8a9Ob5M-mKrict2bApdFyTpHBwCTJZSxrm93Puv0,3516
|
|
143
143
|
invenio_vocabularies/contrib/subjects/services.py,sha256=s1U6HMmpjuz7rrgR0DtT9C28TC6sZEeDTsa4Jh1TXQk,864
|
|
144
144
|
invenio_vocabularies/contrib/subjects/subjects.py,sha256=NwZycExLyV8l7ikGStH4GOecVuDSxFT70KoNv6qC78I,1877
|
|
145
|
+
invenio_vocabularies/contrib/subjects/bodc/__init__.py,sha256=RlJVmWpbRgDcpx61ITjco3IqHkwZwIypeo2Dt2_AWRc,241
|
|
146
|
+
invenio_vocabularies/contrib/subjects/bodc/datastreams.py,sha256=RgFJTrr-eMyKrS2MuGK4QHhOkPseMwpmsKKpEqE_tgs,4220
|
|
145
147
|
invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py,sha256=e5L9E4l5JHqVzijAX8tn2DIa2n01vJ5wOAZdN62RnIo,247
|
|
146
|
-
invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py,sha256=
|
|
148
|
+
invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py,sha256=Vs4mpIn321KZ94lzTxpYnQTATle1QdKg0yegmDMptw4,3565
|
|
147
149
|
invenio_vocabularies/contrib/subjects/gemet/__init__.py,sha256=OlRWH2gumZZ1Djc_N3ZGPHyt2wOcIwlDDYO6uOfaZfI,242
|
|
148
|
-
invenio_vocabularies/contrib/subjects/gemet/datastreams.py,sha256=
|
|
150
|
+
invenio_vocabularies/contrib/subjects/gemet/datastreams.py,sha256=OZaKnT6cw3cjNtB_TxEBtVwWWf1Wrm-x9h71YXMAmBk,5203
|
|
149
151
|
invenio_vocabularies/contrib/subjects/jsonschemas/__init__.py,sha256=WowVUST1JoEDS3-xeHhCJvIgC9nzMkFs8XRks9zgzaM,292
|
|
150
152
|
invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json,sha256=O1IsPWrVeuEiMBKtADcRByFNmd1soABgODOnauEJBoI,1868
|
|
151
153
|
invenio_vocabularies/contrib/subjects/mappings/__init__.py,sha256=Qk-yj1ENsTmijO8ImWuDYGzXi6QQ2VjP4DbjrpRfDk8,243
|
|
@@ -158,13 +160,13 @@ invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json,s
|
|
|
158
160
|
invenio_vocabularies/contrib/subjects/mesh/__init__.py,sha256=P44hmgVNNTN5O_EmWgaeYJ91yqkGNoeKYo0wfif_wE4,241
|
|
159
161
|
invenio_vocabularies/contrib/subjects/mesh/datastreams.py,sha256=6W6bgQ7P_31kf3enkAqCBTFBqgrQ2BlV625vn0N9ibQ,1544
|
|
160
162
|
invenio_vocabularies/datastreams/__init__.py,sha256=VPefh6k4Q3eYxKIW8I5zXUGucntp7VHxaOR5Vhgkfmg,412
|
|
161
|
-
invenio_vocabularies/datastreams/datastreams.py,sha256=
|
|
163
|
+
invenio_vocabularies/datastreams/datastreams.py,sha256=mAi_xUDmDWpc3NyhU1TMOhqVRbwYu_meJ9UY6-wgBKQ,6169
|
|
162
164
|
invenio_vocabularies/datastreams/errors.py,sha256=IDUZ3gNtYGrhcOgApHCms1gNNJTyJzoMPmG5JtIeYNU,678
|
|
163
|
-
invenio_vocabularies/datastreams/factories.py,sha256=
|
|
165
|
+
invenio_vocabularies/datastreams/factories.py,sha256=kuuN4Zt7Xw58rwf0M03djqcdZOZRWgJdLK16-HmID24,2213
|
|
164
166
|
invenio_vocabularies/datastreams/readers.py,sha256=DUuV-D2PLio3nVR0J-2knASq8rB-H14QBr3DoRL6UgA,14352
|
|
165
167
|
invenio_vocabularies/datastreams/tasks.py,sha256=0fuH_PRt9Ncv6WHM4pkYmfheRVGDKkERZiMPvgV4bZU,1129
|
|
166
|
-
invenio_vocabularies/datastreams/transformers.py,sha256=
|
|
167
|
-
invenio_vocabularies/datastreams/writers.py,sha256=
|
|
168
|
+
invenio_vocabularies/datastreams/transformers.py,sha256=PJFbmRSj3dpJ95NzONAIns5ksztshd99JOp_FLQAlJM,4133
|
|
169
|
+
invenio_vocabularies/datastreams/writers.py,sha256=VIXx9klJaCEdscaKqi2zO959cc157YUGjVYdeTfhTTI,6861
|
|
168
170
|
invenio_vocabularies/datastreams/xml.py,sha256=HFa-lfxj7kFrr2IjeN1jxSLDfcvpBwO9nZLZF2-BryE,997
|
|
169
171
|
invenio_vocabularies/records/__init__.py,sha256=Uj7O6fYdAtLOkLXUGSAYPADBB7aqP4yVs9b6OAjA158,243
|
|
170
172
|
invenio_vocabularies/records/api.py,sha256=Lynt6Sz4BVN1orh0zgJ5ljhnUobEtcq8c22PmSeUo2U,1494
|
|
@@ -194,11 +196,11 @@ invenio_vocabularies/services/config.py,sha256=A9_r2vErcfo3Xt6fC4YVobHXdd64_YyI7
|
|
|
194
196
|
invenio_vocabularies/services/facets.py,sha256=qvdHoGSJJr90dZHSVe0-hlO1r0LtTnFVSjrt9PNuNAg,3872
|
|
195
197
|
invenio_vocabularies/services/generators.py,sha256=jcXwb9Hiyek4o-cQ1G2osVgbTBKDbd-5siJMBOWE018,1116
|
|
196
198
|
invenio_vocabularies/services/permissions.py,sha256=83rNOwCuggdJji3VtWTQgytTrhfiWqASCpvI75DxEus,960
|
|
197
|
-
invenio_vocabularies/services/querystr.py,sha256=
|
|
199
|
+
invenio_vocabularies/services/querystr.py,sha256=OrNUR_QAcQ_T-EiL3H1Jvzz9gK2ZB5FicsG0fOipSro,2029
|
|
198
200
|
invenio_vocabularies/services/results.py,sha256=6LZIpzWSbt9wpRNWgjA1uIM4RFooOYTkHcp5-PnIJdU,3767
|
|
199
201
|
invenio_vocabularies/services/schema.py,sha256=mwIBFylpQlWw1M6h_axc-z4Yd7X3Z1S0PxJOlZGpfrQ,4634
|
|
200
202
|
invenio_vocabularies/services/service.py,sha256=9QQDsG1WShCpBVFze-Dnq-iC2BwNX_0-qzfzrpImJo8,6469
|
|
201
|
-
invenio_vocabularies/services/tasks.py,sha256=
|
|
203
|
+
invenio_vocabularies/services/tasks.py,sha256=xKEymph1M-wFjPLeCGkqvnuYdPMMHgxhCCdC0j44Pi4,891
|
|
202
204
|
invenio_vocabularies/services/custom_fields/__init__.py,sha256=QgvSsn-S1xLzbZ57pjjGTt5oI3HqzXHVjwGTtuPgzN8,421
|
|
203
205
|
invenio_vocabularies/services/custom_fields/subject.py,sha256=ZM-ZkaxoouF9lL62smOtLxsjQQZwiQs0jG3qGruP6nY,2231
|
|
204
206
|
invenio_vocabularies/services/custom_fields/vocabulary.py,sha256=oQwI8Aoi2Nr9k3eWKnde5H7RXc7qdlATSeI6coy8UR0,3020
|
|
@@ -300,10 +302,10 @@ invenio_vocabularies/translations/zh_CN/LC_MESSAGES/messages.mo,sha256=g1I5aNO8r
|
|
|
300
302
|
invenio_vocabularies/translations/zh_CN/LC_MESSAGES/messages.po,sha256=vg8qC8ofpAdJ3mQz7mWM1ylKDpiNWXFs7rlMdSPkgKk,4629
|
|
301
303
|
invenio_vocabularies/translations/zh_TW/LC_MESSAGES/messages.mo,sha256=cqSm8NtMAwrP9O6qbmtkDtRT1e9D93qpsJN5X9_PPVw,600
|
|
302
304
|
invenio_vocabularies/translations/zh_TW/LC_MESSAGES/messages.po,sha256=9ACePz_EpB-LfcIJajZ2kp8Q04tcdrQLOtug162ZUss,4115
|
|
303
|
-
invenio_vocabularies-6.
|
|
304
|
-
invenio_vocabularies-6.
|
|
305
|
-
invenio_vocabularies-6.
|
|
306
|
-
invenio_vocabularies-6.
|
|
307
|
-
invenio_vocabularies-6.
|
|
308
|
-
invenio_vocabularies-6.
|
|
309
|
-
invenio_vocabularies-6.
|
|
305
|
+
invenio_vocabularies-6.9.0.dist-info/AUTHORS.rst,sha256=8d0p_WWE1r9DavvzMDi2D4YIGBHiMYcN3LYxqQOj8sY,291
|
|
306
|
+
invenio_vocabularies-6.9.0.dist-info/LICENSE,sha256=UvI8pR8jGWqe0sTkb_hRG6eIrozzWwWzyCGEpuXX4KE,1062
|
|
307
|
+
invenio_vocabularies-6.9.0.dist-info/METADATA,sha256=DDZji_utemzfuJq2CxCfHfyIwQApzq4hDckuaKpIsRo,12303
|
|
308
|
+
invenio_vocabularies-6.9.0.dist-info/WHEEL,sha256=-G_t0oGuE7UD0DrSpVZnq1hHMBV9DD2XkS5v7XpmTnk,110
|
|
309
|
+
invenio_vocabularies-6.9.0.dist-info/entry_points.txt,sha256=ud9nfdMlhO_mu3okwmy5vQD48r3-rCU_pSR-lUtLeYE,3180
|
|
310
|
+
invenio_vocabularies-6.9.0.dist-info/top_level.txt,sha256=x1gRNbaODF_bCD0SBLM3nVOFPGi06cmGX5X94WKrFKk,21
|
|
311
|
+
invenio_vocabularies-6.9.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{invenio_vocabularies-6.7.0.dist-info → invenio_vocabularies-6.9.0.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|