invenio-vocabularies 5.1.0__py2.py3-none-any.whl → 6.1.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of invenio-vocabularies might be problematic. Click here for more details.
- invenio_vocabularies/__init__.py +1 -1
- invenio_vocabularies/cli.py +7 -2
- invenio_vocabularies/config.py +13 -0
- invenio_vocabularies/contrib/affiliations/datastreams.py +95 -1
- invenio_vocabularies/contrib/awards/awards.py +15 -4
- invenio_vocabularies/contrib/awards/datastreams.py +156 -60
- invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +35 -0
- invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +44 -1
- invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +44 -1
- invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +44 -1
- invenio_vocabularies/contrib/awards/schema.py +16 -1
- invenio_vocabularies/contrib/awards/serializer.py +8 -1
- invenio_vocabularies/contrib/common/openaire/__init__.py +9 -0
- invenio_vocabularies/contrib/common/openaire/datastreams.py +84 -0
- invenio_vocabularies/contrib/common/ror/datastreams.py +20 -7
- invenio_vocabularies/contrib/names/datastreams.py +12 -2
- invenio_vocabularies/contrib/names/names.py +4 -3
- invenio_vocabularies/contrib/names/permissions.py +20 -0
- invenio_vocabularies/contrib/subjects/datastreams.py +12 -6
- invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py +9 -0
- invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +171 -0
- invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +16 -0
- invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json +14 -0
- invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json +14 -0
- invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json +14 -0
- invenio_vocabularies/contrib/subjects/mesh/__init__.py +9 -0
- invenio_vocabularies/contrib/subjects/schema.py +30 -6
- invenio_vocabularies/datastreams/readers.py +15 -4
- invenio_vocabularies/datastreams/transformers.py +15 -4
- invenio_vocabularies/datastreams/writers.py +44 -12
- invenio_vocabularies/factories.py +30 -0
- invenio_vocabularies/jobs.py +88 -0
- {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/METADATA +16 -1
- {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/RECORD +39 -32
- {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/entry_points.txt +3 -0
- {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/AUTHORS.rst +0 -0
- {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/LICENSE +0 -0
- {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/WHEEL +0 -0
- {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/top_level.txt +0 -0
|
@@ -58,12 +58,55 @@
|
|
|
58
58
|
"acronym": {
|
|
59
59
|
"type": "keyword",
|
|
60
60
|
"fields": {
|
|
61
|
-
"text": { "type": "text"}
|
|
61
|
+
"text": { "type": "text" }
|
|
62
62
|
}
|
|
63
63
|
},
|
|
64
64
|
"program": {
|
|
65
65
|
"type": "keyword"
|
|
66
66
|
},
|
|
67
|
+
"subjects": {
|
|
68
|
+
"properties": {
|
|
69
|
+
"@v": {
|
|
70
|
+
"type": "keyword"
|
|
71
|
+
},
|
|
72
|
+
"id": {
|
|
73
|
+
"type": "keyword"
|
|
74
|
+
},
|
|
75
|
+
"props": {
|
|
76
|
+
"type": "object",
|
|
77
|
+
"dynamic": "true"
|
|
78
|
+
},
|
|
79
|
+
"subject": {
|
|
80
|
+
"type": "keyword"
|
|
81
|
+
},
|
|
82
|
+
"scheme": {
|
|
83
|
+
"type": "keyword"
|
|
84
|
+
},
|
|
85
|
+
"identifiers": {
|
|
86
|
+
"properties": {
|
|
87
|
+
"identifier": {
|
|
88
|
+
"type": "keyword"
|
|
89
|
+
},
|
|
90
|
+
"scheme": {
|
|
91
|
+
"type": "keyword"
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
},
|
|
97
|
+
"organizations": {
|
|
98
|
+
"properties": {
|
|
99
|
+
"scheme": {
|
|
100
|
+
"type": "keyword"
|
|
101
|
+
},
|
|
102
|
+
"id": {
|
|
103
|
+
"type": "keyword"
|
|
104
|
+
},
|
|
105
|
+
"organization": {
|
|
106
|
+
"type": "keyword"
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
},
|
|
67
110
|
"funder": {
|
|
68
111
|
"type": "object",
|
|
69
112
|
"properties": {
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2021-
|
|
3
|
+
# Copyright (C) 2021-2024 CERN.
|
|
4
4
|
#
|
|
5
5
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
@@ -17,13 +17,24 @@ from marshmallow_utils.schemas import IdentifierSchema
|
|
|
17
17
|
|
|
18
18
|
from ...services.schema import (
|
|
19
19
|
BaseVocabularySchema,
|
|
20
|
+
ContribVocabularyRelationSchema,
|
|
20
21
|
ModePIDFieldVocabularyMixin,
|
|
21
22
|
i18n_strings,
|
|
22
23
|
)
|
|
23
24
|
from ..funders.schema import FunderRelationSchema
|
|
25
|
+
from ..subjects.schema import SubjectRelationSchema
|
|
24
26
|
from .config import award_schemes
|
|
25
27
|
|
|
26
28
|
|
|
29
|
+
class AwardOrganizationRelationSchema(ContribVocabularyRelationSchema):
|
|
30
|
+
"""Schema to define an organization relation in an award."""
|
|
31
|
+
|
|
32
|
+
ftf_name = "organization"
|
|
33
|
+
parent_field_name = "organizations"
|
|
34
|
+
organization = SanitizedUnicode()
|
|
35
|
+
scheme = SanitizedUnicode()
|
|
36
|
+
|
|
37
|
+
|
|
27
38
|
class AwardSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
|
|
28
39
|
"""Award schema."""
|
|
29
40
|
|
|
@@ -46,6 +57,10 @@ class AwardSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
|
|
|
46
57
|
|
|
47
58
|
program = SanitizedUnicode()
|
|
48
59
|
|
|
60
|
+
subjects = fields.List(fields.Nested(SubjectRelationSchema))
|
|
61
|
+
|
|
62
|
+
organizations = fields.List(fields.Nested(AwardOrganizationRelationSchema))
|
|
63
|
+
|
|
49
64
|
id = SanitizedUnicode(
|
|
50
65
|
validate=validate.Length(min=1, error=_("PID cannot be blank."))
|
|
51
66
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2022 CERN.
|
|
3
|
+
# Copyright (C) 2022-2024 CERN.
|
|
4
4
|
#
|
|
5
5
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
@@ -12,6 +12,9 @@ from marshmallow import Schema, fields
|
|
|
12
12
|
|
|
13
13
|
from invenio_vocabularies.resources import L10NString
|
|
14
14
|
|
|
15
|
+
from ..subjects.schema import SubjectRelationSchema
|
|
16
|
+
from .schema import AwardOrganizationRelationSchema
|
|
17
|
+
|
|
15
18
|
|
|
16
19
|
class IdentifierSchema(Schema):
|
|
17
20
|
"""Identifier scheme."""
|
|
@@ -37,4 +40,8 @@ class AwardL10NItemSchema(Schema):
|
|
|
37
40
|
acronym = fields.String(dump_only=True)
|
|
38
41
|
program = fields.String(dump_only=True)
|
|
39
42
|
funder = fields.Nested(FunderRelationSchema, dump_only=True)
|
|
43
|
+
subjects = fields.List(fields.Nested(SubjectRelationSchema), dump_only=True)
|
|
40
44
|
identifiers = fields.List(fields.Nested(IdentifierSchema), dump_only=True)
|
|
45
|
+
organizations = fields.List(
|
|
46
|
+
fields.Nested(AwardOrganizationRelationSchema), dump_only=True
|
|
47
|
+
)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#
|
|
3
|
+
# Copyright (C) 2024 CERN.
|
|
4
|
+
#
|
|
5
|
+
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
|
+
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
7
|
+
# details.
|
|
8
|
+
|
|
9
|
+
"""OpenAIRE-related Datastreams Readers/Writers/Transformers module."""
|
|
10
|
+
|
|
11
|
+
import io
|
|
12
|
+
|
|
13
|
+
import requests
|
|
14
|
+
|
|
15
|
+
from invenio_vocabularies.datastreams.errors import ReaderError
|
|
16
|
+
from invenio_vocabularies.datastreams.readers import BaseReader
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class OpenAIREHTTPReader(BaseReader):
|
|
20
|
+
"""OpenAIRE HTTP Reader returning an in-memory binary stream of the latest OpenAIRE Graph Dataset tar file of a given type."""
|
|
21
|
+
|
|
22
|
+
def __init__(self, origin=None, mode="r", tar_href=None, *args, **kwargs):
|
|
23
|
+
"""Constructor."""
|
|
24
|
+
self.tar_href = tar_href
|
|
25
|
+
super().__init__(origin, mode, *args, **kwargs)
|
|
26
|
+
|
|
27
|
+
def _iter(self, fp, *args, **kwargs):
|
|
28
|
+
raise NotImplementedError(
|
|
29
|
+
"OpenAIREHTTPReader downloads one file and therefore does not iterate through items"
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
def read(self, item=None, *args, **kwargs):
|
|
33
|
+
"""Reads the latest OpenAIRE Graph Dataset tar file of a given type from Zenodo and yields an in-memory binary stream of it."""
|
|
34
|
+
if item:
|
|
35
|
+
raise NotImplementedError(
|
|
36
|
+
"OpenAIREHTTPReader does not support being chained after another reader"
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
if self._origin == "full":
|
|
40
|
+
# OpenAIRE Graph Dataset
|
|
41
|
+
api_url = "https://zenodo.org/api/records/3516917"
|
|
42
|
+
elif self._origin == "diff":
|
|
43
|
+
# OpenAIRE Graph dataset: new collected projects
|
|
44
|
+
api_url = "https://zenodo.org/api/records/6419021"
|
|
45
|
+
else:
|
|
46
|
+
raise ReaderError("The --origin option should be either 'full' or 'diff'")
|
|
47
|
+
|
|
48
|
+
# Call the signposting `linkset+json` endpoint for the Concept DOI (i.e. latest version) of the OpenAIRE Graph Dataset.
|
|
49
|
+
# See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint
|
|
50
|
+
headers = {"Accept": "application/linkset+json"}
|
|
51
|
+
api_resp = requests.get(api_url, headers=headers)
|
|
52
|
+
api_resp.raise_for_status()
|
|
53
|
+
|
|
54
|
+
# Extract the Landing page Link Set Object located as the first (index 0) item.
|
|
55
|
+
landing_page_linkset = api_resp.json()["linkset"][0]
|
|
56
|
+
|
|
57
|
+
# Extract the URL of the only tar file matching `tar_href` linked to the record.
|
|
58
|
+
landing_page_matching_tar_items = [
|
|
59
|
+
item
|
|
60
|
+
for item in landing_page_linkset["item"]
|
|
61
|
+
if item["type"] == "application/x-tar"
|
|
62
|
+
and item["href"].endswith(self.tar_href)
|
|
63
|
+
]
|
|
64
|
+
if len(landing_page_matching_tar_items) != 1:
|
|
65
|
+
raise ReaderError(
|
|
66
|
+
f"Expected 1 tar item matching {self.tar_href} but got {len(landing_page_matching_tar_items)}"
|
|
67
|
+
)
|
|
68
|
+
file_url = landing_page_matching_tar_items[0]["href"]
|
|
69
|
+
|
|
70
|
+
# Download the matching tar file and fully load the response bytes content in memory.
|
|
71
|
+
# The bytes content are then wrapped by a BytesIO to be file-like object (as required by `tarfile.open`).
|
|
72
|
+
# Using directly `file_resp.raw` is not possible since `tarfile.open` requires the file-like object to be seekable.
|
|
73
|
+
file_resp = requests.get(file_url)
|
|
74
|
+
file_resp.raise_for_status()
|
|
75
|
+
yield io.BytesIO(file_resp.content)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
VOCABULARIES_DATASTREAM_READERS = {
|
|
79
|
+
"openaire-http": OpenAIREHTTPReader,
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
VOCABULARIES_DATASTREAM_TRANSFORMERS = {}
|
|
83
|
+
|
|
84
|
+
VOCABULARIES_DATASTREAM_WRITERS = {}
|
|
@@ -21,7 +21,11 @@ from invenio_vocabularies.datastreams.transformers import BaseTransformer
|
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
class RORHTTPReader(BaseReader):
|
|
24
|
-
"""ROR HTTP Reader returning an in-memory binary stream of the latest ROR data dump ZIP file."""
|
|
24
|
+
"""ROR HTTP Reader.
|
|
25
|
+
|
|
26
|
+
Returning an in-memory
|
|
27
|
+
binary stream of the latest ROR data dump ZIP file.
|
|
28
|
+
"""
|
|
25
29
|
|
|
26
30
|
def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
|
|
27
31
|
"""Constructor."""
|
|
@@ -30,7 +34,8 @@ class RORHTTPReader(BaseReader):
|
|
|
30
34
|
|
|
31
35
|
def _iter(self, fp, *args, **kwargs):
|
|
32
36
|
raise NotImplementedError(
|
|
33
|
-
"RORHTTPReader downloads one file and therefore does not iterate through items"
|
|
37
|
+
"RORHTTPReader downloads one file "
|
|
38
|
+
"and therefore does not iterate through items"
|
|
34
39
|
)
|
|
35
40
|
|
|
36
41
|
def _get_last_dump_date(self, linksets):
|
|
@@ -53,11 +58,16 @@ class RORHTTPReader(BaseReader):
|
|
|
53
58
|
return last_dump_date
|
|
54
59
|
else:
|
|
55
60
|
raise ReaderError(
|
|
56
|
-
"Couldn't find JSON-LD in publisher's linkset to determine last dump date."
|
|
61
|
+
"Couldn't find JSON-LD in publisher's linkset "
|
|
62
|
+
"to determine last dump date."
|
|
57
63
|
)
|
|
58
64
|
|
|
59
65
|
def read(self, item=None, *args, **kwargs):
|
|
60
|
-
"""Reads the latest ROR data dump ZIP file from Zenodo and yields an in-memory binary stream of it."""
|
|
66
|
+
"""Reads the latest ROR data dump.
|
|
67
|
+
|
|
68
|
+
Read from ZIP file from
|
|
69
|
+
Zenodo and yields an in-memory binary stream of it.
|
|
70
|
+
"""
|
|
61
71
|
if item:
|
|
62
72
|
raise NotImplementedError(
|
|
63
73
|
"RORHTTPReader does not support being chained after another reader"
|
|
@@ -68,7 +78,8 @@ class RORHTTPReader(BaseReader):
|
|
|
68
78
|
landing_page = requests.get(dataset_doi_link, allow_redirects=True)
|
|
69
79
|
landing_page.raise_for_status()
|
|
70
80
|
|
|
71
|
-
# Call the signposting `linkset+json` endpoint for the Concept DOI (i.e. latest version) of the ROR data dump.
|
|
81
|
+
# Call the signposting `linkset+json` endpoint for
|
|
82
|
+
# the Concept DOI (i.e. latest version) of the ROR data dump.
|
|
72
83
|
# See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint
|
|
73
84
|
if "linkset" not in landing_page.links:
|
|
74
85
|
raise ReaderError("Linkset not found in the ROR dataset record.")
|
|
@@ -94,8 +105,10 @@ class RORHTTPReader(BaseReader):
|
|
|
94
105
|
raise ReaderError(f"Expected 1 ZIP item but got {len(zip_files)}")
|
|
95
106
|
|
|
96
107
|
# Download the ZIP file and fully load the response bytes content in memory.
|
|
97
|
-
# The bytes content are then wrapped by a BytesIO to be file-like object (as required by `zipfile.ZipFile`).
|
|
98
|
-
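# Using directly `file_resp.raw` is not possible since `zipfile.ZipFile` requires the file-like object to be seekable.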
|
|
108
|
+
# The bytes content are then wrapped by a BytesIO to be
|
|
109
|
+
# file-like object (as required by `zipfile.ZipFile`).
|
|
110
|
+
# Using directly `file_resp.raw` is not possible since
|
|
111
|
+
# `zipfile.ZipFile` requires the file-like object to be seekable.
|
|
99
112
|
file_resp = requests.get(file_url)
|
|
100
113
|
file_resp.raise_for_status()
|
|
101
114
|
yield io.BytesIO(file_resp.content)
|
|
@@ -243,7 +243,12 @@ DATASTREAM_CONFIG = {
|
|
|
243
243
|
"regex": "\\.xml$",
|
|
244
244
|
},
|
|
245
245
|
},
|
|
246
|
-
{
|
|
246
|
+
{
|
|
247
|
+
"type": "xml",
|
|
248
|
+
"args": {
|
|
249
|
+
"root_element": "record",
|
|
250
|
+
},
|
|
251
|
+
},
|
|
247
252
|
],
|
|
248
253
|
"transformers": [{"type": "orcid"}],
|
|
249
254
|
"writers": [
|
|
@@ -266,7 +271,12 @@ ORCID_PRESET_DATASTREAM_CONFIG = {
|
|
|
266
271
|
{
|
|
267
272
|
"type": "orcid-data-sync",
|
|
268
273
|
},
|
|
269
|
-
{
|
|
274
|
+
{
|
|
275
|
+
"type": "xml",
|
|
276
|
+
"args": {
|
|
277
|
+
"root_element": "record",
|
|
278
|
+
},
|
|
279
|
+
},
|
|
270
280
|
],
|
|
271
281
|
"transformers": [{"type": "orcid"}],
|
|
272
282
|
"writers": [
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2021 CERN.
|
|
3
|
+
# Copyright (C) 2021-2024 CERN.
|
|
4
4
|
#
|
|
5
5
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
@@ -21,7 +21,8 @@ from invenio_records_resources.records.systemfields import (
|
|
|
21
21
|
)
|
|
22
22
|
from invenio_records_resources.resources.records.headers import etag_headers
|
|
23
23
|
|
|
24
|
-
from
|
|
24
|
+
from invenio_vocabularies.contrib.names.permissions import NamesPermissionPolicy
|
|
25
|
+
|
|
25
26
|
from ..affiliations.api import Affiliation
|
|
26
27
|
from .config import NamesSearchOptions, service_components
|
|
27
28
|
from .schema import NameSchema
|
|
@@ -63,7 +64,7 @@ record_type = RecordTypeFactory(
|
|
|
63
64
|
service_schema=NameSchema,
|
|
64
65
|
search_options=NamesSearchOptions,
|
|
65
66
|
service_components=service_components,
|
|
66
|
-
permission_policy_cls=
|
|
67
|
+
permission_policy_cls=NamesPermissionPolicy,
|
|
67
68
|
# Resource layer
|
|
68
69
|
endpoint_route="/names",
|
|
69
70
|
resource_cls_attrs={
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#
|
|
3
|
+
# Copyright (C) 2020-2024 CERN.
|
|
4
|
+
#
|
|
5
|
+
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
|
+
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
7
|
+
# details.
|
|
8
|
+
|
|
9
|
+
"""Vocabulary permissions."""
|
|
10
|
+
|
|
11
|
+
from invenio_records_permissions.generators import AuthenticatedUser, SystemProcess
|
|
12
|
+
|
|
13
|
+
from ...services.permissions import PermissionPolicy
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class NamesPermissionPolicy(PermissionPolicy):
|
|
17
|
+
"""Permission policy."""
|
|
18
|
+
|
|
19
|
+
can_search = [SystemProcess(), AuthenticatedUser()]
|
|
20
|
+
can_read = [SystemProcess(), AuthenticatedUser()]
|
|
@@ -12,9 +12,8 @@ from invenio_access.permissions import system_identity
|
|
|
12
12
|
from invenio_i18n import lazy_gettext as _
|
|
13
13
|
|
|
14
14
|
from ...datastreams.writers import ServiceWriter
|
|
15
|
-
from .
|
|
16
|
-
from .mesh
|
|
17
|
-
from .mesh.datastreams import VOCABULARIES_DATASTREAM_WRITERS as mesh_writers
|
|
15
|
+
from .euroscivoc import datastreams as euroscivoc_datastreams
|
|
16
|
+
from .mesh import datastreams as mesh_datastreams
|
|
18
17
|
|
|
19
18
|
|
|
20
19
|
class SubjectsServiceWriter(ServiceWriter):
|
|
@@ -30,15 +29,22 @@ class SubjectsServiceWriter(ServiceWriter):
|
|
|
30
29
|
return entry["id"]
|
|
31
30
|
|
|
32
31
|
|
|
33
|
-
VOCABULARIES_DATASTREAM_READERS = {
|
|
32
|
+
VOCABULARIES_DATASTREAM_READERS = {
|
|
33
|
+
**mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
|
|
34
|
+
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
|
|
35
|
+
}
|
|
34
36
|
"""Subjects Data Streams readers."""
|
|
35
37
|
|
|
36
|
-
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
38
|
+
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
39
|
+
**mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
|
|
40
|
+
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
|
|
41
|
+
}
|
|
37
42
|
"""Subjects Data Streams transformers."""
|
|
38
43
|
|
|
39
44
|
VOCABULARIES_DATASTREAM_WRITERS = {
|
|
40
45
|
"subjects-service": SubjectsServiceWriter,
|
|
41
|
-
**mesh_writers,
|
|
46
|
+
**mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
|
|
47
|
+
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
|
|
42
48
|
}
|
|
43
49
|
"""Subjects Data Streams writers."""
|
|
44
50
|
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#
|
|
3
|
+
# Copyright (C) 2022-2024 CERN.
|
|
4
|
+
#
|
|
5
|
+
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
|
+
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
7
|
+
# details.
|
|
8
|
+
|
|
9
|
+
"""EuroSciVoc subjects datastreams, readers, transformers, and writers."""
|
|
10
|
+
|
|
11
|
+
import io
|
|
12
|
+
from collections import namedtuple
|
|
13
|
+
|
|
14
|
+
import requests
|
|
15
|
+
from rdflib import OWL, RDF, Graph, Namespace
|
|
16
|
+
|
|
17
|
+
from invenio_vocabularies.config import SUBJECTS_EUROSCIVOC_FILE_URL
|
|
18
|
+
from invenio_vocabularies.datastreams.readers import BaseReader
|
|
19
|
+
from invenio_vocabularies.datastreams.transformers import BaseTransformer
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class EuroSciVocSubjectsHTTPReader(BaseReader):
|
|
23
|
+
"""Reader class to fetch and process EuroSciVoc RDF data."""
|
|
24
|
+
|
|
25
|
+
def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
|
|
26
|
+
"""Initialize the reader with the data source.
|
|
27
|
+
|
|
28
|
+
:param origin: The URL from which to fetch the RDF data.
|
|
29
|
+
:param mode: Mode of operation (default is 'r' for reading).
|
|
30
|
+
"""
|
|
31
|
+
self.origin = origin or SUBJECTS_EUROSCIVOC_FILE_URL
|
|
32
|
+
super().__init__(origin=origin, mode=mode, *args, **kwargs)
|
|
33
|
+
|
|
34
|
+
def _iter(self, rdf_graph):
|
|
35
|
+
"""Iterate over the RDF graph, yielding one subject at a time.
|
|
36
|
+
|
|
37
|
+
:param rdf_graph: The RDF graph to process.
|
|
38
|
+
:yield: Subject and graph to be transformed.
|
|
39
|
+
"""
|
|
40
|
+
SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
|
|
41
|
+
|
|
42
|
+
for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
|
|
43
|
+
yield {"subject": subject, "rdf_graph": rdf_graph}
|
|
44
|
+
|
|
45
|
+
def read(self, item=None, *args, **kwargs):
|
|
46
|
+
"""Fetch and process the EuroSciVoc RDF data, yielding it one subject at a time.
|
|
47
|
+
|
|
48
|
+
:param item: The RDF data provided as bytes (optional).
|
|
49
|
+
:yield: Processed EuroSciVoc subject data.
|
|
50
|
+
"""
|
|
51
|
+
if item:
|
|
52
|
+
raise NotImplementedError(
|
|
53
|
+
"EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
|
|
54
|
+
)
|
|
55
|
+
# Fetch the RDF data from the specified origin URL
|
|
56
|
+
response = requests.get(self.origin)
|
|
57
|
+
response.raise_for_status()
|
|
58
|
+
|
|
59
|
+
# Treat the response content as a file-like object
|
|
60
|
+
rdf_data = io.BytesIO(response.content)
|
|
61
|
+
|
|
62
|
+
# Parse the RDF data into a graph
|
|
63
|
+
rdf_graph = Graph()
|
|
64
|
+
rdf_graph.parse(rdf_data, format="xml")
|
|
65
|
+
|
|
66
|
+
# Yield each processed subject from the RDF graph
|
|
67
|
+
yield from self._iter(rdf_graph)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class EuroSciVocSubjectsTransformer(BaseTransformer):
|
|
71
|
+
"""Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
|
|
72
|
+
|
|
73
|
+
SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
|
|
74
|
+
SPLITCHAR = ","
|
|
75
|
+
|
|
76
|
+
def _get_notation(self, subject, rdf_graph):
|
|
77
|
+
"""Extract the numeric notation for a subject."""
|
|
78
|
+
for _, _, notation in rdf_graph.triples(
|
|
79
|
+
(subject, self.SKOS_CORE.notation, None)
|
|
80
|
+
):
|
|
81
|
+
if str(notation).isdigit():
|
|
82
|
+
return str(notation)
|
|
83
|
+
return None
|
|
84
|
+
|
|
85
|
+
def _get_labels(self, subject, rdf_graph):
|
|
86
|
+
"""Extract prefLabel and altLabel languages for a subject."""
|
|
87
|
+
labels = {
|
|
88
|
+
label.language: label.value.capitalize()
|
|
89
|
+
for _, _, label in rdf_graph.triples(
|
|
90
|
+
(subject, self.SKOS_CORE.prefLabel, None)
|
|
91
|
+
)
|
|
92
|
+
}
|
|
93
|
+
if "en" not in labels:
|
|
94
|
+
for _, _, label in rdf_graph.triples(
|
|
95
|
+
(subject, self.SKOS_CORE.altLabel, None)
|
|
96
|
+
):
|
|
97
|
+
labels.setdefault(label.language, label.value.capitalize())
|
|
98
|
+
return labels
|
|
99
|
+
|
|
100
|
+
def _find_parents(self, subject, rdf_graph):
|
|
101
|
+
"""Find parent notations."""
|
|
102
|
+
parents = []
|
|
103
|
+
|
|
104
|
+
# Traverse the broader hierarchy
|
|
105
|
+
for broader in rdf_graph.transitive_objects(subject, self.SKOS_CORE.broader):
|
|
106
|
+
if broader != subject: # Ensure we don't include the current subject
|
|
107
|
+
parent_notation = self._get_notation(broader, rdf_graph)
|
|
108
|
+
if parent_notation:
|
|
109
|
+
parents.append(parent_notation)
|
|
110
|
+
|
|
111
|
+
return parents
|
|
112
|
+
|
|
113
|
+
def _transform_entry(self, subject, rdf_graph):
|
|
114
|
+
"""Transform an entry to the required dictionary format."""
|
|
115
|
+
# Get subject notation with euroscivoc prefix
|
|
116
|
+
notation = self._get_notation(subject, rdf_graph)
|
|
117
|
+
id = f"euroscivoc:{notation}" if notation else None
|
|
118
|
+
# Get labels for the current subject
|
|
119
|
+
labels = self._get_labels(subject, rdf_graph)
|
|
120
|
+
# Join parent notations with SPLITCHAR separator and add euroscivoc prefix
|
|
121
|
+
parents = self.SPLITCHAR.join(
|
|
122
|
+
f"euroscivoc:{n}" for n in reversed(self._find_parents(subject, rdf_graph))
|
|
123
|
+
)
|
|
124
|
+
# Create identifiers list
|
|
125
|
+
identifiers = [{"scheme": "url", "identifier": str(subject)}]
|
|
126
|
+
|
|
127
|
+
return {
|
|
128
|
+
"id": id,
|
|
129
|
+
"scheme": "EuroSciVoc",
|
|
130
|
+
"subject": labels.get("en", "").capitalize(),
|
|
131
|
+
"title": labels,
|
|
132
|
+
"props": {"parents": parents} if parents else {},
|
|
133
|
+
"identifiers": identifiers,
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
def apply(self, stream_entry, *args, **kwargs):
|
|
137
|
+
"""Transform a stream entry to the required dictionary format.
|
|
138
|
+
|
|
139
|
+
:param stream_entry: The entry to be transformed, which includes the subject and the RDF graph.
|
|
140
|
+
:return: The transformed stream entry.
|
|
141
|
+
"""
|
|
142
|
+
# Apply transformations
|
|
143
|
+
entry_data = self._transform_entry(
|
|
144
|
+
stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
|
|
145
|
+
)
|
|
146
|
+
stream_entry.entry = entry_data
|
|
147
|
+
return stream_entry
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# Configuration for datastream readers, transformers, and writers
|
|
151
|
+
VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}
|
|
152
|
+
|
|
153
|
+
VOCABULARIES_DATASTREAM_WRITERS = {}
|
|
154
|
+
|
|
155
|
+
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
156
|
+
"euroscivoc-transformer": EuroSciVocSubjectsTransformer
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
DATASTREAM_CONFIG = {
|
|
160
|
+
"readers": [
|
|
161
|
+
{
|
|
162
|
+
"type": "euroscivoc-reader",
|
|
163
|
+
}
|
|
164
|
+
],
|
|
165
|
+
"transformers": [{"type": "euroscivoc-transformer"}],
|
|
166
|
+
"writers": [
|
|
167
|
+
{
|
|
168
|
+
"type": "subjects-service",
|
|
169
|
+
}
|
|
170
|
+
],
|
|
171
|
+
}
|
|
@@ -30,6 +30,22 @@
|
|
|
30
30
|
"description": "Human readable label in different languages.",
|
|
31
31
|
"$ref": "local://vocabularies/definitions-v1.0.0.json#/title"
|
|
32
32
|
},
|
|
33
|
+
"props": {
|
|
34
|
+
"type": "object",
|
|
35
|
+
"patternProperties": {
|
|
36
|
+
"^.*$": {
|
|
37
|
+
"type": "string"
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
},
|
|
41
|
+
"identifiers": {
|
|
42
|
+
"description": "Alternate identifiers for the subject.",
|
|
43
|
+
"type": "array",
|
|
44
|
+
"items": {
|
|
45
|
+
"$ref": "local://definitions-v2.0.0.json#/identifiers_with_scheme"
|
|
46
|
+
},
|
|
47
|
+
"uniqueItems": true
|
|
48
|
+
},
|
|
33
49
|
"synonyms": {
|
|
34
50
|
"description": "Synonyms of the subject label.",
|
|
35
51
|
"type": "array",
|
|
@@ -71,6 +71,20 @@
|
|
|
71
71
|
"type": "object",
|
|
72
72
|
"dynamic": "true"
|
|
73
73
|
},
|
|
74
|
+
"props": {
|
|
75
|
+
"type": "object",
|
|
76
|
+
"dynamic": "true"
|
|
77
|
+
},
|
|
78
|
+
"identifiers": {
|
|
79
|
+
"properties": {
|
|
80
|
+
"identifier": {
|
|
81
|
+
"type": "keyword"
|
|
82
|
+
},
|
|
83
|
+
"scheme": {
|
|
84
|
+
"type": "keyword"
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
},
|
|
74
88
|
"synonyms": {
|
|
75
89
|
"type": "text"
|
|
76
90
|
},
|
|
@@ -74,6 +74,20 @@
|
|
|
74
74
|
"synonyms": {
|
|
75
75
|
"type": "text"
|
|
76
76
|
},
|
|
77
|
+
"props": {
|
|
78
|
+
"type": "object",
|
|
79
|
+
"dynamic": "true"
|
|
80
|
+
},
|
|
81
|
+
"identifiers": {
|
|
82
|
+
"properties": {
|
|
83
|
+
"identifier": {
|
|
84
|
+
"type": "keyword"
|
|
85
|
+
},
|
|
86
|
+
"scheme": {
|
|
87
|
+
"type": "keyword"
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
},
|
|
77
91
|
"tags": {
|
|
78
92
|
"type": "keyword"
|
|
79
93
|
}
|
|
@@ -71,6 +71,20 @@
|
|
|
71
71
|
"type": "object",
|
|
72
72
|
"dynamic": "true"
|
|
73
73
|
},
|
|
74
|
+
"props": {
|
|
75
|
+
"type": "object",
|
|
76
|
+
"dynamic": "true"
|
|
77
|
+
},
|
|
78
|
+
"identifiers": {
|
|
79
|
+
"properties": {
|
|
80
|
+
"identifier": {
|
|
81
|
+
"type": "keyword"
|
|
82
|
+
},
|
|
83
|
+
"scheme": {
|
|
84
|
+
"type": "keyword"
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
},
|
|
74
88
|
"synonyms": {
|
|
75
89
|
"type": "text"
|
|
76
90
|
},
|