invenio-vocabularies 6.5.0__py2.py3-none-any.whl → 6.7.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of invenio-vocabularies might be problematic. Click here for more details.
- invenio_vocabularies/__init__.py +1 -1
- invenio_vocabularies/assets/semantic-ui/js/invenio_vocabularies/src/contrib/forms/Funding/FundingModal.js +3 -27
- invenio_vocabularies/config.py +27 -1
- invenio_vocabularies/contrib/affiliations/affiliations.py +2 -1
- invenio_vocabularies/contrib/affiliations/config.py +21 -10
- invenio_vocabularies/contrib/affiliations/datastreams.py +103 -1
- invenio_vocabularies/contrib/awards/awards.py +2 -1
- invenio_vocabularies/contrib/awards/datastreams.py +7 -0
- invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +9 -0
- invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +22 -1
- invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +22 -1
- invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +22 -1
- invenio_vocabularies/contrib/awards/schema.py +9 -3
- invenio_vocabularies/contrib/funders/config.py +19 -12
- invenio_vocabularies/contrib/funders/funders.py +2 -1
- invenio_vocabularies/contrib/names/config.py +13 -10
- invenio_vocabularies/contrib/names/jsonschemas/names/name-v1.0.0.json +28 -5
- invenio_vocabularies/contrib/names/mappings/os-v1/names/name-v2.0.0.json +15 -0
- invenio_vocabularies/contrib/names/mappings/os-v2/names/name-v2.0.0.json +15 -0
- invenio_vocabularies/contrib/names/names.py +1 -1
- invenio_vocabularies/contrib/names/permissions.py +14 -4
- invenio_vocabularies/contrib/names/schema.py +11 -2
- invenio_vocabularies/contrib/names/services.py +23 -14
- invenio_vocabularies/contrib/subjects/config.py +14 -2
- invenio_vocabularies/contrib/subjects/datastreams.py +4 -0
- invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +22 -114
- invenio_vocabularies/contrib/subjects/gemet/__init__.py +9 -0
- invenio_vocabularies/contrib/subjects/gemet/datastreams.py +109 -0
- invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +13 -2
- invenio_vocabularies/contrib/subjects/mesh/datastreams.py +7 -2
- invenio_vocabularies/contrib/subjects/schema.py +18 -3
- invenio_vocabularies/datastreams/readers.py +99 -9
- invenio_vocabularies/datastreams/transformers.py +55 -0
- invenio_vocabularies/factories.py +15 -0
- invenio_vocabularies/jobs.py +15 -0
- invenio_vocabularies/records/jsonschemas/vocabularies/definitions-v1.0.0.json +9 -0
- invenio_vocabularies/services/config.py +1 -7
- invenio_vocabularies/services/generators.py +38 -0
- invenio_vocabularies/services/permissions.py +6 -1
- {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/METADATA +32 -2
- {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/RECORD +46 -43
- {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/entry_points.txt +1 -0
- {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/AUTHORS.rst +0 -0
- {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/LICENSE +0 -0
- {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/WHEEL +0 -0
- {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/top_level.txt +0 -0
|
@@ -125,6 +125,17 @@
|
|
|
125
125
|
"type": "text",
|
|
126
126
|
"analyzer": "accent_edge_analyzer",
|
|
127
127
|
"search_analyzer": "accent_analyzer"
|
|
128
|
+
},
|
|
129
|
+
"acronym": {
|
|
130
|
+
"type": "text",
|
|
131
|
+
"analyzer": "accent_edge_analyzer",
|
|
132
|
+
"search_analyzer": "accent_analyzer",
|
|
133
|
+
"fields": {
|
|
134
|
+
"keyword": {
|
|
135
|
+
"type": "keyword",
|
|
136
|
+
"normalizer": "accent_normalizer"
|
|
137
|
+
}
|
|
138
|
+
}
|
|
128
139
|
}
|
|
129
140
|
}
|
|
130
141
|
},
|
|
@@ -144,6 +155,10 @@
|
|
|
144
155
|
"type": "keyword"
|
|
145
156
|
}
|
|
146
157
|
}
|
|
158
|
+
},
|
|
159
|
+
"props": {
|
|
160
|
+
"type": "object",
|
|
161
|
+
"dynamic": "true"
|
|
147
162
|
}
|
|
148
163
|
}
|
|
149
164
|
}
|
|
@@ -125,6 +125,17 @@
|
|
|
125
125
|
"type": "text",
|
|
126
126
|
"analyzer": "accent_edge_analyzer",
|
|
127
127
|
"search_analyzer": "accent_analyzer"
|
|
128
|
+
},
|
|
129
|
+
"acronym": {
|
|
130
|
+
"type": "text",
|
|
131
|
+
"analyzer": "accent_edge_analyzer",
|
|
132
|
+
"search_analyzer": "accent_analyzer",
|
|
133
|
+
"fields": {
|
|
134
|
+
"keyword": {
|
|
135
|
+
"type": "keyword",
|
|
136
|
+
"normalizer": "accent_normalizer"
|
|
137
|
+
}
|
|
138
|
+
}
|
|
128
139
|
}
|
|
129
140
|
}
|
|
130
141
|
},
|
|
@@ -144,6 +155,10 @@
|
|
|
144
155
|
"type": "keyword"
|
|
145
156
|
}
|
|
146
157
|
}
|
|
158
|
+
},
|
|
159
|
+
"props": {
|
|
160
|
+
"type": "object",
|
|
161
|
+
"dynamic": "true"
|
|
147
162
|
}
|
|
148
163
|
}
|
|
149
164
|
}
|
|
@@ -10,11 +10,21 @@
|
|
|
10
10
|
|
|
11
11
|
from invenio_records_permissions.generators import AuthenticatedUser, SystemProcess
|
|
12
12
|
|
|
13
|
-
from
|
|
13
|
+
from invenio_vocabularies.services.generators import IfTags
|
|
14
|
+
from invenio_vocabularies.services.permissions import PermissionPolicy
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
class NamesPermissionPolicy(PermissionPolicy):
|
|
17
|
-
"""
|
|
18
|
+
"""Names permission policy.
|
|
18
19
|
|
|
19
|
-
|
|
20
|
-
|
|
20
|
+
Names endpoints are protected, only authenticated users can access them.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
can_search = [
|
|
24
|
+
SystemProcess(),
|
|
25
|
+
AuthenticatedUser(),
|
|
26
|
+
]
|
|
27
|
+
can_read = [
|
|
28
|
+
SystemProcess(),
|
|
29
|
+
IfTags(["unlisted"], then_=[SystemProcess()], else_=[AuthenticatedUser()]),
|
|
30
|
+
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2021 CERN.
|
|
3
|
+
# Copyright (C) 2021-2024 CERN.
|
|
4
4
|
#
|
|
5
5
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
@@ -16,10 +16,18 @@ from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
|
|
|
16
16
|
from marshmallow_utils.schemas import IdentifierSchema
|
|
17
17
|
|
|
18
18
|
from ...services.schema import BaseVocabularySchema, ModePIDFieldVocabularyMixin
|
|
19
|
-
from ..affiliations.schema import
|
|
19
|
+
from ..affiliations.schema import (
|
|
20
|
+
AffiliationRelationSchema as BaseAffiliationRelationSchema,
|
|
21
|
+
)
|
|
20
22
|
from .config import names_schemes
|
|
21
23
|
|
|
22
24
|
|
|
25
|
+
class AffiliationRelationSchema(BaseAffiliationRelationSchema):
    """Affiliation relation schema used by the names vocabulary.

    Extends the base affiliation relation schema with the affiliation's
    acronym.
    """

    # Dump-only: the acronym is emitted on serialization and is not accepted
    # as input when loading.
    acronym = SanitizedUnicode(dump_only=True)
|
|
29
|
+
|
|
30
|
+
|
|
23
31
|
class NameSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
|
|
24
32
|
"""Service schema for names.
|
|
25
33
|
|
|
@@ -42,6 +50,7 @@ class NameSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
|
|
|
42
50
|
)
|
|
43
51
|
)
|
|
44
52
|
affiliations = fields.List(fields.Nested(AffiliationRelationSchema))
|
|
53
|
+
props = fields.Dict(keys=fields.Str(), values=fields.Raw())
|
|
45
54
|
|
|
46
55
|
@validates_schema
|
|
47
56
|
def validate_names(self, data, **kwargs):
|
|
@@ -19,11 +19,12 @@ NamesServiceConfig = record_type.service_config_cls
|
|
|
19
19
|
class NamesService(record_type.service_cls):
|
|
20
20
|
"""Name service."""
|
|
21
21
|
|
|
22
|
-
def resolve(self, identity, id_, id_type):
|
|
22
|
+
def resolve(self, identity, id_, id_type, many=False):
|
|
23
23
|
"""Get the record with a given identifier.
|
|
24
24
|
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
param id_: The identifier value.
|
|
26
|
+
param id_type: The identifier type.
|
|
27
|
+
param many: If True, return a list of records.
|
|
27
28
|
"""
|
|
28
29
|
search_query = dsl.Q(
|
|
29
30
|
"bool",
|
|
@@ -36,20 +37,28 @@ class NamesService(record_type.service_cls):
|
|
|
36
37
|
|
|
37
38
|
# max_records = 1, we assume there cannot be duplicates
|
|
38
39
|
# the loading process needs to make sure of that
|
|
39
|
-
|
|
40
|
+
if many:
|
|
41
|
+
results = self._read_many(identity, search_query)
|
|
42
|
+
else:
|
|
43
|
+
results = self._read_many(identity, search_query, max_records=1)
|
|
44
|
+
|
|
40
45
|
# can't use the results_item because it returns dicts instead of records
|
|
41
46
|
total = results.hits.total["value"]
|
|
42
47
|
if total == 0:
|
|
43
48
|
# Not a PID but treated as such
|
|
44
49
|
raise PIDDoesNotExistError(pid_type=id_type, pid_value=id_)
|
|
50
|
+
if many:
|
|
51
|
+
for result in results:
|
|
52
|
+
record = self.record_cls.loads(result.to_dict())
|
|
53
|
+
self.require_permission(identity, "read", record=record)
|
|
54
|
+
return self.result_list(self, identity, results)
|
|
55
|
+
else:
|
|
56
|
+
record = self.record_cls.loads(results[0].to_dict())
|
|
57
|
+
self.require_permission(identity, "read", record=record)
|
|
45
58
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
identity,
|
|
53
|
-
record,
|
|
54
|
-
links_tpl=self.links_item_tpl,
|
|
55
|
-
)
|
|
59
|
+
return self.result_item(
|
|
60
|
+
self,
|
|
61
|
+
identity,
|
|
62
|
+
record,
|
|
63
|
+
links_tpl=self.links_item_tpl,
|
|
64
|
+
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2021 CERN.
|
|
3
|
+
# Copyright (C) 2021-2024 CERN.
|
|
4
4
|
# Copyright (C) 2021 Northwestern University.
|
|
5
5
|
# Copyright (C) 2024 University of Münster.
|
|
6
6
|
#
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
"""Subjects configuration."""
|
|
12
12
|
|
|
13
13
|
from flask import current_app
|
|
14
|
+
from invenio_i18n import get_locale
|
|
14
15
|
from invenio_i18n import lazy_gettext as _
|
|
15
16
|
from invenio_records_resources.services import SearchOptions
|
|
16
17
|
from invenio_records_resources.services.records.components import DataComponent
|
|
@@ -22,6 +23,16 @@ from ...services.querystr import FilteredSuggestQueryParser
|
|
|
22
23
|
subject_schemes = LocalProxy(
|
|
23
24
|
lambda: current_app.config["VOCABULARIES_SUBJECTS_SCHEMES"]
|
|
24
25
|
)
|
|
26
|
+
localized_title = LocalProxy(lambda: f"title.{get_locale()}^20")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
gemet_file_url = LocalProxy(
|
|
30
|
+
lambda: current_app.config["VOCABULARIES_SUBJECTS_GEMET_FILE_URL"]
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
euroscivoc_file_url = LocalProxy(
|
|
34
|
+
lambda: current_app.config["VOCABULARIES_SUBJECTS_EUROSCIVOC_FILE_URL"]
|
|
35
|
+
)
|
|
25
36
|
|
|
26
37
|
|
|
27
38
|
class SubjectsSearchOptions(SearchOptions):
|
|
@@ -30,7 +41,8 @@ class SubjectsSearchOptions(SearchOptions):
|
|
|
30
41
|
suggest_parser_cls = FilteredSuggestQueryParser.factory(
|
|
31
42
|
filter_field="scheme",
|
|
32
43
|
fields=[ # suggest fields
|
|
33
|
-
"
|
|
44
|
+
"subject^100",
|
|
45
|
+
localized_title,
|
|
34
46
|
"synonyms^20",
|
|
35
47
|
],
|
|
36
48
|
)
|
|
@@ -13,6 +13,7 @@ from invenio_i18n import lazy_gettext as _
|
|
|
13
13
|
|
|
14
14
|
from ...datastreams.writers import ServiceWriter
|
|
15
15
|
from .euroscivoc import datastreams as euroscivoc_datastreams
|
|
16
|
+
from .gemet import datastreams as gemet_datastreams
|
|
16
17
|
from .mesh import datastreams as mesh_datastreams
|
|
17
18
|
|
|
18
19
|
|
|
@@ -32,12 +33,14 @@ class SubjectsServiceWriter(ServiceWriter):
|
|
|
32
33
|
VOCABULARIES_DATASTREAM_READERS = {
|
|
33
34
|
**mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
|
|
34
35
|
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
|
|
36
|
+
**gemet_datastreams.VOCABULARIES_DATASTREAM_READERS,
|
|
35
37
|
}
|
|
36
38
|
"""Subjects Data Streams readers."""
|
|
37
39
|
|
|
38
40
|
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
39
41
|
**mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
|
|
40
42
|
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
|
|
43
|
+
**gemet_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
|
|
41
44
|
}
|
|
42
45
|
"""Subjects Data Streams transformers."""
|
|
43
46
|
|
|
@@ -45,6 +48,7 @@ VOCABULARIES_DATASTREAM_WRITERS = {
|
|
|
45
48
|
"subjects-service": SubjectsServiceWriter,
|
|
46
49
|
**mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
|
|
47
50
|
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
|
|
51
|
+
**gemet_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
|
|
48
52
|
}
|
|
49
53
|
"""Subjects Data Streams writers."""
|
|
50
54
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C)
|
|
3
|
+
# Copyright (C) 2024 CERN.
|
|
4
4
|
#
|
|
5
5
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
@@ -8,120 +8,36 @@
|
|
|
8
8
|
|
|
9
9
|
"""EuroSciVoc subjects datastreams, readers, transformers, and writers."""
|
|
10
10
|
|
|
11
|
-
import
|
|
12
|
-
from collections import namedtuple
|
|
11
|
+
from invenio_vocabularies.datastreams.transformers import RDFTransformer
|
|
13
12
|
|
|
14
|
-
import
|
|
15
|
-
from rdflib import OWL, RDF, Graph, Namespace
|
|
13
|
+
from ..config import euroscivoc_file_url
|
|
16
14
|
|
|
17
|
-
from invenio_vocabularies.config import SUBJECTS_EUROSCIVOC_FILE_URL
|
|
18
|
-
from invenio_vocabularies.datastreams.readers import BaseReader
|
|
19
|
-
from invenio_vocabularies.datastreams.transformers import BaseTransformer
|
|
20
15
|
|
|
21
|
-
|
|
22
|
-
class EuroSciVocSubjectsHTTPReader(BaseReader):
|
|
23
|
-
"""Reader class to fetch and process EuroSciVoc RDF data."""
|
|
24
|
-
|
|
25
|
-
def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
|
|
26
|
-
"""Initialize the reader with the data source.
|
|
27
|
-
|
|
28
|
-
:param origin: The URL from which to fetch the RDF data.
|
|
29
|
-
:param mode: Mode of operation (default is 'r' for reading).
|
|
30
|
-
"""
|
|
31
|
-
self.origin = origin or SUBJECTS_EUROSCIVOC_FILE_URL
|
|
32
|
-
super().__init__(origin=origin, mode=mode, *args, **kwargs)
|
|
33
|
-
|
|
34
|
-
def _iter(self, rdf_graph):
|
|
35
|
-
"""Iterate over the RDF graph, yielding one subject at a time.
|
|
36
|
-
|
|
37
|
-
:param rdf_graph: The RDF graph to process.
|
|
38
|
-
:yield: Subject and graph to be transformed.
|
|
39
|
-
"""
|
|
40
|
-
SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
|
|
41
|
-
|
|
42
|
-
for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
|
|
43
|
-
yield {"subject": subject, "rdf_graph": rdf_graph}
|
|
44
|
-
|
|
45
|
-
def read(self, item=None, *args, **kwargs):
|
|
46
|
-
"""Fetch and process the EuroSciVoc RDF data, yielding it one subject at a time.
|
|
47
|
-
|
|
48
|
-
:param item: The RDF data provided as bytes (optional).
|
|
49
|
-
:yield: Processed EuroSciVoc subject data.
|
|
50
|
-
"""
|
|
51
|
-
if item:
|
|
52
|
-
raise NotImplementedError(
|
|
53
|
-
"EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
|
|
54
|
-
)
|
|
55
|
-
# Fetch the RDF data from the specified origin URL
|
|
56
|
-
response = requests.get(self.origin)
|
|
57
|
-
response.raise_for_status()
|
|
58
|
-
|
|
59
|
-
# Treat the response content as a file-like object
|
|
60
|
-
rdf_data = io.BytesIO(response.content)
|
|
61
|
-
|
|
62
|
-
# Parse the RDF data into a graph
|
|
63
|
-
rdf_graph = Graph()
|
|
64
|
-
rdf_graph.parse(rdf_data, format="xml")
|
|
65
|
-
|
|
66
|
-
# Yield each processed subject from the RDF graph
|
|
67
|
-
yield from self._iter(rdf_graph)
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class EuroSciVocSubjectsTransformer(BaseTransformer):
|
|
16
|
+
class EuroSciVocSubjectsTransformer(RDFTransformer):
|
|
71
17
|
"""Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
|
|
72
18
|
|
|
73
|
-
SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
|
|
74
|
-
SPLITCHAR = ","
|
|
75
|
-
|
|
76
19
|
def _get_notation(self, subject, rdf_graph):
|
|
77
20
|
"""Extract the numeric notation for a subject."""
|
|
78
21
|
for _, _, notation in rdf_graph.triples(
|
|
79
|
-
(subject, self.
|
|
22
|
+
(subject, self.skos_core.notation, None)
|
|
80
23
|
):
|
|
81
24
|
if str(notation).isdigit():
|
|
82
25
|
return str(notation)
|
|
83
26
|
return None
|
|
84
27
|
|
|
85
|
-
def
|
|
86
|
-
"""Extract
|
|
87
|
-
|
|
88
|
-
label.language: label.value.capitalize()
|
|
89
|
-
for _, _, label in rdf_graph.triples(
|
|
90
|
-
(subject, self.SKOS_CORE.prefLabel, None)
|
|
91
|
-
)
|
|
92
|
-
}
|
|
93
|
-
if "en" not in labels:
|
|
94
|
-
for _, _, label in rdf_graph.triples(
|
|
95
|
-
(subject, self.SKOS_CORE.altLabel, None)
|
|
96
|
-
):
|
|
97
|
-
labels.setdefault(label.language, label.value.capitalize())
|
|
98
|
-
return labels
|
|
99
|
-
|
|
100
|
-
def _find_parents(self, subject, rdf_graph):
|
|
101
|
-
"""Find parent notations."""
|
|
102
|
-
parents = []
|
|
103
|
-
|
|
104
|
-
# Traverse the broader hierarchy
|
|
105
|
-
for broader in rdf_graph.transitive_objects(subject, self.SKOS_CORE.broader):
|
|
106
|
-
if broader != subject: # Ensure we don't include the current subject
|
|
107
|
-
parent_notation = self._get_notation(broader, rdf_graph)
|
|
108
|
-
if parent_notation:
|
|
109
|
-
parents.append(parent_notation)
|
|
110
|
-
|
|
111
|
-
return parents
|
|
28
|
+
def _get_parent_notation(self, broader, rdf_graph):
|
|
29
|
+
"""Extract parent notation using numeric notation."""
|
|
30
|
+
return self._get_notation(broader, rdf_graph)
|
|
112
31
|
|
|
113
32
|
def _transform_entry(self, subject, rdf_graph):
|
|
114
|
-
"""Transform an entry to the required dictionary format."""
|
|
115
|
-
# Get subject notation with euroscivoc prefix
|
|
116
33
|
notation = self._get_notation(subject, rdf_graph)
|
|
117
34
|
id = f"euroscivoc:{notation}" if notation else None
|
|
118
|
-
# Get labels for the current subject
|
|
119
35
|
labels = self._get_labels(subject, rdf_graph)
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
36
|
+
parents = ",".join(
|
|
37
|
+
f"euroscivoc:{n}"
|
|
38
|
+
for n in reversed(self._find_parents(subject, rdf_graph))
|
|
39
|
+
if n
|
|
123
40
|
)
|
|
124
|
-
# Create identifiers list
|
|
125
41
|
identifiers = [{"scheme": "url", "identifier": str(subject)}]
|
|
126
42
|
|
|
127
43
|
return {
|
|
@@ -133,23 +49,9 @@ class EuroSciVocSubjectsTransformer(BaseTransformer):
|
|
|
133
49
|
"identifiers": identifiers,
|
|
134
50
|
}
|
|
135
51
|
|
|
136
|
-
def apply(self, stream_entry, *args, **kwargs):
|
|
137
|
-
"""Transform a stream entry to the required dictionary format.
|
|
138
|
-
|
|
139
|
-
:param stream_entry: The entry to be transformed, which includes the subject and the RDF graph.
|
|
140
|
-
:return: The transformed stream entry.
|
|
141
|
-
"""
|
|
142
|
-
# Apply transformations
|
|
143
|
-
entry_data = self._transform_entry(
|
|
144
|
-
stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
|
|
145
|
-
)
|
|
146
|
-
stream_entry.entry = entry_data
|
|
147
|
-
return stream_entry
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
# Configuration for datastream readers, transformers, and writers
|
|
151
|
-
VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}
|
|
152
52
|
|
|
53
|
+
# Configuration for datastream transformers, and writers
|
|
54
|
+
VOCABULARIES_DATASTREAM_READERS = {}
|
|
153
55
|
VOCABULARIES_DATASTREAM_WRITERS = {}
|
|
154
56
|
|
|
155
57
|
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
@@ -159,8 +61,14 @@ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
|
159
61
|
DATASTREAM_CONFIG = {
|
|
160
62
|
"readers": [
|
|
161
63
|
{
|
|
162
|
-
"type": "
|
|
163
|
-
|
|
64
|
+
"type": "http",
|
|
65
|
+
"args": {
|
|
66
|
+
"origin": euroscivoc_file_url,
|
|
67
|
+
},
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
"type": "rdf",
|
|
71
|
+
},
|
|
164
72
|
],
|
|
165
73
|
"transformers": [{"type": "euroscivoc-transformer"}],
|
|
166
74
|
"writers": [
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#
|
|
3
|
+
# Copyright (C) 2024 CERN.
|
|
4
|
+
#
|
|
5
|
+
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
|
+
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
7
|
+
# details.
|
|
8
|
+
|
|
9
|
+
"""GEMET subjects datastreams, readers, transformers, and writers."""
|
|
10
|
+
|
|
11
|
+
from invenio_vocabularies.datastreams.transformers import RDFTransformer
|
|
12
|
+
|
|
13
|
+
from ..config import gemet_file_url
|
|
14
|
+
|
|
15
|
+
# Available with the "rdf" extra
|
|
16
|
+
try:
|
|
17
|
+
import rdflib
|
|
18
|
+
except ImportError:
|
|
19
|
+
rdflib = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class GEMETSubjectsTransformer(RDFTransformer):
    """Transformer class to convert GEMET RDF data to a dictionary format."""

    def _get_parent_notation(self, broader, rdf_graph):
        """Extract parent notation from GEMET URI.

        :param broader: URI of the broader (parent) concept; it is split on
            ``/`` so it is assumed to be a string-like URI (TODO confirm it is
            always an rdflib ``URIRef``).
        :param rdf_graph: Unused here; kept so the signature matches the hook
            the base class presumably calls.
        :return: The last two path segments of the URI joined by ``/``.
        """
        return "/".join(broader.split("/")[-2:])

    def _get_groups_and_themes(self, subject, rdf_graph):
        """Extract groups and themes for a subject.

        Every resource that lists ``subject`` as a ``skos:member`` is
        classified by a substring test on its URI.

        :param subject: The concept whose memberships are looked up.
        :param rdf_graph: The RDF graph to query.
        :return: A ``(groups, themes)`` tuple of URI string lists.
        """
        groups = []
        themes = []

        for relation in rdf_graph.subjects(
            predicate=self.skos_core.member, object=subject
        ):
            relation_uri = str(relation)

            # Only the URIs are collected. The English label lookups that used
            # to live here were never used, and the theme lookup dereferenced
            # ``rdflib.RDFS`` even when the optional ``rdflib`` import had
            # fallen back to ``None``.
            if "group" in relation_uri:
                groups.append(relation_uri)
            elif "theme" in relation_uri:
                themes.append(relation_uri)

        return groups, themes

    def _transform_entry(self, subject, rdf_graph):
        """Transform an entry to the required dictionary format.

        :param subject: URI of the GEMET concept being transformed.
        :param rdf_graph: The RDF graph the concept belongs to.
        :return: A dict with ``id``, ``scheme``, ``subject``, ``title``,
            ``props`` and ``identifiers`` keys.
        """
        concept_number = "/".join(subject.split("/")[-2:])
        subject_id = f"gemet:{concept_number}" if concept_number else None
        # ``_get_labels`` and ``_find_parents`` are not defined in this class;
        # presumably inherited from ``RDFTransformer`` (verify against base).
        labels = self._get_labels(subject, rdf_graph)
        parents = ",".join(
            f"gemet:{n}" for n in reversed(self._find_parents(subject, rdf_graph)) if n
        )
        identifiers = [{"scheme": "url", "identifier": str(subject)}]
        groups, themes = self._get_groups_and_themes(subject, rdf_graph)

        # Only non-empty properties are stored.
        props = {"parents": parents} if parents else {}

        if groups:
            props["groups"] = groups
        if themes:
            props["themes"] = themes

        return {
            "id": subject_id,
            "scheme": "GEMET",
            "subject": labels.get("en", "").capitalize(),
            "title": labels,
            "props": props,
            "identifiers": identifiers,
        }
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# Configuration for datastream transformers, and writers
|
|
91
|
+
VOCABULARIES_DATASTREAM_READERS = {}
|
|
92
|
+
VOCABULARIES_DATASTREAM_WRITERS = {}
|
|
93
|
+
|
|
94
|
+
VOCABULARIES_DATASTREAM_TRANSFORMERS = {"gemet-transformer": GEMETSubjectsTransformer}
|
|
95
|
+
|
|
96
|
+
DATASTREAM_CONFIG = {
|
|
97
|
+
"readers": [
|
|
98
|
+
{
|
|
99
|
+
"type": "http",
|
|
100
|
+
"args": {
|
|
101
|
+
"origin": gemet_file_url,
|
|
102
|
+
},
|
|
103
|
+
},
|
|
104
|
+
{"type": "gzip"},
|
|
105
|
+
{"type": "rdf"},
|
|
106
|
+
],
|
|
107
|
+
"transformers": [{"type": "gemet-transformer"}],
|
|
108
|
+
"writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
|
|
109
|
+
}
|
|
@@ -34,9 +34,20 @@
|
|
|
34
34
|
"type": "object",
|
|
35
35
|
"patternProperties": {
|
|
36
36
|
"^.*$": {
|
|
37
|
-
"
|
|
37
|
+
"oneOf": [
|
|
38
|
+
{
|
|
39
|
+
"type": "string"
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"type": "array",
|
|
43
|
+
"items": {
|
|
44
|
+
"type": "string"
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
]
|
|
38
48
|
}
|
|
39
|
-
}
|
|
49
|
+
},
|
|
50
|
+
"additionalProperties": false
|
|
40
51
|
},
|
|
41
52
|
"identifiers": {
|
|
42
53
|
"description": "Alternate identifiers for the subject.",
|
|
@@ -22,14 +22,19 @@ class MeshSubjectsTransformer(BaseTransformer):
|
|
|
22
22
|
"""Apply transformation on stream entry."""
|
|
23
23
|
entry_data = stream_entry.entry
|
|
24
24
|
|
|
25
|
-
# ID in MeSH data is the URL, ex. https://id.nlm.nih.gov/mesh/D000001
|
|
25
|
+
# ID in MeSH data is in the URL, ex. https://id.nlm.nih.gov/mesh/D000001
|
|
26
26
|
# We just want to use the ID prefixed by "mesh:"
|
|
27
27
|
try:
|
|
28
28
|
mesh_id = entry_data["id"].split("/")[-1]
|
|
29
|
+
entry_data["id"] = "mesh:" + mesh_id
|
|
29
30
|
except Exception:
|
|
30
31
|
raise TransformerError("Not a valid MeSH ID.")
|
|
31
32
|
|
|
32
|
-
entry_data["
|
|
33
|
+
entry_data["title"] = title = entry_data.get("title", {})
|
|
34
|
+
# NOTE: MeSH import file comes with an English subject by default
|
|
35
|
+
if "en" not in title:
|
|
36
|
+
title["en"] = entry_data["subject"]
|
|
37
|
+
|
|
33
38
|
return stream_entry
|
|
34
39
|
|
|
35
40
|
|
|
@@ -13,8 +13,8 @@
|
|
|
13
13
|
from functools import partial
|
|
14
14
|
|
|
15
15
|
from invenio_i18n import get_locale
|
|
16
|
-
from marshmallow import EXCLUDE, Schema, fields, pre_load
|
|
17
|
-
from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
|
|
16
|
+
from marshmallow import EXCLUDE, Schema, ValidationError, fields, pre_load, validate
|
|
17
|
+
from marshmallow_utils.fields import URL, IdentifierSet, SanitizedUnicode
|
|
18
18
|
from marshmallow_utils.schemas import IdentifierSchema
|
|
19
19
|
|
|
20
20
|
from ...services.schema import (
|
|
@@ -25,6 +25,21 @@ from ...services.schema import (
|
|
|
25
25
|
from .config import subject_schemes
|
|
26
26
|
|
|
27
27
|
|
|
28
|
+
class StringOrListOfStrings(fields.Field):
    """Field that accepts either one string or a list of strings."""

    # TODO: Move this to marshmallow-utils for broader type support.
    def _deserialize(self, value, attr, data, **kwargs):
        """Deserialize a string, or each string of a list, via ``fields.String``.

        :raises ValidationError: If ``value`` is neither a ``str`` nor a list
            whose items are all ``str``.
        """

        def as_text(raw):
            # Delegate to a String field, as the per-value deserializer.
            return fields.String()._deserialize(raw, attr, data, **kwargs)

        if isinstance(value, str):
            return as_text(value)

        is_string_list = isinstance(value, list) and all(
            isinstance(entry, str) for entry in value
        )
        if not is_string_list:
            raise ValidationError(
                "Invalid value. Must be a string or a list of strings."
            )

        return [as_text(entry) for entry in value]
|
|
41
|
+
|
|
42
|
+
|
|
28
43
|
class SubjectSchema(BaseVocabularySchema):
|
|
29
44
|
"""Service schema for subjects."""
|
|
30
45
|
|
|
@@ -35,7 +50,7 @@ class SubjectSchema(BaseVocabularySchema):
|
|
|
35
50
|
scheme = SanitizedUnicode(required=True)
|
|
36
51
|
subject = SanitizedUnicode(required=True)
|
|
37
52
|
title = i18n_strings
|
|
38
|
-
props = fields.Dict(keys=SanitizedUnicode(), values=
|
|
53
|
+
props = fields.Dict(keys=SanitizedUnicode(), values=StringOrListOfStrings())
|
|
39
54
|
identifiers = IdentifierSet(
|
|
40
55
|
fields.Nested(
|
|
41
56
|
partial(
|