invenio-vocabularies 6.5.0__py2.py3-none-any.whl → 6.7.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of invenio-vocabularies might be problematic. Click here for more details.

Files changed (46) hide show
  1. invenio_vocabularies/__init__.py +1 -1
  2. invenio_vocabularies/assets/semantic-ui/js/invenio_vocabularies/src/contrib/forms/Funding/FundingModal.js +3 -27
  3. invenio_vocabularies/config.py +27 -1
  4. invenio_vocabularies/contrib/affiliations/affiliations.py +2 -1
  5. invenio_vocabularies/contrib/affiliations/config.py +21 -10
  6. invenio_vocabularies/contrib/affiliations/datastreams.py +103 -1
  7. invenio_vocabularies/contrib/awards/awards.py +2 -1
  8. invenio_vocabularies/contrib/awards/datastreams.py +7 -0
  9. invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +9 -0
  10. invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +22 -1
  11. invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +22 -1
  12. invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +22 -1
  13. invenio_vocabularies/contrib/awards/schema.py +9 -3
  14. invenio_vocabularies/contrib/funders/config.py +19 -12
  15. invenio_vocabularies/contrib/funders/funders.py +2 -1
  16. invenio_vocabularies/contrib/names/config.py +13 -10
  17. invenio_vocabularies/contrib/names/jsonschemas/names/name-v1.0.0.json +28 -5
  18. invenio_vocabularies/contrib/names/mappings/os-v1/names/name-v2.0.0.json +15 -0
  19. invenio_vocabularies/contrib/names/mappings/os-v2/names/name-v2.0.0.json +15 -0
  20. invenio_vocabularies/contrib/names/names.py +1 -1
  21. invenio_vocabularies/contrib/names/permissions.py +14 -4
  22. invenio_vocabularies/contrib/names/schema.py +11 -2
  23. invenio_vocabularies/contrib/names/services.py +23 -14
  24. invenio_vocabularies/contrib/subjects/config.py +14 -2
  25. invenio_vocabularies/contrib/subjects/datastreams.py +4 -0
  26. invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +22 -114
  27. invenio_vocabularies/contrib/subjects/gemet/__init__.py +9 -0
  28. invenio_vocabularies/contrib/subjects/gemet/datastreams.py +109 -0
  29. invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +13 -2
  30. invenio_vocabularies/contrib/subjects/mesh/datastreams.py +7 -2
  31. invenio_vocabularies/contrib/subjects/schema.py +18 -3
  32. invenio_vocabularies/datastreams/readers.py +99 -9
  33. invenio_vocabularies/datastreams/transformers.py +55 -0
  34. invenio_vocabularies/factories.py +15 -0
  35. invenio_vocabularies/jobs.py +15 -0
  36. invenio_vocabularies/records/jsonschemas/vocabularies/definitions-v1.0.0.json +9 -0
  37. invenio_vocabularies/services/config.py +1 -7
  38. invenio_vocabularies/services/generators.py +38 -0
  39. invenio_vocabularies/services/permissions.py +6 -1
  40. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/METADATA +32 -2
  41. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/RECORD +46 -43
  42. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/entry_points.txt +1 -0
  43. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/AUTHORS.rst +0 -0
  44. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/LICENSE +0 -0
  45. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/WHEEL +0 -0
  46. {invenio_vocabularies-6.5.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/top_level.txt +0 -0
@@ -125,6 +125,17 @@
125
125
  "type": "text",
126
126
  "analyzer": "accent_edge_analyzer",
127
127
  "search_analyzer": "accent_analyzer"
128
+ },
129
+ "acronym": {
130
+ "type": "text",
131
+ "analyzer": "accent_edge_analyzer",
132
+ "search_analyzer": "accent_analyzer",
133
+ "fields": {
134
+ "keyword": {
135
+ "type": "keyword",
136
+ "normalizer": "accent_normalizer"
137
+ }
138
+ }
128
139
  }
129
140
  }
130
141
  },
@@ -144,6 +155,10 @@
144
155
  "type": "keyword"
145
156
  }
146
157
  }
158
+ },
159
+ "props": {
160
+ "type": "object",
161
+ "dynamic": "true"
147
162
  }
148
163
  }
149
164
  }
@@ -125,6 +125,17 @@
125
125
  "type": "text",
126
126
  "analyzer": "accent_edge_analyzer",
127
127
  "search_analyzer": "accent_analyzer"
128
+ },
129
+ "acronym": {
130
+ "type": "text",
131
+ "analyzer": "accent_edge_analyzer",
132
+ "search_analyzer": "accent_analyzer",
133
+ "fields": {
134
+ "keyword": {
135
+ "type": "keyword",
136
+ "normalizer": "accent_normalizer"
137
+ }
138
+ }
128
139
  }
129
140
  }
130
141
  },
@@ -144,6 +155,10 @@
144
155
  "type": "keyword"
145
156
  }
146
157
  }
158
+ },
159
+ "props": {
160
+ "type": "object",
161
+ "dynamic": "true"
147
162
  }
148
163
  }
149
164
  }
@@ -30,7 +30,7 @@ from .schema import NameSchema
30
30
  name_relations = RelationsField(
31
31
  affiliations=PIDListRelation(
32
32
  "affiliations",
33
- keys=["name"],
33
+ keys=["name", "acronym"],
34
34
  pid_field=Affiliation.pid,
35
35
  cache_key="affiliations",
36
36
  )
@@ -10,11 +10,21 @@
10
10
 
11
11
  from invenio_records_permissions.generators import AuthenticatedUser, SystemProcess
12
12
 
13
- from ...services.permissions import PermissionPolicy
13
+ from invenio_vocabularies.services.generators import IfTags
14
+ from invenio_vocabularies.services.permissions import PermissionPolicy
14
15
 
15
16
 
16
17
  class NamesPermissionPolicy(PermissionPolicy):
17
- """Permission policy."""
18
+ """Names permission policy.
18
19
 
19
- can_search = [SystemProcess(), AuthenticatedUser()]
20
- can_read = [SystemProcess(), AuthenticatedUser()]
20
+ Names endpoints are protected, only authenticated users can access them.
21
+ """
22
+
23
+ can_search = [
24
+ SystemProcess(),
25
+ AuthenticatedUser(),
26
+ ]
27
+ can_read = [
28
+ SystemProcess(),
29
+ IfTags(["unlisted"], then_=[SystemProcess()], else_=[AuthenticatedUser()]),
30
+ ]
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -16,10 +16,18 @@ from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
16
16
  from marshmallow_utils.schemas import IdentifierSchema
17
17
 
18
18
  from ...services.schema import BaseVocabularySchema, ModePIDFieldVocabularyMixin
19
- from ..affiliations.schema import AffiliationRelationSchema
19
+ from ..affiliations.schema import (
20
+ AffiliationRelationSchema as BaseAffiliationRelationSchema,
21
+ )
20
22
  from .config import names_schemes
21
23
 
22
24
 
25
+ class AffiliationRelationSchema(BaseAffiliationRelationSchema):
26
+ """Affiliation relation schema."""
27
+
28
+ acronym = SanitizedUnicode(dump_only=True)
29
+
30
+
23
31
  class NameSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
24
32
  """Service schema for names.
25
33
 
@@ -42,6 +50,7 @@ class NameSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
42
50
  )
43
51
  )
44
52
  affiliations = fields.List(fields.Nested(AffiliationRelationSchema))
53
+ props = fields.Dict(keys=fields.Str(), values=fields.Raw())
45
54
 
46
55
  @validates_schema
47
56
  def validate_names(self, data, **kwargs):
@@ -19,11 +19,12 @@ NamesServiceConfig = record_type.service_config_cls
19
19
  class NamesService(record_type.service_cls):
20
20
  """Name service."""
21
21
 
22
- def resolve(self, identity, id_, id_type):
22
+ def resolve(self, identity, id_, id_type, many=False):
23
23
  """Get the record with a given identifier.
24
24
 
25
- This method assumes that the are no duplicates in the system
26
- (i.e. only one name record can have a pair of identifier:scheme).
25
+ :param id_: The identifier value.
26
+ :param id_type: The identifier type.
27
+ :param many: If True, return a list of records.
27
28
  """
28
29
  search_query = dsl.Q(
29
30
  "bool",
@@ -36,20 +37,28 @@ class NamesService(record_type.service_cls):
36
37
 
37
38
  # unless many=True, max_records = 1: we assume there cannot be duplicates
38
39
  # the loading process needs to make sure of that
39
- results = self._read_many(identity, search_query, max_records=1)
40
+ if many:
41
+ results = self._read_many(identity, search_query)
42
+ else:
43
+ results = self._read_many(identity, search_query, max_records=1)
44
+
40
45
  # can't use the results_item because it returns dicts instead of records
41
46
  total = results.hits.total["value"]
42
47
  if total == 0:
43
48
  # Not a PID but treated as such
44
49
  raise PIDDoesNotExistError(pid_type=id_type, pid_value=id_)
50
+ if many:
51
+ for result in results:
52
+ record = self.record_cls.loads(result.to_dict())
53
+ self.require_permission(identity, "read", record=record)
54
+ return self.result_list(self, identity, results)
55
+ else:
56
+ record = self.record_cls.loads(results[0].to_dict())
57
+ self.require_permission(identity, "read", record=record)
45
58
 
46
- # (0 < #hits <= max_records) = 1
47
- record = self.record_cls.loads(results[0].to_dict())
48
- self.require_permission(identity, "read", record=record)
49
-
50
- return self.result_item(
51
- self,
52
- identity,
53
- record,
54
- links_tpl=self.links_item_tpl,
55
- )
59
+ return self.result_item(
60
+ self,
61
+ identity,
62
+ record,
63
+ links_tpl=self.links_item_tpl,
64
+ )
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  # Copyright (C) 2021 Northwestern University.
5
5
  # Copyright (C) 2024 University of Münster.
6
6
  #
@@ -11,6 +11,7 @@
11
11
  """Subjects configuration."""
12
12
 
13
13
  from flask import current_app
14
+ from invenio_i18n import get_locale
14
15
  from invenio_i18n import lazy_gettext as _
15
16
  from invenio_records_resources.services import SearchOptions
16
17
  from invenio_records_resources.services.records.components import DataComponent
@@ -22,6 +23,16 @@ from ...services.querystr import FilteredSuggestQueryParser
22
23
  subject_schemes = LocalProxy(
23
24
  lambda: current_app.config["VOCABULARIES_SUBJECTS_SCHEMES"]
24
25
  )
26
+ localized_title = LocalProxy(lambda: f"title.{get_locale()}^20")
27
+
28
+
29
+ gemet_file_url = LocalProxy(
30
+ lambda: current_app.config["VOCABULARIES_SUBJECTS_GEMET_FILE_URL"]
31
+ )
32
+
33
+ euroscivoc_file_url = LocalProxy(
34
+ lambda: current_app.config["VOCABULARIES_SUBJECTS_EUROSCIVOC_FILE_URL"]
35
+ )
25
36
 
26
37
 
27
38
  class SubjectsSearchOptions(SearchOptions):
@@ -30,7 +41,8 @@ class SubjectsSearchOptions(SearchOptions):
30
41
  suggest_parser_cls = FilteredSuggestQueryParser.factory(
31
42
  filter_field="scheme",
32
43
  fields=[ # suggest fields
33
- "title.*^100",
44
+ "subject^100",
45
+ localized_title,
34
46
  "synonyms^20",
35
47
  ],
36
48
  )
@@ -13,6 +13,7 @@ from invenio_i18n import lazy_gettext as _
13
13
 
14
14
  from ...datastreams.writers import ServiceWriter
15
15
  from .euroscivoc import datastreams as euroscivoc_datastreams
16
+ from .gemet import datastreams as gemet_datastreams
16
17
  from .mesh import datastreams as mesh_datastreams
17
18
 
18
19
 
@@ -32,12 +33,14 @@ class SubjectsServiceWriter(ServiceWriter):
32
33
  VOCABULARIES_DATASTREAM_READERS = {
33
34
  **mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
34
35
  **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
36
+ **gemet_datastreams.VOCABULARIES_DATASTREAM_READERS,
35
37
  }
36
38
  """Subjects Data Streams readers."""
37
39
 
38
40
  VOCABULARIES_DATASTREAM_TRANSFORMERS = {
39
41
  **mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
40
42
  **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
43
+ **gemet_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
41
44
  }
42
45
  """Subjects Data Streams transformers."""
43
46
 
@@ -45,6 +48,7 @@ VOCABULARIES_DATASTREAM_WRITERS = {
45
48
  "subjects-service": SubjectsServiceWriter,
46
49
  **mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
47
50
  **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
51
+ **gemet_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
48
52
  }
49
53
  """Subjects Data Streams writers."""
50
54
 
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2022-2024 CERN.
3
+ # Copyright (C) 2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,120 +8,36 @@
8
8
 
9
9
  """EuroSciVoc subjects datastreams, readers, transformers, and writers."""
10
10
 
11
- import io
12
- from collections import namedtuple
11
+ from invenio_vocabularies.datastreams.transformers import RDFTransformer
13
12
 
14
- import requests
15
- from rdflib import OWL, RDF, Graph, Namespace
13
+ from ..config import euroscivoc_file_url
16
14
 
17
- from invenio_vocabularies.config import SUBJECTS_EUROSCIVOC_FILE_URL
18
- from invenio_vocabularies.datastreams.readers import BaseReader
19
- from invenio_vocabularies.datastreams.transformers import BaseTransformer
20
15
 
21
-
22
- class EuroSciVocSubjectsHTTPReader(BaseReader):
23
- """Reader class to fetch and process EuroSciVoc RDF data."""
24
-
25
- def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
26
- """Initialize the reader with the data source.
27
-
28
- :param origin: The URL from which to fetch the RDF data.
29
- :param mode: Mode of operation (default is 'r' for reading).
30
- """
31
- self.origin = origin or SUBJECTS_EUROSCIVOC_FILE_URL
32
- super().__init__(origin=origin, mode=mode, *args, **kwargs)
33
-
34
- def _iter(self, rdf_graph):
35
- """Iterate over the RDF graph, yielding one subject at a time.
36
-
37
- :param rdf_graph: The RDF graph to process.
38
- :yield: Subject and graph to be transformed.
39
- """
40
- SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
41
-
42
- for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
43
- yield {"subject": subject, "rdf_graph": rdf_graph}
44
-
45
- def read(self, item=None, *args, **kwargs):
46
- """Fetch and process the EuroSciVoc RDF data, yielding it one subject at a time.
47
-
48
- :param item: The RDF data provided as bytes (optional).
49
- :yield: Processed EuroSciVoc subject data.
50
- """
51
- if item:
52
- raise NotImplementedError(
53
- "EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
54
- )
55
- # Fetch the RDF data from the specified origin URL
56
- response = requests.get(self.origin)
57
- response.raise_for_status()
58
-
59
- # Treat the response content as a file-like object
60
- rdf_data = io.BytesIO(response.content)
61
-
62
- # Parse the RDF data into a graph
63
- rdf_graph = Graph()
64
- rdf_graph.parse(rdf_data, format="xml")
65
-
66
- # Yield each processed subject from the RDF graph
67
- yield from self._iter(rdf_graph)
68
-
69
-
70
- class EuroSciVocSubjectsTransformer(BaseTransformer):
16
+ class EuroSciVocSubjectsTransformer(RDFTransformer):
71
17
  """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
72
18
 
73
- SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
74
- SPLITCHAR = ","
75
-
76
19
  def _get_notation(self, subject, rdf_graph):
77
20
  """Extract the numeric notation for a subject."""
78
21
  for _, _, notation in rdf_graph.triples(
79
- (subject, self.SKOS_CORE.notation, None)
22
+ (subject, self.skos_core.notation, None)
80
23
  ):
81
24
  if str(notation).isdigit():
82
25
  return str(notation)
83
26
  return None
84
27
 
85
- def _get_labels(self, subject, rdf_graph):
86
- """Extract prefLabel and altLabel languages for a subject."""
87
- labels = {
88
- label.language: label.value.capitalize()
89
- for _, _, label in rdf_graph.triples(
90
- (subject, self.SKOS_CORE.prefLabel, None)
91
- )
92
- }
93
- if "en" not in labels:
94
- for _, _, label in rdf_graph.triples(
95
- (subject, self.SKOS_CORE.altLabel, None)
96
- ):
97
- labels.setdefault(label.language, label.value.capitalize())
98
- return labels
99
-
100
- def _find_parents(self, subject, rdf_graph):
101
- """Find parent notations."""
102
- parents = []
103
-
104
- # Traverse the broader hierarchy
105
- for broader in rdf_graph.transitive_objects(subject, self.SKOS_CORE.broader):
106
- if broader != subject: # Ensure we don't include the current subject
107
- parent_notation = self._get_notation(broader, rdf_graph)
108
- if parent_notation:
109
- parents.append(parent_notation)
110
-
111
- return parents
28
+ def _get_parent_notation(self, broader, rdf_graph):
29
+ """Extract parent notation using numeric notation."""
30
+ return self._get_notation(broader, rdf_graph)
112
31
 
113
32
  def _transform_entry(self, subject, rdf_graph):
114
- """Transform an entry to the required dictionary format."""
115
- # Get subject notation with euroscivoc prefix
116
33
  notation = self._get_notation(subject, rdf_graph)
117
34
  id = f"euroscivoc:{notation}" if notation else None
118
- # Get labels for the current subject
119
35
  labels = self._get_labels(subject, rdf_graph)
120
- # Join parent notations with SPLITCHAR separator and add euroscivoc prefix
121
- parents = self.SPLITCHAR.join(
122
- f"euroscivoc:{n}" for n in reversed(self._find_parents(subject, rdf_graph))
36
+ parents = ",".join(
37
+ f"euroscivoc:{n}"
38
+ for n in reversed(self._find_parents(subject, rdf_graph))
39
+ if n
123
40
  )
124
- # Create identifiers list
125
41
  identifiers = [{"scheme": "url", "identifier": str(subject)}]
126
42
 
127
43
  return {
@@ -133,23 +49,9 @@ class EuroSciVocSubjectsTransformer(BaseTransformer):
133
49
  "identifiers": identifiers,
134
50
  }
135
51
 
136
- def apply(self, stream_entry, *args, **kwargs):
137
- """Transform a stream entry to the required dictionary format.
138
-
139
- :param stream_entry: The entry to be transformed, which includes the subject and the RDF graph.
140
- :return: The transformed stream entry.
141
- """
142
- # Apply transformations
143
- entry_data = self._transform_entry(
144
- stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
145
- )
146
- stream_entry.entry = entry_data
147
- return stream_entry
148
-
149
-
150
- # Configuration for datastream readers, transformers, and writers
151
- VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}
152
52
 
53
+ # Configuration for datastream transformers and writers
54
+ VOCABULARIES_DATASTREAM_READERS = {}
153
55
  VOCABULARIES_DATASTREAM_WRITERS = {}
154
56
 
155
57
  VOCABULARIES_DATASTREAM_TRANSFORMERS = {
@@ -159,8 +61,14 @@ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
159
61
  DATASTREAM_CONFIG = {
160
62
  "readers": [
161
63
  {
162
- "type": "euroscivoc-reader",
163
- }
64
+ "type": "http",
65
+ "args": {
66
+ "origin": euroscivoc_file_url,
67
+ },
68
+ },
69
+ {
70
+ "type": "rdf",
71
+ },
164
72
  ],
165
73
  "transformers": [{"type": "euroscivoc-transformer"}],
166
74
  "writers": [
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+
9
+ """GEMET Subjects module."""
@@ -0,0 +1,109 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+
9
+ """GEMET subjects datastreams, readers, transformers, and writers."""
10
+
11
+ from invenio_vocabularies.datastreams.transformers import RDFTransformer
12
+
13
+ from ..config import gemet_file_url
14
+
15
+ # Available with the "rdf" extra
16
+ try:
17
+ import rdflib
18
+ except ImportError:
19
+ rdflib = None
20
+
21
+
22
+ class GEMETSubjectsTransformer(RDFTransformer):
23
+ """Transformer class to convert GEMET RDF data to a dictionary format."""
24
+
25
+ def _get_parent_notation(self, broader, rdf_graph):
26
+ """Extract parent notation from GEMET URI."""
27
+ return "/".join(broader.split("/")[-2:])
28
+
29
+ def _get_groups_and_themes(self, subject, rdf_graph):
30
+ """Extract groups and themes for a subject."""
31
+ groups = []
32
+ themes = []
33
+
34
+ for relation in rdf_graph.subjects(
35
+ predicate=self.skos_core.member, object=subject
36
+ ):
37
+ relation_uri = str(relation)
38
+ relation_label = None
39
+
40
+ # If the relation is a group, check for skos:prefLabel
41
+ if "group" in relation_uri:
42
+ labels = rdf_graph.objects(
43
+ subject=relation, predicate=self.skos_core.prefLabel
44
+ )
45
+ relation_label = next(
46
+ (str(label) for label in labels if label.language == "en"), None
47
+ )
48
+ groups.append(relation_uri)
49
+
50
+ # If the relation is a theme, check for rdfs:label
51
+ elif "theme" in relation_uri:
52
+ labels = rdf_graph.objects(
53
+ subject=relation, predicate=rdflib.RDFS.label
54
+ )
55
+ relation_label = next(
56
+ (str(label) for label in labels if label.language == "en"), None
57
+ )
58
+ themes.append(relation_uri)
59
+
60
+ return groups, themes
61
+
62
+ def _transform_entry(self, subject, rdf_graph):
63
+ """Transform an entry to the required dictionary format."""
64
+ concept_number = "/".join(subject.split("/")[-2:])
65
+ id = f"gemet:{concept_number}" if concept_number else None
66
+ labels = self._get_labels(subject, rdf_graph)
67
+ parents = ",".join(
68
+ f"gemet:{n}" for n in reversed(self._find_parents(subject, rdf_graph)) if n
69
+ )
70
+ identifiers = [{"scheme": "url", "identifier": str(subject)}]
71
+ groups, themes = self._get_groups_and_themes(subject, rdf_graph)
72
+
73
+ props = {"parents": parents} if parents else {}
74
+
75
+ if groups:
76
+ props["groups"] = groups
77
+ if themes:
78
+ props["themes"] = themes
79
+
80
+ return {
81
+ "id": id,
82
+ "scheme": "GEMET",
83
+ "subject": labels.get("en", "").capitalize(),
84
+ "title": labels,
85
+ "props": props,
86
+ "identifiers": identifiers,
87
+ }
88
+
89
+
90
+ # Configuration for datastream transformers and writers
91
+ VOCABULARIES_DATASTREAM_READERS = {}
92
+ VOCABULARIES_DATASTREAM_WRITERS = {}
93
+
94
+ VOCABULARIES_DATASTREAM_TRANSFORMERS = {"gemet-transformer": GEMETSubjectsTransformer}
95
+
96
+ DATASTREAM_CONFIG = {
97
+ "readers": [
98
+ {
99
+ "type": "http",
100
+ "args": {
101
+ "origin": gemet_file_url,
102
+ },
103
+ },
104
+ {"type": "gzip"},
105
+ {"type": "rdf"},
106
+ ],
107
+ "transformers": [{"type": "gemet-transformer"}],
108
+ "writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
109
+ }
@@ -34,9 +34,20 @@
34
34
  "type": "object",
35
35
  "patternProperties": {
36
36
  "^.*$": {
37
- "type": "string"
37
+ "oneOf": [
38
+ {
39
+ "type": "string"
40
+ },
41
+ {
42
+ "type": "array",
43
+ "items": {
44
+ "type": "string"
45
+ }
46
+ }
47
+ ]
38
48
  }
39
- }
49
+ },
50
+ "additionalProperties": false
40
51
  },
41
52
  "identifiers": {
42
53
  "description": "Alternate identifiers for the subject.",
@@ -22,14 +22,19 @@ class MeshSubjectsTransformer(BaseTransformer):
22
22
  """Apply transformation on steam entry."""
23
23
  entry_data = stream_entry.entry
24
24
 
25
- # ID in MeSH data is the URL, ex. https://id.nlm.nih.gov/mesh/D000001
25
+ # ID in MeSH data is in the URL, ex. https://id.nlm.nih.gov/mesh/D000001
26
26
  # We just want to use the ID prefixed by "mesh:"
27
27
  try:
28
28
  mesh_id = entry_data["id"].split("/")[-1]
29
+ entry_data["id"] = "mesh:" + mesh_id
29
30
  except Exception:
30
31
  raise TransformerError("Not a valid MeSH ID.")
31
32
 
32
- entry_data["id"] = "mesh:" + mesh_id
33
+ entry_data["title"] = title = entry_data.get("title", {})
34
+ # NOTE: MeSH import file comes with an English subject by default
35
+ if "en" not in title:
36
+ title["en"] = entry_data["subject"]
37
+
33
38
  return stream_entry
34
39
 
35
40
 
@@ -13,8 +13,8 @@
13
13
  from functools import partial
14
14
 
15
15
  from invenio_i18n import get_locale
16
- from marshmallow import EXCLUDE, Schema, fields, pre_load
17
- from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
16
+ from marshmallow import EXCLUDE, Schema, ValidationError, fields, pre_load, validate
17
+ from marshmallow_utils.fields import URL, IdentifierSet, SanitizedUnicode
18
18
  from marshmallow_utils.schemas import IdentifierSchema
19
19
 
20
20
  from ...services.schema import (
@@ -25,6 +25,21 @@ from ...services.schema import (
25
25
  from .config import subject_schemes
26
26
 
27
27
 
28
+ class StringOrListOfStrings(fields.Field):
29
+ """Custom field to handle both string and list of strings."""
30
+
31
+ # TODO: Move this to marshmallow-utils for broader type support.
32
+ def _deserialize(self, value, attr, data, **kwargs):
33
+ if isinstance(value, str):
34
+ return fields.String()._deserialize(value, attr, data, **kwargs)
35
+ elif isinstance(value, list) and all(isinstance(item, str) for item in value):
36
+ return [
37
+ fields.String()._deserialize(item, attr, data, **kwargs)
38
+ for item in value
39
+ ]
40
+ raise ValidationError("Invalid value. Must be a string or a list of strings.")
41
+
42
+
28
43
  class SubjectSchema(BaseVocabularySchema):
29
44
  """Service schema for subjects."""
30
45
 
@@ -35,7 +50,7 @@ class SubjectSchema(BaseVocabularySchema):
35
50
  scheme = SanitizedUnicode(required=True)
36
51
  subject = SanitizedUnicode(required=True)
37
52
  title = i18n_strings
38
- props = fields.Dict(keys=SanitizedUnicode(), values=SanitizedUnicode())
53
+ props = fields.Dict(keys=SanitizedUnicode(), values=StringOrListOfStrings())
39
54
  identifiers = IdentifierSet(
40
55
  fields.Nested(
41
56
  partial(