invenio-vocabularies 6.6.0__py2.py3-none-any.whl → 6.7.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (36)
  1. invenio_vocabularies/__init__.py +1 -1
  2. invenio_vocabularies/assets/semantic-ui/js/invenio_vocabularies/src/contrib/forms/Funding/FundingModal.js +3 -27
  3. invenio_vocabularies/config.py +27 -1
  4. invenio_vocabularies/contrib/affiliations/config.py +21 -10
  5. invenio_vocabularies/contrib/affiliations/datastreams.py +103 -1
  6. invenio_vocabularies/contrib/awards/datastreams.py +7 -0
  7. invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +9 -0
  8. invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +22 -1
  9. invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +22 -1
  10. invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +22 -1
  11. invenio_vocabularies/contrib/awards/schema.py +9 -3
  12. invenio_vocabularies/contrib/funders/config.py +19 -12
  13. invenio_vocabularies/contrib/names/config.py +13 -10
  14. invenio_vocabularies/contrib/names/mappings/os-v1/names/name-v2.0.0.json +11 -0
  15. invenio_vocabularies/contrib/names/mappings/os-v2/names/name-v2.0.0.json +11 -0
  16. invenio_vocabularies/contrib/names/names.py +1 -1
  17. invenio_vocabularies/contrib/names/schema.py +10 -2
  18. invenio_vocabularies/contrib/subjects/config.py +10 -1
  19. invenio_vocabularies/contrib/subjects/datastreams.py +4 -0
  20. invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +22 -114
  21. invenio_vocabularies/contrib/subjects/gemet/__init__.py +9 -0
  22. invenio_vocabularies/contrib/subjects/gemet/datastreams.py +109 -0
  23. invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +13 -2
  24. invenio_vocabularies/contrib/subjects/schema.py +18 -3
  25. invenio_vocabularies/datastreams/readers.py +99 -9
  26. invenio_vocabularies/datastreams/transformers.py +55 -0
  27. invenio_vocabularies/factories.py +15 -0
  28. invenio_vocabularies/records/jsonschemas/vocabularies/definitions-v1.0.0.json +9 -0
  29. invenio_vocabularies/services/config.py +1 -7
  30. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/METADATA +21 -2
  31. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/RECORD +36 -34
  32. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/AUTHORS.rst +0 -0
  33. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/LICENSE +0 -0
  34. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/WHEEL +0 -0
  35. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/entry_points.txt +0 -0
  36. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.7.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2021 CERN.
+# Copyright (C) 2021-2024 CERN.
 # Copyright (C) 2021 Northwestern University.
 # Copyright (C) 2024 University of Münster.
 #
@@ -26,6 +26,15 @@ subject_schemes = LocalProxy(
 localized_title = LocalProxy(lambda: f"title.{get_locale()}^20")


+gemet_file_url = LocalProxy(
+    lambda: current_app.config["VOCABULARIES_SUBJECTS_GEMET_FILE_URL"]
+)
+
+euroscivoc_file_url = LocalProxy(
+    lambda: current_app.config["VOCABULARIES_SUBJECTS_EUROSCIVOC_FILE_URL"]
+)
+
+
 class SubjectsSearchOptions(SearchOptions):
     """Search options."""

@@ -13,6 +13,7 @@ from invenio_i18n import lazy_gettext as _

 from ...datastreams.writers import ServiceWriter
 from .euroscivoc import datastreams as euroscivoc_datastreams
+from .gemet import datastreams as gemet_datastreams
 from .mesh import datastreams as mesh_datastreams


@@ -32,12 +33,14 @@ class SubjectsServiceWriter(ServiceWriter):
 VOCABULARIES_DATASTREAM_READERS = {
     **mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
     **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
+    **gemet_datastreams.VOCABULARIES_DATASTREAM_READERS,
 }
 """Subjects Data Streams readers."""

 VOCABULARIES_DATASTREAM_TRANSFORMERS = {
     **mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
     **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
+    **gemet_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
 }
 """Subjects Data Streams transformers."""

@@ -45,6 +48,7 @@ VOCABULARIES_DATASTREAM_WRITERS = {
     "subjects-service": SubjectsServiceWriter,
     **mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
     **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
+    **gemet_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
 }
 """Subjects Data Streams writers."""

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2022-2024 CERN.
+# Copyright (C) 2024 CERN.
 #
 # Invenio-Vocabularies is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,120 +8,36 @@

 """EuroSciVoc subjects datastreams, readers, transformers, and writers."""

-import io
-from collections import namedtuple
+from invenio_vocabularies.datastreams.transformers import RDFTransformer

-import requests
-from rdflib import OWL, RDF, Graph, Namespace
+from ..config import euroscivoc_file_url

-from invenio_vocabularies.config import SUBJECTS_EUROSCIVOC_FILE_URL
-from invenio_vocabularies.datastreams.readers import BaseReader
-from invenio_vocabularies.datastreams.transformers import BaseTransformer

-
-class EuroSciVocSubjectsHTTPReader(BaseReader):
-    """Reader class to fetch and process EuroSciVoc RDF data."""
-
-    def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
-        """Initialize the reader with the data source.
-
-        :param origin: The URL from which to fetch the RDF data.
-        :param mode: Mode of operation (default is 'r' for reading).
-        """
-        self.origin = origin or SUBJECTS_EUROSCIVOC_FILE_URL
-        super().__init__(origin=origin, mode=mode, *args, **kwargs)
-
-    def _iter(self, rdf_graph):
-        """Iterate over the RDF graph, yielding one subject at a time.
-
-        :param rdf_graph: The RDF graph to process.
-        :yield: Subject and graph to be transformed.
-        """
-        SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
-
-        for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
-            yield {"subject": subject, "rdf_graph": rdf_graph}
-
-    def read(self, item=None, *args, **kwargs):
-        """Fetch and process the EuroSciVoc RDF data, yielding it one subject at a time.
-
-        :param item: The RDF data provided as bytes (optional).
-        :yield: Processed EuroSciVoc subject data.
-        """
-        if item:
-            raise NotImplementedError(
-                "EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
-            )
-        # Fetch the RDF data from the specified origin URL
-        response = requests.get(self.origin)
-        response.raise_for_status()
-
-        # Treat the response content as a file-like object
-        rdf_data = io.BytesIO(response.content)
-
-        # Parse the RDF data into a graph
-        rdf_graph = Graph()
-        rdf_graph.parse(rdf_data, format="xml")
-
-        # Yield each processed subject from the RDF graph
-        yield from self._iter(rdf_graph)
-
-
-class EuroSciVocSubjectsTransformer(BaseTransformer):
+class EuroSciVocSubjectsTransformer(RDFTransformer):
     """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""

-    SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
-    SPLITCHAR = ","
-
     def _get_notation(self, subject, rdf_graph):
         """Extract the numeric notation for a subject."""
         for _, _, notation in rdf_graph.triples(
-            (subject, self.SKOS_CORE.notation, None)
+            (subject, self.skos_core.notation, None)
         ):
             if str(notation).isdigit():
                 return str(notation)
         return None

-    def _get_labels(self, subject, rdf_graph):
-        """Extract prefLabel and altLabel languages for a subject."""
-        labels = {
-            label.language: label.value.capitalize()
-            for _, _, label in rdf_graph.triples(
-                (subject, self.SKOS_CORE.prefLabel, None)
-            )
-        }
-        if "en" not in labels:
-            for _, _, label in rdf_graph.triples(
-                (subject, self.SKOS_CORE.altLabel, None)
-            ):
-                labels.setdefault(label.language, label.value.capitalize())
-        return labels
-
-    def _find_parents(self, subject, rdf_graph):
-        """Find parent notations."""
-        parents = []
-
-        # Traverse the broader hierarchy
-        for broader in rdf_graph.transitive_objects(subject, self.SKOS_CORE.broader):
-            if broader != subject:  # Ensure we don't include the current subject
-                parent_notation = self._get_notation(broader, rdf_graph)
-                if parent_notation:
-                    parents.append(parent_notation)
-
-        return parents
+    def _get_parent_notation(self, broader, rdf_graph):
+        """Extract parent notation using numeric notation."""
+        return self._get_notation(broader, rdf_graph)

     def _transform_entry(self, subject, rdf_graph):
-        """Transform an entry to the required dictionary format."""
-        # Get subject notation with euroscivoc prefix
         notation = self._get_notation(subject, rdf_graph)
         id = f"euroscivoc:{notation}" if notation else None
-        # Get labels for the current subject
         labels = self._get_labels(subject, rdf_graph)
-        # Join parent notations with SPLITCHAR separator and add euroscivoc prefix
-        parents = self.SPLITCHAR.join(
-            f"euroscivoc:{n}" for n in reversed(self._find_parents(subject, rdf_graph))
+        parents = ",".join(
+            f"euroscivoc:{n}"
+            for n in reversed(self._find_parents(subject, rdf_graph))
+            if n
        )
-        # Create identifiers list
         identifiers = [{"scheme": "url", "identifier": str(subject)}]

         return {
@@ -133,23 +49,9 @@ class EuroSciVocSubjectsTransformer(BaseTransformer):
             "identifiers": identifiers,
         }

-    def apply(self, stream_entry, *args, **kwargs):
-        """Transform a stream entry to the required dictionary format.
-
-        :param stream_entry: The entry to be transformed, which includes the subject and the RDF graph.
-        :return: The transformed stream entry.
-        """
-        # Apply transformations
-        entry_data = self._transform_entry(
-            stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
-        )
-        stream_entry.entry = entry_data
-        return stream_entry
-
-
-# Configuration for datastream readers, transformers, and writers
-VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}

+# Configuration for datastream transformers, and writers
+VOCABULARIES_DATASTREAM_READERS = {}
 VOCABULARIES_DATASTREAM_WRITERS = {}

 VOCABULARIES_DATASTREAM_TRANSFORMERS = {
@@ -159,8 +61,14 @@ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
 DATASTREAM_CONFIG = {
     "readers": [
         {
-            "type": "euroscivoc-reader",
-        }
+            "type": "http",
+            "args": {
+                "origin": euroscivoc_file_url,
+            },
+        },
+        {
+            "type": "rdf",
+        },
     ],
     "transformers": [{"type": "euroscivoc-transformer"}],
     "writers": [
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""GEMET Subjects module."""
@@ -0,0 +1,109 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""GEMET subjects datastreams, readers, transformers, and writers."""
+
+from invenio_vocabularies.datastreams.transformers import RDFTransformer
+
+from ..config import gemet_file_url
+
+# Available with the "rdf" extra
+try:
+    import rdflib
+except ImportError:
+    rdflib = None
+
+
+class GEMETSubjectsTransformer(RDFTransformer):
+    """Transformer class to convert GEMET RDF data to a dictionary format."""
+
+    def _get_parent_notation(self, broader, rdf_graph):
+        """Extract parent notation from GEMET URI."""
+        return "/".join(broader.split("/")[-2:])
+
+    def _get_groups_and_themes(self, subject, rdf_graph):
+        """Extract groups and themes for a subject."""
+        groups = []
+        themes = []
+
+        for relation in rdf_graph.subjects(
+            predicate=self.skos_core.member, object=subject
+        ):
+            relation_uri = str(relation)
+            relation_label = None
+
+            # If the relation is a group, check for skos:prefLabel
+            if "group" in relation_uri:
+                labels = rdf_graph.objects(
+                    subject=relation, predicate=self.skos_core.prefLabel
+                )
+                relation_label = next(
+                    (str(label) for label in labels if label.language == "en"), None
+                )
+                groups.append(relation_uri)
+
+            # If the relation is a theme, check for rdfs:label
+            elif "theme" in relation_uri:
+                labels = rdf_graph.objects(
+                    subject=relation, predicate=rdflib.RDFS.label
+                )
+                relation_label = next(
+                    (str(label) for label in labels if label.language == "en"), None
+                )
+                themes.append(relation_uri)
+
+        return groups, themes
+
+    def _transform_entry(self, subject, rdf_graph):
+        """Transform an entry to the required dictionary format."""
+        concept_number = "/".join(subject.split("/")[-2:])
+        id = f"gemet:{concept_number}" if concept_number else None
+        labels = self._get_labels(subject, rdf_graph)
+        parents = ",".join(
+            f"gemet:{n}" for n in reversed(self._find_parents(subject, rdf_graph)) if n
+        )
+        identifiers = [{"scheme": "url", "identifier": str(subject)}]
+        groups, themes = self._get_groups_and_themes(subject, rdf_graph)
+
+        props = {"parents": parents} if parents else {}
+
+        if groups:
+            props["groups"] = groups
+        if themes:
+            props["themes"] = themes
+
+        return {
+            "id": id,
+            "scheme": "GEMET",
+            "subject": labels.get("en", "").capitalize(),
+            "title": labels,
+            "props": props,
+            "identifiers": identifiers,
+        }
+
+
+# Configuration for datastream transformers, and writers
+VOCABULARIES_DATASTREAM_READERS = {}
+VOCABULARIES_DATASTREAM_WRITERS = {}
+
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {"gemet-transformer": GEMETSubjectsTransformer}
+
+DATASTREAM_CONFIG = {
+    "readers": [
+        {
+            "type": "http",
+            "args": {
+                "origin": gemet_file_url,
+            },
+        },
+        {"type": "gzip"},
+        {"type": "rdf"},
+    ],
+    "transformers": [{"type": "gemet-transformer"}],
+    "writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
+}
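The GEMET and EuroSciVoc transformers above both lean on the SKOS helpers of the new RDFTransformer base class (introduced further down in datastreams/transformers.py). The following standalone sketch, which uses only rdflib and an invented two-concept graph rather than the real GEMET dump, mirrors those prefLabel/broader access patterns; the example.org URIs and labels are made up for illustration only.

# Standalone illustration of the SKOS patterns used above (assumes rdflib is installed).
# The concept URIs and labels are invented; a real run would fetch the file configured
# as VOCABULARIES_SUBJECTS_GEMET_FILE_URL through the http/gzip/rdf reader chain.
import rdflib

SKOS = rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")

DATA = """
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
<http://example.org/concept/100/1> a skos:Concept ;
    skos:prefLabel "water"@en , "Wasser"@de ;
    skos:broader <http://example.org/concept/100/0> .
<http://example.org/concept/100/0> a skos:Concept ;
    skos:prefLabel "environment"@en .
"""

graph = rdflib.Graph()
graph.parse(data=DATA, format="turtle")

for concept in graph.subjects(rdflib.RDF.type, SKOS.Concept):
    # prefLabel per language tag, as _get_labels() does (capitalized, one per language)
    labels = {
        label.language: label.value.capitalize()
        for label in graph.objects(concept, SKOS.prefLabel)
    }
    # broader chain, as _find_parents() does via transitive_objects()
    parents = [
        str(broader)
        for broader in graph.transitive_objects(concept, SKOS.broader)
        if broader != concept
    ]
    print(concept, labels, parents)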
@@ -34,9 +34,20 @@
       "type": "object",
       "patternProperties": {
         "^.*$": {
-          "type": "string"
+          "oneOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
+            }
+          ]
         }
-      }
+      },
+      "additionalProperties": false
     },
     "identifiers": {
       "description": "Alternate identifiers for the subject.",
@@ -13,8 +13,8 @@
 from functools import partial

 from invenio_i18n import get_locale
-from marshmallow import EXCLUDE, Schema, fields, pre_load
-from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
+from marshmallow import EXCLUDE, Schema, ValidationError, fields, pre_load, validate
+from marshmallow_utils.fields import URL, IdentifierSet, SanitizedUnicode
 from marshmallow_utils.schemas import IdentifierSchema

 from ...services.schema import (
@@ -25,6 +25,21 @@ from ...services.schema import (
 )
 from .config import subject_schemes

+class StringOrListOfStrings(fields.Field):
+    """Custom field to handle both string and list of strings."""
+
+    # TODO: Move this to marshmallow-utils for broader type support.
+    def _deserialize(self, value, attr, data, **kwargs):
+        if isinstance(value, str):
+            return fields.String()._deserialize(value, attr, data, **kwargs)
+        elif isinstance(value, list) and all(isinstance(item, str) for item in value):
+            return [
+                fields.String()._deserialize(item, attr, data, **kwargs)
+                for item in value
+            ]
+        raise ValidationError("Invalid value. Must be a string or a list of strings.")
+
+
 class SubjectSchema(BaseVocabularySchema):
     """Service schema for subjects."""

@@ -35,7 +50,7 @@ class SubjectSchema(BaseVocabularySchema):
     scheme = SanitizedUnicode(required=True)
     subject = SanitizedUnicode(required=True)
     title = i18n_strings
-    props = fields.Dict(keys=SanitizedUnicode(), values=SanitizedUnicode())
+    props = fields.Dict(keys=SanitizedUnicode(), values=StringOrListOfStrings())
     identifiers = IdentifierSet(
         fields.Nested(
             partial(
@@ -11,6 +11,7 @@

 import csv
 import gzip
+import io
 import json
 import re
 import tarfile
@@ -27,11 +28,25 @@ from lxml.html import parse as html_parse
 from .errors import ReaderError
 from .xml import etree_to_dict

+# Extras dependencies
+# "oaipmh"
 try:
     import oaipmh_scythe
 except ImportError:
     oaipmh_scythe = None

+# "rdf"
+try:
+    import rdflib
+except ImportError:
+    rdflib = None
+
+# "sparql"
+try:
+    import SPARQLWrapper as sparql
+except ImportError:
+    sparql = None
+

 class BaseReader(ABC):
@@ -103,8 +118,7 @@ class SimpleHTTPReader(BaseReader):

     def __init__(self, origin, id=None, ids=None, content_type=None, *args, **kwargs):
         """Constructor."""
-        assert id or ids
-        self._ids = ids if ids else [id]
+        self._ids = ids if ids else ([id] if id else None)
         self.content_type = content_type
         super().__init__(origin, *args, **kwargs)

@@ -113,14 +127,22 @@
         base_url = url
         headers = {"Accept": self.content_type}

-        for id_ in self._ids:
-            url = base_url.format(id=id_)
+        # If there are no IDs, query the base URL
+        if not self._ids:
             resp = requests.get(url, headers=headers)
-            if resp.status_code != 200:
-                # todo add logging/fail
-                pass
+            if resp.status_code == 200:
+                yield resp.content
+            else:
+                print(f"Failed to fetch URL {url}: {resp.status_code}")
+        else:
+            for id_ in self._ids:
+                url = base_url.format(id=id_)
+                resp = requests.get(url, headers=headers)
+                if resp.status_code != 200:
+                    # todo add logging/fail
+                    pass

-            yield resp.content
+                yield resp.content

     def read(self, item=None, *args, **kwargs):
         """Chooses between item and origin as url."""
@@ -197,6 +219,9 @@
     """Gzip reader."""

     def _iter(self, fp, *args, **kwargs):
+        if isinstance(fp, bytes):
+            fp = io.BytesIO(fp)
+
         with gzip.open(fp) as gp:
             yield gp

@@ -236,7 +261,7 @@
         try:
             xml_tree = fromstring(fp)
             xml_dict = etree_to_dict(xml_tree)
-        except Exception as e:
+        except Exception:
             xml_tree = html_parse(fp).getroot()
             xml_dict = etree_to_dict(xml_tree)["html"]["body"]

@@ -346,3 +371,68 @@ def xml_to_dict(tree: etree._Element):
     dict_obj["record"] = etree.tostring(tree)

     return dict_obj
+
+
+class RDFReader(BaseReader):
+    """Base Reader class to fetch and process RDF data."""
+
+    @property
+    def skos_core(self):
+        """Return the SKOS Core namespace."""
+        return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
+
+    def _iter(self, rdf_graph):
+        """Iterate over the RDF graph, yielding one subject at a time."""
+        for subject, _, _ in rdf_graph.triples(
+            (None, rdflib.RDF.type, self.skos_core.Concept)
+        ):
+            yield {"subject": subject, "rdf_graph": rdf_graph}
+
+    def read(self, item=None, *args, **kwargs):
+        """Fetch and process the RDF data, yielding it one subject at a time."""
+        if isinstance(item, gzip.GzipFile):
+            rdf_content = item.read().decode("utf-8")
+
+        elif isinstance(item, bytes):
+            rdf_content = item.decode("utf-8")
+        else:
+            raise ReaderError("Unsupported content type")
+
+        rdf_graph = rdflib.Graph()
+        rdf_graph.parse(io.StringIO(rdf_content), format="xml")
+
+        yield from self._iter(rdf_graph)
+
+
+class SPARQLReader(BaseReader):
+    """Generic reader class to fetch and process RDF data from a SPARQL endpoint."""
+
+    def __init__(self, origin, query, mode="r", *args, **kwargs):
+        """Initialize the reader with the data source.
+
+        :param origin: The SPARQL endpoint from which to fetch the RDF data.
+        :param query: The SPARQL query to execute.
+        :param mode: Mode of operation (default is 'r' for reading).
+        """
+        self._origin = origin
+        self._query = query
+        super().__init__(origin=origin, mode=mode, *args, **kwargs)
+
+    def _iter(self, fp, *args, **kwargs):
+        raise NotImplementedError(
+            "SPARQLReader downloads one result set from SPARQL and therefore does not iterate through items"
+        )
+
+    def read(self, item=None, *args, **kwargs):
+        """Fetch and process RDF data, yielding results one at a time."""
+        if item:
+            raise NotImplementedError(
+                "SPARQLReader does not support being chained after another reader"
+            )
+
+        sparql_client = sparql.SPARQLWrapper(self._origin)
+        sparql_client.setQuery(self._query)
+        sparql_client.setReturnFormat(sparql.JSON)
+
+        results = sparql_client.query().convert()
+        yield from results["results"]["bindings"]
@@ -15,6 +15,11 @@ from lxml import etree
 from .errors import TransformerError
 from .xml import etree_to_dict

+try:
+    import rdflib
+except ImportError:
+    rdflib = None
+

 class BaseTransformer(ABC):
@@ -61,3 +66,53 @@ class XMLTransformer(BaseTransformer):

         stream_entry.entry = record
         return stream_entry
+
+
+class RDFTransformer(BaseTransformer):
+    """Base Transformer class for RDF data to dictionary format."""
+
+    @property
+    def skos_core(self):
+        """Get the SKOS core namespace."""
+        return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
+
+    def _get_labels(self, subject, rdf_graph):
+        """Extract labels (prefLabel or altLabel) for a subject."""
+        labels = {
+            label.language: label.value.capitalize()
+            for _, _, label in rdf_graph.triples(
+                (subject, self.skos_core.prefLabel, None)
+            )
+            if label.language and "-" not in label.language
+        }
+
+        if "en" not in labels:
+            for _, _, label in rdf_graph.triples(
+                (subject, self.skos_core.altLabel, None)
+            ):
+                labels.setdefault(label.language, label.value.capitalize())
+
+        return labels
+
+    def _find_parents(self, subject, rdf_graph):
+        """Find parent notations."""
+        return [
+            self._get_parent_notation(broader, rdf_graph)
+            for broader in rdf_graph.transitive_objects(subject, self.skos_core.broader)
+            if broader != subject
+        ]
+
+    def _get_parent_notation(self, broader, rdf_graph):
+        """Extract notation for a parent."""
+        raise NotImplementedError("This method should be implemented in a subclass.")
+
+    def _transform_entry(self, subject, rdf_graph):
+        """Transform an RDF subject entry into the desired dictionary format."""
+        raise NotImplementedError("This method should be implemented in a subclass.")
+
+    def apply(self, stream_entry, *args, **kwargs):
+        """Apply transformation to a stream entry."""
+        stream_entry.entry = self._transform_entry(
+            stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
+        )
+        return stream_entry
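The new RDFTransformer leaves two hooks (_get_parent_notation and _transform_entry) to vocabulary-specific subclasses, which is exactly how the EuroSciVoc and GEMET transformers above use it. A minimal, hedged sketch of that extension contract, wired up by hand with a toy graph instead of the Invenio datastream machinery (the subclass, URIs, and the SimpleNamespace stand-in for a stream entry are all invented for the example; apply() only needs an object with an .entry attribute):

# Hedged sketch of subclassing RDFTransformer (assumes invenio-vocabularies 6.7.0
# and rdflib are installed). MinimalSubjectsTransformer is illustrative only.
import types

import rdflib

from invenio_vocabularies.datastreams.transformers import RDFTransformer


class MinimalSubjectsTransformer(RDFTransformer):
    """Example subclass implementing the two hooks RDFTransformer leaves abstract."""

    def _get_parent_notation(self, broader, rdf_graph):
        # Use the last URI segment as the parent notation.
        return str(broader).rsplit("/", 1)[-1]

    def _transform_entry(self, subject, rdf_graph):
        return {
            "id": str(subject).rsplit("/", 1)[-1],
            "title": self._get_labels(subject, rdf_graph),
            "parents": self._find_parents(subject, rdf_graph),
        }


graph = rdflib.Graph()
graph.parse(
    data="""
    @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
    <http://example.org/c/1> a skos:Concept ; skos:prefLabel "Water"@en .
    """,
    format="turtle",
)

concept = rdflib.URIRef("http://example.org/c/1")
# Stand-in for the StreamEntry object produced by the "rdf" reader.
entry = types.SimpleNamespace(entry={"subject": concept, "rdf_graph": graph})
print(MinimalSubjectsTransformer().apply(entry).entry)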
@@ -16,6 +16,9 @@ from invenio_records_resources.proxies import current_service_registry
 from .contrib.affiliations.datastreams import (
     DATASTREAM_CONFIG as affiliations_ds_config,
 )
+from .contrib.affiliations.datastreams import (
+    DATASTREAM_CONFIG_EDMO as affiliations_edmo_ds_config,
+)
 from .contrib.affiliations.datastreams import (
     DATASTREAM_CONFIG_OPENAIRE as affiliations_openaire_ds_config,
 )
@@ -123,6 +126,17 @@ class AffiliationsOpenAIREVocabularyConfig(VocabularyConfig):
         raise NotImplementedError("Service not implemented for OpenAIRE Affiliations")


+class AffiliationsEDMOVocabularyConfig(VocabularyConfig):
+    """European Directory of Marine Organisations (EDMO) Affiliations Vocabulary Config."""
+
+    config = affiliations_edmo_ds_config
+    vocabulary_name = "affiliations:edmo"
+
+    def get_service(self):
+        """Get the service for the vocabulary."""
+        raise NotImplementedError("Service not implemented for EDMO Affiliations")
+
+
 def get_vocabulary_config(vocabulary):
     """Factory function to get the appropriate Vocabulary Config."""
     vocab_config = {
@@ -132,6 +146,7 @@ def get_vocabulary_config(vocabulary):
         "awards:cordis": AwardsCordisVocabularyConfig,
         "affiliations": AffiliationsVocabularyConfig,
         "affiliations:openaire": AffiliationsOpenAIREVocabularyConfig,
+        "affiliations:edmo": AffiliationsEDMOVocabularyConfig,
         "subjects": SubjectsVocabularyConfig,
     }
     return vocab_config.get(vocabulary, VocabularyConfig)()
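The factory change above registers the new EDMO affiliations datastream under the "affiliations:edmo" key. A minimal sketch of how that lookup behaves after this release (assuming the package and its Invenio dependencies are importable; attribute names are taken directly from the diff above):

# Hedged sketch: resolving the new EDMO vocabulary config via the factory.
from invenio_vocabularies.factories import get_vocabulary_config

vc = get_vocabulary_config("affiliations:edmo")
print(type(vc).__name__)        # AffiliationsEDMOVocabularyConfig
print(vc.vocabulary_name)       # "affiliations:edmo"

# Unknown names fall back to the generic VocabularyConfig, per the .get() default.
fallback = get_vocabulary_config("does-not-exist")
print(type(fallback).__name__)  # VocabularyConfig

# get_service() for EDMO is explicitly not implemented in this release:
# vc.get_service()  # would raise NotImplementedError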