invenio-vocabularies 5.1.0__py2.py3-none-any.whl → 6.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of invenio-vocabularies might be problematic. Click here for more details.

Files changed (39)
  1. invenio_vocabularies/__init__.py +1 -1
  2. invenio_vocabularies/cli.py +7 -2
  3. invenio_vocabularies/config.py +13 -0
  4. invenio_vocabularies/contrib/affiliations/datastreams.py +95 -1
  5. invenio_vocabularies/contrib/awards/awards.py +15 -4
  6. invenio_vocabularies/contrib/awards/datastreams.py +156 -60
  7. invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +35 -0
  8. invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +44 -1
  9. invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +44 -1
  10. invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +44 -1
  11. invenio_vocabularies/contrib/awards/schema.py +16 -1
  12. invenio_vocabularies/contrib/awards/serializer.py +8 -1
  13. invenio_vocabularies/contrib/common/openaire/__init__.py +9 -0
  14. invenio_vocabularies/contrib/common/openaire/datastreams.py +84 -0
  15. invenio_vocabularies/contrib/common/ror/datastreams.py +20 -7
  16. invenio_vocabularies/contrib/names/datastreams.py +12 -2
  17. invenio_vocabularies/contrib/names/names.py +4 -3
  18. invenio_vocabularies/contrib/names/permissions.py +20 -0
  19. invenio_vocabularies/contrib/subjects/datastreams.py +12 -6
  20. invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py +9 -0
  21. invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +171 -0
  22. invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +16 -0
  23. invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json +14 -0
  24. invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json +14 -0
  25. invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json +14 -0
  26. invenio_vocabularies/contrib/subjects/mesh/__init__.py +9 -0
  27. invenio_vocabularies/contrib/subjects/schema.py +30 -6
  28. invenio_vocabularies/datastreams/readers.py +15 -4
  29. invenio_vocabularies/datastreams/transformers.py +15 -4
  30. invenio_vocabularies/datastreams/writers.py +44 -12
  31. invenio_vocabularies/factories.py +30 -0
  32. invenio_vocabularies/jobs.py +88 -0
  33. {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/METADATA +16 -1
  34. {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/RECORD +39 -32
  35. {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/entry_points.txt +3 -0
  36. {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/AUTHORS.rst +0 -0
  37. {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/LICENSE +0 -0
  38. {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/WHEEL +0 -0
  39. {invenio_vocabularies-5.1.0.dist-info → invenio_vocabularies-6.1.0.dist-info}/top_level.txt +0 -0
@@ -58,12 +58,55 @@
58
58
  "acronym": {
59
59
  "type": "keyword",
60
60
  "fields": {
61
- "text": { "type": "text"}
61
+ "text": { "type": "text" }
62
62
  }
63
63
  },
64
64
  "program": {
65
65
  "type": "keyword"
66
66
  },
67
+ "subjects": {
68
+ "properties": {
69
+ "@v": {
70
+ "type": "keyword"
71
+ },
72
+ "id": {
73
+ "type": "keyword"
74
+ },
75
+ "props": {
76
+ "type": "object",
77
+ "dynamic": "true"
78
+ },
79
+ "subject": {
80
+ "type": "keyword"
81
+ },
82
+ "scheme": {
83
+ "type": "keyword"
84
+ },
85
+ "identifiers": {
86
+ "properties": {
87
+ "identifier": {
88
+ "type": "keyword"
89
+ },
90
+ "scheme": {
91
+ "type": "keyword"
92
+ }
93
+ }
94
+ }
95
+ }
96
+ },
97
+ "organizations": {
98
+ "properties": {
99
+ "scheme": {
100
+ "type": "keyword"
101
+ },
102
+ "id": {
103
+ "type": "keyword"
104
+ },
105
+ "organization": {
106
+ "type": "keyword"
107
+ }
108
+ }
109
+ },
67
110
  "funder": {
68
111
  "type": "object",
69
112
  "properties": {
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021-2022 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -17,13 +17,24 @@ from marshmallow_utils.schemas import IdentifierSchema
17
17
 
18
18
  from ...services.schema import (
19
19
  BaseVocabularySchema,
20
+ ContribVocabularyRelationSchema,
20
21
  ModePIDFieldVocabularyMixin,
21
22
  i18n_strings,
22
23
  )
23
24
  from ..funders.schema import FunderRelationSchema
25
+ from ..subjects.schema import SubjectRelationSchema
24
26
  from .config import award_schemes
25
27
 
26
28
 
29
class AwardOrganizationRelationSchema(ContribVocabularyRelationSchema):
    """Schema to define an organization relation in an award."""

    # NOTE(review): presumably the name of the free-text field consulted by
    # ContribVocabularyRelationSchema when no ``id`` is given -- confirm
    # against the base class, which is not visible here.
    ftf_name = "organization"
    # NOTE(review): presumably the name of the parent list field holding
    # these relations (the award's ``organizations``) -- confirm against
    # the base class.
    parent_field_name = "organizations"
    # Free-text organization name.
    organization = SanitizedUnicode()
    # Identifier scheme of the related organization.
    scheme = SanitizedUnicode()
36
+
37
+
27
38
  class AwardSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
28
39
  """Award schema."""
29
40
 
@@ -46,6 +57,10 @@ class AwardSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin):
46
57
 
47
58
  program = SanitizedUnicode()
48
59
 
60
+ subjects = fields.List(fields.Nested(SubjectRelationSchema))
61
+
62
+ organizations = fields.List(fields.Nested(AwardOrganizationRelationSchema))
63
+
49
64
  id = SanitizedUnicode(
50
65
  validate=validate.Length(min=1, error=_("PID cannot be blank."))
51
66
  )
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2022 CERN.
3
+ # Copyright (C) 2022-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -12,6 +12,9 @@ from marshmallow import Schema, fields
12
12
 
13
13
  from invenio_vocabularies.resources import L10NString
14
14
 
15
+ from ..subjects.schema import SubjectRelationSchema
16
+ from .schema import AwardOrganizationRelationSchema
17
+
15
18
 
16
19
  class IdentifierSchema(Schema):
17
20
  """Identifier scheme."""
@@ -37,4 +40,8 @@ class AwardL10NItemSchema(Schema):
37
40
  acronym = fields.String(dump_only=True)
38
41
  program = fields.String(dump_only=True)
39
42
  funder = fields.Nested(FunderRelationSchema, dump_only=True)
43
+ subjects = fields.List(fields.Nested(SubjectRelationSchema), dump_only=True)
40
44
  identifiers = fields.List(fields.Nested(IdentifierSchema), dump_only=True)
45
+ organizations = fields.List(
46
+ fields.Nested(AwardOrganizationRelationSchema), dump_only=True
47
+ )
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+
9
+ """OpenAIRE-related module."""
@@ -0,0 +1,84 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+
9
+ """OpenAIRE-related Datastreams Readers/Writers/Transformers module."""
10
+
11
+ import io
12
+
13
+ import requests
14
+
15
+ from invenio_vocabularies.datastreams.errors import ReaderError
16
+ from invenio_vocabularies.datastreams.readers import BaseReader
17
+
18
+
19
class OpenAIREHTTPReader(BaseReader):
    """OpenAIRE HTTP reader.

    Yields an in-memory binary stream of the latest OpenAIRE Graph Dataset
    tar file of a given type, downloaded from Zenodo.
    """

    # (connect, read) timeouts in seconds applied to every HTTP request.
    # Without a timeout ``requests`` may block forever on an unresponsive
    # server. The read timeout bounds the wait between two received chunks,
    # not the total download time, so large tar files are still supported.
    REQUEST_TIMEOUT = (10, 300)

    def __init__(self, origin=None, mode="r", tar_href=None, *args, **kwargs):
        """Constructor.

        :param origin: which dataset to read, either ``"full"`` or ``"diff"``.
        :param mode: forwarded unchanged to ``BaseReader``.
        :param tar_href: suffix that the URL of the wanted tar file ends with.
        """
        self.tar_href = tar_href
        super().__init__(origin, mode, *args, **kwargs)

    def _iter(self, fp, *args, **kwargs):
        """Not supported: this reader yields a single file, not items."""
        raise NotImplementedError(
            "OpenAIREHTTPReader downloads one file and therefore does not iterate through items"
        )

    def read(self, item=None, *args, **kwargs):
        """Download the latest OpenAIRE Graph Dataset tar file from Zenodo.

        :param item: must be falsy; this reader cannot be chained after
            another reader.
        :yield: a single ``io.BytesIO`` holding the whole tar file.
        :raises NotImplementedError: if ``item`` is provided.
        :raises ReaderError: if the origin is neither ``"full"`` nor
            ``"diff"``, if ``tar_href`` was not configured, or if the record
            does not link exactly one matching tar file.
        :raises requests.HTTPError: if Zenodo answers with an error status.
        """
        if item:
            raise NotImplementedError(
                "OpenAIREHTTPReader does not support being chained after another reader"
            )

        if self._origin == "full":
            # OpenAIRE Graph Dataset
            api_url = "https://zenodo.org/api/records/3516917"
        elif self._origin == "diff":
            # OpenAIRE Graph dataset: new collected projects
            api_url = "https://zenodo.org/api/records/6419021"
        else:
            raise ReaderError("The --origin option should be either 'full' or 'diff'")

        # Fail early with a reader-level error, before any network call:
        # ``str.endswith(None)`` would otherwise raise a bare TypeError.
        if self.tar_href is None:
            raise ReaderError(
                "OpenAIREHTTPReader requires a 'tar_href' to select the tar file to download"
            )

        # Call the signposting `linkset+json` endpoint for the Concept DOI
        # (i.e. latest version) of the OpenAIRE Graph Dataset.
        # See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint
        headers = {"Accept": "application/linkset+json"}
        api_resp = requests.get(api_url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        api_resp.raise_for_status()

        # Extract the Landing page Link Set Object located as the first
        # (index 0) item.
        landing_page_linkset = api_resp.json()["linkset"][0]

        # Extract the URL of the only tar file matching `tar_href` linked to
        # the record. ``type`` is read with ``.get`` so that a link without a
        # media type is skipped instead of raising KeyError.
        landing_page_matching_tar_items = [
            link
            for link in landing_page_linkset["item"]
            if link.get("type") == "application/x-tar"
            and link["href"].endswith(self.tar_href)
        ]
        if len(landing_page_matching_tar_items) != 1:
            raise ReaderError(
                f"Expected 1 tar item matching {self.tar_href} but got {len(landing_page_matching_tar_items)}"
            )
        file_url = landing_page_matching_tar_items[0]["href"]

        # Download the matching tar file and fully load the response bytes
        # content in memory. The bytes are then wrapped by a BytesIO to be a
        # file-like object (as required by `tarfile.open`). Using
        # `file_resp.raw` directly is not possible since `tarfile.open`
        # requires the file-like object to be seekable.
        file_resp = requests.get(file_url, timeout=self.REQUEST_TIMEOUT)
        file_resp.raise_for_status()
        yield io.BytesIO(file_resp.content)
76
+
77
+
78
# Readers provided by this module.
# NOTE(review): the key is presumably the reader ``type`` name used in
# datastream configurations -- confirm against the datastreams factory.
VOCABULARIES_DATASTREAM_READERS = {
    "openaire-http": OpenAIREHTTPReader,
}

# This module provides no transformers.
VOCABULARIES_DATASTREAM_TRANSFORMERS = {}

# This module provides no writers.
VOCABULARIES_DATASTREAM_WRITERS = {}
@@ -21,7 +21,11 @@ from invenio_vocabularies.datastreams.transformers import BaseTransformer
21
21
 
22
22
 
23
23
  class RORHTTPReader(BaseReader):
24
- """ROR HTTP Reader returning an in-memory binary stream of the latest ROR data dump ZIP file."""
24
+ """ROR HTTP Reader.
25
+
26
+ Returning an in-memory
27
+ binary stream of the latest ROR data dump ZIP file.
28
+ """
25
29
 
26
30
  def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
27
31
  """Constructor."""
@@ -30,7 +34,8 @@ class RORHTTPReader(BaseReader):
30
34
 
31
35
  def _iter(self, fp, *args, **kwargs):
32
36
  raise NotImplementedError(
33
- "RORHTTPReader downloads one file and therefore does not iterate through items"
37
+ "RORHTTPReader downloads one file "
38
+ "and therefore does not iterate through items"
34
39
  )
35
40
 
36
41
  def _get_last_dump_date(self, linksets):
@@ -53,11 +58,16 @@ class RORHTTPReader(BaseReader):
53
58
  return last_dump_date
54
59
  else:
55
60
  raise ReaderError(
56
- "Couldn't find JSON-LD in publisher's linkset to determine last dump date."
61
+ "Couldn't find JSON-LD in publisher's linkset "
62
+ "to determine last dump date."
57
63
  )
58
64
 
59
65
  def read(self, item=None, *args, **kwargs):
60
- """Reads the latest ROR data dump ZIP file from Zenodo and yields an in-memory binary stream of it."""
66
+ """Reads the latest ROR data dump.
67
+
68
+ Read from ZIP file from
69
+ Zenodo and yields an in-memory binary stream of it.
70
+ """
61
71
  if item:
62
72
  raise NotImplementedError(
63
73
  "RORHTTPReader does not support being chained after another reader"
@@ -68,7 +78,8 @@ class RORHTTPReader(BaseReader):
68
78
  landing_page = requests.get(dataset_doi_link, allow_redirects=True)
69
79
  landing_page.raise_for_status()
70
80
 
71
- # Call the signposting `linkset+json` endpoint for the Concept DOI (i.e. latest version) of the ROR data dump.
81
+ # Call the signposting `linkset+json` endpoint for
82
+ # the Concept DOI (i.e. latest version) of the ROR data dump.
72
83
  # See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint
73
84
  if "linkset" not in landing_page.links:
74
85
  raise ReaderError("Linkset not found in the ROR dataset record.")
@@ -94,8 +105,10 @@ class RORHTTPReader(BaseReader):
94
105
  raise ReaderError(f"Expected 1 ZIP item but got {len(zip_files)}")
95
106
 
96
107
  # Download the ZIP file and fully load the response bytes content in memory.
97
- # The bytes content are then wrapped by a BytesIO to be file-like object (as required by `zipfile.ZipFile`).
98
- # Using directly `file_resp.raw` is not possible since `zipfile.ZipFile` requires the file-like object to be seekable.
108
+ # The bytes content are then wrapped by a BytesIO to be
109
+ # file-like object (as required by `zipfile.ZipFile`).
110
+ # Using directly `file_resp.raw` is not possible since
111
+ # `zipfile.ZipFile` requires the file-like object to be seekable.
99
112
  file_resp = requests.get(file_url)
100
113
  file_resp.raise_for_status()
101
114
  yield io.BytesIO(file_resp.content)
@@ -243,7 +243,12 @@ DATASTREAM_CONFIG = {
243
243
  "regex": "\\.xml$",
244
244
  },
245
245
  },
246
- {"type": "xml"},
246
+ {
247
+ "type": "xml",
248
+ "args": {
249
+ "root_element": "record",
250
+ },
251
+ },
247
252
  ],
248
253
  "transformers": [{"type": "orcid"}],
249
254
  "writers": [
@@ -266,7 +271,12 @@ ORCID_PRESET_DATASTREAM_CONFIG = {
266
271
  {
267
272
  "type": "orcid-data-sync",
268
273
  },
269
- {"type": "xml"},
274
+ {
275
+ "type": "xml",
276
+ "args": {
277
+ "root_element": "record",
278
+ },
279
+ },
270
280
  ],
271
281
  "transformers": [{"type": "orcid"}],
272
282
  "writers": [
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -21,7 +21,8 @@ from invenio_records_resources.records.systemfields import (
21
21
  )
22
22
  from invenio_records_resources.resources.records.headers import etag_headers
23
23
 
24
- from ...services.permissions import PermissionPolicy
24
+ from invenio_vocabularies.contrib.names.permissions import NamesPermissionPolicy
25
+
25
26
  from ..affiliations.api import Affiliation
26
27
  from .config import NamesSearchOptions, service_components
27
28
  from .schema import NameSchema
@@ -63,7 +64,7 @@ record_type = RecordTypeFactory(
63
64
  service_schema=NameSchema,
64
65
  search_options=NamesSearchOptions,
65
66
  service_components=service_components,
66
- permission_policy_cls=PermissionPolicy,
67
+ permission_policy_cls=NamesPermissionPolicy,
67
68
  # Resource layer
68
69
  endpoint_route="/names",
69
70
  resource_cls_attrs={
@@ -0,0 +1,20 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2020-2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+
9
+ """Vocabulary permissions."""
10
+
11
+ from invenio_records_permissions.generators import AuthenticatedUser, SystemProcess
12
+
13
+ from ...services.permissions import PermissionPolicy
14
+
15
+
16
class NamesPermissionPolicy(PermissionPolicy):
    """Permission policy for the names vocabulary.

    Overrides the ``can_search`` and ``can_read`` permissions of the base
    ``PermissionPolicy``; every other permission is inherited unchanged.
    """

    # Searching lists only system processes and authenticated users.
    can_search = [SystemProcess(), AuthenticatedUser()]
    # Reading a single record uses the same two generators.
    can_read = [SystemProcess(), AuthenticatedUser()]
@@ -12,9 +12,8 @@ from invenio_access.permissions import system_identity
12
12
  from invenio_i18n import lazy_gettext as _
13
13
 
14
14
  from ...datastreams.writers import ServiceWriter
15
- from .mesh.datastreams import VOCABULARIES_DATASTREAM_READERS as mesh_readers
16
- from .mesh.datastreams import VOCABULARIES_DATASTREAM_TRANSFORMERS as mesh_transformers
17
- from .mesh.datastreams import VOCABULARIES_DATASTREAM_WRITERS as mesh_writers
15
+ from .euroscivoc import datastreams as euroscivoc_datastreams
16
+ from .mesh import datastreams as mesh_datastreams
18
17
 
19
18
 
20
19
  class SubjectsServiceWriter(ServiceWriter):
@@ -30,15 +29,22 @@ class SubjectsServiceWriter(ServiceWriter):
30
29
  return entry["id"]
31
30
 
32
31
 
33
- VOCABULARIES_DATASTREAM_READERS = {**mesh_readers}
32
+ VOCABULARIES_DATASTREAM_READERS = {
33
+ **mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
34
+ **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
35
+ }
34
36
  """Subjects Data Streams readers."""
35
37
 
36
- VOCABULARIES_DATASTREAM_TRANSFORMERS = {**mesh_transformers}
38
+ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
39
+ **mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
40
+ **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
41
+ }
37
42
  """Subjects Data Streams transformers."""
38
43
 
39
44
  VOCABULARIES_DATASTREAM_WRITERS = {
40
45
  "subjects-service": SubjectsServiceWriter,
41
- **mesh_writers,
46
+ **mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
47
+ **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
42
48
  }
43
49
  """Subjects Data Streams writers."""
44
50
 
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+
9
+ """EuroSciVoc Subjects module."""
@@ -0,0 +1,171 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2022-2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+
9
+ """EuroSciVoc subjects datastreams, readers, transformers, and writers."""
10
+
11
+ import io
12
+ from collections import namedtuple
13
+
14
+ import requests
15
+ from rdflib import OWL, RDF, Graph, Namespace
16
+
17
+ from invenio_vocabularies.config import SUBJECTS_EUROSCIVOC_FILE_URL
18
+ from invenio_vocabularies.datastreams.readers import BaseReader
19
+ from invenio_vocabularies.datastreams.transformers import BaseTransformer
20
+
21
+
22
class EuroSciVocSubjectsHTTPReader(BaseReader):
    """Reader that downloads the EuroSciVoc RDF data and yields its concepts."""

    # (connect, read) timeouts in seconds for the download. Without a
    # timeout ``requests`` may block forever on an unresponsive server.
    REQUEST_TIMEOUT = (10, 300)

    def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
        """Initialize the reader with the data source.

        :param origin: The URL from which to fetch the RDF data. Defaults to
            ``SUBJECTS_EUROSCIVOC_FILE_URL`` when not given.
        :param mode: Mode of operation (default is 'r' for reading).
        :param since: Accepted for signature compatibility; not used by this
            reader.
        """
        self.origin = origin or SUBJECTS_EUROSCIVOC_FILE_URL
        super().__init__(origin=origin, mode=mode, *args, **kwargs)

    def _iter(self, rdf_graph, *args, **kwargs):
        """Iterate over the RDF graph, yielding one subject at a time.

        Extra positional/keyword arguments are accepted (and ignored) so the
        signature stays compatible with ``_iter(fp, *args, **kwargs)`` as
        used by the other readers.

        :param rdf_graph: The RDF graph to process.
        :yield: Dict with the ``subject`` node and the whole ``rdf_graph``,
            to be consumed by the transformer.
        """
        SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")

        for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
            yield {"subject": subject, "rdf_graph": rdf_graph}

    def read(self, item=None, *args, **kwargs):
        """Fetch the EuroSciVoc RDF data and yield it one subject at a time.

        :param item: Must be falsy; this reader cannot be chained after
            another reader.
        :yield: One dict per SKOS concept, as produced by ``_iter``.
        :raises NotImplementedError: If ``item`` is provided.
        :raises requests.HTTPError: If the server answers with an error status.
        """
        if item:
            raise NotImplementedError(
                "EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
            )
        # Fetch the RDF data from the specified origin URL
        response = requests.get(self.origin, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        # Treat the response content as a file-like object
        rdf_data = io.BytesIO(response.content)

        # Parse the RDF/XML data into a graph
        rdf_graph = Graph()
        rdf_graph.parse(rdf_data, format="xml")

        # Yield each subject found in the RDF graph
        yield from self._iter(rdf_graph)
68
+
69
+
70
class EuroSciVocSubjectsTransformer(BaseTransformer):
    """Turn an EuroSciVoc RDF concept into a subject vocabulary dictionary."""

    SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
    SPLITCHAR = ","

    def _get_notation(self, subject, rdf_graph):
        """Return the first all-digit ``skos:notation`` of a subject, or None."""
        candidates = (
            str(value)
            for _, _, value in rdf_graph.triples(
                (subject, self.SKOS_CORE.notation, None)
            )
        )
        return next((text for text in candidates if text.isdigit()), None)

    def _get_labels(self, subject, rdf_graph):
        """Map language codes to the capitalized labels of a subject.

        ``skos:prefLabel`` values are taken first. Only when no English
        preferred label exists are ``skos:altLabel`` values used to fill the
        languages that are still missing.
        """
        by_language = {}
        for _, _, preferred in rdf_graph.triples(
            (subject, self.SKOS_CORE.prefLabel, None)
        ):
            by_language[preferred.language] = preferred.value.capitalize()

        if "en" in by_language:
            return by_language

        for _, _, alternative in rdf_graph.triples(
            (subject, self.SKOS_CORE.altLabel, None)
        ):
            by_language.setdefault(
                alternative.language, alternative.value.capitalize()
            )
        return by_language

    def _find_parents(self, subject, rdf_graph):
        """Collect the notations of all transitive ``skos:broader`` ancestors."""
        # Walk the broader hierarchy, leaving out the subject itself.
        ancestors = (
            node
            for node in rdf_graph.transitive_objects(subject, self.SKOS_CORE.broader)
            if node != subject
        )
        ancestor_notations = (
            self._get_notation(node, rdf_graph) for node in ancestors
        )
        # Ancestors without a numeric notation are dropped.
        return [code for code in ancestor_notations if code]

    def _transform_entry(self, subject, rdf_graph):
        """Build the subject dictionary for one RDF concept."""
        # Identifier: the numeric notation prefixed with the scheme name.
        notation = self._get_notation(subject, rdf_graph)
        entry_id = None
        if notation:
            entry_id = f"euroscivoc:{notation}"

        titles = self._get_labels(subject, rdf_graph)

        # Parent identifiers, in reversed traversal order, joined by SPLITCHAR.
        parent_codes = self._find_parents(subject, rdf_graph)
        parents = self.SPLITCHAR.join(
            f"euroscivoc:{code}" for code in reversed(parent_codes)
        )

        props = {}
        if parents:
            props = {"parents": parents}

        return {
            "id": entry_id,
            "scheme": "EuroSciVoc",
            "subject": titles.get("en", "").capitalize(),
            "title": titles,
            "props": props,
            "identifiers": [{"scheme": "url", "identifier": str(subject)}],
        }

    def apply(self, stream_entry, *args, **kwargs):
        """Replace the entry of a stream entry with its transformed form.

        :param stream_entry: The entry to transform; its ``entry`` holds the
            ``subject`` node and the ``rdf_graph`` it belongs to.
        :return: The same stream entry, with ``entry`` set to the transformed
            dictionary.
        """
        raw = stream_entry.entry
        stream_entry.entry = self._transform_entry(raw["subject"], raw["rdf_graph"])
        return stream_entry
148
+
149
+
150
# Configuration for datastream readers, transformers, and writers.
# The registry keys below are the ``type`` names referenced by
# ``DATASTREAM_CONFIG``.
VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}

# This module provides no writers of its own.
VOCABULARIES_DATASTREAM_WRITERS = {}

VOCABULARIES_DATASTREAM_TRANSFORMERS = {
    "euroscivoc-transformer": EuroSciVocSubjectsTransformer
}

# Datastream definition: read the EuroSciVoc RDF data, transform each
# concept, and write the result with the "subjects-service" writer (which is
# not defined in this module).
DATASTREAM_CONFIG = {
    "readers": [
        {
            "type": "euroscivoc-reader",
        }
    ],
    "transformers": [{"type": "euroscivoc-transformer"}],
    "writers": [
        {
            "type": "subjects-service",
        }
    ],
}
@@ -30,6 +30,22 @@
30
30
  "description": "Human readable label in different languages.",
31
31
  "$ref": "local://vocabularies/definitions-v1.0.0.json#/title"
32
32
  },
33
+ "props": {
34
+ "type": "object",
35
+ "patternProperties": {
36
+ "^.*$": {
37
+ "type": "string"
38
+ }
39
+ }
40
+ },
41
+ "identifiers": {
42
+ "description": "Alternate identifiers for the subject.",
43
+ "type": "array",
44
+ "items": {
45
+ "$ref": "local://definitions-v2.0.0.json#/identifiers_with_scheme"
46
+ },
47
+ "uniqueItems": true
48
+ },
33
49
  "synonyms": {
34
50
  "description": "Synonyms of the subject label.",
35
51
  "type": "array",
@@ -71,6 +71,20 @@
71
71
  "type": "object",
72
72
  "dynamic": "true"
73
73
  },
74
+ "props": {
75
+ "type": "object",
76
+ "dynamic": "true"
77
+ },
78
+ "identifiers": {
79
+ "properties": {
80
+ "identifier": {
81
+ "type": "keyword"
82
+ },
83
+ "scheme": {
84
+ "type": "keyword"
85
+ }
86
+ }
87
+ },
74
88
  "synonyms": {
75
89
  "type": "text"
76
90
  },
@@ -74,6 +74,20 @@
74
74
  "synonyms": {
75
75
  "type": "text"
76
76
  },
77
+ "props": {
78
+ "type": "object",
79
+ "dynamic": "true"
80
+ },
81
+ "identifiers": {
82
+ "properties": {
83
+ "identifier": {
84
+ "type": "keyword"
85
+ },
86
+ "scheme": {
87
+ "type": "keyword"
88
+ }
89
+ }
90
+ },
77
91
  "tags": {
78
92
  "type": "keyword"
79
93
  }
@@ -71,6 +71,20 @@
71
71
  "type": "object",
72
72
  "dynamic": "true"
73
73
  },
74
+ "props": {
75
+ "type": "object",
76
+ "dynamic": "true"
77
+ },
78
+ "identifiers": {
79
+ "properties": {
80
+ "identifier": {
81
+ "type": "keyword"
82
+ },
83
+ "scheme": {
84
+ "type": "keyword"
85
+ }
86
+ }
87
+ },
74
88
  "synonyms": {
75
89
  "type": "text"
76
90
  },