invenio-vocabularies 6.6.0 (py2.py3-none-any.whl) → 6.8.0 (py2.py3-none-any.whl)

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Potentially problematic release.


This version of invenio-vocabularies might be problematic; see the registry's advisory page for more details.

Files changed (46)
  1. invenio_vocabularies/__init__.py +1 -1
  2. invenio_vocabularies/assets/semantic-ui/js/invenio_vocabularies/src/contrib/forms/Funding/FundingModal.js +3 -27
  3. invenio_vocabularies/cli.py +2 -0
  4. invenio_vocabularies/config.py +43 -1
  5. invenio_vocabularies/contrib/affiliations/config.py +21 -10
  6. invenio_vocabularies/contrib/affiliations/datastreams.py +103 -1
  7. invenio_vocabularies/contrib/awards/datastreams.py +7 -0
  8. invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +9 -0
  9. invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +22 -1
  10. invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +22 -1
  11. invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +22 -1
  12. invenio_vocabularies/contrib/awards/schema.py +9 -3
  13. invenio_vocabularies/contrib/funders/config.py +19 -12
  14. invenio_vocabularies/contrib/names/config.py +13 -10
  15. invenio_vocabularies/contrib/names/datastreams.py +182 -57
  16. invenio_vocabularies/contrib/names/mappings/os-v1/names/name-v2.0.0.json +11 -0
  17. invenio_vocabularies/contrib/names/mappings/os-v2/names/name-v2.0.0.json +11 -0
  18. invenio_vocabularies/contrib/names/names.py +1 -1
  19. invenio_vocabularies/contrib/names/schema.py +10 -2
  20. invenio_vocabularies/contrib/subjects/bodc/__init__.py +9 -0
  21. invenio_vocabularies/contrib/subjects/bodc/datastreams.py +111 -0
  22. invenio_vocabularies/contrib/subjects/config.py +19 -5
  23. invenio_vocabularies/contrib/subjects/datastreams.py +4 -2
  24. invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +56 -126
  25. invenio_vocabularies/contrib/subjects/gemet/__init__.py +9 -0
  26. invenio_vocabularies/contrib/subjects/gemet/datastreams.py +140 -0
  27. invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +13 -2
  28. invenio_vocabularies/contrib/subjects/schema.py +18 -3
  29. invenio_vocabularies/datastreams/datastreams.py +18 -7
  30. invenio_vocabularies/datastreams/factories.py +3 -1
  31. invenio_vocabularies/datastreams/readers.py +99 -9
  32. invenio_vocabularies/datastreams/transformers.py +67 -0
  33. invenio_vocabularies/datastreams/writers.py +6 -2
  34. invenio_vocabularies/factories.py +56 -0
  35. invenio_vocabularies/fixtures.py +2 -0
  36. invenio_vocabularies/records/jsonschemas/vocabularies/definitions-v1.0.0.json +9 -0
  37. invenio_vocabularies/services/config.py +1 -7
  38. invenio_vocabularies/services/querystr.py +5 -0
  39. invenio_vocabularies/services/tasks.py +2 -0
  40. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/METADATA +28 -2
  41. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/RECORD +46 -42
  42. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/AUTHORS.rst +0 -0
  43. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/LICENSE +0 -0
  44. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/WHEEL +0 -0
  45. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/entry_points.txt +0 -0
  46. {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2022-2024 CERN.
3
+ # Copyright (C) 2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,121 +8,65 @@
8
8
 
9
9
  """EuroSciVoc subjects datastreams, readers, transformers, and writers."""
10
10
 
11
- import io
12
- from collections import namedtuple
13
-
14
- import requests
15
- from rdflib import OWL, RDF, Graph, Namespace
16
-
17
- from invenio_vocabularies.config import SUBJECTS_EUROSCIVOC_FILE_URL
18
- from invenio_vocabularies.datastreams.readers import BaseReader
19
- from invenio_vocabularies.datastreams.transformers import BaseTransformer
20
-
21
-
22
- class EuroSciVocSubjectsHTTPReader(BaseReader):
23
- """Reader class to fetch and process EuroSciVoc RDF data."""
24
-
25
- def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
26
- """Initialize the reader with the data source.
27
-
28
- :param origin: The URL from which to fetch the RDF data.
29
- :param mode: Mode of operation (default is 'r' for reading).
30
- """
31
- self.origin = origin or SUBJECTS_EUROSCIVOC_FILE_URL
32
- super().__init__(origin=origin, mode=mode, *args, **kwargs)
33
-
34
- def _iter(self, rdf_graph):
35
- """Iterate over the RDF graph, yielding one subject at a time.
36
-
37
- :param rdf_graph: The RDF graph to process.
38
- :yield: Subject and graph to be transformed.
39
- """
40
- SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
41
-
42
- for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
43
- yield {"subject": subject, "rdf_graph": rdf_graph}
44
-
45
- def read(self, item=None, *args, **kwargs):
46
- """Fetch and process the EuroSciVoc RDF data, yielding it one subject at a time.
47
-
48
- :param item: The RDF data provided as bytes (optional).
49
- :yield: Processed EuroSciVoc subject data.
50
- """
51
- if item:
52
- raise NotImplementedError(
53
- "EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
54
- )
55
- # Fetch the RDF data from the specified origin URL
56
- response = requests.get(self.origin)
57
- response.raise_for_status()
58
-
59
- # Treat the response content as a file-like object
60
- rdf_data = io.BytesIO(response.content)
61
-
62
- # Parse the RDF data into a graph
63
- rdf_graph = Graph()
64
- rdf_graph.parse(rdf_data, format="xml")
65
-
66
- # Yield each processed subject from the RDF graph
67
- yield from self._iter(rdf_graph)
68
-
69
-
70
- class EuroSciVocSubjectsTransformer(BaseTransformer):
71
- """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
72
-
73
- SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
74
- SPLITCHAR = ","
11
+ from invenio_vocabularies.datastreams.transformers import RDFTransformer
12
+
13
+ from ..config import euroscivoc_file_url
14
+
15
+
16
+ class EuroSciVocSubjectsTransformer(RDFTransformer):
17
+ """
18
+ Transformer class to convert EuroSciVoc RDF data to a dictionary format.
19
+
20
+ Input:
21
+ - Relevant fields:
22
+ - `skos:notation`: Primary identifier for the concept.
23
+ - `skos:prefLabel`: Preferred labels with language codes.
24
+ - `skos:altLabel`: Alternative labels.
25
+ - `skos:broader`: Broader concepts that this concept belongs to.
26
+
27
+ Output:
28
+ {
29
+ "id": "euroscivoc:1717", # EuroSciVoc-specific concept ID (skos:notation).
30
+ "scheme": "EuroSciVoc", # The scheme name indicating this is a EuroSciVoc concept.
31
+ "subject": "Satellite radio", # The primary subject label (first preferred label in English, skos:prefLabel).
32
+ "title": {
33
+ "it": "Radio satellitare", # Italian preferred label (skos:prefLabel).
34
+ "en": "Satellite radio", # English preferred label (skos:prefLabel).
35
+ },
36
+ "props": {
37
+ "parents": "euroscivoc:1225", # The broader concept (skos:broader), identified by its EuroSciVoc Concept ID.
38
+ },
39
+ "identifiers": [
40
+ {
41
+ "scheme": "url", # Type of identifier (URL).
42
+ "identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", # URI of the concept (rdf:about).
43
+ }
44
+ ],
45
+ }
46
+ """
75
47
 
76
48
  def _get_notation(self, subject, rdf_graph):
77
49
  """Extract the numeric notation for a subject."""
78
50
  for _, _, notation in rdf_graph.triples(
79
- (subject, self.SKOS_CORE.notation, None)
51
+ (subject, self.skos_core.notation, None)
80
52
  ):
81
53
  if str(notation).isdigit():
82
54
  return str(notation)
83
55
  return None
84
56
 
85
- def _get_labels(self, subject, rdf_graph):
86
- """Extract prefLabel and altLabel languages for a subject."""
87
- labels = {
88
- label.language: label.value.capitalize()
89
- for _, _, label in rdf_graph.triples(
90
- (subject, self.SKOS_CORE.prefLabel, None)
91
- )
92
- }
93
- if "en" not in labels:
94
- for _, _, label in rdf_graph.triples(
95
- (subject, self.SKOS_CORE.altLabel, None)
96
- ):
97
- labels.setdefault(label.language, label.value.capitalize())
98
- return labels
99
-
100
- def _find_parents(self, subject, rdf_graph):
101
- """Find parent notations."""
102
- parents = []
103
-
104
- # Traverse the broader hierarchy
105
- for broader in rdf_graph.transitive_objects(subject, self.SKOS_CORE.broader):
106
- if broader != subject: # Ensure we don't include the current subject
107
- parent_notation = self._get_notation(broader, rdf_graph)
108
- if parent_notation:
109
- parents.append(parent_notation)
110
-
111
- return parents
57
+ def _get_parent_notation(self, broader, rdf_graph):
58
+ """Extract parent notation using numeric notation."""
59
+ return self._get_notation(broader, rdf_graph)
112
60
 
113
61
  def _transform_entry(self, subject, rdf_graph):
114
- """Transform an entry to the required dictionary format."""
115
- # Get subject notation with euroscivoc prefix
116
62
  notation = self._get_notation(subject, rdf_graph)
117
63
  id = f"euroscivoc:{notation}" if notation else None
118
- # Get labels for the current subject
119
64
  labels = self._get_labels(subject, rdf_graph)
120
- # Join parent notations with SPLITCHAR separator and add euroscivoc prefix
121
- parents = self.SPLITCHAR.join(
122
- f"euroscivoc:{n}" for n in reversed(self._find_parents(subject, rdf_graph))
65
+ parents = ",".join(
66
+ f"euroscivoc:{n}"
67
+ for n in reversed(self._find_parents(subject, rdf_graph))
68
+ if n
123
69
  )
124
- # Create identifiers list
125
- identifiers = [{"scheme": "url", "identifier": str(subject)}]
126
70
 
127
71
  return {
128
72
  "id": id,
@@ -130,27 +74,11 @@ class EuroSciVocSubjectsTransformer(BaseTransformer):
130
74
  "subject": labels.get("en", "").capitalize(),
131
75
  "title": labels,
132
76
  "props": {"parents": parents} if parents else {},
133
- "identifiers": identifiers,
77
+ "identifiers": self._get_identifiers(subject),
134
78
  }
135
79
 
136
- def apply(self, stream_entry, *args, **kwargs):
137
- """Transform a stream entry to the required dictionary format.
138
-
139
- :param stream_entry: The entry to be transformed, which includes the subject and the RDF graph.
140
- :return: The transformed stream entry.
141
- """
142
- # Apply transformations
143
- entry_data = self._transform_entry(
144
- stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
145
- )
146
- stream_entry.entry = entry_data
147
- return stream_entry
148
80
 
149
-
150
- # Configuration for datastream readers, transformers, and writers
151
- VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}
152
-
153
- VOCABULARIES_DATASTREAM_WRITERS = {}
81
+ # Configuration for datastream
154
82
 
155
83
  VOCABULARIES_DATASTREAM_TRANSFORMERS = {
156
84
  "euroscivoc-transformer": EuroSciVocSubjectsTransformer
@@ -159,13 +87,15 @@ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
159
87
  DATASTREAM_CONFIG = {
160
88
  "readers": [
161
89
  {
162
- "type": "euroscivoc-reader",
163
- }
164
- ],
165
- "transformers": [{"type": "euroscivoc-transformer"}],
166
- "writers": [
90
+ "type": "http",
91
+ "args": {
92
+ "origin": euroscivoc_file_url,
93
+ },
94
+ },
167
95
  {
168
- "type": "subjects-service",
169
- }
96
+ "type": "rdf",
97
+ },
170
98
  ],
99
+ "transformers": [{"type": "euroscivoc-transformer"}],
100
+ "writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
171
101
  }
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+
9
+ """GEMET Subjects module."""
@@ -0,0 +1,140 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+
9
+ """GEMET subjects datastreams, readers, transformers, and writers."""
10
+
11
+ from invenio_vocabularies.datastreams.transformers import RDFTransformer
12
+
13
+ from ..config import gemet_file_url
14
+
15
+ # Available with the "rdf" extra
16
+ try:
17
+ import rdflib
18
+ except ImportError:
19
+ rdflib = None
20
+
21
+
22
+ class GEMETSubjectsTransformer(RDFTransformer):
23
+ """
24
+ Transformer class to convert GEMET RDF data to a dictionary format.
25
+
26
+ Input:
27
+ - Relevant fields:
28
+ - `skos:prefLabel`: Preferred labels with language codes.
29
+ - `skos:broader`: References to broader concepts (parent concepts).
30
+ - `skos:memberOf`: References to groups or themes the concept belongs to.
31
+
32
+ Output:
33
+ - A dictionary with the following structure:
34
+ {
35
+ "id": "gemet:concept/10008", # GEMET-specific concept ID (skos:Concept).
36
+ "scheme": "GEMET", # The scheme name indicating this is a GEMET concept.
37
+ "subject": "Consumer product", # The subject label (first preferred label in English, skos:prefLabel).
38
+ "title": {
39
+ "en": "Consumer product", # English label for the concept (skos:prefLabel).
40
+ "ar": "منتج استهلاكي" # Arabic label for the concept (skos:prefLabel).
41
+ },
42
+ "props": {
43
+ "parents": "gemet:concept/6660", # The parent concept (skos:broader), identified by its GEMET Concept ID.
44
+ "groups": ["http://www.eionet.europa.eu/gemet/group/10112"], # Group the concept belongs to (skos:memberOf)(skos:prefLabel).
45
+ "themes": [
46
+ "http://www.eionet.europa.eu/gemet/theme/27", # Theme the concept belongs to (skos:memberOf)(rdfs:label).
47
+ ]
48
+ },
49
+ "identifiers": [
50
+ {
51
+ "scheme": "url", # Type of identifier (URL).
52
+ "identifier": "http://www.eionet.europa.eu/gemet/concept/10008" # URI of the concept (rdf:about).
53
+ }
54
+ ]
55
+ }
56
+ """
57
+
58
+ def _get_parent_notation(self, broader, rdf_graph):
59
+ """Extract parent notation from GEMET URI."""
60
+ return "/".join(broader.split("/")[-2:])
61
+
62
+ def _get_groups_and_themes(self, subject, rdf_graph):
63
+ """Extract groups and themes for a subject."""
64
+ groups = []
65
+ themes = []
66
+
67
+ for relation in rdf_graph.subjects(
68
+ predicate=self.skos_core.member, object=subject
69
+ ):
70
+ relation_uri = str(relation)
71
+ relation_label = None
72
+
73
+ # If the relation is a group, check for skos:prefLabel
74
+ if "group" in relation_uri:
75
+ labels = rdf_graph.objects(
76
+ subject=relation, predicate=self.skos_core.prefLabel
77
+ )
78
+ relation_label = next(
79
+ (str(label) for label in labels if label.language == "en"), None
80
+ )
81
+ groups.append(relation_uri)
82
+
83
+ # If the relation is a theme, check for rdfs:label
84
+ elif "theme" in relation_uri:
85
+ labels = rdf_graph.objects(
86
+ subject=relation, predicate=rdflib.RDFS.label
87
+ )
88
+ relation_label = next(
89
+ (str(label) for label in labels if label.language == "en"), None
90
+ )
91
+ themes.append(relation_uri)
92
+
93
+ return groups, themes
94
+
95
+ def _transform_entry(self, subject, rdf_graph):
96
+ """Transform an entry to the required dictionary format."""
97
+ concept_number = "/".join(subject.split("/")[-2:])
98
+ id = f"gemet:{concept_number}" if concept_number else None
99
+ labels = self._get_labels(subject, rdf_graph)
100
+ parents = ",".join(
101
+ f"gemet:{n}" for n in reversed(self._find_parents(subject, rdf_graph)) if n
102
+ )
103
+ identifiers = [{"scheme": "url", "identifier": str(subject)}]
104
+ groups, themes = self._get_groups_and_themes(subject, rdf_graph)
105
+
106
+ props = {"parents": parents} if parents else {}
107
+
108
+ if groups:
109
+ props["groups"] = groups
110
+ if themes:
111
+ props["themes"] = themes
112
+
113
+ return {
114
+ "id": id,
115
+ "scheme": "GEMET",
116
+ "subject": labels.get("en", "").capitalize(),
117
+ "title": labels,
118
+ "props": props,
119
+ "identifiers": self._get_identifiers(subject),
120
+ }
121
+
122
+
123
+ # Configuration for datastream
124
+
125
+ VOCABULARIES_DATASTREAM_TRANSFORMERS = {"gemet-transformer": GEMETSubjectsTransformer}
126
+
127
+ DATASTREAM_CONFIG = {
128
+ "readers": [
129
+ {
130
+ "type": "http",
131
+ "args": {
132
+ "origin": gemet_file_url,
133
+ },
134
+ },
135
+ {"type": "gzip"},
136
+ {"type": "rdf"},
137
+ ],
138
+ "transformers": [{"type": "gemet-transformer"}],
139
+ "writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
140
+ }
@@ -34,9 +34,20 @@
34
34
  "type": "object",
35
35
  "patternProperties": {
36
36
  "^.*$": {
37
- "type": "string"
37
+ "oneOf": [
38
+ {
39
+ "type": "string"
40
+ },
41
+ {
42
+ "type": "array",
43
+ "items": {
44
+ "type": "string"
45
+ }
46
+ }
47
+ ]
38
48
  }
39
- }
49
+ },
50
+ "additionalProperties": false
40
51
  },
41
52
  "identifiers": {
42
53
  "description": "Alternate identifiers for the subject.",
@@ -13,8 +13,8 @@
13
13
  from functools import partial
14
14
 
15
15
  from invenio_i18n import get_locale
16
- from marshmallow import EXCLUDE, Schema, fields, pre_load
17
- from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
16
+ from marshmallow import EXCLUDE, Schema, ValidationError, fields, pre_load, validate
17
+ from marshmallow_utils.fields import URL, IdentifierSet, SanitizedUnicode
18
18
  from marshmallow_utils.schemas import IdentifierSchema
19
19
 
20
20
  from ...services.schema import (
@@ -25,6 +25,21 @@ from ...services.schema import (
25
25
  from .config import subject_schemes
26
26
 
27
27
 
28
+ class StringOrListOfStrings(fields.Field):
29
+ """Custom field to handle both string and list of strings."""
30
+
31
+ # TODO: Move this to marshmallow-utils for broader type support.
32
+ def _deserialize(self, value, attr, data, **kwargs):
33
+ if isinstance(value, str):
34
+ return fields.String()._deserialize(value, attr, data, **kwargs)
35
+ elif isinstance(value, list) and all(isinstance(item, str) for item in value):
36
+ return [
37
+ fields.String()._deserialize(item, attr, data, **kwargs)
38
+ for item in value
39
+ ]
40
+ raise ValidationError("Invalid value. Must be a string or a list of strings.")
41
+
42
+
28
43
  class SubjectSchema(BaseVocabularySchema):
29
44
  """Service schema for subjects."""
30
45
 
@@ -35,7 +50,7 @@ class SubjectSchema(BaseVocabularySchema):
35
50
  scheme = SanitizedUnicode(required=True)
36
51
  subject = SanitizedUnicode(required=True)
37
52
  title = i18n_strings
38
- props = fields.Dict(keys=SanitizedUnicode(), values=SanitizedUnicode())
53
+ props = fields.Dict(keys=SanitizedUnicode(), values=StringOrListOfStrings())
39
54
  identifiers = IdentifierSet(
40
55
  fields.Nested(
41
56
  partial(
@@ -48,7 +48,16 @@ class StreamEntry:
48
48
  class DataStream:
49
49
  """Data stream."""
50
50
 
51
- def __init__(self, readers, writers, transformers=None, *args, **kwargs):
51
+ def __init__(
52
+ self,
53
+ readers,
54
+ writers,
55
+ transformers=None,
56
+ batch_size=100,
57
+ write_many=False,
58
+ *args,
59
+ **kwargs,
60
+ ):
52
61
  """Constructor.
53
62
 
54
63
  :param readers: an ordered list of readers.
@@ -58,12 +67,14 @@ class DataStream:
58
67
  self._readers = readers
59
68
  self._transformers = transformers
60
69
  self._writers = writers
70
+ self.batch_size = batch_size
71
+ self.write_many = write_many
61
72
 
62
73
  def filter(self, stream_entry, *args, **kwargs):
63
74
  """Checks if an stream_entry should be filtered out (skipped)."""
64
75
  return False
65
76
 
66
- def process_batch(self, batch, write_many=False):
77
+ def process_batch(self, batch):
67
78
  """Process a batch of entries."""
68
79
  transformed_entries = []
69
80
  for stream_entry in batch:
@@ -79,12 +90,12 @@ class DataStream:
79
90
  else:
80
91
  transformed_entries.append(transformed_entry)
81
92
  if transformed_entries:
82
- if write_many:
93
+ if self.write_many:
83
94
  yield from self.batch_write(transformed_entries)
84
95
  else:
85
96
  yield from (self.write(entry) for entry in transformed_entries)
86
97
 
87
- def process(self, batch_size=100, write_many=False, *args, **kwargs):
98
+ def process(self, *args, **kwargs):
88
99
  """Iterates over the entries.
89
100
 
90
101
  Uses the reader to get the raw entries and transforms them.
@@ -95,13 +106,13 @@ class DataStream:
95
106
  batch = []
96
107
  for stream_entry in self.read():
97
108
  batch.append(stream_entry)
98
- if len(batch) >= batch_size:
99
- yield from self.process_batch(batch, write_many=write_many)
109
+ if len(batch) >= self.batch_size:
110
+ yield from self.process_batch(batch)
100
111
  batch = []
101
112
 
102
113
  # Process any remaining entries in the last batch
103
114
  if batch:
104
- yield from self.process_batch(batch, write_many=write_many)
115
+ yield from self.process_batch(batch)
105
116
 
106
117
  def read(self):
107
118
  """Recursively read the entries."""
@@ -81,4 +81,6 @@ class DataStreamFactory:
81
81
  for t_conf in transformers_config:
82
82
  transformers.append(TransformerFactory.create(t_conf))
83
83
 
84
- return DataStream(readers=readers, writers=writers, transformers=transformers)
84
+ return DataStream(
85
+ readers=readers, writers=writers, transformers=transformers, **kwargs
86
+ )
@@ -11,6 +11,7 @@
11
11
 
12
12
  import csv
13
13
  import gzip
14
+ import io
14
15
  import json
15
16
  import re
16
17
  import tarfile
@@ -27,11 +28,25 @@ from lxml.html import parse as html_parse
27
28
  from .errors import ReaderError
28
29
  from .xml import etree_to_dict
29
30
 
31
+ # Extras dependencies
32
+ # "oaipmh"
30
33
  try:
31
34
  import oaipmh_scythe
32
35
  except ImportError:
33
36
  oaipmh_scythe = None
34
37
 
38
+ # "rdf"
39
+ try:
40
+ import rdflib
41
+ except ImportError:
42
+ rdflib = None
43
+
44
+ # "sparql"
45
+ try:
46
+ import SPARQLWrapper as sparql
47
+ except ImportError:
48
+ sparql = None
49
+
35
50
 
36
51
  class BaseReader(ABC):
37
52
  """Base reader."""
@@ -103,8 +118,7 @@ class SimpleHTTPReader(BaseReader):
103
118
 
104
119
  def __init__(self, origin, id=None, ids=None, content_type=None, *args, **kwargs):
105
120
  """Constructor."""
106
- assert id or ids
107
- self._ids = ids if ids else [id]
121
+ self._ids = ids if ids else ([id] if id else None)
108
122
  self.content_type = content_type
109
123
  super().__init__(origin, *args, **kwargs)
110
124
 
@@ -113,14 +127,22 @@ class SimpleHTTPReader(BaseReader):
113
127
  base_url = url
114
128
  headers = {"Accept": self.content_type}
115
129
 
116
- for id_ in self._ids:
117
- url = base_url.format(id=id_)
130
+ # If there are no IDs, query the base URL
131
+ if not self._ids:
118
132
  resp = requests.get(url, headers=headers)
119
- if resp.status_code != 200:
120
- # todo add logging/fail
121
- pass
133
+ if resp.status_code == 200:
134
+ yield resp.content
135
+ else:
136
+ print(f"Failed to fetch URL {url}: {resp.status_code}")
137
+ else:
138
+ for id_ in self._ids:
139
+ url = base_url.format(id=id_)
140
+ resp = requests.get(url, headers=headers)
141
+ if resp.status_code != 200:
142
+ # todo add logging/fail
143
+ pass
122
144
 
123
- yield resp.content
145
+ yield resp.content
124
146
 
125
147
  def read(self, item=None, *args, **kwargs):
126
148
  """Chooses between item and origin as url."""
@@ -197,6 +219,9 @@ class GzipReader(BaseReader):
197
219
  """Gzip reader."""
198
220
 
199
221
  def _iter(self, fp, *args, **kwargs):
222
+ if isinstance(fp, bytes):
223
+ fp = io.BytesIO(fp)
224
+
200
225
  with gzip.open(fp) as gp:
201
226
  yield gp
202
227
 
@@ -236,7 +261,7 @@ class XMLReader(BaseReader):
236
261
  try:
237
262
  xml_tree = fromstring(fp)
238
263
  xml_dict = etree_to_dict(xml_tree)
239
- except Exception as e:
264
+ except Exception:
240
265
  xml_tree = html_parse(fp).getroot()
241
266
  xml_dict = etree_to_dict(xml_tree)["html"]["body"]
242
267
 
@@ -346,3 +371,68 @@ def xml_to_dict(tree: etree._Element):
346
371
  dict_obj["record"] = etree.tostring(tree)
347
372
 
348
373
  return dict_obj
374
+
375
+
376
+ class RDFReader(BaseReader):
377
+ """Base Reader class to fetch and process RDF data."""
378
+
379
+ @property
380
+ def skos_core(self):
381
+ """Return the SKOS Core namespace."""
382
+ return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
383
+
384
+ def _iter(self, rdf_graph):
385
+ """Iterate over the RDF graph, yielding one subject at a time."""
386
+ for subject, _, _ in rdf_graph.triples(
387
+ (None, rdflib.RDF.type, self.skos_core.Concept)
388
+ ):
389
+ yield {"subject": subject, "rdf_graph": rdf_graph}
390
+
391
+ def read(self, item=None, *args, **kwargs):
392
+ """Fetch and process the RDF data, yielding it one subject at a time."""
393
+ if isinstance(item, gzip.GzipFile):
394
+ rdf_content = item.read().decode("utf-8")
395
+
396
+ elif isinstance(item, bytes):
397
+ rdf_content = item.decode("utf-8")
398
+ else:
399
+ raise ReaderError("Unsupported content type")
400
+
401
+ rdf_graph = rdflib.Graph()
402
+ rdf_graph.parse(io.StringIO(rdf_content), format="xml")
403
+
404
+ yield from self._iter(rdf_graph)
405
+
406
+
407
+ class SPARQLReader(BaseReader):
408
+ """Generic reader class to fetch and process RDF data from a SPARQL endpoint."""
409
+
410
+ def __init__(self, origin, query, mode="r", *args, **kwargs):
411
+ """Initialize the reader with the data source.
412
+
413
+ :param origin: The SPARQL endpoint from which to fetch the RDF data.
414
+ :param query: The SPARQL query to execute.
415
+ :param mode: Mode of operation (default is 'r' for reading).
416
+ """
417
+ self._origin = origin
418
+ self._query = query
419
+ super().__init__(origin=origin, mode=mode, *args, **kwargs)
420
+
421
+ def _iter(self, fp, *args, **kwargs):
422
+ raise NotImplementedError(
423
+ "SPARQLReader downloads one result set from SPARQL and therefore does not iterate through items"
424
+ )
425
+
426
+ def read(self, item=None, *args, **kwargs):
427
+ """Fetch and process RDF data, yielding results one at a time."""
428
+ if item:
429
+ raise NotImplementedError(
430
+ "SPARQLReader does not support being chained after another reader"
431
+ )
432
+
433
+ sparql_client = sparql.SPARQLWrapper(self._origin)
434
+ sparql_client.setQuery(self._query)
435
+ sparql_client.setReturnFormat(sparql.JSON)
436
+
437
+ results = sparql_client.query().convert()
438
+ yield from results["results"]["bindings"]