invenio-vocabularies 6.6.0__py2.py3-none-any.whl → 6.8.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of invenio-vocabularies might be problematic. Click here for more details.
- invenio_vocabularies/__init__.py +1 -1
- invenio_vocabularies/assets/semantic-ui/js/invenio_vocabularies/src/contrib/forms/Funding/FundingModal.js +3 -27
- invenio_vocabularies/cli.py +2 -0
- invenio_vocabularies/config.py +43 -1
- invenio_vocabularies/contrib/affiliations/config.py +21 -10
- invenio_vocabularies/contrib/affiliations/datastreams.py +103 -1
- invenio_vocabularies/contrib/awards/datastreams.py +7 -0
- invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +9 -0
- invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +22 -1
- invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +22 -1
- invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +22 -1
- invenio_vocabularies/contrib/awards/schema.py +9 -3
- invenio_vocabularies/contrib/funders/config.py +19 -12
- invenio_vocabularies/contrib/names/config.py +13 -10
- invenio_vocabularies/contrib/names/datastreams.py +182 -57
- invenio_vocabularies/contrib/names/mappings/os-v1/names/name-v2.0.0.json +11 -0
- invenio_vocabularies/contrib/names/mappings/os-v2/names/name-v2.0.0.json +11 -0
- invenio_vocabularies/contrib/names/names.py +1 -1
- invenio_vocabularies/contrib/names/schema.py +10 -2
- invenio_vocabularies/contrib/subjects/bodc/__init__.py +9 -0
- invenio_vocabularies/contrib/subjects/bodc/datastreams.py +111 -0
- invenio_vocabularies/contrib/subjects/config.py +19 -5
- invenio_vocabularies/contrib/subjects/datastreams.py +4 -2
- invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +56 -126
- invenio_vocabularies/contrib/subjects/gemet/__init__.py +9 -0
- invenio_vocabularies/contrib/subjects/gemet/datastreams.py +140 -0
- invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +13 -2
- invenio_vocabularies/contrib/subjects/schema.py +18 -3
- invenio_vocabularies/datastreams/datastreams.py +18 -7
- invenio_vocabularies/datastreams/factories.py +3 -1
- invenio_vocabularies/datastreams/readers.py +99 -9
- invenio_vocabularies/datastreams/transformers.py +67 -0
- invenio_vocabularies/datastreams/writers.py +6 -2
- invenio_vocabularies/factories.py +56 -0
- invenio_vocabularies/fixtures.py +2 -0
- invenio_vocabularies/records/jsonschemas/vocabularies/definitions-v1.0.0.json +9 -0
- invenio_vocabularies/services/config.py +1 -7
- invenio_vocabularies/services/querystr.py +5 -0
- invenio_vocabularies/services/tasks.py +2 -0
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/METADATA +28 -2
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/RECORD +46 -42
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/AUTHORS.rst +0 -0
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/LICENSE +0 -0
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/WHEEL +0 -0
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/entry_points.txt +0 -0
- {invenio_vocabularies-6.6.0.dist-info → invenio_vocabularies-6.8.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C)
|
|
3
|
+
# Copyright (C) 2024 CERN.
|
|
4
4
|
#
|
|
5
5
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
@@ -8,121 +8,65 @@
|
|
|
8
8
|
|
|
9
9
|
"""EuroSciVoc subjects datastreams, readers, transformers, and writers."""
|
|
10
10
|
|
|
11
|
-
import
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
:param item: The RDF data provided as bytes (optional).
|
|
49
|
-
:yield: Processed EuroSciVoc subject data.
|
|
50
|
-
"""
|
|
51
|
-
if item:
|
|
52
|
-
raise NotImplementedError(
|
|
53
|
-
"EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
|
|
54
|
-
)
|
|
55
|
-
# Fetch the RDF data from the specified origin URL
|
|
56
|
-
response = requests.get(self.origin)
|
|
57
|
-
response.raise_for_status()
|
|
58
|
-
|
|
59
|
-
# Treat the response content as a file-like object
|
|
60
|
-
rdf_data = io.BytesIO(response.content)
|
|
61
|
-
|
|
62
|
-
# Parse the RDF data into a graph
|
|
63
|
-
rdf_graph = Graph()
|
|
64
|
-
rdf_graph.parse(rdf_data, format="xml")
|
|
65
|
-
|
|
66
|
-
# Yield each processed subject from the RDF graph
|
|
67
|
-
yield from self._iter(rdf_graph)
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class EuroSciVocSubjectsTransformer(BaseTransformer):
|
|
71
|
-
"""Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
|
|
72
|
-
|
|
73
|
-
SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
|
|
74
|
-
SPLITCHAR = ","
|
|
11
|
+
from invenio_vocabularies.datastreams.transformers import RDFTransformer
|
|
12
|
+
|
|
13
|
+
from ..config import euroscivoc_file_url
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class EuroSciVocSubjectsTransformer(RDFTransformer):
|
|
17
|
+
"""
|
|
18
|
+
Transformer class to convert EuroSciVoc RDF data to a dictionary format.
|
|
19
|
+
|
|
20
|
+
Input:
|
|
21
|
+
- Relevant fields:
|
|
22
|
+
- `skos:notation`: Primary identifier for the concept.
|
|
23
|
+
- `skos:prefLabel`: Preferred labels with language codes.
|
|
24
|
+
- `skos:altLabel`: Alternative labels.
|
|
25
|
+
- `skos:broader`: Broader concepts that this concept belongs to.
|
|
26
|
+
|
|
27
|
+
Output:
|
|
28
|
+
{
|
|
29
|
+
"id": "euroscivoc:1717", # EuroSciVoc-specific concept ID (skos:notation).
|
|
30
|
+
"scheme": "EuroSciVoc", # The scheme name indicating this is a EuroSciVoc concept.
|
|
31
|
+
"subject": "Satellite radio", # The primary subject label (first preferred label in English, skos:prefLabel).
|
|
32
|
+
"title": {
|
|
33
|
+
"it": "Radio satellitare", # Italian preferred label (skos:prefLabel).
|
|
34
|
+
"en": "Satellite radio", # English preferred label (skos:prefLabel).
|
|
35
|
+
},
|
|
36
|
+
"props": {
|
|
37
|
+
"parents": "euroscivoc:1225", # The broader concept (skos:broader), identified by its EuroSciVoc Concept ID.
|
|
38
|
+
},
|
|
39
|
+
"identifiers": [
|
|
40
|
+
{
|
|
41
|
+
"scheme": "url", # Type of identifier (URL).
|
|
42
|
+
"identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", # URI of the concept (rdf:about).
|
|
43
|
+
}
|
|
44
|
+
],
|
|
45
|
+
}
|
|
46
|
+
"""
|
|
75
47
|
|
|
76
48
|
def _get_notation(self, subject, rdf_graph):
|
|
77
49
|
"""Extract the numeric notation for a subject."""
|
|
78
50
|
for _, _, notation in rdf_graph.triples(
|
|
79
|
-
(subject, self.
|
|
51
|
+
(subject, self.skos_core.notation, None)
|
|
80
52
|
):
|
|
81
53
|
if str(notation).isdigit():
|
|
82
54
|
return str(notation)
|
|
83
55
|
return None
|
|
84
56
|
|
|
85
|
-
def
|
|
86
|
-
"""Extract
|
|
87
|
-
|
|
88
|
-
label.language: label.value.capitalize()
|
|
89
|
-
for _, _, label in rdf_graph.triples(
|
|
90
|
-
(subject, self.SKOS_CORE.prefLabel, None)
|
|
91
|
-
)
|
|
92
|
-
}
|
|
93
|
-
if "en" not in labels:
|
|
94
|
-
for _, _, label in rdf_graph.triples(
|
|
95
|
-
(subject, self.SKOS_CORE.altLabel, None)
|
|
96
|
-
):
|
|
97
|
-
labels.setdefault(label.language, label.value.capitalize())
|
|
98
|
-
return labels
|
|
99
|
-
|
|
100
|
-
def _find_parents(self, subject, rdf_graph):
|
|
101
|
-
"""Find parent notations."""
|
|
102
|
-
parents = []
|
|
103
|
-
|
|
104
|
-
# Traverse the broader hierarchy
|
|
105
|
-
for broader in rdf_graph.transitive_objects(subject, self.SKOS_CORE.broader):
|
|
106
|
-
if broader != subject: # Ensure we don't include the current subject
|
|
107
|
-
parent_notation = self._get_notation(broader, rdf_graph)
|
|
108
|
-
if parent_notation:
|
|
109
|
-
parents.append(parent_notation)
|
|
110
|
-
|
|
111
|
-
return parents
|
|
57
|
+
def _get_parent_notation(self, broader, rdf_graph):
|
|
58
|
+
"""Extract parent notation using numeric notation."""
|
|
59
|
+
return self._get_notation(broader, rdf_graph)
|
|
112
60
|
|
|
113
61
|
def _transform_entry(self, subject, rdf_graph):
|
|
114
|
-
"""Transform an entry to the required dictionary format."""
|
|
115
|
-
# Get subject notation with euroscivoc prefix
|
|
116
62
|
notation = self._get_notation(subject, rdf_graph)
|
|
117
63
|
id = f"euroscivoc:{notation}" if notation else None
|
|
118
|
-
# Get labels for the current subject
|
|
119
64
|
labels = self._get_labels(subject, rdf_graph)
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
65
|
+
parents = ",".join(
|
|
66
|
+
f"euroscivoc:{n}"
|
|
67
|
+
for n in reversed(self._find_parents(subject, rdf_graph))
|
|
68
|
+
if n
|
|
123
69
|
)
|
|
124
|
-
# Create identifiers list
|
|
125
|
-
identifiers = [{"scheme": "url", "identifier": str(subject)}]
|
|
126
70
|
|
|
127
71
|
return {
|
|
128
72
|
"id": id,
|
|
@@ -130,27 +74,11 @@ class EuroSciVocSubjectsTransformer(BaseTransformer):
|
|
|
130
74
|
"subject": labels.get("en", "").capitalize(),
|
|
131
75
|
"title": labels,
|
|
132
76
|
"props": {"parents": parents} if parents else {},
|
|
133
|
-
"identifiers":
|
|
77
|
+
"identifiers": self._get_identifiers(subject),
|
|
134
78
|
}
|
|
135
79
|
|
|
136
|
-
def apply(self, stream_entry, *args, **kwargs):
|
|
137
|
-
"""Transform a stream entry to the required dictionary format.
|
|
138
|
-
|
|
139
|
-
:param stream_entry: The entry to be transformed, which includes the subject and the RDF graph.
|
|
140
|
-
:return: The transformed stream entry.
|
|
141
|
-
"""
|
|
142
|
-
# Apply transformations
|
|
143
|
-
entry_data = self._transform_entry(
|
|
144
|
-
stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
|
|
145
|
-
)
|
|
146
|
-
stream_entry.entry = entry_data
|
|
147
|
-
return stream_entry
|
|
148
80
|
|
|
149
|
-
|
|
150
|
-
# Configuration for datastream readers, transformers, and writers
|
|
151
|
-
VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}
|
|
152
|
-
|
|
153
|
-
VOCABULARIES_DATASTREAM_WRITERS = {}
|
|
81
|
+
# Configuration for datastream
|
|
154
82
|
|
|
155
83
|
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
156
84
|
"euroscivoc-transformer": EuroSciVocSubjectsTransformer
|
|
@@ -159,13 +87,15 @@ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
|
159
87
|
DATASTREAM_CONFIG = {
|
|
160
88
|
"readers": [
|
|
161
89
|
{
|
|
162
|
-
"type": "
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
90
|
+
"type": "http",
|
|
91
|
+
"args": {
|
|
92
|
+
"origin": euroscivoc_file_url,
|
|
93
|
+
},
|
|
94
|
+
},
|
|
167
95
|
{
|
|
168
|
-
"type": "
|
|
169
|
-
}
|
|
96
|
+
"type": "rdf",
|
|
97
|
+
},
|
|
170
98
|
],
|
|
99
|
+
"transformers": [{"type": "euroscivoc-transformer"}],
|
|
100
|
+
"writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
|
|
171
101
|
}
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#
|
|
3
|
+
# Copyright (C) 2024 CERN.
|
|
4
|
+
#
|
|
5
|
+
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
|
+
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
7
|
+
# details.
|
|
8
|
+
|
|
9
|
+
"""GEMET subjects datastreams, readers, transformers, and writers."""
|
|
10
|
+
|
|
11
|
+
from invenio_vocabularies.datastreams.transformers import RDFTransformer
|
|
12
|
+
|
|
13
|
+
from ..config import gemet_file_url
|
|
14
|
+
|
|
15
|
+
# Available with the "rdf" extra
|
|
16
|
+
try:
|
|
17
|
+
import rdflib
|
|
18
|
+
except ImportError:
|
|
19
|
+
rdflib = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class GEMETSubjectsTransformer(RDFTransformer):
|
|
23
|
+
"""
|
|
24
|
+
Transformer class to convert GEMET RDF data to a dictionary format.
|
|
25
|
+
|
|
26
|
+
Input:
|
|
27
|
+
- Relevant fields:
|
|
28
|
+
- `skos:prefLabel`: Preferred labels with language codes.
|
|
29
|
+
- `skos:broader`: References to broader concepts (parent concepts).
|
|
30
|
+
- `skos:memberOf`: References to groups or themes the concept belongs to.
|
|
31
|
+
|
|
32
|
+
Output:
|
|
33
|
+
- A dictionary with the following structure:
|
|
34
|
+
{
|
|
35
|
+
"id": "gemet:concept/10008", # GEMET-specific concept ID (skos:Concept).
|
|
36
|
+
"scheme": "GEMET", # The scheme name indicating this is a GEMET concept.
|
|
37
|
+
"subject": "Consumer product", # The subject label (first preferred label in English, skos:prefLabel).
|
|
38
|
+
"title": {
|
|
39
|
+
"en": "Consumer product", # English label for the concept (skos:prefLabel).
|
|
40
|
+
"ar": "منتج استهلاكي" # Arabic label for the concept (skos:prefLabel).
|
|
41
|
+
},
|
|
42
|
+
"props": {
|
|
43
|
+
"parents": "gemet:concept/6660", # The parent concept (skos:broader), identified by its GEMET Concept ID.
|
|
44
|
+
"groups": ["http://www.eionet.europa.eu/gemet/group/10112"], # Group the concept belongs to (skos:memberOf)(skos:prefLabel).
|
|
45
|
+
"themes": [
|
|
46
|
+
"http://www.eionet.europa.eu/gemet/theme/27", # Theme the concept belongs to (skos:memberOf)(rdfs:label).
|
|
47
|
+
]
|
|
48
|
+
},
|
|
49
|
+
"identifiers": [
|
|
50
|
+
{
|
|
51
|
+
"scheme": "url", # Type of identifier (URL).
|
|
52
|
+
"identifier": "http://www.eionet.europa.eu/gemet/concept/10008" # URI of the concept (rdf:about).
|
|
53
|
+
}
|
|
54
|
+
]
|
|
55
|
+
}
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
def _get_parent_notation(self, broader, rdf_graph):
|
|
59
|
+
"""Extract parent notation from GEMET URI."""
|
|
60
|
+
return "/".join(broader.split("/")[-2:])
|
|
61
|
+
|
|
62
|
+
def _get_groups_and_themes(self, subject, rdf_graph):
|
|
63
|
+
"""Extract groups and themes for a subject."""
|
|
64
|
+
groups = []
|
|
65
|
+
themes = []
|
|
66
|
+
|
|
67
|
+
for relation in rdf_graph.subjects(
|
|
68
|
+
predicate=self.skos_core.member, object=subject
|
|
69
|
+
):
|
|
70
|
+
relation_uri = str(relation)
|
|
71
|
+
relation_label = None
|
|
72
|
+
|
|
73
|
+
# If the relation is a group, check for skos:prefLabel
|
|
74
|
+
if "group" in relation_uri:
|
|
75
|
+
labels = rdf_graph.objects(
|
|
76
|
+
subject=relation, predicate=self.skos_core.prefLabel
|
|
77
|
+
)
|
|
78
|
+
relation_label = next(
|
|
79
|
+
(str(label) for label in labels if label.language == "en"), None
|
|
80
|
+
)
|
|
81
|
+
groups.append(relation_uri)
|
|
82
|
+
|
|
83
|
+
# If the relation is a theme, check for rdfs:label
|
|
84
|
+
elif "theme" in relation_uri:
|
|
85
|
+
labels = rdf_graph.objects(
|
|
86
|
+
subject=relation, predicate=rdflib.RDFS.label
|
|
87
|
+
)
|
|
88
|
+
relation_label = next(
|
|
89
|
+
(str(label) for label in labels if label.language == "en"), None
|
|
90
|
+
)
|
|
91
|
+
themes.append(relation_uri)
|
|
92
|
+
|
|
93
|
+
return groups, themes
|
|
94
|
+
|
|
95
|
+
def _transform_entry(self, subject, rdf_graph):
|
|
96
|
+
"""Transform an entry to the required dictionary format."""
|
|
97
|
+
concept_number = "/".join(subject.split("/")[-2:])
|
|
98
|
+
id = f"gemet:{concept_number}" if concept_number else None
|
|
99
|
+
labels = self._get_labels(subject, rdf_graph)
|
|
100
|
+
parents = ",".join(
|
|
101
|
+
f"gemet:{n}" for n in reversed(self._find_parents(subject, rdf_graph)) if n
|
|
102
|
+
)
|
|
103
|
+
identifiers = [{"scheme": "url", "identifier": str(subject)}]
|
|
104
|
+
groups, themes = self._get_groups_and_themes(subject, rdf_graph)
|
|
105
|
+
|
|
106
|
+
props = {"parents": parents} if parents else {}
|
|
107
|
+
|
|
108
|
+
if groups:
|
|
109
|
+
props["groups"] = groups
|
|
110
|
+
if themes:
|
|
111
|
+
props["themes"] = themes
|
|
112
|
+
|
|
113
|
+
return {
|
|
114
|
+
"id": id,
|
|
115
|
+
"scheme": "GEMET",
|
|
116
|
+
"subject": labels.get("en", "").capitalize(),
|
|
117
|
+
"title": labels,
|
|
118
|
+
"props": props,
|
|
119
|
+
"identifiers": self._get_identifiers(subject),
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# Configuration for datastream
|
|
124
|
+
|
|
125
|
+
VOCABULARIES_DATASTREAM_TRANSFORMERS = {"gemet-transformer": GEMETSubjectsTransformer}
|
|
126
|
+
|
|
127
|
+
DATASTREAM_CONFIG = {
|
|
128
|
+
"readers": [
|
|
129
|
+
{
|
|
130
|
+
"type": "http",
|
|
131
|
+
"args": {
|
|
132
|
+
"origin": gemet_file_url,
|
|
133
|
+
},
|
|
134
|
+
},
|
|
135
|
+
{"type": "gzip"},
|
|
136
|
+
{"type": "rdf"},
|
|
137
|
+
],
|
|
138
|
+
"transformers": [{"type": "gemet-transformer"}],
|
|
139
|
+
"writers": [{"args": {"writer": {"type": "subjects-service"}}, "type": "async"}],
|
|
140
|
+
}
|
|
@@ -34,9 +34,20 @@
|
|
|
34
34
|
"type": "object",
|
|
35
35
|
"patternProperties": {
|
|
36
36
|
"^.*$": {
|
|
37
|
-
"
|
|
37
|
+
"oneOf": [
|
|
38
|
+
{
|
|
39
|
+
"type": "string"
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"type": "array",
|
|
43
|
+
"items": {
|
|
44
|
+
"type": "string"
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
]
|
|
38
48
|
}
|
|
39
|
-
}
|
|
49
|
+
},
|
|
50
|
+
"additionalProperties": false
|
|
40
51
|
},
|
|
41
52
|
"identifiers": {
|
|
42
53
|
"description": "Alternate identifiers for the subject.",
|
|
@@ -13,8 +13,8 @@
|
|
|
13
13
|
from functools import partial
|
|
14
14
|
|
|
15
15
|
from invenio_i18n import get_locale
|
|
16
|
-
from marshmallow import EXCLUDE, Schema, fields, pre_load
|
|
17
|
-
from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
|
|
16
|
+
from marshmallow import EXCLUDE, Schema, ValidationError, fields, pre_load, validate
|
|
17
|
+
from marshmallow_utils.fields import URL, IdentifierSet, SanitizedUnicode
|
|
18
18
|
from marshmallow_utils.schemas import IdentifierSchema
|
|
19
19
|
|
|
20
20
|
from ...services.schema import (
|
|
@@ -25,6 +25,21 @@ from ...services.schema import (
|
|
|
25
25
|
from .config import subject_schemes
|
|
26
26
|
|
|
27
27
|
|
|
28
|
+
class StringOrListOfStrings(fields.Field):
|
|
29
|
+
"""Custom field to handle both string and list of strings."""
|
|
30
|
+
|
|
31
|
+
# TODO: Move this to marshmallow-utils for broader type support.
|
|
32
|
+
def _deserialize(self, value, attr, data, **kwargs):
|
|
33
|
+
if isinstance(value, str):
|
|
34
|
+
return fields.String()._deserialize(value, attr, data, **kwargs)
|
|
35
|
+
elif isinstance(value, list) and all(isinstance(item, str) for item in value):
|
|
36
|
+
return [
|
|
37
|
+
fields.String()._deserialize(item, attr, data, **kwargs)
|
|
38
|
+
for item in value
|
|
39
|
+
]
|
|
40
|
+
raise ValidationError("Invalid value. Must be a string or a list of strings.")
|
|
41
|
+
|
|
42
|
+
|
|
28
43
|
class SubjectSchema(BaseVocabularySchema):
|
|
29
44
|
"""Service schema for subjects."""
|
|
30
45
|
|
|
@@ -35,7 +50,7 @@ class SubjectSchema(BaseVocabularySchema):
|
|
|
35
50
|
scheme = SanitizedUnicode(required=True)
|
|
36
51
|
subject = SanitizedUnicode(required=True)
|
|
37
52
|
title = i18n_strings
|
|
38
|
-
props = fields.Dict(keys=SanitizedUnicode(), values=
|
|
53
|
+
props = fields.Dict(keys=SanitizedUnicode(), values=StringOrListOfStrings())
|
|
39
54
|
identifiers = IdentifierSet(
|
|
40
55
|
fields.Nested(
|
|
41
56
|
partial(
|
|
@@ -48,7 +48,16 @@ class StreamEntry:
|
|
|
48
48
|
class DataStream:
|
|
49
49
|
"""Data stream."""
|
|
50
50
|
|
|
51
|
-
def __init__(
|
|
51
|
+
def __init__(
|
|
52
|
+
self,
|
|
53
|
+
readers,
|
|
54
|
+
writers,
|
|
55
|
+
transformers=None,
|
|
56
|
+
batch_size=100,
|
|
57
|
+
write_many=False,
|
|
58
|
+
*args,
|
|
59
|
+
**kwargs,
|
|
60
|
+
):
|
|
52
61
|
"""Constructor.
|
|
53
62
|
|
|
54
63
|
:param readers: an ordered list of readers.
|
|
@@ -58,12 +67,14 @@ class DataStream:
|
|
|
58
67
|
self._readers = readers
|
|
59
68
|
self._transformers = transformers
|
|
60
69
|
self._writers = writers
|
|
70
|
+
self.batch_size = batch_size
|
|
71
|
+
self.write_many = write_many
|
|
61
72
|
|
|
62
73
|
def filter(self, stream_entry, *args, **kwargs):
|
|
63
74
|
"""Checks if an stream_entry should be filtered out (skipped)."""
|
|
64
75
|
return False
|
|
65
76
|
|
|
66
|
-
def process_batch(self, batch
|
|
77
|
+
def process_batch(self, batch):
|
|
67
78
|
"""Process a batch of entries."""
|
|
68
79
|
transformed_entries = []
|
|
69
80
|
for stream_entry in batch:
|
|
@@ -79,12 +90,12 @@ class DataStream:
|
|
|
79
90
|
else:
|
|
80
91
|
transformed_entries.append(transformed_entry)
|
|
81
92
|
if transformed_entries:
|
|
82
|
-
if write_many:
|
|
93
|
+
if self.write_many:
|
|
83
94
|
yield from self.batch_write(transformed_entries)
|
|
84
95
|
else:
|
|
85
96
|
yield from (self.write(entry) for entry in transformed_entries)
|
|
86
97
|
|
|
87
|
-
def process(self,
|
|
98
|
+
def process(self, *args, **kwargs):
|
|
88
99
|
"""Iterates over the entries.
|
|
89
100
|
|
|
90
101
|
Uses the reader to get the raw entries and transforms them.
|
|
@@ -95,13 +106,13 @@ class DataStream:
|
|
|
95
106
|
batch = []
|
|
96
107
|
for stream_entry in self.read():
|
|
97
108
|
batch.append(stream_entry)
|
|
98
|
-
if len(batch) >= batch_size:
|
|
99
|
-
yield from self.process_batch(batch
|
|
109
|
+
if len(batch) >= self.batch_size:
|
|
110
|
+
yield from self.process_batch(batch)
|
|
100
111
|
batch = []
|
|
101
112
|
|
|
102
113
|
# Process any remaining entries in the last batch
|
|
103
114
|
if batch:
|
|
104
|
-
yield from self.process_batch(batch
|
|
115
|
+
yield from self.process_batch(batch)
|
|
105
116
|
|
|
106
117
|
def read(self):
|
|
107
118
|
"""Recursively read the entries."""
|
|
@@ -81,4 +81,6 @@ class DataStreamFactory:
|
|
|
81
81
|
for t_conf in transformers_config:
|
|
82
82
|
transformers.append(TransformerFactory.create(t_conf))
|
|
83
83
|
|
|
84
|
-
return DataStream(
|
|
84
|
+
return DataStream(
|
|
85
|
+
readers=readers, writers=writers, transformers=transformers, **kwargs
|
|
86
|
+
)
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
|
|
12
12
|
import csv
|
|
13
13
|
import gzip
|
|
14
|
+
import io
|
|
14
15
|
import json
|
|
15
16
|
import re
|
|
16
17
|
import tarfile
|
|
@@ -27,11 +28,25 @@ from lxml.html import parse as html_parse
|
|
|
27
28
|
from .errors import ReaderError
|
|
28
29
|
from .xml import etree_to_dict
|
|
29
30
|
|
|
31
|
+
# Extras dependencies
|
|
32
|
+
# "oaipmh"
|
|
30
33
|
try:
|
|
31
34
|
import oaipmh_scythe
|
|
32
35
|
except ImportError:
|
|
33
36
|
oaipmh_scythe = None
|
|
34
37
|
|
|
38
|
+
# "rdf"
|
|
39
|
+
try:
|
|
40
|
+
import rdflib
|
|
41
|
+
except ImportError:
|
|
42
|
+
rdflib = None
|
|
43
|
+
|
|
44
|
+
# "sparql"
|
|
45
|
+
try:
|
|
46
|
+
import SPARQLWrapper as sparql
|
|
47
|
+
except ImportError:
|
|
48
|
+
sparql = None
|
|
49
|
+
|
|
35
50
|
|
|
36
51
|
class BaseReader(ABC):
|
|
37
52
|
"""Base reader."""
|
|
@@ -103,8 +118,7 @@ class SimpleHTTPReader(BaseReader):
|
|
|
103
118
|
|
|
104
119
|
def __init__(self, origin, id=None, ids=None, content_type=None, *args, **kwargs):
|
|
105
120
|
"""Constructor."""
|
|
106
|
-
|
|
107
|
-
self._ids = ids if ids else [id]
|
|
121
|
+
self._ids = ids if ids else ([id] if id else None)
|
|
108
122
|
self.content_type = content_type
|
|
109
123
|
super().__init__(origin, *args, **kwargs)
|
|
110
124
|
|
|
@@ -113,14 +127,22 @@ class SimpleHTTPReader(BaseReader):
|
|
|
113
127
|
base_url = url
|
|
114
128
|
headers = {"Accept": self.content_type}
|
|
115
129
|
|
|
116
|
-
|
|
117
|
-
|
|
130
|
+
# If there are no IDs, query the base URL
|
|
131
|
+
if not self._ids:
|
|
118
132
|
resp = requests.get(url, headers=headers)
|
|
119
|
-
if resp.status_code
|
|
120
|
-
|
|
121
|
-
|
|
133
|
+
if resp.status_code == 200:
|
|
134
|
+
yield resp.content
|
|
135
|
+
else:
|
|
136
|
+
print(f"Failed to fetch URL {url}: {resp.status_code}")
|
|
137
|
+
else:
|
|
138
|
+
for id_ in self._ids:
|
|
139
|
+
url = base_url.format(id=id_)
|
|
140
|
+
resp = requests.get(url, headers=headers)
|
|
141
|
+
if resp.status_code != 200:
|
|
142
|
+
# todo add logging/fail
|
|
143
|
+
pass
|
|
122
144
|
|
|
123
|
-
|
|
145
|
+
yield resp.content
|
|
124
146
|
|
|
125
147
|
def read(self, item=None, *args, **kwargs):
|
|
126
148
|
"""Chooses between item and origin as url."""
|
|
@@ -197,6 +219,9 @@ class GzipReader(BaseReader):
|
|
|
197
219
|
"""Gzip reader."""
|
|
198
220
|
|
|
199
221
|
def _iter(self, fp, *args, **kwargs):
|
|
222
|
+
if isinstance(fp, bytes):
|
|
223
|
+
fp = io.BytesIO(fp)
|
|
224
|
+
|
|
200
225
|
with gzip.open(fp) as gp:
|
|
201
226
|
yield gp
|
|
202
227
|
|
|
@@ -236,7 +261,7 @@ class XMLReader(BaseReader):
|
|
|
236
261
|
try:
|
|
237
262
|
xml_tree = fromstring(fp)
|
|
238
263
|
xml_dict = etree_to_dict(xml_tree)
|
|
239
|
-
except Exception
|
|
264
|
+
except Exception:
|
|
240
265
|
xml_tree = html_parse(fp).getroot()
|
|
241
266
|
xml_dict = etree_to_dict(xml_tree)["html"]["body"]
|
|
242
267
|
|
|
@@ -346,3 +371,68 @@ def xml_to_dict(tree: etree._Element):
|
|
|
346
371
|
dict_obj["record"] = etree.tostring(tree)
|
|
347
372
|
|
|
348
373
|
return dict_obj
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
class RDFReader(BaseReader):
|
|
377
|
+
"""Base Reader class to fetch and process RDF data."""
|
|
378
|
+
|
|
379
|
+
@property
|
|
380
|
+
def skos_core(self):
|
|
381
|
+
"""Return the SKOS Core namespace."""
|
|
382
|
+
return rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
|
|
383
|
+
|
|
384
|
+
def _iter(self, rdf_graph):
|
|
385
|
+
"""Iterate over the RDF graph, yielding one subject at a time."""
|
|
386
|
+
for subject, _, _ in rdf_graph.triples(
|
|
387
|
+
(None, rdflib.RDF.type, self.skos_core.Concept)
|
|
388
|
+
):
|
|
389
|
+
yield {"subject": subject, "rdf_graph": rdf_graph}
|
|
390
|
+
|
|
391
|
+
def read(self, item=None, *args, **kwargs):
|
|
392
|
+
"""Fetch and process the RDF data, yielding it one subject at a time."""
|
|
393
|
+
if isinstance(item, gzip.GzipFile):
|
|
394
|
+
rdf_content = item.read().decode("utf-8")
|
|
395
|
+
|
|
396
|
+
elif isinstance(item, bytes):
|
|
397
|
+
rdf_content = item.decode("utf-8")
|
|
398
|
+
else:
|
|
399
|
+
raise ReaderError("Unsupported content type")
|
|
400
|
+
|
|
401
|
+
rdf_graph = rdflib.Graph()
|
|
402
|
+
rdf_graph.parse(io.StringIO(rdf_content), format="xml")
|
|
403
|
+
|
|
404
|
+
yield from self._iter(rdf_graph)
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
class SPARQLReader(BaseReader):
|
|
408
|
+
"""Generic reader class to fetch and process RDF data from a SPARQL endpoint."""
|
|
409
|
+
|
|
410
|
+
def __init__(self, origin, query, mode="r", *args, **kwargs):
|
|
411
|
+
"""Initialize the reader with the data source.
|
|
412
|
+
|
|
413
|
+
:param origin: The SPARQL endpoint from which to fetch the RDF data.
|
|
414
|
+
:param query: The SPARQL query to execute.
|
|
415
|
+
:param mode: Mode of operation (default is 'r' for reading).
|
|
416
|
+
"""
|
|
417
|
+
self._origin = origin
|
|
418
|
+
self._query = query
|
|
419
|
+
super().__init__(origin=origin, mode=mode, *args, **kwargs)
|
|
420
|
+
|
|
421
|
+
def _iter(self, fp, *args, **kwargs):
|
|
422
|
+
raise NotImplementedError(
|
|
423
|
+
"SPARQLReader downloads one result set from SPARQL and therefore does not iterate through items"
|
|
424
|
+
)
|
|
425
|
+
|
|
426
|
+
def read(self, item=None, *args, **kwargs):
|
|
427
|
+
"""Fetch and process RDF data, yielding results one at a time."""
|
|
428
|
+
if item:
|
|
429
|
+
raise NotImplementedError(
|
|
430
|
+
"SPARQLReader does not support being chained after another reader"
|
|
431
|
+
)
|
|
432
|
+
|
|
433
|
+
sparql_client = sparql.SPARQLWrapper(self._origin)
|
|
434
|
+
sparql_client.setQuery(self._query)
|
|
435
|
+
sparql_client.setReturnFormat(sparql.JSON)
|
|
436
|
+
|
|
437
|
+
results = sparql_client.query().convert()
|
|
438
|
+
yield from results["results"]["bindings"]
|