invenio-vocabularies 4.1.1__py2.py3-none-any.whl → 4.3.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of invenio-vocabularies might be problematic. Click here for more details.
- invenio_vocabularies/__init__.py +1 -1
- invenio_vocabularies/administration/views/vocabularies.py +1 -0
- invenio_vocabularies/cli.py +17 -6
- invenio_vocabularies/config.py +15 -1
- invenio_vocabularies/contrib/affiliations/api.py +1 -2
- invenio_vocabularies/contrib/affiliations/datastreams.py +33 -8
- invenio_vocabularies/contrib/affiliations/services.py +1 -2
- invenio_vocabularies/contrib/awards/awards.py +2 -1
- invenio_vocabularies/contrib/awards/datastreams.py +1 -0
- invenio_vocabularies/contrib/awards/services.py +1 -2
- invenio_vocabularies/contrib/common/ror/datastreams.py +39 -5
- invenio_vocabularies/contrib/funders/datastreams.py +38 -11
- invenio_vocabularies/contrib/funders/funders.py +2 -1
- invenio_vocabularies/contrib/names/datastreams.py +160 -2
- invenio_vocabularies/contrib/names/s3client.py +44 -0
- invenio_vocabularies/datastreams/datastreams.py +61 -13
- invenio_vocabularies/datastreams/readers.py +40 -15
- invenio_vocabularies/datastreams/tasks.py +37 -0
- invenio_vocabularies/datastreams/writers.py +70 -0
- invenio_vocabularies/factories.py +1 -0
- invenio_vocabularies/records/models.py +2 -4
- invenio_vocabularies/records/pidprovider.py +1 -2
- invenio_vocabularies/resources/__init__.py +1 -0
- invenio_vocabularies/resources/schema.py +2 -1
- invenio_vocabularies/services/custom_fields/subject.py +3 -2
- invenio_vocabularies/services/custom_fields/vocabulary.py +1 -1
- invenio_vocabularies/services/tasks.py +0 -30
- invenio_vocabularies/templates/semantic-ui/invenio_vocabularies/subjects.html +1 -1
- {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/METADATA +18 -1
- {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/RECORD +35 -33
- {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/AUTHORS.rst +0 -0
- {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/LICENSE +0 -0
- {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/WHEEL +0 -0
- {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/entry_points.txt +0 -0
- {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/top_level.txt +0 -0
invenio_vocabularies/__init__.py
CHANGED
invenio_vocabularies/cli.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2020-
|
|
3
|
+
# Copyright (C) 2020-2024 CERN.
|
|
4
4
|
# Copyright (C) 2021 Graz University of Technology.
|
|
5
5
|
#
|
|
6
6
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
@@ -9,7 +9,6 @@
|
|
|
9
9
|
|
|
10
10
|
"""Commands to create and manage vocabularies."""
|
|
11
11
|
|
|
12
|
-
|
|
13
12
|
import click
|
|
14
13
|
from flask.cli import with_appcontext
|
|
15
14
|
from invenio_access.permissions import system_identity
|
|
@@ -101,7 +100,10 @@ def update(vocabulary, filepath=None, origin=None):
|
|
|
101
100
|
config = vc.get_config(filepath, origin)
|
|
102
101
|
|
|
103
102
|
for w_conf in config["writers"]:
|
|
104
|
-
w_conf["
|
|
103
|
+
if w_conf["type"] == "async":
|
|
104
|
+
w_conf["args"]["writer"]["args"]["update"] = True
|
|
105
|
+
else:
|
|
106
|
+
w_conf["args"]["update"] = True
|
|
105
107
|
|
|
106
108
|
success, errored, filtered = _process_vocab(config)
|
|
107
109
|
|
|
@@ -140,18 +142,27 @@ def convert(vocabulary, filepath=None, origin=None, target=None, num_samples=Non
|
|
|
140
142
|
type=click.STRING,
|
|
141
143
|
help="Identifier of the vocabulary item to delete.",
|
|
142
144
|
)
|
|
143
|
-
@click.option("--all", is_flag=True, default=False
|
|
145
|
+
@click.option("--all", is_flag=True, default=False)
|
|
144
146
|
@with_appcontext
|
|
145
147
|
def delete(vocabulary, identifier, all):
|
|
146
148
|
"""Delete all items or a specific one of the vocabulary."""
|
|
147
|
-
if not
|
|
149
|
+
if not identifier and not all:
|
|
148
150
|
click.secho("An identifier or the --all flag must be present.", fg="red")
|
|
149
151
|
exit(1)
|
|
152
|
+
|
|
150
153
|
vc = get_vocabulary_config(vocabulary)
|
|
151
154
|
service = vc.get_service()
|
|
152
155
|
if identifier:
|
|
153
156
|
try:
|
|
154
|
-
if service.delete(
|
|
157
|
+
if service.delete(system_identity, identifier):
|
|
155
158
|
click.secho(f"{identifier} deleted from {vocabulary}.", fg="green")
|
|
156
159
|
except (PIDDeletedError, PIDDoesNotExistError):
|
|
157
160
|
click.secho(f"PID {identifier} not found.")
|
|
161
|
+
elif all:
|
|
162
|
+
items = service.scan(system_identity)
|
|
163
|
+
for item in items.hits:
|
|
164
|
+
try:
|
|
165
|
+
if service.delete(system_identity, item["id"]):
|
|
166
|
+
click.secho(f"{item['id']} deleted from {vocabulary}.", fg="green")
|
|
167
|
+
except (PIDDeletedError, PIDDoesNotExistError):
|
|
168
|
+
click.secho(f"PID {item['id']} not found.")
|
invenio_vocabularies/config.py
CHANGED
|
@@ -24,7 +24,7 @@ from .datastreams.readers import (
|
|
|
24
24
|
ZipReader,
|
|
25
25
|
)
|
|
26
26
|
from .datastreams.transformers import XMLTransformer
|
|
27
|
-
from .datastreams.writers import ServiceWriter, YamlWriter
|
|
27
|
+
from .datastreams.writers import AsyncWriter, ServiceWriter, YamlWriter
|
|
28
28
|
from .resources import VocabulariesResourceConfig
|
|
29
29
|
from .services.config import VocabulariesServiceConfig
|
|
30
30
|
|
|
@@ -134,6 +134,7 @@ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
|
134
134
|
VOCABULARIES_DATASTREAM_WRITERS = {
|
|
135
135
|
"service": ServiceWriter,
|
|
136
136
|
"yaml": YamlWriter,
|
|
137
|
+
"async": AsyncWriter,
|
|
137
138
|
}
|
|
138
139
|
"""Data Streams writers."""
|
|
139
140
|
|
|
@@ -154,3 +155,16 @@ VOCABULARIES_TYPES_SEARCH = {
|
|
|
154
155
|
"sort": ["name", "count"],
|
|
155
156
|
}
|
|
156
157
|
"""Vocabulary type search configuration."""
|
|
158
|
+
|
|
159
|
+
VOCABULARIES_ORCID_ACCESS_KEY = "TODO"
|
|
160
|
+
"""ORCID access key to access the s3 bucket."""
|
|
161
|
+
VOCABULARIES_ORCID_SECRET_KEY = "TODO"
|
|
162
|
+
"""ORCID secret key to access the s3 bucket."""
|
|
163
|
+
VOCABULARIES_ORCID_SUMMARIES_BUCKET = "v3.0-summaries"
|
|
164
|
+
"""ORCID summaries bucket name."""
|
|
165
|
+
VOCABULARIES_ORCID_SYNC_MAX_WORKERS = 32
|
|
166
|
+
"""ORCID max number of simultaneous workers/connections."""
|
|
167
|
+
VOCABULARIES_ORCID_SYNC_SINCE = {
|
|
168
|
+
"days": 1,
|
|
169
|
+
}
|
|
170
|
+
"""ORCID time shift to sync. Parameters accepted are the ones passed to 'datetime.timedelta'."""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2021 CERN.
|
|
3
|
+
# Copyright (C) 2021-2024 CERN.
|
|
4
4
|
#
|
|
5
5
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
@@ -8,7 +8,6 @@
|
|
|
8
8
|
|
|
9
9
|
"""Vocabulary affiliations."""
|
|
10
10
|
|
|
11
|
-
|
|
12
11
|
from .affiliations import record_type
|
|
13
12
|
|
|
14
13
|
Affiliation = record_type.record_cls
|
|
@@ -9,11 +9,11 @@
|
|
|
9
9
|
|
|
10
10
|
"""Affiliations datastreams, transformers, writers and readers."""
|
|
11
11
|
|
|
12
|
-
from
|
|
12
|
+
from flask import current_app
|
|
13
13
|
from invenio_i18n import lazy_gettext as _
|
|
14
14
|
|
|
15
15
|
from ...datastreams.writers import ServiceWriter
|
|
16
|
-
from .
|
|
16
|
+
from ..common.ror.datastreams import RORTransformer
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
class AffiliationsServiceWriter(ServiceWriter):
|
|
@@ -29,9 +29,35 @@ class AffiliationsServiceWriter(ServiceWriter):
|
|
|
29
29
|
return entry["id"]
|
|
30
30
|
|
|
31
31
|
|
|
32
|
+
class AffiliationsRORTransformer(RORTransformer):
|
|
33
|
+
"""Affiliations ROR Transformer."""
|
|
34
|
+
|
|
35
|
+
def __init__(
|
|
36
|
+
self, *args, vocab_schemes=None, funder_fundref_doi_prefix=None, **kwargs
|
|
37
|
+
):
|
|
38
|
+
"""Constructor."""
|
|
39
|
+
if vocab_schemes is None:
|
|
40
|
+
vocab_schemes = current_app.config.get("VOCABULARIES_AFFILIATION_SCHEMES")
|
|
41
|
+
super().__init__(
|
|
42
|
+
*args,
|
|
43
|
+
vocab_schemes=vocab_schemes,
|
|
44
|
+
funder_fundref_doi_prefix=funder_fundref_doi_prefix,
|
|
45
|
+
**kwargs,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
VOCABULARIES_DATASTREAM_READERS = {}
|
|
50
|
+
"""Affiliations datastream readers."""
|
|
51
|
+
|
|
32
52
|
VOCABULARIES_DATASTREAM_WRITERS = {
|
|
33
53
|
"affiliations-service": AffiliationsServiceWriter,
|
|
34
54
|
}
|
|
55
|
+
"""Affiliations datastream writers."""
|
|
56
|
+
|
|
57
|
+
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
58
|
+
"ror-affiliations": AffiliationsRORTransformer,
|
|
59
|
+
}
|
|
60
|
+
"""Affiliations datastream transformers."""
|
|
35
61
|
|
|
36
62
|
|
|
37
63
|
DATASTREAM_CONFIG = {
|
|
@@ -46,17 +72,16 @@ DATASTREAM_CONFIG = {
|
|
|
46
72
|
],
|
|
47
73
|
"transformers": [
|
|
48
74
|
{
|
|
49
|
-
"type": "ror",
|
|
50
|
-
"args": {
|
|
51
|
-
"vocab_schemes": affiliation_schemes,
|
|
52
|
-
},
|
|
75
|
+
"type": "ror-affiliations",
|
|
53
76
|
},
|
|
54
77
|
],
|
|
55
78
|
"writers": [
|
|
56
79
|
{
|
|
57
|
-
"type": "
|
|
80
|
+
"type": "async",
|
|
58
81
|
"args": {
|
|
59
|
-
"
|
|
82
|
+
"writer": {
|
|
83
|
+
"type": "affiliations-service",
|
|
84
|
+
}
|
|
60
85
|
},
|
|
61
86
|
}
|
|
62
87
|
],
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2021 CERN.
|
|
3
|
+
# Copyright (C) 2021-2024 CERN.
|
|
4
4
|
#
|
|
5
5
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
@@ -8,7 +8,6 @@
|
|
|
8
8
|
|
|
9
9
|
"""Vocabulary affiliations."""
|
|
10
10
|
|
|
11
|
-
|
|
12
11
|
from .affiliations import record_type
|
|
13
12
|
|
|
14
13
|
AffiliationsServiceConfig = record_type.service_config_cls
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2022 CERN.
|
|
3
|
+
# Copyright (C) 2022-2024 CERN.
|
|
4
4
|
#
|
|
5
5
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
7
7
|
# details.
|
|
8
8
|
|
|
9
9
|
"""Vocabulary awards."""
|
|
10
|
+
|
|
10
11
|
from flask_resources import (
|
|
11
12
|
BaseListSchema,
|
|
12
13
|
JSONSerializer,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2022 CERN.
|
|
3
|
+
# Copyright (C) 2022-2024 CERN.
|
|
4
4
|
#
|
|
5
5
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
@@ -8,7 +8,6 @@
|
|
|
8
8
|
|
|
9
9
|
"""Vocabulary awards."""
|
|
10
10
|
|
|
11
|
-
|
|
12
11
|
from .awards import record_type
|
|
13
12
|
|
|
14
13
|
AwardsServiceConfig = record_type.service_config_cls
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
"""ROR-related Datastreams Readers/Writers/Transformers module."""
|
|
11
11
|
|
|
12
12
|
import io
|
|
13
|
+
from datetime import datetime
|
|
13
14
|
|
|
14
15
|
import requests
|
|
15
16
|
from idutils import normalize_ror
|
|
@@ -22,6 +23,11 @@ from invenio_vocabularies.datastreams.transformers import BaseTransformer
|
|
|
22
23
|
class RORHTTPReader(BaseReader):
|
|
23
24
|
"""ROR HTTP Reader returning an in-memory binary stream of the latest ROR data dump ZIP file."""
|
|
24
25
|
|
|
26
|
+
def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
|
|
27
|
+
"""Constructor."""
|
|
28
|
+
self._since = since
|
|
29
|
+
super().__init__(origin, mode, *args, **kwargs)
|
|
30
|
+
|
|
25
31
|
def _iter(self, fp, *args, **kwargs):
|
|
26
32
|
raise NotImplementedError(
|
|
27
33
|
"RORHTTPReader downloads one file and therefore does not iterate through items"
|
|
@@ -34,15 +40,41 @@ class RORHTTPReader(BaseReader):
|
|
|
34
40
|
"RORHTTPReader does not support being chained after another reader"
|
|
35
41
|
)
|
|
36
42
|
|
|
43
|
+
# Follow the DOI to get the link of the linkset
|
|
44
|
+
dataset_doi_link = "https://doi.org/10.5281/zenodo.6347574"
|
|
45
|
+
landing_page = requests.get(dataset_doi_link, allow_redirects=True)
|
|
46
|
+
landing_page.raise_for_status()
|
|
47
|
+
|
|
37
48
|
# Call the signposting `linkset+json` endpoint for the Concept DOI (i.e. latest version) of the ROR data dump.
|
|
38
49
|
# See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
50
|
+
if "linkset" not in landing_page.links:
|
|
51
|
+
raise ReaderError("Linkset not found in the ROR dataset record.")
|
|
52
|
+
linkset_response = requests.get(
|
|
53
|
+
landing_page.links["linkset"]["url"],
|
|
54
|
+
headers={"Accept": "application/linkset+json"},
|
|
55
|
+
)
|
|
56
|
+
linkset_response.raise_for_status()
|
|
57
|
+
|
|
58
|
+
if self._since:
|
|
59
|
+
for link in linkset_response.json()["linkset"]:
|
|
60
|
+
if "type" in link and link["type"] == "application/ld+json":
|
|
61
|
+
json_ld_reponse = requests.get(
|
|
62
|
+
link["anchor"], headers={"Accept": link["type"]}
|
|
63
|
+
)
|
|
64
|
+
json_ld_reponse.raise_for_status()
|
|
65
|
+
|
|
66
|
+
# TODO Update to use dateCreated once the field is added to InvenioRDM. (https://github.com/inveniosoftware/invenio-rdm-records/issues/1777)
|
|
67
|
+
last_dump_date = json_ld_reponse.json()["datePublished"]
|
|
68
|
+
if datetime.fromisoformat(last_dump_date) < datetime.fromisoformat(
|
|
69
|
+
self._since
|
|
70
|
+
):
|
|
71
|
+
return
|
|
72
|
+
break
|
|
73
|
+
else:
|
|
74
|
+
raise ReaderError("Couldn't find json-ld in publisher's linkset.")
|
|
43
75
|
|
|
44
76
|
# Extract the Landing page Link Set Object located as the first (index 0) item.
|
|
45
|
-
landing_page_linkset =
|
|
77
|
+
landing_page_linkset = linkset_response.json()["linkset"][0]
|
|
46
78
|
|
|
47
79
|
# Extract the URL of the only ZIP file linked to the record.
|
|
48
80
|
landing_page_zip_items = [
|
|
@@ -164,3 +196,5 @@ class RORTransformer(BaseTransformer):
|
|
|
164
196
|
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
165
197
|
"ror": RORTransformer,
|
|
166
198
|
}
|
|
199
|
+
|
|
200
|
+
VOCABULARIES_DATASTREAM_WRITERS = {}
|
|
@@ -9,12 +9,11 @@
|
|
|
9
9
|
|
|
10
10
|
"""Funders datastreams, transformers, writers and readers."""
|
|
11
11
|
|
|
12
|
-
from
|
|
13
|
-
from invenio_access.permissions import system_identity
|
|
12
|
+
from flask import current_app
|
|
14
13
|
from invenio_i18n import lazy_gettext as _
|
|
15
14
|
|
|
16
15
|
from ...datastreams.writers import ServiceWriter
|
|
17
|
-
from .
|
|
16
|
+
from ..common.ror.datastreams import RORTransformer
|
|
18
17
|
|
|
19
18
|
|
|
20
19
|
class FundersServiceWriter(ServiceWriter):
|
|
@@ -30,10 +29,40 @@ class FundersServiceWriter(ServiceWriter):
|
|
|
30
29
|
return entry["id"]
|
|
31
30
|
|
|
32
31
|
|
|
32
|
+
class FundersRORTransformer(RORTransformer):
|
|
33
|
+
"""Funders ROR Transformer."""
|
|
34
|
+
|
|
35
|
+
def __init__(
|
|
36
|
+
self, *args, vocab_schemes=None, funder_fundref_doi_prefix=None, **kwargs
|
|
37
|
+
):
|
|
38
|
+
"""Constructor."""
|
|
39
|
+
if vocab_schemes is None:
|
|
40
|
+
vocab_schemes = current_app.config.get("VOCABULARIES_FUNDER_SCHEMES")
|
|
41
|
+
if funder_fundref_doi_prefix is None:
|
|
42
|
+
funder_fundref_doi_prefix = current_app.config.get(
|
|
43
|
+
"VOCABULARIES_FUNDER_DOI_PREFIX"
|
|
44
|
+
)
|
|
45
|
+
super().__init__(
|
|
46
|
+
*args,
|
|
47
|
+
vocab_schemes=vocab_schemes,
|
|
48
|
+
funder_fundref_doi_prefix=funder_fundref_doi_prefix,
|
|
49
|
+
**kwargs,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
VOCABULARIES_DATASTREAM_READERS = {}
|
|
54
|
+
"""Funders datastreams readers."""
|
|
55
|
+
|
|
33
56
|
VOCABULARIES_DATASTREAM_WRITERS = {
|
|
34
57
|
"funders-service": FundersServiceWriter,
|
|
35
58
|
}
|
|
36
|
-
"""Funders
|
|
59
|
+
"""Funders datastreams writers."""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
|
|
63
|
+
"ror-funders": FundersRORTransformer,
|
|
64
|
+
}
|
|
65
|
+
"""Funders datastreams transformers."""
|
|
37
66
|
|
|
38
67
|
|
|
39
68
|
DATASTREAM_CONFIG = {
|
|
@@ -48,18 +77,16 @@ DATASTREAM_CONFIG = {
|
|
|
48
77
|
],
|
|
49
78
|
"transformers": [
|
|
50
79
|
{
|
|
51
|
-
"type": "ror",
|
|
52
|
-
"args": {
|
|
53
|
-
"vocab_schemes": funder_schemes,
|
|
54
|
-
"funder_fundref_doi_prefix": funder_fundref_doi_prefix,
|
|
55
|
-
},
|
|
80
|
+
"type": "ror-funders",
|
|
56
81
|
},
|
|
57
82
|
],
|
|
58
83
|
"writers": [
|
|
59
84
|
{
|
|
60
|
-
"type": "
|
|
85
|
+
"type": "async",
|
|
61
86
|
"args": {
|
|
62
|
-
"
|
|
87
|
+
"writer": {
|
|
88
|
+
"type": "funders-service",
|
|
89
|
+
}
|
|
63
90
|
},
|
|
64
91
|
}
|
|
65
92
|
],
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2022 CERN.
|
|
3
|
+
# Copyright (C) 2022-2024 CERN.
|
|
4
4
|
#
|
|
5
5
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
7
7
|
# details.
|
|
8
8
|
|
|
9
9
|
"""Vocabulary funders."""
|
|
10
|
+
|
|
10
11
|
from flask_resources import (
|
|
11
12
|
BaseListSchema,
|
|
12
13
|
JSONSerializer,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# -*- coding: utf-8 -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2021-
|
|
3
|
+
# Copyright (C) 2021-2024 CERN.
|
|
4
4
|
#
|
|
5
5
|
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
@@ -8,15 +8,123 @@
|
|
|
8
8
|
|
|
9
9
|
"""Names datastreams, transformers, writers and readers."""
|
|
10
10
|
|
|
11
|
+
import csv
|
|
12
|
+
import io
|
|
13
|
+
import tarfile
|
|
14
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
15
|
+
from datetime import timedelta
|
|
16
|
+
|
|
17
|
+
import arrow
|
|
18
|
+
import regex as re
|
|
19
|
+
from flask import current_app
|
|
11
20
|
from invenio_access.permissions import system_identity
|
|
12
21
|
from invenio_records.dictutils import dict_lookup
|
|
13
22
|
|
|
23
|
+
from invenio_vocabularies.contrib.names.s3client import S3OrcidClient
|
|
24
|
+
|
|
14
25
|
from ...datastreams.errors import TransformerError
|
|
15
|
-
from ...datastreams.readers import SimpleHTTPReader
|
|
26
|
+
from ...datastreams.readers import BaseReader, SimpleHTTPReader
|
|
16
27
|
from ...datastreams.transformers import BaseTransformer
|
|
17
28
|
from ...datastreams.writers import ServiceWriter
|
|
18
29
|
|
|
19
30
|
|
|
31
|
+
class OrcidDataSyncReader(BaseReader):
|
|
32
|
+
"""ORCiD Data Sync Reader."""
|
|
33
|
+
|
|
34
|
+
def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
|
|
35
|
+
"""Constructor.
|
|
36
|
+
|
|
37
|
+
:param origin: Data source (e.g. filepath).
|
|
38
|
+
Can be none in case of piped readers.
|
|
39
|
+
"""
|
|
40
|
+
super().__init__(origin=origin, mode=mode, *args, **kwargs)
|
|
41
|
+
self.s3_client = S3OrcidClient()
|
|
42
|
+
self.since = since
|
|
43
|
+
|
|
44
|
+
def _fetch_orcid_data(self, orcid_to_sync, bucket):
|
|
45
|
+
"""Fetches a single ORCiD record from S3."""
|
|
46
|
+
# The ORCiD file key is located in a folder whose name corresponds to the last three digits of the ORCiD
|
|
47
|
+
suffix = orcid_to_sync[-3:]
|
|
48
|
+
key = f"{suffix}/{orcid_to_sync}.xml"
|
|
49
|
+
try:
|
|
50
|
+
return self.s3_client.read_file(f"s3://{bucket}/{key}")
|
|
51
|
+
except Exception as e:
|
|
52
|
+
# TODO: log
|
|
53
|
+
return None
|
|
54
|
+
|
|
55
|
+
def _process_lambda_file(self, fileobj):
|
|
56
|
+
"""Process the ORCiD lambda file and yield the ORCiDs to sync.
|
|
57
|
+
|
|
58
|
+
The decoded fileobj looks like the following:
|
|
59
|
+
orcid, path, date_created, last_modified
|
|
60
|
+
0000-0001-5109-3700, http://orcid.org/0000-0001-5109-3700, 2014-08-02 15:00:00.000,2021-08-02 15:00:00.000
|
|
61
|
+
|
|
62
|
+
Yield ORCiDs to sync until the last sync date is reached.
|
|
63
|
+
"""
|
|
64
|
+
date_format = "YYYY-MM-DD HH:mm:ss.SSSSSS"
|
|
65
|
+
date_format_no_millis = "YYYY-MM-DD HH:mm:ss"
|
|
66
|
+
time_shift = current_app.config["VOCABULARIES_ORCID_SYNC_SINCE"]
|
|
67
|
+
if self.since:
|
|
68
|
+
time_shift = self.since
|
|
69
|
+
last_sync = arrow.now() - timedelta(**time_shift)
|
|
70
|
+
|
|
71
|
+
file_content = fileobj.read().decode("utf-8")
|
|
72
|
+
|
|
73
|
+
csv_reader = csv.DictReader(file_content.splitlines())
|
|
74
|
+
|
|
75
|
+
for row in csv_reader: # The header line is consumed by DictReader as field names
|
|
76
|
+
orcid = row["orcid"]
|
|
77
|
+
|
|
78
|
+
# Lambda file is ordered by last modified date
|
|
79
|
+
last_modified_str = row["last_modified"]
|
|
80
|
+
try:
|
|
81
|
+
last_modified_date = arrow.get(last_modified_str, date_format)
|
|
82
|
+
except arrow.parser.ParserError:
|
|
83
|
+
last_modified_date = arrow.get(last_modified_str, date_format_no_millis)
|
|
84
|
+
|
|
85
|
+
if last_modified_date < last_sync:
|
|
86
|
+
break
|
|
87
|
+
yield orcid
|
|
88
|
+
|
|
89
|
+
def _iter(self, orcids):
|
|
90
|
+
"""Iterates over the ORCiD records yielding each one."""
|
|
91
|
+
with ThreadPoolExecutor(
|
|
92
|
+
max_workers=current_app.config["VOCABULARIES_ORCID_SYNC_MAX_WORKERS"]
|
|
93
|
+
) as executor:
|
|
94
|
+
futures = [
|
|
95
|
+
executor.submit(
|
|
96
|
+
self._fetch_orcid_data,
|
|
97
|
+
orcid,
|
|
98
|
+
current_app.config["VOCABULARIES_ORCID_SUMMARIES_BUCKET"],
|
|
99
|
+
)
|
|
100
|
+
for orcid in orcids
|
|
101
|
+
]
|
|
102
|
+
for future in as_completed(futures):
|
|
103
|
+
result = future.result()
|
|
104
|
+
if result is not None:
|
|
105
|
+
yield result
|
|
106
|
+
|
|
107
|
+
def read(self, item=None, *args, **kwargs):
|
|
108
|
+
"""Streams the ORCiD lambda file, processes it to get the ORCiDs to sync and yields their data."""
|
|
109
|
+
# Read the file from S3
|
|
110
|
+
tar_content = self.s3_client.read_file(
|
|
111
|
+
"s3://orcid-lambda-file/last_modified.csv.tar"
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
orcids_to_sync = []
|
|
115
|
+
# Opens tar file and process it
|
|
116
|
+
with tarfile.open(fileobj=io.BytesIO(tar_content)) as tar:
|
|
117
|
+
# Iterate over each member (file or directory) in the tar file
|
|
118
|
+
for member in tar.getmembers():
|
|
119
|
+
# Extract the file
|
|
120
|
+
extracted_file = tar.extractfile(member)
|
|
121
|
+
if extracted_file:
|
|
122
|
+
# Process the file and get the ORCiDs to sync
|
|
123
|
+
orcids_to_sync.extend(self._process_lambda_file(extracted_file))
|
|
124
|
+
|
|
125
|
+
yield from self._iter(orcids_to_sync)
|
|
126
|
+
|
|
127
|
+
|
|
20
128
|
class OrcidHTTPReader(SimpleHTTPReader):
|
|
21
129
|
"""ORCiD HTTP Reader."""
|
|
22
130
|
|
|
@@ -30,9 +138,26 @@ class OrcidHTTPReader(SimpleHTTPReader):
|
|
|
30
138
|
super().__init__(origin, *args, **kwargs)
|
|
31
139
|
|
|
32
140
|
|
|
141
|
+
DEFAULT_NAMES_EXCLUDE_REGEX = r"[\p{P}\p{S}\p{Nd}\p{No}\p{Emoji}--,.()\-']"
|
|
142
|
+
"""Regex to filter out names with punctuation, symbols, numbers and emojis (commas, periods, parentheses, hyphens and apostrophes are allowed)."""
|
|
143
|
+
|
|
144
|
+
|
|
33
145
|
class OrcidTransformer(BaseTransformer):
|
|
34
146
|
"""Transforms an ORCiD record into a names record."""
|
|
35
147
|
|
|
148
|
+
def __init__(
|
|
149
|
+
self, *args, names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX, **kwargs
|
|
150
|
+
) -> None:
|
|
151
|
+
"""Constructor."""
|
|
152
|
+
self._names_exclude_regex = names_exclude_regex
|
|
153
|
+
super().__init__()
|
|
154
|
+
|
|
155
|
+
def _is_valid_name(self, name):
|
|
156
|
+
"""Check whether the name passes the regex."""
|
|
157
|
+
if not self._names_exclude_regex:
|
|
158
|
+
return True
|
|
159
|
+
return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1))
|
|
160
|
+
|
|
36
161
|
def apply(self, stream_entry, **kwargs):
|
|
37
162
|
"""Applies the transformation to the stream entry."""
|
|
38
163
|
record = stream_entry.entry
|
|
@@ -42,6 +167,11 @@ class OrcidTransformer(BaseTransformer):
|
|
|
42
167
|
name = person.get("name")
|
|
43
168
|
if name is None:
|
|
44
169
|
raise TransformerError(f"Name not found in ORCiD entry.")
|
|
170
|
+
if name.get("family-name") is None:
|
|
171
|
+
raise TransformerError(f"Family name not found in ORCiD entry.")
|
|
172
|
+
|
|
173
|
+
if not self._is_valid_name(name["given-names"] + name["family-name"]):
|
|
174
|
+
raise TransformerError(f"Invalid characters in name.")
|
|
45
175
|
|
|
46
176
|
entry = {
|
|
47
177
|
"id": orcid_id,
|
|
@@ -89,6 +219,7 @@ class NamesServiceWriter(ServiceWriter):
|
|
|
89
219
|
|
|
90
220
|
VOCABULARIES_DATASTREAM_READERS = {
|
|
91
221
|
"orcid-http": OrcidHTTPReader,
|
|
222
|
+
"orcid-data-sync": OrcidDataSyncReader,
|
|
92
223
|
}
|
|
93
224
|
|
|
94
225
|
|
|
@@ -128,3 +259,30 @@ DATASTREAM_CONFIG = {
|
|
|
128
259
|
|
|
129
260
|
An origin is required for the reader.
|
|
130
261
|
"""
|
|
262
|
+
|
|
263
|
+
# TODO: Used on the jobs and should be set as a "PRESET" (naming to be defined)
|
|
264
|
+
ORCID_PRESET_DATASTREAM_CONFIG = {
|
|
265
|
+
"readers": [
|
|
266
|
+
{
|
|
267
|
+
"type": "orcid-data-sync",
|
|
268
|
+
},
|
|
269
|
+
{"type": "xml"},
|
|
270
|
+
],
|
|
271
|
+
"transformers": [{"type": "orcid"}],
|
|
272
|
+
"writers": [
|
|
273
|
+
{
|
|
274
|
+
"type": "async",
|
|
275
|
+
"args": {
|
|
276
|
+
"writer": {
|
|
277
|
+
"type": "names-service",
|
|
278
|
+
}
|
|
279
|
+
},
|
|
280
|
+
}
|
|
281
|
+
],
|
|
282
|
+
"batch_size": 1000,
|
|
283
|
+
"write_many": True,
|
|
284
|
+
}
|
|
285
|
+
"""ORCiD Data Stream configuration.
|
|
286
|
+
|
|
287
|
+
No origin is required: the "orcid-data-sync" reader fetches the ORCiD lambda file from S3.
|
|
288
|
+
"""
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#
|
|
3
|
+
# This file is part of Invenio.
|
|
4
|
+
# Copyright (C) 2024 CERN.
|
|
5
|
+
#
|
|
6
|
+
# Invenio-Vocabularies is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the MIT License; see LICENSE file for more
|
|
8
|
+
# details.
|
|
9
|
+
|
|
10
|
+
"""S3 client."""
|
|
11
|
+
|
|
12
|
+
from flask import current_app
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
import s3fs
|
|
16
|
+
except ImportError:
|
|
17
|
+
s3fs = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class S3Client:
|
|
21
|
+
"""S3 client."""
|
|
22
|
+
|
|
23
|
+
def __init__(self, access_key, secret_key):
|
|
24
|
+
"""Constructor."""
|
|
25
|
+
if s3fs is None:
|
|
26
|
+
raise Exception("s3fs is not installed.")
|
|
27
|
+
|
|
28
|
+
self.fs = s3fs.S3FileSystem(key=access_key, secret=secret_key)
|
|
29
|
+
|
|
30
|
+
def read_file(self, s3_path):
|
|
31
|
+
"""Reads a file from S3."""
|
|
32
|
+
with self.fs.open(s3_path, "rb") as f:
|
|
33
|
+
return f.read()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class S3OrcidClient(S3Client):
|
|
37
|
+
"""S3 ORCiD client."""
|
|
38
|
+
|
|
39
|
+
def __init__(self):
|
|
40
|
+
"""Constructor."""
|
|
41
|
+
super().__init__(
|
|
42
|
+
access_key=current_app.config["VOCABULARIES_ORCID_ACCESS_KEY"],
|
|
43
|
+
secret_key=current_app.config["VOCABULARIES_ORCID_SECRET_KEY"],
|
|
44
|
+
)
|