invenio-vocabularies 4.1.1__py2.py3-none-any.whl → 4.3.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of invenio-vocabularies might be problematic. Click here for more details.

Files changed (35)
  1. invenio_vocabularies/__init__.py +1 -1
  2. invenio_vocabularies/administration/views/vocabularies.py +1 -0
  3. invenio_vocabularies/cli.py +17 -6
  4. invenio_vocabularies/config.py +15 -1
  5. invenio_vocabularies/contrib/affiliations/api.py +1 -2
  6. invenio_vocabularies/contrib/affiliations/datastreams.py +33 -8
  7. invenio_vocabularies/contrib/affiliations/services.py +1 -2
  8. invenio_vocabularies/contrib/awards/awards.py +2 -1
  9. invenio_vocabularies/contrib/awards/datastreams.py +1 -0
  10. invenio_vocabularies/contrib/awards/services.py +1 -2
  11. invenio_vocabularies/contrib/common/ror/datastreams.py +39 -5
  12. invenio_vocabularies/contrib/funders/datastreams.py +38 -11
  13. invenio_vocabularies/contrib/funders/funders.py +2 -1
  14. invenio_vocabularies/contrib/names/datastreams.py +160 -2
  15. invenio_vocabularies/contrib/names/s3client.py +44 -0
  16. invenio_vocabularies/datastreams/datastreams.py +61 -13
  17. invenio_vocabularies/datastreams/readers.py +40 -15
  18. invenio_vocabularies/datastreams/tasks.py +37 -0
  19. invenio_vocabularies/datastreams/writers.py +70 -0
  20. invenio_vocabularies/factories.py +1 -0
  21. invenio_vocabularies/records/models.py +2 -4
  22. invenio_vocabularies/records/pidprovider.py +1 -2
  23. invenio_vocabularies/resources/__init__.py +1 -0
  24. invenio_vocabularies/resources/schema.py +2 -1
  25. invenio_vocabularies/services/custom_fields/subject.py +3 -2
  26. invenio_vocabularies/services/custom_fields/vocabulary.py +1 -1
  27. invenio_vocabularies/services/tasks.py +0 -30
  28. invenio_vocabularies/templates/semantic-ui/invenio_vocabularies/subjects.html +1 -1
  29. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/METADATA +18 -1
  30. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/RECORD +35 -33
  31. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/AUTHORS.rst +0 -0
  32. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/LICENSE +0 -0
  33. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/WHEEL +0 -0
  34. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/entry_points.txt +0 -0
  35. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/top_level.txt +0 -0
@@ -10,6 +10,6 @@
10
10
 
11
11
  from .ext import InvenioVocabularies
12
12
 
13
- __version__ = "4.1.1"
13
+ __version__ = "4.3.0"
14
14
 
15
15
  __all__ = ("__version__", "InvenioVocabularies")
@@ -8,6 +8,7 @@
8
8
  # details.
9
9
 
10
10
  """Vocabularies admin interface."""
11
+
11
12
  from invenio_administration.views.base import (
12
13
  AdminResourceEditView,
13
14
  AdminResourceListView,
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2020-2021 CERN.
3
+ # Copyright (C) 2020-2024 CERN.
4
4
  # Copyright (C) 2021 Graz University of Technology.
5
5
  #
6
6
  # Invenio-Vocabularies is free software; you can redistribute it and/or
@@ -9,7 +9,6 @@
9
9
 
10
10
  """Commands to create and manage vocabularies."""
11
11
 
12
-
13
12
  import click
14
13
  from flask.cli import with_appcontext
15
14
  from invenio_access.permissions import system_identity
@@ -101,7 +100,10 @@ def update(vocabulary, filepath=None, origin=None):
101
100
  config = vc.get_config(filepath, origin)
102
101
 
103
102
  for w_conf in config["writers"]:
104
- w_conf["args"]["update"] = True
103
+ if w_conf["type"] == "async":
104
+ w_conf["args"]["writer"]["args"]["update"] = True
105
+ else:
106
+ w_conf["args"]["update"] = True
105
107
 
106
108
  success, errored, filtered = _process_vocab(config)
107
109
 
@@ -140,18 +142,27 @@ def convert(vocabulary, filepath=None, origin=None, target=None, num_samples=Non
140
142
  type=click.STRING,
141
143
  help="Identifier of the vocabulary item to delete.",
142
144
  )
143
- @click.option("--all", is_flag=True, default=False, help="Not supported yet.")
145
+ @click.option("--all", is_flag=True, default=False)
144
146
  @with_appcontext
145
147
  def delete(vocabulary, identifier, all):
146
148
  """Delete all items or a specific one of the vocabulary."""
147
- if not id and not all:
149
+ if not identifier and not all:
148
150
  click.secho("An identifier or the --all flag must be present.", fg="red")
149
151
  exit(1)
152
+
150
153
  vc = get_vocabulary_config(vocabulary)
151
154
  service = vc.get_service()
152
155
  if identifier:
153
156
  try:
154
- if service.delete(identifier, system_identity):
157
+ if service.delete(system_identity, identifier):
155
158
  click.secho(f"{identifier} deleted from {vocabulary}.", fg="green")
156
159
  except (PIDDeletedError, PIDDoesNotExistError):
157
160
  click.secho(f"PID {identifier} not found.")
161
+ elif all:
162
+ items = service.scan(system_identity)
163
+ for item in items.hits:
164
+ try:
165
+ if service.delete(system_identity, item["id"]):
166
+ click.secho(f"{item['id']} deleted from {vocabulary}.", fg="green")
167
+ except (PIDDeletedError, PIDDoesNotExistError):
168
+ click.secho(f"PID {item['id']} not found.")
@@ -24,7 +24,7 @@ from .datastreams.readers import (
24
24
  ZipReader,
25
25
  )
26
26
  from .datastreams.transformers import XMLTransformer
27
- from .datastreams.writers import ServiceWriter, YamlWriter
27
+ from .datastreams.writers import AsyncWriter, ServiceWriter, YamlWriter
28
28
  from .resources import VocabulariesResourceConfig
29
29
  from .services.config import VocabulariesServiceConfig
30
30
 
@@ -134,6 +134,7 @@ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
134
134
  VOCABULARIES_DATASTREAM_WRITERS = {
135
135
  "service": ServiceWriter,
136
136
  "yaml": YamlWriter,
137
+ "async": AsyncWriter,
137
138
  }
138
139
  """Data Streams writers."""
139
140
 
@@ -154,3 +155,16 @@ VOCABULARIES_TYPES_SEARCH = {
154
155
  "sort": ["name", "count"],
155
156
  }
156
157
  """Vocabulary type search configuration."""
158
+
159
+ VOCABULARIES_ORCID_ACCESS_KEY = "TODO"
160
+ """ORCID access key to access the s3 bucket."""
161
+ VOCABULARIES_ORCID_SECRET_KEY = "TODO"
162
+ """ORCID secret key to access the s3 bucket."""
163
+ VOCABULARIES_ORCID_SUMMARIES_BUCKET = "v3.0-summaries"
164
+ """ORCID summaries bucket name."""
165
+ VOCABULARIES_ORCID_SYNC_MAX_WORKERS = 32
166
+ """ORCID max number of simultaneous workers/connections."""
167
+ VOCABULARIES_ORCID_SYNC_SINCE = {
168
+ "days": 1,
169
+ }
170
+ """ORCID time shift to sync. Parameters accepted are the ones passed to 'datetime.timedelta'."""
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,7 +8,6 @@
8
8
 
9
9
  """Vocabulary affiliations."""
10
10
 
11
-
12
11
  from .affiliations import record_type
13
12
 
14
13
  Affiliation = record_type.record_cls
@@ -9,11 +9,11 @@
9
9
 
10
10
  """Affiliations datastreams, transformers, writers and readers."""
11
11
 
12
- from invenio_access.permissions import system_identity
12
+ from flask import current_app
13
13
  from invenio_i18n import lazy_gettext as _
14
14
 
15
15
  from ...datastreams.writers import ServiceWriter
16
- from .config import affiliation_schemes
16
+ from ..common.ror.datastreams import RORTransformer
17
17
 
18
18
 
19
19
  class AffiliationsServiceWriter(ServiceWriter):
@@ -29,9 +29,35 @@ class AffiliationsServiceWriter(ServiceWriter):
29
29
  return entry["id"]
30
30
 
31
31
 
32
+ class AffiliationsRORTransformer(RORTransformer):
33
+ """Affiliations ROR Transformer."""
34
+
35
+ def __init__(
36
+ self, *args, vocab_schemes=None, funder_fundref_doi_prefix=None, **kwargs
37
+ ):
38
+ """Constructor."""
39
+ if vocab_schemes is None:
40
+ vocab_schemes = current_app.config.get("VOCABULARIES_AFFILIATION_SCHEMES")
41
+ super().__init__(
42
+ *args,
43
+ vocab_schemes=vocab_schemes,
44
+ funder_fundref_doi_prefix=funder_fundref_doi_prefix,
45
+ **kwargs,
46
+ )
47
+
48
+
49
+ VOCABULARIES_DATASTREAM_READERS = {}
50
+ """Affiliations datastream readers."""
51
+
32
52
  VOCABULARIES_DATASTREAM_WRITERS = {
33
53
  "affiliations-service": AffiliationsServiceWriter,
34
54
  }
55
+ """Affiliations datastream writers."""
56
+
57
+ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
58
+ "ror-affiliations": AffiliationsRORTransformer,
59
+ }
60
+ """Affiliations datastream transformers."""
35
61
 
36
62
 
37
63
  DATASTREAM_CONFIG = {
@@ -46,17 +72,16 @@ DATASTREAM_CONFIG = {
46
72
  ],
47
73
  "transformers": [
48
74
  {
49
- "type": "ror",
50
- "args": {
51
- "vocab_schemes": affiliation_schemes,
52
- },
75
+ "type": "ror-affiliations",
53
76
  },
54
77
  ],
55
78
  "writers": [
56
79
  {
57
- "type": "affiliations-service",
80
+ "type": "async",
58
81
  "args": {
59
- "identity": system_identity,
82
+ "writer": {
83
+ "type": "affiliations-service",
84
+ }
60
85
  },
61
86
  }
62
87
  ],
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,7 +8,6 @@
8
8
 
9
9
  """Vocabulary affiliations."""
10
10
 
11
-
12
11
  from .affiliations import record_type
13
12
 
14
13
  AffiliationsServiceConfig = record_type.service_config_cls
@@ -1,12 +1,13 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2022 CERN.
3
+ # Copyright (C) 2022-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
7
7
  # details.
8
8
 
9
9
  """Vocabulary awards."""
10
+
10
11
  from flask_resources import (
11
12
  BaseListSchema,
12
13
  JSONSerializer,
@@ -7,6 +7,7 @@
7
7
  # details.
8
8
 
9
9
  """Awards datastreams, transformers, writers and readers."""
10
+
10
11
  import io
11
12
 
12
13
  import requests
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2022 CERN.
3
+ # Copyright (C) 2022-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,7 +8,6 @@
8
8
 
9
9
  """Vocabulary awards."""
10
10
 
11
-
12
11
  from .awards import record_type
13
12
 
14
13
  AwardsServiceConfig = record_type.service_config_cls
@@ -10,6 +10,7 @@
10
10
  """ROR-related Datastreams Readers/Writers/Transformers module."""
11
11
 
12
12
  import io
13
+ from datetime import datetime
13
14
 
14
15
  import requests
15
16
  from idutils import normalize_ror
@@ -22,6 +23,11 @@ from invenio_vocabularies.datastreams.transformers import BaseTransformer
22
23
  class RORHTTPReader(BaseReader):
23
24
  """ROR HTTP Reader returning an in-memory binary stream of the latest ROR data dump ZIP file."""
24
25
 
26
+ def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
27
+ """Constructor."""
28
+ self._since = since
29
+ super().__init__(origin, mode, *args, **kwargs)
30
+
25
31
  def _iter(self, fp, *args, **kwargs):
26
32
  raise NotImplementedError(
27
33
  "RORHTTPReader downloads one file and therefore does not iterate through items"
@@ -34,15 +40,41 @@ class RORHTTPReader(BaseReader):
34
40
  "RORHTTPReader does not support being chained after another reader"
35
41
  )
36
42
 
43
+ # Follow the DOI to get the link of the linkset
44
+ dataset_doi_link = "https://doi.org/10.5281/zenodo.6347574"
45
+ landing_page = requests.get(dataset_doi_link, allow_redirects=True)
46
+ landing_page.raise_for_status()
47
+
37
48
  # Call the signposting `linkset+json` endpoint for the Concept DOI (i.e. latest version) of the ROR data dump.
38
49
  # See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint
39
- headers = {"Accept": "application/linkset+json"}
40
- api_url = "https://zenodo.org/api/records/6347574"
41
- api_resp = requests.get(api_url, headers=headers)
42
- api_resp.raise_for_status()
50
+ if "linkset" not in landing_page.links:
51
+ raise ReaderError("Linkset not found in the ROR dataset record.")
52
+ linkset_response = requests.get(
53
+ landing_page.links["linkset"]["url"],
54
+ headers={"Accept": "application/linkset+json"},
55
+ )
56
+ linkset_response.raise_for_status()
57
+
58
+ if self._since:
59
+ for link in linkset_response.json()["linkset"]:
60
+ if "type" in link and link["type"] == "application/ld+json":
61
+ json_ld_reponse = requests.get(
62
+ link["anchor"], headers={"Accept": link["type"]}
63
+ )
64
+ json_ld_reponse.raise_for_status()
65
+
66
+ # TODO Update to use dateCreated once the field is added to InvenioRDM. (https://github.com/inveniosoftware/invenio-rdm-records/issues/1777)
67
+ last_dump_date = json_ld_reponse.json()["datePublished"]
68
+ if datetime.fromisoformat(last_dump_date) < datetime.fromisoformat(
69
+ self._since
70
+ ):
71
+ return
72
+ break
73
+ else:
74
+ raise ReaderError("Couldn't find json-ld in publisher's linkset.")
43
75
 
44
76
  # Extract the Landing page Link Set Object located as the first (index 0) item.
45
- landing_page_linkset = api_resp.json()["linkset"][0]
77
+ landing_page_linkset = linkset_response.json()["linkset"][0]
46
78
 
47
79
  # Extract the URL of the only ZIP file linked to the record.
48
80
  landing_page_zip_items = [
@@ -164,3 +196,5 @@ class RORTransformer(BaseTransformer):
164
196
  VOCABULARIES_DATASTREAM_TRANSFORMERS = {
165
197
  "ror": RORTransformer,
166
198
  }
199
+
200
+ VOCABULARIES_DATASTREAM_WRITERS = {}
@@ -9,12 +9,11 @@
9
9
 
10
10
  """Funders datastreams, transformers, writers and readers."""
11
11
 
12
- from idutils import normalize_ror
13
- from invenio_access.permissions import system_identity
12
+ from flask import current_app
14
13
  from invenio_i18n import lazy_gettext as _
15
14
 
16
15
  from ...datastreams.writers import ServiceWriter
17
- from .config import funder_fundref_doi_prefix, funder_schemes
16
+ from ..common.ror.datastreams import RORTransformer
18
17
 
19
18
 
20
19
  class FundersServiceWriter(ServiceWriter):
@@ -30,10 +29,40 @@ class FundersServiceWriter(ServiceWriter):
30
29
  return entry["id"]
31
30
 
32
31
 
32
+ class FundersRORTransformer(RORTransformer):
33
+ """Funders ROR Transformer."""
34
+
35
+ def __init__(
36
+ self, *args, vocab_schemes=None, funder_fundref_doi_prefix=None, **kwargs
37
+ ):
38
+ """Constructor."""
39
+ if vocab_schemes is None:
40
+ vocab_schemes = current_app.config.get("VOCABULARIES_FUNDER_SCHEMES")
41
+ if funder_fundref_doi_prefix is None:
42
+ funder_fundref_doi_prefix = current_app.config.get(
43
+ "VOCABULARIES_FUNDER_DOI_PREFIX"
44
+ )
45
+ super().__init__(
46
+ *args,
47
+ vocab_schemes=vocab_schemes,
48
+ funder_fundref_doi_prefix=funder_fundref_doi_prefix,
49
+ **kwargs,
50
+ )
51
+
52
+
53
+ VOCABULARIES_DATASTREAM_READERS = {}
54
+ """Funders datastreams readers."""
55
+
33
56
  VOCABULARIES_DATASTREAM_WRITERS = {
34
57
  "funders-service": FundersServiceWriter,
35
58
  }
36
- """Funders Data Streams transformers."""
59
+ """Funders datastreams writers."""
60
+
61
+
62
+ VOCABULARIES_DATASTREAM_TRANSFORMERS = {
63
+ "ror-funders": FundersRORTransformer,
64
+ }
65
+ """Funders datastreams transformers."""
37
66
 
38
67
 
39
68
  DATASTREAM_CONFIG = {
@@ -48,18 +77,16 @@ DATASTREAM_CONFIG = {
48
77
  ],
49
78
  "transformers": [
50
79
  {
51
- "type": "ror",
52
- "args": {
53
- "vocab_schemes": funder_schemes,
54
- "funder_fundref_doi_prefix": funder_fundref_doi_prefix,
55
- },
80
+ "type": "ror-funders",
56
81
  },
57
82
  ],
58
83
  "writers": [
59
84
  {
60
- "type": "funders-service",
85
+ "type": "async",
61
86
  "args": {
62
- "identity": system_identity,
87
+ "writer": {
88
+ "type": "funders-service",
89
+ }
63
90
  },
64
91
  }
65
92
  ],
@@ -1,12 +1,13 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2022 CERN.
3
+ # Copyright (C) 2022-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
7
7
  # details.
8
8
 
9
9
  """Vocabulary funders."""
10
+
10
11
  from flask_resources import (
11
12
  BaseListSchema,
12
13
  JSONSerializer,
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021-2022 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,15 +8,123 @@
8
8
 
9
9
  """Names datastreams, transformers, writers and readers."""
10
10
 
11
+ import csv
12
+ import io
13
+ import tarfile
14
+ from concurrent.futures import ThreadPoolExecutor, as_completed
15
+ from datetime import timedelta
16
+
17
+ import arrow
18
+ import regex as re
19
+ from flask import current_app
11
20
  from invenio_access.permissions import system_identity
12
21
  from invenio_records.dictutils import dict_lookup
13
22
 
23
+ from invenio_vocabularies.contrib.names.s3client import S3OrcidClient
24
+
14
25
  from ...datastreams.errors import TransformerError
15
- from ...datastreams.readers import SimpleHTTPReader
26
+ from ...datastreams.readers import BaseReader, SimpleHTTPReader
16
27
  from ...datastreams.transformers import BaseTransformer
17
28
  from ...datastreams.writers import ServiceWriter
18
29
 
19
30
 
31
+ class OrcidDataSyncReader(BaseReader):
32
+ """ORCiD Data Sync Reader."""
33
+
34
+ def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
35
+ """Constructor.
36
+
37
+ :param origin: Data source (e.g. filepath).
38
+ Can be none in case of piped readers.
39
+ """
40
+ super().__init__(origin=origin, mode=mode, *args, **kwargs)
41
+ self.s3_client = S3OrcidClient()
42
+ self.since = since
43
+
44
+ def _fetch_orcid_data(self, orcid_to_sync, bucket):
45
+ """Fetches a single ORCiD record from S3."""
46
+ # The ORCiD file key is located in a folder which name corresponds to the last three digits of the ORCiD
47
+ suffix = orcid_to_sync[-3:]
48
+ key = f"{suffix}/{orcid_to_sync}.xml"
49
+ try:
50
+ return self.s3_client.read_file(f"s3://{bucket}/{key}")
51
+ except Exception as e:
52
+ # TODO: log
53
+ return None
54
+
55
+ def _process_lambda_file(self, fileobj):
56
+ """Process the ORCiD lambda file and returns a list of ORCiDs to sync.
57
+
58
+ The decoded fileobj looks like the following:
59
+ orcid, path, date_created, last_modified
60
+ 0000-0001-5109-3700, http://orcid.org/0000-0001-5109-3700, 2014-08-02 15:00:00.000,2021-08-02 15:00:00.000
61
+
62
+ Yield ORCiDs to sync until the last sync date is reached.
63
+ """
64
+ date_format = "YYYY-MM-DD HH:mm:ss.SSSSSS"
65
+ date_format_no_millis = "YYYY-MM-DD HH:mm:ss"
66
+ time_shift = current_app.config["VOCABULARIES_ORCID_SYNC_SINCE"]
67
+ if self.since:
68
+ time_shift = self.since
69
+ last_sync = arrow.now() - timedelta(**time_shift)
70
+
71
+ file_content = fileobj.read().decode("utf-8")
72
+
73
+ csv_reader = csv.DictReader(file_content.splitlines())
74
+
75
+ for row in csv_reader: # Skip the header line
76
+ orcid = row["orcid"]
77
+
78
+ # Lambda file is ordered by last modified date
79
+ last_modified_str = row["last_modified"]
80
+ try:
81
+ last_modified_date = arrow.get(last_modified_str, date_format)
82
+ except arrow.parser.ParserError:
83
+ last_modified_date = arrow.get(last_modified_str, date_format_no_millis)
84
+
85
+ if last_modified_date < last_sync:
86
+ break
87
+ yield orcid
88
+
89
+ def _iter(self, orcids):
90
+ """Iterates over the ORCiD records yielding each one."""
91
+ with ThreadPoolExecutor(
92
+ max_workers=current_app.config["VOCABULARIES_ORCID_SYNC_MAX_WORKERS"]
93
+ ) as executor:
94
+ futures = [
95
+ executor.submit(
96
+ self._fetch_orcid_data,
97
+ orcid,
98
+ current_app.config["VOCABULARIES_ORCID_SUMMARIES_BUCKET"],
99
+ )
100
+ for orcid in orcids
101
+ ]
102
+ for future in as_completed(futures):
103
+ result = future.result()
104
+ if result is not None:
105
+ yield result
106
+
107
+ def read(self, item=None, *args, **kwargs):
108
+ """Streams the ORCiD lambda file, processes it to get the ORCiDs to sync and yields their data."""
109
+ # Read the file from S3
110
+ tar_content = self.s3_client.read_file(
111
+ "s3://orcid-lambda-file/last_modified.csv.tar"
112
+ )
113
+
114
+ orcids_to_sync = []
115
+ # Opens tar file and process it
116
+ with tarfile.open(fileobj=io.BytesIO(tar_content)) as tar:
117
+ # Iterate over each member (file or directory) in the tar file
118
+ for member in tar.getmembers():
119
+ # Extract the file
120
+ extracted_file = tar.extractfile(member)
121
+ if extracted_file:
122
+ # Process the file and get the ORCiDs to sync
123
+ orcids_to_sync.extend(self._process_lambda_file(extracted_file))
124
+
125
+ yield from self._iter(orcids_to_sync)
126
+
127
+
20
128
  class OrcidHTTPReader(SimpleHTTPReader):
21
129
  """ORCiD HTTP Reader."""
22
130
 
@@ -30,9 +138,26 @@ class OrcidHTTPReader(SimpleHTTPReader):
30
138
  super().__init__(origin, *args, **kwargs)
31
139
 
32
140
 
141
+ DEFAULT_NAMES_EXCLUDE_REGEX = r"[\p{P}\p{S}\p{Nd}\p{No}\p{Emoji}--,.()\-']"
142
+ """Regex to filter out names with punctuation, symbols, decimal numbers and emojis."""
143
+
144
+
33
145
  class OrcidTransformer(BaseTransformer):
34
146
  """Transforms an ORCiD record into a names record."""
35
147
 
148
+ def __init__(
149
+ self, *args, names_exclude_regex=DEFAULT_NAMES_EXCLUDE_REGEX, **kwargs
150
+ ) -> None:
151
+ """Constructor."""
152
+ self._names_exclude_regex = names_exclude_regex
153
+ super().__init__()
154
+
155
+ def _is_valid_name(self, name):
156
+ """Check whether the name passes the regex."""
157
+ if not self._names_exclude_regex:
158
+ return True
159
+ return not bool(re.search(self._names_exclude_regex, name, re.UNICODE | re.V1))
160
+
36
161
  def apply(self, stream_entry, **kwargs):
37
162
  """Applies the transformation to the stream entry."""
38
163
  record = stream_entry.entry
@@ -42,6 +167,11 @@ class OrcidTransformer(BaseTransformer):
42
167
  name = person.get("name")
43
168
  if name is None:
44
169
  raise TransformerError(f"Name not found in ORCiD entry.")
170
+ if name.get("family-name") is None:
171
+ raise TransformerError(f"Family name not found in ORCiD entry.")
172
+
173
+ if not self._is_valid_name(name["given-names"] + name["family-name"]):
174
+ raise TransformerError(f"Invalid characters in name.")
45
175
 
46
176
  entry = {
47
177
  "id": orcid_id,
@@ -89,6 +219,7 @@ class NamesServiceWriter(ServiceWriter):
89
219
 
90
220
  VOCABULARIES_DATASTREAM_READERS = {
91
221
  "orcid-http": OrcidHTTPReader,
222
+ "orcid-data-sync": OrcidDataSyncReader,
92
223
  }
93
224
 
94
225
 
@@ -128,3 +259,30 @@ DATASTREAM_CONFIG = {
128
259
 
129
260
  An origin is required for the reader.
130
261
  """
262
+
263
+ # TODO: Used on the jobs and should be set as a "PRESET" (naming to be defined)
264
+ ORCID_PRESET_DATASTREAM_CONFIG = {
265
+ "readers": [
266
+ {
267
+ "type": "orcid-data-sync",
268
+ },
269
+ {"type": "xml"},
270
+ ],
271
+ "transformers": [{"type": "orcid"}],
272
+ "writers": [
273
+ {
274
+ "type": "async",
275
+ "args": {
276
+ "writer": {
277
+ "type": "names-service",
278
+ }
279
+ },
280
+ }
281
+ ],
282
+ "batch_size": 1000,
283
+ "write_many": True,
284
+ }
285
+ """ORCiD Data Stream configuration.
286
+
287
+ An origin is required for the reader.
288
+ """
@@ -0,0 +1,44 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # This file is part of Invenio.
4
+ # Copyright (C) 2024 CERN.
5
+ #
6
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
7
+ # modify it under the terms of the MIT License; see LICENSE file for more
8
+ # details.
9
+
10
+ """S3 client."""
11
+
12
+ from flask import current_app
13
+
14
+ try:
15
+ import s3fs
16
+ except ImportError:
17
+ s3fs = None
18
+
19
+
20
+ class S3Client:
21
+ """S3 client."""
22
+
23
+ def __init__(self, access_key, secret_key):
24
+ """Constructor."""
25
+ if s3fs is None:
26
+ raise Exception("s3fs is not installed.")
27
+
28
+ self.fs = s3fs.S3FileSystem(key=access_key, secret=secret_key)
29
+
30
+ def read_file(self, s3_path):
31
+ """Reads a file from S3."""
32
+ with self.fs.open(s3_path, "rb") as f:
33
+ return f.read()
34
+
35
+
36
+ class S3OrcidClient(S3Client):
37
+ """S3 ORCiD client."""
38
+
39
+ def __init__(self):
40
+ """Constructor."""
41
+ super().__init__(
42
+ access_key=current_app.config["VOCABULARIES_ORCID_ACCESS_KEY"],
43
+ secret_key=current_app.config["VOCABULARIES_ORCID_SECRET_KEY"],
44
+ )