invenio-vocabularies 4.1.1__py2.py3-none-any.whl → 4.3.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of invenio-vocabularies might be problematic. Click here for more details.

Files changed (35) hide show
  1. invenio_vocabularies/__init__.py +1 -1
  2. invenio_vocabularies/administration/views/vocabularies.py +1 -0
  3. invenio_vocabularies/cli.py +17 -6
  4. invenio_vocabularies/config.py +15 -1
  5. invenio_vocabularies/contrib/affiliations/api.py +1 -2
  6. invenio_vocabularies/contrib/affiliations/datastreams.py +33 -8
  7. invenio_vocabularies/contrib/affiliations/services.py +1 -2
  8. invenio_vocabularies/contrib/awards/awards.py +2 -1
  9. invenio_vocabularies/contrib/awards/datastreams.py +1 -0
  10. invenio_vocabularies/contrib/awards/services.py +1 -2
  11. invenio_vocabularies/contrib/common/ror/datastreams.py +39 -5
  12. invenio_vocabularies/contrib/funders/datastreams.py +38 -11
  13. invenio_vocabularies/contrib/funders/funders.py +2 -1
  14. invenio_vocabularies/contrib/names/datastreams.py +160 -2
  15. invenio_vocabularies/contrib/names/s3client.py +44 -0
  16. invenio_vocabularies/datastreams/datastreams.py +61 -13
  17. invenio_vocabularies/datastreams/readers.py +40 -15
  18. invenio_vocabularies/datastreams/tasks.py +37 -0
  19. invenio_vocabularies/datastreams/writers.py +70 -0
  20. invenio_vocabularies/factories.py +1 -0
  21. invenio_vocabularies/records/models.py +2 -4
  22. invenio_vocabularies/records/pidprovider.py +1 -2
  23. invenio_vocabularies/resources/__init__.py +1 -0
  24. invenio_vocabularies/resources/schema.py +2 -1
  25. invenio_vocabularies/services/custom_fields/subject.py +3 -2
  26. invenio_vocabularies/services/custom_fields/vocabulary.py +1 -1
  27. invenio_vocabularies/services/tasks.py +0 -30
  28. invenio_vocabularies/templates/semantic-ui/invenio_vocabularies/subjects.html +1 -1
  29. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/METADATA +18 -1
  30. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/RECORD +35 -33
  31. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/AUTHORS.rst +0 -0
  32. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/LICENSE +0 -0
  33. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/WHEEL +0 -0
  34. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/entry_points.txt +0 -0
  35. {invenio_vocabularies-4.1.1.dist-info → invenio_vocabularies-4.3.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021-2022 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,17 +8,41 @@
8
8
 
9
9
  """Base data stream."""
10
10
 
11
+ from flask import current_app
12
+
11
13
  from .errors import ReaderError, TransformerError, WriterError
12
14
 
13
15
 
14
16
  class StreamEntry:
15
17
  """Object to encapsulate streams processing."""
16
18
 
17
- def __init__(self, entry, errors=None):
18
- """Constructor."""
19
+ def __init__(self, entry, record=None, errors=None, op_type=None, exc=None):
20
+ """Constructor for the StreamEntry class.
21
+
22
+ :param entry (object): The entry object, usually a record dict.
23
+ :param record (object): The record object, usually a record class.
24
+ :param errors (list, optional): List of errors. Defaults to None.
25
+ :param op_type (str, optional): The operation type. Defaults to None.
26
+ :param exc (str, optional): The raised unhandled exception. Defaults to None.
27
+ """
19
28
  self.entry = entry
29
+ self.record = record
20
30
  self.filtered = False
21
31
  self.errors = errors or []
32
+ self.op_type = op_type
33
+ self.exc = exc
34
+
35
+ def log_errors(self, logger=None):
36
+ """Log the errors using the provided logger or the default logger.
37
+
38
+ :param logger (logging.Logger, optional): Logger instance to use. Defaults to None.
39
+ """
40
+ if logger is None:
41
+ logger = current_app.logger
42
+ for error in self.errors:
43
+ logger.error(f"Error in entry {self.entry}: {error}")
44
+ if self.exc:
45
+ logger.error(f"Exception in entry {self.entry}: {self.exc}")
22
46
 
23
47
 
24
48
  class DataStream:
@@ -39,15 +63,10 @@ class DataStream:
39
63
  """Checks if a stream_entry should be filtered out (skipped)."""
40
64
  return False
41
65
 
42
- def process(self, *args, **kwargs):
43
- """Iterates over the entries.
44
-
45
- Uses the reader to get the raw entries and transforms them.
46
- It will iterate over the `StreamEntry` objects returned by
47
- the reader, apply the transformations and yield the result of
48
- writing it.
49
- """
50
- for stream_entry in self.read():
66
+ def process_batch(self, batch, write_many=False):
67
+ """Process a batch of entries."""
68
+ transformed_entries = []
69
+ for stream_entry in batch:
51
70
  if stream_entry.errors:
52
71
  yield stream_entry # reading errors
53
72
  else:
@@ -58,7 +77,31 @@ class DataStream:
58
77
  transformed_entry.filtered = True
59
78
  yield transformed_entry
60
79
  else:
61
- yield self.write(transformed_entry)
80
+ transformed_entries.append(transformed_entry)
81
+ if transformed_entries:
82
+ if write_many:
83
+ yield from self.batch_write(transformed_entries)
84
+ else:
85
+ yield from (self.write(entry) for entry in transformed_entries)
86
+
87
+ def process(self, batch_size=100, write_many=False, *args, **kwargs):
88
+ """Iterates over the entries.
89
+
90
+ Uses the reader to get the raw entries and transforms them.
91
+ It will iterate over the `StreamEntry` objects returned by
92
+ the reader, apply the transformations and yield the result of
93
+ writing it.
94
+ """
95
+ batch = []
96
+ for stream_entry in self.read():
97
+ batch.append(stream_entry)
98
+ if len(batch) >= batch_size:
99
+ yield from self.process_batch(batch, write_many=write_many)
100
+ batch = []
101
+
102
+ # Process any remaining entries in the last batch
103
+ if batch:
104
+ yield from self.process_batch(batch, write_many=write_many)
62
105
 
63
106
  def read(self):
64
107
  """Recursively read the entries."""
@@ -107,6 +150,11 @@ class DataStream:
107
150
 
108
151
  return stream_entry
109
152
 
153
+ def batch_write(self, stream_entries, *args, **kwargs):
154
+ """Write the given stream entries with each configured writer. Errors are handled in the service layer."""
155
+ for writer in self._writers:
156
+ yield from writer.write_many(stream_entries)
157
+
110
158
  def total(self, *args, **kwargs):
111
159
  """The total of entries obtained from the origin."""
112
160
  raise NotImplementedError()
@@ -21,6 +21,7 @@ from json.decoder import JSONDecodeError
21
21
  import requests
22
22
  import yaml
23
23
  from lxml import etree
24
+ from lxml.html import fromstring
24
25
  from lxml.html import parse as html_parse
25
26
 
26
27
  from .errors import ReaderError
@@ -226,8 +227,13 @@ class XMLReader(BaseReader):
226
227
  def _iter(self, fp, *args, **kwargs):
227
228
  """Read and parse an XML file to dict."""
228
229
  # NOTE: We parse HTML, to skip XML validation and strip XML namespaces
229
- xml_tree = html_parse(fp).getroot()
230
- record = etree_to_dict(xml_tree)["html"]["body"].get("record")
230
+ record = None
231
+ try:
232
+ xml_tree = fromstring(fp)
233
+ record = etree_to_dict(xml_tree).get("record")
234
+ except Exception as e:
235
+ xml_tree = html_parse(fp).getroot()
236
+ record = etree_to_dict(xml_tree)["html"]["body"].get("record")
231
237
 
232
238
  if not record:
233
239
  raise ReaderError(f"Record not found in XML entry.")
@@ -270,19 +276,38 @@ class OAIPMHReader(BaseReader):
270
276
  self.xml.find(f".//{self._oai_namespace}metadata").getchildren()[0],
271
277
  )
272
278
 
273
- scythe.class_mapping["ListRecords"] = OAIRecord
274
- try:
275
- records = scythe.list_records(
276
- from_=self._from,
277
- until=self._until,
278
- metadata_prefix=self._metadata_prefix,
279
- set_=self._set,
280
- ignore_deleted=True,
281
- )
282
- for record in records:
283
- yield {"record": record}
284
- except oaipmh_scythe.NoRecordsMatch:
285
- raise ReaderError(f"No records found in OAI-PMH request.")
279
+ if self._verb == "ListRecords":
280
+ scythe.class_mapping["ListRecords"] = OAIRecord
281
+ try:
282
+ records = scythe.list_records(
283
+ from_=self._from,
284
+ until=self._until,
285
+ metadata_prefix=self._metadata_prefix,
286
+ set_=self._set,
287
+ ignore_deleted=True,
288
+ )
289
+ for record in records:
290
+ yield {"record": record}
291
+ except oaipmh_scythe.NoRecordsMatch:
292
+ raise ReaderError("No records found in OAI-PMH request.")
293
+ else:
294
+ scythe.class_mapping["GetRecord"] = OAIRecord
295
+ try:
296
+ headers = scythe.list_identifiers(
297
+ from_=self._from,
298
+ until=self._until,
299
+ metadata_prefix=self._metadata_prefix,
300
+ set_=self._set,
301
+ ignore_deleted=True,
302
+ )
303
+ for header in headers:
304
+ record = scythe.get_record(
305
+ identifier=header.identifier,
306
+ metadata_prefix=self._metadata_prefix,
307
+ )
308
+ yield {"record": record}
309
+ except oaipmh_scythe.NoRecordsMatch:
310
+ raise ReaderError("No records found in OAI-PMH request.")
286
311
 
287
312
  def read(self, item=None, *args, **kwargs):
288
313
  """Reads from item or opens the file descriptor from origin."""
@@ -0,0 +1,37 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2022-2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+
9
+ """Data Streams Celery tasks."""
10
+
11
+ from celery import shared_task
12
+
13
+ from ..datastreams import StreamEntry
14
+ from ..datastreams.factories import WriterFactory
15
+
16
+
17
+ @shared_task(ignore_result=True)
18
+ def write_entry(writer_config, entry):
19
+ """Write an entry.
20
+
21
+ :param writer_config: writer configuration as accepted by the WriterFactory.
22
+ :param entry: dictionary, StreamEntry is not serializable.
23
+ """
24
+ writer = WriterFactory.create(config=writer_config)
25
+ writer.write(StreamEntry(entry))
26
+
27
+
28
+ @shared_task(ignore_result=True)
29
+ def write_many_entry(writer_config, entries):
30
+ """Write many entries.
31
+
32
+ :param writer_config: writer configuration as accepted by the WriterFactory.
33
+ :param entries: list of dictionaries, StreamEntry is not serializable.
34
+ """
35
+ writer = WriterFactory.create(config=writer_config)
36
+ stream_entries = [StreamEntry(entry) for entry in entries]
37
+ writer.write_many(stream_entries)
@@ -20,11 +20,17 @@ from marshmallow import ValidationError
20
20
 
21
21
  from .datastreams import StreamEntry
22
22
  from .errors import WriterError
23
+ from .tasks import write_entry, write_many_entry
23
24
 
24
25
 
25
26
  class BaseWriter(ABC):
26
27
  """Base writer."""
27
28
 
29
+ def __init__(self, *args, **kwargs):
30
+ """Base initialization logic."""
31
+ # Add any base initialization here if needed
32
+ pass
33
+
28
34
  @abstractmethod
29
35
  def write(self, stream_entry, *args, **kwargs):
30
36
  """Writes the input stream entry to the target output.
@@ -35,6 +41,16 @@ class BaseWriter(ABC):
35
41
  """
36
42
  pass
37
43
 
44
+ @abstractmethod
45
+ def write_many(self, stream_entries, *args, **kwargs):
46
+ """Writes the input streams entry to the target output.
47
+
48
+ :returns: A List of StreamEntry. The result of writing the entry.
49
+ Raises WriterException in case of errors.
50
+
51
+ """
52
+ pass
53
+
38
54
 
39
55
  class ServiceWriter(BaseWriter):
40
56
  """Writes the entries to an RDM instance using a Service object."""
@@ -85,6 +101,25 @@ class ServiceWriter(BaseWriter):
85
101
  # TODO: Check if we can get the error message easier
86
102
  raise WriterError([{"InvalidRelationValue": err.args[0]}])
87
103
 
104
+ def write_many(self, stream_entries, *args, **kwargs):
105
+ """Writes the input entries using a given service."""
106
+ entries = [entry.entry for entry in stream_entries]
107
+ entries_with_id = [(self._entry_id(entry), entry) for entry in entries]
108
+ results = self._service.create_or_update_many(self._identity, entries_with_id)
109
+ stream_entries_processed = []
110
+ for entry, result in zip(entries, results):
111
+ processed_stream_entry = StreamEntry(
112
+ entry=entry,
113
+ record=result.record,
114
+ errors=result.errors,
115
+ op_type=result.op_type,
116
+ exc=result.exc,
117
+ )
118
+ processed_stream_entry.log_errors()
119
+ stream_entries_processed.append(processed_stream_entry)
120
+
121
+ return stream_entries_processed
122
+
88
123
 
89
124
  class YamlWriter(BaseWriter):
90
125
  """Writes the entries to a YAML file."""
@@ -106,3 +141,38 @@ class YamlWriter(BaseWriter):
106
141
  yaml.safe_dump([stream_entry.entry], file, allow_unicode=True)
107
142
 
108
143
  return stream_entry
144
+
145
+ def write_many(self, stream_entries, *args, **kwargs):
146
+ """Writes the yaml input entries."""
147
+ with open(self._filepath, "a") as file:
148
+ yaml.safe_dump(
149
+ [stream_entry.entry for stream_entry in stream_entries],
150
+ file,
151
+ allow_unicode=True,
152
+ )
153
+
154
+
155
+ class AsyncWriter(BaseWriter):
156
+ """Writes the entries asynchronously (celery task)."""
157
+
158
+ def __init__(self, writer, *args, **kwargs):
159
+ """Constructor.
160
+
161
+ :param writer: writer to use.
162
+ """
163
+ super().__init__(*args, **kwargs)
164
+ self._writer = writer
165
+
166
+ def write(self, stream_entry, *args, **kwargs):
167
+ """Launches a celery task to write an entry."""
168
+ write_entry.delay(self._writer, stream_entry.entry)
169
+
170
+ return stream_entry
171
+
172
+ def write_many(self, stream_entries, *args, **kwargs):
173
+ """Launches a celery task to write many entries."""
174
+ write_many_entry.delay(
175
+ self._writer, [stream_entry.entry for stream_entry in stream_entries]
176
+ )
177
+
178
+ return stream_entries
@@ -7,6 +7,7 @@
7
7
  # modify it under the terms of the MIT License; see LICENSE file for more
8
8
  # details.
9
9
  """Generate Vocabulary Config."""
10
+
10
11
  from copy import deepcopy
11
12
 
12
13
  import yaml
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2020-2021 CERN.
3
+ # Copyright (C) 2020-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -79,9 +79,7 @@ class VocabularyScheme(db.Model):
79
79
  """Create a new vocabulary subtype."""
80
80
  banned = [",", ":"]
81
81
  for b in banned:
82
- assert (
83
- b not in data["id"]
84
- ), f"No '{b}' allowed in VocabularyScheme.id" # noqa
82
+ assert b not in data["id"], f"No '{b}' allowed in VocabularyScheme.id"
85
83
 
86
84
  with db.session.begin_nested():
87
85
  obj = cls(**data)
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,7 +8,6 @@
8
8
 
9
9
  """Persistent identifier provider for vocabularies."""
10
10
 
11
-
12
11
  from invenio_pidstore.models import PIDStatus
13
12
  from invenio_pidstore.providers.base import BaseProvider
14
13
 
@@ -7,6 +7,7 @@
7
7
  # details.
8
8
 
9
9
  """Resources module."""
10
+
10
11
  from invenio_vocabularies.resources.schema import L10NString, VocabularyL10Schema
11
12
 
12
13
  from .config import VocabulariesResourceConfig, VocabularyTypeResourceConfig
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2020-2021 CERN.
3
+ # Copyright (C) 2020-2024 CERN.
4
4
  # Copyright (C) 2021 Northwestern University.
5
5
  #
6
6
  # Invenio-Vocabularies is free software; you can redistribute it and/or
@@ -8,6 +8,7 @@
8
8
  # details.
9
9
 
10
10
  """Vocabulary resource schema."""
11
+
11
12
  from marshmallow import Schema, fields
12
13
 
13
14
  from invenio_vocabularies.resources.serializer import L10NString
@@ -1,12 +1,13 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2024-2024 CERN.
3
+ # Copyright (C) 2024 CERN.
4
4
  #
5
5
  # Invenio-RDM-Records is free software; you can redistribute it and/or modify
6
6
  # it under the terms of the MIT License; see LICENSE file for more details.
7
7
 
8
8
 
9
9
  """Custom fields."""
10
+
10
11
  from invenio_i18n import lazy_gettext as _
11
12
 
12
13
  from ...contrib.subjects.api import Subject
@@ -25,7 +26,7 @@ class SubjectCF(VocabularyCF):
25
26
  vocabulary_id="subjects",
26
27
  schema=SubjectRelationSchema,
27
28
  ui_schema=SubjectRelationSchema,
28
- **kwargs
29
+ **kwargs,
29
30
  )
30
31
  self.pid_field = Subject.pid
31
32
 
@@ -39,7 +39,7 @@ class VocabularyCF(BaseCF):
39
39
  sort_by=None,
40
40
  schema=VocabularyRelationSchema,
41
41
  ui_schema=VocabularyL10NItemSchema,
42
- **kwargs
42
+ **kwargs,
43
43
  ):
44
44
  """Constructor."""
45
45
  super().__init__(name, **kwargs)
@@ -11,7 +11,6 @@ from celery import shared_task
11
11
  from flask import current_app
12
12
 
13
13
  from ..datastreams.factories import DataStreamFactory
14
- from ..factories import get_vocabulary_config
15
14
 
16
15
 
17
16
  @shared_task(ignore_result=True)
@@ -27,32 +26,3 @@ def process_datastream(config):
27
26
  if result.errors:
28
27
  for err in result.errors:
29
28
  current_app.logger.error(err)
30
-
31
-
32
- @shared_task()
33
- def import_funders():
34
- """Import the funders vocabulary.
35
-
36
- Only new records are imported.
37
- Existing records are not updated.
38
- """
39
- vc = get_vocabulary_config("funders")
40
- config = vc.get_config()
41
-
42
- # When importing funders via a Celery task, make sure that we are automatically downloading the ROR file,
43
- # instead of relying on a local file on the file system.
44
- if config["readers"][0]["type"] == "ror-http":
45
- readers_config_with_ror_http = config["readers"]
46
- else:
47
- readers_config_with_ror_http = [{"type": "ror-http"}] + config["readers"]
48
-
49
- ds = DataStreamFactory.create(
50
- readers_config=readers_config_with_ror_http,
51
- transformers_config=config.get("transformers"),
52
- writers_config=config["writers"],
53
- )
54
-
55
- for result in ds.process():
56
- if result.errors:
57
- for err in result.errors:
58
- current_app.logger.exception(err)
@@ -1,5 +1,5 @@
1
1
  {#
2
- Copyright (C) 2024-2024 CERN.
2
+ Copyright (C) 2024 CERN.
3
3
 
4
4
  Invenio RDM Records is free software; you can redistribute it and/or modify
5
5
  it under the terms of the MIT License; see LICENSE file for more details.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: invenio-vocabularies
3
- Version: 4.1.1
3
+ Version: 4.3.0
4
4
  Summary: "Invenio module for managing vocabularies."
5
5
  Home-page: https://github.com/inveniosoftware/invenio-vocabularies
6
6
  Author: CERN
@@ -15,6 +15,7 @@ Requires-Dist: invenio-records-resources <7.0.0,>=6.0.0
15
15
  Requires-Dist: invenio-administration <3.0.0,>=2.0.0
16
16
  Requires-Dist: lxml >=4.5.0
17
17
  Requires-Dist: PyYAML >=5.4.1
18
+ Requires-Dist: regex >=2024.7.24
18
19
  Provides-Extra: elasticsearch7
19
20
  Requires-Dist: invenio-search[elasticsearch7] <3.0.0,>=2.1.0 ; extra == 'elasticsearch7'
20
21
  Provides-Extra: mysql
@@ -25,6 +26,8 @@ Requires-Dist: invenio-search[opensearch1] <3.0.0,>=2.1.0 ; extra == 'opensearch
25
26
  Provides-Extra: opensearch2
26
27
  Requires-Dist: invenio-search[opensearch2] <3.0.0,>=2.1.0 ; extra == 'opensearch2'
27
28
  Provides-Extra: postgresql
29
+ Provides-Extra: s3fs
30
+ Requires-Dist: s3fs >=2024.6.1 ; extra == 's3fs'
28
31
  Provides-Extra: sqlite
29
32
  Provides-Extra: tests
30
33
  Requires-Dist: pytest-black-ng >=0.4.0 ; extra == 'tests'
@@ -78,6 +81,20 @@ https://invenio-vocabularies.readthedocs.io/
78
81
  Changes
79
82
  =======
80
83
 
84
+ Version v4.3.0 (released 2024-08-05)
85
+
86
+ - names: make names_exclude_regex configurable
87
+ - names: validate entry full names
88
+ - names: add orcid public data sync
89
+
90
+ Version v4.2.0 (released 2024-07-24)
91
+
92
+ - ror: check last update; use ld+json for metadata (#367)
93
+ - tasks: remove import funders task
94
+ - funders: add and export custom transformer
95
+ - affiliations: add and export custom transformer
96
+ - datastreams: implement asynchronous writer
97
+
81
98
  Version v4.1.1 (released 2024-07-15)
82
99
 
83
100
  - installation: use invenio-oaipmh-scythe from PyPI