invenio-vocabularies 2.3.1__py2.py3-none-any.whl → 6.3.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of invenio-vocabularies might be problematic.

Files changed (165)
  1. invenio_vocabularies/__init__.py +2 -2
  2. invenio_vocabularies/administration/__init__.py +10 -0
  3. invenio_vocabularies/administration/views/__init__.py +10 -0
  4. invenio_vocabularies/administration/views/vocabularies.py +45 -0
  5. invenio_vocabularies/assets/semantic-ui/js/invenio_vocabularies/package.json +1 -7
  6. invenio_vocabularies/assets/semantic-ui/js/invenio_vocabularies/src/contrib/forms/Funding/CustomAwardForm.js +80 -64
  7. invenio_vocabularies/assets/semantic-ui/js/invenio_vocabularies/src/contrib/forms/Funding/FundingField.js +49 -41
  8. invenio_vocabularies/assets/semantic-ui/js/invenio_vocabularies/src/contrib/forms/Funding/FundingModal.js +5 -7
  9. invenio_vocabularies/assets/semantic-ui/js/invenio_vocabularies/src/contrib/forms/Funding/NoAwardResults.js +3 -3
  10. invenio_vocabularies/cli.py +31 -44
  11. invenio_vocabularies/config.py +68 -4
  12. invenio_vocabularies/contrib/affiliations/affiliations.py +11 -0
  13. invenio_vocabularies/contrib/affiliations/api.py +1 -2
  14. invenio_vocabularies/contrib/affiliations/config.py +13 -2
  15. invenio_vocabularies/contrib/affiliations/datastreams.py +186 -0
  16. invenio_vocabularies/contrib/affiliations/jsonschemas/affiliations/affiliation-v1.0.0.json +38 -1
  17. invenio_vocabularies/contrib/affiliations/mappings/os-v1/affiliations/affiliation-v1.0.0.json +22 -1
  18. invenio_vocabularies/contrib/affiliations/mappings/os-v1/affiliations/affiliation-v2.0.0.json +171 -0
  19. invenio_vocabularies/contrib/affiliations/mappings/os-v2/affiliations/affiliation-v1.0.0.json +22 -1
  20. invenio_vocabularies/contrib/affiliations/mappings/os-v2/affiliations/affiliation-v2.0.0.json +171 -0
  21. invenio_vocabularies/contrib/affiliations/mappings/v7/affiliations/affiliation-v1.0.0.json +22 -1
  22. invenio_vocabularies/contrib/affiliations/schema.py +17 -3
  23. invenio_vocabularies/contrib/affiliations/services.py +1 -2
  24. invenio_vocabularies/contrib/awards/awards.py +17 -5
  25. invenio_vocabularies/contrib/awards/datastreams.py +241 -7
  26. invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +38 -0
  27. invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +51 -2
  28. invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +51 -2
  29. invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +51 -2
  30. invenio_vocabularies/contrib/awards/schema.py +16 -1
  31. invenio_vocabularies/contrib/awards/serializer.py +8 -1
  32. invenio_vocabularies/contrib/awards/services.py +1 -2
  33. invenio_vocabularies/contrib/common/__init__.py +9 -0
  34. invenio_vocabularies/contrib/common/openaire/__init__.py +9 -0
  35. invenio_vocabularies/contrib/common/openaire/datastreams.py +84 -0
  36. invenio_vocabularies/contrib/common/ror/__init__.py +9 -0
  37. invenio_vocabularies/contrib/common/ror/datastreams.py +220 -0
  38. invenio_vocabularies/contrib/funders/config.py +11 -2
  39. invenio_vocabularies/contrib/funders/datastreams.py +40 -62
  40. invenio_vocabularies/contrib/funders/funders.py +3 -1
  41. invenio_vocabularies/contrib/funders/jsonschemas/funders/funder-v1.0.0.json +36 -1
  42. invenio_vocabularies/contrib/funders/mappings/os-v1/funders/funder-v1.0.0.json +22 -1
  43. invenio_vocabularies/contrib/funders/mappings/os-v1/funders/funder-v2.0.0.json +156 -0
  44. invenio_vocabularies/contrib/funders/mappings/os-v2/funders/funder-v1.0.0.json +22 -1
  45. invenio_vocabularies/contrib/funders/mappings/os-v2/funders/funder-v2.0.0.json +156 -0
  46. invenio_vocabularies/contrib/funders/mappings/v7/funders/funder-v1.0.0.json +22 -1
  47. invenio_vocabularies/contrib/funders/schema.py +8 -0
  48. invenio_vocabularies/contrib/funders/serializer.py +2 -1
  49. invenio_vocabularies/contrib/names/config.py +5 -3
  50. invenio_vocabularies/contrib/names/datastreams.py +172 -4
  51. invenio_vocabularies/contrib/names/jsonschemas/names/name-v1.0.0.json +3 -0
  52. invenio_vocabularies/contrib/names/mappings/os-v1/names/name-v1.0.0.json +3 -0
  53. invenio_vocabularies/contrib/names/mappings/os-v1/names/name-v2.0.0.json +150 -0
  54. invenio_vocabularies/contrib/names/mappings/os-v2/names/name-v1.0.0.json +3 -0
  55. invenio_vocabularies/contrib/names/mappings/os-v2/names/name-v2.0.0.json +150 -0
  56. invenio_vocabularies/contrib/names/mappings/v7/names/name-v1.0.0.json +3 -0
  57. invenio_vocabularies/contrib/names/names.py +15 -3
  58. invenio_vocabularies/contrib/names/permissions.py +20 -0
  59. invenio_vocabularies/contrib/names/s3client.py +44 -0
  60. invenio_vocabularies/contrib/names/schema.py +14 -0
  61. invenio_vocabularies/contrib/subjects/config.py +9 -3
  62. invenio_vocabularies/contrib/subjects/datastreams.py +61 -0
  63. invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py +9 -0
  64. invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +171 -0
  65. invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +31 -0
  66. invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json +35 -0
  67. invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json +35 -0
  68. invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json +35 -0
  69. invenio_vocabularies/contrib/subjects/mesh/__init__.py +9 -0
  70. invenio_vocabularies/contrib/subjects/mesh/datastreams.py +43 -0
  71. invenio_vocabularies/contrib/subjects/schema.py +47 -5
  72. invenio_vocabularies/contrib/subjects/subjects.py +10 -0
  73. invenio_vocabularies/datastreams/datastreams.py +61 -13
  74. invenio_vocabularies/datastreams/factories.py +1 -2
  75. invenio_vocabularies/datastreams/readers.py +138 -29
  76. invenio_vocabularies/datastreams/tasks.py +37 -0
  77. invenio_vocabularies/datastreams/transformers.py +17 -27
  78. invenio_vocabularies/datastreams/writers.py +116 -14
  79. invenio_vocabularies/datastreams/xml.py +34 -0
  80. invenio_vocabularies/ext.py +59 -5
  81. invenio_vocabularies/factories.py +137 -0
  82. invenio_vocabularies/jobs.py +133 -0
  83. invenio_vocabularies/proxies.py +2 -2
  84. invenio_vocabularies/records/jsonschemas/vocabularies/definitions-v1.0.0.json +7 -0
  85. invenio_vocabularies/records/jsonschemas/vocabularies/vocabulary-v1.0.0.json +1 -4
  86. invenio_vocabularies/records/mappings/os-v1/vocabularies/vocabulary-v1.0.0.json +3 -3
  87. invenio_vocabularies/records/mappings/os-v2/vocabularies/vocabulary-v1.0.0.json +3 -3
  88. invenio_vocabularies/records/mappings/v7/vocabularies/vocabulary-v1.0.0.json +3 -3
  89. invenio_vocabularies/records/models.py +2 -4
  90. invenio_vocabularies/records/pidprovider.py +1 -2
  91. invenio_vocabularies/records/systemfields/relations.py +2 -2
  92. invenio_vocabularies/resources/__init__.py +9 -1
  93. invenio_vocabularies/resources/config.py +105 -0
  94. invenio_vocabularies/resources/resource.py +31 -41
  95. invenio_vocabularies/resources/schema.py +2 -1
  96. invenio_vocabularies/services/__init__.py +5 -2
  97. invenio_vocabularies/services/config.py +179 -0
  98. invenio_vocabularies/services/custom_fields/__init__.py +6 -2
  99. invenio_vocabularies/services/custom_fields/subject.py +82 -0
  100. invenio_vocabularies/services/custom_fields/vocabulary.py +5 -3
  101. invenio_vocabularies/services/permissions.py +3 -1
  102. invenio_vocabularies/services/results.py +110 -0
  103. invenio_vocabularies/services/schema.py +11 -2
  104. invenio_vocabularies/services/service.py +46 -94
  105. invenio_vocabularies/services/tasks.py +1 -1
  106. invenio_vocabularies/templates/semantic-ui/invenio_vocabularies/subjects.html +23 -0
  107. invenio_vocabularies/templates/semantic-ui/invenio_vocabularies/vocabularies-list.html +12 -0
  108. invenio_vocabularies/templates/semantic-ui/invenio_vocabularies/vocabulary-details.html +71 -0
  109. invenio_vocabularies/translations/af/LC_MESSAGES/messages.mo +0 -0
  110. invenio_vocabularies/translations/ar/LC_MESSAGES/messages.mo +0 -0
  111. invenio_vocabularies/translations/bg/LC_MESSAGES/messages.mo +0 -0
  112. invenio_vocabularies/translations/ca/LC_MESSAGES/messages.mo +0 -0
  113. invenio_vocabularies/translations/cs/LC_MESSAGES/messages.mo +0 -0
  114. invenio_vocabularies/translations/da/LC_MESSAGES/messages.mo +0 -0
  115. invenio_vocabularies/translations/de/LC_MESSAGES/messages.mo +0 -0
  116. invenio_vocabularies/translations/de_AT/LC_MESSAGES/messages.mo +0 -0
  117. invenio_vocabularies/translations/de_DE/LC_MESSAGES/messages.mo +0 -0
  118. invenio_vocabularies/translations/el/LC_MESSAGES/messages.mo +0 -0
  119. invenio_vocabularies/translations/en/LC_MESSAGES/messages.mo +0 -0
  120. invenio_vocabularies/translations/en_AT/LC_MESSAGES/messages.mo +0 -0
  121. invenio_vocabularies/translations/en_HU/LC_MESSAGES/messages.mo +0 -0
  122. invenio_vocabularies/translations/es/LC_MESSAGES/messages.mo +0 -0
  123. invenio_vocabularies/translations/es_CU/LC_MESSAGES/messages.mo +0 -0
  124. invenio_vocabularies/translations/es_MX/LC_MESSAGES/messages.mo +0 -0
  125. invenio_vocabularies/translations/et/LC_MESSAGES/messages.mo +0 -0
  126. invenio_vocabularies/translations/et_EE/LC_MESSAGES/messages.mo +0 -0
  127. invenio_vocabularies/translations/fa/LC_MESSAGES/messages.mo +0 -0
  128. invenio_vocabularies/translations/fa_IR/LC_MESSAGES/messages.mo +0 -0
  129. invenio_vocabularies/translations/fr/LC_MESSAGES/messages.mo +0 -0
  130. invenio_vocabularies/translations/fr_CI/LC_MESSAGES/messages.mo +0 -0
  131. invenio_vocabularies/translations/fr_FR/LC_MESSAGES/messages.mo +0 -0
  132. invenio_vocabularies/translations/gl/LC_MESSAGES/messages.mo +0 -0
  133. invenio_vocabularies/translations/hi_IN/LC_MESSAGES/messages.mo +0 -0
  134. invenio_vocabularies/translations/hr/LC_MESSAGES/messages.mo +0 -0
  135. invenio_vocabularies/translations/hu/LC_MESSAGES/messages.mo +0 -0
  136. invenio_vocabularies/translations/hu_HU/LC_MESSAGES/messages.mo +0 -0
  137. invenio_vocabularies/translations/it/LC_MESSAGES/messages.mo +0 -0
  138. invenio_vocabularies/translations/ja/LC_MESSAGES/messages.mo +0 -0
  139. invenio_vocabularies/translations/ka/LC_MESSAGES/messages.mo +0 -0
  140. invenio_vocabularies/translations/lt/LC_MESSAGES/messages.mo +0 -0
  141. invenio_vocabularies/translations/messages.pot +95 -48
  142. invenio_vocabularies/translations/ne/LC_MESSAGES/messages.mo +0 -0
  143. invenio_vocabularies/translations/no/LC_MESSAGES/messages.mo +0 -0
  144. invenio_vocabularies/translations/pl/LC_MESSAGES/messages.mo +0 -0
  145. invenio_vocabularies/translations/pt/LC_MESSAGES/messages.mo +0 -0
  146. invenio_vocabularies/translations/ro/LC_MESSAGES/messages.mo +0 -0
  147. invenio_vocabularies/translations/ru/LC_MESSAGES/messages.mo +0 -0
  148. invenio_vocabularies/translations/rw/LC_MESSAGES/messages.mo +0 -0
  149. invenio_vocabularies/translations/sk/LC_MESSAGES/messages.mo +0 -0
  150. invenio_vocabularies/translations/sv/LC_MESSAGES/messages.mo +0 -0
  151. invenio_vocabularies/translations/sv_SE/LC_MESSAGES/messages.mo +0 -0
  152. invenio_vocabularies/translations/tr/LC_MESSAGES/messages.mo +0 -0
  153. invenio_vocabularies/translations/uk/LC_MESSAGES/messages.mo +0 -0
  154. invenio_vocabularies/translations/uk_UA/LC_MESSAGES/messages.mo +0 -0
  155. invenio_vocabularies/translations/zh_CN/LC_MESSAGES/messages.mo +0 -0
  156. invenio_vocabularies/translations/zh_TW/LC_MESSAGES/messages.mo +0 -0
  157. invenio_vocabularies/views.py +12 -26
  158. invenio_vocabularies/webpack.py +3 -3
  159. {invenio_vocabularies-2.3.1.dist-info → invenio_vocabularies-6.3.1.dist-info}/METADATA +150 -6
  160. {invenio_vocabularies-2.3.1.dist-info → invenio_vocabularies-6.3.1.dist-info}/RECORD +165 -132
  161. {invenio_vocabularies-2.3.1.dist-info → invenio_vocabularies-6.3.1.dist-info}/WHEEL +1 -1
  162. {invenio_vocabularies-2.3.1.dist-info → invenio_vocabularies-6.3.1.dist-info}/entry_points.txt +17 -0
  163. {invenio_vocabularies-2.3.1.dist-info → invenio_vocabularies-6.3.1.dist-info}/AUTHORS.rst +0 -0
  164. {invenio_vocabularies-2.3.1.dist-info → invenio_vocabularies-6.3.1.dist-info}/LICENSE +0 -0
  165. {invenio_vocabularies-2.3.1.dist-info → invenio_vocabularies-6.3.1.dist-info}/top_level.txt +0 -0
@@ -9,9 +9,11 @@
9
9
 
10
10
  """Vocabulary subjects."""
11
11
 
12
+ from flask_resources import JSONSerializer, ResponseHandler
12
13
  from invenio_records.dumpers import SearchDumper
13
14
  from invenio_records.dumpers.indexedat import IndexedAtDumperExt
14
15
  from invenio_records_resources.factories.factory import RecordTypeFactory
16
+ from invenio_records_resources.resources.records.headers import etag_headers
15
17
 
16
18
  from ...records.pidprovider import PIDProviderFactory
17
19
  from ...records.systemfields import BaseVocabularyPIDFieldContext
@@ -42,4 +44,12 @@ record_type = RecordTypeFactory(
42
44
  permission_policy_cls=PermissionPolicy,
43
45
  # Resource layer
44
46
  endpoint_route="/subjects",
47
+ resource_cls_attrs={
48
+ "response_handlers": {
49
+ "application/json": ResponseHandler(JSONSerializer(), headers=etag_headers),
50
+ "application/vnd.inveniordm.v1+json": ResponseHandler(
51
+ JSONSerializer(), headers=etag_headers
52
+ ),
53
+ }
54
+ },
45
55
  )
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021-2022 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,17 +8,41 @@
8
8
 
9
9
  """Base data stream."""
10
10
 
11
+ from flask import current_app
12
+
11
13
  from .errors import ReaderError, TransformerError, WriterError
12
14
 
13
15
 
14
16
  class StreamEntry:
15
17
  """Object to encapsulate streams processing."""
16
18
 
17
- def __init__(self, entry, errors=None):
18
- """Constructor."""
19
+ def __init__(self, entry, record=None, errors=None, op_type=None, exc=None):
20
+ """Constructor for the StreamEntry class.
21
+
22
+ :param entry (object): The entry object, usually a record dict.
23
+ :param record (object): The record object, usually a record class.
24
+ :param errors (list, optional): List of errors. Defaults to None.
25
+ :param op_type (str, optional): The operation type. Defaults to None.
26
+ :param exc (str, optional): The raised unhandled exception. Defaults to None.
27
+ """
19
28
  self.entry = entry
29
+ self.record = record
20
30
  self.filtered = False
21
31
  self.errors = errors or []
32
+ self.op_type = op_type
33
+ self.exc = exc
34
+
35
+ def log_errors(self, logger=None):
36
+ """Log the errors using the provided logger or the default logger.
37
+
38
+ :param logger (logging.Logger, optional): Logger instance to use. Defaults to None.
39
+ """
40
+ if logger is None:
41
+ logger = current_app.logger
42
+ for error in self.errors:
43
+ logger.error(f"Error in entry {self.entry}: {error}")
44
+ if self.exc:
45
+ logger.error(f"Exception in entry {self.entry}: {self.exc}")
22
46
 
23
47
 
24
48
  class DataStream:
@@ -39,15 +63,10 @@ class DataStream:
39
63
  """Checks if an stream_entry should be filtered out (skipped)."""
40
64
  return False
41
65
 
42
- def process(self, *args, **kwargs):
43
- """Iterates over the entries.
44
-
45
- Uses the reader to get the raw entries and transforms them.
46
- It will iterate over the `StreamEntry` objects returned by
47
- the reader, apply the transformations and yield the result of
48
- writing it.
49
- """
50
- for stream_entry in self.read():
66
+ def process_batch(self, batch, write_many=False):
67
+ """Process a batch of entries."""
68
+ transformed_entries = []
69
+ for stream_entry in batch:
51
70
  if stream_entry.errors:
52
71
  yield stream_entry # reading errors
53
72
  else:
@@ -58,7 +77,31 @@ class DataStream:
58
77
  transformed_entry.filtered = True
59
78
  yield transformed_entry
60
79
  else:
61
- yield self.write(transformed_entry)
80
+ transformed_entries.append(transformed_entry)
81
+ if transformed_entries:
82
+ if write_many:
83
+ yield from self.batch_write(transformed_entries)
84
+ else:
85
+ yield from (self.write(entry) for entry in transformed_entries)
86
+
87
+ def process(self, batch_size=100, write_many=False, *args, **kwargs):
88
+ """Iterates over the entries.
89
+
90
+ Uses the reader to get the raw entries and transforms them.
91
+ It will iterate over the `StreamEntry` objects returned by
92
+ the reader, apply the transformations and yield the result of
93
+ writing it.
94
+ """
95
+ batch = []
96
+ for stream_entry in self.read():
97
+ batch.append(stream_entry)
98
+ if len(batch) >= batch_size:
99
+ yield from self.process_batch(batch, write_many=write_many)
100
+ batch = []
101
+
102
+ # Process any remaining entries in the last batch
103
+ if batch:
104
+ yield from self.process_batch(batch, write_many=write_many)
62
105
 
63
106
  def read(self):
64
107
  """Recursively read the entries."""
@@ -107,6 +150,11 @@ class DataStream:
107
150
 
108
151
  return stream_entry
109
152
 
153
+ def batch_write(self, stream_entries, *args, **kwargs):
154
+ """Apply the transformations to an stream_entry. Errors are handler in the service layer."""
155
+ for writer in self._writers:
156
+ yield from writer.write_many(stream_entries)
157
+
110
158
  def total(self, *args, **kwargs):
111
159
  """The total of entries obtained from the origin."""
112
160
  raise NotImplementedError()
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021-2022 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -36,7 +36,6 @@ class Factory:
36
36
  try:
37
37
  type_ = config["type"]
38
38
  args = config.get("args", {})
39
-
40
39
  return cls.options()[type_](**args)
41
40
  except KeyError:
42
41
  raise FactoryError(name=cls.FACTORY_NAME, key=type_)
@@ -1,6 +1,7 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021-2022 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
+ # Copyright (C) 2024 University of Münster.
4
5
  #
5
6
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
7
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -15,14 +16,21 @@ import re
15
16
  import tarfile
16
17
  import zipfile
17
18
  from abc import ABC, abstractmethod
18
- from collections import defaultdict
19
19
  from json.decoder import JSONDecodeError
20
20
 
21
21
  import requests
22
22
  import yaml
23
+ from lxml import etree
24
+ from lxml.html import fromstring
23
25
  from lxml.html import parse as html_parse
24
26
 
25
27
  from .errors import ReaderError
28
+ from .xml import etree_to_dict
29
+
30
+ try:
31
+ import oaipmh_scythe
32
+ except ImportError:
33
+ oaipmh_scythe = None
26
34
 
27
35
 
28
36
  class BaseReader(ABC):
@@ -79,7 +87,12 @@ class TarReader(BaseReader):
79
87
  def read(self, item=None, *args, **kwargs):
80
88
  """Opens a tar archive or uses the given file pointer."""
81
89
  if item:
82
- yield from self._iter(fp=item, *args, **kwargs)
90
+ if isinstance(item, tarfile.TarFile):
91
+ yield from self._iter(fp=item, *args, **kwargs)
92
+ else:
93
+ # If the item is not already a TarFile (e.g. if it is a BytesIO), try to create a TarFile from the item.
94
+ with tarfile.open(mode=self._mode, fileobj=item) as archive:
95
+ yield from self._iter(fp=archive, *args, **kwargs)
83
96
  else:
84
97
  with tarfile.open(self._origin, self._mode) as archive:
85
98
  yield from self._iter(fp=archive, *args, **kwargs)
@@ -135,7 +148,12 @@ class ZipReader(BaseReader):
135
148
  """Opens a Zip archive or uses the given file pointer."""
136
149
  # https://docs.python.org/3/library/zipfile.html
137
150
  if item:
138
- yield from self._iter(fp=item, *args, **kwargs)
151
+ if isinstance(item, zipfile.ZipFile):
152
+ yield from self._iter(fp=item, *args, **kwargs)
153
+ else:
154
+ # If the item is not already a ZipFile (e.g. if it is a BytesIO), try to create a ZipFile from the item.
155
+ with zipfile.ZipFile(item, **self._options) as archive:
156
+ yield from self._iter(fp=archive, *args, **kwargs)
139
157
  else:
140
158
  with zipfile.ZipFile(self._origin, **self._options) as archive:
141
159
  yield from self._iter(fp=archive, *args, **kwargs)
@@ -206,34 +224,125 @@ class CSVReader(BaseReader):
206
224
  class XMLReader(BaseReader):
207
225
  """XML reader."""
208
226
 
209
- @classmethod
210
- def _etree_to_dict(cls, tree):
211
- d = {tree.tag: {} if tree.attrib else None}
212
- children = list(tree)
213
- if children:
214
- dd = defaultdict(list)
215
- for dc in map(cls._etree_to_dict, children):
216
- for k, v in dc.items():
217
- dd[k].append(v)
218
- d = {tree.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
219
- if tree.attrib:
220
- d[tree.tag].update(("@" + k, v) for k, v in tree.attrib.items())
221
- if tree.text:
222
- text = tree.text.strip()
223
- if children or tree.attrib:
224
- if text:
225
- d[tree.tag]["#text"] = text
226
- else:
227
- d[tree.tag] = text
228
- return d
227
+ def __init__(self, root_element=None, *args, **kwargs):
228
+ """Constructor."""
229
+ self.root_element = root_element
230
+ super().__init__(*args, **kwargs)
229
231
 
230
232
  def _iter(self, fp, *args, **kwargs):
231
233
  """Read and parse an XML file to dict."""
232
234
  # NOTE: We parse HTML, to skip XML validation and strip XML namespaces
233
- xml_tree = html_parse(fp).getroot()
234
- record = self._etree_to_dict(xml_tree)["html"]["body"].get("record")
235
-
236
- if not record:
237
- raise ReaderError(f"Record not found in XML entry.")
235
+ record = None
236
+ try:
237
+ xml_tree = fromstring(fp)
238
+ xml_dict = etree_to_dict(xml_tree)
239
+ except Exception as e:
240
+ xml_tree = html_parse(fp).getroot()
241
+ xml_dict = etree_to_dict(xml_tree)["html"]["body"]
242
+
243
+ if self.root_element:
244
+ record = xml_dict.get(self.root_element)
245
+ if not record:
246
+ raise ReaderError(
247
+ f"Root element '{self.root_element}' not found in XML entry."
248
+ )
249
+ else:
250
+ record = xml_dict
238
251
 
239
252
  yield record
253
+
254
+
255
+ class OAIPMHReader(BaseReader):
256
+ """OAIPMH reader."""
257
+
258
+ def __init__(
259
+ self,
260
+ *args,
261
+ base_url=None,
262
+ metadata_prefix=None,
263
+ set=None,
264
+ from_date=None,
265
+ until_date=None,
266
+ verb=None,
267
+ **kwargs,
268
+ ):
269
+ """Constructor."""
270
+ self._base_url = base_url
271
+ self._metadata_prefix = metadata_prefix if not None else "oai_dc"
272
+ self._set = set
273
+ self._until = until_date
274
+ self._from = from_date
275
+ self._verb = verb if not None else "ListRecords"
276
+ super().__init__(*args, **kwargs)
277
+
278
+ def _iter(self, scythe, *args, **kwargs):
279
+ """Read and parse an OAIPMH stream to dict."""
280
+
281
+ class OAIRecord(oaipmh_scythe.models.Record):
282
+ """An XML unpacking implementation for more complicated formats."""
283
+
284
+ def get_metadata(self):
285
+ """Extract and return the record's metadata as a dictionary."""
286
+ return xml_to_dict(
287
+ self.xml.find(f".//{self._oai_namespace}metadata").getchildren()[0],
288
+ )
289
+
290
+ if self._verb == "ListRecords":
291
+ scythe.class_mapping["ListRecords"] = OAIRecord
292
+ try:
293
+ records = scythe.list_records(
294
+ from_=self._from,
295
+ until=self._until,
296
+ metadata_prefix=self._metadata_prefix,
297
+ set_=self._set,
298
+ ignore_deleted=True,
299
+ )
300
+ for record in records:
301
+ yield {"record": record}
302
+ except oaipmh_scythe.NoRecordsMatch:
303
+ raise ReaderError("No records found in OAI-PMH request.")
304
+ else:
305
+ scythe.class_mapping["GetRecord"] = OAIRecord
306
+ try:
307
+ headers = scythe.list_identifiers(
308
+ from_=self._from,
309
+ until=self._until,
310
+ metadata_prefix=self._metadata_prefix,
311
+ set_=self._set,
312
+ ignore_deleted=True,
313
+ )
314
+ for header in headers:
315
+ record = scythe.get_record(
316
+ identifier=header.identifier,
317
+ metadata_prefix=self._metadata_prefix,
318
+ )
319
+ yield {"record": record}
320
+ except oaipmh_scythe.NoRecordsMatch:
321
+ raise ReaderError("No records found in OAI-PMH request.")
322
+
323
+ def read(self, item=None, *args, **kwargs):
324
+ """Reads from item or opens the file descriptor from origin."""
325
+ if item:
326
+ raise NotImplementedError(
327
+ "OAIPMHReader does not support being chained after another reader"
328
+ )
329
+ else:
330
+ with oaipmh_scythe.Scythe(self._base_url) as scythe:
331
+ yield from self._iter(scythe=scythe, *args, **kwargs)
332
+
333
+
334
+ def xml_to_dict(tree: etree._Element):
335
+ """Convert an XML tree to a dictionary.
336
+
337
+ This function takes an XML element tree and converts it into a dictionary.
338
+
339
+ Args:
340
+ tree: The root element of the XML tree to be converted.
341
+
342
+ Returns:
343
+ A dictionary with the key "record".
344
+ """
345
+ dict_obj = dict()
346
+ dict_obj["record"] = etree.tostring(tree)
347
+
348
+ return dict_obj
@@ -0,0 +1,37 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Copyright (C) 2022-2024 CERN.
4
+ #
5
+ # Invenio-Vocabularies is free software; you can redistribute it and/or
6
+ # modify it under the terms of the MIT License; see LICENSE file for more
7
+ # details.
8
+
9
+ """Data Streams Celery tasks."""
10
+
11
+ from celery import shared_task
12
+
13
+ from ..datastreams import StreamEntry
14
+ from ..datastreams.factories import WriterFactory
15
+
16
+
17
+ @shared_task(ignore_result=True)
18
+ def write_entry(writer_config, entry):
19
+ """Write an entry.
20
+
21
+ :param writer: writer configuration as accepted by the WriterFactory.
22
+ :param entry: dictionary, StreamEntry is not serializable.
23
+ """
24
+ writer = WriterFactory.create(config=writer_config)
25
+ writer.write(StreamEntry(entry))
26
+
27
+
28
+ @shared_task(ignore_result=True)
29
+ def write_many_entry(writer_config, entries):
30
+ """Write many entries.
31
+
32
+ :param writer: writer configuration as accepted by the WriterFactory.
33
+ :param entry: list of dictionaries, StreamEntry is not serializable.
34
+ """
35
+ writer = WriterFactory.create(config=writer_config)
36
+ stream_entries = [StreamEntry(entry) for entry in entries]
37
+ writer.write_many(stream_entries)
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021-2022 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -9,11 +9,11 @@
9
9
  """Transformers module."""
10
10
 
11
11
  from abc import ABC, abstractmethod
12
- from collections import defaultdict
13
12
 
14
13
  from lxml import etree
15
14
 
16
15
  from .errors import TransformerError
16
+ from .xml import etree_to_dict
17
17
 
18
18
 
19
19
  class BaseTransformer(ABC):
@@ -32,42 +32,32 @@ class BaseTransformer(ABC):
32
32
  class XMLTransformer(BaseTransformer):
33
33
  """XML transformer."""
34
34
 
35
+ def __init__(self, root_element=None, *args, **kwargs):
36
+ """Initializes the transformer."""
37
+ self.root_element = root_element
38
+ super().__init__(*args, **kwargs)
39
+
35
40
  @classmethod
36
41
  def _xml_to_etree(cls, xml):
37
42
  """Converts XML to a lxml etree."""
38
43
  return etree.HTML(xml)
39
44
 
40
- @classmethod
41
- def _etree_to_dict(cls, tree):
42
- d = {tree.tag: {} if tree.attrib else None}
43
- children = list(tree)
44
- if children:
45
- dd = defaultdict(list)
46
- for dc in map(cls._etree_to_dict, children):
47
- for k, v in dc.items():
48
- dd[k].append(v)
49
- d = {tree.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
50
- if tree.attrib:
51
- d[tree.tag].update(("@" + k, v) for k, v in tree.attrib.items())
52
- if tree.text:
53
- text = tree.text.strip()
54
- if children or tree.attrib:
55
- if text:
56
- d[tree.tag]["#text"] = text
57
- else:
58
- d[tree.tag] = text
59
- return d
60
-
61
45
  def apply(self, stream_entry, **kwargs):
62
46
  """Applies the transformation to the stream entry.
63
47
 
64
48
  Requires the root element to be named "record".
65
49
  """
66
50
  xml_tree = self._xml_to_etree(stream_entry.entry)
67
- record = self._etree_to_dict(xml_tree)["html"]["body"].get("record")
68
-
69
- if not record:
70
- raise TransformerError(f"Record not found in XML entry.")
51
+ xml_dict = etree_to_dict(xml_tree)["html"]["body"]
52
+
53
+ if self.root_element:
54
+ record = xml_dict.get(self.root_element)
55
+ if not record:
56
+ raise TransformerError(
57
+ f"Root element '{self.root_element}' not found in XML entry."
58
+ )
59
+ else:
60
+ record = xml_dict
71
61
 
72
62
  stream_entry.entry = record
73
63
  return stream_entry
@@ -1,6 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  #
3
- # Copyright (C) 2021-2022 CERN.
3
+ # Copyright (C) 2021-2024 CERN.
4
4
  #
5
5
  # Invenio-Vocabularies is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the MIT License; see LICENSE file for more
@@ -13,18 +13,25 @@ from pathlib import Path
13
13
 
14
14
  import yaml
15
15
  from invenio_access.permissions import system_identity
16
- from invenio_pidstore.errors import PIDAlreadyExists
16
+ from invenio_pidstore.errors import PIDAlreadyExists, PIDDoesNotExistError
17
17
  from invenio_records.systemfields.relations.errors import InvalidRelationValue
18
18
  from invenio_records_resources.proxies import current_service_registry
19
19
  from marshmallow import ValidationError
20
+ from sqlalchemy.exc import NoResultFound
20
21
 
21
22
  from .datastreams import StreamEntry
22
23
  from .errors import WriterError
24
+ from .tasks import write_entry, write_many_entry
23
25
 
24
26
 
25
27
  class BaseWriter(ABC):
26
28
  """Base writer."""
27
29
 
30
+ def __init__(self, *args, **kwargs):
31
+ """Base initialization logic."""
32
+ # Add any base initialization here if needed
33
+ pass
34
+
28
35
  @abstractmethod
29
36
  def write(self, stream_entry, *args, **kwargs):
30
37
  """Writes the input stream entry to the target output.
@@ -35,16 +42,29 @@ class BaseWriter(ABC):
35
42
  """
36
43
  pass
37
44
 
45
+ @abstractmethod
46
+ def write_many(self, stream_entries, *args, **kwargs):
47
+ """Writes the input streams entry to the target output.
48
+
49
+ :returns: A List of StreamEntry. The result of writing the entry.
50
+ Raises WriterException in case of errors.
51
+
52
+ """
53
+ pass
54
+
38
55
 
39
56
  class ServiceWriter(BaseWriter):
40
57
  """Writes the entries to an RDM instance using a Service object."""
41
58
 
42
- def __init__(self, service_or_name, *args, identity=None, update=False, **kwargs):
59
+ def __init__(
60
+ self, service_or_name, *args, identity=None, insert=True, update=False, **kwargs
61
+ ):
43
62
  """Constructor.
44
63
 
45
64
  :param service_or_name: a service instance or a key of the
46
65
  service registry.
47
66
  :param identity: access identity.
67
+ :param insert: if True it will insert records which do not exist.
48
68
  :param update: if True it will update records if they exist.
49
69
  """
50
70
  if isinstance(service_or_name, str):
@@ -52,6 +72,7 @@ class ServiceWriter(BaseWriter):
52
72
 
53
73
  self._service = service_or_name
54
74
  self._identity = identity or system_identity
75
+ self._insert = insert
55
76
  self._update = update
56
77
 
57
78
  super().__init__(*args, **kwargs)
@@ -63,20 +84,47 @@ class ServiceWriter(BaseWriter):
63
84
  def _resolve(self, id_):
64
85
  return self._service.read(self._identity, id_)
65
86
 
87
def _do_update(self, entry):
    """Merge ``entry`` into the stored record and update it via the service.

    The current record is read and dumped to a dict, then each field of
    ``entry`` is merged in:

    * when both the stored value and the incoming value are lists, the
      incoming items not already present are appended to the stored list;
    * in every other case the incoming value replaces (or adds) the field.

    :param entry: dict with the fields to write.
    :returns: a StreamEntry wrapping the result of the service update.
    """
    record_id = self._entry_id(entry)
    merged = self._resolve(record_id).to_dict()

    for field, incoming in entry.items():
        stored = merged.get(field)
        if isinstance(stored, list) and isinstance(incoming, list):
            # Append one by one so the membership test always sees the
            # items added so far (duplicates inside ``incoming`` are
            # therefore only appended once).
            for item in incoming:
                if item not in stored:
                    stored.append(item)
            continue
        merged[field] = incoming

    updated = self._service.update(self._identity, record_id, merged)
    return StreamEntry(updated)
107
+
66
108
  def write(self, stream_entry, *args, **kwargs):
67
109
  """Writes the input entry using a given service."""
68
110
  entry = stream_entry.entry
111
+
69
112
  try:
70
- try:
71
- return StreamEntry(self._service.create(self._identity, entry))
72
- except PIDAlreadyExists:
73
- if not self._update:
74
- raise WriterError([f"Vocabulary entry already exists: {entry}"])
75
- vocab_id = self._entry_id(entry)
76
- current = self._resolve(vocab_id)
77
- updated = dict(current.to_dict(), **entry)
78
- return StreamEntry(
79
- self._service.update(self._identity, vocab_id, updated)
113
+ if self._insert:
114
+ try:
115
+ return StreamEntry(self._service.create(self._identity, entry))
116
+ except PIDAlreadyExists:
117
+ if not self._update:
118
+ raise WriterError([f"Vocabulary entry already exists: {entry}"])
119
+ return self._do_update(entry)
120
+ elif self._update:
121
+ try:
122
+ return self._do_update(entry)
123
+ except (NoResultFound, PIDDoesNotExistError):
124
+ raise WriterError([f"Vocabulary entry does not exist: {entry}"])
125
+ else:
126
+ raise WriterError(
127
+ ["Writer wrongly configured to not insert and to not update"]
80
128
  )
81
129
 
82
130
  except ValidationError as err:
@@ -85,6 +133,25 @@ class ServiceWriter(BaseWriter):
85
133
  # TODO: Check if we can get the error message easier
86
134
  raise WriterError([{"InvalidRelationValue": err.args[0]}])
87
135
 
136
def write_many(self, stream_entries, *args, **kwargs):
    """Write a batch of entries through the service in a single call.

    Each entry is paired with its id and the whole batch is handed to the
    service's ``create_or_update_many``. Every result is wrapped in a new
    StreamEntry together with the entry it belongs to, and its errors are
    logged.

    :param stream_entries: the stream entries to write.
    :returns: a list with one processed StreamEntry per input entry.
    """
    payloads = [stream_entry.entry for stream_entry in stream_entries]
    payloads_with_id = [(self._entry_id(payload), payload) for payload in payloads]
    outcomes = self._service.create_or_update_many(self._identity, payloads_with_id)

    processed = []
    for payload, outcome in zip(payloads, outcomes):
        item = StreamEntry(
            entry=payload,
            record=outcome.record,
            errors=outcome.errors,
            op_type=outcome.op_type,
            exc=outcome.exc,
        )
        item.log_errors()
        processed.append(item)
    return processed
154
+
88
155
 
89
156
  class YamlWriter(BaseWriter):
90
157
  """Writes the entries to a YAML file."""
@@ -103,6 +170,41 @@ class YamlWriter(BaseWriter):
103
170
  with open(self._filepath, "a") as file:
104
171
  # made into array for safer append
105
172
  # will always read array (good for reader)
106
- yaml.safe_dump([stream_entry.entry], file)
173
+ yaml.safe_dump([stream_entry.entry], file, allow_unicode=True)
107
174
 
108
175
  return stream_entry
176
+
177
def write_many(self, stream_entries, *args, **kwargs):
    """Append a batch of entries to the YAML file.

    The entries are dumped as a single YAML array appended to the file,
    the same array form the single-entry ``write`` uses.

    :param stream_entries: the stream entries to write.
    :returns: the stream entries that were passed in, matching what the
              single-entry ``write`` does with its input.
    """
    entries = [stream_entry.entry for stream_entry in stream_entries]

    with open(self._filepath, "a") as file:
        # An empty batch would be dumped as a flow-style "[]", which does
        # not combine with the block-style arrays appended by other
        # writes; the file is still opened (and created) as before.
        if entries:
            yaml.safe_dump(entries, file, allow_unicode=True)

    return stream_entries
185
+
186
+
187
class AsyncWriter(BaseWriter):
    """Writer that hands the actual writing over to celery tasks."""

    def __init__(self, writer, *args, **kwargs):
        """Constructor.

        :param writer: writer passed along to the celery tasks.
        """
        super().__init__(*args, **kwargs)
        self._writer = writer

    def write(self, stream_entry, *args, **kwargs):
        """Queue a celery task that writes a single entry.

        :returns: the stream entry that was passed in.
        """
        payload = stream_entry.entry
        write_entry.delay(self._writer, payload)
        return stream_entry

    def write_many(self, stream_entries, *args, **kwargs):
        """Queue one celery task that writes the whole batch of entries.

        :returns: the stream entries that were passed in.
        """
        payloads = [stream_entry.entry for stream_entry in stream_entries]
        write_many_entry.delay(self._writer, payloads)
        return stream_entries