oarepo-runtime 1.10.2__py3-none-any.whl → 2.0.0.dev3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- oarepo_runtime/__init__.py +24 -0
- oarepo_runtime/api.py +111 -0
- oarepo_runtime/cli/__init__.py +10 -21
- oarepo_runtime/cli/search.py +34 -0
- oarepo_runtime/config.py +86 -13
- oarepo_runtime/ext.py +64 -82
- oarepo_runtime/proxies.py +21 -5
- oarepo_runtime/records/__init__.py +11 -50
- oarepo_runtime/records/drafts.py +24 -18
- oarepo_runtime/records/mapping.py +84 -0
- oarepo_runtime/records/pid_providers.py +43 -7
- oarepo_runtime/records/systemfields/__init__.py +15 -33
- oarepo_runtime/records/systemfields/mapping.py +41 -24
- oarepo_runtime/records/systemfields/publication_status.py +59 -0
- oarepo_runtime/services/__init__.py +12 -0
- oarepo_runtime/services/config/__init__.py +15 -21
- oarepo_runtime/services/config/link_conditions.py +69 -75
- oarepo_runtime/services/config/permissions.py +62 -0
- oarepo_runtime/services/records/__init__.py +14 -1
- oarepo_runtime/services/records/links.py +21 -11
- oarepo_runtime/services/records/mapping.py +42 -0
- oarepo_runtime/services/results.py +98 -109
- oarepo_runtime/services/schema/__init__.py +12 -44
- oarepo_runtime/services/schema/i18n.py +47 -22
- oarepo_runtime/services/schema/i18n_ui.py +61 -24
- {oarepo_runtime-1.10.2.dist-info → oarepo_runtime-2.0.0.dev3.dist-info}/METADATA +9 -21
- oarepo_runtime-2.0.0.dev3.dist-info/RECORD +30 -0
- {oarepo_runtime-1.10.2.dist-info → oarepo_runtime-2.0.0.dev3.dist-info}/WHEEL +1 -2
- oarepo_runtime-2.0.0.dev3.dist-info/entry_points.txt +5 -0
- oarepo_runtime/cli/assets.py +0 -145
- oarepo_runtime/cli/base.py +0 -25
- oarepo_runtime/cli/cf.py +0 -15
- oarepo_runtime/cli/check.py +0 -167
- oarepo_runtime/cli/configuration.py +0 -51
- oarepo_runtime/cli/fixtures.py +0 -167
- oarepo_runtime/cli/index.py +0 -272
- oarepo_runtime/cli/permissions/__init__.py +0 -6
- oarepo_runtime/cli/permissions/base.py +0 -26
- oarepo_runtime/cli/permissions/evaluate.py +0 -63
- oarepo_runtime/cli/permissions/list.py +0 -239
- oarepo_runtime/cli/permissions/search.py +0 -121
- oarepo_runtime/cli/validate.py +0 -150
- oarepo_runtime/datastreams/__init__.py +0 -38
- oarepo_runtime/datastreams/asynchronous.py +0 -247
- oarepo_runtime/datastreams/catalogue.py +0 -150
- oarepo_runtime/datastreams/datastreams.py +0 -152
- oarepo_runtime/datastreams/errors.py +0 -54
- oarepo_runtime/datastreams/ext.py +0 -41
- oarepo_runtime/datastreams/fixtures.py +0 -265
- oarepo_runtime/datastreams/json.py +0 -4
- oarepo_runtime/datastreams/readers/__init__.py +0 -39
- oarepo_runtime/datastreams/readers/attachments.py +0 -51
- oarepo_runtime/datastreams/readers/excel.py +0 -123
- oarepo_runtime/datastreams/readers/json.py +0 -27
- oarepo_runtime/datastreams/readers/service.py +0 -54
- oarepo_runtime/datastreams/readers/yaml.py +0 -14
- oarepo_runtime/datastreams/semi_asynchronous.py +0 -91
- oarepo_runtime/datastreams/synchronous.py +0 -70
- oarepo_runtime/datastreams/transformers.py +0 -18
- oarepo_runtime/datastreams/types.py +0 -323
- oarepo_runtime/datastreams/utils.py +0 -131
- oarepo_runtime/datastreams/writers/__init__.py +0 -21
- oarepo_runtime/datastreams/writers/attachments_file.py +0 -92
- oarepo_runtime/datastreams/writers/attachments_service.py +0 -118
- oarepo_runtime/datastreams/writers/publish.py +0 -70
- oarepo_runtime/datastreams/writers/service.py +0 -175
- oarepo_runtime/datastreams/writers/utils.py +0 -30
- oarepo_runtime/datastreams/writers/validation_errors.py +0 -20
- oarepo_runtime/datastreams/writers/yaml.py +0 -56
- oarepo_runtime/ext_config.py +0 -67
- oarepo_runtime/i18n/__init__.py +0 -3
- oarepo_runtime/info/__init__.py +0 -0
- oarepo_runtime/info/check.py +0 -95
- oarepo_runtime/info/permissions/__init__.py +0 -0
- oarepo_runtime/info/permissions/debug.py +0 -191
- oarepo_runtime/info/views.py +0 -586
- oarepo_runtime/profile.py +0 -60
- oarepo_runtime/records/dumpers/__init__.py +0 -8
- oarepo_runtime/records/dumpers/edtf_interval.py +0 -38
- oarepo_runtime/records/dumpers/multilingual_dumper.py +0 -34
- oarepo_runtime/records/entity_resolvers/__init__.py +0 -13
- oarepo_runtime/records/entity_resolvers/proxies.py +0 -57
- oarepo_runtime/records/mappings/__init__.py +0 -0
- oarepo_runtime/records/mappings/rdm_parent_mapping.json +0 -483
- oarepo_runtime/records/owners/__init__.py +0 -3
- oarepo_runtime/records/owners/registry.py +0 -22
- oarepo_runtime/records/relations/__init__.py +0 -22
- oarepo_runtime/records/relations/base.py +0 -296
- oarepo_runtime/records/relations/internal.py +0 -46
- oarepo_runtime/records/relations/lookup.py +0 -28
- oarepo_runtime/records/relations/pid_relation.py +0 -102
- oarepo_runtime/records/systemfields/featured_file.py +0 -45
- oarepo_runtime/records/systemfields/has_draftcheck.py +0 -47
- oarepo_runtime/records/systemfields/icu.py +0 -371
- oarepo_runtime/records/systemfields/owner.py +0 -115
- oarepo_runtime/records/systemfields/record_status.py +0 -35
- oarepo_runtime/records/systemfields/selectors.py +0 -98
- oarepo_runtime/records/systemfields/synthetic.py +0 -130
- oarepo_runtime/resources/__init__.py +0 -4
- oarepo_runtime/resources/config.py +0 -12
- oarepo_runtime/resources/file_resource.py +0 -15
- oarepo_runtime/resources/json_serializer.py +0 -27
- oarepo_runtime/resources/localized_ui_json_serializer.py +0 -54
- oarepo_runtime/resources/resource.py +0 -53
- oarepo_runtime/resources/responses.py +0 -20
- oarepo_runtime/services/components.py +0 -429
- oarepo_runtime/services/config/draft_link.py +0 -23
- oarepo_runtime/services/config/permissions_presets.py +0 -174
- oarepo_runtime/services/config/service.py +0 -117
- oarepo_runtime/services/custom_fields/__init__.py +0 -80
- oarepo_runtime/services/custom_fields/mappings.py +0 -188
- oarepo_runtime/services/entity/__init__.py +0 -0
- oarepo_runtime/services/entity/config.py +0 -14
- oarepo_runtime/services/entity/schema.py +0 -9
- oarepo_runtime/services/entity/service.py +0 -48
- oarepo_runtime/services/expansions/__init__.py +0 -0
- oarepo_runtime/services/expansions/expandable_fields.py +0 -21
- oarepo_runtime/services/expansions/service.py +0 -4
- oarepo_runtime/services/facets/__init__.py +0 -33
- oarepo_runtime/services/facets/base.py +0 -12
- oarepo_runtime/services/facets/date.py +0 -72
- oarepo_runtime/services/facets/enum.py +0 -11
- oarepo_runtime/services/facets/facet_groups_names.py +0 -17
- oarepo_runtime/services/facets/max_facet.py +0 -13
- oarepo_runtime/services/facets/multilingual_facet.py +0 -33
- oarepo_runtime/services/facets/nested_facet.py +0 -32
- oarepo_runtime/services/facets/params.py +0 -192
- oarepo_runtime/services/facets/year_histogram.py +0 -200
- oarepo_runtime/services/files/__init__.py +0 -8
- oarepo_runtime/services/files/components.py +0 -62
- oarepo_runtime/services/files/service.py +0 -16
- oarepo_runtime/services/generators.py +0 -10
- oarepo_runtime/services/permissions/__init__.py +0 -3
- oarepo_runtime/services/permissions/generators.py +0 -103
- oarepo_runtime/services/relations/__init__.py +0 -0
- oarepo_runtime/services/relations/components.py +0 -15
- oarepo_runtime/services/relations/errors.py +0 -18
- oarepo_runtime/services/relations/mapping.py +0 -38
- oarepo_runtime/services/schema/cf.py +0 -13
- oarepo_runtime/services/schema/i18n_validation.py +0 -7
- oarepo_runtime/services/schema/marshmallow.py +0 -44
- oarepo_runtime/services/schema/marshmallow_to_json_schema.py +0 -72
- oarepo_runtime/services/schema/oneofschema.py +0 -192
- oarepo_runtime/services/schema/polymorphic.py +0 -21
- oarepo_runtime/services/schema/rdm.py +0 -75
- oarepo_runtime/services/schema/rdm_ui.py +0 -156
- oarepo_runtime/services/schema/ui.py +0 -251
- oarepo_runtime/services/schema/validation.py +0 -70
- oarepo_runtime/services/search.py +0 -282
- oarepo_runtime/services/service.py +0 -61
- oarepo_runtime/tasks.py +0 -6
- oarepo_runtime/translations/cs/LC_MESSAGES/messages.mo +0 -0
- oarepo_runtime/translations/cs/LC_MESSAGES/messages.po +0 -85
- oarepo_runtime/translations/default_translations.py +0 -6
- oarepo_runtime/translations/en/LC_MESSAGES/messages.mo +0 -0
- oarepo_runtime/translations/en/LC_MESSAGES/messages.po +0 -89
- oarepo_runtime/translations/messages.pot +0 -91
- oarepo_runtime/uow.py +0 -146
- oarepo_runtime/utils/__init__.py +0 -0
- oarepo_runtime/utils/functools.py +0 -37
- oarepo_runtime/utils/identity_utils.py +0 -35
- oarepo_runtime/utils/index.py +0 -11
- oarepo_runtime/utils/path.py +0 -97
- oarepo_runtime-1.10.2.dist-info/RECORD +0 -163
- oarepo_runtime-1.10.2.dist-info/entry_points.txt +0 -16
- oarepo_runtime-1.10.2.dist-info/top_level.txt +0 -2
- tests/marshmallow_to_json/__init__.py +0 -0
- tests/marshmallow_to_json/test_datacite_ui_schema.py +0 -1410
- tests/marshmallow_to_json/test_simple_schema.py +0 -52
- tests/pkg_data/__init__.py +0 -0
- {oarepo_runtime-1.10.2.dist-info → oarepo_runtime-2.0.0.dev3.dist-info}/licenses/LICENSE +0 -0
oarepo_runtime/datastreams/fixtures.py

```diff
@@ -1,265 +0,0 @@
-import logging
-import re
-from pathlib import Path
-
-import pkg_resources
-import yaml
-from celery import shared_task
-from flask import current_app
-from invenio_access.permissions import system_identity
-from invenio_records_resources.proxies import current_service_registry
-
-from oarepo_runtime.datastreams import (
-    DataStreamCatalogue,
-    StreamBatch,
-    SynchronousDataStream,
-)
-from oarepo_runtime.datastreams.types import StatsKeepingDataStreamCallback
-
-log = logging.getLogger("fixtures")
-
-
-class FixturesCallback(StatsKeepingDataStreamCallback):
-    def fixture_started(self, fixture_name):
-        pass
-
-    def fixture_finished(self, fixture_name):
-        pass
-
-
-def load_fixtures(
-    fixture_dir_or_catalogue=None,
-    include=None,
-    exclude=None,
-    system_fixtures=True,
-    callback: FixturesCallback = None,
-    batch_size=100,
-    datastreams_impl=SynchronousDataStream,
-    identity=system_identity,
-):
-    """
-    Loads fixtures. If fixture dir is set, fixtures are loaded from that directory first.
-    The directory must contain a catalogue.yaml file containing datastreams to load the
-    fixtures. The format of the catalogue is described in the 'catalogue.py' file.
-
-    Then fixture loading continues with fixtures defined in `oarepo.fixtures` entrypoint.
-    The entry points are sorted and those with the greatest `name` are processed first -
-    so the recommendation is to call the entry points 0000-something, where 0000 is a 4-digit
-    number. oarepo entry points always have this number set to 1000.
-
-    If a datastream is loaded from one fixture, it will not be loaded again from another fixture.
-    If you want to override the default fixtures, just register your own with a key bigger than 1000.
-    """
-    include = [re.compile(x) for x in (include or [])]
-    exclude = [re.compile(x) for x in (exclude or [])]
-    fixtures = set()
-
-    if fixture_dir_or_catalogue:
-        if Path(fixture_dir_or_catalogue).is_dir():
-            fixture_catalogue = Path(fixture_dir_or_catalogue) / "catalogue.yaml"
-        else:
-            fixture_catalogue = Path(fixture_dir_or_catalogue)
-
-        catalogue = DataStreamCatalogue(fixture_catalogue)
-        _load_fixtures_from_catalogue(
-            catalogue,
-            fixtures,
-            include,
-            exclude,
-            callback,
-            batch_size=batch_size,
-            datastreams_impl=datastreams_impl,
-            identity=identity,
-        )
-
-    if system_fixtures:
-
-        def get_priority(name):
-            match = re.match(r"(\d+)-", name)
-            if match:
-                return -int(match.group(1))
-            return 0
-
-        entry_points = list(
-            (get_priority(r.name), r.name, r)
-            for r in pkg_resources.iter_entry_points("oarepo.fixtures")
-        )
-        entry_points.sort(key=lambda x: x[:2])
-        for r in entry_points:
-            pkg = r[2].load()
-            pkg_fixture_dir = Path(pkg.__file__)
-            if pkg_fixture_dir.is_file():
-                pkg_fixture_dir = pkg_fixture_dir.parent
-            catalogue = DataStreamCatalogue(pkg_fixture_dir / "catalogue.yaml")
-            _load_fixtures_from_catalogue(
-                catalogue,
-                fixtures,
-                include,
-                exclude,
-                callback,
-                batch_size=batch_size,
-                datastreams_impl=datastreams_impl,
-                identity=identity,
-            )
-
-
-def _load_fixtures_from_catalogue(
-    catalogue,
-    fixtures,
-    include,
-    exclude,
-    callback,
-    batch_size,
-    datastreams_impl,
-    identity=system_identity,
-):
-    for catalogue_datastream in catalogue.get_datastreams():
-        if catalogue_datastream.stream_name in fixtures:
-            continue
-        if include and not any(
-            x.match(catalogue_datastream.stream_name) for x in include
-        ):
-            continue
-        if any(x.match(catalogue_datastream.stream_name) for x in exclude):
-            continue
-
-        fixtures.add(catalogue_datastream.stream_name)
-
-        if hasattr(callback, "fixture_started"):
-            callback.fixture_started(catalogue_datastream.stream_name)
-        datastream = datastreams_impl(
-            readers=catalogue_datastream.readers,
-            writers=catalogue_datastream.writers,
-            transformers=catalogue_datastream.transformers,
-            callback=callback,
-            batch_size=batch_size,
-        )
-        datastream.process(identity=identity)
-        if hasattr(callback, "fixture_finished"):
-            callback.fixture_finished(catalogue_datastream.stream_name)
-
-
-def dump_fixtures(
-    fixture_dir,
-    include=None,
-    exclude=None,
-    use_files=False,
-    callback: FixturesCallback = None,
-    datastream_impl=SynchronousDataStream,
-    batch_size=1,
-):
-    include = [re.compile(x) for x in (include or [])]
-    exclude = [
-        re.compile(x)
-        for x in (exclude or current_app.config.get("DATASTREAMS_EXCLUDES", []))
-    ]
-    fixture_dir = Path(fixture_dir)
-    if not fixture_dir.exists():
-        fixture_dir.mkdir(parents=True)
-    catalogue_path = fixture_dir / "catalogue.yaml"
-    catalogue_data = {}
-
-    for service_id in current_service_registry._services:
-        config_generator = (
-            current_app.config.get(f"DATASTREAMS_CONFIG_GENERATOR_{service_id.upper()}")
-            or current_app.config["DATASTREAMS_CONFIG_GENERATOR"]
-        )
-        service = current_service_registry.get(service_id)
-        if not hasattr(service, "scan"):
-            continue
-        for fixture_name, fixture_read_config, fixture_write_config in config_generator(
-            service_id, use_files=use_files
-        ):
-            if include and not any(x.match(fixture_name) for x in include):
-                continue
-            if any(x.match(fixture_name) for x in exclude):
-                continue
-
-            catalogue_data[fixture_name] = fixture_read_config
-
-            catalogue = DataStreamCatalogue(
-                catalogue_path, {fixture_name: fixture_write_config}
-            )
-
-            for stream_name in catalogue:
-                catalogue_datastream = catalogue.get_datastream(stream_name)
-                if hasattr(callback, "fixture_started"):
-                    callback.fixture_started(stream_name)
-                datastream = datastream_impl(
-                    readers=catalogue_datastream.readers,
-                    writers=catalogue_datastream.writers,
-                    transformers=catalogue_datastream.transformers,
-                    callback=callback,
-                    batch_size=batch_size,
-                )
-                datastream.process()
-                if hasattr(callback, "fixture_finished"):
-                    callback.fixture_finished(stream_name)
-
-    with open(catalogue_path, "w") as f:
-        yaml.dump(catalogue_data, f)
-
-
-def default_config_generator(service_id, use_files=False):
-    writers = [
-        {"writer": "yaml", "target": f"{service_id}.yaml"},
-    ]
-    if use_files:
-        writers.append(
-            {"writer": "attachments_file", "target": "files"},
-        )
-
-    yield service_id, [
-        # load
-        {"writer": "service", "service": service_id},
-        {"writer": "attachments_service", "service": service_id},
-        {"source": f"{service_id}.yaml"},
-    ], [
-        # dump
-        {"reader": "service", "service": service_id, "load_files": use_files},
-        *writers,
-    ]
-
-
-@shared_task
-def fixtures_asynchronous_callback(*args, callback, **kwargs):
-    try:
-        if "batch" in kwargs:
-            batch = StreamBatch.from_json(kwargs["batch"])
-            log.info(
-                "Fixtures progress: %s in batch.seq=%s, batch.last=%s",
-                callback,
-                batch.seq,
-                batch.last,
-            )
-        else:
-            batch = None
-            log.info("Fixtures progress: %s", callback)
-
-        if "error" in callback:
-            log.error(
-                "Error in loading fixtures: %s\n%s\n%s",
-                callback,
-                "\n".join(args),
-                "\n".join(f"{kwarg}: {value}" for kwarg, value in kwargs.items()),
-            )
-
-        if batch:
-            if batch.errors:
-                log.error(
-                    "Batch errors: batch %s:\n%s",
-                    batch.seq,
-                    "\n".join(str(x) for x in batch.errors),
-                )
-
-            for entry in batch.entries:
-                if entry.errors:
-                    log.error(
-                        "Errors in entry %s of batch %s:\npayload %s\n",
-                        entry.seq,
-                        batch.seq,
-                        entry.entry,
-                        "\n".join(str(x) for x in entry.errors),
-                    )
-    except Exception:
-        print(f"Error in fixtures callback: {callback=}, {args=}, {kwargs=}")
```
oarepo_runtime/datastreams/readers/__init__.py

```diff
@@ -1,39 +0,0 @@
-import contextlib
-from abc import ABC, abstractmethod
-from pathlib import Path
-from typing import Iterator, Union
-
-from ..types import StreamEntry
-
-
-class BaseReader(ABC):
-    """Base reader."""
-
-    base_path: Union[Path, None]
-
-    def __init__(self, *, source=None, base_path=None, **kwargs):
-        """Constructor.
-        :param source: Data source (e.g. filepath, stream, ...)
-        """
-        if not source or hasattr(source, "read") or not base_path:
-            self.source = source
-        else:
-            self.source = Path(base_path).joinpath(source)
-        if base_path:
-            self.base_path = Path(base_path)
-        elif isinstance(source, (str, Path)):
-            self.base_path = Path(source).parent
-        else:
-            self.base_path = None
-
-    @abstractmethod
-    def __iter__(self) -> Iterator[StreamEntry]:
-        """Yields data objects."""
-
-    @contextlib.contextmanager
-    def _open(self, mode="r"):
-        if hasattr(self.source, "read"):
-            yield self.source
-        else:
-            with open(self.source, mode) as f:
-                yield f
```
oarepo_runtime/datastreams/readers/attachments.py

```diff
@@ -1,51 +0,0 @@
-from base64 import b64encode
-from pathlib import Path
-
-import yaml
-
-from oarepo_runtime.datastreams import BaseReader, StreamEntry
-from oarepo_runtime.datastreams.types import StreamEntryFile
-from oarepo_runtime.datastreams.writers.attachments_file import format_serial
-
-
-class AttachmentsReaderMixin(BaseReader):
-    def __init__(self, *, source=None, base_path=None, **kwargs):
-        super().__init__(source=source, base_path=base_path, **kwargs)
-        self.has_files = self.base_path and (self.base_path / "files").is_dir()
-
-    def __iter__(self):
-        """Iterate over records."""
-        se: StreamEntry
-        for idx, se in enumerate(self.iter_entries()):
-            if self.has_files:
-                file_path = (
-                    self.base_path.joinpath("files", format_serial(idx + 1)) / "data"
-                )
-                if file_path.exists():
-                    file_metadata = self.load_file_metadata(file_path)
-                    for md in file_metadata:
-                        se.files.append(
-                            StreamEntryFile(
-                                metadata=md,
-                                content_url="data:"
-                                + b64encode(
-                                    (file_path / md["key"]).read_bytes()
-                                ).decode("ascii"),
-                            )
-                        )
-            yield se
-
-    def load_file_metadata(self, file_path: Path):
-        md = "metadata.yaml"
-        while True:
-            tested_md = "meta_" + md
-            # meta_[A]metadata.yaml does not exist, so [A]metadata.yaml is the metadata file,
-            # where A is (meta_)*
-            if not (file_path / tested_md).exists():
-                with open(file_path / md) as f:
-                    return list(yaml.safe_load_all(f))
-            md = tested_md
-
-    def iter_entries(self):
-        "Return an iterator of entries"
-        return []
```
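For orientation, a sketch of the on-disk layout this mixin expects, inferred from the code above (the exact zero-padding of `format_serial` is an assumption):

```python
# Hypothetical fixtures layout consumed by AttachmentsReaderMixin (1.x):
#
#   fixtures/
#     documents.yaml          # the record entries themselves
#     files/
#       000001/               # format_serial(1) for the first entry
#         data/
#           metadata.yaml     # YAML documents describing the attached files
#           report.pdf        # file contents, referenced by metadata 'key'
#
# If a data file is itself called metadata.yaml, the real metadata moves to
# meta_metadata.yaml (and so on): load_file_metadata keeps prefixing 'meta_'
# until the name no longer exists and reads the last existing file in the chain.
```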
oarepo_runtime/datastreams/readers/excel.py

```diff
@@ -1,123 +0,0 @@
-import re
-from traceback import format_exc
-from typing import Iterator
-
-import openpyxl
-
-from ..errors import ReaderError
-from . import BaseReader, StreamEntry
-from .attachments import AttachmentsReaderMixin
-
-
-class ExcelReader(AttachmentsReaderMixin, BaseReader):
-    def iter_entries(self) -> Iterator[StreamEntry]:
-        with self._open("rb") as f:
-            try:
-                wb_obj = openpyxl.load_workbook(f)
-                sheet_obj = wb_obj.active
-            except Exception as err:
-                raise ReaderError(
-                    f"Cannot decode excel file {self.source}: {str(err)}",
-                    code="EXCEL_DECODE_ERROR",
-                    detail={
-                        "message": str(err),
-                        "exception": type(err).__name__,
-                        "stack": format_exc(limit=10),
-                    },
-                )
-
-            header, data = self.get_excel_data(sheet_obj)
-            for row in data:
-                yield StreamEntry(row)
-
-    def get_excel_data(self, sheet_obj):
-        """
-        returns an iterator (header, data)
-        """
-        header = []
-        data = []
-        it = sheet_obj.iter_rows()
-
-        try:
-            row = next_row(it)
-            while empty(row):
-                row = next_row(it)
-            while not empty(row):
-                header.append(row)
-                row = next_row(it)
-            while empty(row):
-                row = next_row(it)
-            while True:
-                if not empty(row):
-                    data.append(row)
-                row = next_row(it)
-        except StopIteration:
-            pass
-        if not data:
-            return [], self.to_dict(header)
-        else:
-            return self.to_dict(header), self.to_dict(data)
-
-    def to_dict(self, dta):
-        def is_array(val):
-            try:
-                int(val)
-                return True
-            except:
-                return False
-
-        def set_single(container, key, val):
-            try:
-                key = int(key)
-                while key >= len(container):
-                    container.append(None)
-                container[key] = val
-            except (TypeError, ValueError):
-                container[key] = val
-
-        def iterset(k, v, container):
-            while True:
-                current_key = k[0]
-                next_key = k[1] if len(k) > 1 else None
-                if not next_key:
-                    set_single(container, current_key, v)
-                    return
-                if isinstance(container, list):
-                    container.append({} if not is_array(next_key) else [])
-                    container = container[-1]
-                else:
-                    container = container.setdefault(
-                        current_key, {} if not is_array(next_key) else []
-                    )
-                k = k[1:]
-
-        def to_dict_item(header, item):
-            ret = RowDict()
-            for k, v in zip(header, item):
-                if not k:
-                    continue
-                v = v if v is not None else ""
-                v = str(v).strip()
-                if v:
-                    iterset(k, v, ret)
-            return ret
-
-        keys = [re.split("[_.]", x) if x else None for x in dta[0]]
-        return [to_dict_item(keys, d) for d in dta[1:]]
-
-
-def next_row(it):
-    return [x.value for x in next(it)]
-
-
-def empty(r):
-    for val in r:
-        if val:
-            return False
-    return True
-
-
-class RowDict(dict):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._header = {}
```
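The core trick in `to_dict` is that header cells are paths: split on `.` or `_`, with purely numeric segments treated as list indices. A standalone, simplified restatement of that convention (not the removed implementation; names here are illustrative):

```python
import re


def set_path(container, keys, value):
    """Walk/create nested dicts and lists along `keys`, then set `value`."""
    key = int(keys[0]) if keys[0].isdigit() else keys[0]
    if len(keys) == 1:
        if isinstance(container, list):
            while key >= len(container):  # pad list up to the target index
                container.append(None)
        container[key] = value
        return
    child_type = list if keys[1].isdigit() else dict  # next segment decides shape
    if isinstance(container, list):
        while key >= len(container):
            container.append(child_type())
        child = container[key]
    else:
        child = container.setdefault(key, child_type())
    set_path(child, keys[1:], value)


row = {}
for header, cell in [("metadata.title", "Test"), ("metadata.authors.0", "A. Author")]:
    set_path(row, re.split(r"[_.]", header), cell)

print(row)  # {'metadata': {'title': 'Test', 'authors': ['A. Author']}}
```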
oarepo_runtime/datastreams/readers/json.py

```diff
@@ -1,27 +0,0 @@
-import json
-from typing import Iterator
-
-from . import BaseReader, StreamEntry
-from .attachments import AttachmentsReaderMixin
-
-
-class JSONReader(AttachmentsReaderMixin, BaseReader):
-    """JSON data iterator that loads records from a single JSON array file."""
-
-    def iter_entries(self) -> Iterator[StreamEntry]:
-        """Iterate over records."""
-        with self._open() as fp:
-            data = json.load(fp)
-            assert isinstance(data, list)
-            for d in data:
-                yield StreamEntry(d)
-
-
-class JSONLinesReader(BaseReader):
-    """JSON Lines data iterator that loads records from JSON Lines files."""
-
-    def __iter__(self) -> Iterator[StreamEntry]:
-        """Iterate over records."""
-        with self._open() as fp:
-            for line in fp:
-                yield StreamEntry(json.loads(line))
```
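The two readers accept different input shapes; a quick comparison (hypothetical data):

```python
# JSONReader       reads a whole file holding one JSON array:
#   [{"id": "1"}, {"id": "2"}]
#
# JSONLinesReader  reads one JSON object per line (JSON Lines):
#   {"id": "1"}
#   {"id": "2"}
```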
oarepo_runtime/datastreams/readers/service.py

```diff
@@ -1,54 +0,0 @@
-from base64 import b64encode
-from typing import List
-
-from invenio_access.permissions import system_identity
-from invenio_records_resources.proxies import current_service_registry
-
-from ..types import StreamEntryFile
-from ..utils import get_file_service_for_record_class
-from . import BaseReader, StreamEntry
-
-
-class ServiceReader(BaseReader):
-    """Reads entries from a repository instance using a Service object."""
-
-    def __init__(self, *, service=None, identity=None, load_files=False, **kwargs):
-        """Constructor.
-        :param service: a service instance or a key of the
-            service registry.
-        :param identity: access identity.
-        :param load_files: if True, file contents are read and attached to entries.
-        """
-        super().__init__(**kwargs)
-
-        if isinstance(service, str):
-            service = current_service_registry.get(service)
-
-        self._service = service
-        self._identity = identity or system_identity
-        self._file_service = None
-        self._record_cls = getattr(self._service.config, "record_cls", None)
-
-        if self._record_cls and load_files:
-            # try to get file service
-            self._file_service = get_file_service_for_record_class(self._record_cls)
-
-    def __iter__(self):
-        for idx, entry in enumerate(self._service.scan(self._identity)):
-            files: List[StreamEntryFile] = []
-            if self._file_service:
-                for f in self._file_service.list_files(
-                    self._identity, entry["id"]
-                ).entries:
-                    file_item = self._file_service.get_file_content(
-                        self._identity, entry["id"], f["key"]
-                    )
-                    with file_item.open_stream("rb") as ff:
-                        base64_content = b64encode(ff.read()).decode("ascii")
-                    files.append(
-                        StreamEntryFile(
-                            metadata=f, content_url=f"data:{base64_content}"
-                        )
-                    )
-
-            yield StreamEntry(entry, files=files)
```
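A sketch of driving this reader directly to export all records of a registered service (1.x; "records" is a hypothetical service id):

```python
from oarepo_runtime.datastreams.readers.service import ServiceReader

reader = ServiceReader(service="records", load_files=True)
for stream_entry in reader:  # yields StreamEntry objects
    print(stream_entry.entry["id"], len(stream_entry.files))
```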
oarepo_runtime/datastreams/readers/yaml.py

```diff
@@ -1,14 +0,0 @@
-import yaml
-
-from . import BaseReader, StreamEntry
-from .attachments import AttachmentsReaderMixin
-
-
-class YamlReader(AttachmentsReaderMixin, BaseReader):
-    """YAML data iterator that loads records from YAML files."""
-
-    def iter_entries(self):
-        """Iterate over records."""
-        with self._open() as fp:
-            for entry in yaml.safe_load_all(fp):
-                yield StreamEntry(entry)
```
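Because entries come from `yaml.safe_load_all`, one fixture file can hold many records separated by document markers (hypothetical content):

```python
# documents.yaml -- each '---'-separated document becomes one StreamEntry
#
#   title: First record
#   ---
#   title: Second record
```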
oarepo_runtime/datastreams/semi_asynchronous.py

```diff
@@ -1,91 +0,0 @@
-#
-# This package was taken from Invenio vocabularies and modified to be more universal
-#
-import logging
-from typing import List
-
-from celery import shared_task
-from celery.canvas import Signature as CelerySignature
-
-from .asynchronous import (
-    AsynchronousDataStream,
-    AsynchronousDataStreamChain,
-    deserialize_identity,
-    serialize_identity,
-)
-from .datastreams import DataStreamChain, Signature
-from .transformers import BaseTransformer
-from .types import JSONObject, StreamBatch, StreamEntryError
-
-log = logging.getLogger("datastreams")
-
-
-class SemiAsynchronousDataStreamChain(AsynchronousDataStreamChain):
-    def _prepare_chain(self, callback: CelerySignature):
-        serialized_identity = serialize_identity(self._identity)
-        return run_semi_asynchronous_datastream_processor.s(
-            transformers=[tr.json for tr in self._transformers],
-            writers=[wr.json for wr in self._writers],
-            identity=serialized_identity,
-            callback=callback,
-        )
-
-
-class SemiAsynchronousDataStream(AsynchronousDataStream):
-    """Data stream."""
-
-    def build_chain(self, identity) -> DataStreamChain:
-        return SemiAsynchronousDataStreamChain(
-            transformers=self._transformers,
-            writers=self._writers,
-            on_background=self._on_background,
-            identity=identity,
-        )
-
-
-@shared_task
-def run_semi_asynchronous_datastream_processor(
-    batch: JSONObject,
-    *,
-    transformers: List[JSONObject],
-    writers: List[JSONObject],
-    identity: JSONObject,
-    callback: CelerySignature,
-):
-    """Run datastream processor."""
-
-    callback.apply(kwargs={"callback": "batch_started", "batch": batch})
-
-    batch = StreamBatch.from_json(batch)
-    identity = deserialize_identity(identity)
-
-    for signature in (transformers or []) + (writers or []):
-        signature = Signature.from_json(signature)
-        try:
-            processor = signature.resolve(identity=identity)
-            if isinstance(processor, BaseTransformer):
-                batch = processor.apply(batch) or batch
-            else:
-                batch = processor.write(batch) or batch
-        except Exception as ex:
-            if log.getEffectiveLevel():
-                log.error(
-                    "Unexpected error in %s: %s",
-                    repr(signature),
-                    repr(batch),
-                )
-            err = StreamEntryError.from_exception(ex)
-            batch.errors.append(err)
-            callback.apply(
-                (),
-                {
-                    "batch": batch.json,
-                    "identity": serialize_identity(identity),
-                    "callback": f"{signature.kind.value}_error",
-                    "exception": err.json,
-                },
-            )
-
-    callback.apply(kwargs={"callback": "batch_finished", "batch": batch.json})
-
-    return None  # do not return anything to avoid redis pollution
```
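A sketch of wiring this into fixture loading (1.x): reading stays in-process while each batch's transformers and writers run as a single Celery task. Whether the asynchronous chains require a Celery-signature callback exactly like this is an assumption:

```python
from oarepo_runtime.datastreams.fixtures import (
    fixtures_asynchronous_callback,
    load_fixtures,
)
from oarepo_runtime.datastreams.semi_asynchronous import SemiAsynchronousDataStream

load_fixtures(
    "fixtures",  # hypothetical path containing catalogue.yaml
    datastreams_impl=SemiAsynchronousDataStream,
    callback=fixtures_asynchronous_callback.s(),  # logs progress and errors
)
```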