dataverse-utils 0.22.4__tar.gz → 0.22.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/PKG-INFO +3 -3
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/pyproject.toml +3 -3
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/__init__.py +4 -4
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/collections.py +255 -50
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/ldc.py +7 -5
- dataverse_utils-0.22.8/src/dataverse_utils/scripts/dv_collection_info.py +297 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_ldc_uploader.py +11 -7
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_release.py +1 -2
- dataverse_utils-0.22.4/src/dataverse_utils/scripts/dv_collection_info.py +0 -244
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/LICENCE.md +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/README.md +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/data/LDC_EULA_general.md +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/dataverse_utils.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/dvdata.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_del.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_list_files.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_manifest_gen.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_pg_facet_date.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_readme_creator.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_record_copy.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_replace_licence.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_study_migrator.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_upload_tsv.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataverse-utils
|
|
3
|
-
Version: 0.22.
|
|
3
|
+
Version: 0.22.8
|
|
4
4
|
Summary: Utilities for the Dataverse data respository system
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENCE.md
|
|
@@ -16,14 +16,14 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.13
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.14
|
|
18
18
|
Requires-Dist: bs4 (>=0.0.2,<0.0.3)
|
|
19
|
-
Requires-Dist: chardet (>=5.2
|
|
19
|
+
Requires-Dist: chardet (>=5.2)
|
|
20
20
|
Requires-Dist: dryad2dataverse (>=0.8.4,<0.9.0)
|
|
21
21
|
Requires-Dist: markdown (>=3.10.2,<4.0.0)
|
|
22
22
|
Requires-Dist: markdown-pdf (>=1.13.1,<2.0.0)
|
|
23
23
|
Requires-Dist: markdownify (>=1.2.2,<2.0.0)
|
|
24
24
|
Requires-Dist: pyreadr (>=0.5.4,<0.6.0)
|
|
25
25
|
Requires-Dist: pyreadstat (>=1.3.3,<2.0.0)
|
|
26
|
-
Requires-Dist: requests (>=2.
|
|
26
|
+
Requires-Dist: requests (>=2.33,<3.0)
|
|
27
27
|
Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
|
|
28
28
|
Requires-Dist: tqdm (>=4.67.3,<5.0.0)
|
|
29
29
|
Project-URL: Homepage, https://ubc-library-rc.github.io/dataverse_utils
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "dataverse-utils"
|
|
3
|
-
version = "0.22.
|
|
3
|
+
version = "0.22.8"
|
|
4
4
|
description = "Utilities for the Dataverse data respository system"
|
|
5
5
|
authors = [
|
|
6
6
|
{name = "Paul Lesack",email = "paul.lesack@ubc.ca"}
|
|
@@ -10,7 +10,6 @@ readme = "README.md"
|
|
|
10
10
|
requires-python = ">=3.10, <4.0"
|
|
11
11
|
#When requests 2.33 is released, update poetry and release
|
|
12
12
|
dependencies = [
|
|
13
|
-
"requests (>=2.30.0,<3.0.0)",
|
|
14
13
|
"bs4 (>=0.0.2,<0.0.3)",
|
|
15
14
|
"markdown (>=3.10.2,<4.0.0)",
|
|
16
15
|
"markdown-pdf (>=1.13.1,<2.0.0)",
|
|
@@ -20,7 +19,8 @@ dependencies = [
|
|
|
20
19
|
"requests-toolbelt (>=1.0.0,<2.0.0)",
|
|
21
20
|
"tqdm (>=4.67.3,<5.0.0)",
|
|
22
21
|
"dryad2dataverse (>=0.8.4,<0.9.0)",
|
|
23
|
-
"chardet (>=5.2
|
|
22
|
+
"chardet (>=5.2)",
|
|
23
|
+
"requests (>=2.33,<3.0)"
|
|
24
24
|
]
|
|
25
25
|
#Chardet and requests will need to be changed when requests goes to 2.32; at that
|
|
26
26
|
#point just remove chardet
|
|
@@ -7,7 +7,7 @@ import pathlib
|
|
|
7
7
|
import sys
|
|
8
8
|
from dataverse_utils.dataverse_utils import *
|
|
9
9
|
|
|
10
|
-
VERSION = (0, 22,
|
|
10
|
+
VERSION = (0, 22, 8)
|
|
11
11
|
__version__ = '.'.join([str(x) for x in VERSION])
|
|
12
12
|
|
|
13
13
|
USERAGENT = (f'dataverse_utils/v{__version__} ({sys.platform.capitalize()}); '
|
|
@@ -15,14 +15,14 @@ USERAGENT = (f'dataverse_utils/v{__version__} ({sys.platform.capitalize()}); '
|
|
|
15
15
|
UAHEADER = {'User-agent' : USERAGENT}
|
|
16
16
|
|
|
17
17
|
SCRIPT_VERSIONS={
|
|
18
|
-
'dv_collection_info' : (0,
|
|
18
|
+
'dv_collection_info' : (0, 4, 0),
|
|
19
19
|
'dv_del' : (0, 2, 4),
|
|
20
|
-
'dv_ldc_uploader' : (0,
|
|
20
|
+
'dv_ldc_uploader' : (0, 4, 1),
|
|
21
21
|
'dv_list_files' : (0, 1, 1),
|
|
22
22
|
'dv_manifest_gen' : (0, 5, 1),
|
|
23
23
|
'dv_pg_facet_date' : (0, 1, 1),
|
|
24
24
|
'dv_record_copy' : (0, 1, 2),
|
|
25
|
-
'dv_release' : (0, 1,
|
|
25
|
+
'dv_release' : (0, 1, 3),
|
|
26
26
|
'dv_replace_licence' : (0, 1, 1),
|
|
27
27
|
'dv_readme_creator' : (0, 1, 1),
|
|
28
28
|
'dv_study_migrator' : (0, 4, 1),
|
|
@@ -8,8 +8,11 @@ import datetime
|
|
|
8
8
|
import io
|
|
9
9
|
import logging
|
|
10
10
|
import pathlib
|
|
11
|
+
import random
|
|
11
12
|
import string
|
|
13
|
+
import sys
|
|
12
14
|
import tempfile
|
|
15
|
+
import time
|
|
13
16
|
import textwrap
|
|
14
17
|
import typing
|
|
15
18
|
import traceback
|
|
@@ -33,12 +36,59 @@ RETRY = Retry(total=10,
|
|
|
33
36
|
allowed_methods=['HEAD', 'GET', 'OPTIONS',
|
|
34
37
|
'POST', 'PUT'],
|
|
35
38
|
backoff_factor=1)
|
|
39
|
+
BAR_FORMAT='{l_bar}{bar}{n_fmt}/{total_fmt} : time remaining - {remaining}'
|
|
36
40
|
|
|
37
41
|
class MetadataError(Exception):
|
|
38
42
|
'''
|
|
39
43
|
MetadataError
|
|
40
44
|
'''
|
|
41
45
|
|
|
46
|
+
class RateLimiter:
    '''
    Pauses for a random interval between requests.

    The configuration lives in `self.kwargs`. When rate limiting is
    off, both bounds are forced to 0 so that `rate_limit` does not wait.
    '''
    #pylint: disable=too-few-public-methods
    def __init__(self, **kwargs):
        '''
        Parameters
        ----------
        **kwargs

        Other parameters
        ----------------
        rate_limit_on: bool
            Turn on rate limit for requests

        rate_limit_min : int
            Minimum time between requests in seconds.
            Defaults to 0 if missing or None.

        rate_limit_max : int
            Maximum time between requests in seconds.
            Defaults to rate_limit_min if missing or None.

        session : requests.Session
            A requests session if available, to help
            ensure against having too many open connections.
            Stored with the other keyword arguments; the limiter
            itself does not use it.

        Notes
        -----
        The rate limiter will wait for a random interval between
        rate_limit_min and rate_limit_max. Obviously, if you want
        a constant interval, set them to be equal.
        '''
        self.kwargs = kwargs

        if not self.kwargs.get('rate_limit_on', False):
            self.kwargs['rate_limit_on'] = False
            self.kwargs['rate_limit_min'] = 0
            self.kwargs['rate_limit_max'] = 0
            return

        #Rate limiting is on: make sure both bounds exist and are numbers,
        #otherwise rate_limit() fails with KeyError (missing bound) or
        #TypeError (bound explicitly passed as None).
        if self.kwargs.get('rate_limit_min') is None:
            self.kwargs['rate_limit_min'] = 0
        if self.kwargs.get('rate_limit_max') is None:
            self.kwargs['rate_limit_max'] = self.kwargs['rate_limit_min']

    def rate_limit(self):
        '''
        Sleep before requests for the time set by the rate limits
        '''
        time.sleep(random.uniform(self.kwargs['rate_limit_min'],
                                  self.kwargs['rate_limit_max']))
|
|
91
|
+
|
|
42
92
|
class DvCollection:
|
|
43
93
|
'''
|
|
44
94
|
Metadata for an *entire* dataverse collection, recursively.
|
|
@@ -66,7 +116,29 @@ class DvCollection:
|
|
|
66
116
|
----------------
|
|
67
117
|
timeout : int
|
|
68
118
|
retry timeout in seconds
|
|
119
|
+
|
|
120
|
+
rate_limit_on: bool
|
|
121
|
+
Turn on rate limit for requests
|
|
122
|
+
|
|
123
|
+
rate_limit_min : int
|
|
124
|
+
Minimum time between requests in seconds
|
|
125
|
+
|
|
126
|
+
rate_limit_max : int
|
|
127
|
+
Maximum time between requests in seconds
|
|
128
|
+
|
|
129
|
+
session : requests.Session
|
|
130
|
+
A requests session if available, to help
|
|
131
|
+
ensure against having too many open connections
|
|
132
|
+
|
|
133
|
+
Notes
|
|
134
|
+
-----
|
|
135
|
+
The rate limiter will wait for a random interval between
|
|
136
|
+
rate_limit_min and rate_limit_max. Obviously, if you want
|
|
137
|
+
a constant interval, set them to be equal.
|
|
138
|
+
|
|
69
139
|
'''
|
|
140
|
+
self.kwargs = kwargs
|
|
141
|
+
self.limit = RateLimiter(**kwargs)
|
|
70
142
|
self.coll = coll
|
|
71
143
|
self.url = self.__clean_url(url)
|
|
72
144
|
self.headers = None
|
|
@@ -80,11 +152,27 @@ class DvCollection:
|
|
|
80
152
|
self.retry_strategy = RETRY
|
|
81
153
|
else:
|
|
82
154
|
self.retry_strategy = kwargs['retry']
|
|
83
|
-
self.session = requests.Session()
|
|
84
|
-
self.session.mount('https://',
|
|
85
|
-
requests.adapters.HTTPAdapter(max_retries=self.retry_strategy))
|
|
86
155
|
self.collections = None
|
|
156
|
+
self.session = kwargs.get('session', requests.Session())
|
|
157
|
+
self.session.mount('https://',
|
|
158
|
+
requests.adapters.HTTPAdapter(max_retries=RETRY))
|
|
87
159
|
self.studies = None
|
|
160
|
+
self.__root = None
|
|
161
|
+
self.all_colls = [self.root]
|
|
162
|
+
|
|
163
|
+
@property
def root(self):
    '''
    Return the name and short name of the top level collection

    Returns
    -------
    tuple
        (name, alias) read from the `data` object returned by
        /api/dataverses/{coll}. The request is made on first access
        and the result is cached for later calls.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    '''
    if not self.__root:
        self.limit.rate_limit()
        resp = self.session.get(f'{self.url}/api/dataverses/{self.coll}',
                                headers=self.headers,
                                timeout=self.kwargs.get('timeout', 15))
        resp.raise_for_status()
        #Parse the body once; every call to .json() decodes it again
        data = resp.json()['data']
        self.__root = (data['name'], data['alias'])
    return self.__root
|
|
88
176
|
|
|
89
177
|
def __clean_url(self, badurl:str):
|
|
90
178
|
'''
|
|
@@ -105,11 +193,14 @@ class DvCollection:
|
|
|
105
193
|
'''
|
|
106
194
|
Get collection short name.
|
|
107
195
|
'''
|
|
108
|
-
|
|
196
|
+
self.limit.rate_limit()
|
|
197
|
+
shortname = self.session.get(f'{self.url}/api/dataverses/{dvid}',
|
|
198
|
+
headers=self.headers,
|
|
199
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
109
200
|
shortname.raise_for_status()
|
|
110
201
|
return shortname.json()['data']['alias']
|
|
111
202
|
|
|
112
|
-
def get_collections(self, coll:str=None, output=None
|
|
203
|
+
def get_collections(self, coll:str=None, output=None)->list:#pylint: disable=unused-argument
|
|
113
204
|
'''
|
|
114
205
|
Get a [recursive] listing of all dataverses in a collection.
|
|
115
206
|
|
|
@@ -119,16 +210,15 @@ class DvCollection:
|
|
|
119
210
|
Collection short name or id
|
|
120
211
|
output : list, optional, default=[]
|
|
121
212
|
output list to append to
|
|
122
|
-
**kwargs : dict
|
|
123
|
-
Other keyword arguments
|
|
124
|
-
|
|
125
213
|
'''
|
|
126
214
|
if not output:
|
|
127
215
|
output = []
|
|
128
216
|
if not coll:
|
|
129
217
|
coll = self.coll
|
|
218
|
+
self.limit.rate_limit()
|
|
130
219
|
x = self.session.get(f'{self.url}/api/dataverses/{coll}/contents',
|
|
131
|
-
|
|
220
|
+
headers=self.headers,
|
|
221
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
132
222
|
data = x.json().get('data')
|
|
133
223
|
#---
|
|
134
224
|
#Because it's possible that permissions errors can cause API read errors,
|
|
@@ -142,7 +232,6 @@ class DvCollection:
|
|
|
142
232
|
out=self.__get_shortname(_['id'])
|
|
143
233
|
dvs.append((_['title'], out))
|
|
144
234
|
except Exception as e:
|
|
145
|
-
|
|
146
235
|
obscure_error = f'''
|
|
147
236
|
An error has occured where a collection can be
|
|
148
237
|
identified by ID but its name cannot be determined.
|
|
@@ -155,12 +244,13 @@ class DvCollection:
|
|
|
155
244
|
|
|
156
245
|
Problematic collection id number: {_.get("id",
|
|
157
246
|
"not available")}'''
|
|
158
|
-
|
|
159
|
-
print(
|
|
247
|
+
#to sys.stdout?
|
|
248
|
+
print(50*'-', file=sys.stderr)
|
|
249
|
+
print(textwrap.dedent(obscure_error), file=sys.stderr)
|
|
160
250
|
print(e)
|
|
161
251
|
LOGGER.error(textwrap.fill(textwrap.dedent(obscure_error).strip()))
|
|
162
252
|
traceback.print_exc()
|
|
163
|
-
print(50*'-')
|
|
253
|
+
print(50*'-', file=sys.stderr)
|
|
164
254
|
raise e
|
|
165
255
|
#---
|
|
166
256
|
if not dvs:
|
|
@@ -171,6 +261,8 @@ class DvCollection:
|
|
|
171
261
|
LOGGER.debug('recursive')
|
|
172
262
|
self.get_collections(dv[1], output)
|
|
173
263
|
self.collections = output
|
|
264
|
+
if self.root not in self.collections:
|
|
265
|
+
self.collections.insert(0, self.root)
|
|
174
266
|
return output
|
|
175
267
|
|
|
176
268
|
def get_studies(self, root:str=None):
|
|
@@ -185,10 +277,15 @@ class DvCollection:
|
|
|
185
277
|
all_studies = []
|
|
186
278
|
if not root:
|
|
187
279
|
root=self.coll
|
|
188
|
-
|
|
189
|
-
#
|
|
280
|
+
#Redundant, as root is now added to get_collections
|
|
281
|
+
#all_studies = self.get_collection_listing(root)
|
|
282
|
+
all_studies = []
|
|
190
283
|
collections = self.get_collections(root)
|
|
191
|
-
for collection in collections
|
|
284
|
+
for collection in tqdm.tqdm(collections,
|
|
285
|
+
desc='collections',
|
|
286
|
+
unit='collection',
|
|
287
|
+
leave=False,
|
|
288
|
+
bar_format=BAR_FORMAT):
|
|
192
289
|
all_studies.extend(self.get_collection_listing(collection[1]))
|
|
193
290
|
self.studies = all_studies
|
|
194
291
|
return all_studies
|
|
@@ -202,17 +299,31 @@ class DvCollection:
|
|
|
202
299
|
coll_id : str
|
|
203
300
|
Short name or id of a dataverse collection
|
|
204
301
|
'''
|
|
302
|
+
self.limit.rate_limit()
|
|
205
303
|
cl = self.session.get(f'{self.url}/api/dataverses/{coll_id}/contents',
|
|
206
|
-
|
|
304
|
+
headers=self.headers,
|
|
305
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
207
306
|
cl.raise_for_status()
|
|
208
307
|
pids = [f"{z['protocol']}:{z['authority']}/{z['identifier']}"
|
|
209
308
|
for z in cl.json()['data'] if z['type'] == 'dataset']
|
|
210
|
-
|
|
309
|
+
#Pass collection info into the study because that's not available from
|
|
310
|
+
#a metadata download
|
|
311
|
+
smkwargs = [{'collection_name':_[0] , 'collection_short_name':_[1]}
|
|
312
|
+
for _ in self.collections if coll_id == _[1]][0]
|
|
313
|
+
#out = [(self.get_study_info(pid, **smkwargs), pid) for pid in pids]
|
|
314
|
+
out = []
|
|
315
|
+
for pid in tqdm.tqdm(pids,
|
|
316
|
+
desc=smkwargs.get('collection_short_name', 'collection'),
|
|
317
|
+
unit='study',
|
|
318
|
+
leave=False,
|
|
319
|
+
colour='red',
|
|
320
|
+
bar_format=BAR_FORMAT):
|
|
321
|
+
out.append((self.get_study_info(pid, **smkwargs), pid))
|
|
211
322
|
for _ in out:
|
|
212
323
|
_[0].update({'pid': _[1]})
|
|
213
324
|
return [x[0] for x in out]
|
|
214
325
|
|
|
215
|
-
def get_study_info(self, pid):
|
|
326
|
+
def get_study_info(self, pid, **kwargs):
|
|
216
327
|
'''
|
|
217
328
|
Returns a StudyMetadata object with complete metadata for a study.
|
|
218
329
|
|
|
@@ -220,13 +331,19 @@ class DvCollection:
|
|
|
220
331
|
----------
|
|
221
332
|
pid : str
|
|
222
333
|
Persistent ID of a Dataverse study
|
|
334
|
+
|
|
335
|
+
**kwargs
|
|
336
|
+
Other useful information to pass onto StudyMetadata, such as collection info, etc.
|
|
223
337
|
'''
|
|
338
|
+
self.limit.rate_limit()
|
|
224
339
|
meta = self.session.get(f'{self.url}/api/datasets/:persistentId',
|
|
225
|
-
|
|
226
|
-
|
|
340
|
+
params={'persistentId': pid},
|
|
341
|
+
headers=self.headers,
|
|
342
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
227
343
|
meta.raise_for_status()
|
|
228
344
|
LOGGER.debug(pid)
|
|
229
|
-
return StudyMetadata(study_meta=meta.json(), key=self.__key, url=self.url
|
|
345
|
+
return StudyMetadata(study_meta=meta.json(), key=self.__key, url=self.url,
|
|
346
|
+
session=self.session, **kwargs)
|
|
230
347
|
|
|
231
348
|
class StudyMetadata(dict):
|
|
232
349
|
'''
|
|
@@ -257,25 +374,54 @@ class StudyMetadata(dict):
|
|
|
257
374
|
key : str
|
|
258
375
|
Dataverse instance API key (needed for unpublished studies)
|
|
259
376
|
|
|
377
|
+
rate_limit_on: bool
|
|
378
|
+
Turn on rate limit for requests
|
|
379
|
+
|
|
380
|
+
rate_limit_min : int
|
|
381
|
+
Minimum time between requests in seconds
|
|
382
|
+
|
|
383
|
+
rate_limit_max : int
|
|
384
|
+
Maximum time between requests in seconds
|
|
385
|
+
|
|
386
|
+
session : requests.Session
|
|
387
|
+
A requests session if available, to help
|
|
388
|
+
ensure against having too many open connections
|
|
389
|
+
|
|
260
390
|
Notes
|
|
261
391
|
-----
|
|
262
392
|
Either `study_meta` is required OR `pid` and `url`. `key` _may_ be required
|
|
263
393
|
if either a draft study is being accessed or the Dataverse installation
|
|
264
394
|
requires API keys for all requests.
|
|
395
|
+
|
|
396
|
+
The rate limiter will wait for a random interval between
|
|
397
|
+
rate_limit_min and rate_limit_max. Obviously, if you want
|
|
398
|
+
a constant interval, set them to be equal.
|
|
399
|
+
|
|
265
400
|
'''
|
|
266
401
|
self.kwargs = kwargs
|
|
402
|
+
self.session = kwargs.get('session', requests.Session())
|
|
403
|
+
self.session.mount('https://',
|
|
404
|
+
requests.adapters.HTTPAdapter(max_retries=RETRY))
|
|
405
|
+
self.limit = RateLimiter(**kwargs)
|
|
267
406
|
self.study_meta = kwargs.get('study_meta')
|
|
268
|
-
self.all_versions =
|
|
407
|
+
self.all_versions = kwargs.get('all_versions')
|
|
269
408
|
self.url = kwargs.get('url')
|
|
270
409
|
self.pid = kwargs.get('pid')
|
|
410
|
+
#If only there would be an easy way to check if something was deaccessioned
|
|
411
|
+
#without yet another request. But right now, let's assume it's fine.
|
|
412
|
+
#See below (under Key Error) where it gets set
|
|
413
|
+
self.deaccession_flag = 0
|
|
271
414
|
if self.study_meta:
|
|
272
415
|
#self.pid = kwargs.get('pid', (f"{self.study_meta['data']['protocol']}:"
|
|
273
416
|
# f"{self.study_meta['data']['authority']}"
|
|
274
417
|
# f"/{self.study_meta['data']['identifier']}") if not
|
|
275
418
|
# self.pid else self.pid)
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
419
|
+
try:
|
|
420
|
+
self.pid = (f"{self.study_meta['data']['protocol']}:"
|
|
421
|
+
f"{self.study_meta['data']['authority']}"
|
|
422
|
+
f"/{self.study_meta['data']['identifier']}")
|
|
423
|
+
except (KeyError,) as e:
|
|
424
|
+
raise MetadataError(f'Key error: {e}') from e
|
|
279
425
|
|
|
280
426
|
self.headers = UAHEADER.copy()
|
|
281
427
|
if not (('study_meta' in kwargs) or ('url' in kwargs and 'pid' in kwargs)):
|
|
@@ -286,15 +432,23 @@ class StudyMetadata(dict):
|
|
|
286
432
|
try:
|
|
287
433
|
self.update(self.extract_metadata(self.study_meta['data']['latestVersion']))
|
|
288
434
|
except KeyError as e:
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
435
|
+
if (self.study_meta.get('status') == 'OK' and not
|
|
436
|
+
self.study_meta['data'].get('latestVersion')):
|
|
437
|
+
# Latest version is not available because API strips out all
|
|
438
|
+
# citation metadata for deaccessioned studies but doesn't
|
|
439
|
+
# actually indicate this in any obvious manner
|
|
440
|
+
# This is further complicated because *all* the metadata
|
|
441
|
+
# we want is in the metadata blocks, which won't exist in the JSON
|
|
442
|
+
# because for some idiotic reason it's OK to expose it in the GUI
|
|
443
|
+
# but not via API.
|
|
444
|
+
self.deaccession_flag = 1
|
|
445
|
+
else:
|
|
446
|
+
raise MetadataError(f'Unable to parse study metadata. Do you need an API key?\n'
|
|
447
|
+
f'{e} key not found.\n'
|
|
448
|
+
f'Offending JSON: {self.study_meta}') from e
|
|
292
449
|
self.__files = None
|
|
293
450
|
self.__all_files = None
|
|
294
|
-
|
|
295
|
-
# for n, _ in enumerate(self.all_versions['data'])}
|
|
296
|
-
#self.index = {_: n for _, n in enumerate(self.versions)}
|
|
297
|
-
self.index = dict(enumerate(self.versions))
|
|
451
|
+
self.index = {_: n for n, _ in enumerate(self.versions)}
|
|
298
452
|
|
|
299
453
|
def __obtain_metadata(self):
|
|
300
454
|
'''
|
|
@@ -303,16 +457,23 @@ class StudyMetadata(dict):
|
|
|
303
457
|
if self.kwargs.get('key'):
|
|
304
458
|
self.headers.update({'X-Dataverse-key':self.kwargs['key']})
|
|
305
459
|
params = {'persistentId': self.pid}
|
|
306
|
-
self.session = requests.Session()
|
|
307
|
-
self.session.mount('https://',
|
|
308
|
-
requests.adapters.HTTPAdapter(max_retries=RETRY))
|
|
309
460
|
self.url = self.url.strip('/')
|
|
310
461
|
if not self.url.startswith('https://'):
|
|
311
462
|
self.url = f'https://{self.url}'
|
|
463
|
+
self.limit.rate_limit()
|
|
464
|
+
LOGGER.debug('Attempting %s/api/datasets/, params %s, headers %s',
|
|
465
|
+
self.url, params, self.headers)
|
|
312
466
|
data = self.session.get(f'{self.url}/api/datasets/:persistentId',
|
|
313
|
-
headers=self.headers, params=params
|
|
467
|
+
headers=self.headers, params=params,
|
|
468
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
469
|
+
data.raise_for_status()
|
|
470
|
+
self.limit.rate_limit()
|
|
471
|
+
LOGGER.debug('Attempting %s/api/datasets/:persistentId/versions, params %s, headers %s',
|
|
472
|
+
self.url, params, self.headers)
|
|
314
473
|
all_versions = self.session.get(f'{self.url}/api/datasets/:persistentId/versions',
|
|
315
|
-
|
|
474
|
+
headers=self.headers, params=params,
|
|
475
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
476
|
+
all_versions.raise_for_status()
|
|
316
477
|
return data.json(), all_versions.json()
|
|
317
478
|
|
|
318
479
|
def __has_metadata(self)->bool:
|
|
@@ -354,6 +515,14 @@ class StudyMetadata(dict):
|
|
|
354
515
|
tmp['versionStatement'] = f"{chunk['versionNumber']}.{chunk['versionMinorNumber']}"
|
|
355
516
|
else:
|
|
356
517
|
tmp['versionStatement'] = f"{chunk.get('versionState', '')}"
|
|
518
|
+
#ADD fields here if they are not in the metadata and you need them
|
|
519
|
+
tmp['pid'] = self.pid #Because you need generally need this
|
|
520
|
+
#Collection info
|
|
521
|
+
for _ in ['collection_name', 'collection_short_name']:
|
|
522
|
+
if self.kwargs.get(_):
|
|
523
|
+
tmp[_] = self.kwargs[_]
|
|
524
|
+
#Latest version number or state for easy filtering @@@
|
|
525
|
+
tmp['is_current_version'] = tmp['versionStatement'] == self.current_version
|
|
357
526
|
return tmp
|
|
358
527
|
|
|
359
528
|
def extract_field_metadata(self, field):
|
|
@@ -394,7 +563,7 @@ class StudyMetadata(dict):
|
|
|
394
563
|
else:
|
|
395
564
|
#sometimes value is None because reasons.
|
|
396
565
|
interim[v3['typeName']] = [v3.get('value', [] )]
|
|
397
|
-
LOGGER.debug(interim)
|
|
566
|
+
#LOGGER.debug(interim)
|
|
398
567
|
for k9, v9 in interim.items():
|
|
399
568
|
out.update({k9: '; '.join(v9)})
|
|
400
569
|
|
|
@@ -438,8 +607,14 @@ class StudyMetadata(dict):
|
|
|
438
607
|
'''
|
|
439
608
|
Return a formatted version statement for the most recent version
|
|
440
609
|
'''
|
|
441
|
-
|
|
442
|
-
|
|
610
|
+
try:
|
|
611
|
+
return (f"{self.study_meta['data']['latestVersion']['versionNumber']}."
|
|
612
|
+
f"{self.study_meta['data']['latestVersion']['versionMinorNumber']}")
|
|
613
|
+
except (KeyError, ValueError):
|
|
614
|
+
try:
|
|
615
|
+
return f"{self.study_meta['data']['latestVersion']['versionState']}"
|
|
616
|
+
except (ValueError, KeyError):
|
|
617
|
+
return 'DEACCESSIONED'
|
|
443
618
|
|
|
444
619
|
@property
|
|
445
620
|
def versions(self)->list:
|
|
@@ -549,7 +724,7 @@ class StudyMetadata(dict):
|
|
|
549
724
|
|
|
550
725
|
files = [self.flatten(_) for _ in filelist]
|
|
551
726
|
for ff in files:
|
|
552
|
-
ff.update({'
|
|
727
|
+
ff.update({'dataset_pid': self.pid})
|
|
553
728
|
return files
|
|
554
729
|
|
|
555
730
|
def __extract_files(self):
|
|
@@ -560,9 +735,11 @@ class StudyMetadata(dict):
|
|
|
560
735
|
#but files would (usually) be an arbitrary number of files.
|
|
561
736
|
#That bothers me on an intellectual level. Therefore, it will be attribute.
|
|
562
737
|
#Iterate over StudyMetadata.files if you want to know the contents
|
|
563
|
-
if not self.__files:
|
|
738
|
+
if not self.__files and not self.deaccession_flag:
|
|
564
739
|
self.__files = self.extract_files(self.study_meta['data']
|
|
565
740
|
['latestVersion']['files'])
|
|
741
|
+
if self.deaccession_flag:
|
|
742
|
+
self.__files = []
|
|
566
743
|
|
|
567
744
|
def __extract_licence_info(self, indict)->dict:
|
|
568
745
|
'''
|
|
@@ -695,7 +872,6 @@ class ReadmeCreator:
|
|
|
695
872
|
return f'{inkey}: \n'
|
|
696
873
|
return f'{inkey}: '
|
|
697
874
|
|
|
698
|
-
|
|
699
875
|
def __extract_files(self):
|
|
700
876
|
'''
|
|
701
877
|
Extract file level metadata, and write to self.__files.
|
|
@@ -793,6 +969,12 @@ class ReadmeCreator:
|
|
|
793
969
|
entire StudyMetadata object.
|
|
794
970
|
'''
|
|
795
971
|
metatmp = self.meta.copy()
|
|
972
|
+
#Delete redundant info fields added when harvesting Study Metadata
|
|
973
|
+
for _ in ['pid', 'is_current_version', 'version_statement']:
|
|
974
|
+
try:
|
|
975
|
+
del metatmp[_]
|
|
976
|
+
except KeyError:
|
|
977
|
+
continue
|
|
796
978
|
neworder = self.reorder_fields(metatmp)
|
|
797
979
|
addme = self.concatenator(metatmp)
|
|
798
980
|
metatmp.update(addme)
|
|
@@ -1032,7 +1214,7 @@ class FileAnalysis(dict):
|
|
|
1032
1214
|
Download and analyze a file from a dataverse installation and
|
|
1033
1215
|
produce useful metadata.
|
|
1034
1216
|
'''
|
|
1035
|
-
|
|
1217
|
+
#pylint: disable=too-many-instance-attributes
|
|
1036
1218
|
def __init__(self, **kwargs):
|
|
1037
1219
|
'''
|
|
1038
1220
|
Initialize the object.
|
|
@@ -1065,16 +1247,34 @@ class FileAnalysis(dict):
|
|
|
1065
1247
|
filesize_bytes : int
|
|
1066
1248
|
File size in bytes
|
|
1067
1249
|
|
|
1250
|
+
rate_limit_on: bool
|
|
1251
|
+
Turn on rate limit for requests
|
|
1252
|
+
|
|
1253
|
+
rate_limit_min : int
|
|
1254
|
+
Minimum time between requests in seconds
|
|
1255
|
+
|
|
1256
|
+
rate_limit_max : int
|
|
1257
|
+
Maximum time between requests in seconds
|
|
1258
|
+
|
|
1259
|
+
session : requests.Session
|
|
1260
|
+
A requests session if available, to help
|
|
1261
|
+
ensure against having too many open connections
|
|
1262
|
+
|
|
1068
1263
|
Notes
|
|
1069
1264
|
-----
|
|
1070
1265
|
Either `local` must be supplied, or `url`, `key` and at least one of
|
|
1071
1266
|
`id` or `pid` must be supplied
|
|
1072
1267
|
|
|
1073
|
-
|
|
1268
|
+
The rate limiter will wait for a random interval between
|
|
1269
|
+
rate_limit_min and rate_limit_max. Obviously, if you want
|
|
1270
|
+
a constant interval, set them to be equal.
|
|
1074
1271
|
|
|
1272
|
+
'''
|
|
1273
|
+
#pylint: disable=too-many-instance-attributes
|
|
1075
1274
|
#self.url = self.__clean_url(url)
|
|
1076
1275
|
self.headers = UAHEADER.copy()
|
|
1077
1276
|
self.kwargs = kwargs
|
|
1277
|
+
self.limit = RateLimiter(**kwargs)
|
|
1078
1278
|
if self.kwargs.get('key'):
|
|
1079
1279
|
self.headers.update({'X-Dataverse-key':self.kwargs['key']})
|
|
1080
1280
|
self.local = None
|
|
@@ -1084,7 +1284,7 @@ class FileAnalysis(dict):
|
|
|
1084
1284
|
'(pid or id)) or (local) keyword parameters.')
|
|
1085
1285
|
raise TypeError(err)
|
|
1086
1286
|
self.tempfile = None
|
|
1087
|
-
self.session = requests.Session()
|
|
1287
|
+
self.session = kwargs.get('session', requests.Session())
|
|
1088
1288
|
self.session.mount('https://',
|
|
1089
1289
|
requests.adapters.HTTPAdapter(max_retries=RETRY))
|
|
1090
1290
|
self.checkable = {'.sav': self.stat_file_metadata,
|
|
@@ -1196,17 +1396,20 @@ class FileAnalysis(dict):
|
|
|
1196
1396
|
start = datetime.datetime.now()
|
|
1197
1397
|
params = {'format':'original'}
|
|
1198
1398
|
url = self.__clean_url(self.kwargs['url'])
|
|
1399
|
+
self.limit.rate_limit()
|
|
1199
1400
|
if self.kwargs.get('pid'):
|
|
1200
1401
|
params.update({'persistentId':self.kwargs['pid']})
|
|
1201
1402
|
data = self.session.get(f'{url}/api/access/datafile/:persistentId',
|
|
1202
1403
|
headers=self.headers,
|
|
1203
1404
|
params=params,
|
|
1204
|
-
stream=True
|
|
1405
|
+
stream=True,
|
|
1406
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
1205
1407
|
else:
|
|
1206
1408
|
data = self.session.get(f'{url}/api/access/datafile/{self.kwargs["id"]}',
|
|
1207
1409
|
headers=self.headers,
|
|
1208
1410
|
params=params,
|
|
1209
|
-
stream=True
|
|
1411
|
+
stream=True,
|
|
1412
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
1210
1413
|
data.raise_for_status()
|
|
1211
1414
|
finish = datetime.datetime.now()
|
|
1212
1415
|
self.filename = self.__get_filename(data.headers)
|
|
@@ -1216,7 +1419,9 @@ class FileAnalysis(dict):
|
|
|
1216
1419
|
filesize = self.kwargs.get('filesize_bytes',
|
|
1217
1420
|
data.headers.get('content-length', 9e9))
|
|
1218
1421
|
filesize = int(filesize) # comes out as string from header
|
|
1219
|
-
with tqdm.tqdm(total=filesize, unit='B', unit_scale=True,
|
|
1422
|
+
with tqdm.tqdm(total=filesize, unit='B', unit_scale=True,
|
|
1423
|
+
desc=self.filename, leave=False,
|
|
1424
|
+
bar_format=BAR_FORMAT) as t:
|
|
1220
1425
|
for _ in data.iter_content(block_size):
|
|
1221
1426
|
self.tempfile.file.write(_)
|
|
1222
1427
|
t.update(len(_))
|