dataverse-utils 0.22.7__tar.gz → 0.22.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/PKG-INFO +2 -1
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/pyproject.toml +3 -2
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/__init__.py +2 -2
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/collections.py +202 -36
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_collection_info.py +78 -11
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/LICENCE.md +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/README.md +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/data/LDC_EULA_general.md +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/dataverse_utils.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/dvdata.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/ldc.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_del.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_ldc_uploader.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_list_files.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_manifest_gen.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_pg_facet_date.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_readme_creator.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_record_copy.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_release.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_replace_licence.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_study_migrator.py +0 -0
- {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_upload_tsv.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataverse-utils
|
|
3
|
-
Version: 0.22.7
|
|
3
|
+
Version: 0.22.9
|
|
4
4
|
Summary: Utilities for the Dataverse data respository system
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENCE.md
|
|
@@ -26,6 +26,7 @@ Requires-Dist: pyreadstat (>=1.3.3,<2.0.0)
|
|
|
26
26
|
Requires-Dist: requests (>=2.33,<3.0)
|
|
27
27
|
Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
|
|
28
28
|
Requires-Dist: tqdm (>=4.67.3,<5.0.0)
|
|
29
|
+
Requires-Dist: urllib3 (>=2.7.0,<3.0.0)
|
|
29
30
|
Project-URL: Homepage, https://ubc-library-rc.github.io/dataverse_utils
|
|
30
31
|
Project-URL: Issue Tracker, https://github.com/ubc-library-rc/dataverse_utils/issues
|
|
31
32
|
Project-URL: Repository, https://github.com/ubc-library-rc/dataverse_utils.git
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "dataverse-utils"
|
|
3
|
-
version = "0.22.7"
|
|
3
|
+
version = "0.22.9"
|
|
4
4
|
description = "Utilities for the Dataverse data respository system"
|
|
5
5
|
authors = [
|
|
6
6
|
{name = "Paul Lesack",email = "paul.lesack@ubc.ca"}
|
|
@@ -20,7 +20,8 @@ dependencies = [
|
|
|
20
20
|
"tqdm (>=4.67.3,<5.0.0)",
|
|
21
21
|
"dryad2dataverse (>=0.8.4,<0.9.0)",
|
|
22
22
|
"chardet (>=5.2)",
|
|
23
|
-
"requests (>=2.33,<3.0)"
|
|
23
|
+
"requests (>=2.33,<3.0)",
|
|
24
|
+
"urllib3 (>=2.7.0,<3.0.0)"
|
|
24
25
|
]
|
|
25
26
|
#Chardet and requests will need to be changed when requests goes to 2.32; at that
|
|
26
27
|
#point just remove chardet
|
|
@@ -7,7 +7,7 @@ import pathlib
|
|
|
7
7
|
import sys
|
|
8
8
|
from dataverse_utils.dataverse_utils import *
|
|
9
9
|
|
|
10
|
-
VERSION = (0, 22,
|
|
10
|
+
VERSION = (0, 22, '9a0')
|
|
11
11
|
__version__ = '.'.join([str(x) for x in VERSION])
|
|
12
12
|
|
|
13
13
|
USERAGENT = (f'dataverse_utils/v{__version__} ({sys.platform.capitalize()}); '
|
|
@@ -15,7 +15,7 @@ USERAGENT = (f'dataverse_utils/v{__version__} ({sys.platform.capitalize()}); '
|
|
|
15
15
|
UAHEADER = {'User-agent' : USERAGENT}
|
|
16
16
|
|
|
17
17
|
SCRIPT_VERSIONS={
|
|
18
|
-
'dv_collection_info' : (0,
|
|
18
|
+
'dv_collection_info' : (0, 4, 1),
|
|
19
19
|
'dv_del' : (0, 2, 4),
|
|
20
20
|
'dv_ldc_uploader' : (0, 4, 1),
|
|
21
21
|
'dv_list_files' : (0, 1, 1),
|
|
@@ -8,9 +8,11 @@ import datetime
|
|
|
8
8
|
import io
|
|
9
9
|
import logging
|
|
10
10
|
import pathlib
|
|
11
|
+
import random
|
|
11
12
|
import string
|
|
12
13
|
import sys
|
|
13
14
|
import tempfile
|
|
15
|
+
import time
|
|
14
16
|
import textwrap
|
|
15
17
|
import typing
|
|
16
18
|
import traceback
|
|
@@ -34,12 +36,59 @@ RETRY = Retry(total=10,
|
|
|
34
36
|
allowed_methods=['HEAD', 'GET', 'OPTIONS',
|
|
35
37
|
'POST', 'PUT'],
|
|
36
38
|
backoff_factor=1)
|
|
39
|
+
BAR_FORMAT='{l_bar}{bar}{n_fmt}/{total_fmt} : time remaining - {remaining}'
|
|
37
40
|
|
|
38
41
|
class MetadataError(Exception):
|
|
39
42
|
'''
|
|
40
43
|
MetadataError
|
|
41
44
|
'''
|
|
42
45
|
|
|
46
|
+
class RateLimiter:
    '''
    Pauses for a random interval, intended to be called before each request.
    '''
    #pylint: disable=too-few-public-methods
    def __init__(self, **kwargs):
        '''
        Store the rate limit settings, filling in any that are missing.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments. Only the rate limit keys below
            are read; everything else is stored untouched in `self.kwargs`.

        Other parameters
        ----------------
        rate_limit_on: bool
            Turn on rate limit for requests. Default False.

        rate_limit_min : int or float
            Minimum time between requests in seconds.
            Defaults to 0 if the limiter is on and no value is supplied.

        rate_limit_max : int or float
            Maximum time between requests in seconds.
            Defaults to rate_limit_min if the limiter is on and no
            value is supplied.

        session : requests.Session
            Accepted so that callers can forward their own keyword
            arguments unchanged; it is not used by this class.

        Notes
        -----
        The rate limiter will wait for a random interval between
        rate_limit_min and rate_limit_max. Obviously, if you want
        a constant interval, set them to be equal.

        If rate_limit_on is missing or falsy, both limits are forced
        to 0 regardless of any values passed in.
        '''
        self.kwargs = kwargs

        if not self.kwargs.get('rate_limit_on', False):
            #Limiter is off: normalize so that rate_limit() sleeps for 0 seconds
            self.kwargs['rate_limit_on'] = False
            self.kwargs['rate_limit_min'] = 0
            self.kwargs['rate_limit_max'] = 0
            return

        #Limiter is on: make sure both bounds exist so that rate_limit()
        #can't fail with a KeyError at request time.
        self.kwargs.setdefault('rate_limit_min', 0)
        self.kwargs.setdefault('rate_limit_max', self.kwargs['rate_limit_min'])

    def rate_limit(self):
        '''
        Sleep before requests for the time set by the rate limits
        '''
        time.sleep(random.uniform(self.kwargs['rate_limit_min'],
                                  self.kwargs['rate_limit_max']))
|
|
91
|
+
|
|
43
92
|
class DvCollection:
|
|
44
93
|
'''
|
|
45
94
|
Metadata for an *entire* dataverse collection, recursively.
|
|
@@ -67,7 +116,29 @@ class DvCollection:
|
|
|
67
116
|
----------------
|
|
68
117
|
timeout : int
|
|
69
118
|
retry timeout in seconds
|
|
119
|
+
|
|
120
|
+
rate_limit_on: bool
|
|
121
|
+
Turn on rate limit for requests
|
|
122
|
+
|
|
123
|
+
rate_limit_min : int
|
|
124
|
+
Minimum time between requests in seconds
|
|
125
|
+
|
|
126
|
+
rate_limit_max : int
|
|
127
|
+
Maximum time between requests in seconds
|
|
128
|
+
|
|
129
|
+
session : requests.Session
|
|
130
|
+
A requests session if available, to help
|
|
131
|
+
ensure against having too many open connections
|
|
132
|
+
|
|
133
|
+
Notes
|
|
134
|
+
-----
|
|
135
|
+
The rate limiter will wait for a random interval between
|
|
136
|
+
rate_limit_min and rate_limit_max. Obviously, if you want
|
|
137
|
+
a constant interval, set them to be equal.
|
|
138
|
+
|
|
70
139
|
'''
|
|
140
|
+
self.kwargs = kwargs
|
|
141
|
+
self.limit = RateLimiter(**kwargs)
|
|
71
142
|
self.coll = coll
|
|
72
143
|
self.url = self.__clean_url(url)
|
|
73
144
|
self.headers = None
|
|
@@ -82,9 +153,9 @@ class DvCollection:
|
|
|
82
153
|
else:
|
|
83
154
|
self.retry_strategy = kwargs['retry']
|
|
84
155
|
self.collections = None
|
|
85
|
-
self.session = requests.Session()
|
|
156
|
+
self.session = kwargs.get('session', requests.Session())
|
|
86
157
|
self.session.mount('https://',
|
|
87
|
-
requests.adapters.HTTPAdapter(max_retries=
|
|
158
|
+
requests.adapters.HTTPAdapter(max_retries=RETRY))
|
|
88
159
|
self.studies = None
|
|
89
160
|
self.__root = None
|
|
90
161
|
self.all_colls = [self.root]
|
|
@@ -95,8 +166,10 @@ class DvCollection:
|
|
|
95
166
|
Return the name and short name of the top level collection
|
|
96
167
|
'''
|
|
97
168
|
if not self.__root:
|
|
169
|
+
self.limit.rate_limit()
|
|
98
170
|
x = self.session.get(f'{self.url}/api/dataverses/{self.coll}',
|
|
99
|
-
headers=self.headers
|
|
171
|
+
headers=self.headers,
|
|
172
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
100
173
|
x.raise_for_status()
|
|
101
174
|
self.__root = (x.json()['data']['name'], x.json()['data']['alias'])
|
|
102
175
|
return self.__root
|
|
@@ -120,11 +193,14 @@ class DvCollection:
|
|
|
120
193
|
'''
|
|
121
194
|
Get collection short name.
|
|
122
195
|
'''
|
|
123
|
-
|
|
196
|
+
self.limit.rate_limit()
|
|
197
|
+
shortname = self.session.get(f'{self.url}/api/dataverses/{dvid}',
|
|
198
|
+
headers=self.headers,
|
|
199
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
124
200
|
shortname.raise_for_status()
|
|
125
201
|
return shortname.json()['data']['alias']
|
|
126
202
|
|
|
127
|
-
def get_collections(self, coll:str=None, output=None
|
|
203
|
+
def get_collections(self, coll:str=None, output=None)->list:#pylint: disable=unused-argument
|
|
128
204
|
'''
|
|
129
205
|
Get a [recursive] listing of all dataverses in a collection.
|
|
130
206
|
|
|
@@ -134,16 +210,15 @@ class DvCollection:
|
|
|
134
210
|
Collection short name or id
|
|
135
211
|
output : list, optional, default=[]
|
|
136
212
|
output list to append to
|
|
137
|
-
**kwargs : dict
|
|
138
|
-
Other keyword arguments
|
|
139
|
-
|
|
140
213
|
'''
|
|
141
214
|
if not output:
|
|
142
215
|
output = []
|
|
143
216
|
if not coll:
|
|
144
217
|
coll = self.coll
|
|
218
|
+
self.limit.rate_limit()
|
|
145
219
|
x = self.session.get(f'{self.url}/api/dataverses/{coll}/contents',
|
|
146
|
-
|
|
220
|
+
headers=self.headers,
|
|
221
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
147
222
|
data = x.json().get('data')
|
|
148
223
|
#---
|
|
149
224
|
#Because it's possible that permissions errors can cause API read errors,
|
|
@@ -186,7 +261,8 @@ class DvCollection:
|
|
|
186
261
|
LOGGER.debug('recursive')
|
|
187
262
|
self.get_collections(dv[1], output)
|
|
188
263
|
self.collections = output
|
|
189
|
-
self.
|
|
264
|
+
if self.root not in self.collections:
|
|
265
|
+
self.collections.insert(0, self.root)
|
|
190
266
|
return output
|
|
191
267
|
|
|
192
268
|
def get_studies(self, root:str=None):
|
|
@@ -201,9 +277,15 @@ class DvCollection:
|
|
|
201
277
|
all_studies = []
|
|
202
278
|
if not root:
|
|
203
279
|
root=self.coll
|
|
204
|
-
|
|
280
|
+
#Redundant, as root is now added to get_collections
|
|
281
|
+
#all_studies = self.get_collection_listing(root)
|
|
282
|
+
all_studies = []
|
|
205
283
|
collections = self.get_collections(root)
|
|
206
|
-
for collection in tqdm.tqdm(collections
|
|
284
|
+
for collection in tqdm.tqdm(collections,
|
|
285
|
+
desc='collections',
|
|
286
|
+
unit='collection',
|
|
287
|
+
leave=False,
|
|
288
|
+
bar_format=BAR_FORMAT):
|
|
207
289
|
all_studies.extend(self.get_collection_listing(collection[1]))
|
|
208
290
|
self.studies = all_studies
|
|
209
291
|
return all_studies
|
|
@@ -217,8 +299,10 @@ class DvCollection:
|
|
|
217
299
|
coll_id : str
|
|
218
300
|
Short name or id of a dataverse collection
|
|
219
301
|
'''
|
|
302
|
+
self.limit.rate_limit()
|
|
220
303
|
cl = self.session.get(f'{self.url}/api/dataverses/{coll_id}/contents',
|
|
221
|
-
|
|
304
|
+
headers=self.headers,
|
|
305
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
222
306
|
cl.raise_for_status()
|
|
223
307
|
pids = [f"{z['protocol']}:{z['authority']}/{z['identifier']}"
|
|
224
308
|
for z in cl.json()['data'] if z['type'] == 'dataset']
|
|
@@ -226,7 +310,15 @@ class DvCollection:
|
|
|
226
310
|
#a metadata download
|
|
227
311
|
smkwargs = [{'collection_name':_[0] , 'collection_short_name':_[1]}
|
|
228
312
|
for _ in self.collections if coll_id == _[1]][0]
|
|
229
|
-
out = [(self.get_study_info(pid, **smkwargs), pid) for pid in pids]
|
|
313
|
+
#out = [(self.get_study_info(pid, **smkwargs), pid) for pid in pids]
|
|
314
|
+
out = []
|
|
315
|
+
for pid in tqdm.tqdm(pids,
|
|
316
|
+
desc=smkwargs.get('collection_short_name', 'collection'),
|
|
317
|
+
unit='study',
|
|
318
|
+
leave=False,
|
|
319
|
+
colour='red',
|
|
320
|
+
bar_format=BAR_FORMAT):
|
|
321
|
+
out.append((self.get_study_info(pid, **smkwargs), pid))
|
|
230
322
|
for _ in out:
|
|
231
323
|
_[0].update({'pid': _[1]})
|
|
232
324
|
return [x[0] for x in out]
|
|
@@ -243,12 +335,15 @@ class DvCollection:
|
|
|
243
335
|
**kwargs
|
|
244
336
|
Other useful information to pass onto StudyMetadata, such as collection info, etc.
|
|
245
337
|
'''
|
|
338
|
+
self.limit.rate_limit()
|
|
246
339
|
meta = self.session.get(f'{self.url}/api/datasets/:persistentId',
|
|
247
|
-
|
|
248
|
-
|
|
340
|
+
params={'persistentId': pid},
|
|
341
|
+
headers=self.headers,
|
|
342
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
249
343
|
meta.raise_for_status()
|
|
250
344
|
LOGGER.debug(pid)
|
|
251
|
-
return StudyMetadata(study_meta=meta.json(), key=self.__key, url=self.url,
|
|
345
|
+
return StudyMetadata(study_meta=meta.json(), key=self.__key, url=self.url,
|
|
346
|
+
session=self.session, **kwargs)
|
|
252
347
|
|
|
253
348
|
class StudyMetadata(dict):
|
|
254
349
|
'''
|
|
@@ -279,15 +374,37 @@ class StudyMetadata(dict):
|
|
|
279
374
|
key : str
|
|
280
375
|
Dataverse instance API key (needed for unpublished studies)
|
|
281
376
|
|
|
377
|
+
rate_limit_on: bool
|
|
378
|
+
Turn on rate limit for requests
|
|
379
|
+
|
|
380
|
+
rate_limit_min : int
|
|
381
|
+
Minimum time between requests in seconds
|
|
382
|
+
|
|
383
|
+
rate_limit_max : int
|
|
384
|
+
Maximum time between requests in seconds
|
|
385
|
+
|
|
386
|
+
session : requests.Session
|
|
387
|
+
A requests session if available, to help
|
|
388
|
+
ensure against having too many open connections
|
|
389
|
+
|
|
282
390
|
Notes
|
|
283
391
|
-----
|
|
284
392
|
Either `study_meta` is required OR `pid` and `url`. `key` _may_ be required
|
|
285
393
|
if either a draft study is being accessed or the Dataverse installation
|
|
286
394
|
requires API keys for all requests.
|
|
395
|
+
|
|
396
|
+
The rate limiter will wait for a random interval between
|
|
397
|
+
rate_limit_min and rate_limit_max. Obviously, if you want
|
|
398
|
+
a constant interval, set them to be equal.
|
|
399
|
+
|
|
287
400
|
'''
|
|
288
401
|
self.kwargs = kwargs
|
|
402
|
+
self.session = kwargs.get('session', requests.Session())
|
|
403
|
+
self.session.mount('https://',
|
|
404
|
+
requests.adapters.HTTPAdapter(max_retries=RETRY))
|
|
405
|
+
self.limit = RateLimiter(**kwargs)
|
|
289
406
|
self.study_meta = kwargs.get('study_meta')
|
|
290
|
-
self.all_versions =
|
|
407
|
+
self.all_versions = kwargs.get('all_versions')
|
|
291
408
|
self.url = kwargs.get('url')
|
|
292
409
|
self.pid = kwargs.get('pid')
|
|
293
410
|
#If only there would be an easy way to check if something was deaccessioned
|
|
@@ -299,9 +416,12 @@ class StudyMetadata(dict):
|
|
|
299
416
|
# f"{self.study_meta['data']['authority']}"
|
|
300
417
|
# f"/{self.study_meta['data']['identifier']}") if not
|
|
301
418
|
# self.pid else self.pid)
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
419
|
+
try:
|
|
420
|
+
self.pid = (f"{self.study_meta['data']['protocol']}:"
|
|
421
|
+
f"{self.study_meta['data']['authority']}"
|
|
422
|
+
f"/{self.study_meta['data']['identifier']}")
|
|
423
|
+
except (KeyError,) as e:
|
|
424
|
+
raise MetadataError(f'Key error: {e}') from e
|
|
305
425
|
|
|
306
426
|
self.headers = UAHEADER.copy()
|
|
307
427
|
if not (('study_meta' in kwargs) or ('url' in kwargs and 'pid' in kwargs)):
|
|
@@ -337,16 +457,23 @@ class StudyMetadata(dict):
|
|
|
337
457
|
if self.kwargs.get('key'):
|
|
338
458
|
self.headers.update({'X-Dataverse-key':self.kwargs['key']})
|
|
339
459
|
params = {'persistentId': self.pid}
|
|
340
|
-
self.session = requests.Session()
|
|
341
|
-
self.session.mount('https://',
|
|
342
|
-
requests.adapters.HTTPAdapter(max_retries=RETRY))
|
|
343
460
|
self.url = self.url.strip('/')
|
|
344
461
|
if not self.url.startswith('https://'):
|
|
345
462
|
self.url = f'https://{self.url}'
|
|
463
|
+
self.limit.rate_limit()
|
|
464
|
+
LOGGER.debug('Attempting %s/api/datasets/, params %s, headers %s',
|
|
465
|
+
self.url, params, self.headers)
|
|
346
466
|
data = self.session.get(f'{self.url}/api/datasets/:persistentId',
|
|
347
|
-
headers=self.headers, params=params
|
|
467
|
+
headers=self.headers, params=params,
|
|
468
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
469
|
+
data.raise_for_status()
|
|
470
|
+
self.limit.rate_limit()
|
|
471
|
+
LOGGER.debug('Attempting %s/api/datasets/:persistentId/versions, params %s, headers %s',
|
|
472
|
+
self.url, params, self.headers)
|
|
348
473
|
all_versions = self.session.get(f'{self.url}/api/datasets/:persistentId/versions',
|
|
349
|
-
|
|
474
|
+
headers=self.headers, params=params,
|
|
475
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
476
|
+
all_versions.raise_for_status()
|
|
350
477
|
return data.json(), all_versions.json()
|
|
351
478
|
|
|
352
479
|
def __has_metadata(self)->bool:
|
|
@@ -388,10 +515,14 @@ class StudyMetadata(dict):
|
|
|
388
515
|
tmp['versionStatement'] = f"{chunk['versionNumber']}.{chunk['versionMinorNumber']}"
|
|
389
516
|
else:
|
|
390
517
|
tmp['versionStatement'] = f"{chunk.get('versionState', '')}"
|
|
391
|
-
|
|
518
|
+
#ADD fields here if they are not in the metadata and you need them
|
|
519
|
+
tmp['pid'] = self.pid #Because you need generally need this
|
|
520
|
+
#Collection info
|
|
392
521
|
for _ in ['collection_name', 'collection_short_name']:
|
|
393
522
|
if self.kwargs.get(_):
|
|
394
523
|
tmp[_] = self.kwargs[_]
|
|
524
|
+
#Latest version number or state for easy filtering @@@
|
|
525
|
+
tmp['is_current_version'] = tmp['versionStatement'] == self.current_version
|
|
395
526
|
return tmp
|
|
396
527
|
|
|
397
528
|
def extract_field_metadata(self, field):
|
|
@@ -432,7 +563,7 @@ class StudyMetadata(dict):
|
|
|
432
563
|
else:
|
|
433
564
|
#sometimes value is None because reasons.
|
|
434
565
|
interim[v3['typeName']] = [v3.get('value', [] )]
|
|
435
|
-
LOGGER.debug(interim)
|
|
566
|
+
#LOGGER.debug(interim)
|
|
436
567
|
for k9, v9 in interim.items():
|
|
437
568
|
out.update({k9: '; '.join(v9)})
|
|
438
569
|
|
|
@@ -476,8 +607,14 @@ class StudyMetadata(dict):
|
|
|
476
607
|
'''
|
|
477
608
|
Return a formatted version statement for the most recent version
|
|
478
609
|
'''
|
|
479
|
-
|
|
480
|
-
|
|
610
|
+
try:
|
|
611
|
+
return (f"{self.study_meta['data']['latestVersion']['versionNumber']}."
|
|
612
|
+
f"{self.study_meta['data']['latestVersion']['versionMinorNumber']}")
|
|
613
|
+
except (KeyError, ValueError):
|
|
614
|
+
try:
|
|
615
|
+
return f"{self.study_meta['data']['latestVersion']['versionState']}"
|
|
616
|
+
except (ValueError, KeyError):
|
|
617
|
+
return 'DEACCESSIONED'
|
|
481
618
|
|
|
482
619
|
@property
|
|
483
620
|
def versions(self)->list:
|
|
@@ -832,6 +969,12 @@ class ReadmeCreator:
|
|
|
832
969
|
entire StudyMetadata object.
|
|
833
970
|
'''
|
|
834
971
|
metatmp = self.meta.copy()
|
|
972
|
+
#Delete redundant info fields added when harvesting Study Metadata
|
|
973
|
+
for _ in ['pid', 'is_current_version', 'version_statement']:
|
|
974
|
+
try:
|
|
975
|
+
del metatmp[_]
|
|
976
|
+
except KeyError:
|
|
977
|
+
continue
|
|
835
978
|
neworder = self.reorder_fields(metatmp)
|
|
836
979
|
addme = self.concatenator(metatmp)
|
|
837
980
|
metatmp.update(addme)
|
|
@@ -1071,7 +1214,7 @@ class FileAnalysis(dict):
|
|
|
1071
1214
|
Download and analyze a file from a dataverse installation and
|
|
1072
1215
|
produce useful metadata.
|
|
1073
1216
|
'''
|
|
1074
|
-
|
|
1217
|
+
#pylint: disable=too-many-instance-attributes
|
|
1075
1218
|
def __init__(self, **kwargs):
|
|
1076
1219
|
'''
|
|
1077
1220
|
Intialize the object.
|
|
@@ -1104,16 +1247,34 @@ class FileAnalysis(dict):
|
|
|
1104
1247
|
filesize_bytes : int
|
|
1105
1248
|
File size in bytes
|
|
1106
1249
|
|
|
1250
|
+
rate_limit_on: bool
|
|
1251
|
+
Turn on rate limit for requests
|
|
1252
|
+
|
|
1253
|
+
rate_limit_min : int
|
|
1254
|
+
Minimum time between requests in seconds
|
|
1255
|
+
|
|
1256
|
+
rate_limit_max : int
|
|
1257
|
+
Maximum time between requests in seconds
|
|
1258
|
+
|
|
1259
|
+
session : requests.Session
|
|
1260
|
+
A requests session if available, to help
|
|
1261
|
+
ensure against having too many open connections
|
|
1262
|
+
|
|
1107
1263
|
Notes
|
|
1108
1264
|
-----
|
|
1109
1265
|
Either `local` must be supplied, or `url`, `key` and at least one of
|
|
1110
1266
|
`id` or `pid` must be supplied
|
|
1111
1267
|
|
|
1112
|
-
|
|
1268
|
+
The rate limiter will wait for a random interval between
|
|
1269
|
+
rate_limit_min and rate_limit_max. Obviously, if you want
|
|
1270
|
+
a constant interval, set them to be equal.
|
|
1113
1271
|
|
|
1272
|
+
'''
|
|
1273
|
+
#pylint disable=too-many-instance-attributes
|
|
1114
1274
|
#self.url = self.__clean_url(url)
|
|
1115
1275
|
self.headers = UAHEADER.copy()
|
|
1116
1276
|
self.kwargs = kwargs
|
|
1277
|
+
self.limit = RateLimiter(**kwargs)
|
|
1117
1278
|
if self.kwargs.get('key'):
|
|
1118
1279
|
self.headers.update({'X-Dataverse-key':self.kwargs['key']})
|
|
1119
1280
|
self.local = None
|
|
@@ -1123,7 +1284,7 @@ class FileAnalysis(dict):
|
|
|
1123
1284
|
'(pid or id)) or (local) keyword parameters.')
|
|
1124
1285
|
raise TypeError(err)
|
|
1125
1286
|
self.tempfile = None
|
|
1126
|
-
self.session = requests.Session()
|
|
1287
|
+
self.session = kwargs.get('session', requests.Session())
|
|
1127
1288
|
self.session.mount('https://',
|
|
1128
1289
|
requests.adapters.HTTPAdapter(max_retries=RETRY))
|
|
1129
1290
|
self.checkable = {'.sav': self.stat_file_metadata,
|
|
@@ -1235,17 +1396,20 @@ class FileAnalysis(dict):
|
|
|
1235
1396
|
start = datetime.datetime.now()
|
|
1236
1397
|
params = {'format':'original'}
|
|
1237
1398
|
url = self.__clean_url(self.kwargs['url'])
|
|
1399
|
+
self.limit.rate_limit()
|
|
1238
1400
|
if self.kwargs.get('pid'):
|
|
1239
1401
|
params.update({'persistentId':self.kwargs['pid']})
|
|
1240
1402
|
data = self.session.get(f'{url}/api/access/datafile/:persistentId',
|
|
1241
1403
|
headers=self.headers,
|
|
1242
1404
|
params=params,
|
|
1243
|
-
stream=True
|
|
1405
|
+
stream=True,
|
|
1406
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
1244
1407
|
else:
|
|
1245
1408
|
data = self.session.get(f'{url}/api/access/datafile/{self.kwargs["id"]}',
|
|
1246
1409
|
headers=self.headers,
|
|
1247
1410
|
params=params,
|
|
1248
|
-
stream=True
|
|
1411
|
+
stream=True,
|
|
1412
|
+
timeout=self.kwargs.get('timeout', 15))
|
|
1249
1413
|
data.raise_for_status()
|
|
1250
1414
|
finish = datetime.datetime.now()
|
|
1251
1415
|
self.filename = self.__get_filename(data.headers)
|
|
@@ -1255,7 +1419,9 @@ class FileAnalysis(dict):
|
|
|
1255
1419
|
filesize = self.kwargs.get('filesize_bytes',
|
|
1256
1420
|
data.headers.get('content-length', 9e9))
|
|
1257
1421
|
filesize = int(filesize) # comes out as string from header
|
|
1258
|
-
with tqdm.tqdm(total=filesize, unit='B', unit_scale=True,
|
|
1422
|
+
with tqdm.tqdm(total=filesize, unit='B', unit_scale=True,
|
|
1423
|
+
desc=self.filename, leave=False,
|
|
1424
|
+
bar_format=BAR_FORMAT) as t:
|
|
1259
1425
|
for _ in data.iter_content(block_size):
|
|
1260
1426
|
self.tempfile.file.write(_)
|
|
1261
1427
|
t.update(len(_))
|
{dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_collection_info.py
RENAMED
|
@@ -5,6 +5,7 @@ outputs study metadata for the latest version
|
|
|
5
5
|
import argparse
|
|
6
6
|
import io
|
|
7
7
|
import csv
|
|
8
|
+
import logging
|
|
8
9
|
import pathlib
|
|
9
10
|
import sqlite3
|
|
10
11
|
import sys
|
|
@@ -57,9 +58,37 @@ def parse() -> argparse.ArgumentParser():
|
|
|
57
58
|
parser.add_argument('-s', '--sqlite',
|
|
58
59
|
help='Save output as SQLite3 database',
|
|
59
60
|
action='store_true')
|
|
61
|
+
parser.add_argument('-l', '--log',
|
|
62
|
+
help=textwrap.fill(textwrap.dedent(
|
|
63
|
+
'''
|
|
64
|
+
If you would like a log, provide a log file name here.
|
|
65
|
+
If no file name is provided, no log is created.
|
|
66
|
+
'''),80),
|
|
67
|
+
default=None)
|
|
68
|
+
parser.add_argument('--log-level',
|
|
69
|
+
help=textwrap.fill(textwrap.dedent(
|
|
70
|
+
'''
|
|
71
|
+
Log level. Acceptable values for log level are: debug, info,
|
|
72
|
+
warning, error, critical.
|
|
73
|
+
Default value: warning.
|
|
74
|
+
'''),80),
|
|
75
|
+
default='warning')
|
|
76
|
+
parser.add_argument('--rate-limit-off',
|
|
77
|
+
action='store_true',
|
|
78
|
+
help=('Turn off rate limiter. '
|
|
79
|
+
'Requests are randomly between min and max. Default is ON.'))
|
|
80
|
+
parser.add_argument('--rate-limit-min',
|
|
81
|
+
help='Minimum time before requests in seconds. Default 0.25',
|
|
82
|
+
default=0.25,
|
|
83
|
+
type=float)
|
|
84
|
+
parser.add_argument('--rate-limit-max',
|
|
85
|
+
help='Maximum time between requests in seconds: Default 1',
|
|
86
|
+
default=1,
|
|
87
|
+
type=float)
|
|
88
|
+
|
|
60
89
|
group = parser.add_argument_group(title='Harvest options',
|
|
61
90
|
description=textwrap.fill(
|
|
62
|
-
'
|
|
91
|
+
'You can obtain info for *either* a recursive crawl '
|
|
63
92
|
'of a collection (-c, --collection) OR for a single '
|
|
64
93
|
'Dataverse ' 'study (-p, --pid). '
|
|
65
94
|
'These arguments are mutually exclusive.'))
|
|
@@ -149,35 +178,71 @@ def extension(args:argparse.ArgumentParser):
|
|
|
149
178
|
return '.sqlite3'
|
|
150
179
|
return extype.get(args.delimiter, '.txt')
|
|
151
180
|
|
|
181
|
+
def logme(pargs:argparse.Namespace)->logging.Logger:
    '''
    Configure and return the root logger.

    Parameters
    ----------
    pargs : argparse.Namespace
        Parsed arguments. Two attributes are read:
        `log_level`, a case-insensitive level name (debug, info, warning,
        error, critical; anything else falls back to warning), and
        `log`, the log file name. If `log` is empty or None no log
        file is written.

    Returns
    -------
    logging.Logger
        The root logger, with its level set and either a file handler
        or a null handler attached.
    '''
    logger = logging.getLogger()
    l_format = logging.Formatter('%(name)s - %(asctime)s'
                                 ' - %(levelname)s - %(funcName)s - '
                                 '%(message)s')
    lookup = {'debug' : logging.DEBUG,
              'info' : logging.INFO,
              'warning': logging.WARNING,
              'error': logging.ERROR,
              'critical': logging.CRITICAL}
    level = lookup.get(pargs.log_level.lower(), logging.WARNING)
    logger.setLevel(level)
    if pargs.log:
        #Expand ~ so that --log=~/file.log does not end up in a literal
        #'~' directory; the output file path is expanded the same way.
        try:
            log_path = pathlib.Path(pargs.log).expanduser()
        except RuntimeError:
            #Home directory can't be determined; use the name as given
            log_path = pathlib.Path(pargs.log)
        #delay=True: the file is not opened until the first record is emitted
        text = logging.FileHandler(log_path, encoding='utf-8', delay=True)
        text.setFormatter(l_format)
        logger.addHandler(text)
        return logger
    logger.addHandler(logging.NullHandler())
    return logger
|
|
203
|
+
|
|
152
204
|
def main():
|
|
153
205
|
'''
|
|
154
206
|
You know what this is
|
|
155
207
|
'''
|
|
156
|
-
#pylint: disable=too-many-branches, too-many-locals
|
|
208
|
+
#pylint: disable=too-many-branches, too-many-locals, too-many-statements
|
|
157
209
|
args = parse().parse_args()
|
|
210
|
+
logger = logme(args)
|
|
158
211
|
if args.collection:
|
|
159
|
-
coll_me = dvc.DvCollection(args.url, args.collection, args.key
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
print(f'Error with parsing collection: {args.collection}', file=sys.stderr)
|
|
164
|
-
sys.exit()
|
|
212
|
+
coll_me = dvc.DvCollection(args.url, args.collection, args.key,
|
|
213
|
+
rate_limit_on=not args.rate_limit_off,
|
|
214
|
+
rate_limit_min=args.rate_limit_min,
|
|
215
|
+
rate_limit_max=args.rate_limit_max)
|
|
165
216
|
try:
|
|
166
217
|
coll_me.get_studies()
|
|
167
218
|
all_studies = coll_me.studies
|
|
219
|
+
if not all_studies: #Stupid but this happens
|
|
220
|
+
print('No studies in collection', file=sys.stderr)
|
|
221
|
+
logger.warning('No studies to process in collection %s', args.collection)
|
|
222
|
+
sys.exit()
|
|
168
223
|
except dataverse_utils.collections.MetadataError as e:
|
|
169
224
|
print(e, file=sys.stderr)
|
|
225
|
+
logger.critical(e)
|
|
226
|
+
sys.exit()
|
|
227
|
+
except TypeError as e:
|
|
228
|
+
print(f'Error with parsing collection: {args.collection}', file=sys.stderr)
|
|
229
|
+
logger.critical(e)
|
|
170
230
|
sys.exit()
|
|
171
231
|
else:
|
|
172
232
|
try:
|
|
173
|
-
all_studies = [dvc.StudyMetadata(url=args.url, pid=args.pid, key=args.key
|
|
233
|
+
all_studies = [dvc.StudyMetadata(url=args.url, pid=args.pid, key=args.key,
|
|
234
|
+
rate_limit_on=True,
|
|
235
|
+
rate_limit_min=0.25,
|
|
236
|
+
rate_limit_max=1)]
|
|
174
237
|
except (KeyError, dataverse_utils.collections.MetadataError) as e:
|
|
175
238
|
print(e, file=sys.stderr)
|
|
239
|
+
logger.critical(e)
|
|
176
240
|
sys.exit()
|
|
177
241
|
fname = {0: '_studies', 1:'_files'}
|
|
178
242
|
outdata = {}
|
|
179
243
|
for stud_file in range(2): # studies and files
|
|
180
|
-
fieldnames= fields(args.include_all_versions, stud_file, all_studies)
|
|
244
|
+
fieldnames = fields(args.include_all_versions, stud_file, all_studies)
|
|
245
|
+
logger.info(fieldnames)
|
|
181
246
|
out = io.StringIO(newline='')
|
|
182
247
|
writer = csv.DictWriter(out,
|
|
183
248
|
fieldnames=fieldnames,
|
|
@@ -186,10 +251,12 @@ def main():
|
|
|
186
251
|
extrasaction='ignore')
|
|
187
252
|
writer.writeheader()
|
|
188
253
|
for stud in all_studies:
|
|
254
|
+
logger.info(stud)
|
|
189
255
|
for row in output(stud, args.include_all_versions, stud_file):
|
|
190
256
|
data = {k:v.replace('\t',' ').replace('\r\n', ' ').replace('\n',' ')
|
|
191
257
|
if isinstance(v, str) else v
|
|
192
258
|
for k, v in row.items()}
|
|
259
|
+
logger.debug(data)
|
|
193
260
|
writer.writerow(data)
|
|
194
261
|
out.seek(0)
|
|
195
262
|
outdata[fname[stud_file][1:]] = out
|
|
@@ -206,7 +273,7 @@ def main():
|
|
|
206
273
|
file=sys.stdout)
|
|
207
274
|
conn = sqlite3.connect(pathlib.Path(args.output+extension(args)).expanduser())
|
|
208
275
|
for k,v in outdata.items():
|
|
209
|
-
x=pd.read_csv(v, delimiter=args.delimiter)
|
|
276
|
+
x = pd.read_csv(v, delimiter=args.delimiter)
|
|
210
277
|
x.to_sql(k, conn, if_exists='replace', index=0)
|
|
211
278
|
cursor = conn.cursor()
|
|
212
279
|
cursor.execute('DROP VIEW IF EXISTS short_combined_view;')
|
|
File without changes
|
|
File without changes
|
{dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/data/LDC_EULA_general.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_ldc_uploader.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_list_files.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_manifest_gen.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_pg_facet_date.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_readme_creator.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_record_copy.py
RENAMED
|
File without changes
|
|
File without changes
|
{dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_replace_licence.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_study_migrator.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_upload_tsv.py
RENAMED
|
File without changes
|