dataverse-utils 0.22.4__tar.gz → 0.22.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/PKG-INFO +3 -3
  2. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/pyproject.toml +3 -3
  3. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/__init__.py +4 -4
  4. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/collections.py +255 -50
  5. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/ldc.py +7 -5
  6. dataverse_utils-0.22.8/src/dataverse_utils/scripts/dv_collection_info.py +297 -0
  7. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_ldc_uploader.py +11 -7
  8. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_release.py +1 -2
  9. dataverse_utils-0.22.4/src/dataverse_utils/scripts/dv_collection_info.py +0 -244
  10. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/LICENCE.md +0 -0
  11. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/README.md +0 -0
  12. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/data/LDC_EULA_general.md +0 -0
  13. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/dataverse_utils.py +0 -0
  14. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/dvdata.py +0 -0
  15. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_del.py +0 -0
  16. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_list_files.py +0 -0
  17. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_manifest_gen.py +0 -0
  18. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_pg_facet_date.py +0 -0
  19. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_readme_creator.py +0 -0
  20. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_record_copy.py +0 -0
  21. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_replace_licence.py +0 -0
  22. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_study_migrator.py +0 -0
  23. {dataverse_utils-0.22.4 → dataverse_utils-0.22.8}/src/dataverse_utils/scripts/dv_upload_tsv.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataverse-utils
3
- Version: 0.22.4
3
+ Version: 0.22.8
4
4
  Summary: Utilities for the Dataverse data respository system
5
5
  License: MIT
6
6
  License-File: LICENCE.md
@@ -16,14 +16,14 @@ Classifier: Programming Language :: Python :: 3.12
16
16
  Classifier: Programming Language :: Python :: 3.13
17
17
  Classifier: Programming Language :: Python :: 3.14
18
18
  Requires-Dist: bs4 (>=0.0.2,<0.0.3)
19
- Requires-Dist: chardet (>=5.2,<5.3)
19
+ Requires-Dist: chardet (>=5.2)
20
20
  Requires-Dist: dryad2dataverse (>=0.8.4,<0.9.0)
21
21
  Requires-Dist: markdown (>=3.10.2,<4.0.0)
22
22
  Requires-Dist: markdown-pdf (>=1.13.1,<2.0.0)
23
23
  Requires-Dist: markdownify (>=1.2.2,<2.0.0)
24
24
  Requires-Dist: pyreadr (>=0.5.4,<0.6.0)
25
25
  Requires-Dist: pyreadstat (>=1.3.3,<2.0.0)
26
- Requires-Dist: requests (>=2.30.0,<3.0.0)
26
+ Requires-Dist: requests (>=2.33,<3.0)
27
27
  Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
28
28
  Requires-Dist: tqdm (>=4.67.3,<5.0.0)
29
29
  Project-URL: Homepage, https://ubc-library-rc.github.io/dataverse_utils
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dataverse-utils"
3
- version = "0.22.4"
3
+ version = "0.22.8"
4
4
  description = "Utilities for the Dataverse data respository system"
5
5
  authors = [
6
6
  {name = "Paul Lesack",email = "paul.lesack@ubc.ca"}
@@ -10,7 +10,6 @@ readme = "README.md"
10
10
  requires-python = ">=3.10, <4.0"
11
11
  #When requests 2.33 is released, update poetry and release
12
12
  dependencies = [
13
- "requests (>=2.30.0,<3.0.0)",
14
13
  "bs4 (>=0.0.2,<0.0.3)",
15
14
  "markdown (>=3.10.2,<4.0.0)",
16
15
  "markdown-pdf (>=1.13.1,<2.0.0)",
@@ -20,7 +19,8 @@ dependencies = [
20
19
  "requests-toolbelt (>=1.0.0,<2.0.0)",
21
20
  "tqdm (>=4.67.3,<5.0.0)",
22
21
  "dryad2dataverse (>=0.8.4,<0.9.0)",
23
- "chardet (>=5.2,<5.3)"
22
+ "chardet (>=5.2)",
23
+ "requests (>=2.33,<3.0)"
24
24
  ]
25
25
  #Chardet and requests will need to be changed when requests goes to 2.32; at that
26
26
  #point just remove chardet
@@ -7,7 +7,7 @@ import pathlib
7
7
  import sys
8
8
  from dataverse_utils.dataverse_utils import *
9
9
 
10
- VERSION = (0, 22, 4)
10
+ VERSION = (0, 22, 8)
11
11
  __version__ = '.'.join([str(x) for x in VERSION])
12
12
 
13
13
  USERAGENT = (f'dataverse_utils/v{__version__} ({sys.platform.capitalize()}); '
@@ -15,14 +15,14 @@ USERAGENT = (f'dataverse_utils/v{__version__} ({sys.platform.capitalize()}); '
15
15
  UAHEADER = {'User-agent' : USERAGENT}
16
16
 
17
17
  SCRIPT_VERSIONS={
18
- 'dv_collection_info' : (0, 1, 3),
18
+ 'dv_collection_info' : (0, 4, 0),
19
19
  'dv_del' : (0, 2, 4),
20
- 'dv_ldc_uploader' : (0, 3, 0),
20
+ 'dv_ldc_uploader' : (0, 4, 1),
21
21
  'dv_list_files' : (0, 1, 1),
22
22
  'dv_manifest_gen' : (0, 5, 1),
23
23
  'dv_pg_facet_date' : (0, 1, 1),
24
24
  'dv_record_copy' : (0, 1, 2),
25
- 'dv_release' : (0, 1, 2),
25
+ 'dv_release' : (0, 1, 3),
26
26
  'dv_replace_licence' : (0, 1, 1),
27
27
  'dv_readme_creator' : (0, 1, 1),
28
28
  'dv_study_migrator' : (0, 4, 1),
@@ -8,8 +8,11 @@ import datetime
8
8
  import io
9
9
  import logging
10
10
  import pathlib
11
+ import random
11
12
  import string
13
+ import sys
12
14
  import tempfile
15
+ import time
13
16
  import textwrap
14
17
  import typing
15
18
  import traceback
@@ -33,12 +36,59 @@ RETRY = Retry(total=10,
33
36
  allowed_methods=['HEAD', 'GET', 'OPTIONS',
34
37
  'POST', 'PUT'],
35
38
  backoff_factor=1)
39
+ BAR_FORMAT='{l_bar}{bar}{n_fmt}/{total_fmt} : time remaining - {remaining}'
36
40
 
37
41
  class MetadataError(Exception):
38
42
  '''
39
43
  MetadataError
40
44
  '''
41
45
 
46
+ class RateLimiter:
47
+ '''
48
+ Pauses for a random interval
49
+ '''
50
+ #pylint: disable=too-few-public-methods
51
+ def __init__(self, **kwargs):
52
+ '''
53
+ Parameters
54
+ ----------
55
+ **kwargs
56
+
57
+ Other parameters
58
+ ----------------
59
+ rate_limit_on: bool
60
+ Turn on rate limit for requests
61
+
62
+ rate_limit_min : int
63
+ Minimum time between requests in seconds
64
+
65
+ rate_limit_max : int
66
+ Maximum time between requests in seconds
67
+
68
+ session : requests.Session
69
+ A requests session if available, to help
70
+ ensure against having too many open connections
71
+
72
+ Notes
73
+ -----
74
+ The rate limiter will wait for a random interval between
75
+ rate_limit_min and rate_limit_max. Obviously, if you want
76
+ a constant interval, set them to be equal.
77
+ '''
78
+ self.kwargs = kwargs
79
+
80
+ if not self.kwargs.get('rate_limit_on', False):
81
+ self.kwargs['rate_limit_on'] = False
82
+ self.kwargs['rate_limit_min'] = 0
83
+ self.kwargs['rate_limit_max'] = 0
84
+
85
+ def rate_limit(self):
86
+ '''
87
+ Sleep before requests for the time set by the rate limits
88
+ '''
89
+ time.sleep(random.uniform(self.kwargs['rate_limit_min'],
90
+ self.kwargs['rate_limit_max']))
91
+
42
92
  class DvCollection:
43
93
  '''
44
94
  Metadata for an *entire* dataverse collection, recursively.
@@ -66,7 +116,29 @@ class DvCollection:
66
116
  ----------------
67
117
  timeout : int
68
118
  retry timeout in seconds
119
+
120
+ rate_limit_on: bool
121
+ Turn on rate limit for requests
122
+
123
+ rate_limit_min : int
124
+ Minimum time between requests in seconds
125
+
126
+ rate_limit_max : int
127
+ Maximum time between requests in seconds
128
+
129
+ session : requests.Session
130
+ A requests session if available, to help
131
+ ensure against having too many open connections
132
+
133
+ Notes
134
+ -----
135
+ The rate limiter will wait for a random interval between
136
+ rate_limit_min and rate_limit_max. Obviously, if you want
137
+ a constant interval, set them to be equal.
138
+
69
139
  '''
140
+ self.kwargs = kwargs
141
+ self.limit = RateLimiter(**kwargs)
70
142
  self.coll = coll
71
143
  self.url = self.__clean_url(url)
72
144
  self.headers = None
@@ -80,11 +152,27 @@ class DvCollection:
80
152
  self.retry_strategy = RETRY
81
153
  else:
82
154
  self.retry_strategy = kwargs['retry']
83
- self.session = requests.Session()
84
- self.session.mount('https://',
85
- requests.adapters.HTTPAdapter(max_retries=self.retry_strategy))
86
155
  self.collections = None
156
+ self.session = kwargs.get('session', requests.Session())
157
+ self.session.mount('https://',
158
+ requests.adapters.HTTPAdapter(max_retries=RETRY))
87
159
  self.studies = None
160
+ self.__root = None
161
+ self.all_colls = [self.root]
162
+
163
+ @property
164
+ def root(self):
165
+ '''
166
+ Return the name and short name of the top level collection
167
+ '''
168
+ if not self.__root:
169
+ self.limit.rate_limit()
170
+ x = self.session.get(f'{self.url}/api/dataverses/{self.coll}',
171
+ headers=self.headers,
172
+ timeout=self.kwargs.get('timeout', 15))
173
+ x.raise_for_status()
174
+ self.__root = (x.json()['data']['name'], x.json()['data']['alias'])
175
+ return self.__root
88
176
 
89
177
  def __clean_url(self, badurl:str):
90
178
  '''
@@ -105,11 +193,14 @@ class DvCollection:
105
193
  '''
106
194
  Get collection short name.
107
195
  '''
108
- shortname = self.session.get(f'{self.url}/api/dataverses/{dvid}', headers=self.headers)
196
+ self.limit.rate_limit()
197
+ shortname = self.session.get(f'{self.url}/api/dataverses/{dvid}',
198
+ headers=self.headers,
199
+ timeout=self.kwargs.get('timeout', 15))
109
200
  shortname.raise_for_status()
110
201
  return shortname.json()['data']['alias']
111
202
 
112
- def get_collections(self, coll:str=None, output=None, **kwargs)->list:#pylint: disable=unused-argument
203
+ def get_collections(self, coll:str=None, output=None)->list:#pylint: disable=unused-argument
113
204
  '''
114
205
  Get a [recursive] listing of all dataverses in a collection.
115
206
 
@@ -119,16 +210,15 @@ class DvCollection:
119
210
  Collection short name or id
120
211
  output : list, optional, default=[]
121
212
  output list to append to
122
- **kwargs : dict
123
- Other keyword arguments
124
-
125
213
  '''
126
214
  if not output:
127
215
  output = []
128
216
  if not coll:
129
217
  coll = self.coll
218
+ self.limit.rate_limit()
130
219
  x = self.session.get(f'{self.url}/api/dataverses/{coll}/contents',
131
- headers=self.headers)
220
+ headers=self.headers,
221
+ timeout=self.kwargs.get('timeout', 15))
132
222
  data = x.json().get('data')
133
223
  #---
134
224
  #Because it's possible that permissions errors can cause API read errors,
@@ -142,7 +232,6 @@ class DvCollection:
142
232
  out=self.__get_shortname(_['id'])
143
233
  dvs.append((_['title'], out))
144
234
  except Exception as e:
145
-
146
235
  obscure_error = f'''
147
236
  An error has occured where a collection can be
148
237
  identified by ID but its name cannot be determined.
@@ -155,12 +244,13 @@ class DvCollection:
155
244
 
156
245
  Problematic collection id number: {_.get("id",
157
246
  "not available")}'''
158
- print(50*'-')
159
- print(textwrap.dedent(obscure_error))
247
+ #to sys.stdout?
248
+ print(50*'-', file=sys.stderr)
249
+ print(textwrap.dedent(obscure_error), file=sys.stderr)
160
250
  print(e)
161
251
  LOGGER.error(textwrap.fill(textwrap.dedent(obscure_error).strip()))
162
252
  traceback.print_exc()
163
- print(50*'-')
253
+ print(50*'-', file=sys.stderr)
164
254
  raise e
165
255
  #---
166
256
  if not dvs:
@@ -171,6 +261,8 @@ class DvCollection:
171
261
  LOGGER.debug('recursive')
172
262
  self.get_collections(dv[1], output)
173
263
  self.collections = output
264
+ if self.root not in self.collections:
265
+ self.collections.insert(0, self.root)
174
266
  return output
175
267
 
176
268
  def get_studies(self, root:str=None):
@@ -185,10 +277,15 @@ class DvCollection:
185
277
  all_studies = []
186
278
  if not root:
187
279
  root=self.coll
188
- all_studies = self.get_collection_listing(root)
189
- #collections = self.get_collections(root, self.url)
280
+ #Redundant, as root is now added to get_collections
281
+ #all_studies = self.get_collection_listing(root)
282
+ all_studies = []
190
283
  collections = self.get_collections(root)
191
- for collection in collections:
284
+ for collection in tqdm.tqdm(collections,
285
+ desc='collections',
286
+ unit='collection',
287
+ leave=False,
288
+ bar_format=BAR_FORMAT):
192
289
  all_studies.extend(self.get_collection_listing(collection[1]))
193
290
  self.studies = all_studies
194
291
  return all_studies
@@ -202,17 +299,31 @@ class DvCollection:
202
299
  coll_id : str
203
300
  Short name or id of a dataverse collection
204
301
  '''
302
+ self.limit.rate_limit()
205
303
  cl = self.session.get(f'{self.url}/api/dataverses/{coll_id}/contents',
206
- headers=self.headers)
304
+ headers=self.headers,
305
+ timeout=self.kwargs.get('timeout', 15))
207
306
  cl.raise_for_status()
208
307
  pids = [f"{z['protocol']}:{z['authority']}/{z['identifier']}"
209
308
  for z in cl.json()['data'] if z['type'] == 'dataset']
210
- out = [(self.get_study_info(pid), pid) for pid in pids]
309
+ #Pass collection info into the study because that's not available from
310
+ #a metadata download
311
+ smkwargs = [{'collection_name':_[0] , 'collection_short_name':_[1]}
312
+ for _ in self.collections if coll_id == _[1]][0]
313
+ #out = [(self.get_study_info(pid, **smkwargs), pid) for pid in pids]
314
+ out = []
315
+ for pid in tqdm.tqdm(pids,
316
+ desc=smkwargs.get('collection_short_name', 'collection'),
317
+ unit='study',
318
+ leave=False,
319
+ colour='red',
320
+ bar_format=BAR_FORMAT):
321
+ out.append((self.get_study_info(pid, **smkwargs), pid))
211
322
  for _ in out:
212
323
  _[0].update({'pid': _[1]})
213
324
  return [x[0] for x in out]
214
325
 
215
- def get_study_info(self, pid):
326
+ def get_study_info(self, pid, **kwargs):
216
327
  '''
217
328
  Returns a StudyMetadata object with complete metadata for a study.
218
329
 
@@ -220,13 +331,19 @@ class DvCollection:
220
331
  ----------
221
332
  pid : str
222
333
  Persistent ID of a Dataverse study
334
+
335
+ **kwargs
336
+ Other useful information to pass onto StudyMetadata, such as collection info, etc.
223
337
  '''
338
+ self.limit.rate_limit()
224
339
  meta = self.session.get(f'{self.url}/api/datasets/:persistentId',
225
- params={'persistentId': pid},
226
- headers=self.headers)
340
+ params={'persistentId': pid},
341
+ headers=self.headers,
342
+ timeout=self.kwargs.get('timeout', 15))
227
343
  meta.raise_for_status()
228
344
  LOGGER.debug(pid)
229
- return StudyMetadata(study_meta=meta.json(), key=self.__key, url=self.url)
345
+ return StudyMetadata(study_meta=meta.json(), key=self.__key, url=self.url,
346
+ session=self.session, **kwargs)
230
347
 
231
348
  class StudyMetadata(dict):
232
349
  '''
@@ -257,25 +374,54 @@ class StudyMetadata(dict):
257
374
  key : str
258
375
  Dataverse instance API key (needed for unpublished studies)
259
376
 
377
+ rate_limit_on: bool
378
+ Turn on rate limit for requests
379
+
380
+ rate_limit_min : int
381
+ Minimum time between requests in seconds
382
+
383
+ rate_limit_max : int
384
+ Maximum time between requests in seconds
385
+
386
+ session : requests.Session
387
+ A requests session if available, to help
388
+ ensure against having too many open connections
389
+
260
390
  Notes
261
391
  -----
262
392
  Either `study_meta` is required OR `pid` and `url`. `key` _may_ be required
263
393
  if either a draft study is being accessed or the Dataverse installation
264
394
  requires API keys for all requests.
395
+
396
+ The rate limiter will wait for a random interval between
397
+ rate_limit_min and rate_limit_max. Obviously, if you want
398
+ a constant interval, set them to be equal.
399
+
265
400
  '''
266
401
  self.kwargs = kwargs
402
+ self.session = kwargs.get('session', requests.Session())
403
+ self.session.mount('https://',
404
+ requests.adapters.HTTPAdapter(max_retries=RETRY))
405
+ self.limit = RateLimiter(**kwargs)
267
406
  self.study_meta = kwargs.get('study_meta')
268
- self.all_versions = None
407
+ self.all_versions = kwargs.get('all_versions')
269
408
  self.url = kwargs.get('url')
270
409
  self.pid = kwargs.get('pid')
410
+ #If only there would be an easy way to check if something was deaccessioned
411
+ #without yet another request. But right now, let's assume it's fine.
412
+ #See below (under Key Error) where it get set
413
+ self.deaccession_flag = 0
271
414
  if self.study_meta:
272
415
  #self.pid = kwargs.get('pid', (f"{self.study_meta['data']['protocol']}:"
273
416
  # f"{self.study_meta['data']['authority']}"
274
417
  # f"/{self.study_meta['data']['identifier']}") if not
275
418
  # self.pid else self.pid)
276
- self.pid = (f"{self.study_meta['data']['protocol']}:"
277
- f"{self.study_meta['data']['authority']}"
278
- f"/{self.study_meta['data']['identifier']}")
419
+ try:
420
+ self.pid = (f"{self.study_meta['data']['protocol']}:"
421
+ f"{self.study_meta['data']['authority']}"
422
+ f"/{self.study_meta['data']['identifier']}")
423
+ except (KeyError,) as e:
424
+ raise MetadataError(f'Key error: {e}') from e
279
425
 
280
426
  self.headers = UAHEADER.copy()
281
427
  if not (('study_meta' in kwargs) or ('url' in kwargs and 'pid' in kwargs)):
@@ -286,15 +432,23 @@ class StudyMetadata(dict):
286
432
  try:
287
433
  self.update(self.extract_metadata(self.study_meta['data']['latestVersion']))
288
434
  except KeyError as e:
289
- raise MetadataError(f'Unable to parse study metadata. Do you need an API key?\n'
290
- f'{e} key not found.\n'
291
- f'Offending JSON: {self.study_meta}') from e
435
+ if (self.study_meta.get('status') == 'OK' and not
436
+ self.study_meta['data'].get('latestVersion')):
437
+ # Latest version is not available because API strips out all
438
+ # citation metadata for deaccessioned studies but doesn't
439
+ # actually indicate this in any obvious manner
440
+ # This is further complicated because *all* the metadata
441
+ # we want is in the metadata blocks, which won't exist in the JSON
442
+ # because for some idiotic reason it's OK to expose it in the GUI
443
+ # but not via API.
444
+ self.deaccession_flag = 1
445
+ else:
446
+ raise MetadataError(f'Unable to parse study metadata. Do you need an API key?\n'
447
+ f'{e} key not found.\n'
448
+ f'Offending JSON: {self.study_meta}') from e
292
449
  self.__files = None
293
450
  self.__all_files = None
294
- #self.index = {f"{_['versionNumber']}.{_['versionMinorNumber']}": n
295
- # for n, _ in enumerate(self.all_versions['data'])}
296
- #self.index = {_: n for _, n in enumerate(self.versions)}
297
- self.index = dict(enumerate(self.versions))
451
+ self.index = {_: n for n, _ in enumerate(self.versions)}
298
452
 
299
453
  def __obtain_metadata(self):
300
454
  '''
@@ -303,16 +457,23 @@ class StudyMetadata(dict):
303
457
  if self.kwargs.get('key'):
304
458
  self.headers.update({'X-Dataverse-key':self.kwargs['key']})
305
459
  params = {'persistentId': self.pid}
306
- self.session = requests.Session()
307
- self.session.mount('https://',
308
- requests.adapters.HTTPAdapter(max_retries=RETRY))
309
460
  self.url = self.url.strip('/')
310
461
  if not self.url.startswith('https://'):
311
462
  self.url = f'https://{self.url}'
463
+ self.limit.rate_limit()
464
+ LOGGER.debug('Attempting %s/api/datasets/, params %s, headers %s',
465
+ self.url, params, self.headers)
312
466
  data = self.session.get(f'{self.url}/api/datasets/:persistentId',
313
- headers=self.headers, params=params)
467
+ headers=self.headers, params=params,
468
+ timeout=self.kwargs.get('timeout', 15))
469
+ data.raise_for_status()
470
+ self.limit.rate_limit()
471
+ LOGGER.debug('Attempting %s/api/datasets/:persistentId/versions, params %s, headers %s',
472
+ self.url, params, self.headers)
314
473
  all_versions = self.session.get(f'{self.url}/api/datasets/:persistentId/versions',
315
- headers=self.headers, params=params)
474
+ headers=self.headers, params=params,
475
+ timeout=self.kwargs.get('timeout', 15))
476
+ all_versions.raise_for_status()
316
477
  return data.json(), all_versions.json()
317
478
 
318
479
  def __has_metadata(self)->bool:
@@ -354,6 +515,14 @@ class StudyMetadata(dict):
354
515
  tmp['versionStatement'] = f"{chunk['versionNumber']}.{chunk['versionMinorNumber']}"
355
516
  else:
356
517
  tmp['versionStatement'] = f"{chunk.get('versionState', '')}"
518
+ #ADD fields here if they are not in the metadata and you need them
519
+ tmp['pid'] = self.pid #Because you generally need this
520
+ #Collection info
521
+ for _ in ['collection_name', 'collection_short_name']:
522
+ if self.kwargs.get(_):
523
+ tmp[_] = self.kwargs[_]
524
+ #Latest version number or state for easy filtering @@@
525
+ tmp['is_current_version'] = tmp['versionStatement'] == self.current_version
357
526
  return tmp
358
527
 
359
528
  def extract_field_metadata(self, field):
@@ -394,7 +563,7 @@ class StudyMetadata(dict):
394
563
  else:
395
564
  #sometimes value is None because reasons.
396
565
  interim[v3['typeName']] = [v3.get('value', [] )]
397
- LOGGER.debug(interim)
566
+ #LOGGER.debug(interim)
398
567
  for k9, v9 in interim.items():
399
568
  out.update({k9: '; '.join(v9)})
400
569
 
@@ -438,8 +607,14 @@ class StudyMetadata(dict):
438
607
  '''
439
608
  Return a formatted version statement for the most recent version
440
609
  '''
441
- return (f"{self.study_meta['data']['latestVersion']['versionNumber']}."
442
- f"{self.study_meta['data']['latestVersion']['versionMinorNumber']}")
610
+ try:
611
+ return (f"{self.study_meta['data']['latestVersion']['versionNumber']}."
612
+ f"{self.study_meta['data']['latestVersion']['versionMinorNumber']}")
613
+ except (KeyError, ValueError):
614
+ try:
615
+ return f"{self.study_meta['data']['latestVersion']['versionState']}"
616
+ except (ValueError, KeyError):
617
+ return 'DEACCESSIONED'
443
618
 
444
619
  @property
445
620
  def versions(self)->list:
@@ -549,7 +724,7 @@ class StudyMetadata(dict):
549
724
 
550
725
  files = [self.flatten(_) for _ in filelist]
551
726
  for ff in files:
552
- ff.update({'dataset_persistentId': self.pid})
727
+ ff.update({'dataset_pid': self.pid})
553
728
  return files
554
729
 
555
730
  def __extract_files(self):
@@ -560,9 +735,11 @@ class StudyMetadata(dict):
560
735
  #but files would (usually) be an arbitrary number of files.
561
736
  #That bothers me on an intellectual level. Therefore, it will be attribute.
562
737
  #Iterate over StudyMetadata.files if you want to know the contents
563
- if not self.__files:
738
+ if not self.__files and not self.deaccession_flag:
564
739
  self.__files = self.extract_files(self.study_meta['data']
565
740
  ['latestVersion']['files'])
741
+ if self.deaccession_flag:
742
+ self.__files = []
566
743
 
567
744
  def __extract_licence_info(self, indict)->dict:
568
745
  '''
@@ -695,7 +872,6 @@ class ReadmeCreator:
695
872
  return f'{inkey}: \n'
696
873
  return f'{inkey}: '
697
874
 
698
-
699
875
  def __extract_files(self):
700
876
  '''
701
877
  Extract file level metadata, and write to self.__files.
@@ -793,6 +969,12 @@ class ReadmeCreator:
793
969
  entire StudyMetadata object.
794
970
  '''
795
971
  metatmp = self.meta.copy()
972
+ #Delete redundant info fields added when harvesting Study Metadata
973
+ for _ in ['pid', 'is_current_version', 'version_statement']:
974
+ try:
975
+ del metatmp[_]
976
+ except KeyError:
977
+ continue
796
978
  neworder = self.reorder_fields(metatmp)
797
979
  addme = self.concatenator(metatmp)
798
980
  metatmp.update(addme)
@@ -1032,7 +1214,7 @@ class FileAnalysis(dict):
1032
1214
  Download and analyze a file from a dataverse installation and
1033
1215
  produce useful metadata.
1034
1216
  '''
1035
-
1217
+ #pylint: disable=too-many-instance-attributes
1036
1218
  def __init__(self, **kwargs):
1037
1219
  '''
1038
1220
  Initialize the object.
@@ -1065,16 +1247,34 @@ class FileAnalysis(dict):
1065
1247
  filesize_bytes : int
1066
1248
  File size in bytes
1067
1249
 
1250
+ rate_limit_on: bool
1251
+ Turn on rate limit for requests
1252
+
1253
+ rate_limit_min : int
1254
+ Minimum time between requests in seconds
1255
+
1256
+ rate_limit_max : int
1257
+ Maximum time between requests in seconds
1258
+
1259
+ session : requests.Session
1260
+ A requests session if available, to help
1261
+ ensure against having too many open connections
1262
+
1068
1263
  Notes
1069
1264
  -----
1070
1265
  Either `local` must be supplied, or `url`, `key` and at least one of
1071
1266
  `id` or `pid` must be supplied
1072
1267
 
1073
- '''
1268
+ The rate limiter will wait for a random interval between
1269
+ rate_limit_min and rate_limit_max. Obviously, if you want
1270
+ a constant interval, set them to be equal.
1074
1271
 
1272
+ '''
1273
+ #pylint: disable=too-many-instance-attributes
1075
1274
  #self.url = self.__clean_url(url)
1076
1275
  self.headers = UAHEADER.copy()
1077
1276
  self.kwargs = kwargs
1277
+ self.limit = RateLimiter(**kwargs)
1078
1278
  if self.kwargs.get('key'):
1079
1279
  self.headers.update({'X-Dataverse-key':self.kwargs['key']})
1080
1280
  self.local = None
@@ -1084,7 +1284,7 @@ class FileAnalysis(dict):
1084
1284
  '(pid or id)) or (local) keyword parameters.')
1085
1285
  raise TypeError(err)
1086
1286
  self.tempfile = None
1087
- self.session = requests.Session()
1287
+ self.session = kwargs.get('session', requests.Session())
1088
1288
  self.session.mount('https://',
1089
1289
  requests.adapters.HTTPAdapter(max_retries=RETRY))
1090
1290
  self.checkable = {'.sav': self.stat_file_metadata,
@@ -1196,17 +1396,20 @@ class FileAnalysis(dict):
1196
1396
  start = datetime.datetime.now()
1197
1397
  params = {'format':'original'}
1198
1398
  url = self.__clean_url(self.kwargs['url'])
1399
+ self.limit.rate_limit()
1199
1400
  if self.kwargs.get('pid'):
1200
1401
  params.update({'persistentId':self.kwargs['pid']})
1201
1402
  data = self.session.get(f'{url}/api/access/datafile/:persistentId',
1202
1403
  headers=self.headers,
1203
1404
  params=params,
1204
- stream=True)
1405
+ stream=True,
1406
+ timeout=self.kwargs.get('timeout', 15))
1205
1407
  else:
1206
1408
  data = self.session.get(f'{url}/api/access/datafile/{self.kwargs["id"]}',
1207
1409
  headers=self.headers,
1208
1410
  params=params,
1209
- stream=True)
1411
+ stream=True,
1412
+ timeout=self.kwargs.get('timeout', 15))
1210
1413
  data.raise_for_status()
1211
1414
  finish = datetime.datetime.now()
1212
1415
  self.filename = self.__get_filename(data.headers)
@@ -1216,7 +1419,9 @@ class FileAnalysis(dict):
1216
1419
  filesize = self.kwargs.get('filesize_bytes',
1217
1420
  data.headers.get('content-length', 9e9))
1218
1421
  filesize = int(filesize) # comes out as string from header
1219
- with tqdm.tqdm(total=filesize, unit='B', unit_scale=True, desc=self.filename) as t:
1422
+ with tqdm.tqdm(total=filesize, unit='B', unit_scale=True,
1423
+ desc=self.filename, leave=False,
1424
+ bar_format=BAR_FORMAT) as t:
1220
1425
  for _ in data.iter_content(block_size):
1221
1426
  self.tempfile.file.write(_)
1222
1427
  t.update(len(_))