dataverse-utils 0.22.7__tar.gz → 0.22.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/PKG-INFO +2 -1
  2. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/pyproject.toml +3 -2
  3. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/__init__.py +2 -2
  4. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/collections.py +202 -36
  5. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_collection_info.py +78 -11
  6. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/LICENCE.md +0 -0
  7. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/README.md +0 -0
  8. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/data/LDC_EULA_general.md +0 -0
  9. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/dataverse_utils.py +0 -0
  10. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/dvdata.py +0 -0
  11. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/ldc.py +0 -0
  12. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_del.py +0 -0
  13. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_ldc_uploader.py +0 -0
  14. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_list_files.py +0 -0
  15. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_manifest_gen.py +0 -0
  16. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_pg_facet_date.py +0 -0
  17. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_readme_creator.py +0 -0
  18. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_record_copy.py +0 -0
  19. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_release.py +0 -0
  20. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_replace_licence.py +0 -0
  21. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_study_migrator.py +0 -0
  22. {dataverse_utils-0.22.7 → dataverse_utils-0.22.9}/src/dataverse_utils/scripts/dv_upload_tsv.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataverse-utils
3
- Version: 0.22.7
3
+ Version: 0.22.9
4
4
  Summary: Utilities for the Dataverse data repository system
5
5
  License: MIT
6
6
  License-File: LICENCE.md
@@ -26,6 +26,7 @@ Requires-Dist: pyreadstat (>=1.3.3,<2.0.0)
26
26
  Requires-Dist: requests (>=2.33,<3.0)
27
27
  Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
28
28
  Requires-Dist: tqdm (>=4.67.3,<5.0.0)
29
+ Requires-Dist: urllib3 (>=2.7.0,<3.0.0)
29
30
  Project-URL: Homepage, https://ubc-library-rc.github.io/dataverse_utils
30
31
  Project-URL: Issue Tracker, https://github.com/ubc-library-rc/dataverse_utils/issues
31
32
  Project-URL: Repository, https://github.com/ubc-library-rc/dataverse_utils.git
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dataverse-utils"
3
- version = "0.22.7"
3
+ version = "0.22.9"
4
4
  description = "Utilities for the Dataverse data repository system"
5
5
  authors = [
6
6
  {name = "Paul Lesack",email = "paul.lesack@ubc.ca"}
@@ -20,7 +20,8 @@ dependencies = [
20
20
  "tqdm (>=4.67.3,<5.0.0)",
21
21
  "dryad2dataverse (>=0.8.4,<0.9.0)",
22
22
  "chardet (>=5.2)",
23
- "requests (>=2.33,<3.0)"
23
+ "requests (>=2.33,<3.0)",
24
+ "urllib3 (>=2.7.0,<3.0.0)"
24
25
  ]
25
26
  #Chardet and requests will need to be changed when requests goes to 2.32; at that
26
27
  #point just remove chardet
@@ -7,7 +7,7 @@ import pathlib
7
7
  import sys
8
8
  from dataverse_utils.dataverse_utils import *
9
9
 
10
- VERSION = (0, 22, 7)
10
+ VERSION = (0, 22, '9a0')
11
11
  __version__ = '.'.join([str(x) for x in VERSION])
12
12
 
13
13
  USERAGENT = (f'dataverse_utils/v{__version__} ({sys.platform.capitalize()}); '
@@ -15,7 +15,7 @@ USERAGENT = (f'dataverse_utils/v{__version__} ({sys.platform.capitalize()}); '
15
15
  UAHEADER = {'User-agent' : USERAGENT}
16
16
 
17
17
  SCRIPT_VERSIONS={
18
- 'dv_collection_info' : (0, 3, 0),
18
+ 'dv_collection_info' : (0, 4, 1),
19
19
  'dv_del' : (0, 2, 4),
20
20
  'dv_ldc_uploader' : (0, 4, 1),
21
21
  'dv_list_files' : (0, 1, 1),
@@ -8,9 +8,11 @@ import datetime
8
8
  import io
9
9
  import logging
10
10
  import pathlib
11
+ import random
11
12
  import string
12
13
  import sys
13
14
  import tempfile
15
+ import time
14
16
  import textwrap
15
17
  import typing
16
18
  import traceback
@@ -34,12 +36,59 @@ RETRY = Retry(total=10,
34
36
  allowed_methods=['HEAD', 'GET', 'OPTIONS',
35
37
  'POST', 'PUT'],
36
38
  backoff_factor=1)
39
+ BAR_FORMAT='{l_bar}{bar}{n_fmt}/{total_fmt} : time remaining - {remaining}'
37
40
 
38
41
  class MetadataError(Exception):
39
42
  '''
40
43
  MetadataError
41
44
  '''
42
45
 
46
class RateLimiter:
    '''
    Pauses for a random interval before a request is made.
    '''
    #pylint: disable=too-few-public-methods
    def __init__(self, **kwargs):
        '''
        Parameters
        ----------
        **kwargs

        Other parameters
        ----------------
        rate_limit_on: bool
            Turn on rate limit for requests. Default False.

        rate_limit_min : int or float
            Minimum time between requests in seconds.
            Defaults to 0 if not supplied.

        rate_limit_max : int or float
            Maximum time between requests in seconds.
            Defaults to rate_limit_min if not supplied.

        session : requests.Session
            A requests session if available, to help
            ensure against having too many open connections

        Notes
        -----
        The rate limiter will wait for a random interval between
        rate_limit_min and rate_limit_max. Obviously, if you want
        a constant interval, set them to be equal.

        Any other keyword arguments are stored untouched in
        `self.kwargs` and otherwise ignored.
        '''
        self.kwargs = kwargs

        if not self.kwargs.get('rate_limit_on', False):
            #Limiter is off: normalize so rate_limit() never waits
            self.kwargs['rate_limit_on'] = False
            self.kwargs['rate_limit_min'] = 0
            self.kwargs['rate_limit_max'] = 0
            return

        #Limiter is on, but either bound may be missing or None.
        #Fill them in so rate_limit() can't die with a KeyError/TypeError.
        low = self.kwargs.get('rate_limit_min')
        high = self.kwargs.get('rate_limit_max')
        if low is None:
            low = 0
        if high is None:
            high = low
        #time.sleep raises ValueError on negative values, so clamp at zero,
        #and keep the pair ordered so that min really is the minimum.
        low, high = sorted((max(0, low), max(0, high)))
        self.kwargs['rate_limit_min'] = low
        self.kwargs['rate_limit_max'] = high

    def rate_limit(self):
        '''
        Sleep before requests for the time set by the rate limits
        '''
        low = self.kwargs['rate_limit_min']
        high = self.kwargs['rate_limit_max']
        if high <= 0:
            #Nothing to wait for (limiter off or zero interval)
            return
        time.sleep(random.uniform(low, high))
91
+
43
92
  class DvCollection:
44
93
  '''
45
94
  Metadata for an *entire* dataverse collection, recursively.
@@ -67,7 +116,29 @@ class DvCollection:
67
116
  ----------------
68
117
  timeout : int
69
118
  retry timeout in seconds
119
+
120
+ rate_limit_on: bool
121
+ Turn on rate limit for requests
122
+
123
+ rate_limit_min : int
124
+ Minimum time between requests in seconds
125
+
126
+ rate_limit_max : int
127
+ Maximum time between requests in seconds
128
+
129
+ session : requests.Session
130
+ A requests session if available, to help
131
+ ensure against having too many open connections
132
+
133
+ Notes
134
+ -----
135
+ The rate limiter will wait for a random interval between
136
+ rate_limit_min and rate_limit_max. Obviously, if you want
137
+ a constant interval, set them to be equal.
138
+
70
139
  '''
140
+ self.kwargs = kwargs
141
+ self.limit = RateLimiter(**kwargs)
71
142
  self.coll = coll
72
143
  self.url = self.__clean_url(url)
73
144
  self.headers = None
@@ -82,9 +153,9 @@ class DvCollection:
82
153
  else:
83
154
  self.retry_strategy = kwargs['retry']
84
155
  self.collections = None
85
- self.session = requests.Session()
156
+ self.session = kwargs.get('session', requests.Session())
86
157
  self.session.mount('https://',
87
- requests.adapters.HTTPAdapter(max_retries=self.retry_strategy))
158
+ requests.adapters.HTTPAdapter(max_retries=RETRY))
88
159
  self.studies = None
89
160
  self.__root = None
90
161
  self.all_colls = [self.root]
@@ -95,8 +166,10 @@ class DvCollection:
95
166
  Return the name and short name of the top level collection
96
167
  '''
97
168
  if not self.__root:
169
+ self.limit.rate_limit()
98
170
  x = self.session.get(f'{self.url}/api/dataverses/{self.coll}',
99
- headers=self.headers)
171
+ headers=self.headers,
172
+ timeout=self.kwargs.get('timeout', 15))
100
173
  x.raise_for_status()
101
174
  self.__root = (x.json()['data']['name'], x.json()['data']['alias'])
102
175
  return self.__root
@@ -120,11 +193,14 @@ class DvCollection:
120
193
  '''
121
194
  Get collection short name.
122
195
  '''
123
- shortname = self.session.get(f'{self.url}/api/dataverses/{dvid}', headers=self.headers)
196
+ self.limit.rate_limit()
197
+ shortname = self.session.get(f'{self.url}/api/dataverses/{dvid}',
198
+ headers=self.headers,
199
+ timeout=self.kwargs.get('timeout', 15))
124
200
  shortname.raise_for_status()
125
201
  return shortname.json()['data']['alias']
126
202
 
127
- def get_collections(self, coll:str=None, output=None, **kwargs)->list:#pylint: disable=unused-argument
203
+ def get_collections(self, coll:str=None, output=None)->list:#pylint: disable=unused-argument
128
204
  '''
129
205
  Get a [recursive] listing of all dataverses in a collection.
130
206
 
@@ -134,16 +210,15 @@ class DvCollection:
134
210
  Collection short name or id
135
211
  output : list, optional, default=[]
136
212
  output list to append to
137
- **kwargs : dict
138
- Other keyword arguments
139
-
140
213
  '''
141
214
  if not output:
142
215
  output = []
143
216
  if not coll:
144
217
  coll = self.coll
218
+ self.limit.rate_limit()
145
219
  x = self.session.get(f'{self.url}/api/dataverses/{coll}/contents',
146
- headers=self.headers)
220
+ headers=self.headers,
221
+ timeout=self.kwargs.get('timeout', 15))
147
222
  data = x.json().get('data')
148
223
  #---
149
224
  #Because it's possible that permissions errors can cause API read errors,
@@ -186,7 +261,8 @@ class DvCollection:
186
261
  LOGGER.debug('recursive')
187
262
  self.get_collections(dv[1], output)
188
263
  self.collections = output
189
- self.collections.append(self.root)
264
+ if self.root not in self.collections:
265
+ self.collections.insert(0, self.root)
190
266
  return output
191
267
 
192
268
  def get_studies(self, root:str=None):
@@ -201,9 +277,15 @@ class DvCollection:
201
277
  all_studies = []
202
278
  if not root:
203
279
  root=self.coll
204
- all_studies = self.get_collection_listing(root)
280
+ #Redundant, as root is now added to get_collections
281
+ #all_studies = self.get_collection_listing(root)
282
+ all_studies = []
205
283
  collections = self.get_collections(root)
206
- for collection in tqdm.tqdm(collections):
284
+ for collection in tqdm.tqdm(collections,
285
+ desc='collections',
286
+ unit='collection',
287
+ leave=False,
288
+ bar_format=BAR_FORMAT):
207
289
  all_studies.extend(self.get_collection_listing(collection[1]))
208
290
  self.studies = all_studies
209
291
  return all_studies
@@ -217,8 +299,10 @@ class DvCollection:
217
299
  coll_id : str
218
300
  Short name or id of a dataverse collection
219
301
  '''
302
+ self.limit.rate_limit()
220
303
  cl = self.session.get(f'{self.url}/api/dataverses/{coll_id}/contents',
221
- headers=self.headers)
304
+ headers=self.headers,
305
+ timeout=self.kwargs.get('timeout', 15))
222
306
  cl.raise_for_status()
223
307
  pids = [f"{z['protocol']}:{z['authority']}/{z['identifier']}"
224
308
  for z in cl.json()['data'] if z['type'] == 'dataset']
@@ -226,7 +310,15 @@ class DvCollection:
226
310
  #a metadata download
227
311
  smkwargs = [{'collection_name':_[0] , 'collection_short_name':_[1]}
228
312
  for _ in self.collections if coll_id == _[1]][0]
229
- out = [(self.get_study_info(pid, **smkwargs), pid) for pid in pids]
313
+ #out = [(self.get_study_info(pid, **smkwargs), pid) for pid in pids]
314
+ out = []
315
+ for pid in tqdm.tqdm(pids,
316
+ desc=smkwargs.get('collection_short_name', 'collection'),
317
+ unit='study',
318
+ leave=False,
319
+ colour='red',
320
+ bar_format=BAR_FORMAT):
321
+ out.append((self.get_study_info(pid, **smkwargs), pid))
230
322
  for _ in out:
231
323
  _[0].update({'pid': _[1]})
232
324
  return [x[0] for x in out]
@@ -243,12 +335,15 @@ class DvCollection:
243
335
  **kwargs
244
336
  Other useful information to pass onto StudyMetadata, such as collection info, etc.
245
337
  '''
338
+ self.limit.rate_limit()
246
339
  meta = self.session.get(f'{self.url}/api/datasets/:persistentId',
247
- params={'persistentId': pid},
248
- headers=self.headers)
340
+ params={'persistentId': pid},
341
+ headers=self.headers,
342
+ timeout=self.kwargs.get('timeout', 15))
249
343
  meta.raise_for_status()
250
344
  LOGGER.debug(pid)
251
- return StudyMetadata(study_meta=meta.json(), key=self.__key, url=self.url, **kwargs)
345
+ return StudyMetadata(study_meta=meta.json(), key=self.__key, url=self.url,
346
+ session=self.session, **kwargs)
252
347
 
253
348
  class StudyMetadata(dict):
254
349
  '''
@@ -279,15 +374,37 @@ class StudyMetadata(dict):
279
374
  key : str
280
375
  Dataverse instance API key (needed for unpublished studies)
281
376
 
377
+ rate_limit_on: bool
378
+ Turn on rate limit for requests
379
+
380
+ rate_limit_min : int
381
+ Minimum time between requests in seconds
382
+
383
+ rate_limit_max : int
384
+ Maximum time between requests in seconds
385
+
386
+ session : requests.Session
387
+ A requests session if available, to help
388
+ ensure against having too many open connections
389
+
282
390
  Notes
283
391
  -----
284
392
  Either `study_meta` is required OR `pid` and `url`. `key` _may_ be required
285
393
  if either a draft study is being accessed or the Dataverse installation
286
394
  requires API keys for all requests.
395
+
396
+ The rate limiter will wait for a random interval between
397
+ rate_limit_min and rate_limit_max. Obviously, if you want
398
+ a constant interval, set them to be equal.
399
+
287
400
  '''
288
401
  self.kwargs = kwargs
402
+ self.session = kwargs.get('session', requests.Session())
403
+ self.session.mount('https://',
404
+ requests.adapters.HTTPAdapter(max_retries=RETRY))
405
+ self.limit = RateLimiter(**kwargs)
289
406
  self.study_meta = kwargs.get('study_meta')
290
- self.all_versions = None
407
+ self.all_versions = kwargs.get('all_versions')
291
408
  self.url = kwargs.get('url')
292
409
  self.pid = kwargs.get('pid')
293
410
  #If only there would be an easy way to check if something was deaccessioned
@@ -299,9 +416,12 @@ class StudyMetadata(dict):
299
416
  # f"{self.study_meta['data']['authority']}"
300
417
  # f"/{self.study_meta['data']['identifier']}") if not
301
418
  # self.pid else self.pid)
302
- self.pid = (f"{self.study_meta['data']['protocol']}:"
303
- f"{self.study_meta['data']['authority']}"
304
- f"/{self.study_meta['data']['identifier']}")
419
+ try:
420
+ self.pid = (f"{self.study_meta['data']['protocol']}:"
421
+ f"{self.study_meta['data']['authority']}"
422
+ f"/{self.study_meta['data']['identifier']}")
423
+ except (KeyError,) as e:
424
+ raise MetadataError(f'Key error: {e}') from e
305
425
 
306
426
  self.headers = UAHEADER.copy()
307
427
  if not (('study_meta' in kwargs) or ('url' in kwargs and 'pid' in kwargs)):
@@ -337,16 +457,23 @@ class StudyMetadata(dict):
337
457
  if self.kwargs.get('key'):
338
458
  self.headers.update({'X-Dataverse-key':self.kwargs['key']})
339
459
  params = {'persistentId': self.pid}
340
- self.session = requests.Session()
341
- self.session.mount('https://',
342
- requests.adapters.HTTPAdapter(max_retries=RETRY))
343
460
  self.url = self.url.strip('/')
344
461
  if not self.url.startswith('https://'):
345
462
  self.url = f'https://{self.url}'
463
+ self.limit.rate_limit()
464
+ LOGGER.debug('Attempting %s/api/datasets/, params %s, headers %s',
465
+ self.url, params, self.headers)
346
466
  data = self.session.get(f'{self.url}/api/datasets/:persistentId',
347
- headers=self.headers, params=params)
467
+ headers=self.headers, params=params,
468
+ timeout=self.kwargs.get('timeout', 15))
469
+ data.raise_for_status()
470
+ self.limit.rate_limit()
471
+ LOGGER.debug('Attempting %s/api/datasets/:persistentId/versions, params %s, headers %s',
472
+ self.url, params, self.headers)
348
473
  all_versions = self.session.get(f'{self.url}/api/datasets/:persistentId/versions',
349
- headers=self.headers, params=params)
474
+ headers=self.headers, params=params,
475
+ timeout=self.kwargs.get('timeout', 15))
476
+ all_versions.raise_for_status()
350
477
  return data.json(), all_versions.json()
351
478
 
352
479
  def __has_metadata(self)->bool:
@@ -388,10 +515,14 @@ class StudyMetadata(dict):
388
515
  tmp['versionStatement'] = f"{chunk['versionNumber']}.{chunk['versionMinorNumber']}"
389
516
  else:
390
517
  tmp['versionStatement'] = f"{chunk.get('versionState', '')}"
391
-
518
+ #ADD fields here if they are not in the metadata and you need them
519
+ tmp['pid'] = self.pid #Because you need generally need this
520
+ #Collection info
392
521
  for _ in ['collection_name', 'collection_short_name']:
393
522
  if self.kwargs.get(_):
394
523
  tmp[_] = self.kwargs[_]
524
+ #Latest version number or state for easy filtering @@@
525
+ tmp['is_current_version'] = tmp['versionStatement'] == self.current_version
395
526
  return tmp
396
527
 
397
528
  def extract_field_metadata(self, field):
@@ -432,7 +563,7 @@ class StudyMetadata(dict):
432
563
  else:
433
564
  #sometimes value is None because reasons.
434
565
  interim[v3['typeName']] = [v3.get('value', [] )]
435
- LOGGER.debug(interim)
566
+ #LOGGER.debug(interim)
436
567
  for k9, v9 in interim.items():
437
568
  out.update({k9: '; '.join(v9)})
438
569
 
@@ -476,8 +607,14 @@ class StudyMetadata(dict):
476
607
  '''
477
608
  Return a formatted version statement for the most recent version
478
609
  '''
479
- return (f"{self.study_meta['data']['latestVersion']['versionNumber']}."
480
- f"{self.study_meta['data']['latestVersion']['versionMinorNumber']}")
610
+ try:
611
+ return (f"{self.study_meta['data']['latestVersion']['versionNumber']}."
612
+ f"{self.study_meta['data']['latestVersion']['versionMinorNumber']}")
613
+ except (KeyError, ValueError):
614
+ try:
615
+ return f"{self.study_meta['data']['latestVersion']['versionState']}"
616
+ except (ValueError, KeyError):
617
+ return 'DEACCESSIONED'
481
618
 
482
619
  @property
483
620
  def versions(self)->list:
@@ -832,6 +969,12 @@ class ReadmeCreator:
832
969
  entire StudyMetadata object.
833
970
  '''
834
971
  metatmp = self.meta.copy()
972
+ #Delete redundant info fields added when harvesting Study Metadata
973
+ for _ in ['pid', 'is_current_version', 'version_statement']:
974
+ try:
975
+ del metatmp[_]
976
+ except KeyError:
977
+ continue
835
978
  neworder = self.reorder_fields(metatmp)
836
979
  addme = self.concatenator(metatmp)
837
980
  metatmp.update(addme)
@@ -1071,7 +1214,7 @@ class FileAnalysis(dict):
1071
1214
  Download and analyze a file from a dataverse installation and
1072
1215
  produce useful metadata.
1073
1216
  '''
1074
-
1217
+ #pylint: disable=too-many-instance-attributes
1075
1218
  def __init__(self, **kwargs):
1076
1219
  '''
1077
1220
  Intialize the object.
@@ -1104,16 +1247,34 @@ class FileAnalysis(dict):
1104
1247
  filesize_bytes : int
1105
1248
  File size in bytes
1106
1249
 
1250
+ rate_limit_on: bool
1251
+ Turn on rate limit for requests
1252
+
1253
+ rate_limit_min : int
1254
+ Minimum time between requests in seconds
1255
+
1256
+ rate_limit_max : int
1257
+ Maximum time between requests in seconds
1258
+
1259
+ session : requests.Session
1260
+ A requests session if available, to help
1261
+ ensure against having too many open connections
1262
+
1107
1263
  Notes
1108
1264
  -----
1109
1265
  Either `local` must be supplied, or `url`, `key` and at least one of
1110
1266
  `id` or `pid` must be supplied
1111
1267
 
1112
- '''
1268
+ The rate limiter will wait for a random interval between
1269
+ rate_limit_min and rate_limit_max. Obviously, if you want
1270
+ a constant interval, set them to be equal.
1113
1271
 
1272
+ '''
1273
+ #pylint disable=too-many-instance-attributes
1114
1274
  #self.url = self.__clean_url(url)
1115
1275
  self.headers = UAHEADER.copy()
1116
1276
  self.kwargs = kwargs
1277
+ self.limit = RateLimiter(**kwargs)
1117
1278
  if self.kwargs.get('key'):
1118
1279
  self.headers.update({'X-Dataverse-key':self.kwargs['key']})
1119
1280
  self.local = None
@@ -1123,7 +1284,7 @@ class FileAnalysis(dict):
1123
1284
  '(pid or id)) or (local) keyword parameters.')
1124
1285
  raise TypeError(err)
1125
1286
  self.tempfile = None
1126
- self.session = requests.Session()
1287
+ self.session = kwargs.get('session', requests.Session())
1127
1288
  self.session.mount('https://',
1128
1289
  requests.adapters.HTTPAdapter(max_retries=RETRY))
1129
1290
  self.checkable = {'.sav': self.stat_file_metadata,
@@ -1235,17 +1396,20 @@ class FileAnalysis(dict):
1235
1396
  start = datetime.datetime.now()
1236
1397
  params = {'format':'original'}
1237
1398
  url = self.__clean_url(self.kwargs['url'])
1399
+ self.limit.rate_limit()
1238
1400
  if self.kwargs.get('pid'):
1239
1401
  params.update({'persistentId':self.kwargs['pid']})
1240
1402
  data = self.session.get(f'{url}/api/access/datafile/:persistentId',
1241
1403
  headers=self.headers,
1242
1404
  params=params,
1243
- stream=True)
1405
+ stream=True,
1406
+ timeout=self.kwargs.get('timeout', 15))
1244
1407
  else:
1245
1408
  data = self.session.get(f'{url}/api/access/datafile/{self.kwargs["id"]}',
1246
1409
  headers=self.headers,
1247
1410
  params=params,
1248
- stream=True)
1411
+ stream=True,
1412
+ timeout=self.kwargs.get('timeout', 15))
1249
1413
  data.raise_for_status()
1250
1414
  finish = datetime.datetime.now()
1251
1415
  self.filename = self.__get_filename(data.headers)
@@ -1255,7 +1419,9 @@ class FileAnalysis(dict):
1255
1419
  filesize = self.kwargs.get('filesize_bytes',
1256
1420
  data.headers.get('content-length', 9e9))
1257
1421
  filesize = int(filesize) # comes out as string from header
1258
- with tqdm.tqdm(total=filesize, unit='B', unit_scale=True, desc=self.filename) as t:
1422
+ with tqdm.tqdm(total=filesize, unit='B', unit_scale=True,
1423
+ desc=self.filename, leave=False,
1424
+ bar_format=BAR_FORMAT) as t:
1259
1425
  for _ in data.iter_content(block_size):
1260
1426
  self.tempfile.file.write(_)
1261
1427
  t.update(len(_))
@@ -5,6 +5,7 @@ outputs study metadata for the latest version
5
5
  import argparse
6
6
  import io
7
7
  import csv
8
+ import logging
8
9
  import pathlib
9
10
  import sqlite3
10
11
  import sys
@@ -57,9 +58,37 @@ def parse() -> argparse.ArgumentParser():
57
58
  parser.add_argument('-s', '--sqlite',
58
59
  help='Save output as SQLite3 database',
59
60
  action='store_true')
61
+ parser.add_argument('-l', '--log',
62
+ help=textwrap.fill(textwrap.dedent(
63
+ '''
64
+ If you would like a log, provide a log file name here.
65
+ If no file name is provided, no log is created.
66
+ '''),80),
67
+ default=None)
68
+ parser.add_argument('--log-level',
69
+ help=textwrap.fill(textwrap.dedent(
70
+ '''
71
+ Log level. Acceptable values for log level are: debug, info,
72
+ warning, error, critical.
73
+ Default value: warning.
74
+ '''),80),
75
+ default='warning')
76
+ parser.add_argument('--rate-limit-off',
77
+ action='store_true',
78
+ help=('Turn off rate limiter. '
79
+ 'Requests are randomly between min and max. Default is ON.'))
80
+ parser.add_argument('--rate-limit-min',
81
+ help='Minimum time before requests in seconds. Default 0.25',
82
+ default=0.25,
83
+ type=float)
84
+ parser.add_argument('--rate-limit-max',
85
+ help='Maximum time between requests in seconds: Default 1',
86
+ default=1,
87
+ type=float)
88
+
60
89
  group = parser.add_argument_group(title='Harvest options',
61
90
  description=textwrap.fill(
62
- ' You can obtain info for *either* a recursive crawl '
91
+ 'You can obtain info for *either* a recursive crawl '
63
92
  'of a collection (-c, --collection) OR for a single '
64
93
  'Dataverse ' 'study (-p, --pid). '
65
94
  'These arguments are mutually exclusive.'))
@@ -149,35 +178,71 @@ def extension(args:argparse.ArgumentParser):
149
178
  return '.sqlite3'
150
179
  return extype.get(args.delimiter, '.txt')
151
180
 
181
def logme(pargs:argparse.Namespace)->logging.Logger:
    '''
    Configure and return the root logger.

    Parameters
    ----------
    pargs : argparse.Namespace
        Parsed arguments. Uses `pargs.log_level` (one of debug, info,
        warning, error, critical; case-insensitive, anything else
        falls back to warning) and `pargs.log` (log file name, or
        a false value for no log file).

    Returns
    -------
    logging.Logger
        The root logger, with its level set and either a UTF-8
        file handler (if `pargs.log` is given) or a NullHandler attached.

    Notes
    -----
    Calling this more than once does not stack duplicate handlers;
    a handler is only added if an equivalent one is not already attached.
    '''
    logger = logging.getLogger()
    l_format = logging.Formatter('%(name)s - %(asctime)s'
                                 ' - %(levelname)s - %(funcName)s - '
                                 '%(message)s')
    lookup = {'debug' : logging.DEBUG,
              'info' : logging.INFO,
              'warning': logging.WARNING,
              'error': logging.ERROR,
              'critical': logging.CRITICAL}
    level = lookup.get(pargs.log_level.lower(), logging.WARNING)
    logger.setLevel(level)
    if pargs.log:
        #delay=True means the file is not opened until the first record
        text = logging.FileHandler(pargs.log, encoding='utf-8', delay=True)
        already = any(isinstance(hand, logging.FileHandler)
                      and hand.baseFilename == text.baseFilename
                      for hand in logger.handlers)
        if already:
            #Same file is already being logged to; a second handler
            #would write every record twice.
            text.close()
            return logger
        text.setFormatter(l_format)
        logger.addHandler(text)
        return logger
    if not any(isinstance(hand, logging.NullHandler) for hand in logger.handlers):
        logger.addHandler(logging.NullHandler())
    return logger
203
+
152
204
  def main():
153
205
  '''
154
206
  You know what this is
155
207
  '''
156
- #pylint: disable=too-many-branches, too-many-locals
208
+ #pylint: disable=too-many-branches, too-many-locals, too-many-statements
157
209
  args = parse().parse_args()
210
+ logger = logme(args)
158
211
  if args.collection:
159
- coll_me = dvc.DvCollection(args.url, args.collection, args.key)
160
- try:
161
- coll_me.get_collections()
162
- except TypeError:
163
- print(f'Error with parsing collection: {args.collection}', file=sys.stderr)
164
- sys.exit()
212
+ coll_me = dvc.DvCollection(args.url, args.collection, args.key,
213
+ rate_limit_on=not args.rate_limit_off,
214
+ rate_limit_min=args.rate_limit_min,
215
+ rate_limit_max=args.rate_limit_max)
165
216
  try:
166
217
  coll_me.get_studies()
167
218
  all_studies = coll_me.studies
219
+ if not all_studies: #Stupid but this happens
220
+ print('No studies in collection', file=sys.stderr)
221
+ logger.warning('No studies to process in collection %s', args.collection)
222
+ sys.exit()
168
223
  except dataverse_utils.collections.MetadataError as e:
169
224
  print(e, file=sys.stderr)
225
+ logger.critical(e)
226
+ sys.exit()
227
+ except TypeError as e:
228
+ print(f'Error with parsing collection: {args.collection}', file=sys.stderr)
229
+ logger.critical(e)
170
230
  sys.exit()
171
231
  else:
172
232
  try:
173
- all_studies = [dvc.StudyMetadata(url=args.url, pid=args.pid, key=args.key)]
233
+ all_studies = [dvc.StudyMetadata(url=args.url, pid=args.pid, key=args.key,
234
+ rate_limit_on=True,
235
+ rate_limit_min=0.25,
236
+ rate_limit_max=1)]
174
237
  except (KeyError, dataverse_utils.collections.MetadataError) as e:
175
238
  print(e, file=sys.stderr)
239
+ logger.critical(e)
176
240
  sys.exit()
177
241
  fname = {0: '_studies', 1:'_files'}
178
242
  outdata = {}
179
243
  for stud_file in range(2): # studies and files
180
- fieldnames= fields(args.include_all_versions, stud_file, all_studies)
244
+ fieldnames = fields(args.include_all_versions, stud_file, all_studies)
245
+ logger.info(fieldnames)
181
246
  out = io.StringIO(newline='')
182
247
  writer = csv.DictWriter(out,
183
248
  fieldnames=fieldnames,
@@ -186,10 +251,12 @@ def main():
186
251
  extrasaction='ignore')
187
252
  writer.writeheader()
188
253
  for stud in all_studies:
254
+ logger.info(stud)
189
255
  for row in output(stud, args.include_all_versions, stud_file):
190
256
  data = {k:v.replace('\t',' ').replace('\r\n', ' ').replace('\n',' ')
191
257
  if isinstance(v, str) else v
192
258
  for k, v in row.items()}
259
+ logger.debug(data)
193
260
  writer.writerow(data)
194
261
  out.seek(0)
195
262
  outdata[fname[stud_file][1:]] = out
@@ -206,7 +273,7 @@ def main():
206
273
  file=sys.stdout)
207
274
  conn = sqlite3.connect(pathlib.Path(args.output+extension(args)).expanduser())
208
275
  for k,v in outdata.items():
209
- x=pd.read_csv(v, delimiter=args.delimiter)
276
+ x = pd.read_csv(v, delimiter=args.delimiter)
210
277
  x.to_sql(k, conn, if_exists='replace', index=0)
211
278
  cursor = conn.cursor()
212
279
  cursor.execute('DROP VIEW IF EXISTS short_combined_view;')