dryad2dataverse 0.6.2__py3-none-any.whl → 0.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,7 +22,9 @@ Modules included:
22
22
 
23
23
  dryad2dataverse.exceptions : Custom exceptions.
24
24
  '''
25
+ import sys
25
26
 
26
- VERSION = (0, 6, 2)
27
-
27
+ VERSION = (0, 7, 4)
28
28
  __version__ = '.'.join([str(x) for x in VERSION])
29
+ USERAGENT = (f'dryad2dataverse/v{__version__} ({sys.platform.capitalize()}); '
30
+ f'Python {sys.version[:sys.version.find("(")-1]}')
@@ -325,7 +325,10 @@ class Monitor():
325
325
  # because of code duplication below.
326
326
  for f in oldFiles:
327
327
  #Download links are not persistent. Be warned
328
- downLink = f['_links']['stash:file-download']['href']
328
+ try:
329
+ downLink = f['_links']['stash:file-download']['href']
330
+ except KeyError:
331
+ downLink = f['_links']['stash:download']['href']
329
332
  downLink = f'{constants.DRYURL}{downLink}'
330
333
  name = f['path']
331
334
  mimeType = f['mimeType']
@@ -27,7 +27,7 @@ import dryad2dataverse.serializer
27
27
  import dryad2dataverse.transfer
28
28
  from dryad2dataverse.handlers import SSLSMTPHandler
29
29
 
30
- VERSION = (0, 5, 3)
30
+ VERSION = (0, 5, 4)
31
31
  __version__ = '.'.join([str(x) for x in VERSION])
32
32
 
33
33
  DRY = 'https://datadryad.org/api/v2'
@@ -485,6 +485,9 @@ def checkwarn(val:int, **kwargs) -> None:
485
485
  {'warn_too_many': bool}
486
486
 
487
487
  '''
488
+ print(kwargs)
489
+ #print(vars(kwargs))
490
+ return
488
491
  if not kwargs.get('warn_too_many'):
489
492
  return
490
493
  if val >= kwargs.get('warn',0):
@@ -495,7 +498,7 @@ def checkwarn(val:int, **kwargs) -> None:
495
498
  for logme in kwargs.get('loggers'):
496
499
  logme.warning(mess)
497
500
  notify(msgtxt=(subject, mess),
498
- **vars(kwargs))
501
+ **kwargs)
499
502
  sys.exit()
500
503
 
501
504
  def verbo(verbosity:bool, **kwargs)->None:
@@ -671,5 +674,23 @@ def main(log='/var/log/dryadd.log', level=logging.WARNING):
671
674
  print(f'Error: {err}. Exiting. For details see log at {args.log}.')
672
675
  sys.exit()
673
676
 
677
+ def main2(log='/var/log/dryadd.log', level=logging.WARNING):
678
+ '''
679
+ Main Dryad transfer daemon
680
+
681
+ log : str
682
+ path to logfile
683
+ level : int
684
+ log level, usually one of logging.LOGLEVEL (ie, logging.warning)
685
+ '''
686
+ #pylint: disable=too-many-branches
687
+ #pylint: disable=too-many-statements
688
+ #pylint: disable=too-many-locals
689
+ parser = argp()
690
+ args = parser.parse_args()
691
+ print(args)
692
+ checkwarn(val=26,
693
+ loggers=[],
694
+ **vars(args))
674
695
  if __name__ == '__main__':
675
- main()
696
+ main2()
@@ -11,11 +11,13 @@ import requests
11
11
  from requests.adapters import HTTPAdapter
12
12
 
13
13
  from dryad2dataverse import constants
14
+ from dryad2dataverse import USERAGENT
14
15
 
15
16
  LOGGER = logging.getLogger(__name__)
16
17
  #Connection monitoring as per
17
18
  #https://stackoverflow.com/questions/16337511/log-all-requests-from-the-python-requests-module
18
19
  URL_LOGGER = logging.getLogger('urllib3')
20
+ USER_AGENT = {'User-agent': USERAGENT}
19
21
 
20
22
  class Serializer():
21
23
  '''
@@ -72,6 +74,7 @@ class Serializer():
72
74
  try:
73
75
  headers = {'accept':'application/json',
74
76
  'Content-Type':'application/json'}
77
+ headers.update(USER_AGENT)
75
78
  doiClean = urllib.parse.quote(self.doi, safe='')
76
79
  resp = self.session.get(f'{url}/api/v2/datasets/{doiClean}',
77
80
  headers=headers, timeout=timeout)
@@ -164,6 +167,7 @@ class Serializer():
164
167
  self._fileJson = []
165
168
  headers = {'accept':'application/json',
166
169
  'Content-Type':'application/json'}
170
+ headers.update(USER_AGENT)
167
171
  fileList = self.session.get(f'{constants.DRYURL}/api/v2/versions/{self.id}/files',
168
172
  headers=headers,
169
173
  timeout=timeout)
@@ -202,7 +206,11 @@ class Serializer():
202
206
  files = page['_embedded'].get('stash:files')
203
207
  if files:
204
208
  for f in files:
205
- downLink = f['_links']['stash:file-download']['href']
209
+ #This broke with this commit:
210
+ # https://github.com/datadryad/dryad-app/commit/b8a333ba34b14e55cbc1d7ed5aa4451e0f41db66
211
+
212
+ #downLink = f['_links']['stash:file-download']['href']
213
+ downLink = f['_links']['stash:download']['href']
206
214
  downLink = f'{constants.DRYURL}{downLink}'
207
215
  name = f['path']
208
216
  mimeType = f['mimeType']
@@ -2,6 +2,7 @@
2
2
  This module handles data downloads and uploads from a Dryad instance to a Dataverse instance
3
3
  '''
4
4
 
5
+ #TODO harmonize headers instead of hideous copypasta
5
6
  import hashlib
6
7
  import io
7
8
  import json
@@ -9,17 +10,30 @@ import logging
9
10
  import os
10
11
  import time
11
12
  import traceback
13
+ import zlib #crc32, adler32
12
14
 
15
+ import Crypto.Hash.MD2 #md2
13
16
  import requests
14
17
  from requests.adapters import HTTPAdapter
15
18
  from requests_toolbelt.multipart.encoder import MultipartEncoder
16
19
 
17
20
  from dryad2dataverse import constants
18
21
  from dryad2dataverse import exceptions
22
+ from dryad2dataverse import USERAGENT
19
23
 
24
+ USER_AGENT = {'User-agent': USERAGENT}
20
25
  LOGGER = logging.getLogger(__name__)
21
26
  URL_LOGGER = logging.getLogger('urllib3')
22
27
 
28
+ HASHTABLE = {'adler-32' : zlib.adler32, #zlib?
29
+ 'crc-32' : zlib.crc32, #zlib
30
+ 'md2' : Crypto.Hash.MD2, #insecure
31
+ 'md5' : hashlib.md5,
32
+ 'sha-1' : hashlib.sha1,
33
+ 'sha-256' : hashlib.sha256,
34
+ 'sha-384' : hashlib.sha384,
35
+ 'sha-512': hashlib.sha512}
36
+
23
37
  class Transfer():
24
38
  '''
25
39
  Transfers metadata and data files from a
@@ -74,6 +88,7 @@ class Transfer():
74
88
  if not url:
75
89
  url = constants.DVURL
76
90
  headers = {'X-Dataverse-key': apikey if apikey else constants.APIKEY}
91
+ headers.update(USER_AGENT)
77
92
  bad_test = self.session.get(f'{url}/api/datasets/:persistentId',
78
93
  headers=headers,
79
94
  params=params)
@@ -221,6 +236,7 @@ class Transfer():
221
236
  else:
222
237
  headers = {'X-Dataverse-key' : constants.APIKEY}
223
238
 
239
+ headers.update(USER_AGENT)
224
240
  params = {'persistentId': hdl}
225
241
  set_date = self.session.put(f'{url}/api/datasets/:persistentId/citationdate',
226
242
  headers=headers,
@@ -273,7 +289,7 @@ class Transfer():
273
289
  if not apikey:
274
290
  apikey = constants.APIKEY
275
291
  headers = {'X-Dataverse-key' : apikey}
276
-
292
+ headers.update(USER_AGENT)
277
293
  targetDv = kwargs.get('targetDv')
278
294
  dvpid = kwargs.get('dvpid')
279
295
  #dryFid = kwargs.get('dryFid') #Why did I put this here?
@@ -342,34 +358,60 @@ class Transfer():
342
358
  return self.dvpid
343
359
 
344
360
  @staticmethod
345
- def _check_md5(infile):
361
+ def _check_md5(infile, dig_type):
346
362
  '''
347
- Returns the md5 checksum of a file.
363
+ Returns the hex digest of a file (formerly just md5sum).
348
364
 
349
365
  ----------------------------------------
350
366
  Parameters:
351
367
 
352
368
  infile : str
353
369
  — Complete path to target file.
370
+
371
+ dig_type : str or None
372
+ — Digest type
354
373
  ----------------------------------------
355
374
  '''
375
+ #From Ryan Scherle
376
+ #When Dryad calculates a digest, it only uses MD5.
377
+ #But if you have precomputed some other type of digest, we should accept it.
378
+ #The list of allowed values is:
379
+ #('adler-32','crc-32','md2','md5','sha-1','sha-256','sha-384','sha-512')
380
+ #hashlib doesn't support adler-32, crc-32, md2
381
+
356
382
  blocksize = 2**16
383
+ #Well, this is inelegant
357
384
  with open(infile, 'rb') as m:
358
- fmd5 = hashlib.md5()
359
- fblock = m.read(blocksize)
360
- while fblock:
361
- fmd5.update(fblock)
385
+ #fmd5 = hashlib.md5()
386
+ ## var name kept for posterity. Maybe refactor
387
+ if dig_type in ['sha-1', 'sha-256', 'sha-384', 'sha-512', 'md5', 'md2']:
388
+ if dig_type == 'md2':
389
+ fmd5 = Crypto.Hash.MD2.new()
390
+ else:
391
+ fmd5 = HASHTABLE[dig_type]()
392
+ fblock = m.read(blocksize)
393
+ while fblock:
394
+ fmd5.update(fblock)
395
+ fblock = m.read(blocksize)
396
+ return fmd5.hexdigest()
397
+ if dig_type in ['adler-32', 'crc-32']:
362
398
  fblock = m.read(blocksize)
363
- return fmd5.hexdigest()
399
+ curvalue = HASHTABLE[dig_type](fblock)
400
+ while fblock:
401
+ fblock = m.read(blocksize)
402
+ curvalue = HASHTABLE[dig_type](fblock, curvalue)
403
+ return curvalue
404
+ raise exceptions.HashError(f'Unable to determine hash type for{infile}: {dig_type}')
405
+
364
406
 
365
- def download_file(self, url, filename, tmp=None,
366
- size=None, chk=None, timeout=45):
407
+ def download_file(self, url=None, filename=None, tmp=None,
408
+ size=None, chk=None, timeout=45, **kwargs):
367
409
  '''
368
410
  Downloads a file via requests streaming and saves to constants.TMP.
369
- returns md5sum on success and an exception on failure.
411
+ returns checksum on success and an exception on failure.
370
412
 
371
413
  ----------------------------------------
372
- Parameters:
414
+ Required keyword arguments:
373
415
 
374
416
  url : str
375
417
  — URL of download.
@@ -388,8 +430,11 @@ class Transfer():
388
430
  — Reported file size in bytes.
389
431
  Defaults to dryad2dataverse.constants.MAX_UPLOAD.
390
432
 
433
+ digest_type: str
434
+ — checksum type (ie, md5, sha-256, etc)
435
+
391
436
  chk : str
392
- - md5 sum of file (if available and known).
437
+ — checksum of file (if available and known).
393
438
  ----------------------------------------
394
439
  '''
395
440
  LOGGER.debug('Start download sequence')
@@ -402,6 +447,7 @@ class Transfer():
402
447
 
403
448
  if size:
404
449
  if size > constants.MAX_UPLOAD:
450
+ #TOO BIG
405
451
  LOGGER.warning('%s: File %s exceeds '
406
452
  'Dataverse MAX_UPLOAD size. Skipping download.',
407
453
  self.doi, filename)
@@ -430,11 +476,13 @@ class Transfer():
430
476
  LOGGER.exception(e)
431
477
  raise
432
478
  #now check the md5
433
- md5 = Transfer._check_md5(f'{tmp}{os.sep}{filename}')
434
- if chk:
479
+ md5 = None
480
+ if chk and kwargs.get('digest_type') in HASHTABLE:
481
+ md5 = Transfer._check_md5(f'{tmp}{os.sep}{filename}',
482
+ kwargs['digest_type'])
435
483
  if md5 != chk:
436
484
  try:
437
- raise exceptions.HashError('Hex digest mismatch: {md5} : {chk}')
485
+ raise exceptions.HashError(f'Hex digest mismatch: {md5} : {chk}')
438
486
  #is this really what I want to do on a bad checksum?
439
487
  except exceptions.HashError as e:
440
488
  LOGGER.exception(e)
@@ -443,6 +491,7 @@ class Transfer():
443
491
  if url == i[0]:
444
492
  i[-1] = md5
445
493
  LOGGER.debug('Complete download sequence')
494
+ #This doesn't actually return an md5, just the hash value
446
495
  return md5
447
496
  except (requests.exceptions.HTTPError,
448
497
  requests.exceptions.ConnectionError) as err:
@@ -474,7 +523,13 @@ class Transfer():
474
523
  files = self.files
475
524
  try:
476
525
  for f in files:
477
- self.download_file(f[0], f[1], size=f[3], chk=f[-1])
526
+ self.download_file(url=f[0],
527
+ filename=f[1],
528
+ mimetype=f[2],
529
+ size=f[3],
530
+ descr=f[4],
531
+ digest_type=f[5],
532
+ chk=f[-1])
478
533
  except exceptions.DataverseDownloadError as e:
479
534
  LOGGER.exception('Unable to download file with info %s\n%s', f, e)
480
535
  raise
@@ -512,6 +567,8 @@ class Transfer():
512
567
  headers = {'X-Dataverse-key': apikey}
513
568
  else:
514
569
  headers = self.auth
570
+
571
+ headers.update(USER_AGENT)
515
572
  params = {'persistentId': study}
516
573
  try:
517
574
  lock_status = self.session.get(f'{dv_url}/api/datasets/:persistentId/locks',
@@ -560,6 +617,8 @@ class Transfer():
560
617
  headers = {'X-Dataverse-key': apikey}
561
618
  else:
562
619
  headers = self.auth
620
+
621
+ headers.update(USER_AGENT)
563
622
  params = {'persistentId': study}
564
623
  lock_status = self.session.get(f'{dv_url}/api/datasets/:persistentId/locks',
565
624
  headers=headers,
@@ -593,7 +652,9 @@ class Transfer():
593
652
 
594
653
  def upload_file(self, dryadUrl=None, filename=None,
595
654
  mimetype=None, size=None, descr=None,
596
- md5=None, studyId=None, dest=None,
655
+ hashtype=None,
656
+ #md5=None, studyId=None, dest=None,
657
+ digest=None, studyId=None, dest=None,
597
658
  fprefix=None, force_unlock=False, timeout=300):
598
659
  '''
599
660
  Uploads file to Dataverse study. Returns a tuple of the
@@ -620,8 +681,11 @@ class Transfer():
620
681
  dest : str
621
682
  — Destination dataverse installation url.
622
683
  Defaults to constants.DVURL.
684
+ hashtype: str
685
+ original Dryad hash type
623
686
 
624
- md5 : str
687
+ #md5 : str
688
+ digest
625
689
  — md5 checksum for file.
626
690
 
627
691
  fprefix : str
@@ -644,6 +708,8 @@ class Transfer():
644
708
 
645
709
  ----------------------------------------
646
710
  '''
711
+ #return locals()
712
+ #TODONE remove above
647
713
  if not studyId:
648
714
  studyId = self.dvpid
649
715
  if not dest:
@@ -682,6 +748,7 @@ class Transfer():
682
748
  ctype = {'Content-type' : multi.content_type}
683
749
  tmphead = self.auth.copy()
684
750
  tmphead.update(ctype)
751
+ tmphead.update(USER_AGENT)
685
752
  url = dest + '/api/datasets/:persistentId/add'
686
753
  try:
687
754
  upload = self.session.post(url, params=params,
@@ -691,9 +758,22 @@ class Transfer():
691
758
  upload.raise_for_status()
692
759
  self.fileUpRecord.append((fid, upload.json()))
693
760
  upmd5 = upload.json()['data']['files'][0]['dataFile']['checksum']['value']
694
- if md5 and upmd5 != md5:
761
+ #Dataverse hash type
762
+ _type = upload.json()['data']['files'][0]['dataFile']['checksum']['type']
763
+ if _type.lower() != hashtype.lower():
764
+ comparator = self._check_md5(upfile, _type.lower())
765
+ else:
766
+ comparator = digest
767
+ #if hashtype.lower () != 'md5':
768
+ # #get an md5 because dataverse uses md5s. Or most of them do anyway.
769
+ # #One day this will be rewritten properly.
770
+ # md5 = self._check_md5(filename, 'md5')
771
+ #else:
772
+ # md5 = digest
773
+ #if md5 and (upmd5 != md5):
774
+ if upmd5 != comparator:
695
775
  try:
696
- raise exceptions.HashError(f'md5sum mismatch:\nlocal: {md5}\nuploaded: {upmd5}')
776
+ raise exceptions.HashError(f'{_type} mismatch:\nlocal: {comparator}\nuploaded: {upmd5}')
697
777
  except exceptions.HashError as e:
698
778
  LOGGER.exception(e)
699
779
  raise
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dryad2dataverse
3
- Version: 0.6.2
3
+ Version: 0.7.4
4
4
  Summary: Utility for copying and syncing data from a Dryad data repository to a Dataverse repository
5
5
  Author-email: Paul Lesack <paul.lesack@ubc.ca>
6
6
  Project-URL: Homepage, https://ubc-library-rc.github.io/dryad2dataverse
@@ -20,7 +20,8 @@ Description-Content-Type: text/markdown
20
20
  Requires-Dist: certifi >=2022.12.7
21
21
  Requires-Dist: charset-normalizer >=2.0.4
22
22
  Requires-Dist: chardet >=3.0.4
23
- Requires-Dist: idna >=2.10
23
+ Requires-Dist: idna >=2.10.0
24
+ Requires-Dist: pycryptodome >=3.20.0
24
25
  Requires-Dist: requests >=2.26.0
25
26
  Requires-Dist: requests-toolbelt >=0.9.1
26
27
  Requires-Dist: urllib3 >=1.26.6
@@ -0,0 +1,13 @@
1
+ dryad2dataverse/__init__.py,sha256=13WoiArwE8vvmWq4vGGDVH8BHhn0QEZrPVFE8boCFd4,865
2
+ dryad2dataverse/constants.py,sha256=ZfD2N0f742nnP8NPUV0QsDdVVAbrW-3Py8Lg9al1Z5c,1429
3
+ dryad2dataverse/exceptions.py,sha256=oIP1_fSEvLF3HpK6gOYb05vUisY-IAxwXZDeNoAvCPM,1008
4
+ dryad2dataverse/handlers.py,sha256=Xb0vvs1HE92qaK6g-Gu3eyHkLrSwU0-RQjLcl6FZPUY,1487
5
+ dryad2dataverse/monitor.py,sha256=KOyWCpPTZLYRStB-RN0e5kgHTfbxHsByD72K1VtEPP8,26406
6
+ dryad2dataverse/serializer.py,sha256=DoIjHYKtoH047X5Gd-WUdoLpL-kvTtSAPg-lUElCx8c,33865
7
+ dryad2dataverse/transfer.py,sha256=83tju_o4DSgSkF7JDLsgTpAwm03b0CMb0OjcKAEACuY,37548
8
+ dryad2dataverse/scripts/dryadd.py,sha256=rv8waNgJ7sdVF-nfiPCNa1_5p568CfSvBWUWwYYRP_A,26836
9
+ dryad2dataverse-0.7.4.dist-info/METADATA,sha256=Er-lR8tTWTOKGC0QhT7sKWX9F894DaJXL46kfjADBy0,3299
10
+ dryad2dataverse-0.7.4.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
11
+ dryad2dataverse-0.7.4.dist-info/entry_points.txt,sha256=9kBsBa5SivAtfAox__vZGL7H-HI7Vd-jGztCh_eIJEc,63
12
+ dryad2dataverse-0.7.4.dist-info/top_level.txt,sha256=0X45AghpKfL69Oc51sRddeiHtq8o-OyOhFX3AMal6YI,16
13
+ dryad2dataverse-0.7.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.2)
2
+ Generator: setuptools (74.1.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,13 +0,0 @@
1
- dryad2dataverse/__init__.py,sha256=j0GU7htgnQj_RNsQmz3ni5uBcUgipdqhwsvpakMaWFM,712
2
- dryad2dataverse/constants.py,sha256=ZfD2N0f742nnP8NPUV0QsDdVVAbrW-3Py8Lg9al1Z5c,1429
3
- dryad2dataverse/exceptions.py,sha256=oIP1_fSEvLF3HpK6gOYb05vUisY-IAxwXZDeNoAvCPM,1008
4
- dryad2dataverse/handlers.py,sha256=Xb0vvs1HE92qaK6g-Gu3eyHkLrSwU0-RQjLcl6FZPUY,1487
5
- dryad2dataverse/monitor.py,sha256=qj-jXl3_SJBPof1qKOiSHQCvc083PHH2afkl-y_qQAU,26267
6
- dryad2dataverse/serializer.py,sha256=fM5owzRfdZb799fTg45n0iqzBWZSUHpTp2pXnuKg_z0,33476
7
- dryad2dataverse/transfer.py,sha256=tfuSSfOsXTCMEJ_K65J0hyOI9O_5GFUHEpPyNQGXVbs,34125
8
- dryad2dataverse/scripts/dryadd.py,sha256=chQVEAYWTHvKa5QZH0PIj1EgBlJO4qd1Xw2vkf1c_i8,26291
9
- dryad2dataverse-0.6.2.dist-info/METADATA,sha256=t7WcbZ40n3nZOHrQYTGCl32Fi2tCuLEhtKus_PKYMTg,3260
10
- dryad2dataverse-0.6.2.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
11
- dryad2dataverse-0.6.2.dist-info/entry_points.txt,sha256=9kBsBa5SivAtfAox__vZGL7H-HI7Vd-jGztCh_eIJEc,63
12
- dryad2dataverse-0.6.2.dist-info/top_level.txt,sha256=0X45AghpKfL69Oc51sRddeiHtq8o-OyOhFX3AMal6YI,16
13
- dryad2dataverse-0.6.2.dist-info/RECORD,,