dryad2dataverse 0.7.11a0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dryad2dataverse/__init__.py +14 -12
- dryad2dataverse/auth.py +94 -0
- dryad2dataverse/config.py +180 -0
- dryad2dataverse/data/dryad2dataverse_config.yml +127 -0
- dryad2dataverse/handlers.py +6 -2
- dryad2dataverse/monitor.py +146 -140
- dryad2dataverse/scripts/dryadd.py +224 -291
- dryad2dataverse/serializer.py +129 -140
- dryad2dataverse/transfer.py +296 -396
- {dryad2dataverse-0.7.11a0.dist-info → dryad2dataverse-0.8.1.dist-info}/METADATA +4 -3
- dryad2dataverse-0.8.1.dist-info/RECORD +14 -0
- dryad2dataverse/constants.py +0 -45
- dryad2dataverse-0.7.11a0.dist-info/RECORD +0 -12
- {dryad2dataverse-0.7.11a0.dist-info → dryad2dataverse-0.8.1.dist-info}/WHEEL +0 -0
- {dryad2dataverse-0.7.11a0.dist-info → dryad2dataverse-0.8.1.dist-info}/entry_points.txt +0 -0
dryad2dataverse/monitor.py
CHANGED
|
@@ -8,14 +8,14 @@ The monitor's primary function is to allow for state checking
|
|
|
8
8
|
for Dryad studies so that files and studies aren't downloaded
|
|
9
9
|
unnecessarily.
|
|
10
10
|
'''
|
|
11
|
-
|
|
11
|
+
#pylint: disable=invalid-name
|
|
12
12
|
import copy
|
|
13
|
-
import
|
|
13
|
+
import datetime
|
|
14
14
|
import json
|
|
15
|
+
import logging
|
|
16
|
+
import pathlib
|
|
15
17
|
import sqlite3
|
|
16
|
-
import datetime
|
|
17
18
|
|
|
18
|
-
from dryad2dataverse import constants
|
|
19
19
|
from dryad2dataverse import exceptions
|
|
20
20
|
|
|
21
21
|
LOGGER = logging.getLogger(__name__)
|
|
@@ -26,82 +26,88 @@ class Monitor():
|
|
|
26
26
|
Dryad files can be monitored and updated over time. Monitor is a singleton,
|
|
27
27
|
but is not thread-safe.
|
|
28
28
|
'''
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def __new__(cls, dbase=None, *args, **kwargs):
|
|
29
|
+
def __new__(cls, *args, **kwargs):
|
|
32
30
|
'''
|
|
33
31
|
Creates a new singleton instance of Monitor.
|
|
34
32
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
33
|
+
Parameters
|
|
34
|
+
----------
|
|
35
|
+
*args
|
|
36
|
+
**kwargs
|
|
37
|
+
'''
|
|
38
|
+
if not hasattr(cls, 'inst'):
|
|
39
|
+
cls.inst = super().__new__(cls)
|
|
40
|
+
#This ensures only the first set of kwargs (on instantiation)
|
|
41
|
+
#are used.
|
|
42
|
+
cls.init = 0
|
|
43
|
+
cls.kwargs = kwargs
|
|
44
|
+
if not cls.kwargs.get('dbase'):
|
|
45
|
+
try:
|
|
46
|
+
cls.kwargs['dbase'] = args[0]
|
|
47
|
+
except ValueError as e:
|
|
48
|
+
raise KeyError from e
|
|
49
|
+
cls.conn = sqlite3.connect(pathlib.Path(cls.kwargs['dbase']).expanduser().absolute())
|
|
50
|
+
cls.cursor = cls.conn.cursor()
|
|
51
|
+
LOGGER.info('Open database %s', cls.kwargs['dbase'])
|
|
52
|
+
return cls.inst
|
|
39
53
|
|
|
54
|
+
def __init__(self, *args, **kwargs):
|
|
55
|
+
'''
|
|
56
|
+
Initialize singleton instance of Monitor
|
|
57
|
+
|
|
58
|
+
Parameters
|
|
59
|
+
----------
|
|
60
|
+
*args
|
|
61
|
+
Positional arguments. Only the first is used
|
|
62
|
+
**kwargs
|
|
63
|
+
Keyword arguments. Only dbase is used, and it overwrites args[0] if present
|
|
64
|
+
|
|
65
|
+
Notes
|
|
66
|
+
-----
|
|
67
|
+
Normally you would just pass a dryad2dataverse.config.Config object,
|
|
68
|
+
ie. Monitor(**config)
|
|
69
|
+
|
|
70
|
+
These keyword parameters are required at a minimum, and are included as part of a
|
|
71
|
+
Config instance.
|
|
40
72
|
dbase : str
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
73
|
+
Path to dryad2dataverse monitor database
|
|
74
|
+
dry_url : str
|
|
75
|
+
Dryad base URL
|
|
44
76
|
'''
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
cls.cursor = cls.conn.cursor()
|
|
77
|
+
#pylint: disable=unused-argument
|
|
78
|
+
#arguments are parsed in __new__ to make a singleton
|
|
79
|
+
#but they need to be passed in __init__
|
|
80
|
+
if not self.init:
|
|
81
|
+
|
|
82
|
+
conn = sqlite3.connect(pathlib.Path(self.kwargs['dbase']).expanduser().absolute())
|
|
83
|
+
cursor = conn.cursor()
|
|
53
84
|
create = ['CREATE TABLE IF NOT EXISTS dryadStudy \
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
85
|
+
(uid INTEGER PRIMARY KEY AUTOINCREMENT, \
|
|
86
|
+
doi TEXT, lastmoddate TEXT, dryadjson TEXT, \
|
|
87
|
+
dvjson TEXT);',
|
|
88
|
+
'CREATE TABLE IF NOT EXISTS dryadFiles \
|
|
89
|
+
(dryaduid INTEGER REFERENCES dryadStudy (uid), \
|
|
90
|
+
dryfilesjson TEXT);',
|
|
91
|
+
'CREATE TABLE IF NOT EXISTS dvStudy \
|
|
92
|
+
(dryaduid INTEGER references dryadStudy (uid), \
|
|
93
|
+
dvpid TEXT);',
|
|
94
|
+
'CREATE TABLE IF NOT EXISTS dvFiles \
|
|
95
|
+
(dryaduid INTEGER references dryadStudy (uid), \
|
|
96
|
+
dryfid INT, \
|
|
97
|
+
drymd5 TEXT, dvfid TEXT, dvmd5 TEXT, \
|
|
98
|
+
dvfilejson TEXT);',
|
|
99
|
+
'CREATE TABLE IF NOT EXISTS lastcheck \
|
|
100
|
+
(checkdate TEXT);',
|
|
101
|
+
'CREATE TABLE IF NOT EXISTS failed_uploads \
|
|
102
|
+
(dryaduid INTEGER references dryadstudy (uid), \
|
|
103
|
+
dryfid INT, status TEXT);'
|
|
73
104
|
]
|
|
74
105
|
|
|
75
106
|
for line in create:
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
return cls.__instance
|
|
81
|
-
|
|
82
|
-
def __init__(self, dbase=None, *args, **kwargs):
|
|
83
|
-
# remove args and kwargs when you find out how init interacts with new.
|
|
84
|
-
'''
|
|
85
|
-
Initialize the Monitor instance if not instantiated already (ie, Monitor
|
|
86
|
-
is a singleton).
|
|
87
|
-
|
|
88
|
-
----------------------------------------
|
|
89
|
-
Parameters:
|
|
90
|
-
|
|
91
|
-
dbase : str
|
|
92
|
-
— Complete path to desired location of tracking database
|
|
93
|
-
(eg: /tmp/test.db).
|
|
94
|
-
|
|
95
|
-
Defaults to dryad2dataverse.constants.DBASE.
|
|
96
|
-
----------------------------------------
|
|
97
|
-
'''
|
|
98
|
-
if self.__initialized:
|
|
99
|
-
return
|
|
100
|
-
self.__initialized = True
|
|
101
|
-
if not dbase:
|
|
102
|
-
self.dbase = constants.DBASE
|
|
103
|
-
else:
|
|
104
|
-
self.dbase = dbase
|
|
107
|
+
cursor.execute(line)
|
|
108
|
+
conn.commit()
|
|
109
|
+
conn.close()
|
|
110
|
+
self.init = 1
|
|
105
111
|
|
|
106
112
|
def __del__(self):
|
|
107
113
|
'''
|
|
@@ -121,31 +127,40 @@ class Monitor():
|
|
|
121
127
|
return last_mod[0][0]
|
|
122
128
|
return None
|
|
123
129
|
|
|
124
|
-
def status(self, serial):
|
|
130
|
+
def status(self, serial)->dict:
|
|
125
131
|
'''
|
|
126
132
|
Returns a dictionary with keys 'status' and 'dvpid' and 'notes'.
|
|
133
|
+
|
|
134
|
+
Parameters
|
|
135
|
+
----------
|
|
136
|
+
serial : dryad2dataverse.serializer.Serializer
|
|
137
|
+
|
|
138
|
+
Returns
|
|
139
|
+
-------
|
|
127
140
|
`{status :'updated', 'dvpid':'doi://some/ident'}`.
|
|
128
141
|
|
|
142
|
+
Notes
|
|
143
|
+
------
|
|
129
144
|
`status` is one of 'new', 'identical', 'lastmodsame',
|
|
130
145
|
'updated'
|
|
131
146
|
|
|
132
|
-
|
|
147
|
+
'new' is a completely new file.
|
|
133
148
|
|
|
134
|
-
|
|
135
|
-
|
|
149
|
+
'identical' The metadata from Dryad is *identical* to the last time
|
|
150
|
+
the check was run.
|
|
136
151
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
152
|
+
'lastmodsame' Dryad lastModificationDate == last modification date
|
|
153
|
+
in database AND output JSON is different.
|
|
154
|
+
This can indicate a Dryad
|
|
155
|
+
API output change, reindexing or something else.
|
|
156
|
+
But the lastModificationDate
|
|
157
|
+
is supposed to be an indicator of meaningful change, so this option
|
|
158
|
+
exists so you can decide what to do given this option
|
|
144
159
|
|
|
145
|
-
|
|
160
|
+
'updated' Indicates changes to lastModificationDate
|
|
146
161
|
|
|
147
|
-
|
|
148
|
-
|
|
162
|
+
Note that Dryad constantly changes their API output, so the changes
|
|
163
|
+
may not actually be meaningful.
|
|
149
164
|
|
|
150
165
|
`dvpid` is a Dataverse persistent identifier.
|
|
151
166
|
`None` in the case of status='new'
|
|
@@ -155,12 +170,6 @@ class Monitor():
|
|
|
155
170
|
not `new` or `identical`. Note that Dryad has no way to indicate *both*
|
|
156
171
|
a file and metadata change, so this value reflects only the *last* change
|
|
157
172
|
in the Dryad state.
|
|
158
|
-
|
|
159
|
-
----------------------------------------
|
|
160
|
-
Parameters:
|
|
161
|
-
|
|
162
|
-
serial : dryad2dataverse.serializer instance
|
|
163
|
-
----------------------------------------
|
|
164
173
|
'''
|
|
165
174
|
# Last mod date is indicator of change.
|
|
166
175
|
# From email w/Ryan Scherle 10 Nov 2020
|
|
@@ -199,13 +208,11 @@ class Monitor():
|
|
|
199
208
|
dryaduid = ?', (dryaduid,))
|
|
200
209
|
dvpid = self.cursor.fetchall()[-1][0]
|
|
201
210
|
serial.dvpid = dvpid
|
|
202
|
-
except TypeError:
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
LOGGER.exception(e)
|
|
208
|
-
raise
|
|
211
|
+
except TypeError as exc:
|
|
212
|
+
LOGGER.error('Dryad DOI : %s. Error finding Dataverse PID', doi)
|
|
213
|
+
LOGGER.exception(exc)
|
|
214
|
+
raise exceptions.DatabaseError from exc
|
|
215
|
+
|
|
209
216
|
newfile = copy.deepcopy(serial.dryadJson)
|
|
210
217
|
testfile = copy.deepcopy(json.loads(result[-1][3]))
|
|
211
218
|
if newfile == testfile:
|
|
@@ -220,23 +227,25 @@ class Monitor():
|
|
|
220
227
|
'''
|
|
221
228
|
Analyzes differences in metadata between current serializer
|
|
222
229
|
instance and last updated serializer instance.
|
|
223
|
-
Returns a list of field changes consisting of:
|
|
224
230
|
|
|
231
|
+
Parameters
|
|
232
|
+
----------
|
|
233
|
+
serial : dryad2dataverse.serializer.Serializer
|
|
234
|
+
|
|
235
|
+
Returns
|
|
236
|
+
-------
|
|
237
|
+
Returns a list of field changes consisting of:
|
|
225
238
|
[{key: (old_value, new_value)}] or None if no changes.
|
|
226
239
|
|
|
240
|
+
Notes
|
|
241
|
+
-----
|
|
227
242
|
For example:
|
|
228
|
-
|
|
229
243
|
```
|
|
230
244
|
[{'title':
|
|
231
245
|
('Cascading effects of algal warming in a freshwater community',
|
|
232
246
|
'Cascading effects of algal warming in a freshwater community theatre')}
|
|
233
247
|
]
|
|
234
248
|
```
|
|
235
|
-
----------------------------------------
|
|
236
|
-
Parameters:
|
|
237
|
-
|
|
238
|
-
serial : dryad2dataverse.serializer.Serializer instance
|
|
239
|
-
----------------------------------------
|
|
240
249
|
'''
|
|
241
250
|
if self.status(serial)['status'] == 'updated':
|
|
242
251
|
self.cursor.execute('SELECT dryadjson from dryadStudy \
|
|
@@ -261,10 +270,12 @@ class Monitor():
|
|
|
261
270
|
Assumes name, mimeType, size, descr all unchanged, which is not
|
|
262
271
|
necessarily a valid assumption
|
|
263
272
|
|
|
264
|
-
|
|
273
|
+
Parameters
|
|
274
|
+
----------
|
|
275
|
+
oldFiles : Union[list, tuple]
|
|
265
276
|
(name, mimeType, size, descr, digestType, digest)
|
|
266
277
|
|
|
267
|
-
newFiles: list
|
|
278
|
+
newFiles : Union[list, tuple]
|
|
268
279
|
(name, mimeType, size, descr, digestType, digest)
|
|
269
280
|
'''
|
|
270
281
|
hash_change = []
|
|
@@ -294,12 +305,12 @@ class Monitor():
|
|
|
294
305
|
`{'add':[dryadfiletuples], 'delete':[dryadfiletuples],
|
|
295
306
|
'hash_change': [dryadfiletuples]}`
|
|
296
307
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
serial : dryad2dataverse.serializer.Serializer instance
|
|
301
|
-
----------------------------------------
|
|
308
|
+
Parameters
|
|
309
|
+
----------
|
|
310
|
+
serial : dryad2dataverse.serializer.Serializer
|
|
302
311
|
'''
|
|
312
|
+
#pylint: disable=too-many-locals
|
|
313
|
+
|
|
303
314
|
diffReport = {}
|
|
304
315
|
if self.status(serial)['status'] == 'new':
|
|
305
316
|
#do we want to show what needs to be added?
|
|
@@ -329,7 +340,7 @@ class Monitor():
|
|
|
329
340
|
downLink = f['_links']['stash:file-download']['href']
|
|
330
341
|
except KeyError:
|
|
331
342
|
downLink = f['_links']['stash:download']['href']
|
|
332
|
-
downLink = f'{
|
|
343
|
+
downLink = f'{self.kwargs.get("dry_url", "https://datadryad.org")}{downLink}'
|
|
333
344
|
name = f['path']
|
|
334
345
|
mimeType = f['mimeType']
|
|
335
346
|
size = f['size']
|
|
@@ -379,13 +390,11 @@ class Monitor():
|
|
|
379
390
|
file download link. Normally used for determining dataverse
|
|
380
391
|
file ids for *deletion* in case of dryad file changes.
|
|
381
392
|
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
393
|
+
Parameters
|
|
394
|
+
----------
|
|
385
395
|
url : str
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
----------------------------------------
|
|
396
|
+
*Dryad* file URL in form of
|
|
397
|
+
'https://datadryad.org/api/v2/files/385819/download'.
|
|
389
398
|
'''
|
|
390
399
|
fid = url[url.rfind('/', 0, -10)+1:].strip('/download')
|
|
391
400
|
try:
|
|
@@ -413,11 +422,10 @@ class Monitor():
|
|
|
413
422
|
dryad2dataverse.monitor.Monitor.diff_files['delete']
|
|
414
423
|
to discover Dataverse file ids for deletion.
|
|
415
424
|
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
425
|
+
Parameters
|
|
426
|
+
----------
|
|
419
427
|
filelist : list
|
|
420
|
-
|
|
428
|
+
List of Dryad file tuples: eg:
|
|
421
429
|
|
|
422
430
|
```
|
|
423
431
|
[('https://datadryad.org/api/v2/files/385819/download',
|
|
@@ -427,7 +435,6 @@ class Monitor():
|
|
|
427
435
|
'Readme_ACG_Mortality.txt',
|
|
428
436
|
'text/plain', 1350)]
|
|
429
437
|
```
|
|
430
|
-
----------------------------------------
|
|
431
438
|
'''
|
|
432
439
|
fids = []
|
|
433
440
|
for f in filelist:
|
|
@@ -435,18 +442,20 @@ class Monitor():
|
|
|
435
442
|
return fids
|
|
436
443
|
# return [self.get_dv_fid(f[0]) for f in filelist]
|
|
437
444
|
|
|
438
|
-
def get_json_dvfids(self, serial):
|
|
445
|
+
def get_json_dvfids(self, serial)->list:
|
|
439
446
|
'''
|
|
440
447
|
Return a list of Dataverse file ids for Dryad JSONs which were
|
|
441
448
|
uploaded to Dataverse.
|
|
442
449
|
Normally used to discover the file IDs to remove Dryad JSONs
|
|
443
450
|
which have changed.
|
|
444
451
|
|
|
445
|
-
|
|
446
|
-
|
|
452
|
+
Parameters
|
|
453
|
+
----------
|
|
454
|
+
serial : dryad2dataverse.serializer.Serializer
|
|
447
455
|
|
|
448
|
-
|
|
449
|
-
|
|
456
|
+
Returns
|
|
457
|
+
-------
|
|
458
|
+
list
|
|
450
459
|
'''
|
|
451
460
|
self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE doi=?',
|
|
452
461
|
(serial.doi,))
|
|
@@ -471,12 +480,11 @@ class Monitor():
|
|
|
471
480
|
This method should be called after all transfers are completed,
|
|
472
481
|
including Dryad JSON updates, as the last action for transfer.
|
|
473
482
|
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
transfer : dryad2dataverse.transfer.Transfer instance
|
|
478
|
-
----------------------------------------
|
|
483
|
+
Parameters
|
|
484
|
+
----------
|
|
485
|
+
transfer : dryad2dataverse.transfer.Transfer
|
|
479
486
|
'''
|
|
487
|
+
#pylint: disable=too-many-branches, too-many-statements, too-many-locals
|
|
480
488
|
# get the pre-update dryad uid in case we need it.
|
|
481
489
|
self.cursor.execute('SELECT max(uid) FROM dryadStudy WHERE doi = ?',
|
|
482
490
|
(transfer.dryad.dryadJson['identifier'],))
|
|
@@ -612,14 +620,12 @@ class Monitor():
|
|
|
612
620
|
for subsequent checking for updates. To query last modification time,
|
|
613
621
|
use the dryad2dataverse.monitor.Monitor.lastmod attribute.
|
|
614
622
|
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
623
|
+
Parameters
|
|
624
|
+
----------
|
|
618
625
|
curdate : str
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
----------------------------------------
|
|
626
|
+
UTC datetime string in the format suitable for the Dryad API.
|
|
627
|
+
eg. 2021-01-21T21:42:40Z
|
|
628
|
+
or .strftime('%Y-%m-%dT%H:%M:%SZ').
|
|
623
629
|
'''
|
|
624
630
|
#Dryad API uses Zulu time
|
|
625
631
|
if not curdate:
|