dryad2dataverse 0.7.11a0__tar.gz → 0.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,18 +1,17 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dryad2dataverse
3
- Version: 0.7.11a0
3
+ Version: 0.8.1
4
4
  Summary: Utility for copying and syncing data from a Dryad data repository to a Dataverse repository
5
5
  License: MIT
6
6
  Keywords: Harvard Dataverse,Dataverse,research data management,data repository,Dryad,datadryad.org,dataverse.org
7
7
  Author: Paul Lesack
8
8
  Author-email: paul.lesack@ubc.ca
9
- Requires-Python: >=3.9, <4
9
+ Requires-Python: >=3.10, <4
10
10
  Classifier: Development Status :: 4 - Beta
11
11
  Classifier: Environment :: Console
12
12
  Classifier: Intended Audience :: Education
13
13
  Classifier: License :: OSI Approved :: MIT License
14
14
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
17
  Classifier: Programming Language :: Python :: 3.12
@@ -27,9 +26,11 @@ Requires-Dist: charset-normalizer (>=2.0.4)
27
26
  Requires-Dist: idna (>=2.10.0)
28
27
  Requires-Dist: pycryptodome (>=3.20.0)
29
28
  Requires-Dist: pydoc-markdown (>=4.8.2,<5.0.0)
29
+ Requires-Dist: pyyaml (>=6.0.3,<7.0.0)
30
30
  Requires-Dist: requests (>=2.26.0)
31
31
  Requires-Dist: requests-toolbelt (>=0.9.1)
32
32
  Requires-Dist: setuptools (>=80.8.0,<81.0.0)
33
+ Requires-Dist: tomli-w (>=1.2.0,<2.0.0)
33
34
  Requires-Dist: urllib3 (>=2.6.3)
34
35
  Project-URL: Bug Tracker, https://github.com/ubc-library-rc/dryad2dataverse/issues
35
36
  Project-URL: Documentation, https://ubc-library-rc.github.io/dryad2dataverse
@@ -1,13 +1,13 @@
1
1
  [project]
2
2
  name = "dryad2dataverse"
3
- version = "0.7.11a0"
3
+ version = "0.8.1"
4
4
  description = "Utility for copying and syncing data from a Dryad data repository to a Dataverse repository"
5
5
  authors = [
6
6
  {name = "Paul Lesack",email = "paul.lesack@ubc.ca"}
7
7
  ]
8
8
  license = {text = "MIT"}
9
9
  readme = "README.md"
10
- requires-python = ">=3.9, <4"
10
+ requires-python = ">=3.10, <4"
11
11
  dependencies = [
12
12
  "certifi (>=2022.12.7)",
13
13
  "charset-normalizer (>=2.0.4)",
@@ -18,7 +18,9 @@ dependencies = [
18
18
  "requests-toolbelt (>=0.9.1)",
19
19
  "urllib3 (>=2.6.3)",
20
20
  "pydoc-markdown (>=4.8.2,<5.0.0)",
21
- "setuptools (>=80.8.0,<81.0.0)"
21
+ "setuptools (>=80.8.0,<81.0.0)",
22
+ "tomli-w (>=1.2.0,<2.0.0)",
23
+ "pyyaml (>=6.0.3,<7.0.0)",
22
24
  ]
23
25
 
24
26
  keywords =['Harvard Dataverse',
@@ -42,6 +44,8 @@ Repository = 'https://github.com/ubc-library-rc/dryad2dataverse.git'
42
44
 
43
45
 
44
46
  [tool.poetry]
47
+ include = ["dryad2dataverse/data/dryad2dataverse_config.yml"]
48
+
45
49
  classifiers = ['Development Status :: 4 - Beta',
46
50
  'Environment :: Console',
47
51
  'Intended Audience :: Education',
@@ -50,10 +54,15 @@ classifiers = ['Development Status :: 4 - Beta',
50
54
  'Topic :: Internet :: WWW/HTTP :: Site Management',
51
55
  'Topic :: Utilities']
52
56
 
57
+ [tool.poetry.group.dev]
58
+ optional = true
59
+
53
60
  [tool.poetry.group.dev.dependencies]
54
61
  pylint = ">=3.3.6"
55
62
  mkdocs = ">=1.6.1"
56
63
  pydoc-markdown = ">=4.8.2"
64
+ mkdocstrings-python = "^2.0.1"
65
+
57
66
 
58
67
  [build-system]
59
68
  requires = ["poetry-core>=2.0.0,<3.0.0"]
@@ -0,0 +1,32 @@
1
+ '''
2
+ Dryad to Dataverse utilities. No modules are loaded by default, so
3
+
4
+ `>>> import dryad2dataverse`
5
+
6
+ will work, but will have no effect.
7
+
8
+ Modules included:
9
+
10
+ * **dryad2dataverse.config** : Configuration for all modules. URLs, API keys,
11
+ etc are all here.
12
+ Base configurations are read out of a yaml file in ./data
13
+
14
+ * **dryad2dataverse.serializer** : Download and serialize Dryad
15
+ JSON to Dataverse JSON.
16
+
17
+ * **dryad2dataverse.transfer** : metadata and file transfer
18
+ utilities.
19
+
20
+ * **dryad2dataverse.monitor** : Monitoring and database tools
21
+ for maintaining a pipeline to Dataverse without unnecessary
22
+ downloading and file duplication.
23
+
24
+ * **dryad2dataverse.exceptions** : Custom exceptions.
25
+ '''
26
+
27
+ import sys
28
+
29
#Package version as a tuple and as the usual dotted string
VERSION = (0, 8, 1)
__version__ = '.'.join(str(x) for x in VERSION)

#User agent string sent with outgoing HTTP requests.
#sys.version begins with the bare interpreter version followed by
#whitespace and build details (eg: '3.12.1 (main, ...) [GCC ...]'),
#so the first whitespace-delimited token is the version number. This avoids
#searching for "(", which truncates the string if no parenthesis is present.
USERAGENT = (f'dryad2dataverse/v{__version__} ({sys.platform.capitalize()}); '
             f'Python {sys.version.split()[0]}')
@@ -0,0 +1,94 @@
1
+ '''
2
+ Handles authentication and bearer tokens using
3
+ Dryad's application ID and secret
4
+ '''
5
+ import datetime
6
+ import logging
7
+ import requests
8
+ from dryad2dataverse import USERAGENT
9
+
10
+ LOGGER = logging.getLogger(__name__)
11
+
12
class Token:
    '''
    Self updating bearer token generator.

    A token is requested lazily the first time `token` (or `auth_header`)
    is read, and requested again whenever the stored token has expired.
    '''
    def __init__(self, **kwargs):
        '''
        Store the credentials needed to obtain a bearer token. No
        network request is made until a token is actually required.

        Parameters
        ----------
        **kwargs
            Must include required keyword arguments as below
        dry_url : str
            Dryad base url (eg: https://datadryad.org)
        app_id : str
            Dryad application ID
        secret : str
            Application secret

        Other parameters
        ----------------
        timeout : int
            timeout in seconds. Default 100

        Raises
        ------
        KeyError
            If app_id or secret is not supplied
        '''
        self.kwargs = kwargs
        self.path = '/oauth/token'
        self.data = {'client_id': kwargs['app_id'],
                     'client_secret': kwargs['secret'],
                     'grant_type': 'client_credentials'}
        self.headers = {'User-agent': USERAGENT,
                        'charset': 'UTF-8'}
        self.timeout = kwargs.get('timeout', 100)
        #Expiry of the current token as a UTC ISO 8601 string;
        #None until check_token_valid has run against a token
        self.expiry_time = None
        self.__token_info = None

    def get_bearer_token(self):
        '''
        Obtain a brand new bearer token and store the JSON response.

        Raises
        ------
        requests.exceptions.RequestException
            On connection problems or an HTTP error status. The error
            is logged and then re-raised unchanged.
        '''
        try:
            tokenr = requests.post(f"{self.kwargs['dry_url']}{self.path}",
                                   headers=self.headers,
                                   data=self.data,
                                   timeout=self.timeout)
            tokenr.raise_for_status()
            self.__token_info = tokenr.json()

        #HTTPError is a subclass of RequestException, so one clause covers both
        except requests.exceptions.RequestException as err:
            LOGGER.exception('HTTP Error:, %s', err)
            raise

    def check_token_valid(self)->bool:
        '''
        Checks to see if token is still valid, and records the
        expiry time in self.expiry_time.

        Returns
        -------
        bool
            False if no token has been obtained yet or if the
            token has expired, True otherwise
        '''
        if not self.__token_info:
            return False
        #Work in timezone-aware UTC so that the 'Z' suffix written to
        #expiry_time is accurate regardless of the machine's local timezone
        utc = datetime.timezone.utc
        expiry_time = (datetime.datetime.fromtimestamp(self.__token_info['created_at'],
                                                       tz=utc) +
                       datetime.timedelta(seconds=self.__token_info['expires_in']))
        self.expiry_time = expiry_time.strftime('%Y-%m-%dT%H:%M:%SZ')
        return datetime.datetime.now(tz=utc) <= expiry_time

    @property
    def token(self)->str:
        '''
        Return only a valid token, requesting a new one if there is
        no token yet or the current one has expired.
        '''
        if not self.check_token_valid():
            self.get_bearer_token()
            #Called again only to refresh expiry_time for the new token
            self.check_token_valid()
        return self.__token_info['access_token']

    @property
    def auth_header(self)->dict:
        '''
        Return valid authorization header
        '''
        return {'Accept' : 'application/json',
                'Content-Type' : 'application/json',
                'Authorization' : f'Bearer {self.token}'}
@@ -0,0 +1,180 @@
1
+ '''
2
+ This module contains the information that configures all the parameters
3
+ required to transfer data from Dryad to Dataverse.
4
+
5
+ "Constants" may be a bit strong, but the only constant is the
6
+ presence of change.
7
+ '''
8
+ import logging
9
+ import pathlib
10
+ import importlib.resources
11
+ import sys
12
+
13
+ from typing import Union
14
+ #from requests.packages.urllib3.util.retry import Retry
15
+ #Above causes Pylint error. WHY?
16
+ #Because it's a fake path and just a pointer. See requests source
17
+ from urllib3.util import Retry
18
+ import yaml
19
+
20
+ from dryad2dataverse import USERAGENT
21
+
22
+ LOGGER = logging.getLogger(__name__)
23
+ #Requests session retry strategy in case of bad connections
24
+ #See :https://findwork.dev/blog/
25
+ #advanced-usage-python-requests-timeouts-retries-hooks/#retry-on-failure
26
+ #also
27
+ #https://stackoverflow.com/questions/15431044/
28
+ #can-i-set-max-retries-for-requests-request
29
+ RETRY_STRATEGY = Retry(total=10,
30
+ status_forcelist=[429, 500, 502, 503, 504],
31
+ allowed_methods=['HEAD', 'GET', 'OPTIONS',
32
+ 'POST', 'PUT'],
33
+ backoff_factor=1)
34
+
35
+ #Variable listings from previous versions of this file
36
+ #that are now included in Constants
37
+ #
38
+ ##used in dryad2dataverse.serializer
39
+ #DRYURL = 'https://datadryad.org'
40
+ #TMP = '/tmp'
41
+ #
42
+ ##used in dryad2dataverse.transfer
43
+ #DVURL = 'https://borealisdata.ca'
44
+ #APIKEY = None
45
+ #MAX_UPLOAD = 3221225472 #Max 3GB upload
46
+ #DV_CONTACT_EMAIL = None
47
+ #DV_CONTACT_NAME = None
48
+ #NOTAB = ['.sav', '.por', '.zip', '.csv', '.tsv', '.dta',
49
+ # '.rdata', '.xslx', '.xls']
50
+ #
51
+ ##used in dryad2dataverse.monitor
52
+ #HOME = os.path.expanduser('~')
53
+ #DBASE = pathlib.Path(HOME, 'dryad_dataverse_monitor.sqlite3')
54
+
55
class Config(dict):
    '''
    Holds all the information about dryad2dataverse parameters.

    The instance is a dict whose contents are loaded from a YAML
    configuration file. If the file does not exist, it is created
    from the template shipped in dryad2dataverse.data.
    '''
    def __init__(self, cpath: Union[pathlib.Path, str]=None,
                 fname:str=None,
                 force:bool=False):
        '''
        Initialize, creating the configuration file if required and
        then loading it.

        Parameters
        ----------
        cpath : Union[pathlib.Path, str]
            Directory holding the configuration file. If not supplied,
            a per-platform default is used
        fname : str
            Name of the configuration file. If not supplied,
            'dryad2dataverse_config.yml' is used
        force : bool
            Force writing a new config file

        Raises
        ------
        FileNotFoundError
            If the configuration file does not exist and
            could not be created
        '''
        super().__init__()
        self.cpath = cpath
        self.fname = fname
        self.force = force
        #NOTE(review): the win32 entry has no leading '~', so expanduser()
        #leaves it relative to the current working directory. Confirm
        #whether '~/AppData/Roaming/dryad2dataverse' was intended.
        self.default_locations = {'ios': '~/.config/dryad2dataverse',
                                  'linux' : '~/.config/dryad2dataverse',
                                  'darwin': '~/Library/Application Support/dryad2dataverse',
                                  'win32' : 'AppData/Roaming/dryad2dataverse',
                                  'cygwin' : '~/.config/dryad2dataverse'}

        #Keep the raw text of the template rather than parsing it, so that
        #the explanatory comments survive when the template is written out.
        #Read through the resource API instead of open() so that it
        #does not depend on the resource being an ordinary file path.
        self.template = (importlib.resources.files('dryad2dataverse.data')
                         .joinpath('dryad2dataverse_config.yml')
                         .read_text(encoding='utf-8'))

        if not self.cpath:
            #Platforms not listed fall back to the ~/.config
            #location instead of raising a KeyError
            self.cpath = self.default_locations.get(sys.platform,
                                                    '~/.config/dryad2dataverse')
        if not self.fname:
            self.fname = 'dryad2dataverse_config.yml'
        self.configfile = pathlib.Path(self.cpath, self.fname).expanduser()

        if self.make_config_template():
            self.load_config()
        else:
            raise FileNotFoundError(f'Can\'t find {self.configfile}')

    @classmethod
    def update_headers(cls,
                       inheader:Union[None, dict]=None,
                       **kwargs)->dict:
        '''
        Update headers with user agent and token information (if present)

        Parameters
        ----------
        inheader : dict
            Existing header if present. Its values take precedence
            over every other header set here.

        **kwargs
            Keyword arguments, one of which should be 'token' containing
            a dryad2dataverse.auth.Token instance

        Returns
        -------
        dict
            New header dictionary; inheader itself is not modified
        '''
        headers = {'Accept':'application/json',
                   'Content-Type':'application/json',
                   'User-agent' : USERAGENT}
        if kwargs.get('token'):
            headers.update(kwargs['token'].auth_header)
        #Applied last so that caller-supplied headers win
        headers.update(inheader or {})
        return headers

    def make_config_template(self)->bool:
        '''
        Make a default config if one does not exist, or if
        self.force is set.

        Returns
        -------
        bool
            True if the config file exists after the call
            (whether pre-existing or newly written), False if not
        '''
        if self.configfile.exists() and not self.force:
            return True
        self.configfile.parent.mkdir(parents=True, exist_ok=True)
        with open(self.configfile, 'w', encoding='utf-8') as f:
            f.write(self.template)
        return self.configfile.exists()

    def load_config(self):
        '''
        Loads the config file into this dict.

        An empty config file loads as no values rather than failing.
        If the file is not valid YAML the error is logged and
        the program exits with a non-zero status.
        '''
        try:
            with open(self.configfile, 'r', encoding='utf-8') as f:
                loaded = yaml.safe_load(f)
        except yaml.YAMLError as e:
            LOGGER.exception('Unable to load config file, %s', e)
            sys.exit(1)
        #safe_load returns None for an empty document
        self.update(loaded or {})

    def overwrite(self):
        '''
        Overwrite the config file with current contents.

        Note that this will remove the comments from the YAML file.
        '''
        with open(self.configfile, 'w', encoding='utf-8') as w:
            #Dump a plain dict copy: the safe dumper is keyed on the exact
            #type and is not expected to accept a dict subclass
            yaml.safe_dump(dict(self), w)

    def validate(self):
        '''
        Ensure all keys have values.

        A key fails if its value is falsy (unless the key is one of the
        flags that may legitimately be false), or if its value is
        a list containing a falsy item.

        Raises
        ------
        ValueError
            If any key fails; the message lists the keys, one per line
        '''
        can_be_false = {'force_unlock', 'test_mode'}
        #Filter the exempt keys here rather than removing them afterwards:
        #list.remove raises ValueError when the key is not in the list,
        #which is the case whenever one of these flags is true or absent
        badkey = [k for k, v in self.items()
                  if not v and k not in can_be_false]
        #Non-empty lists holding an empty item; empty lists were caught above
        badkey.extend(k for k, v in self.items()
                      if isinstance(v, list) and v and not all(v))
        if badkey:
            #Joined outside the f-string: a backslash inside an f-string
            #replacement field is a syntax error before Python 3.12
            missing = '\n'.join(str(k) for k in badkey)
            raise ValueError('Null values in configuration. '
                             f'See:\n{missing}')
@@ -0,0 +1,127 @@
1
+ #Sample configuration file dryad2dataverse
2
+ #It will *not* work unless you fill it in, because both
3
+ #Dryad and Dataverse require user information.
4
+
5
+ #------
6
+ #Dryad configuration
7
+ #------
8
+ #Dryad base URL
9
+ dry_url: https://datadryad.org
10
+ #API path
11
+ api_path: /api/v2
12
+ #Application ID (contact Dryad to get an institutional account)
13
+ app_id: null
14
+ #Secret key, should have come with your application ID.
15
+ secret: null
16
+
17
+ #------
18
+ #Dataverse configuration
19
+ #------
20
+ #Base url of Dataverse instance (eg: https://borealisdata.ca)
21
+ dv_url: null
22
+ #Dataverse API KEY
23
+ api_key: null
24
+ #Maximum upload size in bytes (contact Dataverse administrator for value if unknown)
25
+ max_upload: 3221225472
26
+ #Contact email address for Dataverse record, eg: research.data@test.invalid
27
+ dv_contact_email: null
28
+ #Contact name associated with the address (like, say, "[University] Research Data Services")
29
+ dv_contact_name: null
30
+ #Dataverse target collection shortname
31
+ target: Null
32
+ #To stop conversion to tabular data, add extensions here. Tabular processing can cause
33
+ #problems and the original files were not processed that way. It is recommended to
34
+ #keep this as is and add more if required.
35
+ notab:
36
+ - .sav
37
+ - .por
38
+ - .zip
39
+ - .csv
40
+ - .tsv
41
+ - .dta
42
+ - .rdata
43
+ - .xslx
44
+ - .xls
45
+
46
+ #------
47
+ #Monitoring configuration
48
+ #------
49
+ #Location of persistent database which tracks transfers over time.
50
+ #If you ever move the database, you must change this to the new location or everything will be transferred again
51
+ dbase: ~/dryad_dataverse_monitor.sqlite3
52
+
53
+ #------
54
+ #Transfer information
55
+ #------
56
+ #Institutional ROR. Find your ROR here: https://ror.org/search
57
+ ror: null
58
+
59
+ #Location of temporarily downloaded files. This doesn't default to the normal
60
+ #temp file location because the files can be gigantic, and so is manually specified
61
+ tempfile_location: /tmp
62
+
63
+ #Email address which sends update notifications.
64
+ #Note: OAuth2 is not supported. Yahoo is free
65
+ #and you may as well use it
66
+ sending_email: null
67
+ #Account username. Check provider for details
68
+ sending_email_username: null
69
+ #Account password. Check provider for details; may be different than
70
+ #an ordinary account if using an application
71
+ email_send_password: null
72
+ #SMTP server configuration
73
+ smtp_server: smtp.mail.yahoo.com
74
+ #Mail is sent using SSL; check with provider for details
75
+ ssl_port: 465
76
+ #List of email addresses that will receive notifications
77
+ recipients:
78
+ - null
79
+ #location of dryadd log
80
+ #include full file name: eg: /var/log/dryadd.log
81
+ #The default below will exist but is a terrible place
82
+ #for a log so you should change it.
83
+ log: ~/dryadd.log
84
+ #level at which to write a log message. Select from:
85
+ # debug, info, warning, error or critical
86
+ loglevel: warning
87
+ #level at which to send an email message about problems.
88
+ #Same levels as above, obviously.
89
+ email_loglevel: warning
90
+
91
+ #Forcible file unlock. Forcible file unlocking requires admin privileges in Dataverse.
92
+ #Normally you wouldn't need to change this.
93
+ force_unlock: false
94
+ #Number of database backups to keep
95
+ number_of_backups: 3
96
+
97
+ #------
98
+ #Troubleshooting options
99
+ #------
100
+ #Warn if too many new updates. Occasionally, Dryad will change their
101
+ #"persistent" IDs and then everything looks new, which causes everything
102
+ #to be loaded again. It's recommended that this be "true" to stop an accidental
103
+ #complete reingest
104
+ warn_too_many: true
105
+ #Number of new Dryad surveys which will trigger a warning and stop execution.
106
+ #This is to prevent accidentally ingesting thousands of surveys if you
107
+ #misconfigure something
108
+ warning_threshold: 15
109
+ #Force dryadd into test mode
110
+ test_mode: false
111
+ #Test mode - only transfer first [n] of the total number of (new) records.
112
+ #Old ones will still be updated, though
113
+ test_mode_limit: 5
114
+
115
+
116
+ #------
117
+ #Exclusion list
118
+ #------
119
+ #Dryad DOIs to exclude from transfers. This is usually because the files in the
120
+ #study are too large to be ingested into Dataverse, but may also be used for
121
+ #studies with errors or any other reason
122
+ #
123
+ #IMPORTANT!
124
+ #
125
+ #Uncomment below and add dois in place of null, one per line.
126
+ #exclude_list:
127
+ #- null
@@ -12,9 +12,13 @@ class SSLSMTPHandler(SMTPHandler):
12
12
  An SSL handler for logging.handlers
13
13
  '''
14
14
  def emit(self, record:logging.LogRecord):
15
- """
15
+ '''
16
16
  Emit a record while using an SSL mail server.
17
- """
17
+
18
+ Parameters
19
+ ----------
20
+ record : logging.LogRecord
21
+ '''
18
22
  #Praise be to
19
23
  #https://stackoverflow.com/questions/36937461/
20
24
  #how-can-i-send-an-email-using-python-loggings-