dataverse-utils 0.22.4__tar.gz → 0.22.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/PKG-INFO +3 -3
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/pyproject.toml +3 -3
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/__init__.py +4 -4
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/collections.py +59 -20
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/ldc.py +7 -5
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_collection_info.py +87 -99
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_ldc_uploader.py +11 -7
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_release.py +1 -2
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/LICENCE.md +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/README.md +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/data/LDC_EULA_general.md +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/dataverse_utils.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/dvdata.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_del.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_list_files.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_manifest_gen.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_pg_facet_date.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_readme_creator.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_record_copy.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_replace_licence.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_study_migrator.py +0 -0
- {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_upload_tsv.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataverse-utils
|
|
3
|
-
Version: 0.22.4
|
|
3
|
+
Version: 0.22.7
|
|
4
4
|
Summary: Utilities for the Dataverse data respository system
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENCE.md
|
|
@@ -16,14 +16,14 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.13
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.14
|
|
18
18
|
Requires-Dist: bs4 (>=0.0.2,<0.0.3)
|
|
19
|
-
Requires-Dist: chardet (>=5.2
|
|
19
|
+
Requires-Dist: chardet (>=5.2)
|
|
20
20
|
Requires-Dist: dryad2dataverse (>=0.8.4,<0.9.0)
|
|
21
21
|
Requires-Dist: markdown (>=3.10.2,<4.0.0)
|
|
22
22
|
Requires-Dist: markdown-pdf (>=1.13.1,<2.0.0)
|
|
23
23
|
Requires-Dist: markdownify (>=1.2.2,<2.0.0)
|
|
24
24
|
Requires-Dist: pyreadr (>=0.5.4,<0.6.0)
|
|
25
25
|
Requires-Dist: pyreadstat (>=1.3.3,<2.0.0)
|
|
26
|
-
Requires-Dist: requests (>=2.30.0,<3.0.0)
|
|
26
|
+
Requires-Dist: requests (>=2.33,<3.0)
|
|
27
27
|
Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
|
|
28
28
|
Requires-Dist: tqdm (>=4.67.3,<5.0.0)
|
|
29
29
|
Project-URL: Homepage, https://ubc-library-rc.github.io/dataverse_utils
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "dataverse-utils"
|
|
3
|
-
version = "0.22.4"
|
|
3
|
+
version = "0.22.7"
|
|
4
4
|
description = "Utilities for the Dataverse data respository system"
|
|
5
5
|
authors = [
|
|
6
6
|
{name = "Paul Lesack",email = "paul.lesack@ubc.ca"}
|
|
@@ -10,7 +10,6 @@ readme = "README.md"
|
|
|
10
10
|
requires-python = ">=3.10, <4.0"
|
|
11
11
|
#When requests 2.33 is released, update poetry and release
|
|
12
12
|
dependencies = [
|
|
13
|
-
"requests (>=2.30.0,<3.0.0)",
|
|
14
13
|
"bs4 (>=0.0.2,<0.0.3)",
|
|
15
14
|
"markdown (>=3.10.2,<4.0.0)",
|
|
16
15
|
"markdown-pdf (>=1.13.1,<2.0.0)",
|
|
@@ -20,7 +19,8 @@ dependencies = [
|
|
|
20
19
|
"requests-toolbelt (>=1.0.0,<2.0.0)",
|
|
21
20
|
"tqdm (>=4.67.3,<5.0.0)",
|
|
22
21
|
"dryad2dataverse (>=0.8.4,<0.9.0)",
|
|
23
|
-
"chardet (>=5.2
|
|
22
|
+
"chardet (>=5.2)",
|
|
23
|
+
"requests (>=2.33,<3.0)"
|
|
24
24
|
]
|
|
25
25
|
#Chardet and requests will need to be changed when requests goes to 2.32; at that
|
|
26
26
|
#point just remove chardet
|
|
@@ -7,7 +7,7 @@ import pathlib
|
|
|
7
7
|
import sys
|
|
8
8
|
from dataverse_utils.dataverse_utils import *
|
|
9
9
|
|
|
10
|
-
VERSION = (0, 22, 4)
|
|
10
|
+
VERSION = (0, 22, 7)
|
|
11
11
|
__version__ = '.'.join([str(x) for x in VERSION])
|
|
12
12
|
|
|
13
13
|
USERAGENT = (f'dataverse_utils/v{__version__} ({sys.platform.capitalize()}); '
|
|
@@ -15,14 +15,14 @@ USERAGENT = (f'dataverse_utils/v{__version__} ({sys.platform.capitalize()}); '
|
|
|
15
15
|
UAHEADER = {'User-agent' : USERAGENT}
|
|
16
16
|
|
|
17
17
|
SCRIPT_VERSIONS={
|
|
18
|
-
'dv_collection_info' : (0,
|
|
18
|
+
'dv_collection_info' : (0, 3, 0),
|
|
19
19
|
'dv_del' : (0, 2, 4),
|
|
20
|
-
'dv_ldc_uploader' : (0,
|
|
20
|
+
'dv_ldc_uploader' : (0, 4, 1),
|
|
21
21
|
'dv_list_files' : (0, 1, 1),
|
|
22
22
|
'dv_manifest_gen' : (0, 5, 1),
|
|
23
23
|
'dv_pg_facet_date' : (0, 1, 1),
|
|
24
24
|
'dv_record_copy' : (0, 1, 2),
|
|
25
|
-
'dv_release' : (0, 1,
|
|
25
|
+
'dv_release' : (0, 1, 3),
|
|
26
26
|
'dv_replace_licence' : (0, 1, 1),
|
|
27
27
|
'dv_readme_creator' : (0, 1, 1),
|
|
28
28
|
'dv_study_migrator' : (0, 4, 1),
|
|
@@ -9,6 +9,7 @@ import io
|
|
|
9
9
|
import logging
|
|
10
10
|
import pathlib
|
|
11
11
|
import string
|
|
12
|
+
import sys
|
|
12
13
|
import tempfile
|
|
13
14
|
import textwrap
|
|
14
15
|
import typing
|
|
@@ -80,11 +81,25 @@ class DvCollection:
|
|
|
80
81
|
self.retry_strategy = RETRY
|
|
81
82
|
else:
|
|
82
83
|
self.retry_strategy = kwargs['retry']
|
|
84
|
+
self.collections = None
|
|
83
85
|
self.session = requests.Session()
|
|
84
86
|
self.session.mount('https://',
|
|
85
87
|
requests.adapters.HTTPAdapter(max_retries=self.retry_strategy))
|
|
86
|
-
self.collections = None
|
|
87
88
|
self.studies = None
|
|
89
|
+
self.__root = None
|
|
90
|
+
self.all_colls = [self.root]
|
|
91
|
+
|
|
92
|
+
@property
|
|
93
|
+
def root(self):
|
|
94
|
+
'''
|
|
95
|
+
Return the name and short name of the top level collection
|
|
96
|
+
'''
|
|
97
|
+
if not self.__root:
|
|
98
|
+
x = self.session.get(f'{self.url}/api/dataverses/{self.coll}',
|
|
99
|
+
headers=self.headers)
|
|
100
|
+
x.raise_for_status()
|
|
101
|
+
self.__root = (x.json()['data']['name'], x.json()['data']['alias'])
|
|
102
|
+
return self.__root
|
|
88
103
|
|
|
89
104
|
def __clean_url(self, badurl:str):
|
|
90
105
|
'''
|
|
@@ -142,7 +157,6 @@ class DvCollection:
|
|
|
142
157
|
out=self.__get_shortname(_['id'])
|
|
143
158
|
dvs.append((_['title'], out))
|
|
144
159
|
except Exception as e:
|
|
145
|
-
|
|
146
160
|
obscure_error = f'''
|
|
147
161
|
An error has occured where a collection can be
|
|
148
162
|
identified by ID but its name cannot be determined.
|
|
@@ -155,12 +169,13 @@ class DvCollection:
|
|
|
155
169
|
|
|
156
170
|
Problematic collection id number: {_.get("id",
|
|
157
171
|
"not available")}'''
|
|
158
|
-
|
|
159
|
-
print(
|
|
172
|
+
#to sys.stdout?
|
|
173
|
+
print(50*'-', file=sys.stderr)
|
|
174
|
+
print(textwrap.dedent(obscure_error), file=sys.stderr)
|
|
160
175
|
print(e)
|
|
161
176
|
LOGGER.error(textwrap.fill(textwrap.dedent(obscure_error).strip()))
|
|
162
177
|
traceback.print_exc()
|
|
163
|
-
print(50*'-')
|
|
178
|
+
print(50*'-', file=sys.stderr)
|
|
164
179
|
raise e
|
|
165
180
|
#---
|
|
166
181
|
if not dvs:
|
|
@@ -171,6 +186,7 @@ class DvCollection:
|
|
|
171
186
|
LOGGER.debug('recursive')
|
|
172
187
|
self.get_collections(dv[1], output)
|
|
173
188
|
self.collections = output
|
|
189
|
+
self.collections.append(self.root)
|
|
174
190
|
return output
|
|
175
191
|
|
|
176
192
|
def get_studies(self, root:str=None):
|
|
@@ -186,9 +202,8 @@ class DvCollection:
|
|
|
186
202
|
if not root:
|
|
187
203
|
root=self.coll
|
|
188
204
|
all_studies = self.get_collection_listing(root)
|
|
189
|
-
#collections = self.get_collections(root, self.url)
|
|
190
205
|
collections = self.get_collections(root)
|
|
191
|
-
for collection in collections:
|
|
206
|
+
for collection in tqdm.tqdm(collections):
|
|
192
207
|
all_studies.extend(self.get_collection_listing(collection[1]))
|
|
193
208
|
self.studies = all_studies
|
|
194
209
|
return all_studies
|
|
@@ -207,12 +222,16 @@ class DvCollection:
|
|
|
207
222
|
cl.raise_for_status()
|
|
208
223
|
pids = [f"{z['protocol']}:{z['authority']}/{z['identifier']}"
|
|
209
224
|
for z in cl.json()['data'] if z['type'] == 'dataset']
|
|
210
|
-
|
|
225
|
+
#Pass collection info into the study because that's not available from
|
|
226
|
+
#a metadata download
|
|
227
|
+
smkwargs = [{'collection_name':_[0] , 'collection_short_name':_[1]}
|
|
228
|
+
for _ in self.collections if coll_id == _[1]][0]
|
|
229
|
+
out = [(self.get_study_info(pid, **smkwargs), pid) for pid in pids]
|
|
211
230
|
for _ in out:
|
|
212
231
|
_[0].update({'pid': _[1]})
|
|
213
232
|
return [x[0] for x in out]
|
|
214
233
|
|
|
215
|
-
def get_study_info(self, pid):
|
|
234
|
+
def get_study_info(self, pid, **kwargs):
|
|
216
235
|
'''
|
|
217
236
|
Returns a StudyMetadata object with complete metadata for a study.
|
|
218
237
|
|
|
@@ -220,13 +239,16 @@ class DvCollection:
|
|
|
220
239
|
----------
|
|
221
240
|
pid : str
|
|
222
241
|
Persistent ID of a Dataverse study
|
|
242
|
+
|
|
243
|
+
**kwargs
|
|
244
|
+
Other useful information to pass onto StudyMetadata, such as collection info, etc.
|
|
223
245
|
'''
|
|
224
246
|
meta = self.session.get(f'{self.url}/api/datasets/:persistentId',
|
|
225
247
|
params={'persistentId': pid},
|
|
226
248
|
headers=self.headers)
|
|
227
249
|
meta.raise_for_status()
|
|
228
250
|
LOGGER.debug(pid)
|
|
229
|
-
return StudyMetadata(study_meta=meta.json(), key=self.__key, url=self.url)
|
|
251
|
+
return StudyMetadata(study_meta=meta.json(), key=self.__key, url=self.url, **kwargs)
|
|
230
252
|
|
|
231
253
|
class StudyMetadata(dict):
|
|
232
254
|
'''
|
|
@@ -268,6 +290,10 @@ class StudyMetadata(dict):
|
|
|
268
290
|
self.all_versions = None
|
|
269
291
|
self.url = kwargs.get('url')
|
|
270
292
|
self.pid = kwargs.get('pid')
|
|
293
|
+
#If only there would be an easy way to check if something was deaccessioned
|
|
294
|
+
#without yet another request. But right now, let's assume it's fine.
|
|
295
|
+
#See below (under Key Error) where it get set
|
|
296
|
+
self.deaccession_flag = 0
|
|
271
297
|
if self.study_meta:
|
|
272
298
|
#self.pid = kwargs.get('pid', (f"{self.study_meta['data']['protocol']}:"
|
|
273
299
|
# f"{self.study_meta['data']['authority']}"
|
|
@@ -286,15 +312,23 @@ class StudyMetadata(dict):
|
|
|
286
312
|
try:
|
|
287
313
|
self.update(self.extract_metadata(self.study_meta['data']['latestVersion']))
|
|
288
314
|
except KeyError as e:
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
315
|
+
if (self.study_meta.get('status') == 'OK' and not
|
|
316
|
+
self.study_meta['data'].get('latestVersion')):
|
|
317
|
+
# Latest version is not available because API strips out all
|
|
318
|
+
# citation metadata for deaccessioned studies but doesn't
|
|
319
|
+
# actually indicate this in any obvious manner
|
|
320
|
+
# This is further complicated because *all* the metadata
|
|
321
|
+
# we want is in the metadata blocks, which won't exist in the JSON
|
|
322
|
+
# because for some idiotic reason it's OK to expose it in the GUI
|
|
323
|
+
# but not via API.
|
|
324
|
+
self.deaccession_flag = 1
|
|
325
|
+
else:
|
|
326
|
+
raise MetadataError(f'Unable to parse study metadata. Do you need an API key?\n'
|
|
327
|
+
f'{e} key not found.\n'
|
|
328
|
+
f'Offending JSON: {self.study_meta}') from e
|
|
292
329
|
self.__files = None
|
|
293
330
|
self.__all_files = None
|
|
294
|
-
|
|
295
|
-
# for n, _ in enumerate(self.all_versions['data'])}
|
|
296
|
-
#self.index = {_: n for _, n in enumerate(self.versions)}
|
|
297
|
-
self.index = dict(enumerate(self.versions))
|
|
331
|
+
self.index = {_: n for n, _ in enumerate(self.versions)}
|
|
298
332
|
|
|
299
333
|
def __obtain_metadata(self):
|
|
300
334
|
'''
|
|
@@ -354,6 +388,10 @@ class StudyMetadata(dict):
|
|
|
354
388
|
tmp['versionStatement'] = f"{chunk['versionNumber']}.{chunk['versionMinorNumber']}"
|
|
355
389
|
else:
|
|
356
390
|
tmp['versionStatement'] = f"{chunk.get('versionState', '')}"
|
|
391
|
+
|
|
392
|
+
for _ in ['collection_name', 'collection_short_name']:
|
|
393
|
+
if self.kwargs.get(_):
|
|
394
|
+
tmp[_] = self.kwargs[_]
|
|
357
395
|
return tmp
|
|
358
396
|
|
|
359
397
|
def extract_field_metadata(self, field):
|
|
@@ -549,7 +587,7 @@ class StudyMetadata(dict):
|
|
|
549
587
|
|
|
550
588
|
files = [self.flatten(_) for _ in filelist]
|
|
551
589
|
for ff in files:
|
|
552
|
-
ff.update({'
|
|
590
|
+
ff.update({'dataset_pid': self.pid})
|
|
553
591
|
return files
|
|
554
592
|
|
|
555
593
|
def __extract_files(self):
|
|
@@ -560,9 +598,11 @@ class StudyMetadata(dict):
|
|
|
560
598
|
#but files would (usually) be an arbitrary number of files.
|
|
561
599
|
#That bothers me on an intellectual level. Therefore, it will be attribute.
|
|
562
600
|
#Iterate over StudyMetadata.files if you want to know the contents
|
|
563
|
-
if not self.__files:
|
|
601
|
+
if not self.__files and not self.deaccession_flag:
|
|
564
602
|
self.__files = self.extract_files(self.study_meta['data']
|
|
565
603
|
['latestVersion']['files'])
|
|
604
|
+
if self.deaccession_flag:
|
|
605
|
+
self.__files = []
|
|
566
606
|
|
|
567
607
|
def __extract_licence_info(self, indict)->dict:
|
|
568
608
|
'''
|
|
@@ -695,7 +735,6 @@ class ReadmeCreator:
|
|
|
695
735
|
return f'{inkey}: \n'
|
|
696
736
|
return f'{inkey}: '
|
|
697
737
|
|
|
698
|
-
|
|
699
738
|
def __extract_files(self):
|
|
700
739
|
'''
|
|
701
740
|
Extract file level metadata, and write to self.__files.
|
|
@@ -14,6 +14,7 @@ import requests
|
|
|
14
14
|
from requests.adapters import HTTPAdapter
|
|
15
15
|
from bs4 import BeautifulSoup as bs
|
|
16
16
|
import dryad2dataverse.serializer as ds
|
|
17
|
+
import dryad2dataverse.config as dc
|
|
17
18
|
from dataverse_utils import UAHEADER
|
|
18
19
|
|
|
19
20
|
#pylint: disable=invalid-name
|
|
@@ -29,7 +30,7 @@ class Ldc(ds.Serializer):#pylint: disable=too-many-instance-attributes
|
|
|
29
30
|
An LDC item (eg, LDC2021T01)
|
|
30
31
|
'''
|
|
31
32
|
#pylint: disable=super-init-not-called, arguments-differ
|
|
32
|
-
def __init__(self, ldc, cert=None):
|
|
33
|
+
def __init__(self, ldc, cert=None, **kwargs):
|
|
33
34
|
'''
|
|
34
35
|
Returns a dict with keys created from an LDC catalogue web
|
|
35
36
|
page.
|
|
@@ -54,10 +55,11 @@ class Ldc(ds.Serializer):#pylint: disable=too-many-instance-attributes
|
|
|
54
55
|
self.cert = cert
|
|
55
56
|
self.session = requests.Session()
|
|
56
57
|
self.session.mount('https://',
|
|
57
|
-
HTTPAdapter(max_retries=
|
|
58
|
+
HTTPAdapter(max_retries=dc.RETRY_STRATEGY))
|
|
58
59
|
if self.cert:
|
|
59
60
|
self.cert = os.path.expanduser(self.cert)
|
|
60
61
|
self.__fixdesc = None
|
|
62
|
+
self.kwargs = kwargs
|
|
61
63
|
|
|
62
64
|
@property
|
|
63
65
|
def ldcJson(self):
|
|
@@ -120,7 +122,7 @@ class Ldc(ds.Serializer):#pylint: disable=too-many-instance-attributes
|
|
|
120
122
|
'''
|
|
121
123
|
#pylint: disable=property-with-parameters
|
|
122
124
|
if not maxsize:
|
|
123
|
-
maxsize =
|
|
125
|
+
maxsize = self.kwargs.get('max_upload', 68719476736)
|
|
124
126
|
|
|
125
127
|
@property
|
|
126
128
|
def id(self):
|
|
@@ -129,7 +131,7 @@ class Ldc(ds.Serializer):#pylint: disable=too-many-instance-attributes
|
|
|
129
131
|
'''
|
|
130
132
|
return self.ldc
|
|
131
133
|
|
|
132
|
-
def
|
|
134
|
+
def fetch_ldc_record(self, timeout=45):
|
|
133
135
|
'''
|
|
134
136
|
Downloads record from LDC website
|
|
135
137
|
|
|
@@ -150,7 +152,7 @@ class Ldc(ds.Serializer):#pylint: disable=too-many-instance-attributes
|
|
|
150
152
|
page.
|
|
151
153
|
'''
|
|
152
154
|
if not self.ldcHtml:
|
|
153
|
-
self.
|
|
155
|
+
self.fetch_ldc_record()
|
|
154
156
|
soup = bs(self.ldcHtml, 'html.parser')
|
|
155
157
|
#Should data just look in the *first* table? Specifically tbody?
|
|
156
158
|
#Is it always the first? I assume yes.
|
{dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_collection_info.py
RENAMED
|
@@ -5,8 +5,12 @@ outputs study metadata for the latest version
|
|
|
5
5
|
import argparse
|
|
6
6
|
import io
|
|
7
7
|
import csv
|
|
8
|
+
import pathlib
|
|
9
|
+
import sqlite3
|
|
8
10
|
import sys
|
|
9
11
|
import textwrap
|
|
12
|
+
|
|
13
|
+
import pandas as pd # I could use sqlite but why go the hassle
|
|
10
14
|
import dataverse_utils
|
|
11
15
|
import dataverse_utils.collections as dvc
|
|
12
16
|
|
|
@@ -19,11 +23,14 @@ def parse() -> argparse.ArgumentParser():
|
|
|
19
23
|
description = textwrap.fill(textwrap.dedent(
|
|
20
24
|
'''
|
|
21
25
|
Recursively parses a dataverse collection and
|
|
22
|
-
outputs study metadata for the latest version.
|
|
26
|
+
outputs study and file metadata for the latest version.
|
|
23
27
|
|
|
24
28
|
If analyzing publicly available collections, a
|
|
25
29
|
dataverse API key for the target system is not
|
|
26
30
|
required.
|
|
31
|
+
|
|
32
|
+
Study and file output can be joined on 'pid' (studies) and
|
|
33
|
+
'dataset_pid' (files).
|
|
27
34
|
'''), 80)
|
|
28
35
|
parser = argparse.ArgumentParser(description=description,
|
|
29
36
|
formatter_class=argparse.RawTextHelpFormatter)
|
|
@@ -32,27 +39,24 @@ def parse() -> argparse.ArgumentParser():
|
|
|
32
39
|
'defaults to "https://abacus.library.ubc.ca"'))
|
|
33
40
|
parser.add_argument('-k', '--key', required=False,
|
|
34
41
|
help='API key', default=None)
|
|
42
|
+
parser.add_argument('output',
|
|
43
|
+
help=textwrap.fill(textwrap.dedent(
|
|
44
|
+
'''
|
|
45
|
+
Output file name prefix. If tsv output is chosen,
|
|
46
|
+
files will be saved as [prefix]_studies.tsv
|
|
47
|
+
and [prefix]_files.tsv.
|
|
48
|
+
|
|
49
|
+
If SQLite output is chosen, it will be a single file file: [prefix].sqlite3.
|
|
50
|
+
'''),80))
|
|
35
51
|
parser.add_argument('-d', '--delimiter', required=False,
|
|
36
52
|
help='Delimiter for output spreadsheet. Default: tab (\\t)',
|
|
37
53
|
default='\t')
|
|
38
|
-
parser.add_argument('-f', '--fields',
|
|
39
|
-
help=textwrap.fill(('Record metadata fields to output. '
|
|
40
|
-
'For all fields, use "all". '
|
|
41
|
-
'Default: title, author. for '
|
|
42
|
-
'study metadata and file label, id for file metadata' )),
|
|
43
|
-
nargs='*',
|
|
44
|
-
default=['title', 'author', 'label', 'dataFile_id'])
|
|
45
|
-
parser.add_argument('-o', '--output', help='Output file name.',
|
|
46
|
-
required=False)
|
|
47
54
|
parser.add_argument('-i','--include-all-versions',
|
|
48
55
|
help='Include *all** versions, not just the current version',
|
|
49
56
|
action='store_true')
|
|
50
|
-
parser.add_argument('--
|
|
51
|
-
help=
|
|
52
|
-
|
|
53
|
-
'and the version (if applicable) so that study metadata '
|
|
54
|
-
'and file metadata can be linked')),
|
|
55
|
-
action='store_true')
|
|
57
|
+
parser.add_argument('-s', '--sqlite',
|
|
58
|
+
help='Save output as SQLite3 database',
|
|
59
|
+
action='store_true')
|
|
56
60
|
group = parser.add_argument_group(title='Harvest options',
|
|
57
61
|
description=textwrap.fill(
|
|
58
62
|
' You can obtain info for *either* a recursive crawl '
|
|
@@ -64,18 +68,17 @@ def parse() -> argparse.ArgumentParser():
|
|
|
64
68
|
help=('Dataverse collection shortname or id at the '
|
|
65
69
|
'top of the tree'))
|
|
66
70
|
mgroup.add_argument('-p', '--pid',
|
|
67
|
-
help=
|
|
68
|
-
'top of the tree'))
|
|
71
|
+
help='Dataverse study persistent identifier (DOI/handle)')
|
|
69
72
|
parser.add_argument('-v', '--version', action='version',
|
|
70
73
|
version=dataverse_utils.script_ver_stmt(parser.prog),
|
|
71
74
|
help='Show version number and exit')
|
|
72
75
|
return parser
|
|
73
76
|
|
|
74
|
-
def fields(args:argparse.ArgumentParser, all_studies)->dict:
|
|
77
|
+
def fields(include_all:bool, is_file:bool, all_studies)->dict:
|
|
75
78
|
'''
|
|
76
79
|
Outputs appropriate header fields based on argparse values
|
|
77
80
|
'''
|
|
78
|
-
match (
|
|
81
|
+
match (include_all, is_file):
|
|
79
82
|
case (0, 0):
|
|
80
83
|
fieldnames = sorted(list(set(key for study in all_studies for key in study)))
|
|
81
84
|
case (1, 0):
|
|
@@ -104,54 +107,6 @@ def fields(args:argparse.ArgumentParser, all_studies)->dict:
|
|
|
104
107
|
|
|
105
108
|
return fieldnames
|
|
106
109
|
|
|
107
|
-
def fields_no(args:argparse.ArgumentParser, all_studies, fmeta=False)->dict:
|
|
108
|
-
'''
|
|
109
|
-
Outputs appropriate header fields based on argparse values
|
|
110
|
-
'''
|
|
111
|
-
#print(args)
|
|
112
|
-
match (args.include_all_versions, args.files, fmeta):
|
|
113
|
-
case (0, 0, 0):
|
|
114
|
-
fieldnames = sorted(list(set(key for study in all_studies for key in study)))
|
|
115
|
-
case (1, 0, 0):
|
|
116
|
-
fieldnames = sorted(list(set(key for study in all_studies
|
|
117
|
-
for ver in study.versions
|
|
118
|
-
for key in study.version_metadata(ver))))
|
|
119
|
-
case (0, 1, 0):
|
|
120
|
-
fieldnames = sorted(list(set(key for study in all_studies
|
|
121
|
-
for file in study.files
|
|
122
|
-
for key in file)))
|
|
123
|
-
#this is actually an outer join
|
|
124
|
-
#case (1, 1, 0):
|
|
125
|
-
# fieldnames1 = sorted(list(set(key for study in coll_me.studies
|
|
126
|
-
# for ver in study.versions
|
|
127
|
-
# for file in study.version_files(ver)
|
|
128
|
-
# for key in file)))
|
|
129
|
-
# fieldnames = sorted(list(set(key for study in coll_me.studies
|
|
130
|
-
# for ver in study.versions
|
|
131
|
-
# for key in study.version_metadata(ver))))
|
|
132
|
-
# fieldnames.extend(fieldnames1)
|
|
133
|
-
case (1, 1, 0):
|
|
134
|
-
fieldnames = sorted(list(set(key for study in all_studies
|
|
135
|
-
for ver in study.versions
|
|
136
|
-
for file in study.version_files(ver)
|
|
137
|
-
for key in file)))
|
|
138
|
-
|
|
139
|
-
case (1, 0, 1):
|
|
140
|
-
fieldnames = sorted(list(set(key for ver in all_studies[0].versions
|
|
141
|
-
for key in all_studies[0].version_metadata(ver))))
|
|
142
|
-
case (1, 1, 1):
|
|
143
|
-
fieldnames = sorted(list(set(key
|
|
144
|
-
for ver in all_studies[0].versions
|
|
145
|
-
for file in all_studies[0].version_files(ver)
|
|
146
|
-
for key in file)))
|
|
147
|
-
case (0, 1, 1):
|
|
148
|
-
fieldnames = sorted(list(set(key for file in all_studies[0].files
|
|
149
|
-
for key in file)))
|
|
150
|
-
|
|
151
|
-
case (0, 0, 1):
|
|
152
|
-
fieldnames = sorted(list(set(all_studies[0])))
|
|
153
|
-
return fieldnames
|
|
154
|
-
|
|
155
110
|
def output(study, include_all=False, file=False)->list:
|
|
156
111
|
'''
|
|
157
112
|
Returns a list of appropriately selected metadata
|
|
@@ -162,7 +117,7 @@ def output(study, include_all=False, file=False)->list:
|
|
|
162
117
|
return [study]
|
|
163
118
|
case (1,0):
|
|
164
119
|
for v in study.versions:
|
|
165
|
-
out.append(study.
|
|
120
|
+
out.append(study.version_metadata(v))
|
|
166
121
|
return out
|
|
167
122
|
case (0,1):
|
|
168
123
|
return study.files
|
|
@@ -184,14 +139,24 @@ def output(study, include_all=False, file=False)->list:
|
|
|
184
139
|
case _:
|
|
185
140
|
return []
|
|
186
141
|
|
|
142
|
+
def extension(args:argparse.ArgumentParser):
|
|
143
|
+
'''
|
|
144
|
+
Return extension for output
|
|
145
|
+
'''
|
|
146
|
+
extype ={'\t' : '.tsv',
|
|
147
|
+
',' : '.csv'}
|
|
148
|
+
if args.sqlite:
|
|
149
|
+
return '.sqlite3'
|
|
150
|
+
return extype.get(args.delimiter, '.txt')
|
|
151
|
+
|
|
187
152
|
def main():
|
|
188
153
|
'''
|
|
189
154
|
You know what this is
|
|
190
155
|
'''
|
|
191
|
-
#pylint: disable=too-many-branches
|
|
156
|
+
#pylint: disable=too-many-branches, too-many-locals
|
|
192
157
|
args = parse().parse_args()
|
|
193
158
|
if args.collection:
|
|
194
|
-
coll_me
|
|
159
|
+
coll_me = dvc.DvCollection(args.url, args.collection, args.key)
|
|
195
160
|
try:
|
|
196
161
|
coll_me.get_collections()
|
|
197
162
|
except TypeError:
|
|
@@ -209,36 +174,59 @@ def main():
|
|
|
209
174
|
except (KeyError, dataverse_utils.collections.MetadataError) as e:
|
|
210
175
|
print(e, file=sys.stderr)
|
|
211
176
|
sys.exit()
|
|
212
|
-
|
|
213
|
-
|
|
177
|
+
fname = {0: '_studies', 1:'_files'}
|
|
178
|
+
outdata = {}
|
|
179
|
+
for stud_file in range(2): # studies and files
|
|
180
|
+
fieldnames= fields(args.include_all_versions, stud_file, all_studies)
|
|
181
|
+
out = io.StringIO(newline='')
|
|
182
|
+
writer = csv.DictWriter(out,
|
|
183
|
+
fieldnames=fieldnames,
|
|
184
|
+
delimiter=args.delimiter,
|
|
185
|
+
quoting=csv.QUOTE_MINIMAL,
|
|
186
|
+
extrasaction='ignore')
|
|
187
|
+
writer.writeheader()
|
|
188
|
+
for stud in all_studies:
|
|
189
|
+
for row in output(stud, args.include_all_versions, stud_file):
|
|
190
|
+
data = {k:v.replace('\t',' ').replace('\r\n', ' ').replace('\n',' ')
|
|
191
|
+
if isinstance(v, str) else v
|
|
192
|
+
for k, v in row.items()}
|
|
193
|
+
writer.writerow(data)
|
|
194
|
+
out.seek(0)
|
|
195
|
+
outdata[fname[stud_file][1:]] = out
|
|
196
|
+
if not args.sqlite:
|
|
197
|
+
outf = pathlib.Path(args.output+f'{fname[stud_file]}{extension(args)}').expanduser()
|
|
198
|
+
with open(outf,
|
|
199
|
+
'w', encoding='utf-8') as f:
|
|
200
|
+
print(f'Writing {str(outf)}', file=sys.stdout)
|
|
201
|
+
f.write(out.read())
|
|
214
202
|
|
|
215
|
-
#if 'all' in [x.lower() for x in args.fields] and args.pid:
|
|
216
|
-
# fieldnames = fields(args, all_studies, 1)
|
|
217
|
-
if 'all' in [x.lower() for x in args.fields]:
|
|
218
|
-
fieldnames = fields(args, all_studies)
|
|
219
203
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
204
|
+
if args.sqlite:
|
|
205
|
+
print(f'Writing {str(pathlib.Path(args.output+extension(args)).expanduser())}',
|
|
206
|
+
file=sys.stdout)
|
|
207
|
+
conn = sqlite3.connect(pathlib.Path(args.output+extension(args)).expanduser())
|
|
208
|
+
for k,v in outdata.items():
|
|
209
|
+
x=pd.read_csv(v, delimiter=args.delimiter)
|
|
210
|
+
x.to_sql(k, conn, if_exists='replace', index=0)
|
|
211
|
+
cursor = conn.cursor()
|
|
212
|
+
cursor.execute('DROP VIEW IF EXISTS short_combined_view;')
|
|
213
|
+
query = textwrap.fill(textwrap.dedent(
|
|
214
|
+
'''CREATE VIEW short_combined_view AS
|
|
215
|
+
SELECT studies.pid AS pid,
|
|
216
|
+
studies.authorName AS author,
|
|
217
|
+
studies.title AS title,
|
|
218
|
+
studies.dateOfDeposit AS deposit_date,
|
|
219
|
+
studies.versionStatement AS version_statement,
|
|
220
|
+
files.dataFile_filename AS file_name,
|
|
221
|
+
files.dataFile_id AS file_id,
|
|
222
|
+
files.restricted AS restricted,
|
|
223
|
+
files.version AS file_version
|
|
224
|
+
FROM studies
|
|
225
|
+
INNER JOIN files ON studies.pid = files.dataset_pid;
|
|
226
|
+
'''
|
|
227
|
+
),80)
|
|
228
|
+
cursor.execute(query)
|
|
229
|
+
conn.close()
|
|
242
230
|
|
|
243
231
|
if __name__ == '__main__':
|
|
244
232
|
main()
|
{dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_ldc_uploader.py
RENAMED
|
@@ -6,6 +6,7 @@ python3 uploadme.py LDC20201S01 . . . LDC2021T21 apikey
|
|
|
6
6
|
'''
|
|
7
7
|
import argparse
|
|
8
8
|
import sys
|
|
9
|
+
import dryad2dataverse.config as dc
|
|
9
10
|
import dataverse_utils as du
|
|
10
11
|
from dataverse_utils import ldc
|
|
11
12
|
|
|
@@ -69,7 +70,7 @@ def parse() -> argparse.ArgumentParser():
|
|
|
69
70
|
|
|
70
71
|
def upload_meta(ldccat: str, url: str, key: str,#pylint: disable = too-many-arguments, too-many-positional-arguments
|
|
71
72
|
dvs: str, verbose: bool = False,
|
|
72
|
-
certchain: str = None) -> str:
|
|
73
|
+
certchain: str = None, **kwargs) -> str:
|
|
73
74
|
'''
|
|
74
75
|
Uploads metadata to target dataverse collection. Returns persistentId.
|
|
75
76
|
|
|
@@ -85,9 +86,11 @@ def upload_meta(ldccat: str, url: str, key: str,#pylint: disable = too-many-argu
|
|
|
85
86
|
Target Dataverse collection short name
|
|
86
87
|
certchain : str
|
|
87
88
|
Path to LDC .PEM certificate chain
|
|
89
|
+
**kwargs
|
|
90
|
+
Other parameters, notably dv_contact_email and dv_contact_name
|
|
88
91
|
'''
|
|
89
|
-
stud = ldc.Ldc(ldccat, cert=certchain)
|
|
90
|
-
stud.
|
|
92
|
+
stud = ldc.Ldc(ldccat, cert=certchain, **kwargs)
|
|
93
|
+
stud.fetch_ldc_record()
|
|
91
94
|
if verbose:
|
|
92
95
|
print(f'Uploading {stud.ldc} metadata')
|
|
93
96
|
info = stud.upload_metadata(url=url, key=key, dv=dvs)
|
|
@@ -99,15 +102,16 @@ def main() -> None:
|
|
|
99
102
|
'''
|
|
100
103
|
parser = parse()
|
|
101
104
|
args = parser.parse_args()
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
+
dc_config = dc.Config()
|
|
106
|
+
contact_info={'dv_contact_name' : args.cname,
|
|
107
|
+
'dv_contact_email' : args.email}
|
|
108
|
+
dc_config.update(contact_info)
|
|
105
109
|
if args.tsv:
|
|
106
110
|
if len(args.studies) > 1:
|
|
107
111
|
print('Error: Only one LDC study may be processed with the -t/--tsv option')
|
|
108
112
|
sys.exit()
|
|
109
113
|
pid = upload_meta(args.studies[0], args.url, args.key,
|
|
110
|
-
args.dvs, args.verbose, args.certchain)
|
|
114
|
+
args.dvs, args.verbose, args.certchain, **dc_config)
|
|
111
115
|
if args.verbose:
|
|
112
116
|
print(f'Uploading files to {pid}')
|
|
113
117
|
with open(args.tsv, encoding='utf-8', newline='') as fil:
|
|
@@ -95,7 +95,6 @@ class Dverse():
|
|
|
95
95
|
return [x['storageIdentifier'].replace('file://', f'{self.hdl}:') for x in data
|
|
96
96
|
if x['type'] == 'dataset']
|
|
97
97
|
|
|
98
|
-
@property
|
|
99
98
|
def unreleased(self, all_stud: list = None) -> list:
|
|
100
99
|
'''
|
|
101
100
|
Finds only unreleased studies from a list of studies
|
|
@@ -208,7 +207,7 @@ def main():
|
|
|
208
207
|
args = parser.parse_args()
|
|
209
208
|
if args.dv:
|
|
210
209
|
the_dv = Dverse(args.url, args.key, args.dv)
|
|
211
|
-
un_rel = the_dv.unreleased
|
|
210
|
+
un_rel = the_dv.unreleased()
|
|
212
211
|
else:
|
|
213
212
|
un_rel = args.pid
|
|
214
213
|
if args.dryrun:
|
|
File without changes
|
|
File without changes
|
{dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/data/LDC_EULA_general.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_list_files.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_manifest_gen.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_pg_facet_date.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_readme_creator.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_record_copy.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_replace_licence.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_study_migrator.py
RENAMED
|
File without changes
|
{dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_upload_tsv.py
RENAMED
|
File without changes
|