dataverse-utils 0.22.4__tar.gz → 0.22.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/PKG-INFO +3 -3
  2. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/pyproject.toml +3 -3
  3. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/__init__.py +4 -4
  4. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/collections.py +59 -20
  5. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/ldc.py +7 -5
  6. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_collection_info.py +87 -99
  7. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_ldc_uploader.py +11 -7
  8. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_release.py +1 -2
  9. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/LICENCE.md +0 -0
  10. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/README.md +0 -0
  11. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/data/LDC_EULA_general.md +0 -0
  12. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/dataverse_utils.py +0 -0
  13. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/dvdata.py +0 -0
  14. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_del.py +0 -0
  15. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_list_files.py +0 -0
  16. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_manifest_gen.py +0 -0
  17. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_pg_facet_date.py +0 -0
  18. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_readme_creator.py +0 -0
  19. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_record_copy.py +0 -0
  20. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_replace_licence.py +0 -0
  21. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_study_migrator.py +0 -0
  22. {dataverse_utils-0.22.4 → dataverse_utils-0.22.7}/src/dataverse_utils/scripts/dv_upload_tsv.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataverse-utils
3
- Version: 0.22.4
3
+ Version: 0.22.7
4
4
  Summary: Utilities for the Dataverse data respository system
5
5
  License: MIT
6
6
  License-File: LICENCE.md
@@ -16,14 +16,14 @@ Classifier: Programming Language :: Python :: 3.12
16
16
  Classifier: Programming Language :: Python :: 3.13
17
17
  Classifier: Programming Language :: Python :: 3.14
18
18
  Requires-Dist: bs4 (>=0.0.2,<0.0.3)
19
- Requires-Dist: chardet (>=5.2,<5.3)
19
+ Requires-Dist: chardet (>=5.2)
20
20
  Requires-Dist: dryad2dataverse (>=0.8.4,<0.9.0)
21
21
  Requires-Dist: markdown (>=3.10.2,<4.0.0)
22
22
  Requires-Dist: markdown-pdf (>=1.13.1,<2.0.0)
23
23
  Requires-Dist: markdownify (>=1.2.2,<2.0.0)
24
24
  Requires-Dist: pyreadr (>=0.5.4,<0.6.0)
25
25
  Requires-Dist: pyreadstat (>=1.3.3,<2.0.0)
26
- Requires-Dist: requests (>=2.30.0,<3.0.0)
26
+ Requires-Dist: requests (>=2.33,<3.0)
27
27
  Requires-Dist: requests-toolbelt (>=1.0.0,<2.0.0)
28
28
  Requires-Dist: tqdm (>=4.67.3,<5.0.0)
29
29
  Project-URL: Homepage, https://ubc-library-rc.github.io/dataverse_utils
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dataverse-utils"
3
- version = "0.22.4"
3
+ version = "0.22.7"
4
4
  description = "Utilities for the Dataverse data respository system"
5
5
  authors = [
6
6
  {name = "Paul Lesack",email = "paul.lesack@ubc.ca"}
@@ -10,7 +10,6 @@ readme = "README.md"
10
10
  requires-python = ">=3.10, <4.0"
11
11
  #When requests 2.33 is released, update poetry and release
12
12
  dependencies = [
13
- "requests (>=2.30.0,<3.0.0)",
14
13
  "bs4 (>=0.0.2,<0.0.3)",
15
14
  "markdown (>=3.10.2,<4.0.0)",
16
15
  "markdown-pdf (>=1.13.1,<2.0.0)",
@@ -20,7 +19,8 @@ dependencies = [
20
19
  "requests-toolbelt (>=1.0.0,<2.0.0)",
21
20
  "tqdm (>=4.67.3,<5.0.0)",
22
21
  "dryad2dataverse (>=0.8.4,<0.9.0)",
23
- "chardet (>=5.2,<5.3)"
22
+ "chardet (>=5.2)",
23
+ "requests (>=2.33,<3.0)"
24
24
  ]
25
25
  #Chardet and requests will need to be changed when requests goes to 2.32; at that
26
26
  #point just remove chardet
@@ -7,7 +7,7 @@ import pathlib
7
7
  import sys
8
8
  from dataverse_utils.dataverse_utils import *
9
9
 
10
- VERSION = (0, 22, 4)
10
+ VERSION = (0, 22, 7)
11
11
  __version__ = '.'.join([str(x) for x in VERSION])
12
12
 
13
13
  USERAGENT = (f'dataverse_utils/v{__version__} ({sys.platform.capitalize()}); '
@@ -15,14 +15,14 @@ USERAGENT = (f'dataverse_utils/v{__version__} ({sys.platform.capitalize()}); '
15
15
  UAHEADER = {'User-agent' : USERAGENT}
16
16
 
17
17
  SCRIPT_VERSIONS={
18
- 'dv_collection_info' : (0, 1, 3),
18
+ 'dv_collection_info' : (0, 3, 0),
19
19
  'dv_del' : (0, 2, 4),
20
- 'dv_ldc_uploader' : (0, 3, 0),
20
+ 'dv_ldc_uploader' : (0, 4, 1),
21
21
  'dv_list_files' : (0, 1, 1),
22
22
  'dv_manifest_gen' : (0, 5, 1),
23
23
  'dv_pg_facet_date' : (0, 1, 1),
24
24
  'dv_record_copy' : (0, 1, 2),
25
- 'dv_release' : (0, 1, 2),
25
+ 'dv_release' : (0, 1, 3),
26
26
  'dv_replace_licence' : (0, 1, 1),
27
27
  'dv_readme_creator' : (0, 1, 1),
28
28
  'dv_study_migrator' : (0, 4, 1),
@@ -9,6 +9,7 @@ import io
9
9
  import logging
10
10
  import pathlib
11
11
  import string
12
+ import sys
12
13
  import tempfile
13
14
  import textwrap
14
15
  import typing
@@ -80,11 +81,25 @@ class DvCollection:
80
81
  self.retry_strategy = RETRY
81
82
  else:
82
83
  self.retry_strategy = kwargs['retry']
84
+ self.collections = None
83
85
  self.session = requests.Session()
84
86
  self.session.mount('https://',
85
87
  requests.adapters.HTTPAdapter(max_retries=self.retry_strategy))
86
- self.collections = None
87
88
  self.studies = None
89
+ self.__root = None
90
+ self.all_colls = [self.root]
91
+
92
+ @property
93
+ def root(self):
94
+ '''
95
+ Return the name and short name of the top level collection
96
+ '''
97
+ if not self.__root:
98
+ x = self.session.get(f'{self.url}/api/dataverses/{self.coll}',
99
+ headers=self.headers)
100
+ x.raise_for_status()
101
+ self.__root = (x.json()['data']['name'], x.json()['data']['alias'])
102
+ return self.__root
88
103
 
89
104
  def __clean_url(self, badurl:str):
90
105
  '''
@@ -142,7 +157,6 @@ class DvCollection:
142
157
  out=self.__get_shortname(_['id'])
143
158
  dvs.append((_['title'], out))
144
159
  except Exception as e:
145
-
146
160
  obscure_error = f'''
147
161
  An error has occured where a collection can be
148
162
  identified by ID but its name cannot be determined.
@@ -155,12 +169,13 @@ class DvCollection:
155
169
 
156
170
  Problematic collection id number: {_.get("id",
157
171
  "not available")}'''
158
- print(50*'-')
159
- print(textwrap.dedent(obscure_error))
172
+ #to sys.stdout?
173
+ print(50*'-', file=sys.stderr)
174
+ print(textwrap.dedent(obscure_error), file=sys.stderr)
160
175
  print(e)
161
176
  LOGGER.error(textwrap.fill(textwrap.dedent(obscure_error).strip()))
162
177
  traceback.print_exc()
163
- print(50*'-')
178
+ print(50*'-', file=sys.stderr)
164
179
  raise e
165
180
  #---
166
181
  if not dvs:
@@ -171,6 +186,7 @@ class DvCollection:
171
186
  LOGGER.debug('recursive')
172
187
  self.get_collections(dv[1], output)
173
188
  self.collections = output
189
+ self.collections.append(self.root)
174
190
  return output
175
191
 
176
192
  def get_studies(self, root:str=None):
@@ -186,9 +202,8 @@ class DvCollection:
186
202
  if not root:
187
203
  root=self.coll
188
204
  all_studies = self.get_collection_listing(root)
189
- #collections = self.get_collections(root, self.url)
190
205
  collections = self.get_collections(root)
191
- for collection in collections:
206
+ for collection in tqdm.tqdm(collections):
192
207
  all_studies.extend(self.get_collection_listing(collection[1]))
193
208
  self.studies = all_studies
194
209
  return all_studies
@@ -207,12 +222,16 @@ class DvCollection:
207
222
  cl.raise_for_status()
208
223
  pids = [f"{z['protocol']}:{z['authority']}/{z['identifier']}"
209
224
  for z in cl.json()['data'] if z['type'] == 'dataset']
210
- out = [(self.get_study_info(pid), pid) for pid in pids]
225
+ #Pass collection info into the study because that's not available from
226
+ #a metadata download
227
+ smkwargs = [{'collection_name':_[0] , 'collection_short_name':_[1]}
228
+ for _ in self.collections if coll_id == _[1]][0]
229
+ out = [(self.get_study_info(pid, **smkwargs), pid) for pid in pids]
211
230
  for _ in out:
212
231
  _[0].update({'pid': _[1]})
213
232
  return [x[0] for x in out]
214
233
 
215
- def get_study_info(self, pid):
234
+ def get_study_info(self, pid, **kwargs):
216
235
  '''
217
236
  Returns a StudyMetadata object with complete metadata for a study.
218
237
 
@@ -220,13 +239,16 @@ class DvCollection:
220
239
  ----------
221
240
  pid : str
222
241
  Persistent ID of a Dataverse study
242
+
243
+ **kwargs
244
+ Other useful information to pass onto StudyMetadata, such as collection info, etc.
223
245
  '''
224
246
  meta = self.session.get(f'{self.url}/api/datasets/:persistentId',
225
247
  params={'persistentId': pid},
226
248
  headers=self.headers)
227
249
  meta.raise_for_status()
228
250
  LOGGER.debug(pid)
229
- return StudyMetadata(study_meta=meta.json(), key=self.__key, url=self.url)
251
+ return StudyMetadata(study_meta=meta.json(), key=self.__key, url=self.url, **kwargs)
230
252
 
231
253
  class StudyMetadata(dict):
232
254
  '''
@@ -268,6 +290,10 @@ class StudyMetadata(dict):
268
290
  self.all_versions = None
269
291
  self.url = kwargs.get('url')
270
292
  self.pid = kwargs.get('pid')
293
+ #If only there would be an easy way to check if something was deaccessioned
294
+ #without yet another request. But right now, let's assume it's fine.
295
+ #See below (under KeyError) where it gets set
296
+ self.deaccession_flag = 0
271
297
  if self.study_meta:
272
298
  #self.pid = kwargs.get('pid', (f"{self.study_meta['data']['protocol']}:"
273
299
  # f"{self.study_meta['data']['authority']}"
@@ -286,15 +312,23 @@ class StudyMetadata(dict):
286
312
  try:
287
313
  self.update(self.extract_metadata(self.study_meta['data']['latestVersion']))
288
314
  except KeyError as e:
289
- raise MetadataError(f'Unable to parse study metadata. Do you need an API key?\n'
290
- f'{e} key not found.\n'
291
- f'Offending JSON: {self.study_meta}') from e
315
+ if (self.study_meta.get('status') == 'OK' and not
316
+ self.study_meta['data'].get('latestVersion')):
317
+ # Latest version is not available because API strips out all
318
+ # citation metadata for deaccessioned studies but doesn't
319
+ # actually indicate this in any obvious manner
320
+ # This is further complicated because *all* the metadata
321
+ # we want is in the metadata blocks, which won't exist in the JSON
322
+ # because for some idiotic reason it's OK to expose it in the GUI
323
+ # but not via API.
324
+ self.deaccession_flag = 1
325
+ else:
326
+ raise MetadataError(f'Unable to parse study metadata. Do you need an API key?\n'
327
+ f'{e} key not found.\n'
328
+ f'Offending JSON: {self.study_meta}') from e
292
329
  self.__files = None
293
330
  self.__all_files = None
294
- #self.index = {f"{_['versionNumber']}.{_['versionMinorNumber']}": n
295
- # for n, _ in enumerate(self.all_versions['data'])}
296
- #self.index = {_: n for _, n in enumerate(self.versions)}
297
- self.index = dict(enumerate(self.versions))
331
+ self.index = {_: n for n, _ in enumerate(self.versions)}
298
332
 
299
333
  def __obtain_metadata(self):
300
334
  '''
@@ -354,6 +388,10 @@ class StudyMetadata(dict):
354
388
  tmp['versionStatement'] = f"{chunk['versionNumber']}.{chunk['versionMinorNumber']}"
355
389
  else:
356
390
  tmp['versionStatement'] = f"{chunk.get('versionState', '')}"
391
+
392
+ for _ in ['collection_name', 'collection_short_name']:
393
+ if self.kwargs.get(_):
394
+ tmp[_] = self.kwargs[_]
357
395
  return tmp
358
396
 
359
397
  def extract_field_metadata(self, field):
@@ -549,7 +587,7 @@ class StudyMetadata(dict):
549
587
 
550
588
  files = [self.flatten(_) for _ in filelist]
551
589
  for ff in files:
552
- ff.update({'dataset_persistentId': self.pid})
590
+ ff.update({'dataset_pid': self.pid})
553
591
  return files
554
592
 
555
593
  def __extract_files(self):
@@ -560,9 +598,11 @@ class StudyMetadata(dict):
560
598
  #but files would (usually) be an arbitrary number of files.
561
599
  #That bothers me on an intellectual level. Therefore, it will be attribute.
562
600
  #Iterate over StudyMetadata.files if you want to know the contents
563
- if not self.__files:
601
+ if not self.__files and not self.deaccession_flag:
564
602
  self.__files = self.extract_files(self.study_meta['data']
565
603
  ['latestVersion']['files'])
604
+ if self.deaccession_flag:
605
+ self.__files = []
566
606
 
567
607
  def __extract_licence_info(self, indict)->dict:
568
608
  '''
@@ -695,7 +735,6 @@ class ReadmeCreator:
695
735
  return f'{inkey}: \n'
696
736
  return f'{inkey}: '
697
737
 
698
-
699
738
  def __extract_files(self):
700
739
  '''
701
740
  Extract file level metadata, and write to self.__files.
@@ -14,6 +14,7 @@ import requests
14
14
  from requests.adapters import HTTPAdapter
15
15
  from bs4 import BeautifulSoup as bs
16
16
  import dryad2dataverse.serializer as ds
17
+ import dryad2dataverse.config as dc
17
18
  from dataverse_utils import UAHEADER
18
19
 
19
20
  #pylint: disable=invalid-name
@@ -29,7 +30,7 @@ class Ldc(ds.Serializer):#pylint: disable=too-many-instance-attributes
29
30
  An LDC item (eg, LDC2021T01)
30
31
  '''
31
32
  #pylint: disable=super-init-not-called, arguments-differ
32
- def __init__(self, ldc, cert=None):
33
+ def __init__(self, ldc, cert=None, **kwargs):
33
34
  '''
34
35
  Returns a dict with keys created from an LDC catalogue web
35
36
  page.
@@ -54,10 +55,11 @@ class Ldc(ds.Serializer):#pylint: disable=too-many-instance-attributes
54
55
  self.cert = cert
55
56
  self.session = requests.Session()
56
57
  self.session.mount('https://',
57
- HTTPAdapter(max_retries=ds.constants.RETRY_STRATEGY))
58
+ HTTPAdapter(max_retries=dc.RETRY_STRATEGY))
58
59
  if self.cert:
59
60
  self.cert = os.path.expanduser(self.cert)
60
61
  self.__fixdesc = None
62
+ self.kwargs = kwargs
61
63
 
62
64
  @property
63
65
  def ldcJson(self):
@@ -120,7 +122,7 @@ class Ldc(ds.Serializer):#pylint: disable=too-many-instance-attributes
120
122
  '''
121
123
  #pylint: disable=property-with-parameters
122
124
  if not maxsize:
123
- maxsize = ds.constants.MAX_UPLOAD
125
+ maxsize = self.kwargs.get('max_upload', 68719476736)
124
126
 
125
127
  @property
126
128
  def id(self):
@@ -129,7 +131,7 @@ class Ldc(ds.Serializer):#pylint: disable=too-many-instance-attributes
129
131
  '''
130
132
  return self.ldc
131
133
 
132
- def fetch_record(self, timeout=45):
134
+ def fetch_ldc_record(self, timeout=45):
133
135
  '''
134
136
  Downloads record from LDC website
135
137
 
@@ -150,7 +152,7 @@ class Ldc(ds.Serializer):#pylint: disable=too-many-instance-attributes
150
152
  page.
151
153
  '''
152
154
  if not self.ldcHtml:
153
- self.fetch_record()
155
+ self.fetch_ldc_record()
154
156
  soup = bs(self.ldcHtml, 'html.parser')
155
157
  #Should data just look in the *first* table? Specifically tbody?
156
158
  #Is it always the first? I assume yes.
@@ -5,8 +5,12 @@ outputs study metadata for the latest version
5
5
  import argparse
6
6
  import io
7
7
  import csv
8
+ import pathlib
9
+ import sqlite3
8
10
  import sys
9
11
  import textwrap
12
+
13
+ import pandas as pd # I could use sqlite but why go the hassle
10
14
  import dataverse_utils
11
15
  import dataverse_utils.collections as dvc
12
16
 
@@ -19,11 +23,14 @@ def parse() -> argparse.ArgumentParser():
19
23
  description = textwrap.fill(textwrap.dedent(
20
24
  '''
21
25
  Recursively parses a dataverse collection and
22
- outputs study metadata for the latest version.
26
+ outputs study and file metadata for the latest version.
23
27
 
24
28
  If analyzing publicly available collections, a
25
29
  dataverse API key for the target system is not
26
30
  required.
31
+
32
+ Study and file output can be joined on 'pid' (studies) and
33
+ 'dataset_pid' (files).
27
34
  '''), 80)
28
35
  parser = argparse.ArgumentParser(description=description,
29
36
  formatter_class=argparse.RawTextHelpFormatter)
@@ -32,27 +39,24 @@ def parse() -> argparse.ArgumentParser():
32
39
  'defaults to "https://abacus.library.ubc.ca"'))
33
40
  parser.add_argument('-k', '--key', required=False,
34
41
  help='API key', default=None)
42
+ parser.add_argument('output',
43
+ help=textwrap.fill(textwrap.dedent(
44
+ '''
45
+ Output file name prefix. If tsv output is chosen,
46
+ files will be saved as [prefix]_studies.tsv
47
+ and [prefix]_files.tsv.
48
+
49
+ If SQLite output is chosen, it will be a single file file: [prefix].sqlite3.
50
+ '''),80))
35
51
  parser.add_argument('-d', '--delimiter', required=False,
36
52
  help='Delimiter for output spreadsheet. Default: tab (\\t)',
37
53
  default='\t')
38
- parser.add_argument('-f', '--fields',
39
- help=textwrap.fill(('Record metadata fields to output. '
40
- 'For all fields, use "all". '
41
- 'Default: title, author. for '
42
- 'study metadata and file label, id for file metadata' )),
43
- nargs='*',
44
- default=['title', 'author', 'label', 'dataFile_id'])
45
- parser.add_argument('-o', '--output', help='Output file name.',
46
- required=False)
47
54
  parser.add_argument('-i','--include-all-versions',
48
55
  help='Include *all** versions, not just the current version',
49
56
  action='store_true')
50
- parser.add_argument('--files',
51
- help=textwrap.fill(('Show only the *files* associated with a study.'
52
- 'The output will contain the PID of the study '
53
- 'and the version (if applicable) so that study metadata '
54
- 'and file metadata can be linked')),
55
- action='store_true')
57
+ parser.add_argument('-s', '--sqlite',
58
+ help='Save output as SQLite3 database',
59
+ action='store_true')
56
60
  group = parser.add_argument_group(title='Harvest options',
57
61
  description=textwrap.fill(
58
62
  ' You can obtain info for *either* a recursive crawl '
@@ -64,18 +68,17 @@ def parse() -> argparse.ArgumentParser():
64
68
  help=('Dataverse collection shortname or id at the '
65
69
  'top of the tree'))
66
70
  mgroup.add_argument('-p', '--pid',
67
- help=('Dataverse study persistent identifier (DOI/handle)'
68
- 'top of the tree'))
71
+ help='Dataverse study persistent identifier (DOI/handle)')
69
72
  parser.add_argument('-v', '--version', action='version',
70
73
  version=dataverse_utils.script_ver_stmt(parser.prog),
71
74
  help='Show version number and exit')
72
75
  return parser
73
76
 
74
- def fields(args:argparse.ArgumentParser, all_studies)->dict:
77
+ def fields(include_all:bool, is_file:bool, all_studies)->dict:
75
78
  '''
76
79
  Outputs appropriate header fields based on argparse values
77
80
  '''
78
- match (args.include_all_versions, args.files):
81
+ match (include_all, is_file):
79
82
  case (0, 0):
80
83
  fieldnames = sorted(list(set(key for study in all_studies for key in study)))
81
84
  case (1, 0):
@@ -104,54 +107,6 @@ def fields(args:argparse.ArgumentParser, all_studies)->dict:
104
107
 
105
108
  return fieldnames
106
109
 
107
- def fields_no(args:argparse.ArgumentParser, all_studies, fmeta=False)->dict:
108
- '''
109
- Outputs appropriate header fields based on argparse values
110
- '''
111
- #print(args)
112
- match (args.include_all_versions, args.files, fmeta):
113
- case (0, 0, 0):
114
- fieldnames = sorted(list(set(key for study in all_studies for key in study)))
115
- case (1, 0, 0):
116
- fieldnames = sorted(list(set(key for study in all_studies
117
- for ver in study.versions
118
- for key in study.version_metadata(ver))))
119
- case (0, 1, 0):
120
- fieldnames = sorted(list(set(key for study in all_studies
121
- for file in study.files
122
- for key in file)))
123
- #this is actually an outer join
124
- #case (1, 1, 0):
125
- # fieldnames1 = sorted(list(set(key for study in coll_me.studies
126
- # for ver in study.versions
127
- # for file in study.version_files(ver)
128
- # for key in file)))
129
- # fieldnames = sorted(list(set(key for study in coll_me.studies
130
- # for ver in study.versions
131
- # for key in study.version_metadata(ver))))
132
- # fieldnames.extend(fieldnames1)
133
- case (1, 1, 0):
134
- fieldnames = sorted(list(set(key for study in all_studies
135
- for ver in study.versions
136
- for file in study.version_files(ver)
137
- for key in file)))
138
-
139
- case (1, 0, 1):
140
- fieldnames = sorted(list(set(key for ver in all_studies[0].versions
141
- for key in all_studies[0].version_metadata(ver))))
142
- case (1, 1, 1):
143
- fieldnames = sorted(list(set(key
144
- for ver in all_studies[0].versions
145
- for file in all_studies[0].version_files(ver)
146
- for key in file)))
147
- case (0, 1, 1):
148
- fieldnames = sorted(list(set(key for file in all_studies[0].files
149
- for key in file)))
150
-
151
- case (0, 0, 1):
152
- fieldnames = sorted(list(set(all_studies[0])))
153
- return fieldnames
154
-
155
110
  def output(study, include_all=False, file=False)->list:
156
111
  '''
157
112
  Returns a list of appropriately selected metadata
@@ -162,7 +117,7 @@ def output(study, include_all=False, file=False)->list:
162
117
  return [study]
163
118
  case (1,0):
164
119
  for v in study.versions:
165
- out.append(study.study_version_metadata(v))
120
+ out.append(study.version_metadata(v))
166
121
  return out
167
122
  case (0,1):
168
123
  return study.files
@@ -184,14 +139,24 @@ def output(study, include_all=False, file=False)->list:
184
139
  case _:
185
140
  return []
186
141
 
142
+ def extension(args:argparse.ArgumentParser):
143
+ '''
144
+ Return extension for output
145
+ '''
146
+ extype ={'\t' : '.tsv',
147
+ ',' : '.csv'}
148
+ if args.sqlite:
149
+ return '.sqlite3'
150
+ return extype.get(args.delimiter, '.txt')
151
+
187
152
  def main():
188
153
  '''
189
154
  You know what this is
190
155
  '''
191
- #pylint: disable=too-many-branches
156
+ #pylint: disable=too-many-branches, too-many-locals
192
157
  args = parse().parse_args()
193
158
  if args.collection:
194
- coll_me = dvc.DvCollection(args.url, args.collection, args.key)
159
+ coll_me = dvc.DvCollection(args.url, args.collection, args.key)
195
160
  try:
196
161
  coll_me.get_collections()
197
162
  except TypeError:
@@ -209,36 +174,59 @@ def main():
209
174
  except (KeyError, dataverse_utils.collections.MetadataError) as e:
210
175
  print(e, file=sys.stderr)
211
176
  sys.exit()
212
- #if 'all' in [x.lower() for x in args.fields] and args.collection:
213
- # fieldnames = fields(args, all_studies)
177
+ fname = {0: '_studies', 1:'_files'}
178
+ outdata = {}
179
+ for stud_file in range(2): # studies and files
180
+ fieldnames= fields(args.include_all_versions, stud_file, all_studies)
181
+ out = io.StringIO(newline='')
182
+ writer = csv.DictWriter(out,
183
+ fieldnames=fieldnames,
184
+ delimiter=args.delimiter,
185
+ quoting=csv.QUOTE_MINIMAL,
186
+ extrasaction='ignore')
187
+ writer.writeheader()
188
+ for stud in all_studies:
189
+ for row in output(stud, args.include_all_versions, stud_file):
190
+ data = {k:v.replace('\t',' ').replace('\r\n', ' ').replace('\n',' ')
191
+ if isinstance(v, str) else v
192
+ for k, v in row.items()}
193
+ writer.writerow(data)
194
+ out.seek(0)
195
+ outdata[fname[stud_file][1:]] = out
196
+ if not args.sqlite:
197
+ outf = pathlib.Path(args.output+f'{fname[stud_file]}{extension(args)}').expanduser()
198
+ with open(outf,
199
+ 'w', encoding='utf-8') as f:
200
+ print(f'Writing {str(outf)}', file=sys.stdout)
201
+ f.write(out.read())
214
202
 
215
- #if 'all' in [x.lower() for x in args.fields] and args.pid:
216
- # fieldnames = fields(args, all_studies, 1)
217
- if 'all' in [x.lower() for x in args.fields]:
218
- fieldnames = fields(args, all_studies)
219
203
 
220
- else:
221
- fieldnames = args.fields[2:] if args.files else args.fields[:2]
222
- out = io.StringIO(newline='')
223
- writer = csv.DictWriter(out,
224
- fieldnames=fieldnames,
225
- delimiter=args.delimiter,
226
- quoting=csv.QUOTE_MINIMAL,
227
- extrasaction='ignore')
228
- writer.writeheader()
229
- #for stud in coll_me.studies:
230
- for stud in all_studies:
231
- for row in output(stud, args.include_all_versions, args.files):
232
- writer.writerow({k:v.replace('\t',' ').replace('\r\n', ' ').replace('\n',' ')
233
- if isinstance(v, str) else v
234
- for k, v in row.items()})
235
- out.seek(0)
236
- if args.output:
237
- with open(args.output, mode='w', encoding='utf-8', newline='') as f:
238
- f.write(out.read())
239
- return
240
- else:
241
- print(out.read())
204
+ if args.sqlite:
205
+ print(f'Writing {str(pathlib.Path(args.output+extension(args)).expanduser())}',
206
+ file=sys.stdout)
207
+ conn = sqlite3.connect(pathlib.Path(args.output+extension(args)).expanduser())
208
+ for k,v in outdata.items():
209
+ x=pd.read_csv(v, delimiter=args.delimiter)
210
+ x.to_sql(k, conn, if_exists='replace', index=0)
211
+ cursor = conn.cursor()
212
+ cursor.execute('DROP VIEW IF EXISTS short_combined_view;')
213
+ query = textwrap.fill(textwrap.dedent(
214
+ '''CREATE VIEW short_combined_view AS
215
+ SELECT studies.pid AS pid,
216
+ studies.authorName AS author,
217
+ studies.title AS title,
218
+ studies.dateOfDeposit AS deposit_date,
219
+ studies.versionStatement AS version_statement,
220
+ files.dataFile_filename AS file_name,
221
+ files.dataFile_id AS file_id,
222
+ files.restricted AS restricted,
223
+ files.version AS file_version
224
+ FROM studies
225
+ INNER JOIN files ON studies.pid = files.dataset_pid;
226
+ '''
227
+ ),80)
228
+ cursor.execute(query)
229
+ conn.close()
242
230
 
243
231
  if __name__ == '__main__':
244
232
  main()
@@ -6,6 +6,7 @@ python3 uploadme.py LDC20201S01 . . . LDC2021T21 apikey
6
6
  '''
7
7
  import argparse
8
8
  import sys
9
+ import dryad2dataverse.config as dc
9
10
  import dataverse_utils as du
10
11
  from dataverse_utils import ldc
11
12
 
@@ -69,7 +70,7 @@ def parse() -> argparse.ArgumentParser():
69
70
 
70
71
  def upload_meta(ldccat: str, url: str, key: str,#pylint: disable = too-many-arguments, too-many-positional-arguments
71
72
  dvs: str, verbose: bool = False,
72
- certchain: str = None) -> str:
73
+ certchain: str = None, **kwargs) -> str:
73
74
  '''
74
75
  Uploads metadata to target dataverse collection. Returns persistentId.
75
76
 
@@ -85,9 +86,11 @@ def upload_meta(ldccat: str, url: str, key: str,#pylint: disable = too-many-argu
85
86
  Target Dataverse collection short name
86
87
  certchain : str
87
88
  Path to LDC .PEM certificate chain
89
+ **kwargs
90
+ Other parameters, notably dv_contact_email and dv_contact_name
88
91
  '''
89
- stud = ldc.Ldc(ldccat, cert=certchain)
90
- stud.fetch_record()
92
+ stud = ldc.Ldc(ldccat, cert=certchain, **kwargs)
93
+ stud.fetch_ldc_record()
91
94
  if verbose:
92
95
  print(f'Uploading {stud.ldc} metadata')
93
96
  info = stud.upload_metadata(url=url, key=key, dv=dvs)
@@ -99,15 +102,16 @@ def main() -> None:
99
102
  '''
100
103
  parser = parse()
101
104
  args = parser.parse_args()
102
- ldc.ds.constants.DV_CONTACT_EMAIL = args.email
103
- ldc.ds.constants.DV_CONTACT_NAME = args.cname
104
- #print(args)
105
+ dc_config = dc.Config()
106
+ contact_info={'dv_contact_name' : args.cname,
107
+ 'dv_contact_email' : args.email}
108
+ dc_config.update(contact_info)
105
109
  if args.tsv:
106
110
  if len(args.studies) > 1:
107
111
  print('Error: Only one LDC study may be processed with the -t/--tsv option')
108
112
  sys.exit()
109
113
  pid = upload_meta(args.studies[0], args.url, args.key,
110
- args.dvs, args.verbose, args.certchain)
114
+ args.dvs, args.verbose, args.certchain, **dc_config)
111
115
  if args.verbose:
112
116
  print(f'Uploading files to {pid}')
113
117
  with open(args.tsv, encoding='utf-8', newline='') as fil:
@@ -95,7 +95,6 @@ class Dverse():
95
95
  return [x['storageIdentifier'].replace('file://', f'{self.hdl}:') for x in data
96
96
  if x['type'] == 'dataset']
97
97
 
98
- @property
99
98
  def unreleased(self, all_stud: list = None) -> list:
100
99
  '''
101
100
  Finds only unreleased studies from a list of studies
@@ -208,7 +207,7 @@ def main():
208
207
  args = parser.parse_args()
209
208
  if args.dv:
210
209
  the_dv = Dverse(args.url, args.key, args.dv)
211
- un_rel = the_dv.unreleased
210
+ un_rel = the_dv.unreleased()
212
211
  else:
213
212
  un_rel = args.pid
214
213
  if args.dryrun: