sxs 2024.0.43__py3-none-any.whl → 2025.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sxs/caltechdata/login.py DELETED
@@ -1,506 +0,0 @@
- """Class encapsulating interactions with CaltechDATA
-
- This class handles requests through the CaltechDATA web API. In particular, it manages authorization,
- and retries any failed requests automatically.
-
- """
-
- import requests
-
- class CaltechDATAAuth(requests.auth.AuthBase):
-     def __init__(self, base_url, access_token):
-         self.base_url = base_url
-         self.access_token = access_token
-         #super(CaltechDATAAuth, self).__init__()
-
-     def __call__(self, request):
-         if request.url.startswith(self.base_url):
-             request.headers["Authorization"] = f"Bearer {self.access_token}"
-         return request
-
-     def __eq__(self, other):
-         return all(
-             [
-                 self.base_url == getattr(other, "base_url", None),
-                 self.access_token == getattr(other, "access_token", None),
-             ]
-         )
-
-     def __ne__(self, other):
-         return not self == other
-
-
- class Login(object):
-     def __init__(
-         self,
-         url="https://cd-sandbox.tind.io/",
-         access_token=None,
-         access_token_path=None,
-         total_retry_count=50,
-         backoff_factor=0.1,
-         backoff_max=20.0,
-         session=None,
-     ):
-         """Initialize a Login object for interacting with CaltechDATA
-
-         This object encapsulates the credentials needed to interact with the CaltechDATA API, and
-         exposes a Session object that can be used to make requests which automatically include the
-         credentials. It can be used for generic requests, but note that other objects in this
-         module make certain tasks easier -- such as creating or modifying a "deposit", which is
-         CaltechDATA's name for a new upload. The Deposit object should be created from this object.
-
-         These actions require a CaltechDATA API personal access token. These are obtained from the
-         website -- either
-             https://data.caltech.edu/account/settings/applications/tokens/new/
-         or
-             https://cd-sandbox.tind.io/account/settings/applications/tokens/new/
-         Note that these two options use separate login systems and separate access tokens. The
-         access token may either be passed as a string to this function (though that means it will
-         probably be found in some script file somewhere, which is probably not a good idea for
-         security), or can be read from a file. By default the file from which the token is read is
-         '~/.credentials/caltechdata/access_token' or the same name with '_sandbox' appended. Thus,
-         it is probably easiest to simply place your access tokens in those files, so that no
-         arguments need to be passed to this function. As a basic security measure, please ensure
-         that those files are not readable by anyone but the user.
-
-         Parameters
-         ----------
-         url : str [default: "https://cd-sandbox.tind.io/"]
-             The base URL of the archive. Note that the default URL is the "sandbox" version for
-             CaltechDATA, which is just for testing purposes, and will likely be cleaned out
-             regularly. To upload to the archival site, pass its URL: "https://data.caltech.edu/".
-
-         access_token: string or None [default: None]
-             If present, this is used as the CaltechDATA API access token.
-
-         access_token_path: string or None [default: None]
-             If `access_token` is not given, this file is read and the first line is used as the
-             access token. If this argument is None, it defaults to either
-             '~/.credentials/caltechdata/access_token' for the regular website or
-             '~/.credentials/caltechdata/access_token_sandbox' for the sandbox website.
-
-         total_retry_count: int [default: 50]
-             Total number of times to retry requests that fail for retry-able reasons.
-
-         backoff_factor: float [default: 0.1]
-             A delay factor to apply between requests after the second try (most errors are resolved
-             immediately by a second try without a delay). After a certain number of total retries,
-             the request Session will sleep for:
-
-                 {backoff factor} * (2 ^ ({number of total retries} - 1))
-
-             seconds before trying again. For example, if the `backoff_factor` is 0.1, then the
-             session will sleep for [0.0s, 0.2s, 0.4s, 0.8s, ...] between retries. It will never be
-             longer than `backoff_max`.
-
-         backoff_max: float [default: 20.0]
-             Longest time (in seconds) to wait between retries.
-
-         session: requests.Session or None [default: None]
-             This is the object that handles all of the requests made to the API. If `None`, a
-             Session is created for you, and sensible default headers (including the access token)
-             are created. If you need to adjust some of the Session parameters like proxies or SSL
-             verification, you can simply create your own and pass it in here. Note that any `auth`
-             property on the passed object will be replaced by one that adds this to the header of
-             each request to the chosen CaltechDATA domain:
-                 {"Authorization": "Bearer <YourAccessTokenHere>"}
-
-         """
-         import os
-         import requests
-         from requests.adapters import HTTPAdapter
-         from urllib3.util.retry import Retry
-         from datacite import DataCiteRESTClient
-
-         self.base_url = url
-
-         # The `session` object will handle all requests we make.
-         self.session = session or requests.Session()
-
-         # Set the CaltechDATA API access token
-         if "sandbox" in url:
-             access_token_path = "~/.credentials/caltechdata/access_token_sandbox"
-             doi_auth_path = "~/.credentials/caltechdata/doi_auth_sandbox"
-             self.doi_prefix = "10.80269"
-             test_mode = True
-         else:
-             access_token_path = "~/.credentials/caltechdata/access_token"
-             doi_auth_path = "~/.credentials/caltechdata/doi_auth"
-             self.doi_prefix = "10.22002"
-             test_mode = False
-         with open(os.path.expanduser(doi_auth_path), "r") as f:
-             user, password = f.readline().strip().split(":", 1)
-         self.datacite = DataCiteRESTClient(
-             username=user,
-             password=password,
-             prefix=self.doi_prefix,
-             test_mode=test_mode,
-         )
-         path = os.path.expanduser(access_token_path)
-         try:
-             with open(path, "r") as f:
-                 self.access_token = f.readline().strip()
-         except IOError:
-             print("Unable to find the CaltechDATA access token needed to change a record.")
-             print(f"Failed to open file '{path}' for reading.")
-             raise
-         if not self.access_token:
-             print(f"The file '{path}' did not contain any text on the first line.")
-             print("This should be a CaltechDATA access token, which is needed to change a record.")
-             raise ValueError("Deposit requires a CaltechDATA access token")
-
-         # Ensure that this session sends the Authorization header with every request to the base_url
-         self.session.auth = CaltechDATAAuth(self.base_url, self.access_token)
-
-         # Note that some requests require different choices for 'Accept' and 'Content-Type'; these
-         # are altered in the corresponding methods below.
-         default_headers = {
-             "Accept": "application/json",
-             "Content-Type": "application/json",
-         }
-         self.session.headers.update(default_headers)
-
-         ## Retry automatically on certain types of errors
-         Retry.BACKOFF_MAX = backoff_max # Must be set on the class, not the instance
-         retry = Retry(
-             total=total_retry_count,
-             backoff_factor=backoff_factor,
-             status_forcelist=[
-                 500,
-                 502,
-                 503,
-                 504,
-             ],
-         )
-         adapter = HTTPAdapter(max_retries=retry)
-         self.session.mount(self.base_url, adapter)
-
-         # Test to see if we can use the given access token
-         url = "{0}api/records".format(self.base_url)
-         r = self.session.get(url)
-         if r.status_code != 200:
-             if r.status_code == 401:
-                 print(
-                     f"The given CaltechDATA access token was not accepted by {self.base_url}. Please ensure that it is still valid."
-                 )
-                 print(
-                     "Also note that the standard site and the sandbox site use separate logins and separate access tokens."
-                 )
-             else:
-                 print(f"An unknown error occurred when trying to access {self.base_url}.")
-             try:
-                 print(r.json())
-             except:
-                 pass
-             r.raise_for_status()
-             raise RuntimeError() # Will only happen if the response was not strictly an HTTP error
-
-     def send_s3(self, path, name=None, verbose=False):
-         import sys
-         import pathlib
-         #import tqdm
-
-         if name is None:
-             name = str(path)
-         path = pathlib.Path(path).expanduser().resolve()
-         size = path.stat().st_size
-
-         if verbose:
-             print(f" Uploading {name} ({size:_} B) ", end="", flush=True)
-
-         s3url = f"{self.base_url}tindfiles/sign_s3/" # Note trailing slash
-         chkurl = f"{self.base_url}tindfiles/md5_s3"
-
-         r = self.session.get(s3url)
-         if r.status_code != 200:
-             if r.status_code == 401:
-                 print(
-                     f"The given CaltechDATA access token was not accepted by {self.base_url}. Please ensure that it is still valid."
-                 )
-                 print(
-                     "Also note that the standard site and the sandbox site use separate logins and separate access tokens."
-                 )
-                 print(f"Used headers {r.request.headers}")
-             else:
-                 print(f"An unknown error occurred when trying to access {self.base_url}.")
-             try:
-                 print(r.json())
-             except:
-                 pass
-             r.raise_for_status()
-             raise RuntimeError() # Will only happen if the response was not strictly an HTTP error
-         s3 = r.json()
-         data = s3["data"]
-         bucket = s3["bucket"]
-
-         key = data["fields"]["key"]
-         policy = data["fields"]["policy"]
-         aid = data["fields"]["AWSAccessKeyId"]
-         signature = data["fields"]["signature"]
-         url = data["url"]
-
-         s3headers = {
-             "Host": f"{bucket}.s3.amazonaws.com",
-             "Date": "date",
-             "x-amz-acl": "public-read",
-             "Access-Control-Allow-Origin": "*",
-         }
-
-         with path.open("rb") as f:
-             #with tqdm.tqdm.wrapattr(f, "read", total=size, desc=" ") as fw: # To monitor upload progress
-             form = (
-                 ("key", key),
-                 ("acl", "public-read"),
-                 ("AWSAccessKeyID", aid),
-                 ("policy", policy),
-                 ("signature", signature),
-                 ("file", f),
-             )
-             response = requests.session().post(url, files=form, headers=s3headers)
-         if response.status_code != 204:
-             if response.status_code == 400:
-                 print(f"Bad request: Probably caused by incorrectly formed input")
-                 print(f"Used headers {response.request.headers}")
-             else:
-                 print(f"An unknown error occurred when trying to access {self.base_url}.")
-             try:
-                 print(response.json())
-             except:
-                 pass
-             try:
-                 print(response.text)
-             except:
-                 pass
-             response.raise_for_status()
-             raise RuntimeError() # Will only happen if the response was not strictly an HTTP error
-
-         response = self.session.get(f"{chkurl}/{bucket}/{key}/")
-         md5 = response.json()["md5"]
-
-         fileinfo = {"url": key, "filename": name, "md5": md5, "size": size}
-
-         if verbose:
-             print(f"✓")
-
-         return fileinfo
-
-     def download(self, url, path):
-         """Download large file efficiently
-
-         Parameters
-         ----------
-         url: string
-             The URL to download from. Redirects are followed.
-         path: string
-             Relative or absolute path to the file in which the download will be stored. If this is
-             an existing directory or ends in a path separator, the "path" component of the URL will
-             be used as the file name, and the full directory path will be created.
-
-         """
-         from shutil import copyfileobj
-         from os import makedirs
-         from os.path import split, exists, join, isdir
-         from functools import partial
-         from urllib.parse import urlparse
-
-         url_path = urlparse(url).path
-         if isdir(path):
-             path = join(path, url_path[1:])
-             directory, filename = split(path)
-             if not exists(directory):
-                 makedirs(directory)
-             local_filename = join(directory, filename)
-         else:
-             directory, filename = split(path)
-             if not exists(directory):
-                 makedirs(directory)
-             if not filename:
-                 filename = url_path
-             local_filename = join(directory, filename)
-         r = self.session.get(url, stream=True, allow_redirects=True)
-         if r.status_code != 200:
-             print("An error occurred when trying to access <{0}>.".format(url))
-             try:
-                 print(r.json())
-             except:
-                 pass
-             r.raise_for_status()
-             raise RuntimeError() # Will only happen if the response was not strictly an error
-         r.raw.read = partial(r.raw.read, decode_content=True)
-         # r.raw.decode_content = True
-         with open(local_filename, "wb") as f:
-             copyfileobj(r.raw, f)
-         return local_filename
-
-     @property
-     def new_deposit(self):
-         """Create a new Deposit object using this login"""
-         return self.deposit()
-
-     def deposit(self, deposition_id=None, ignore_deletion=False):
-         """Retrieve a deposit created with this login"""
-         from .deposit import Deposit
-
-         return Deposit(self, deposition_id, ignore_deletion)
-
-     def search(self, q=None, sort=None, size=1000, page=1, allversions=False, max_pages=10):
-         """Return list of dictionaries describing each deposit created with this login
-
-         It is possible to filter the results using the optional parameters. Note that the web interface
-         can sometimes be used to find search parameters by looking in the `search-hidden-params`
-         parameter of the `invenio-search` tag.
-
-         Example queries
-         ---------------
-         'title:"SXS:BBH:0003"' # Finds titles with given string; use quotes for robustness
-         'communities:sxs' # Records in the 'sxs' CaltechDATA community
-         'provisional_communities:sxs' # Records awaiting approval by the community curator
-         'owners: 38418' # Find records by id number of owner
-
-         Optional parameters
-         -------------------
-         q: string [optional]
-             Search query, using Elasticsearch query string syntax. See
-             https://help.zenodo.org/guides/search/ for details.
-         sort: string [optional]
-             Sort order ('bestmatch' or 'mostrecent'). Prefix with minus to change from ascending to
-             descending (e.g., '-mostrecent').
-         size: int [optional, defaults to 1000]
-             Number of results to return per page. Note that CaltechDATA (as of this writing) seems to
-             place a hard limit of 9999 responses. Anything more will result in an error. Use
-             multiple pages to get more results.
-         page: int [optional, defaults to 1]
-             Page number for pagination
-         allversions: bool [optional, defaults to False]
-             If True return all records, including older versions of published records.
-         max_pages: int [optional, defaults to 10]
-             If the query returns a number of records equal to `size`, it is evidently incomplete.
-             This function will attempt to retrieve successive pages until the number of records is
-             less than `size`. If the query is still incomplete after this many pages, just return
-             what we've got.
-
-         """
-         params = {}
-         if q is not None:
-             params["q"] = q
-         if sort is not None and sort in ["bestmatch", "mostrecent", "-bestmatch", "-mostrecent"]:
-             params["sort"] = sort
-         params["page"] = page
-         params["size"] = size
-         if allversions:
-             params["allversions"] = ""
-
-         url = "{0}api/records".format(self.base_url)
-         r = self.session.get(url, params=params)
-         if r.status_code != 200:
-             print("An unknown error occurred when trying to access {0}.".format(url))
-             print('The search parameters were "{0}"'.format(params))
-             try:
-                 print(r.json())
-             except:
-                 pass
-             r.raise_for_status()
-             raise RuntimeError() # Will only happen if the response was not strictly an error
-
-         json = r.json()
-         if len(json) == size:
-             page += 1
-             if page > max_pages:
-                 print("Search is not yet complete after {0} pages; returning with what we have.".format(max_pages))
-                 return json # Note: This will percolate back up the recursion to include other results
-             return json + self.search(
-                 q=q, sort=sort, page=page, size=size, allversions=allversions, max_pages=max_pages
-             )
-
-         return json
-
-     def delete_untitled_empty_deposits(self):
-         deleted_deposits = 0
-         deposits = self.search()
-         for d in deposits:
-             try:
-                 if d["title"] == "":
-                     d = self.deposit(d["id"], ignore_deletion=True)
-                     if not d.files:
-                         d.delete_deposit(confirmed=True)
-                         deleted_deposits += 1
-             except:
-                 pass
-         print("Deleted {0} deposits".format(deleted_deposits))
-
-     def discard_all_drafts(self):
-         discarded_drafts = 0
-         deposits = self.search()
-         for d in deposits:
-             try:
-                 if d["state"] == "inprogress":
-                     d = self.deposit(d["id"], ignore_deletion=True)
-                     d.discard()
-                     discarded_drafts += 1
-             except:
-                 pass
-         print("Discarded {0} drafts".format(discarded_drafts))
-
-     def awaiting_approval(self, community_id):
-         """List all records awaiting approval for the given community"""
-         url = "{0}/api/records/?q=provisional_communities:{1}".format(self.base_url, community_id)
-         r = self.session.get(url)
-         if r.status_code != 200:
-             print("Unable to find any records for community {0}.".format(community_id))
-             try:
-                 print(r.json())
-             except:
-                 pass
-             r.raise_for_status()
-             raise RuntimeError() # Will only happen if the response was not strictly an error
-         return r.json()
-
-     def community_curate_accept(self, community_id, record_id):
-         """Accept a record into the community"""
-         url = "{0}/communities/{1}/curaterecord/".format(self.base_url, community_id)
-         data = {"recid": int(record_id), "action": "accept"}
-         r = self.session.post(url, json=data)
-         if r.status_code != 200:
-             print(
-                 "Unable to accept record id {0} into community {1}; status code={2}.".format(
-                     record_id, community_id, r.status_code
-                 )
-             )
-             try:
-                 r_json = r.json()
-                 print("Response JSON:")
-                 print(r_json)
-             except:
-                 pass
-             r.raise_for_status()
-             raise RuntimeError() # Will only happen if the response was not strictly an error
-         return r.json()
-
-     def total_deposit_size(self, deposition_id=None, human_readable=True):
-         import math
-
-         def convert_size(size_bytes):
-             if size_bytes == 0:
-                 return "0B"
-             size_name = ("B ", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB")
-             i = int(math.floor(math.log(size_bytes, 1024)))
-             p = math.pow(1024, i)
-             s = round(size_bytes / p, 3)
-             return "{0:>8.3f} {1}".format(s, size_name[i])
-
-         if deposition_id is None:
-             depositions = self.search()
-         else:
-             depositions = [self.deposit(deposition_id, ignore_deletion=True).representation]
-         total_size = 0
-         for deposition in depositions:
-             id = deposition["id"]
-             d = self.deposit(id, ignore_deletion=True)
-             d_total_size = sum([f["filesize"] for f in d.files])
-             print('{1} in "{2}" (CaltechDATA ID {0})'.format(id, convert_size(d_total_size), d.title))
-             total_size += d_total_size
-         print("{0} in {1} deposits".format(convert_size(total_size), len(depositions)))
-         if human_readable:
-             return convert_size(total_size) # Note: the return type will be str
-         else:
-             return total_size # Note: the return type will be int
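
For context on what the deleted module configured: `Login.__init__` attached a Bearer token to every request under the chosen base URL (via the `CaltechDATAAuth` class above) and mounted an `HTTPAdapter` that retries transient server errors. The following is a minimal sketch of that session setup using only standard `requests`/`urllib3` APIs; the class name `BearerAuth`, the token placeholder, and the specific retry values shown are illustrative, not part of the sxs package API.

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


class BearerAuth(requests.auth.AuthBase):
    """Illustrative stand-in for the removed CaltechDATAAuth: adds the Bearer header
    only to requests whose URL starts with the chosen base URL."""

    def __init__(self, base_url, access_token):
        self.base_url = base_url
        self.access_token = access_token

    def __call__(self, request):
        if request.url.startswith(self.base_url):
            request.headers["Authorization"] = f"Bearer {self.access_token}"
        return request


base_url = "https://data.caltech.edu/"  # the sandbox site uses a separate login and token
session = requests.Session()
session.auth = BearerAuth(base_url, "<YourAccessTokenHere>")
session.headers.update({"Accept": "application/json", "Content-Type": "application/json"})

# Retry transient server errors, as the deleted __init__ did; it additionally capped the
# sleep between attempts by setting Retry.BACKOFF_MAX on the class (urllib3 2.x instead
# accepts a backoff_max argument).
retry = Retry(total=50, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
session.mount(base_url, HTTPAdapter(max_retries=retry))

r = session.get(f"{base_url}api/records")  # quick check that the token is accepted
r.raise_for_status()
```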
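The `backoff_factor` docstring above describes the delay sequence [0.0s, 0.2s, 0.4s, 0.8s, ...] for a factor of 0.1, capped at `backoff_max`. A small sketch of that schedule follows; `backoff_delays` is a hypothetical helper, and the exact timing in practice depends on the installed urllib3 version.

```python
def backoff_delays(n_retries, backoff_factor=0.1, backoff_max=20.0):
    """Sleep (in seconds) before each retry, per the schedule described in the docstring:
    no delay before the first retry, then backoff_factor * 2**(retry - 1), capped at backoff_max."""
    return [
        0.0 if retry == 1 else min(backoff_max, backoff_factor * 2 ** (retry - 1))
        for retry in range(1, n_retries + 1)
    ]


print(backoff_delays(10))
# [0.0, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 20.0, 20.0]
```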