skilleter-thingy 0.0.24__py3-none-any.whl → 0.0.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -1,383 +0,0 @@
- #!/usr/bin/env python3
-
- """Selectively synchronise an S3 bucket to a local destination.
- Similar to the aws s3 sync CLI command, but faster and with better
- options to filter files; it only downloads from S3 to local and
- doesn't support the huge range of command line options."""
-
- import os
- import argparse
- import sys
- import fnmatch
- import datetime
- import threading
- import queue
-
- from pathlib import Path
-
- import boto3
-
- from botocore.exceptions import ClientError
-
- ################################################################################
-
- # Number of download threads to run - doing the downloads in threads is about
- # six times faster than doing so sequentially.
-
- NUM_THREADS = 12
-
- # Translate our environment names to AWS ARNs
-
- AWS_ACCOUNT_ARNS = {
-     'prod': 'arn:aws:iam::459580378985:role/ERSReadOnlyRole',
-     'test': 'arn:aws:iam::094438481629:role/ERSReadOnlyRole',
-     'dev': 'arn:aws:iam::402653103803:role/ERSReadOnlyRole',
-     'mgmt': 'arn:aws:iam::125943076446:role/ERSReadOnlyRole',
-     'audit': 'arn:aws:iam::229627323276:role/ERSReadOnlyRole',
- }
-
- ################################################################################
-
- def error(msg, status=1):
-     """Report an error message and exit"""
-
-     print(f'ERROR: {msg}')
-     sys.exit(status)
-
- ################################################################################
-
- def verbose(args, msg):
-     """Report a message in verbose mode"""
-
-     if not args or args.verbose:
-         print(msg)
-
- ################################################################################
-
- def splitlist(lists, delimiter):
-     """Create a single list from a list of delimited strings"""
-
-     result = []
-
-     for item in lists or []:
-         result += item.split(delimiter)
-
-     return result
-
- ################################################################################
-
- def configure():
-     """Parse the command line"""
-
-     parser = argparse.ArgumentParser(description='Selectively sync an S3 bucket to a local directory')
-
-     parser.add_argument('--verbose', '-v', action='store_true', help='Report verbose progress information')
-
-     parser.add_argument('--profile', '-p', action='store', help='Specify the AWS profile')
-
-     parser.add_argument('--include', '-i', action='append', help='Comma-separated list of wildcards to sync - if specified, only files matching one or more of these are synced')
-     parser.add_argument('--exclude', '-x', action='append', help='Comma-separated list of wildcards NOT to sync - if specified, only files NOT matching any of these are synced')
-
-     parser.add_argument('--include-type', '-I', action='append',
-                         help='Comma-separated list of file types to sync - if specified, only files matching one or more of these are synced')
-     parser.add_argument('--exclude-type', '-X', action='append',
-                         help='Comma-separated list of file types NOT to sync - if specified, only files NOT matching any of these are synced')
-
-     # TODO: parser.add_argument('--delete', '-d', action='store_true', help='Delete local files that don\'t exist in the bucket')
-     parser.add_argument('--force', '-f', action='store_true', help='Always overwrite local files (by default files are only overwritten if they are older or a different size)')
-
-     parser.add_argument('--max-objects', '-m', action='store', type=int, help='Limit the number of objects to download')
-     parser.add_argument('--threads', '-t', action='store', type=int, default=NUM_THREADS, help='Number of parallel threads to run')
-     parser.add_argument('source', action='store', nargs=1, type=str, help='Name of the S3 bucket, optionally including path within the bucket')
-     parser.add_argument('destination', action='store', nargs=1, type=str, help='Name of the local directory to sync into')
-
-     args = parser.parse_args()
-
-     # The positional arguments arrive as 1-entry lists - convert them to single items
-
-     args.source = args.source[0]
-     args.destination = args.destination[0]
-
-     # Convert the include/exclude parameters to lists
-
-     args.include = splitlist(args.include, ',')
-     args.exclude = splitlist(args.exclude, ',')
-
-     args.include_type = splitlist(args.include_type, ',')
-     args.exclude_type = splitlist(args.exclude_type, ',')
-
-     return args
-
- ################################################################################
-
- def get_client(args):
-     """Create an S3 client for the specified profile"""
-
-     if args.profile:
-         profile = args.profile.split('-')[0]
-     else:
-         try:
-             profile = os.environ['AWS_PROFILE']
-         except KeyError:
-             error('The AWS profile must be specified via the AWS_PROFILE environment variable or the --profile command line option')
-
-     try:
-         arn = AWS_ACCOUNT_ARNS[profile]
-     except KeyError:
-         error(f'Invalid AWS profile "{profile}"')
-
-     sts_connection = boto3.client("sts")
-
-     try:
-         acct_b = sts_connection.assume_role(RoleArn=arn, RoleSessionName='s3-selective-sync')
-     except ClientError as exc:
-         error(exc.response["Error"]["Message"])
-
-     access_key = acct_b["Credentials"]["AccessKeyId"]
-     secret_key = acct_b["Credentials"]["SecretAccessKey"]
-     session_token = acct_b["Credentials"]["SessionToken"]
-
-     session = boto3.Session(
-         aws_access_key_id=access_key,
-         aws_secret_access_key=secret_key,
-         aws_session_token=session_token)
-
-     return session.client('s3')
-
- ################################################################################
-
- def download_filter(args, s3_client, s3_bucket, s3_object):
-     """Decide whether to download an object from S3
-     Returns True if the object should be downloaded, or False if it should be skipped."""
-
-     # Ignore directories
-
-     if s3_object['Key'][-1] == '/':
-         verbose(args, f'{s3_object["Key"]} is a prefix, so will be skipped')
-         return False
-
-     # Handle the object as a Path for simplicity
-
-     object_path = Path(s3_object['Key'])
-
-     # Filter according to wildcard - wildcards containing a '/' are matched
-     # against the full key, others against the filename alone
-
-     if args.include:
-         for wildcard in args.include:
-             if '/' in wildcard:
-                 if fnmatch.fnmatch(s3_object['Key'], wildcard):
-                     break
-             elif fnmatch.fnmatch(object_path.name, wildcard):
-                 break
-         else:
-             verbose(args, f'"{s3_object["Key"]}" does not match any include wildcards, so will be skipped')
-             return False
-
-     if args.exclude:
-         for wildcard in args.exclude:
-             if '/' in wildcard:
-                 if fnmatch.fnmatch(s3_object['Key'], wildcard):
-                     verbose(args, f'"{s3_object["Key"]}" matches one or more exclude wildcards, so will be skipped')
-                     return False
-             elif fnmatch.fnmatch(object_path.name, wildcard):
-                 verbose(args, f'"{s3_object["Key"]}" matches one or more exclude wildcards, so will be skipped')
-                 return False
-
-     # Filter according to content type
-
-     if args.include_type or args.exclude_type:
-         object_type = s3_client.head_object(Bucket=s3_bucket, Key=s3_object["Key"])['ContentType']
-
-         if args.include_type:
-             for include_type in args.include_type:
-                 if object_type == include_type:
-                     break
-             else:
-                 verbose(args, f'"{s3_object["Key"]}" is of type "{object_type}" which does not match any entries in the type include list, so will be skipped')
-                 return False
-
-         if args.exclude_type:
-             for exclude_type in args.exclude_type:
-                 if object_type == exclude_type:
-                     verbose(args, f'"{s3_object["Key"]}" is of type "{object_type}" which matches one of the entries in the type exclude list, so will be skipped')
-                     return False
-
-     # Unless we are in force-download mode, check if the destination file already exists and see if it needs to be overwritten
-
-     if not args.force:
-         dest_file = Path(args.destination) / object_path
-
-         if dest_file.exists():
-             # Overwrite if destination is older or a different size
-
-             dest_stat = dest_file.stat()
-             dest_timestamp = datetime.datetime.fromtimestamp(dest_stat.st_mtime, tz=datetime.timezone.utc)
-
-             if dest_timestamp >= s3_object['LastModified'] and dest_stat.st_size == s3_object['Size']:
-                 verbose(args, f'Destination file already exists with the same size and is the same age or newer, so "{s3_object["Key"]}" will be skipped')
-                 return False
-
-     return True
-
- ################################################################################
-
- def download(args, s3_client, mkdir_lock, bucket, s3_object):
-     """Attempt to download an object from S3 to an equivalent local location"""
-
-     local_path = Path(args.destination) / s3_object['Key']
-
-     with mkdir_lock:
-         if local_path.parent.exists():
-             if not local_path.parent.is_dir():
-                 error(f'Unable to download "{s3_object["Key"]}" as the destination path is not a directory')
-         else:
-             local_path.parent.mkdir(parents=True)
-
-     # Download the object and set the file timestamp to match the object
-
-     object_timestamp = s3_object['LastModified'].timestamp()
-     s3_client.download_file(bucket, s3_object['Key'], str(local_path))
-     os.utime(local_path, (object_timestamp, object_timestamp))
-
- ################################################################################
-
- def downloader(args, s3_client, mkdir_lock, bucket, object_queue, error_queue, sem_counter, real_thread=True):
-     """Download thread"""
-
-     finished = False
-     while not finished:
-         # Get the next object to download (waiting for one to be added to the queue).
-         # When running single-threaded the queue is fully populated before we are
-         # called, so don't block on an empty queue - just stop.
-
-         try:
-             s3_object = object_queue.get(block=real_thread)
-         except queue.Empty:
-             break
-
-         # If it is a candidate for downloading (meets the criteria specified on the command
-         # line and, unless force-downloading, hasn't already been downloaded) then attempt to
-         # download it.
-
-         # If the semaphore is being used to limit the number of downloads, attempt to acquire it.
-         # If we couldn't, then we've reached the download limit so we'll finish. The permit is
-         # deliberately never released, so the semaphore acts as a one-shot budget on the total
-         # number of downloads.
-
-         if download_filter(args, s3_client, bucket, s3_object):
-
-             if not sem_counter or sem_counter.acquire(blocking=False):
-                 print(f'Downloading "{s3_object["Key"]}"')
-                 try:
-                     download(args, s3_client, mkdir_lock, bucket, s3_object)
-                 except ClientError as exc:
-                     error_queue.put(f'Failed to download "{s3_object["Key"]}" - {exc.response["Error"]["Message"]}')
-                 else:
-                     print(f'    Done "{s3_object["Key"]}"')
-
-             else:
-                 finished = True
-
-         # Indicate the queued item has been consumed
-
-         object_queue.task_done()
-
-     # If we were using a download semaphore then drain the queue (this will happen in all
-     # threads and will never terminate, but we're running as a daemon so it doesn't matter too much).
-
-     if sem_counter and real_thread:
-         while True:
-             object_queue.get()
-             object_queue.task_done()
-
- ################################################################################
-
- def thread_exception_handler(args):
-     """Brute-force thread exception handler"""
-
-     _ = args
-     sys.exit(1)
-
- ################################################################################
-
- def main():
-     """Main program"""
-
-     args = configure()
-
-     s3_client = get_client(args)
-
-     bucket = args.source
-
-     # Remove the 's3://' prefix, if present, so that we can split bucket and folder
-     # if specified
-
-     if bucket.startswith('s3://'):
-         bucket = bucket[5:]
-
-     if '/' in bucket:
-         bucket, prefix = bucket.split('/', 1)
-     else:
-         prefix = ''
-
-     # Semaphore used as a counter to limit the total number of downloads
-
-     sem_counter = threading.Semaphore(value=args.max_objects) if args.max_objects else None
-
-     # Create the download queue and the worker threads
-
-     object_queue = queue.Queue()
-
-     # Create the queue for reporting errors back from the threads
-
-     error_queue = queue.Queue()
-
-     # Lock to prevent race conditions around directory creation
-
-     mkdir_lock = threading.Lock()
-
-     if args.threads > 1:
-         # Create threads
-
-         threading.excepthook = thread_exception_handler
-
-         for _ in range(args.threads):
-             thread = threading.Thread(target=downloader, daemon=True, args=(args, s3_client, mkdir_lock, bucket, object_queue, error_queue, sem_counter))
-             thread.start()
-
-     # Read all the objects in the bucket and queue them for consideration by the download workers
-
-     for page in s3_client.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix):
-         for s3_object in page.get('Contents', []):
-             object_queue.put(s3_object)
-
-     print('Finished queuing objects')
-
-     if args.threads > 1:
-         # Wait for the queues to drain
-
-         object_queue.join()
-     else:
-         downloader(args, s3_client, mkdir_lock, bucket, object_queue, error_queue, sem_counter, real_thread=False)
-
-     # Report any errors
-
-     if not error_queue.empty():
-         sys.stderr.write('\nErrors were encountered downloading some of the objects:\n\n')
-
-         while not error_queue.empty():
-             error_msg = error_queue.get()
-             sys.stderr.write(f'{error_msg}\n')
-             error_queue.task_done()
-
- ################################################################################
-
- def s3_sync():
-     """Entry point"""
-
-     try:
-         main()
-     except KeyboardInterrupt:
-         sys.exit(1)
-     except BrokenPipeError:
-         sys.exit(2)
-
- ################################################################################
-
- if __name__ == '__main__':
-     s3_sync()
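
The script above is built around a producer/consumer pipeline: the main thread pages through list_objects_v2 and pushes objects onto a queue.Queue, a pool of daemon threads consumes them, and when --max-objects is given a threading.Semaphore, acquired non-blocking and never released, acts as a one-shot budget on the total number of downloads. Below is a minimal, self-contained sketch of just that pattern; the names (worker, process_item, run, NUM_WORKERS, MAX_ITEMS) are illustrative and are not part of the package.

import queue
import threading

NUM_WORKERS = 4   # the script itself defaults to 12 download threads
MAX_ITEMS = 10    # stands in for the --max-objects limit

def process_item(item):
    """Placeholder for the real per-object S3 download."""
    print(f'Processing {item}')

def worker(work_queue, budget):
    """Consume items forever; each non-blocking acquire uses up one permit for good."""
    while True:
        item = work_queue.get()
        if budget is None or budget.acquire(blocking=False):
            process_item(item)
        # Mark the item consumed even when it was skipped, so join() can
        # return once the queue has been drained
        work_queue.task_done()

def run(items, max_items=None):
    work_queue = queue.Queue()
    budget = threading.Semaphore(value=max_items) if max_items else None

    # Daemon threads are torn down automatically when the main thread exits
    for _ in range(NUM_WORKERS):
        threading.Thread(target=worker, args=(work_queue, budget), daemon=True).start()

    for item in items:
        work_queue.put(item)

    work_queue.join()   # returns once every queued item has been handled

if __name__ == '__main__':
    run(range(25), max_items=MAX_ITEMS)

Run on its own, the sketch processes at most MAX_ITEMS of the 25 queued items and then exits cleanly, which is the same shape of behaviour the script relies on under --max-objects. For the real tool, an invocation along the lines of s3-sync --profile dev --include '*.json' s3://example-bucket/some/prefix ./local-dir would sync only the matching objects (the command name, bucket, and paths here are hypothetical; the installed command depends on the package's entry points).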