geoseeq 0.5.6a7__py3-none-any.whl → 0.5.6a8__py3-none-any.whl

geoseeq/cli/main.py CHANGED
@@ -53,7 +53,7 @@ def version():
     Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
     Run `geoseeq eula show` to view the EULA.
     """
-    click.echo('0.5.6a7') # remember to update setup
+    click.echo('0.5.6a8') # remember to update setup
 
 
 @main.group('advanced')

geoseeq/cli/upload/upload.py CHANGED
@@ -107,7 +107,8 @@ def cli_upload_file(state, cores, yes, private, link_type, recursive, hidden, ge
         link_type=link_type,
         progress_tracker_factory=PBarManager().get_new_bar,
         log_level=state.log_level,
-        overwrite=True
+        overwrite=True,
+        use_cache=state.use_cache,
     )
     for geoseeq_file_name, file_path in name_pairs:
         if isfile(file_path):
@@ -140,7 +141,8 @@ def cli_upload_folder(state, cores, yes, private, recursive, hidden, project_or_
         link_type='upload',
         progress_tracker_factory=PBarManager().get_new_bar,
         log_level=logging.INFO,
-        overwrite=True
+        overwrite=True,
+        use_cache=state.use_cache,
     )
     for folder_name in folder_names:
         result_folder = root_obj.result_folder(folder_name).idem()

geoseeq/cli/upload/upload_reads.py CHANGED
@@ -95,6 +95,7 @@ def _do_upload(groups, module_name, link_type, lib, filepaths, overwrite, cores,
         log_level=state.log_level,
         overwrite=overwrite,
         progress_tracker_factory=PBarManager().get_new_bar,
+        use_cache=state.use_cache,
     )
     for group in groups:
         sample = lib.sample(group['sample_name']).idem()

geoseeq/file_system_cache.py CHANGED
@@ -15,7 +15,7 @@ CACHE_DIR = join(
     "geoseeq"
 )
 USE_GEOSEEQ_CACHE = None
-
+GEOSEEQ_CACHE_DIR = abspath(f'{CACHE_DIR}/geoseeq_api_cache/v1/')
 
 def hash_obj(obj):
     val = obj
@@ -41,7 +41,7 @@ class FileSystemCache:
 
     @property
     def cache_dir_path(self):
-        return abspath(f'{CACHE_DIR}/geoseeq_api_cache/v1/')
+        return GEOSEEQ_CACHE_DIR
 
     def setup(self):
         if self.no_cache:
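
The cache path is now computed once at import time as the module-level constant `GEOSEEQ_CACHE_DIR`, so other modules can share the same on-disk location without constructing a `FileSystemCache`. A minimal sketch of the intended use (the exact expansion depends on how `CACHE_DIR` resolves on the platform):

```python
# Sketch: reuse the shared cache directory constant added above.
from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR

print(GEOSEEQ_CACHE_DIR)  # e.g. <CACHE_DIR>/geoseeq_api_cache/v1, platform dependent
```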

geoseeq/result/file_upload.py CHANGED
@@ -1,7 +1,8 @@
 
 import time
 import json
-from os.path import basename, getsize
+import os
+from os.path import basename, getsize, join, dirname, isfile
 from pathlib import Path
 
 import requests
@@ -11,7 +12,7 @@ from geoseeq.constants import FIVE_MB
 from geoseeq.utils import md5_checksum
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from .utils import *
-
+from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
 
 class FileChunker:
 
@@ -38,6 +39,77 @@ class FileChunker:
     def get_chunk_size(self, num):
         self.load_all_chunks()
         return len(self.loaded_parts[num])
+
+
+class ResumableUploadTracker:
+
+    def __init__(self, filepath, chunk_size, tracker_file_prefix="gs_resumable_upload_tracker"):
+        self.open, self.upload_started = True, False
+        self.upload_id, self.urls = None, None
+        self.filepath = filepath
+        self.tracker_file = join(
+            GEOSEEQ_CACHE_DIR, 'upload',
+            tracker_file_prefix + f".{chunk_size}." + basename(filepath)
+        )
+        try:
+            os.makedirs(dirname(self.tracker_file), exist_ok=True)
+        except Exception as e:
+            logger.warning(f'Could not create resumable upload tracker directory. {e}')
+            self.open = False
+        self._loaded_parts = {}
+        self._load_parts_from_file()
+
+    def start_upload(self, upload_id, urls):
+        if not self.open:
+            return
+        if self.upload_started:
+            raise GeoseeqGeneralError("Upload has already started.")
+        blob = dict(upload_id=upload_id, urls=urls)
+        serialized = json.dumps(blob)
+        with open(self.tracker_file, "w") as f:
+            f.write(serialized + "\n")
+        self.upload_id, self.urls = upload_id, urls
+        self.upload_started = True
+
+    def add_part(self, part_upload_info):
+        if not self.open:
+            return
+        part_id = part_upload_info["PartNumber"]
+        serialized = json.dumps(part_upload_info)
+        with open(self.tracker_file, "a") as f:
+            f.write(serialized + "\n")
+        self._loaded_parts[part_id] = part_upload_info
+        if len(self._loaded_parts) == len(self.urls):
+            self.cleanup()
+            self.open = False
+
+    def _load_parts_from_file(self):
+        if not isfile(self.tracker_file):
+            return
+        with open(self.tracker_file, "r") as f:
+            header_blob = json.loads(f.readline())
+            self.upload_id, self.urls = header_blob["upload_id"], header_blob["urls"]
+            self.upload_started = True
+            for line in f:
+                blob = json.loads(line)
+                part_id = blob["PartNumber"]
+                self._loaded_parts[part_id] = blob
+
+    def part_has_been_uploaded(self, part_number):
+        if not self.open:
+            return False
+        return part_number in self._loaded_parts
+
+    def get_part_info(self, part_number):
+        return self._loaded_parts[part_number]
+
+    def cleanup(self):
+        if not self.open:
+            return
+        try:
+            os.remove(self.tracker_file)
+        except FileNotFoundError:
+            pass
 
 
 class ResultFileUpload:
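
The new `ResumableUploadTracker` persists multipart-upload state as JSON lines: the first line records the `upload_id` and the presigned part URLs, and each subsequent line records one completed part. A hedged sketch of inspecting such a tracker file (the filename below is hypothetical, following the `prefix.chunk_size.basename` pattern used above):

```python
import json

# Hypothetical tracker file name: <prefix>.<chunk_size>.<basename of upload>
tracker_path = "gs_resumable_upload_tracker.5242880.reads.fastq.gz"

with open(tracker_path) as f:
    header = json.loads(f.readline())         # {"upload_id": ..., "urls": {...}}
    parts = [json.loads(line) for line in f]  # one {"ETag": ..., "PartNumber": n} per part

done = {part["PartNumber"] for part in parts}
print(f"{len(done)} of {len(header['urls'])} parts already uploaded")
```
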
@@ -74,7 +146,10 @@ class ResultFileUpload:
         urls = response
         return upload_id, urls
 
-    def _upload_one_part(self, file_chunker, url, num, max_retries, session=None):
+    def _upload_one_part(self, file_chunker, url, num, max_retries, session=None, resumable_upload_tracker=None):
+        if resumable_upload_tracker and resumable_upload_tracker.part_has_been_uploaded(num + 1):
+            logger.info(f"Part {num + 1} has already been uploaded. Skipping.")
+            return resumable_upload_tracker.get_part_info(num + 1)
         file_chunk = file_chunker.get_chunk(num)
         attempts = 0
         while attempts < max_retries:
@@ -94,7 +169,12 @@ class ResultFileUpload:
             if attempts == max_retries:
                 raise
             time.sleep(10**attempts) # exponential backoff, (10 ** 2)s default max
-        return {"ETag": http_response.headers["ETag"], "PartNumber": num + 1}
+        etag = http_response.headers["ETag"].replace('"', "")
+        blob = {"ETag": etag, "PartNumber": num + 1}
+        if resumable_upload_tracker:
+            # TODO technically not thread safe, but should be fine for now
+            resumable_upload_tracker.add_part(blob)
+        return blob
 
     def _finish_multipart_upload(self, upload_id, complete_parts):
         response = self.knex.post(
@@ -108,12 +188,12 @@ class ResultFileUpload:
         )
         response.raise_for_status()
 
-    def _upload_parts(self, file_chunker, urls, max_retries, session, progress_tracker, threads):
+    def _upload_parts(self, file_chunker, urls, max_retries, session, progress_tracker, threads, resumable_upload_tracker=None):
         if threads == 1:
             logger.info(f"Uploading parts in series for {file_chunker.filepath}")
             complete_parts = []
             for num, url in enumerate(list(urls.values())):
-                response_part = self._upload_one_part(file_chunker, url, num, max_retries, session)
+                response_part = self._upload_one_part(file_chunker, url, num, max_retries, session, resumable_upload_tracker)
                 complete_parts.append(response_part)
                 if progress_tracker: progress_tracker.update(file_chunker.get_chunk_size(num))
                 logger.info(f'Uploaded part {num + 1} of {len(urls)} for "{file_chunker.filepath}"')
@@ -124,7 +204,7 @@ class ResultFileUpload:
             futures = []
             for num, url in enumerate(list(urls.values())):
                 future = executor.submit(
-                    self._upload_one_part, file_chunker, url, num, max_retries, session
+                    self._upload_one_part, file_chunker, url, num, max_retries, session, resumable_upload_tracker
                 )
                 futures.append(future)
             complete_parts = []
@@ -148,15 +228,34 @@ class ResultFileUpload:
         session=None,
         progress_tracker=None,
         threads=1,
+        use_cache=True,
     ):
         """Upload a file to S3 using the multipart upload process."""
         logger.info(f"Uploading {filepath} to S3 using multipart upload.")
-        upload_id, urls = self._prep_multipart_upload(filepath, file_size, chunk_size, optional_fields)
+        resumable_upload_tracker = None
+        if use_cache and file_size > 10 * FIVE_MB: # only use resumable upload tracker for larger files
+            resumable_upload_tracker = ResumableUploadTracker(filepath, chunk_size)
+        if resumable_upload_tracker and resumable_upload_tracker.upload_started:
+            upload_id, urls = resumable_upload_tracker.upload_id, resumable_upload_tracker.urls
+            logger.info(f'Resuming upload for "{filepath}", upload_id: "{upload_id}"')
+        else:
+            upload_id, urls = self._prep_multipart_upload(filepath, file_size, chunk_size, optional_fields)
+            if resumable_upload_tracker:
+                logger.info(f'Creating new resumable upload for "{filepath}", upload_id: "{upload_id}"')
+                resumable_upload_tracker.start_upload(upload_id, urls)
         logger.info(f'Starting upload for "{filepath}"')
         complete_parts = []
         file_chunker = FileChunker(filepath, chunk_size).load_all_chunks()
         if progress_tracker: progress_tracker.set_num_chunks(file_chunker.file_size)
-        complete_parts = self._upload_parts(file_chunker, urls, max_retries, session, progress_tracker, threads)
+        complete_parts = self._upload_parts(
+            file_chunker,
+            urls,
+            max_retries,
+            session,
+            progress_tracker,
+            threads,
+            resumable_upload_tracker=resumable_upload_tracker
+        )
         self._finish_multipart_upload(upload_id, complete_parts)
         logger.info(f'Finished Upload for "{filepath}"')
         return self
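
With these changes, `multipart_upload_file` consults the tracker before requesting new presigned URLs, so re-running an interrupted upload of a large file (over ten times `FIVE_MB`) resumes from the last recorded part instead of starting over. A sketch of how a caller might rely on this, assuming an existing `result_file` object (the filename is illustrative):

```python
# First run is interrupted partway through; the tracker file keeps the
# upload_id, the presigned URLs, and every part that completed.
# Re-running the same call resumes the upload and skips recorded parts.
result_file.upload_file(
    "large_reads.fastq.gz",
    threads=4,
    use_cache=True,  # default; pass use_cache=False to force a fresh upload
)
```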

geoseeq/upload_download_manager.py CHANGED
@@ -19,12 +19,12 @@ def _make_in_process_logger(log_level):
 
 
 def _upload_one_file(args):
-    result_file, filepath, session, progress_tracker, link_type, overwrite, log_level, parallel_uploads = args
+    result_file, filepath, session, progress_tracker, link_type, overwrite, log_level, parallel_uploads, use_cache = args
     if parallel_uploads:
         _make_in_process_logger(log_level)
     if link_type == 'upload':
         # TODO: check checksums to see if the file is the same
-        result_file.upload_file(filepath, session=session, overwrite=overwrite, progress_tracker=progress_tracker, threads=4)
+        result_file.upload_file(filepath, session=session, overwrite=overwrite, progress_tracker=progress_tracker, threads=4, use_cache=use_cache)
     else:
         result_file.link_file(link_type, filepath)
     return result_file
@@ -38,7 +38,8 @@ class GeoSeeqUploadManager:
                  link_type='upload',
                  progress_tracker_factory=None,
                  log_level=logging.WARNING,
-                 overwrite=True):
+                 overwrite=True,
+                 use_cache=True):
         self.session = session
         self.n_parallel_uploads = n_parallel_uploads
         self.progress_tracker_factory = progress_tracker_factory if progress_tracker_factory else lambda x: None
@@ -46,6 +47,7 @@ class GeoSeeqUploadManager:
         self.link_type = link_type
         self.overwrite = overwrite
         self._result_files = []
+        self.use_cache = use_cache
 
     def add_result_file(self, result_file, local_path):
         self._result_files.append((result_file, local_path))
@@ -70,7 +72,7 @@ class GeoSeeqUploadManager:
                 result_file, local_path,
                 self.session, self.progress_tracker_factory(local_path),
                 self.link_type, self.overwrite, self.log_level,
-                self.n_parallel_uploads > 1
+                self.n_parallel_uploads > 1, self.use_cache
             ) for result_file, local_path in self._result_files
        ]
        out = []
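
`GeoSeeqUploadManager` now threads the same flag through to each `_upload_one_file` worker. A minimal sketch of wiring it up; the constructor arguments before `link_type` are not shown in this diff, so treat the call below as illustrative rather than exact:

```python
manager = GeoSeeqUploadManager(
    link_type='upload',
    overwrite=True,
    use_cache=False,  # new in 0.5.6a8: disable the resumable-upload tracker
)
manager.add_result_file(result_file, "/path/to/local/file.fastq.gz")
```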

geoseeq-{0.5.6a7 → 0.5.6a8}.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geoseeq
-Version: 0.5.6a7
+Version: 0.5.6a8
 Summary: GeoSeeq command line tools and python API
 Author: David C. Danko
 Author-email: "David C. Danko" <dcdanko@biotia.io>

geoseeq-{0.5.6a7 → 0.5.6a8}.dist-info/RECORD RENAMED
@@ -3,7 +3,7 @@ geoseeq/app.py,sha256=Y6d1UzxFLfE3RNccATbFCVi6kH3eFmzwoUbeR2Ry09A,2387
 geoseeq/blob_constructors.py,sha256=AkWpDQY0EdGMxF1p6eRspyHKubcUdiW4it-_Q7S2QWk,188
 geoseeq/bulk_creators.py,sha256=pdn-Dv7yv5SFv-PfDuQbuOnw2W4-BfIfRJVRAhM8U6s,2115
 geoseeq/constants.py,sha256=h9RURz4xs2bZyDrSGocej7ANJvRLr_H1H7JRxpNUXJM,431
-geoseeq/file_system_cache.py,sha256=fZrvqWmUtTLFOpm_qG0fz1Q0GWnw_yVOAcFVPloc75c,4101
+geoseeq/file_system_cache.py,sha256=HzVZWtwLD2fjWWSo_UfWmGeBltm9He4lP_OqzKwNGWg,4138
 geoseeq/knex.py,sha256=6fPO8F8yxgBgBXZiliMJvYYjgf_16chfJPyWLe-kpPk,7898
 geoseeq/organization.py,sha256=a9xmGDE0tQsjPJfyFkYnWagxZ8xpdeckkwvkhH6LNIk,2462
 geoseeq/pipeline.py,sha256=89mhWaecsKnm6tyRkdkaVp4dmZh62_v42Ze0oXf8OTY,9873
@@ -11,7 +11,7 @@ geoseeq/project.py,sha256=-9Y2ik0-BpT3iqh89v8VQBbdadhI58oaUP9oZK8oetc,13741
 geoseeq/remote_object.py,sha256=Es-JlAz8iLRmCpAzh1MOwUh2MqtbuQM-p8wHIBAqNlQ,7131
 geoseeq/sample.py,sha256=whgEVk6GnDJJLjn5uTOqFqRtVxZD3BgjTo7brAC5noU,7981
 geoseeq/search.py,sha256=gawad6Cx5FxJBPlYkXWb-UKAO-UC0_yhvyU9Ca1kaNI,3388
-geoseeq/upload_download_manager.py,sha256=Xk5VB5qkDRZGQj6DqXOEzUmphXltF66FAVjxKWaweUE,7090
+geoseeq/upload_download_manager.py,sha256=Nh2cahwn7qvjfuWJSru40_k7q65NBSRQBu0EMdx-Gqc,7206
 geoseeq/user.py,sha256=tol8i1UGLRrbMw5jeJDnna1ikRgrCDd50Jxz0a1lSgg,690
 geoseeq/utils.py,sha256=PDRiEQIZYTcfEV9AYvloQVvfqs5JaebcFZodAa2SUW8,3577
 geoseeq/work_orders.py,sha256=5uLVVfdKE8qh4gGaHkdBpXJGRTujuSg59knWCqEET4A,8071
@@ -22,7 +22,7 @@ geoseeq/cli/detail.py,sha256=q8Suu-j2k18knfSVFG-SWWGNsKM-n8y9RMA3LcIIi9Y,4132
 geoseeq/cli/download.py,sha256=ldpqpnRe00utb1EL1T_5CyPbFrZbtauIvOSOHtxz9qc,17656
 geoseeq/cli/fastq_utils.py,sha256=-bmeQLaiMBm57zWOF0R5OlWTU0_3sh1JBC1RYw2BOFM,3083
 geoseeq/cli/get_eula.py,sha256=79mbUwyiF7O1r0g6UTxG9kJGQEqKuH805E6eLkPC6Y4,997
-geoseeq/cli/main.py,sha256=ONd6tMTFSLtEQ8MiL53Ig2Gncindxmpcgm8MXBWWCE8,3259
+geoseeq/cli/main.py,sha256=Suhx91FX_oXb2Zd923oln3Bwk9W1gL5WTUQYk4CgjEk,3259
 geoseeq/cli/manage.py,sha256=wGXAcVaXqE5JQEU8Jh6OlHr02nB396bpS_SFcOZdrEo,5929
 geoseeq/cli/progress_bar.py,sha256=p1Xl01nkYxSBZCB30ue2verIIi22W93m3ZAMAxipD0g,738
 geoseeq/cli/run.py,sha256=bx2AV6VIqOSTlxUda78xl0XxcZ8TXlQx02-e7iLQPwI,3838
@@ -37,9 +37,9 @@ geoseeq/cli/shared_params/id_handlers.py,sha256=501K9sCVkI0YGDQ62vXk_DM5lMMDrdB5
 geoseeq/cli/shared_params/obj_getters.py,sha256=ZSkt6LnDkVFlNVYKgLrjzg60-6BthZMr3eeD3HNqzac,2741
 geoseeq/cli/shared_params/opts_and_args.py,sha256=4ZnXe0MfxHPIh1pqkpAHwoi1I0XXGGSBkAr0ltfSvEk,1948
 geoseeq/cli/upload/__init__.py,sha256=3C9_S9t7chmYU-2ot89NV03x-EtmsjibulErKaU9w1k,627
-geoseeq/cli/upload/upload.py,sha256=F8n7eo1lblWHPABjlUSBVmU4XLyBSPbCt1wXTDp-INE,8963
+geoseeq/cli/upload/upload.py,sha256=JsQnVyk4NLxgBtYfytORs8hgAeMGEE2o8atx1g2-Mek,9035
 geoseeq/cli/upload/upload_advanced.py,sha256=Jq5eGe-wOdrzxGWVwaFPg0BAJcW0YSx_eHEmYjJeKuA,3434
-geoseeq/cli/upload/upload_reads.py,sha256=Wh6Fvl5Ob4C5JdKow3otnnVdII1VY5G6q2VaOAWIcW0,7264
+geoseeq/cli/upload/upload_reads.py,sha256=y9qsiBLF708QwQmZXqQIH2M1Ubhz7lbviupa3lbLFTw,7303
 geoseeq/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geoseeq/contrib/ncbi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geoseeq/contrib/ncbi/api.py,sha256=WQeLoGA_-Zha-QeSO8_i7HpvXyD8UkV0qc5okm11KiA,1056
@@ -64,7 +64,7 @@ geoseeq/plotting/map/overlay.py,sha256=4VmxqOESTQra9tPr8b8OLEUhJSit9lNipabeSznEY
 geoseeq/result/__init__.py,sha256=IFHIyRV8ZzuKIfwfze1SXgcKwNMcSgMAknLHMkwjXIU,356
 geoseeq/result/bioinfo.py,sha256=QQtbyogrdro9avJSN0713sxLVnVeA24mFw3hWtKDKyw,1782
 geoseeq/result/file_download.py,sha256=XQA5bdQJJSZIgbFcN09OvVdLq12fnA98kPCIONAkLk0,5568
-geoseeq/result/file_upload.py,sha256=dl5T3bUNfb4JdotldAQxZV0wwYcA_XUTlHOApFl8Bgw,7390
+geoseeq/result/file_upload.py,sha256=uAPPIouCbLjS5qgU0QsJZ7xFj4QrvCpmskVYQAI6ApA,11496
 geoseeq/result/result_file.py,sha256=HG1gKpgIcWImWuf6LduVLEOyW_auuQ-dWld8MNOXGLE,8433
 geoseeq/result/result_folder.py,sha256=6porOXPh7Tpxw3oX5yMRPYQzNCGYqszqmFJd3SwQmTc,11122
 geoseeq/result/utils.py,sha256=C-CxGzB3WddlnRiqFSkrY78I_m0yFgNqsTBRzGU-y8Q,2772
@@ -80,9 +80,9 @@ geoseeq/vc/vc_stub.py,sha256=IQr8dI0zsWKVAeY_5ybDD6n49_3othcgfHS3P0O9tuY,3110
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_api_client.py,sha256=TS5njc5pcPP_Ycy-ljcfPVT1hQRBsFVdQ0lCqBmoesU,12810
 tests/test_plotting.py,sha256=TcTu-2ARr8sxZJ7wPQxmbs3-gHw7uRvsgrhhhg0qKik,784
-geoseeq-0.5.6a7.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
-geoseeq-0.5.6a7.dist-info/METADATA,sha256=2FR5sXg1kzhWv9LZHlcaKxgLUmGSeLpRdxqVZJN861A,4805
-geoseeq-0.5.6a7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-geoseeq-0.5.6a7.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
-geoseeq-0.5.6a7.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
-geoseeq-0.5.6a7.dist-info/RECORD,,
+geoseeq-0.5.6a8.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
+geoseeq-0.5.6a8.dist-info/METADATA,sha256=yKLoJOHbteUBODSqcT9MpvbjKazjex-YIJjoHunZKuk,4805
+geoseeq-0.5.6a8.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+geoseeq-0.5.6a8.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
+geoseeq-0.5.6a8.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
+geoseeq-0.5.6a8.dist-info/RECORD,,