geoseeq 0.5.6a7__py3-none-any.whl → 0.5.6a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geoseeq/cli/main.py CHANGED
@@ -53,7 +53,7 @@ def version():
53
53
  Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
54
54
  Run `geoseeq eula show` to view the EULA.
55
55
  """
56
- click.echo('0.5.6a7') # remember to update setup
56
+ click.echo('0.5.6a8') # remember to update setup
57
57
 
58
58
 
59
59
  @main.group('advanced')
@@ -107,7 +107,8 @@ def cli_upload_file(state, cores, yes, private, link_type, recursive, hidden, ge
107
107
  link_type=link_type,
108
108
  progress_tracker_factory=PBarManager().get_new_bar,
109
109
  log_level=state.log_level,
110
- overwrite=True
110
+ overwrite=True,
111
+ use_cache=state.use_cache,
111
112
  )
112
113
  for geoseeq_file_name, file_path in name_pairs:
113
114
  if isfile(file_path):
@@ -140,7 +141,8 @@ def cli_upload_folder(state, cores, yes, private, recursive, hidden, project_or_
140
141
  link_type='upload',
141
142
  progress_tracker_factory=PBarManager().get_new_bar,
142
143
  log_level=logging.INFO,
143
- overwrite=True
144
+ overwrite=True,
145
+ use_cache=state.use_cache,
144
146
  )
145
147
  for folder_name in folder_names:
146
148
  result_folder = root_obj.result_folder(folder_name).idem()
@@ -95,6 +95,7 @@ def _do_upload(groups, module_name, link_type, lib, filepaths, overwrite, cores,
95
95
  log_level=state.log_level,
96
96
  overwrite=overwrite,
97
97
  progress_tracker_factory=PBarManager().get_new_bar,
98
+ use_cache=state.use_cache,
98
99
  )
99
100
  for group in groups:
100
101
  sample = lib.sample(group['sample_name']).idem()
@@ -15,7 +15,7 @@ CACHE_DIR = join(
15
15
  "geoseeq"
16
16
  )
17
17
  USE_GEOSEEQ_CACHE = None
18
-
18
+ GEOSEEQ_CACHE_DIR = abspath(f'{CACHE_DIR}/geoseeq_api_cache/v1/')
19
19
 
20
20
  def hash_obj(obj):
21
21
  val = obj
@@ -41,7 +41,7 @@ class FileSystemCache:
41
41
 
42
42
  @property
43
43
  def cache_dir_path(self):
44
- return abspath(f'{CACHE_DIR}/geoseeq_api_cache/v1/')
44
+ return GEOSEEQ_CACHE_DIR
45
45
 
46
46
  def setup(self):
47
47
  if self.no_cache:
@@ -1,7 +1,8 @@
1
1
 
2
2
  import time
3
3
  import json
4
- from os.path import basename, getsize
4
+ import os
5
+ from os.path import basename, getsize, join, dirname, isfile
5
6
  from pathlib import Path
6
7
 
7
8
  import requests
@@ -11,7 +12,7 @@ from geoseeq.constants import FIVE_MB
11
12
  from geoseeq.utils import md5_checksum
12
13
  from concurrent.futures import ThreadPoolExecutor, as_completed
13
14
  from .utils import *
14
-
15
+ from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
15
16
 
16
17
  class FileChunker:
17
18
 
@@ -38,6 +39,77 @@ class FileChunker:
38
39
  def get_chunk_size(self, num):
39
40
  self.load_all_chunks()
40
41
  return len(self.loaded_parts[num])
42
+
43
+
44
class ResumableUploadTracker:
    """Record multipart-upload progress on disk so an interrupted upload can resume.

    The tracker file is line-delimited JSON. The first line is a header
    ``{"upload_id": ..., "urls": ...}``; each following line is the record of
    one finished part (a dict containing at least ``"PartNumber"``).

    If the tracker directory cannot be created the tracker is marked closed
    (``self.open`` is False) and the mutating methods become no-ops.
    """

    def __init__(self, filepath, chunk_size, tracker_file_prefix="gs_resumable_upload_tracker"):
        """Set up the tracker for `filepath` and load any progress already on disk.

        Args:
            filepath: path of the local file being uploaded.
            chunk_size: part size; it is embedded in the tracker file name.
            tracker_file_prefix: leading component of the tracker file name.
        """
        self.open, self.upload_started = True, False
        self.upload_id, self.urls = None, None
        self.filepath = filepath
        # NOTE(review): the tracker name depends only on the chunk size and the
        # file's basename, so two different files that share a basename map to
        # the same tracker file -- confirm whether that can happen in practice.
        self.tracker_file = join(
            GEOSEEQ_CACHE_DIR, 'upload',
            tracker_file_prefix + f".{chunk_size}." + basename(filepath)
        )
        try:
            os.makedirs(dirname(self.tracker_file), exist_ok=True)
        except Exception as e:
            # Best effort: without a directory we simply run untracked.
            logger.warning(f'Could not create resumable upload tracker directory. {e}')
            self.open = False
        self._loaded_parts = {}
        unreadable = self._load_parts_from_file()
        if unreadable:
            logger.warning(
                f'Ignored {unreadable} unreadable line(s) in resumable upload tracker "{self.tracker_file}".'
            )

    def start_upload(self, upload_id, urls):
        """Write the header line for a new upload, replacing any existing tracker file.

        Raises:
            GeoseeqGeneralError: if an upload has already been started.
        """
        if not self.open:
            return
        if self.upload_started:
            raise GeoseeqGeneralError("Upload has already started.")
        blob = dict(upload_id=upload_id, urls=urls)
        serialized = json.dumps(blob)
        with open(self.tracker_file, "w") as f:
            f.write(serialized + "\n")
        self.upload_id, self.urls = upload_id, urls
        self.upload_started = True

    def add_part(self, part_upload_info):
        """Append the record of one finished part; clean up once every part is recorded.

        Args:
            part_upload_info: dict with a ``"PartNumber"`` key describing the part.

        Raises:
            GeoseeqGeneralError: if no upload has been started, because the
                tracker file would otherwise be written without its header.
        """
        if not self.open:
            return
        if not self.upload_started:
            raise GeoseeqGeneralError("Upload has not been started.")
        part_id = part_upload_info["PartNumber"]
        serialized = json.dumps(part_upload_info)
        with open(self.tracker_file, "a") as f:
            f.write(serialized + "\n")
        self._loaded_parts[part_id] = part_upload_info
        if len(self._loaded_parts) == len(self.urls):
            # Every part is recorded, so the tracker file is no longer needed.
            self.cleanup()
            self.open = False

    @staticmethod
    def _parse_json_line(line):
        """Return the decoded JSON value of `line`, or None if it is not valid JSON."""
        try:
            return json.loads(line)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return None

    def _rewrite_tracker_file(self):
        """Rewrite the tracker file from the header and part records held in memory."""
        records = [dict(upload_id=self.upload_id, urls=self.urls)]
        records.extend(self._loaded_parts.values())
        with open(self.tracker_file, "w") as f:
            f.writelines(json.dumps(record) + "\n" for record in records)

    def _load_parts_from_file(self):
        """Load the header and part records from an existing tracker file.

        An interrupted process can leave the file empty or ending in a
        half-written line. Such lines are skipped instead of raising, and the
        file is rewritten so later appends do not get glued onto a fragment.
        If the header itself is unreadable the upload is treated as not
        started; `start_upload` then overwrites the file.

        Returns:
            The number of lines that could not be used (0 if there is no file).
        """
        if not isfile(self.tracker_file):
            return 0
        with open(self.tracker_file, "r") as f:
            lines = f.readlines()
        header_blob = self._parse_json_line(lines[0]) if lines else None
        header_ok = (
            isinstance(header_blob, dict)
            and "upload_id" in header_blob
            and "urls" in header_blob
        )
        if not header_ok:
            return len(lines)
        self.upload_id, self.urls = header_blob["upload_id"], header_blob["urls"]
        self.upload_started = True
        unreadable = 0
        for line in lines[1:]:
            blob = self._parse_json_line(line)
            if isinstance(blob, dict) and "PartNumber" in blob:
                self._loaded_parts[blob["PartNumber"]] = blob
            else:
                unreadable += 1
        # A final line without a newline would corrupt the next appended record.
        if unreadable or not lines[-1].endswith("\n"):
            self._rewrite_tracker_file()
        return unreadable

    def part_has_been_uploaded(self, part_number):
        """Return True if `part_number` is recorded and the tracker is still open."""
        if not self.open:
            return False
        return part_number in self._loaded_parts

    def get_part_info(self, part_number):
        """Return the stored record for `part_number`; raises KeyError if it is absent."""
        return self._loaded_parts[part_number]

    def cleanup(self):
        """Delete the tracker file if the tracker is open; a missing file is not an error."""
        if not self.open:
            return
        try:
            os.remove(self.tracker_file)
        except FileNotFoundError:
            pass
41
113
 
42
114
 
43
115
  class ResultFileUpload:
@@ -74,7 +146,10 @@ class ResultFileUpload:
74
146
  urls = response
75
147
  return upload_id, urls
76
148
 
77
- def _upload_one_part(self, file_chunker, url, num, max_retries, session=None):
149
+ def _upload_one_part(self, file_chunker, url, num, max_retries, session=None, resumable_upload_tracker=None):
150
+ if resumable_upload_tracker and resumable_upload_tracker.part_has_been_uploaded(num + 1):
151
+ logger.info(f"Part {num + 1} has already been uploaded. Skipping.")
152
+ return resumable_upload_tracker.get_part_info(num + 1)
78
153
  file_chunk = file_chunker.get_chunk(num)
79
154
  attempts = 0
80
155
  while attempts < max_retries:
@@ -94,7 +169,12 @@ class ResultFileUpload:
94
169
  if attempts == max_retries:
95
170
  raise
96
171
  time.sleep(10**attempts) # exponential backoff, (10 ** 2)s default max
97
- return {"ETag": http_response.headers["ETag"], "PartNumber": num + 1}
172
+ etag = http_response.headers["ETag"].replace('"', "")
173
+ blob = {"ETag": etag, "PartNumber": num + 1}
174
+ if resumable_upload_tracker:
175
+ # TODO technically not thread safe, but should be fine for now
176
+ resumable_upload_tracker.add_part(blob)
177
+ return blob
98
178
 
99
179
  def _finish_multipart_upload(self, upload_id, complete_parts):
100
180
  response = self.knex.post(
@@ -108,12 +188,12 @@ class ResultFileUpload:
108
188
  )
109
189
  response.raise_for_status()
110
190
 
111
- def _upload_parts(self, file_chunker, urls, max_retries, session, progress_tracker, threads):
191
+ def _upload_parts(self, file_chunker, urls, max_retries, session, progress_tracker, threads, resumable_upload_tracker=None):
112
192
  if threads == 1:
113
193
  logger.info(f"Uploading parts in series for {file_chunker.filepath}")
114
194
  complete_parts = []
115
195
  for num, url in enumerate(list(urls.values())):
116
- response_part = self._upload_one_part(file_chunker, url, num, max_retries, session)
196
+ response_part = self._upload_one_part(file_chunker, url, num, max_retries, session, resumable_upload_tracker)
117
197
  complete_parts.append(response_part)
118
198
  if progress_tracker: progress_tracker.update(file_chunker.get_chunk_size(num))
119
199
  logger.info(f'Uploaded part {num + 1} of {len(urls)} for "{file_chunker.filepath}"')
@@ -124,7 +204,7 @@ class ResultFileUpload:
124
204
  futures = []
125
205
  for num, url in enumerate(list(urls.values())):
126
206
  future = executor.submit(
127
- self._upload_one_part, file_chunker, url, num, max_retries, session
207
+ self._upload_one_part, file_chunker, url, num, max_retries, session, resumable_upload_tracker
128
208
  )
129
209
  futures.append(future)
130
210
  complete_parts = []
@@ -148,15 +228,34 @@ class ResultFileUpload:
148
228
  session=None,
149
229
  progress_tracker=None,
150
230
  threads=1,
231
+ use_cache=True,
151
232
  ):
152
233
  """Upload a file to S3 using the multipart upload process."""
153
234
  logger.info(f"Uploading {filepath} to S3 using multipart upload.")
154
- upload_id, urls = self._prep_multipart_upload(filepath, file_size, chunk_size, optional_fields)
235
+ resumable_upload_tracker = None
236
+ if use_cache and file_size > 10 * FIVE_MB: # only use resumable upload tracker for larger files
237
+ resumable_upload_tracker = ResumableUploadTracker(filepath, chunk_size)
238
+ if resumable_upload_tracker and resumable_upload_tracker.upload_started:
239
+ upload_id, urls = resumable_upload_tracker.upload_id, resumable_upload_tracker.urls
240
+ logger.info(f'Resuming upload for "{filepath}", upload_id: "{upload_id}"')
241
+ else:
242
+ upload_id, urls = self._prep_multipart_upload(filepath, file_size, chunk_size, optional_fields)
243
+ if resumable_upload_tracker:
244
+ logger.info(f'Creating new resumable upload for "{filepath}", upload_id: "{upload_id}"')
245
+ resumable_upload_tracker.start_upload(upload_id, urls)
155
246
  logger.info(f'Starting upload for "{filepath}"')
156
247
  complete_parts = []
157
248
  file_chunker = FileChunker(filepath, chunk_size).load_all_chunks()
158
249
  if progress_tracker: progress_tracker.set_num_chunks(file_chunker.file_size)
159
- complete_parts = self._upload_parts(file_chunker, urls, max_retries, session, progress_tracker, threads)
250
+ complete_parts = self._upload_parts(
251
+ file_chunker,
252
+ urls,
253
+ max_retries,
254
+ session,
255
+ progress_tracker,
256
+ threads,
257
+ resumable_upload_tracker=resumable_upload_tracker
258
+ )
160
259
  self._finish_multipart_upload(upload_id, complete_parts)
161
260
  logger.info(f'Finished Upload for "{filepath}"')
162
261
  return self
@@ -19,12 +19,12 @@ def _make_in_process_logger(log_level):
19
19
 
20
20
 
21
21
  def _upload_one_file(args):
22
- result_file, filepath, session, progress_tracker, link_type, overwrite, log_level, parallel_uploads = args
22
+ result_file, filepath, session, progress_tracker, link_type, overwrite, log_level, parallel_uploads, use_cache = args
23
23
  if parallel_uploads:
24
24
  _make_in_process_logger(log_level)
25
25
  if link_type == 'upload':
26
26
  # TODO: check checksums to see if the file is the same
27
- result_file.upload_file(filepath, session=session, overwrite=overwrite, progress_tracker=progress_tracker, threads=4)
27
+ result_file.upload_file(filepath, session=session, overwrite=overwrite, progress_tracker=progress_tracker, threads=4, use_cache=use_cache)
28
28
  else:
29
29
  result_file.link_file(link_type, filepath)
30
30
  return result_file
@@ -38,7 +38,8 @@ class GeoSeeqUploadManager:
38
38
  link_type='upload',
39
39
  progress_tracker_factory=None,
40
40
  log_level=logging.WARNING,
41
- overwrite=True):
41
+ overwrite=True,
42
+ use_cache=True):
42
43
  self.session = session
43
44
  self.n_parallel_uploads = n_parallel_uploads
44
45
  self.progress_tracker_factory = progress_tracker_factory if progress_tracker_factory else lambda x: None
@@ -46,6 +47,7 @@ class GeoSeeqUploadManager:
46
47
  self.link_type = link_type
47
48
  self.overwrite = overwrite
48
49
  self._result_files = []
50
+ self.use_cache = use_cache
49
51
 
50
52
  def add_result_file(self, result_file, local_path):
51
53
  self._result_files.append((result_file, local_path))
@@ -70,7 +72,7 @@ class GeoSeeqUploadManager:
70
72
  result_file, local_path,
71
73
  self.session, self.progress_tracker_factory(local_path),
72
74
  self.link_type, self.overwrite, self.log_level,
73
- self.n_parallel_uploads > 1
75
+ self.n_parallel_uploads > 1, self.use_cache
74
76
  ) for result_file, local_path in self._result_files
75
77
  ]
76
78
  out = []
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geoseeq
3
- Version: 0.5.6a7
3
+ Version: 0.5.6a8
4
4
  Summary: GeoSeeq command line tools and python API
5
5
  Author: David C. Danko
6
6
  Author-email: "David C. Danko" <dcdanko@biotia.io>
@@ -3,7 +3,7 @@ geoseeq/app.py,sha256=Y6d1UzxFLfE3RNccATbFCVi6kH3eFmzwoUbeR2Ry09A,2387
3
3
  geoseeq/blob_constructors.py,sha256=AkWpDQY0EdGMxF1p6eRspyHKubcUdiW4it-_Q7S2QWk,188
4
4
  geoseeq/bulk_creators.py,sha256=pdn-Dv7yv5SFv-PfDuQbuOnw2W4-BfIfRJVRAhM8U6s,2115
5
5
  geoseeq/constants.py,sha256=h9RURz4xs2bZyDrSGocej7ANJvRLr_H1H7JRxpNUXJM,431
6
- geoseeq/file_system_cache.py,sha256=fZrvqWmUtTLFOpm_qG0fz1Q0GWnw_yVOAcFVPloc75c,4101
6
+ geoseeq/file_system_cache.py,sha256=HzVZWtwLD2fjWWSo_UfWmGeBltm9He4lP_OqzKwNGWg,4138
7
7
  geoseeq/knex.py,sha256=6fPO8F8yxgBgBXZiliMJvYYjgf_16chfJPyWLe-kpPk,7898
8
8
  geoseeq/organization.py,sha256=a9xmGDE0tQsjPJfyFkYnWagxZ8xpdeckkwvkhH6LNIk,2462
9
9
  geoseeq/pipeline.py,sha256=89mhWaecsKnm6tyRkdkaVp4dmZh62_v42Ze0oXf8OTY,9873
@@ -11,7 +11,7 @@ geoseeq/project.py,sha256=-9Y2ik0-BpT3iqh89v8VQBbdadhI58oaUP9oZK8oetc,13741
11
11
  geoseeq/remote_object.py,sha256=Es-JlAz8iLRmCpAzh1MOwUh2MqtbuQM-p8wHIBAqNlQ,7131
12
12
  geoseeq/sample.py,sha256=whgEVk6GnDJJLjn5uTOqFqRtVxZD3BgjTo7brAC5noU,7981
13
13
  geoseeq/search.py,sha256=gawad6Cx5FxJBPlYkXWb-UKAO-UC0_yhvyU9Ca1kaNI,3388
14
- geoseeq/upload_download_manager.py,sha256=Xk5VB5qkDRZGQj6DqXOEzUmphXltF66FAVjxKWaweUE,7090
14
+ geoseeq/upload_download_manager.py,sha256=Nh2cahwn7qvjfuWJSru40_k7q65NBSRQBu0EMdx-Gqc,7206
15
15
  geoseeq/user.py,sha256=tol8i1UGLRrbMw5jeJDnna1ikRgrCDd50Jxz0a1lSgg,690
16
16
  geoseeq/utils.py,sha256=PDRiEQIZYTcfEV9AYvloQVvfqs5JaebcFZodAa2SUW8,3577
17
17
  geoseeq/work_orders.py,sha256=5uLVVfdKE8qh4gGaHkdBpXJGRTujuSg59knWCqEET4A,8071
@@ -22,7 +22,7 @@ geoseeq/cli/detail.py,sha256=q8Suu-j2k18knfSVFG-SWWGNsKM-n8y9RMA3LcIIi9Y,4132
22
22
  geoseeq/cli/download.py,sha256=ldpqpnRe00utb1EL1T_5CyPbFrZbtauIvOSOHtxz9qc,17656
23
23
  geoseeq/cli/fastq_utils.py,sha256=-bmeQLaiMBm57zWOF0R5OlWTU0_3sh1JBC1RYw2BOFM,3083
24
24
  geoseeq/cli/get_eula.py,sha256=79mbUwyiF7O1r0g6UTxG9kJGQEqKuH805E6eLkPC6Y4,997
25
- geoseeq/cli/main.py,sha256=ONd6tMTFSLtEQ8MiL53Ig2Gncindxmpcgm8MXBWWCE8,3259
25
+ geoseeq/cli/main.py,sha256=Suhx91FX_oXb2Zd923oln3Bwk9W1gL5WTUQYk4CgjEk,3259
26
26
  geoseeq/cli/manage.py,sha256=wGXAcVaXqE5JQEU8Jh6OlHr02nB396bpS_SFcOZdrEo,5929
27
27
  geoseeq/cli/progress_bar.py,sha256=p1Xl01nkYxSBZCB30ue2verIIi22W93m3ZAMAxipD0g,738
28
28
  geoseeq/cli/run.py,sha256=bx2AV6VIqOSTlxUda78xl0XxcZ8TXlQx02-e7iLQPwI,3838
@@ -37,9 +37,9 @@ geoseeq/cli/shared_params/id_handlers.py,sha256=501K9sCVkI0YGDQ62vXk_DM5lMMDrdB5
37
37
  geoseeq/cli/shared_params/obj_getters.py,sha256=ZSkt6LnDkVFlNVYKgLrjzg60-6BthZMr3eeD3HNqzac,2741
38
38
  geoseeq/cli/shared_params/opts_and_args.py,sha256=4ZnXe0MfxHPIh1pqkpAHwoi1I0XXGGSBkAr0ltfSvEk,1948
39
39
  geoseeq/cli/upload/__init__.py,sha256=3C9_S9t7chmYU-2ot89NV03x-EtmsjibulErKaU9w1k,627
40
- geoseeq/cli/upload/upload.py,sha256=F8n7eo1lblWHPABjlUSBVmU4XLyBSPbCt1wXTDp-INE,8963
40
+ geoseeq/cli/upload/upload.py,sha256=JsQnVyk4NLxgBtYfytORs8hgAeMGEE2o8atx1g2-Mek,9035
41
41
  geoseeq/cli/upload/upload_advanced.py,sha256=Jq5eGe-wOdrzxGWVwaFPg0BAJcW0YSx_eHEmYjJeKuA,3434
42
- geoseeq/cli/upload/upload_reads.py,sha256=Wh6Fvl5Ob4C5JdKow3otnnVdII1VY5G6q2VaOAWIcW0,7264
42
+ geoseeq/cli/upload/upload_reads.py,sha256=y9qsiBLF708QwQmZXqQIH2M1Ubhz7lbviupa3lbLFTw,7303
43
43
  geoseeq/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  geoseeq/contrib/ncbi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
45
  geoseeq/contrib/ncbi/api.py,sha256=WQeLoGA_-Zha-QeSO8_i7HpvXyD8UkV0qc5okm11KiA,1056
@@ -64,7 +64,7 @@ geoseeq/plotting/map/overlay.py,sha256=4VmxqOESTQra9tPr8b8OLEUhJSit9lNipabeSznEY
64
64
  geoseeq/result/__init__.py,sha256=IFHIyRV8ZzuKIfwfze1SXgcKwNMcSgMAknLHMkwjXIU,356
65
65
  geoseeq/result/bioinfo.py,sha256=QQtbyogrdro9avJSN0713sxLVnVeA24mFw3hWtKDKyw,1782
66
66
  geoseeq/result/file_download.py,sha256=XQA5bdQJJSZIgbFcN09OvVdLq12fnA98kPCIONAkLk0,5568
67
- geoseeq/result/file_upload.py,sha256=dl5T3bUNfb4JdotldAQxZV0wwYcA_XUTlHOApFl8Bgw,7390
67
+ geoseeq/result/file_upload.py,sha256=uAPPIouCbLjS5qgU0QsJZ7xFj4QrvCpmskVYQAI6ApA,11496
68
68
  geoseeq/result/result_file.py,sha256=HG1gKpgIcWImWuf6LduVLEOyW_auuQ-dWld8MNOXGLE,8433
69
69
  geoseeq/result/result_folder.py,sha256=6porOXPh7Tpxw3oX5yMRPYQzNCGYqszqmFJd3SwQmTc,11122
70
70
  geoseeq/result/utils.py,sha256=C-CxGzB3WddlnRiqFSkrY78I_m0yFgNqsTBRzGU-y8Q,2772
@@ -80,9 +80,9 @@ geoseeq/vc/vc_stub.py,sha256=IQr8dI0zsWKVAeY_5ybDD6n49_3othcgfHS3P0O9tuY,3110
80
80
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
81
81
  tests/test_api_client.py,sha256=TS5njc5pcPP_Ycy-ljcfPVT1hQRBsFVdQ0lCqBmoesU,12810
82
82
  tests/test_plotting.py,sha256=TcTu-2ARr8sxZJ7wPQxmbs3-gHw7uRvsgrhhhg0qKik,784
83
- geoseeq-0.5.6a7.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
84
- geoseeq-0.5.6a7.dist-info/METADATA,sha256=2FR5sXg1kzhWv9LZHlcaKxgLUmGSeLpRdxqVZJN861A,4805
85
- geoseeq-0.5.6a7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
86
- geoseeq-0.5.6a7.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
87
- geoseeq-0.5.6a7.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
88
- geoseeq-0.5.6a7.dist-info/RECORD,,
83
+ geoseeq-0.5.6a8.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
84
+ geoseeq-0.5.6a8.dist-info/METADATA,sha256=yKLoJOHbteUBODSqcT9MpvbjKazjex-YIJjoHunZKuk,4805
85
+ geoseeq-0.5.6a8.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
86
+ geoseeq-0.5.6a8.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
87
+ geoseeq-0.5.6a8.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
88
+ geoseeq-0.5.6a8.dist-info/RECORD,,