geoseeq 0.5.6a15__py3-none-any.whl → 0.6.0__py3-none-any.whl

geoseeq/cli/download.py CHANGED
@@ -468,3 +468,4 @@ def cli_download_fastqs(state, cores, target_dir, yes, first, download, module_n
     click.confirm('Continue?', abort=True)
     logger.info(f'Downloading {len(download_manager)} files to {target_dir}')
     download_manager.download_files()
+
geoseeq/cli/main.py CHANGED
@@ -18,6 +18,7 @@ from .shared_params.opts_and_args import overwrite_option, yes_option
 from .detail import cli_detail
 from .run import cli_app
 from .get_eula import cli_eula
+from .project import cli_project
 
 logger = logging.getLogger('geoseeq_api')
 handler = logging.StreamHandler()
@@ -53,7 +54,7 @@ def version():
     Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
     Run `geoseeq eula show` to view the EULA.
     """
-    click.echo('0.5.6a15') # remember to update setup
+    click.echo('0.6.0') # remember to update setup
 
 
 @main.group('advanced')
@@ -65,6 +66,7 @@ cli_advanced.add_command(cli_copy)
 cli_advanced.add_command(cli_user)
 cli_advanced.add_command(cli_detail)
 cli_advanced.add_command(cli_upload_advanced)
+cli_advanced.add_command(cli_project)
 
 @cli_advanced.group('experimental')
 def cli_experimental():
@@ -101,4 +103,20 @@ def cli_config(yes, api_token, endpoint, profile, overwrite):
         click.echo('You must accept the EULA to use the GeoSeeq API.')
         return
     set_profile(api_token, endpoint=endpoint, profile=profile, overwrite=overwrite)
-    click.echo(f'Profile configured.')
+    click.echo(f'Profile configured.')
+
+
+@main.command('clear-cache')
+@yes_option
+def cli_clear_cache(yes):
+    """Clear the local cache.
+
+    ---
+
+    Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
+    Run `geoseeq eula show` to view the EULA.
+    """
+    from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
+    import shutil
+    if yes or click.confirm('Are you sure you want to clear the cache?'):
+        shutil.rmtree(GEOSEEQ_CACHE_DIR, ignore_errors=True)
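The new `clear-cache` command can be exercised safely with click's test runner; answering "n" at the prompt leaves the cache in place. A minimal sketch (assumes geoseeq is installed; `main` is the CLI group defined in this module):

from click.testing import CliRunner
from geoseeq.cli.main import main

runner = CliRunner()
# Decline the confirmation so nothing is actually deleted.
result = runner.invoke(main, ["clear-cache"], input="n\n")
assert result.exit_code == 0  # command exits cleanly; shutil.rmtree was skipped
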
geoseeq/cli/project.py ADDED
@@ -0,0 +1,96 @@
+import json
+import logging
+from os import makedirs, getcwd
+from os.path import dirname, join
+
+import click
+import pandas as pd
+from multiprocessing import Pool
+from .shared_params import (
+    handle_project_id,
+    handle_folder_id,
+    project_id_arg,
+    sample_ids_arg,
+    handle_multiple_sample_ids,
+    handle_multiple_result_file_ids,
+    use_common_state,
+    flatten_list_of_els_and_files,
+    yes_option,
+    module_option,
+    ignore_errors_option,
+    folder_ids_arg,
+)
+from geoseeq.result.file_download import download_url
+from geoseeq.utils import download_ftp
+from geoseeq.id_constructors import (
+    result_file_from_uuid,
+    result_file_from_name,
+)
+from geoseeq.knex import GeoseeqNotFoundError
+from .progress_bar import PBarManager
+from .utils import convert_size
+from geoseeq.constants import FASTQ_MODULE_NAMES
+from geoseeq.result import ResultFile
+from geoseeq.upload_download_manager import GeoSeeqDownloadManager
+from geoseeq.file_system.filesystem_download import (
+    ProjectOnFilesystem,
+    FILE_STATUS_MODIFIED_REMOTE,
+    FILE_STATUS_MODIFIED_LOCAL,
+    FILE_STATUS_NEW_LOCAL,
+    FILE_STATUS_NEW_REMOTE,
+    FILE_STATUS_IS_LOCAL_STUB,
+)
+
+
+logger = logging.getLogger('geoseeq_api')
+
+
+@click.group("project")
+def cli_project():
+    """Download data from GeoSeeq."""
+    pass
+
+
+@cli_project.command("clone")
+@use_common_state
+@click.option('--use-stubs/--full-files', default=True, help='Download full files or stubs')
+@click.option('--target-dir', '-d', default=None, help='Directory to download the project to')
+@project_id_arg
+def cli_clone_project(state, use_stubs, target_dir, project_id):
+    """Clone a project to the local filesystem.
+    """
+    knex = state.get_knex().set_auth_required()
+    proj = handle_project_id(knex, project_id)
+    logger.info(f"Found project \"{proj.name}\"")
+    if target_dir is None:
+        target_dir = proj.name
+
+    project = ProjectOnFilesystem(proj, target_dir)
+    project.download(use_stubs=use_stubs)
+
+
+@cli_project.command("status")
+@use_common_state
+def cli_project_status(state):
+    """Check the status of a project on the local filesystem.
+    """
+    project = ProjectOnFilesystem.from_path(getcwd(), recursive=True)
+
+    objs_by_status = {
+        FILE_STATUS_MODIFIED_LOCAL: [],
+        FILE_STATUS_MODIFIED_REMOTE: [],
+        FILE_STATUS_NEW_LOCAL: [],
+        FILE_STATUS_NEW_REMOTE: [],
+        FILE_STATUS_IS_LOCAL_STUB: [],
+    }
+    for obj_type, status, local_path, obj in project.list_abnormal_objects():
+        objs_by_status[status].append((obj_type, local_path, obj))
+
+    print(f"Project: {project.project.name}")
+    for status, objs in objs_by_status.items():
+        print(f"Status: {status}")
+        for obj_type, local_path, obj in objs:
+            if status in (FILE_STATUS_MODIFIED_LOCAL, FILE_STATUS_NEW_LOCAL):
+                print(f"    {obj_type}: {project.path_from_project_root(local_path)} -> {obj}")
+            else:
+                print(f"    {obj_type}: {obj} -> {project.path_from_project_root(local_path)}")
geoseeq/cli/raw.py ADDED
@@ -0,0 +1,59 @@
+import click
+import json
+from .shared_params import use_common_state, overwrite_option
+from geoseeq import GeoseeqNotFoundError
+from geoseeq.blob_constructors import (
+    sample_result_file_from_uuid,
+    project_result_file_from_uuid,
+    sample_result_folder_from_uuid,
+    project_result_folder_from_uuid,
+)
+
+
+@click.group('raw')
+def cli_raw():
+    """Low-level commands for interacting with the API."""
+    pass
+
+
+@cli_raw.command('get-file-data')
+@use_common_state
+@click.argument('file_ids', nargs=-1)
+def cli_get_file_data(state, file_ids):
+    """Print the raw stored data in a result file object."""
+    knex = state.get_knex()
+    for file_id in file_ids:
+        file_id = file_id.split(':')[-1]
+        try:
+            result_file = sample_result_file_from_uuid(knex, file_id)
+        except GeoseeqNotFoundError:
+            result_file = project_result_file_from_uuid(knex, file_id)
+        print(json.dumps(result_file.stored_data, indent=2), file=state.outfile)
+
+
+@cli_raw.command('create-raw-file')
+@use_common_state
+@overwrite_option
+@click.argument('folder_id')
+@click.argument('result_filename')
+@click.argument('filename', type=click.File('r'))
+def cli_create_raw_file(state, overwrite, folder_id, result_filename, filename):
+    """Create a result file from a raw JSON blob."""
+    knex = state.get_knex()
+
+    folder_id = folder_id.split(':')[-1]
+    try:
+        result_folder = sample_result_folder_from_uuid(knex, folder_id)
+    except GeoseeqNotFoundError:
+        result_folder = project_result_folder_from_uuid(knex, folder_id)
+    blob = json.load(filename)
+    result_file = result_folder.result_file(result_filename)
+    if overwrite:
+        result_file.idem()
+        result_file.stored_data = blob
+        result_file.save()
+    else:
+        result_file.create()
+    click.echo(f'Created file {result_file.uuid}', file=state.outfile)
+
+
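The two commands are complements: `create-raw-file` stores an arbitrary JSON blob on a result file and `get-file-data` prints it back. A hedged sketch of the same round trip in Python (the UUID is a placeholder and `knex` is assumed to be an authenticated client, e.g. from `state.get_knex()`):

import json
from geoseeq.blob_constructors import sample_result_folder_from_uuid

folder = sample_result_folder_from_uuid(knex, "00000000-0000-0000-0000-000000000000")
result_file = folder.result_file("my_raw_blob").idem()  # create-or-get, as in the overwrite path
result_file.stored_data = {"answer": 42}
result_file.save()
print(json.dumps(result_file.stored_data, indent=2))  # what `geoseeq raw get-file-data` prints
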
geoseeq/cli/upload/upload.py CHANGED
@@ -40,7 +40,7 @@ hidden_option = click.option('--hidden/--no-hidden', default=False, help='Upload
 @click.option('--cores', default=1, help='Number of uploads to run in parallel', show_default=True)
 @click.option('--threads-per-upload', default=4, help='Number of threads used to upload each file', show_default=True)
 @click.option('--num-retries', default=3, help='Number of times to retry a failed upload', show_default=True)
-@click.option('--chunk-size-mb', default=5, help='Size of chunks to upload in MB', show_default=True)
+@click.option('--chunk-size-mb', default=-1, help='Size of chunks to upload in MB', show_default=True)
 @ignore_errors_option
 @yes_option
 @private_option
@@ -122,8 +122,9 @@ def cli_upload_file(state, cores, threads_per_upload, num_retries, chunk_size_mb
         use_cache=state.use_cache,
         num_retries=num_retries,
         ignore_errors=ignore_errors,
-        session=knex.new_session(),
-        chunk_size_mb=chunk_size_mb,
+        use_atomic_upload=True,
+        session=None, #knex.new_session(),
+        chunk_size_mb=chunk_size_mb if chunk_size_mb > 0 else None,
     )
     for geoseeq_file_name, file_path in name_pairs:
         if isfile(file_path):
@@ -160,6 +161,7 @@ def cli_upload_folder(state, cores, yes, private, recursive, hidden, no_new_vers
         overwrite=True,
         use_cache=state.use_cache,
         no_new_versions=no_new_versions,
+        use_atomic_upload=True,
     )
     for folder_name in folder_names:
         result_folder = root_obj.result_folder(folder_name).idem()
geoseeq/cli/upload/upload_reads.py CHANGED
@@ -98,6 +98,7 @@ def _do_upload(groups, module_name, link_type, lib, filepaths, overwrite, no_new
         progress_tracker_factory=PBarManager().get_new_bar,
         use_cache=state.use_cache,
         no_new_versions=no_new_versions,
+        use_atomic_upload=True,
     )
     for group in groups:
         sample = lib.sample(group['sample_name']).idem()
geoseeq/result/file_chunker.py ADDED
@@ -0,0 +1,50 @@
+
+from os.path import getsize
+import logging
+
+logger = logging.getLogger("geoseeq_api")  # Same name as calling module
+logger.addHandler(logging.NullHandler())
+
+
+class FileChunker:
+
+    def __init__(self, filepath, chunk_size):
+        self.filepath = filepath
+        self.chunk_size = chunk_size
+        self.file_size = getsize(filepath)
+        self.n_parts = int(self.file_size / self.chunk_size) + 1
+        self.loaded_parts = []
+
+    def load_all_chunks(self):
+        if len(self.loaded_parts) != self.n_parts:
+            with open(self.filepath, "rb") as f:
+                f.seek(0)
+                for i in range(self.n_parts):
+                    chunk = f.read(self.chunk_size)
+                    self.loaded_parts.append(chunk)
+        return self  # convenience for chaining
+
+    def chunk_is_preloaded(self, num):
+        return len(self.loaded_parts) > num and self.loaded_parts[num]
+
+    def read_one_chunk(self, num):
+        if not self.chunk_is_preloaded(num):
+            logger.debug(f"Reading chunk {num} from {self.filepath}")
+            with open(self.filepath, "rb") as f:
+                f.seek(num * self.chunk_size)
+                chunk = f.read(self.chunk_size)
+                return chunk
+        return self.loaded_parts[num]
+
+    def get_chunk(self, num):
+        if self.chunk_is_preloaded(num):
+            return self.loaded_parts[num]
+        return self.read_one_chunk(num)
+
+    def get_chunk_size(self, num):
+        if num < (self.n_parts - 1):  # all but the last chunk
+            return self.chunk_size
+        if self.chunk_is_preloaded(num):  # last chunk, pre-loaded
+            return len(self.loaded_parts[num])
+        return len(self.read_one_chunk(num))  # last chunk, not pre-loaded
+
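The part arithmetic is worth noting: `n_parts = int(file_size / chunk_size) + 1`, so every file has at least one part, the last part is usually short, and a file whose size is an exact multiple of `chunk_size` gets a trailing empty part. A self-contained check:

import tempfile
from geoseeq.result.file_chunker import FileChunker

with tempfile.NamedTemporaryFile(delete=False) as f:
    f.write(b"x" * 2500)  # a 2500-byte throwaway file

chunker = FileChunker(f.name, chunk_size=1000)
print(chunker.n_parts)                                               # 3
print([chunker.get_chunk_size(i) for i in range(chunker.n_parts)])   # [1000, 1000, 500]
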
geoseeq/result/file_download.py CHANGED
@@ -12,10 +12,10 @@ from geoseeq.constants import FIVE_MB
 logger = logging.getLogger("geoseeq_api") # Same name as calling module
 
 
-def _download_head(url, filename, head=None, progress_tracker=None):
+def _download_head(url, filename, head=None, start=0, progress_tracker=None):
     headers = None
     if head and head > 0:
-        headers = {"Range": f"bytes=0-{head}"}
+        headers = {"Range": f"bytes={start}-{head}"}
     response = requests.get(url, stream=True, headers=headers)
     response.raise_for_status()
     total_size_in_bytes = int(response.headers.get('content-length', 0))
@@ -67,7 +67,6 @@ def download_url(url, kind='guess', filename=None, head=None, progress_tracker=N
         raise ValueError(f"Unknown download kind: {kind}")
 
 
-
 class ResultFileDownload:
     """Abstract class that handles download methods for result files."""
 
geoseeq/result/file_upload.py CHANGED
@@ -13,130 +13,21 @@ from geoseeq.utils import md5_checksum
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from .utils import *
 from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
-
-class FileChunker:
-
-    def __init__(self, filepath, chunk_size):
-        self.filepath = filepath
-        self.chunk_size = chunk_size
-        self.file_size = getsize(filepath)
-        self.n_parts = int(self.file_size / self.chunk_size) + 1
-        self.loaded_parts = []
-
-    def load_all_chunks(self):
-        if len(self.loaded_parts) != self.n_parts:
-            with open(self.filepath, "rb") as f:
-                f.seek(0)
-                for i in range(self.n_parts):
-                    chunk = f.read(self.chunk_size)
-                    self.loaded_parts.append(chunk)
-        return self  # convenience for chaining
-
-    def chunk_is_preloaded(self, num):
-        return len(self.loaded_parts) > num and self.loaded_parts[num]
-
-    def read_one_chunk(self, num):
-        if not self.chunk_is_preloaded(num):
-            logger.debug(f"Reading chunk {num} from {self.filepath}")
-            with open(self.filepath, "rb") as f:
-                f.seek(num * self.chunk_size)
-                chunk = f.read(self.chunk_size)
-                return chunk
-        return self.loaded_parts[num]
-
-    def get_chunk(self, num):
-        if self.chunk_is_preloaded(num):
-            return self.loaded_parts[num]
-        return self.read_one_chunk(num)
-
-    def get_chunk_size(self, num):
-        if num < (self.n_parts - 1):  # all but the last chunk
-            return self.chunk_size
-        if self.chunk_is_preloaded(num):  # last chunk, pre-loaded
-            return len(self.loaded_parts[num])
-        return len(self.read_one_chunk(num))  # last chunk, not pre-loaded
-
-
-class ResumableUploadTracker:
-
-    def __init__(self, filepath, chunk_size, tracker_file_prefix="gs_resumable_upload_tracker"):
-        self.open, self.upload_started = True, False
-        self.upload_id, self.urls = None, None
-        self.filepath = filepath
-        self.tracker_file = join(
-            GEOSEEQ_CACHE_DIR, 'upload',
-            tracker_file_prefix + f".{chunk_size}.{getsize(filepath)}." + basename(filepath)
-        )
-        try:
-            os.makedirs(dirname(self.tracker_file), exist_ok=True)
-        except Exception as e:
-            logger.warning(f'Could not create resumable upload tracker directory. {e}')
-            self.open = False
-        self._loaded_parts = {}
-        self._load_parts_from_file()
-
-    def start_upload(self, upload_id, urls):
-        if not self.open:
-            return
-        if self.upload_started:
-            raise GeoseeqGeneralError("Upload has already started.")
-        blob = dict(upload_id=upload_id, urls=urls, start_time=time.time())
-        serialized = json.dumps(blob)
-        with open(self.tracker_file, "w") as f:
-            f.write(serialized + "\n")
-        self.upload_id, self.urls = upload_id, urls
-        self.upload_started = True
-
-    def add_part(self, part_upload_info):
-        if not self.open:
-            return
-        part_id = part_upload_info["PartNumber"]
-        serialized = json.dumps(part_upload_info)
-        with open(self.tracker_file, "a") as f:
-            f.write(serialized + "\n")
-        self._loaded_parts[part_id] = part_upload_info
-        if len(self._loaded_parts) == len(self.urls):
-            self.cleanup()
-            self.open = False
-
-    def _load_parts_from_file(self):
-        if not isfile(self.tracker_file):
-            return
-        with open(self.tracker_file, "r") as f:
-            header_blob = json.loads(f.readline())
-            self.upload_id, self.urls = header_blob["upload_id"], header_blob["urls"]
-            start_time = header_blob["start_time"]
-            if (time.time() - start_time) > (60 * 60 * 23):
-                logger.warning(f"Tracker file {self.tracker_file} is too old. Deleting.")
-                os.remove(self.tracker_file)
-                return
-            self.upload_started = True
-            for line in f:
-                blob = json.loads(line)
-                part_id = blob["PartNumber"]
-                self._loaded_parts[part_id] = blob
-
-    def part_has_been_uploaded(self, part_number):
-        if not self.open:
-            return False
-        return part_number in self._loaded_parts
-
-    def get_part_info(self, part_number):
-        return self._loaded_parts[part_number]
-
-    def cleanup(self):
-        if not self.open:
-            return
-        try:
-            os.remove(self.tracker_file)
-        except FileNotFoundError:
-            pass
+from .file_chunker import FileChunker
+from .resumable_upload_tracker import ResumableUploadTracker
 
 
 class ResultFileUpload:
     """Abstract class that handles upload methods for result files."""
 
-    def _create_multipart_upload(self, filepath, file_size, optional_fields):
+    def _result_type(self, atomic=False):
+        if self.is_sample_result:
+            return "sample"
+        if atomic:
+            return "project"
+        return "group"
+
+    def _create_multipart_upload(self, filepath, file_size, optional_fields, atomic=False):
         optional_fields = optional_fields if optional_fields else {}
         optional_fields.update(
             {
@@ -147,23 +38,31 @@ class ResultFileUpload:
         data = {
             "filename": basename(filepath),
             "optional_fields": optional_fields,
-            "result_type": "sample" if self.is_sample_result else "group",
+            "result_type": self._result_type(atomic),
         }
-        response = self.knex.post(f"/ar_fields/{self.uuid}/create_upload", json=data)
+        url = f"/ar_fields/{self.uuid}/create_upload"
+        if atomic:
+            data["fieldname"] = self.name
+            url = f"/ars/{self.parent.uuid}/create_atomic_upload"
+        response = self.knex.post(url, json=data)
         return response
 
-    def _prep_multipart_upload(self, filepath, file_size, chunk_size, optional_fields):
+    def _prep_multipart_upload(self, filepath, file_size, chunk_size, optional_fields, atomic=False):
         n_parts = int(file_size / chunk_size) + 1
-        response = self._create_multipart_upload(filepath, file_size, optional_fields)
+        response = self._create_multipart_upload(filepath, file_size, optional_fields, atomic=atomic)
         upload_id = response["upload_id"]
-        parts = list(range(1, n_parts + 1))
         data = {
-            "parts": parts,
+            "parts": list(range(1, n_parts + 1)),
             "stance": "upload-multipart",
             "upload_id": upload_id,
-            "result_type": "sample" if self.is_sample_result else "group",
+            "result_type": self._result_type(atomic),
         }
-        response = self.knex.post(f"/ar_fields/{self.uuid}/create_upload_urls", json=data)
+        url = f"/ar_fields/{self.uuid}/create_upload_urls"
+        if atomic:
+            data["uuid"] = response["uuid"]
+            data["fieldname"] = self.name
+            url = f"ars/{self.parent.uuid}/create_atomic_upload_urls"
+        response = self.knex.post(url, json=data)
         urls = response
         return upload_id, urls
 
@@ -175,6 +74,7 @@ class ResultFileUpload:
         attempts = 0
         while attempts < max_retries:
             try:
+                # url = url.replace("s3.wasabisys.com", "s3.us-east-1.wasabisys.com")
                 logger.debug(f"Uploading part {num + 1} to {url}. Size: {len(file_chunk)} bytes.")
                 if session:
                     http_response = session.put(url, data=file_chunk)
@@ -192,7 +92,7 @@ class ResultFileUpload:
                 raise e
 
             retry_time = min(8 ** attempts, 120)  # exponential backoff, max 120s
-            retry_time *= 0.8 + (random() * 0.4)  # randomize to avoid thundering herd
+            retry_time *= 0.6 + (random() * 0.8)  # randomize to avoid thundering herd
            logger.debug(f"Retrying upload for part {num + 1} in {retry_time} seconds.")
            time.sleep(retry_time)
 
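The jitter window widens from ±20% to ±40%: attempt `n` now sleeps `min(8**n, 120) * (0.6 + 0.8 * random())` seconds, a uniform draw between 60% and 140% of the capped exponential base. A worked check of the schedule:

from random import random

def retry_delay(attempts: int) -> float:
    base = min(8 ** attempts, 120)        # exponential backoff, capped at 120s
    return base * (0.6 + random() * 0.8)  # uniform jitter in [0.6, 1.4) of the base

for n in range(1, 4):
    print(n, round(retry_delay(n), 1))  # n=1: 4.8-11.2s, n=2: 38.4-89.6s, n=3: 72-168s
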
@@ -203,16 +103,17 @@
             resumable_upload_tracker.add_part(blob)
         return blob
 
-    def _finish_multipart_upload(self, upload_id, complete_parts):
-        response = self.knex.post(
-            f"/ar_fields/{self.uuid}/complete_upload",
-            json={
-                "parts": complete_parts,
-                "upload_id": upload_id,
-                "result_type": "sample" if self.is_sample_result else "group",
-            },
-            json_response=False,
-        )
+    def _finish_multipart_upload(self, upload_id, complete_parts, atomic=False):
+        data = {
+            "parts": complete_parts,
+            "upload_id": upload_id,
+            "result_type": self._result_type(atomic),
+        }
+        url = f"/ar_fields/{self.uuid}/complete_upload"
+        if atomic:
+            data["fieldname"] = self.name
+            url = f"/ars/{self.parent.uuid}/complete_atomic_upload"
+        response = self.knex.post(url, json=data, json_response=False)
         response.raise_for_status()
 
     def _upload_parts(self, file_chunker, urls, max_retries, session, progress_tracker, threads, resumable_upload_tracker=None):
@@ -250,26 +151,38 @@
         filepath,
         file_size,
         optional_fields=None,
-        chunk_size=FIVE_MB,
+        chunk_size=None,
         max_retries=3,
         session=None,
         progress_tracker=None,
         threads=1,
         use_cache=True,
+        use_atomic_upload=False,
     ):
         """Upload a file to S3 using the multipart upload process."""
         logger.info(f"Uploading {filepath} to S3 using multipart upload.")
+        if not chunk_size:
+            chunk_size = FIVE_MB
+            if file_size >= 10 * FIVE_MB:
+                chunk_size = 5 * FIVE_MB
+        logger.debug(f"Using chunk size of {chunk_size} bytes.")
         resumable_upload_tracker = None
         if use_cache and file_size > 10 * FIVE_MB:  # only use resumable upload tracker for larger files
-            resumable_upload_tracker = ResumableUploadTracker(filepath, chunk_size)
+            upload_target_uuid = self.parent.uuid if use_atomic_upload else self.uuid
+            resumable_upload_tracker = ResumableUploadTracker(filepath, chunk_size, upload_target_uuid)
+
         if resumable_upload_tracker and resumable_upload_tracker.upload_started:
+            # a resumable upload for this file has already started
+            resumable_upload_exists_and_is_valid = True
             upload_id, urls = resumable_upload_tracker.upload_id, resumable_upload_tracker.urls
+            use_atomic_upload = resumable_upload_tracker.is_atomic_upload
             logger.info(f'Resuming upload for "{filepath}", upload_id: "{upload_id}"')
         else:
-            upload_id, urls = self._prep_multipart_upload(filepath, file_size, chunk_size, optional_fields)
+            upload_id, urls = self._prep_multipart_upload(filepath, file_size, chunk_size, optional_fields, atomic=use_atomic_upload)
            if resumable_upload_tracker:
                 logger.info(f'Creating new resumable upload for "{filepath}", upload_id: "{upload_id}"')
-                resumable_upload_tracker.start_upload(upload_id, urls)
+                resumable_upload_tracker.start_upload(upload_id, urls, is_atomic_upload=use_atomic_upload)
+
         logger.info(f'Starting upload for "{filepath}"')
         complete_parts = []
         file_chunker = FileChunker(filepath, chunk_size)
@@ -288,14 +201,20 @@
             threads,
             resumable_upload_tracker=resumable_upload_tracker
         )
-        self._finish_multipart_upload(upload_id, complete_parts)
+        self._finish_multipart_upload(upload_id, complete_parts, atomic=use_atomic_upload)
         logger.info(f'Finished Upload for "{filepath}"')
+        if use_atomic_upload:
+            # if this was an atomic upload then this result may not have existed on the server before
+            self.get()
         return self
 
     def upload_file(self, filepath, multipart_thresh=FIVE_MB, overwrite=True, no_new_versions=False, **kwargs):
         if self.exists() and not overwrite:
             raise GeoseeqGeneralError(f"Overwrite is set to False and file {self.uuid} already exists.")
-        self.idem()
+        if not kwargs.get("use_atomic_upload", False):
+            self.idem()
+        else:
+            self.parent.idem()
         if no_new_versions and self.has_downloadable_file():
             raise GeoseeqGeneralError(f"File {self} already has a downloadable file. Not uploading a new version.")
         resolved_path = Path(filepath).resolve()
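The atomic path swaps the per-field endpoints (`/ar_fields/{uuid}/...`) for parent-level endpoints (`/ars/{parent_uuid}/...`), so the file is only materialized server-side once the upload completes; that is why `multipart_upload_file` ends with `self.get()`. Condensed control flow, as a sketch rather than the full implementation (part upload and retries elided):

def sketch_atomic_multipart(result_file, filepath, file_size, chunk_size):
    # 1. create the upload and presigned part URLs via /ars/{parent.uuid}/...
    upload_id, urls = result_file._prep_multipart_upload(
        filepath, file_size, chunk_size, optional_fields=None, atomic=True
    )
    # 2. PUT each chunk to its presigned URL (done by _upload_parts, with retries)
    complete_parts = []  # filled with {"PartNumber": ..., "ETag": ...} blobs
    # 3. complete the upload, which materializes the result file server-side
    result_file._finish_multipart_upload(upload_id, complete_parts, atomic=True)
    result_file.get()  # refresh: the file may not have existed before completion
    return result_file
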
geoseeq/result/resumable_upload_tracker.py ADDED
@@ -0,0 +1,100 @@
+
+import time
+import json
+import os
+from os.path import basename, getsize, join, dirname, isfile, getctime
+from pathlib import Path
+from random import random
+import requests
+
+from geoseeq.knex import GeoseeqGeneralError
+from geoseeq.constants import FIVE_MB
+from geoseeq.utils import md5_checksum
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from .utils import *
+from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
+from .file_chunker import FileChunker
+
+
+class ResumableUploadTracker:
+
+    def __init__(self, filepath, chunk_size, upload_target_uuid, tracker_file_prefix="gs_resumable_upload_tracker"):
+        self.open, self.upload_started = True, False
+        self.upload_id, self.urls, self.is_atomic_upload = None, None, None
+        self.upload_target_uuid = upload_target_uuid
+        self.filepath = filepath
+        self.tracker_file_dir = join(GEOSEEQ_CACHE_DIR, 'upload')
+        self.tracker_file = join(
+            self.tracker_file_dir,
+            tracker_file_prefix + f".{upload_target_uuid}.{chunk_size}.{getsize(filepath)}." + basename(filepath)
+        )
+        try:
+            os.makedirs(self.tracker_file_dir, exist_ok=True)
+        except Exception as e:
+            logger.warning(f'Could not create resumable upload tracker directory. {e}')
+            self.open = False
+        self._loaded_parts = {}
+        self._load_parts_from_file()
+
+    def start_upload(self, upload_id, urls, is_atomic_upload=False):
+        if not self.open:
+            return
+        if self.upload_started:
+            raise GeoseeqGeneralError("Upload has already started.")
+        self.upload_started = True
+        blob = dict(upload_id=upload_id,
+                    urls=urls,
+                    is_atomic_upload=is_atomic_upload,
+                    upload_target_uuid=self.upload_target_uuid,
+                    start_time=time.time())
+        serialized = json.dumps(blob)
+        with open(self.tracker_file, "w") as f:
+            f.write(serialized + "\n")
+        self.upload_id, self.urls, self.is_atomic_upload = upload_id, urls, is_atomic_upload
+
+    def add_part(self, part_upload_info):
+        if not self.open:
+            return
+        part_id = part_upload_info["PartNumber"]
+        serialized = json.dumps(part_upload_info)
+        with open(self.tracker_file, "a") as f:
+            f.write(serialized + "\n")
+        self._loaded_parts[part_id] = part_upload_info
+        if len(self._loaded_parts) == len(self.urls):
+            self.cleanup()
+            self.open = False
+
+    def _load_parts_from_file(self):
+        if not isfile(self.tracker_file):
+            return
+        with open(self.tracker_file, "r") as f:
+            header_blob = json.loads(f.readline())
+            self.upload_id, self.urls, self.is_atomic_upload = (
+                header_blob["upload_id"], header_blob["urls"], header_blob["is_atomic_upload"]
+            )
+            start_time = header_blob["start_time"]
+            if (time.time() - start_time) > (60 * 60 * 23):
+                logger.warning(f"Tracker file {self.tracker_file} is too old. Deleting.")
+                os.remove(self.tracker_file)
+                return
+            self.upload_started = True
+            for line in f:
+                blob = json.loads(line)
+                part_id = blob["PartNumber"]
+                self._loaded_parts[part_id] = blob
+
+    def part_has_been_uploaded(self, part_number):
+        if not self.open:
+            return False
+        return part_number in self._loaded_parts
+
+    def get_part_info(self, part_number):
+        return self._loaded_parts[part_number]
+
+    def cleanup(self):
+        if not self.open:
+            return
+        try:
+            os.remove(self.tracker_file)
+        except FileNotFoundError:
+            pass
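The tracker file is a JSON-lines log: line one describes the upload session, and every subsequent line is one completed part, which is exactly what `_load_parts_from_file` replays on resume. A sketch of the format with placeholder values:

import json

tracker_lines = [
    json.dumps({"upload_id": "abc123", "urls": ["https://part-1", "https://part-2"],
                "is_atomic_upload": True, "upload_target_uuid": "parent-uuid",
                "start_time": 1700000000.0}),
    json.dumps({"PartNumber": 1, "ETag": "etag-1"}),  # part 2 never finished
]
header, *parts = (json.loads(line) for line in tracker_lines)
uploaded = {blob["PartNumber"] for blob in parts}
print(uploaded)  # {1}; on resume only part 2 is re-uploaded
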
geoseeq/upload_download_manager.py CHANGED
@@ -22,8 +22,8 @@ def _upload_one_file(args):
     (result_file, filepath, session, progress_tracker,
      link_type, overwrite, log_level, parallel_uploads,
      use_cache, no_new_versions, threads_per_upload,
-     num_retries, ignore_errors, chunk_size_mb) = args
-    chunk_size = chunk_size_mb * 1024 * 1024
+     num_retries, ignore_errors, chunk_size_mb, use_atomic_upload) = args
+    chunk_size = chunk_size_mb * 1024 * 1024 if chunk_size_mb else None
     if parallel_uploads:
         _make_in_process_logger(log_level)
     try:
@@ -34,6 +34,7 @@ def _upload_one_file(args):
             session=session, overwrite=overwrite, progress_tracker=progress_tracker,
             threads=threads_per_upload, use_cache=use_cache, chunk_size=chunk_size,
             no_new_versions=no_new_versions, max_retries=num_retries,
+            use_atomic_upload=use_atomic_upload
         )
     else:
         result_file.link_file(link_type, filepath)
@@ -59,6 +60,7 @@ class GeoSeeqUploadManager:
             num_retries=3,
             ignore_errors=False,
             chunk_size_mb=5,
+            use_atomic_upload=True,
             use_cache=True):
         self.session = session
         self.n_parallel_uploads = n_parallel_uploads
@@ -73,12 +75,18 @@ class GeoSeeqUploadManager:
         self.num_retries = num_retries
         self.ignore_errors = ignore_errors
         self.chunk_size_mb = chunk_size_mb
+        self.use_atomic_upload = use_atomic_upload
 
     def add_result_file(self, result_file, local_path):
        self._result_files.append((result_file, local_path))
 
     def add_local_file_to_result_folder(self, result_folder, local_path, geoseeq_file_name=None):
-        geoseeq_file_name = geoseeq_file_name if geoseeq_file_name else local_path
+        if not geoseeq_file_name:
+            if local_path.startswith("/"):  # if local path is an absolute path use the basename
+                geoseeq_file_name = basename(local_path)
+            else:
+                # remove "./" and "../" from local path to get a geoseeq file name
+                geoseeq_file_name = local_path.replace("./", "").replace("../", "")
         result_file = result_folder.result_file(geoseeq_file_name)
         self.add_result_file(result_file, local_path)
 
@@ -99,7 +107,7 @@ class GeoSeeqUploadManager:
             self.link_type, self.overwrite, self.log_level,
             self.n_parallel_uploads > 1, self.use_cache, self.no_new_versions,
             self.threads_per_upload, self.num_retries, self.ignore_errors,
-            self.chunk_size_mb,
+            self.chunk_size_mb, self.use_atomic_upload
         ) for result_file, local_path in self._result_files
         ]
         out = []
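The new naming rule keeps directory structure for relative paths but collapses absolute paths to a basename. Note that `str.replace` strips `./` and `../` anywhere in the string, not only as leading segments. A behavior check of the rule exactly as written:

from os.path import basename

def geoseeq_name(local_path: str) -> str:
    if local_path.startswith("/"):  # absolute path: use the basename
        return basename(local_path)
    return local_path.replace("./", "").replace("../", "")  # relative: strip ./ and ../

print(geoseeq_name("/tmp/reads/sample1.fastq.gz"))  # sample1.fastq.gz
print(geoseeq_name("./reads/sample1.fastq.gz"))     # reads/sample1.fastq.gz
print(geoseeq_name("../reads/sample1.fastq.gz"))    # reads/sample1.fastq.gz
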
geoseeq-0.5.6a15.dist-info/METADATA → geoseeq-0.6.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geoseeq
-Version: 0.5.6a15
+Version: 0.6.0
 Summary: GeoSeeq command line tools and python API
 Author: David C. Danko
 Author-email: "David C. Danko" <dcdanko@biotia.io>
geoseeq-0.5.6a15.dist-info/RECORD → geoseeq-0.6.0.dist-info/RECORD RENAMED
@@ -11,7 +11,7 @@ geoseeq/project.py,sha256=-9Y2ik0-BpT3iqh89v8VQBbdadhI58oaUP9oZK8oetc,13741
 geoseeq/remote_object.py,sha256=Es-JlAz8iLRmCpAzh1MOwUh2MqtbuQM-p8wHIBAqNlQ,7131
 geoseeq/sample.py,sha256=whgEVk6GnDJJLjn5uTOqFqRtVxZD3BgjTo7brAC5noU,7981
 geoseeq/search.py,sha256=gawad6Cx5FxJBPlYkXWb-UKAO-UC0_yhvyU9Ca1kaNI,3388
-geoseeq/upload_download_manager.py,sha256=2WM1yvEseMxjrLsE_tNMvt0cCldMScAOuHxj2l0ICrc,8248
+geoseeq/upload_download_manager.py,sha256=DNI4nce0MCds-wGoTA3fP_msz3kGOAoJNItoUv7L0uQ,8751
 geoseeq/user.py,sha256=tol8i1UGLRrbMw5jeJDnna1ikRgrCDd50Jxz0a1lSgg,690
 geoseeq/utils.py,sha256=PDRiEQIZYTcfEV9AYvloQVvfqs5JaebcFZodAa2SUW8,3577
 geoseeq/work_orders.py,sha256=5uLVVfdKE8qh4gGaHkdBpXJGRTujuSg59knWCqEET4A,8071
@@ -19,12 +19,14 @@ geoseeq/cli/__init__.py,sha256=4WnK87K5seRK3SGJAxNWnQTqyg5uBhdhrOrzB1D4b3M,24
 geoseeq/cli/constants.py,sha256=Do5AUf9lMO9_P8KpFJ3XwwFBAWsxSjZ6sx9_QEGyC_c,176
 geoseeq/cli/copy.py,sha256=02U9kdrAIbbM8MlRMLL6p-LMYFSuRObE3h5jyvcL__M,2275
 geoseeq/cli/detail.py,sha256=q8Suu-j2k18knfSVFG-SWWGNsKM-n8y9RMA3LcIIi9Y,4132
-geoseeq/cli/download.py,sha256=_upzZo08K0fAPbEsyi1uN0HGNUaY1pl6OoGPcWmvSUY,17765
+geoseeq/cli/download.py,sha256=N_Wrg9d1kY9eJ6C1l0xc_YFjiri8gkXBo9JiuHx9xxE,17766
 geoseeq/cli/fastq_utils.py,sha256=-bmeQLaiMBm57zWOF0R5OlWTU0_3sh1JBC1RYw2BOFM,3083
 geoseeq/cli/get_eula.py,sha256=79mbUwyiF7O1r0g6UTxG9kJGQEqKuH805E6eLkPC6Y4,997
-geoseeq/cli/main.py,sha256=ExPXiBu0SIphQx1pjpmTVGyPPrkEVM7pHKwyi_U5Eao,3260
+geoseeq/cli/main.py,sha256=Vze6p8cNGsMQmsr5bkhglOxWPIPqxk0BM6417iKvhb4,3791
 geoseeq/cli/manage.py,sha256=wGXAcVaXqE5JQEU8Jh6OlHr02nB396bpS_SFcOZdrEo,5929
 geoseeq/cli/progress_bar.py,sha256=p1Xl01nkYxSBZCB30ue2verIIi22W93m3ZAMAxipD0g,738
+geoseeq/cli/project.py,sha256=V5SdXm2Hwo2lxrkpwRDedw-mAE4XnM2uwT-Gj1D90VQ,3030
+geoseeq/cli/raw.py,sha256=EASkIBr3AhBg6FOiLElw8Nuj_okQqf9vBXLdow7JQGw,1884
 geoseeq/cli/run.py,sha256=bx2AV6VIqOSTlxUda78xl0XxcZ8TXlQx02-e7iLQPwI,3838
 geoseeq/cli/search.py,sha256=wgyprEf_Tm5i_rYl9KTxrmFrD4ohy7qS5ttjg7V3xRY,1015
 geoseeq/cli/user.py,sha256=fARv_5vx-QYT765Hsjgwr6J5ddA_viCPQdv9iUoVX6w,925
@@ -37,9 +39,9 @@ geoseeq/cli/shared_params/id_handlers.py,sha256=501K9sCVkI0YGDQ62vXk_DM5lMMDrdB5
 geoseeq/cli/shared_params/obj_getters.py,sha256=ZSkt6LnDkVFlNVYKgLrjzg60-6BthZMr3eeD3HNqzac,2741
 geoseeq/cli/shared_params/opts_and_args.py,sha256=LrDkv9WtUryM4uUMXPRk04-EBcTQ7q5V6Yu-XRDUvvA,2083
 geoseeq/cli/upload/__init__.py,sha256=3C9_S9t7chmYU-2ot89NV03x-EtmsjibulErKaU9w1k,627
-geoseeq/cli/upload/upload.py,sha256=iZT_4M2I-muUo8cD5exLE49DrKjYuJA-xVKV69N0PB8,9978
+geoseeq/cli/upload/upload.py,sha256=JZkhe1q3KOp7-tKyzwi860TQhZoNDnZs4yB2PJhOjl0,10081
 geoseeq/cli/upload/upload_advanced.py,sha256=Jq5eGe-wOdrzxGWVwaFPg0BAJcW0YSx_eHEmYjJeKuA,3434
-geoseeq/cli/upload/upload_reads.py,sha256=EMGqyZf11xwN4v2j8gNxMagTbE4kaOd-_hwupmg5I-8,10670
+geoseeq/cli/upload/upload_reads.py,sha256=dvmOVq0lJSbpQDyWkNEnZmhkMvfEByV-i8xD75Ai4dA,10706
 geoseeq/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geoseeq/contrib/ncbi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geoseeq/contrib/ncbi/api.py,sha256=WQeLoGA_-Zha-QeSO8_i7HpvXyD8UkV0qc5okm11KiA,1056
@@ -63,10 +65,12 @@ geoseeq/plotting/map/map.py,sha256=h2QPLGqe-SamhfaTij53S9cQIiO8orCJUAUh0hRicSM,3
 geoseeq/plotting/map/overlay.py,sha256=4VmxqOESTQra9tPr8b8OLEUhJSit9lNipabeSznEYwE,1795
 geoseeq/result/__init__.py,sha256=IFHIyRV8ZzuKIfwfze1SXgcKwNMcSgMAknLHMkwjXIU,356
 geoseeq/result/bioinfo.py,sha256=QQtbyogrdro9avJSN0713sxLVnVeA24mFw3hWtKDKyw,1782
-geoseeq/result/file_download.py,sha256=vbYo2B4JshTIqLaklcgcBb7NY9cD5pMkas95GuQxW8s,5776
-geoseeq/result/file_upload.py,sha256=qkJug2ptmfBwmuz577DtPG_Z7eLTtatJzycSjE1UTXk,13430
+geoseeq/result/file_chunker.py,sha256=bXq1csuRtqMB5sbH-AfWo6gdPwrivv5DJPuHVj-h08w,1758
+geoseeq/result/file_download.py,sha256=gV9-C_CMPpOWYi21eagsoiri53yzRHQx351nLBUj4WM,5790
+geoseeq/result/file_upload.py,sha256=xs1DrI-h4ZP7xN8HPBc3SFpcPAxR5HAolraP1Zu7tvE,10648
 geoseeq/result/result_file.py,sha256=1Yj9fkZhds3J-tay6eNH2-EHi00MovHGV1M80_ckHD8,8677
 geoseeq/result/result_folder.py,sha256=6porOXPh7Tpxw3oX5yMRPYQzNCGYqszqmFJd3SwQmTc,11122
+geoseeq/result/resumable_upload_tracker.py,sha256=2aI09gYz2yw63jEXqs8lmCRKQ79TIc3YuPETvP0Jeek,3811
 geoseeq/result/utils.py,sha256=C-CxGzB3WddlnRiqFSkrY78I_m0yFgNqsTBRzGU-y8Q,2772
 geoseeq/vc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geoseeq/vc/checksum.py,sha256=y8rh1asUZNbE_NLiFO0-9hImLNiTOc2YXQBRKORWK7k,710
@@ -80,9 +84,9 @@ geoseeq/vc/vc_stub.py,sha256=IQr8dI0zsWKVAeY_5ybDD6n49_3othcgfHS3P0O9tuY,3110
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_api_client.py,sha256=TS5njc5pcPP_Ycy-ljcfPVT1hQRBsFVdQ0lCqBmoesU,12810
 tests/test_plotting.py,sha256=TcTu-2ARr8sxZJ7wPQxmbs3-gHw7uRvsgrhhhg0qKik,784
-geoseeq-0.5.6a15.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
-geoseeq-0.5.6a15.dist-info/METADATA,sha256=REa9myn8L8WcbxLg0BXH1ucW-A6OtEclEHEb2Usr-0g,4806
-geoseeq-0.5.6a15.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-geoseeq-0.5.6a15.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
-geoseeq-0.5.6a15.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
-geoseeq-0.5.6a15.dist-info/RECORD,,
+geoseeq-0.6.0.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
+geoseeq-0.6.0.dist-info/METADATA,sha256=mDqowxeSFM0nNuY_354pumCtTUpztbhhRe1Dv2rqn5g,4803
+geoseeq-0.6.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+geoseeq-0.6.0.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
+geoseeq-0.6.0.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
+geoseeq-0.6.0.dist-info/RECORD,,