geoseeq 0.5.6a16__py3-none-any.whl → 0.6.0__py3-none-any.whl
- geoseeq/cli/download.py +1 -0
- geoseeq/cli/main.py +3 -1
- geoseeq/cli/project.py +96 -0
- geoseeq/cli/raw.py +59 -0
- geoseeq/cli/upload/upload.py +2 -0
- geoseeq/cli/upload/upload_reads.py +1 -0
- geoseeq/result/file_chunker.py +50 -0
- geoseeq/result/file_download.py +2 -3
- geoseeq/result/file_upload.py +55 -142
- geoseeq/result/resumable_upload_tracker.py +100 -0
- geoseeq/upload_download_manager.py +11 -3
- {geoseeq-0.5.6a16.dist-info → geoseeq-0.6.0.dist-info}/METADATA +1 -1
- {geoseeq-0.5.6a16.dist-info → geoseeq-0.6.0.dist-info}/RECORD +17 -13
- {geoseeq-0.5.6a16.dist-info → geoseeq-0.6.0.dist-info}/LICENSE +0 -0
- {geoseeq-0.5.6a16.dist-info → geoseeq-0.6.0.dist-info}/WHEEL +0 -0
- {geoseeq-0.5.6a16.dist-info → geoseeq-0.6.0.dist-info}/entry_points.txt +0 -0
- {geoseeq-0.5.6a16.dist-info → geoseeq-0.6.0.dist-info}/top_level.txt +0 -0
geoseeq/cli/download.py
CHANGED
geoseeq/cli/main.py
CHANGED
@@ -18,6 +18,7 @@ from .shared_params.opts_and_args import overwrite_option, yes_option
 from .detail import cli_detail
 from .run import cli_app
 from .get_eula import cli_eula
+from .project import cli_project
 
 logger = logging.getLogger('geoseeq_api')
 handler = logging.StreamHandler()
@@ -53,7 +54,7 @@ def version():
     Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
     Run `geoseeq eula show` to view the EULA.
     """
-    click.echo('0.5.6a16')
+    click.echo('0.6.0') # remember to update setup
 
 
 @main.group('advanced')
@@ -65,6 +66,7 @@ cli_advanced.add_command(cli_copy)
 cli_advanced.add_command(cli_user)
 cli_advanced.add_command(cli_detail)
 cli_advanced.add_command(cli_upload_advanced)
+cli_advanced.add_command(cli_project)
 
 @cli_advanced.group('experimental')
 def cli_experimental():
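A minimal, self-contained sketch of what the two added lines above wire up: the new `project` group is registered under the existing `advanced` group, so its subcommands become reachable as `geoseeq advanced project ...`. The group bodies here are stand-ins, not the package's actual definitions.

    import click

    @click.group()
    def main():
        pass

    @main.group('advanced')
    def cli_advanced():
        pass

    @click.group('project')
    def cli_project():
        """Stand-in for the group defined in geoseeq/cli/project.py."""
        pass

    # Mirrors the added line in main.py
    cli_advanced.add_command(cli_project)

    if __name__ == '__main__':
        main()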
geoseeq/cli/project.py
ADDED
@@ -0,0 +1,96 @@
+import json
+import logging
+from os import makedirs, getcwd
+from os.path import dirname, join
+
+import click
+import pandas as pd
+from multiprocessing import Pool
+from .shared_params import (
+    handle_project_id,
+    handle_folder_id,
+    project_id_arg,
+    sample_ids_arg,
+    handle_multiple_sample_ids,
+    handle_multiple_result_file_ids,
+    use_common_state,
+    flatten_list_of_els_and_files,
+    yes_option,
+    module_option,
+    ignore_errors_option,
+    folder_ids_arg,
+)
+from geoseeq.result.file_download import download_url
+from geoseeq.utils import download_ftp
+from geoseeq.id_constructors import (
+    result_file_from_uuid,
+    result_file_from_name,
+)
+from geoseeq.knex import GeoseeqNotFoundError
+from .progress_bar import PBarManager
+from .utils import convert_size
+from geoseeq.constants import FASTQ_MODULE_NAMES
+from geoseeq.result import ResultFile
+from geoseeq.upload_download_manager import GeoSeeqDownloadManager
+from geoseeq.file_system.filesystem_download import (
+    ProjectOnFilesystem,
+    FILE_STATUS_MODIFIED_REMOTE,
+    FILE_STATUS_MODIFIED_LOCAL,
+    FILE_STATUS_NEW_LOCAL,
+    FILE_STATUS_NEW_REMOTE,
+    FILE_STATUS_IS_LOCAL_STUB,
+)
+
+
+logger = logging.getLogger('geoseeq_api')
+
+
+@click.group("project")
+def cli_project():
+    """Download data from GeoSeeq."""
+    pass
+
+
+@cli_project.command("clone")
+@use_common_state
+@click.option('--use-stubs/--full-files', default=True, help='Download full files or stubs')
+@click.option('--target-dir', '-d', default=None, help='Directory to download the project to')
+@project_id_arg
+def cli_clone_project(state, use_stubs, target_dir, project_id):
+    """Clone a project to the local filesystem.
+    """
+    knex = state.get_knex().set_auth_required()
+    proj = handle_project_id(knex, project_id)
+    logger.info(f"Found project \"{proj.name}\"")
+    if target_dir is None:
+        target_dir = proj.name
+
+    project = ProjectOnFilesystem(proj, target_dir)
+    project.download(use_stubs=use_stubs)
+
+
+@cli_project.command("status")
+@use_common_state
+def cli_project_status(state):
+    """Check the status of a project on the local filesystem.
+    """
+    project = ProjectOnFilesystem.from_path(getcwd(), recursive=True)
+
+    objs_by_status = {
+        FILE_STATUS_MODIFIED_LOCAL: [],
+        FILE_STATUS_MODIFIED_REMOTE: [],
+        FILE_STATUS_NEW_LOCAL: [],
+        FILE_STATUS_NEW_REMOTE: [],
+        FILE_STATUS_IS_LOCAL_STUB: [],
+    }
+    for obj_type, status, local_path, obj in project.list_abnormal_objects():
+        objs_by_status[status].append((obj_type, local_path, obj))
+
+    print(f"Project: {project.project.name}")
+    for status, objs in objs_by_status.items():
+        print(f"Status: {status}")
+        for obj_type, local_path, obj in objs:
+            if status in (FILE_STATUS_MODIFIED_LOCAL, FILE_STATUS_NEW_LOCAL):
+                print(f"  {obj_type}: {project.path_from_project_root(local_path)} -> {obj}")
+            else:
+                print(f"  {obj_type}: {obj} -> {project.path_from_project_root(local_path)}")
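A hedged sketch of the clone flow above run outside of click; the knex construction is elided and the project id is a placeholder, so treat this as an illustration of the call order, not a verified snippet.

    from geoseeq.cli.shared_params import handle_project_id
    from geoseeq.file_system.filesystem_download import ProjectOnFilesystem

    knex = ...  # an authenticated Knex, normally supplied by the CLI's common state
    proj = handle_project_id(knex, "<project-id-or-grn>")  # placeholder id
    local = ProjectOnFilesystem(proj, proj.name)  # clone into ./<project name>
    local.download(use_stubs=True)                # stubs by default, like --use-stubs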
geoseeq/cli/raw.py
ADDED
@@ -0,0 +1,59 @@
+import click
+import json
+from .shared_params import use_common_state, overwrite_option
+from geoseeq import GeoseeqNotFoundError
+from geoseeq.blob_constructors import (
+    sample_result_file_from_uuid,
+    project_result_file_from_uuid,
+    sample_result_folder_from_uuid,
+    project_result_folder_from_uuid,
+)
+
+
+@click.group('raw')
+def cli_raw():
+    """Low-level commands for interacting with the API."""
+    pass
+
+
+@cli_raw.command('get-file-data')
+@use_common_state
+@click.argument('file_ids', nargs=-1)
+def cli_get_file_data(state, file_ids):
+    """Print the raw stored data in a result file object."""
+    knex = state.get_knex()
+    for file_id in file_ids:
+        file_id = file_id.split(':')[-1]
+        try:
+            result_file = sample_result_file_from_uuid(knex, file_id)
+        except GeoseeqNotFoundError:
+            result_file = project_result_file_from_uuid(knex, file_id)
+        print(json.dumps(result_file.stored_data, indent=2), file=state.outfile)
+
+
+@cli_raw.command('create-raw-file')
+@use_common_state
+@overwrite_option
+@click.argument('folder_id')
+@click.argument('result_filename')
+@click.argument('filename', type=click.File('r'))
+def cli_get_file_data(state, overwrite, folder_id, result_filename, filename):
+    """Print the raw stored data in a result file object."""
+    knex = state.get_knex()
+
+    folder_id = folder_id.split(':')[-1]
+    try:
+        result_folder = sample_result_folder_from_uuid(knex, folder_id)
+    except GeoseeqNotFoundError:
+        result_folder = project_result_folder_from_uuid(knex, folder_id)
+    blob = json.load(filename)
+    result_file = result_folder.result_file(result_filename)
+    if overwrite:
+        result_file.idem()
+        result_file.stored_data = blob
+        result_file.save()
+    else:
+        result_file.create()
+    click.echo(f'Created file {result_file.uuid}', file=state.outfile)
+
+
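A hedged sketch of what `geoseeq raw get-file-data <id>` does programmatically; the id is a placeholder and the knex construction is again elided. All imports below appear in the new module itself.

    import json
    from geoseeq import GeoseeqNotFoundError
    from geoseeq.blob_constructors import (
        sample_result_file_from_uuid,
        project_result_file_from_uuid,
    )

    knex = ...  # an authenticated Knex
    file_id = "grn:gsr1:<uuid>".split(':')[-1]  # GRN prefixes are stripped, only the UUID is used
    try:
        result_file = sample_result_file_from_uuid(knex, file_id)
    except GeoseeqNotFoundError:  # fall back to project-scoped files
        result_file = project_result_file_from_uuid(knex, file_id)
    print(json.dumps(result_file.stored_data, indent=2))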
geoseeq/cli/upload/upload.py
CHANGED
@@ -122,6 +122,7 @@ def cli_upload_file(state, cores, threads_per_upload, num_retries, chunk_size_mb
         use_cache=state.use_cache,
         num_retries=num_retries,
         ignore_errors=ignore_errors,
+        use_atomic_upload=True,
         session=None, #knex.new_session(),
         chunk_size_mb=chunk_size_mb if chunk_size_mb > 0 else None,
     )
@@ -160,6 +161,7 @@ def cli_upload_folder(state, cores, yes, private, recursive, hidden, no_new_vers
         overwrite=True,
         use_cache=state.use_cache,
         no_new_versions=no_new_versions,
+        use_atomic_upload=True,
     )
     for folder_name in folder_names:
         result_folder = root_obj.result_folder(folder_name).idem()
geoseeq/cli/upload/upload_reads.py
CHANGED
@@ -98,6 +98,7 @@ def _do_upload(groups, module_name, link_type, lib, filepaths, overwrite, no_new
         progress_tracker_factory=PBarManager().get_new_bar,
         use_cache=state.use_cache,
         no_new_versions=no_new_versions,
+        use_atomic_upload=True,
     )
     for group in groups:
         sample = lib.sample(group['sample_name']).idem()
geoseeq/result/file_chunker.py
ADDED
@@ -0,0 +1,50 @@
+
+from os.path import getsize
+import logging
+
+logger = logging.getLogger("geoseeq_api")  # Same name as calling module
+logger.addHandler(logging.NullHandler())
+
+
+class FileChunker:
+
+    def __init__(self, filepath, chunk_size):
+        self.filepath = filepath
+        self.chunk_size = chunk_size
+        self.file_size = getsize(filepath)
+        self.n_parts = int(self.file_size / self.chunk_size) + 1
+        self.loaded_parts = []
+
+    def load_all_chunks(self):
+        if len(self.loaded_parts) != self.n_parts:
+            with open(self.filepath, "rb") as f:
+                f.seek(0)
+                for i in range(self.n_parts):
+                    chunk = f.read(self.chunk_size)
+                    self.loaded_parts.append(chunk)
+        return self  # convenience for chaining
+
+    def chunk_is_preloaded(self, num):
+        return len(self.loaded_parts) > num and self.loaded_parts[num]
+
+    def read_one_chunk(self, num):
+        if not self.chunk_is_preloaded(num):
+            logger.debug(f"Reading chunk {num} from {self.filepath}")
+            with open(self.filepath, "rb") as f:
+                f.seek(num * self.chunk_size)
+                chunk = f.read(self.chunk_size)
+                return chunk
+        return self.loaded_parts[num]
+
+    def get_chunk(self, num):
+        if self.chunk_is_preloaded(num):
+            return self.loaded_parts[num]
+        return self.read_one_chunk(num)
+
+    def get_chunk_size(self, num):
+        if num < (self.n_parts - 1):  # all but the last chunk
+            return self.chunk_size
+        if self.chunk_is_preloaded(num):  # last chunk, pre-loaded
+            return len(self.loaded_parts[num])
+        return len(self.read_one_chunk(num))  # last chunk, not pre-loaded
+
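A quick, runnable sketch (not part of the diff) of how FileChunker slices a file for multipart upload; the temporary file exists only for illustration.

    import tempfile
    from geoseeq.result.file_chunker import FileChunker

    FIVE_MB = 5 * 1024 * 1024
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(b"x" * (FIVE_MB + 123))  # just over one chunk

    chunker = FileChunker(tmp.name, chunk_size=FIVE_MB)
    print(chunker.n_parts)            # 2
    print(chunker.get_chunk_size(0))  # 5242880: every chunk but the last is full-sized
    print(chunker.get_chunk_size(1))  # 123: the last chunk is read to measure it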
geoseeq/result/file_download.py
CHANGED
@@ -12,10 +12,10 @@ from geoseeq.constants import FIVE_MB
 logger = logging.getLogger("geoseeq_api")  # Same name as calling module
 
 
-def _download_head(url, filename, head=None, progress_tracker=None):
+def _download_head(url, filename, head=None, start=0, progress_tracker=None):
     headers = None
     if head and head > 0:
-        headers = {"Range": f"bytes=0-{head}"}
+        headers = {"Range": f"bytes={start}-{head}"}
     response = requests.get(url, stream=True, headers=headers)
     response.raise_for_status()
     total_size_in_bytes = int(response.headers.get('content-length', 0))
@@ -67,7 +67,6 @@ def download_url(url, kind='guess', filename=None, head=None, progress_tracker=N
     raise ValueError(f"Unknown download kind: {kind}")
 
 
-
 class ResultFileDownload:
     """Abstract class that handles download methods for result files."""
 
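The new `start` parameter generalizes the previously fixed beginning of the byte range. A hedged illustration (placeholder URL) of the Range header `_download_head` now builds:

    import requests

    url = "https://example.com/big-file"   # placeholder
    headers = {"Range": "bytes=100-1099"}  # what start=100, head=1099 produce
    resp = requests.get(url, stream=True, headers=headers)
    # A server that honors Range replies 206 Partial Content with 1000 bytes.
    print(resp.status_code, resp.headers.get("content-length"))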
geoseeq/result/file_upload.py
CHANGED
@@ -13,130 +13,21 @@ from geoseeq.utils import md5_checksum
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from .utils import *
 from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
-
-class FileChunker:
-
-    def __init__(self, filepath, chunk_size):
-        self.filepath = filepath
-        self.chunk_size = chunk_size
-        self.file_size = getsize(filepath)
-        self.n_parts = int(self.file_size / self.chunk_size) + 1
-        self.loaded_parts = []
-
-    def load_all_chunks(self):
-        if len(self.loaded_parts) != self.n_parts:
-            with open(self.filepath, "rb") as f:
-                f.seek(0)
-                for i in range(self.n_parts):
-                    chunk = f.read(self.chunk_size)
-                    self.loaded_parts.append(chunk)
-        return self  # convenience for chaining
-
-    def chunk_is_preloaded(self, num):
-        return len(self.loaded_parts) > num and self.loaded_parts[num]
-
-    def read_one_chunk(self, num):
-        if not self.chunk_is_preloaded(num):
-            logger.debug(f"Reading chunk {num} from {self.filepath}")
-            with open(self.filepath, "rb") as f:
-                f.seek(num * self.chunk_size)
-                chunk = f.read(self.chunk_size)
-                return chunk
-        return self.loaded_parts[num]
-
-    def get_chunk(self, num):
-        if self.chunk_is_preloaded(num):
-            return self.loaded_parts[num]
-        return self.read_one_chunk(num)
-
-    def get_chunk_size(self, num):
-        if num < (self.n_parts - 1):  # all but the last chunk
-            return self.chunk_size
-        if self.chunk_is_preloaded(num):  # last chunk, pre-loaded
-            return len(self.loaded_parts[num])
-        return len(self.read_one_chunk(num))  # last chunk, not pre-loaded
-
-
-class ResumableUploadTracker:
-
-    def __init__(self, filepath, chunk_size, tracker_file_prefix="gs_resumable_upload_tracker"):
-        self.open, self.upload_started = True, False
-        self.upload_id, self.urls = None, None
-        self.filepath = filepath
-        self.tracker_file = join(
-            GEOSEEQ_CACHE_DIR, 'upload',
-            tracker_file_prefix + f".{chunk_size}.{getsize(filepath)}." + basename(filepath)
-        )
-        try:
-            os.makedirs(dirname(self.tracker_file), exist_ok=True)
-        except Exception as e:
-            logger.warning(f'Could not create resumable upload tracker directory. {e}')
-            self.open = False
-        self._loaded_parts = {}
-        self._load_parts_from_file()
-
-    def start_upload(self, upload_id, urls):
-        if not self.open:
-            return
-        if self.upload_started:
-            raise GeoseeqGeneralError("Upload has already started.")
-        blob = dict(upload_id=upload_id, urls=urls, start_time=time.time())
-        serialized = json.dumps(blob)
-        with open(self.tracker_file, "w") as f:
-            f.write(serialized + "\n")
-        self.upload_id, self.urls = upload_id, urls
-        self.upload_started = True
-
-    def add_part(self, part_upload_info):
-        if not self.open:
-            return
-        part_id = part_upload_info["PartNumber"]
-        serialized = json.dumps(part_upload_info)
-        with open(self.tracker_file, "a") as f:
-            f.write(serialized + "\n")
-        self._loaded_parts[part_id] = part_upload_info
-        if len(self._loaded_parts) == len(self.urls):
-            self.cleanup()
-            self.open = False
-
-    def _load_parts_from_file(self):
-        if not isfile(self.tracker_file):
-            return
-        with open(self.tracker_file, "r") as f:
-            header_blob = json.loads(f.readline())
-            self.upload_id, self.urls = header_blob["upload_id"], header_blob["urls"]
-            start_time = header_blob["start_time"]
-            if (time.time() - start_time) > (60 * 60 * 23):
-                logger.warning(f"Tracker file {self.tracker_file} is too old. Deleting.")
-                os.remove(self.tracker_file)
-                return
-            self.upload_started = True
-            for line in f:
-                blob = json.loads(line)
-                part_id = blob["PartNumber"]
-                self._loaded_parts[part_id] = blob
-
-    def part_has_been_uploaded(self, part_number):
-        if not self.open:
-            return False
-        return part_number in self._loaded_parts
-
-    def get_part_info(self, part_number):
-        return self._loaded_parts[part_number]
-
-    def cleanup(self):
-        if not self.open:
-            return
-        try:
-            os.remove(self.tracker_file)
-        except FileNotFoundError:
-            pass
+from .file_chunker import FileChunker
+from .resumable_upload_tracker import ResumableUploadTracker
 
 
 class ResultFileUpload:
     """Abstract class that handles upload methods for result files."""
 
-    def _create_multipart_upload(self, filepath, file_size, optional_fields):
+    def _result_type(self, atomic=False):
+        if self.is_sample_result:
+            return "sample"
+        if atomic:
+            return "project"
+        return "group"
+
+    def _create_multipart_upload(self, filepath, file_size, optional_fields, atomic=False):
         optional_fields = optional_fields if optional_fields else {}
         optional_fields.update(
             {
@@ -147,23 +38,31 @@ class ResultFileUpload:
         data = {
             "filename": basename(filepath),
             "optional_fields": optional_fields,
-            "result_type": "sample" if self.is_sample_result else "group",
+            "result_type": self._result_type(atomic),
         }
-        response = self.knex.post(f"/ar_fields/{self.uuid}/create_upload", json=data)
+        url = f"/ar_fields/{self.uuid}/create_upload"
+        if atomic:
+            data["fieldname"] = self.name
+            url = f"/ars/{self.parent.uuid}/create_atomic_upload"
+        response = self.knex.post(url, json=data)
         return response
 
-    def _prep_multipart_upload(self, filepath, file_size, chunk_size, optional_fields):
+    def _prep_multipart_upload(self, filepath, file_size, chunk_size, optional_fields, atomic=False):
         n_parts = int(file_size / chunk_size) + 1
-        response = self._create_multipart_upload(filepath, file_size, optional_fields)
+        response = self._create_multipart_upload(filepath, file_size, optional_fields, atomic=atomic)
         upload_id = response["upload_id"]
-        parts = list(range(1, n_parts + 1))
         data = {
-            "parts": parts,
+            "parts": list(range(1, n_parts + 1)),
             "stance": "upload-multipart",
             "upload_id": upload_id,
-            "result_type": "sample" if self.is_sample_result else "group",
+            "result_type": self._result_type(atomic),
        }
-        response = self.knex.post(f"/ar_fields/{self.uuid}/create_upload_urls", json=data)
+        url = f"/ar_fields/{self.uuid}/create_upload_urls"
+        if atomic:
+            data["uuid"] = response["uuid"]
+            data["fieldname"] = self.name
+            url = f"ars/{self.parent.uuid}/create_atomic_upload_urls"
+        response = self.knex.post(url, json=data)
         urls = response
         return upload_id, urls
 
@@ -204,16 +103,17 @@ class ResultFileUpload:
             resumable_upload_tracker.add_part(blob)
         return blob
 
-    def _finish_multipart_upload(self, upload_id, complete_parts):
-        response = self.knex.post(
-            f"/ar_fields/{self.uuid}/complete_upload",
-            json={
-                "parts": complete_parts,
-                "upload_id": upload_id,
-                "result_type": "sample" if self.is_sample_result else "group",
-            },
-            json_response=False,
-        )
+    def _finish_multipart_upload(self, upload_id, complete_parts, atomic=False):
+        data = {
+            "parts": complete_parts,
+            "upload_id": upload_id,
+            "result_type": self._result_type(atomic),
+        }
+        url = f"/ar_fields/{self.uuid}/complete_upload"
+        if atomic:
+            data["fieldname"] = self.name
+            url = f"/ars/{self.parent.uuid}/complete_atomic_upload"
+        response = self.knex.post(url, json=data, json_response=False)
         response.raise_for_status()
 
     def _upload_parts(self, file_chunker, urls, max_retries, session, progress_tracker, threads, resumable_upload_tracker=None):
@@ -257,6 +157,7 @@ class ResultFileUpload:
         progress_tracker=None,
         threads=1,
         use_cache=True,
+        use_atomic_upload=False,
     ):
         """Upload a file to S3 using the multipart upload process."""
         logger.info(f"Uploading {filepath} to S3 using multipart upload.")
@@ -267,15 +168,21 @@ class ResultFileUpload:
         logger.debug(f"Using chunk size of {chunk_size} bytes.")
         resumable_upload_tracker = None
         if use_cache and file_size > 10 * FIVE_MB:  # only use resumable upload tracker for larger files
-            resumable_upload_tracker = ResumableUploadTracker(filepath, chunk_size)
+            upload_target_uuid = self.parent.uuid if use_atomic_upload else self.uuid
+            resumable_upload_tracker = ResumableUploadTracker(filepath, chunk_size, upload_target_uuid)
+
         if resumable_upload_tracker and resumable_upload_tracker.upload_started:
+            # a resumable upload for this file has already started
+            resumable_upload_exists_and_is_valid = True
             upload_id, urls = resumable_upload_tracker.upload_id, resumable_upload_tracker.urls
+            use_atomic_upload = resumable_upload_tracker.is_atomic_upload
             logger.info(f'Resuming upload for "{filepath}", upload_id: "{upload_id}"')
         else:
-            upload_id, urls = self._prep_multipart_upload(filepath, file_size, chunk_size, optional_fields)
+            upload_id, urls = self._prep_multipart_upload(filepath, file_size, chunk_size, optional_fields, atomic=use_atomic_upload)
             if resumable_upload_tracker:
                 logger.info(f'Creating new resumable upload for "{filepath}", upload_id: "{upload_id}"')
-                resumable_upload_tracker.start_upload(upload_id, urls)
+                resumable_upload_tracker.start_upload(upload_id, urls, is_atomic_upload=use_atomic_upload)
+
         logger.info(f'Starting upload for "{filepath}"')
         complete_parts = []
         file_chunker = FileChunker(filepath, chunk_size)
@@ -294,14 +201,20 @@ class ResultFileUpload:
             threads,
             resumable_upload_tracker=resumable_upload_tracker
         )
-        self._finish_multipart_upload(upload_id, complete_parts)
+        self._finish_multipart_upload(upload_id, complete_parts, atomic=use_atomic_upload)
         logger.info(f'Finished Upload for "{filepath}"')
+        if use_atomic_upload:
+            # if this was an atomic upload then this result may not have existed on the server before
+            self.get()
         return self
 
     def upload_file(self, filepath, multipart_thresh=FIVE_MB, overwrite=True, no_new_versions=False, **kwargs):
         if self.exists() and not overwrite:
             raise GeoseeqGeneralError(f"Overwrite is set to False and file {self.uuid} already exists.")
-        self.idem()
+        if not kwargs.get("use_atomic_upload", False):
+            self.idem()
+        else:
+            self.parent.idem()
         if no_new_versions and self.has_downloadable_file():
             raise GeoseeqGeneralError(f"File {self} already has a downloadable file. Not uploading a new version.")
         resolved_path = Path(filepath).resolve()
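A hedged sketch of the new atomic path: with use_atomic_upload=True, upload_file() only ensures the parent folder exists (parent.idem()), the server materializes the result file when the multipart upload completes, and self.get() then refreshes the local object. Object construction is elided here.

    result_folder = ...  # an existing ResultFolder object
    result_file = result_folder.result_file("report.json")  # placeholder name
    result_file.upload_file("./report.json", use_atomic_upload=True)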
geoseeq/result/resumable_upload_tracker.py
ADDED
@@ -0,0 +1,100 @@
+
+import time
+import json
+import os
+from os.path import basename, getsize, join, dirname, isfile, getctime
+from pathlib import Path
+from random import random
+import requests
+
+from geoseeq.knex import GeoseeqGeneralError
+from geoseeq.constants import FIVE_MB
+from geoseeq.utils import md5_checksum
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from .utils import *
+from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
+from .file_chunker import FileChunker
+
+
+class ResumableUploadTracker:
+
+    def __init__(self, filepath, chunk_size, upload_target_uuid, tracker_file_prefix="gs_resumable_upload_tracker"):
+        self.open, self.upload_started = True, False
+        self.upload_id, self.urls, self.is_atomic_upload = None, None, None
+        self.upload_target_uuid = upload_target_uuid
+        self.filepath = filepath
+        self.tracker_file_dir = join(GEOSEEQ_CACHE_DIR, 'upload')
+        self.tracker_file = join(
+            self.tracker_file_dir,
+            tracker_file_prefix + f".{upload_target_uuid}.{chunk_size}.{getsize(filepath)}." + basename(filepath)
+        )
+        try:
+            os.makedirs(self.tracker_file_dir, exist_ok=True)
+        except Exception as e:
+            logger.warning(f'Could not create resumable upload tracker directory. {e}')
+            self.open = False
+        self._loaded_parts = {}
+        self._load_parts_from_file()
+
+    def start_upload(self, upload_id, urls, is_atomic_upload=False):
+        if not self.open:
+            return
+        if self.upload_started:
+            raise GeoseeqGeneralError("Upload has already started.")
+        self.upload_started = True
+        blob = dict(upload_id=upload_id,
+                    urls=urls,
+                    is_atomic_upload=is_atomic_upload,
+                    upload_target_uuid=self.upload_target_uuid,
+                    start_time=time.time())
+        serialized = json.dumps(blob)
+        with open(self.tracker_file, "w") as f:
+            f.write(serialized + "\n")
+        self.upload_id, self.urls, self.is_atomic_upload = upload_id, urls, is_atomic_upload
+
+    def add_part(self, part_upload_info):
+        if not self.open:
+            return
+        part_id = part_upload_info["PartNumber"]
+        serialized = json.dumps(part_upload_info)
+        with open(self.tracker_file, "a") as f:
+            f.write(serialized + "\n")
+        self._loaded_parts[part_id] = part_upload_info
+        if len(self._loaded_parts) == len(self.urls):
+            self.cleanup()
+            self.open = False
+
+    def _load_parts_from_file(self):
+        if not isfile(self.tracker_file):
+            return
+        with open(self.tracker_file, "r") as f:
+            header_blob = json.loads(f.readline())
+            self.upload_id, self.urls, self.is_atomic_upload = (
+                header_blob["upload_id"], header_blob["urls"], header_blob["is_atomic_upload"]
+            )
+            start_time = header_blob["start_time"]
+            if (time.time() - start_time) > (60 * 60 * 23):
+                logger.warning(f"Tracker file {self.tracker_file} is too old. Deleting.")
+                os.remove(self.tracker_file)
+                return
+            self.upload_started = True
+            for line in f:
+                blob = json.loads(line)
+                part_id = blob["PartNumber"]
+                self._loaded_parts[part_id] = blob
+
+    def part_has_been_uploaded(self, part_number):
+        if not self.open:
+            return False
+        return part_number in self._loaded_parts
+
+    def get_part_info(self, part_number):
+        return self._loaded_parts[part_number]
+
+    def cleanup(self):
+        if not self.open:
+            return
+        try:
+            os.remove(self.tracker_file)
+        except FileNotFoundError:
+            pass
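A hedged illustration (values made up) of the tracker file the class writes under GEOSEEQ_CACHE_DIR/upload: one JSON header line, then one JSON line per completed part, which is what _load_parts_from_file reads back to resume.

    import json

    header = {"upload_id": "abc123", "urls": {"1": "https://..."},
              "is_atomic_upload": True, "upload_target_uuid": "<uuid>",
              "start_time": 1700000000.0}
    part = {"PartNumber": 1, "ETag": "\"d41d8cd98f00b204e9800998ecf8427e\""}
    print(json.dumps(header))  # first line of the tracker file
    print(json.dumps(part))    # appended after each uploaded part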
geoseeq/upload_download_manager.py
CHANGED
@@ -22,7 +22,7 @@ def _upload_one_file(args):
     (result_file, filepath, session, progress_tracker,
      link_type, overwrite, log_level, parallel_uploads,
      use_cache, no_new_versions, threads_per_upload,
-     num_retries, ignore_errors, chunk_size_mb) = args
+     num_retries, ignore_errors, chunk_size_mb, use_atomic_upload) = args
     chunk_size = chunk_size_mb * 1024 * 1024 if chunk_size_mb else None
     if parallel_uploads:
         _make_in_process_logger(log_level)
@@ -34,6 +34,7 @@ def _upload_one_file(args):
             session=session, overwrite=overwrite, progress_tracker=progress_tracker,
             threads=threads_per_upload, use_cache=use_cache, chunk_size=chunk_size,
             no_new_versions=no_new_versions, max_retries=num_retries,
+            use_atomic_upload=use_atomic_upload
         )
     else:
         result_file.link_file(link_type, filepath)
@@ -59,6 +60,7 @@ class GeoSeeqUploadManager:
                  num_retries=3,
                  ignore_errors=False,
                  chunk_size_mb=5,
+                 use_atomic_upload=True,
                  use_cache=True):
         self.session = session
         self.n_parallel_uploads = n_parallel_uploads
@@ -73,12 +75,18 @@ class GeoSeeqUploadManager:
         self.num_retries = num_retries
         self.ignore_errors = ignore_errors
         self.chunk_size_mb = chunk_size_mb
+        self.use_atomic_upload = use_atomic_upload
 
     def add_result_file(self, result_file, local_path):
         self._result_files.append((result_file, local_path))
 
     def add_local_file_to_result_folder(self, result_folder, local_path, geoseeq_file_name=None):
-        geoseeq_file_name = geoseeq_file_name or local_path
+        if not geoseeq_file_name:
+            if local_path.startswith("/"):  # if local path is an absolute path use the basename
+                geoseeq_file_name = basename(local_path)
+            else:
+                # remove "./" and "../" from local path to get a geoseeq file name
+                geoseeq_file_name = local_path.replace("./", "").replace("../", "")
         result_file = result_folder.result_file(geoseeq_file_name)
         self.add_result_file(result_file, local_path)
 
@@ -99,7 +107,7 @@ class GeoSeeqUploadManager:
                 self.link_type, self.overwrite, self.log_level,
                 self.n_parallel_uploads > 1, self.use_cache, self.no_new_versions,
                 self.threads_per_upload, self.num_retries, self.ignore_errors,
-                self.chunk_size_mb,
+                self.chunk_size_mb, self.use_atomic_upload
             ) for result_file, local_path in self._result_files
         ]
        out = []
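A hedged sketch of the manager-level flag; the folder object and the final "run the uploads" call are elided rather than guessed, since the diff only shows how the argument tuple is built.

    from geoseeq.upload_download_manager import GeoSeeqUploadManager

    manager = GeoSeeqUploadManager(use_atomic_upload=True)  # now the default
    result_folder = ...  # an existing ResultFolder object
    manager.add_local_file_to_result_folder(result_folder, "./data/reads.fastq.gz")
    # relative "./" and "../" are stripped, so the GeoSeeq name is "data/reads.fastq.gz";
    # an absolute path would be reduced to its basename instead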
{geoseeq-0.5.6a16.dist-info → geoseeq-0.6.0.dist-info}/RECORD
CHANGED
@@ -11,7 +11,7 @@ geoseeq/project.py,sha256=-9Y2ik0-BpT3iqh89v8VQBbdadhI58oaUP9oZK8oetc,13741
 geoseeq/remote_object.py,sha256=Es-JlAz8iLRmCpAzh1MOwUh2MqtbuQM-p8wHIBAqNlQ,7131
 geoseeq/sample.py,sha256=whgEVk6GnDJJLjn5uTOqFqRtVxZD3BgjTo7brAC5noU,7981
 geoseeq/search.py,sha256=gawad6Cx5FxJBPlYkXWb-UKAO-UC0_yhvyU9Ca1kaNI,3388
-geoseeq/upload_download_manager.py,sha256=
+geoseeq/upload_download_manager.py,sha256=DNI4nce0MCds-wGoTA3fP_msz3kGOAoJNItoUv7L0uQ,8751
 geoseeq/user.py,sha256=tol8i1UGLRrbMw5jeJDnna1ikRgrCDd50Jxz0a1lSgg,690
 geoseeq/utils.py,sha256=PDRiEQIZYTcfEV9AYvloQVvfqs5JaebcFZodAa2SUW8,3577
 geoseeq/work_orders.py,sha256=5uLVVfdKE8qh4gGaHkdBpXJGRTujuSg59knWCqEET4A,8071
@@ -19,12 +19,14 @@ geoseeq/cli/__init__.py,sha256=4WnK87K5seRK3SGJAxNWnQTqyg5uBhdhrOrzB1D4b3M,24
 geoseeq/cli/constants.py,sha256=Do5AUf9lMO9_P8KpFJ3XwwFBAWsxSjZ6sx9_QEGyC_c,176
 geoseeq/cli/copy.py,sha256=02U9kdrAIbbM8MlRMLL6p-LMYFSuRObE3h5jyvcL__M,2275
 geoseeq/cli/detail.py,sha256=q8Suu-j2k18knfSVFG-SWWGNsKM-n8y9RMA3LcIIi9Y,4132
-geoseeq/cli/download.py,sha256=
+geoseeq/cli/download.py,sha256=N_Wrg9d1kY9eJ6C1l0xc_YFjiri8gkXBo9JiuHx9xxE,17766
 geoseeq/cli/fastq_utils.py,sha256=-bmeQLaiMBm57zWOF0R5OlWTU0_3sh1JBC1RYw2BOFM,3083
 geoseeq/cli/get_eula.py,sha256=79mbUwyiF7O1r0g6UTxG9kJGQEqKuH805E6eLkPC6Y4,997
-geoseeq/cli/main.py,sha256=
+geoseeq/cli/main.py,sha256=Vze6p8cNGsMQmsr5bkhglOxWPIPqxk0BM6417iKvhb4,3791
 geoseeq/cli/manage.py,sha256=wGXAcVaXqE5JQEU8Jh6OlHr02nB396bpS_SFcOZdrEo,5929
 geoseeq/cli/progress_bar.py,sha256=p1Xl01nkYxSBZCB30ue2verIIi22W93m3ZAMAxipD0g,738
+geoseeq/cli/project.py,sha256=V5SdXm2Hwo2lxrkpwRDedw-mAE4XnM2uwT-Gj1D90VQ,3030
+geoseeq/cli/raw.py,sha256=EASkIBr3AhBg6FOiLElw8Nuj_okQqf9vBXLdow7JQGw,1884
 geoseeq/cli/run.py,sha256=bx2AV6VIqOSTlxUda78xl0XxcZ8TXlQx02-e7iLQPwI,3838
 geoseeq/cli/search.py,sha256=wgyprEf_Tm5i_rYl9KTxrmFrD4ohy7qS5ttjg7V3xRY,1015
 geoseeq/cli/user.py,sha256=fARv_5vx-QYT765Hsjgwr6J5ddA_viCPQdv9iUoVX6w,925
@@ -37,9 +39,9 @@ geoseeq/cli/shared_params/id_handlers.py,sha256=501K9sCVkI0YGDQ62vXk_DM5lMMDrdB5
 geoseeq/cli/shared_params/obj_getters.py,sha256=ZSkt6LnDkVFlNVYKgLrjzg60-6BthZMr3eeD3HNqzac,2741
 geoseeq/cli/shared_params/opts_and_args.py,sha256=LrDkv9WtUryM4uUMXPRk04-EBcTQ7q5V6Yu-XRDUvvA,2083
 geoseeq/cli/upload/__init__.py,sha256=3C9_S9t7chmYU-2ot89NV03x-EtmsjibulErKaU9w1k,627
-geoseeq/cli/upload/upload.py,sha256=
+geoseeq/cli/upload/upload.py,sha256=JZkhe1q3KOp7-tKyzwi860TQhZoNDnZs4yB2PJhOjl0,10081
 geoseeq/cli/upload/upload_advanced.py,sha256=Jq5eGe-wOdrzxGWVwaFPg0BAJcW0YSx_eHEmYjJeKuA,3434
-geoseeq/cli/upload/upload_reads.py,sha256=
+geoseeq/cli/upload/upload_reads.py,sha256=dvmOVq0lJSbpQDyWkNEnZmhkMvfEByV-i8xD75Ai4dA,10706
 geoseeq/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geoseeq/contrib/ncbi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geoseeq/contrib/ncbi/api.py,sha256=WQeLoGA_-Zha-QeSO8_i7HpvXyD8UkV0qc5okm11KiA,1056
@@ -63,10 +65,12 @@ geoseeq/plotting/map/map.py,sha256=h2QPLGqe-SamhfaTij53S9cQIiO8orCJUAUh0hRicSM,3
 geoseeq/plotting/map/overlay.py,sha256=4VmxqOESTQra9tPr8b8OLEUhJSit9lNipabeSznEYwE,1795
 geoseeq/result/__init__.py,sha256=IFHIyRV8ZzuKIfwfze1SXgcKwNMcSgMAknLHMkwjXIU,356
 geoseeq/result/bioinfo.py,sha256=QQtbyogrdro9avJSN0713sxLVnVeA24mFw3hWtKDKyw,1782
-geoseeq/result/file_download.py,sha256=
-geoseeq/result/file_upload.py,sha256=
+geoseeq/result/file_chunker.py,sha256=bXq1csuRtqMB5sbH-AfWo6gdPwrivv5DJPuHVj-h08w,1758
+geoseeq/result/file_download.py,sha256=gV9-C_CMPpOWYi21eagsoiri53yzRHQx351nLBUj4WM,5790
+geoseeq/result/file_upload.py,sha256=xs1DrI-h4ZP7xN8HPBc3SFpcPAxR5HAolraP1Zu7tvE,10648
 geoseeq/result/result_file.py,sha256=1Yj9fkZhds3J-tay6eNH2-EHi00MovHGV1M80_ckHD8,8677
 geoseeq/result/result_folder.py,sha256=6porOXPh7Tpxw3oX5yMRPYQzNCGYqszqmFJd3SwQmTc,11122
+geoseeq/result/resumable_upload_tracker.py,sha256=2aI09gYz2yw63jEXqs8lmCRKQ79TIc3YuPETvP0Jeek,3811
 geoseeq/result/utils.py,sha256=C-CxGzB3WddlnRiqFSkrY78I_m0yFgNqsTBRzGU-y8Q,2772
 geoseeq/vc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 geoseeq/vc/checksum.py,sha256=y8rh1asUZNbE_NLiFO0-9hImLNiTOc2YXQBRKORWK7k,710
@@ -80,9 +84,9 @@ geoseeq/vc/vc_stub.py,sha256=IQr8dI0zsWKVAeY_5ybDD6n49_3othcgfHS3P0O9tuY,3110
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_api_client.py,sha256=TS5njc5pcPP_Ycy-ljcfPVT1hQRBsFVdQ0lCqBmoesU,12810
 tests/test_plotting.py,sha256=TcTu-2ARr8sxZJ7wPQxmbs3-gHw7uRvsgrhhhg0qKik,784
-geoseeq-0.5.6a16.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
-geoseeq-0.5.6a16.dist-info/METADATA,sha256=
-geoseeq-0.5.6a16.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-geoseeq-0.5.6a16.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
-geoseeq-0.5.6a16.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
-geoseeq-0.5.6a16.dist-info/RECORD,,
+geoseeq-0.6.0.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
+geoseeq-0.6.0.dist-info/METADATA,sha256=mDqowxeSFM0nNuY_354pumCtTUpztbhhRe1Dv2rqn5g,4803
+geoseeq-0.6.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+geoseeq-0.6.0.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
+geoseeq-0.6.0.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
+geoseeq-0.6.0.dist-info/RECORD,,
{geoseeq-0.5.6a16.dist-info → geoseeq-0.6.0.dist-info}/LICENSE
File without changes
{geoseeq-0.5.6a16.dist-info → geoseeq-0.6.0.dist-info}/WHEEL
File without changes
{geoseeq-0.5.6a16.dist-info → geoseeq-0.6.0.dist-info}/entry_points.txt
File without changes
{geoseeq-0.5.6a16.dist-info → geoseeq-0.6.0.dist-info}/top_level.txt
File without changes