geoseeq 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

geoseeq/cli/main.py CHANGED
@@ -54,7 +54,7 @@ def version():
     Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
     Run `geoseeq eula show` to view the EULA.
     """
-    click.echo('0.6.0') # remember to update setup
+    click.echo('0.6.2') # remember to update setup
 
 
 @main.group('advanced')
geoseeq/file_system/filesystem_download.py ADDED
@@ -0,0 +1,434 @@
+
+import os
+import json
+from geoseeq import (
+    result_file_from_id,
+    result_folder_from_id,
+    sample_from_id,
+    project_from_id,
+)
+from geoseeq.utils import md5_checksum
+from time import time
+
+FILE_STATUS_MODIFIED_REMOTE = 'MODIFIED_REMOTE'
+FILE_STATUS_MODIFIED_LOCAL = 'MODIFIED_LOCAL'
+FILE_STATUS_NEW_LOCAL = 'NEW_LOCAL'
+FILE_STATUS_NEW_REMOTE = 'NEW_REMOTE'
+FILE_STATUS_IS_LOCAL_STUB = 'IS_LOCAL_STUB'
+
+
+def dedupe_modified_files(modified_files):
+    """Remove duplicates from a list of modified files.
+
+    This function will remove duplicates from a list of modified files
+    based on the path to the file. The first instance of the file will be
+    kept and all others will be removed.
+    """
+    seen = set()
+    deduped = []
+    for x in modified_files:
+        if x[2] not in seen:
+            deduped.append(x)
+            seen.add(x[2])
+    return deduped
+
+
+class ResultFileOnFilesystem:
+    """
+
+    Note: unlike other filesystem classes the `path` is a file, not
+    a directory. This is because the file is downloaded directly to
+    the path.
+    """
+
+    def __init__(self, result_file, path, kind):
+        self.result_file = result_file
+        self.path = path
+        self.kind = kind
+
+    @property
+    def info_filepath(self):
+        dirpath = os.path.dirname(self.path)
+        basename = os.path.basename(self.path)
+        return os.path.join(dirpath, f'.gs_result_file__{basename}')
+
+    @property
+    def is_stub(self):
+        return os.path.exists(self.path) and os.path.getsize(self.path) == 0
+
+    def file_is_ok(self, stubs_are_ok=False):
+        if self.is_stub:
+            return stubs_are_ok
+        return self.result_file.download_needs_update(self.path)
+
+    def download(self, use_stubs=False, exists_ok=False):
+        if os.path.exists(self.info_filepath):
+            if exists_ok and self.file_is_ok(stubs_are_ok=use_stubs):
+                return
+            elif not exists_ok:
+                raise ValueError('Result file already exists at path: {}'.format(self.info_filepath))
+
+        # Download the file
+        if use_stubs:
+            open(self.path, 'w').close()
+        else:
+            self.result_file.download(self.path)
+
+        self.write_info_file()
+
+    def local_file_checksum(self):
+        if self.is_stub:
+            return "__STUB__"
+        return md5_checksum(self.path)
+
+    def locally_modified(self):
+        raise NotImplementedError('This function is not implemented')
+
+    def status_is_ok(self, stubs_are_ok=False):
+        # check for an info file
+        if not os.path.exists(self.info_filepath):
+            return False
+        if stubs_are_ok:
+            return True
+        return not self.result_file.download_needs_update(self.path)
+
+    def write_info_file(self):
+        result_file_info = {
+            "uuid": self.result_file.uuid,
+            "kind": self.kind,
+            "checksum": self.local_file_checksum(),
+        }
+        with open(self.info_filepath, 'w') as f:
+            json.dump(result_file_info, f)
+
+    @classmethod
+    def from_path(cls, path):
+        obj = cls(None, path, None)
+        try:
+            with open(obj.info_filepath, 'r') as f:
+                result_file_info = json.load(f)
+            obj.result_file = result_file_from_id(result_file_info['uuid'])
+            obj.kind = result_file_info['kind']
+            obj.stored_checksum = result_file_info['checksum']
+        except FileNotFoundError:
+            pass
+        return obj
+
+    def write_info_file(self):
+        result_file_info = {
+            "uuid": self.result_file.uuid,
+            "kind": self.kind,
+            "checksum": self.local_file_checksum(),
+        }
+        with open(self.info_filepath, 'w') as f:
+            json.dump(result_file_info, f)
+
+    def list_abnormal_objects(self):
+        """Return a list of files that have been modified.
+
+        Since this class is a single file the list will either be empty
+        or have one element.
+
+        Note that if a file was modified locally then uploaded to the server
+        the file will be marked as modified remote.
+        """
+        if self.result_file is None:
+            return [('FILE', FILE_STATUS_NEW_LOCAL, self.path, None)]
+        if not os.path.exists(self.path):
+            return [('FILE', FILE_STATUS_NEW_REMOTE, self.path, self.result_file)]
+        if self.is_stub:
+            return [('FILE', FILE_STATUS_IS_LOCAL_STUB, self.path, self.result_file)]
+        if self.result_file and self.result_file.download_needs_update(self.path):
+            return [('FILE', FILE_STATUS_MODIFIED_REMOTE, self.path, self.result_file)]
+        if self.locally_modified():
+            return [('FILE', FILE_STATUS_MODIFIED_LOCAL, self.path, self.result_file)]
+
+        return []
+
+
+class ResultFolderOnFilesystem:
+
+    def __init__(self, result_folder, path, kind):
+        self.result_folder = result_folder
+        self.path = path
+        self.kind = kind
+
+    @property
+    def info_filepath(self):
+        return os.path.join(self.path, '.gs_result_folder')
+
+    def download(self, use_stubs=False, exists_ok=False):
+        if os.path.exists(self.info_filepath) and not exists_ok:
+            raise ValueError('Result folder already exists at path: {}'.format(self.info_filepath))
+
+        # Download the files in the result folder
+        for result_file in self.result_folder.get_fields():
+            result_file_local_path = os.path.join(self.path, result_file.name)
+            os.makedirs(os.path.dirname(result_file_local_path), exist_ok=True)
+            ResultFileOnFilesystem(result_file, result_file_local_path, self.kind)\
+                .download(use_stubs=use_stubs, exists_ok=exists_ok)
+
+        # Write the result folder data
+        result_folder_info = {
+            "uuid": self.result_folder.uuid,
+            "kind": self.kind
+        }
+        with open(self.info_filepath, 'w') as f:
+            json.dump(result_folder_info, f)
+
+    def status_is_ok(self):
+        # check for an info file
+        if not os.path.exists(self.info_filepath):
+            return False
+
+        # check that all files are downloaded
+        for result_file in self.result_folder.get_files():
+            result_file_path = os.path.join(self.path, result_file.name)
+            if not os.path.exists(result_file_path):
+                return False
+
+        return True
+
+    @classmethod
+    def from_path(cls, path):
+        obj = cls(None, path, None)
+        try:
+            with open(os.path.join(path, '.gs_result_folder'), 'r') as f:
+                result_folder_info = json.load(f)
+            obj.result_folder = result_folder_from_id(result_folder_info['uuid'])
+            obj.kind = result_folder_info['kind']
+        except FileNotFoundError:
+            pass
+        return obj
+
+    def list_abnormal_objects(self):
+        """Return a list of files that have been modified.
+
+        This function will return a list of tuples where the first element
+        is the status of the file and the second element is the path to the file.
+        """
+        modified_files = []
+        if not self.result_folder:
+            modified_files.append(('FOLDER', FILE_STATUS_NEW_LOCAL, self.path, None))
+        if not os.path.exists(self.path):
+            modified_files.append(('FOLDER', FILE_STATUS_NEW_REMOTE, self.path, self.result_folder))
+
+        # list local files
+        if os.path.exists(self.path):
+            for local_file in os.listdir(self.path):
+                if local_file.startswith('.gs_'):
+                    continue
+                local_file_path = os.path.join(self.path, local_file)
+                result_file_on_fs = ResultFileOnFilesystem.from_path(local_file_path)
+                modified_files.extend(result_file_on_fs.list_abnormal_objects())
+
+        # list remote files
+        if self.result_folder:
+            for result_file in self.result_folder.get_fields():
+                result_file_path = os.path.join(self.path, result_file.name)
+                result_file_on_fs = ResultFileOnFilesystem(result_file, result_file_path, self.kind)
+                modified_files.extend(result_file_on_fs.list_abnormal_objects())
+
+        return dedupe_modified_files(modified_files)
+
+
+class SampleOnFilesystem:
+
+    def __init__(self, sample, path):
+        self.sample = sample
+        self.path = path if path[-1] != '/' else path[:-1]  # remove trailing slash
+
+    @property
+    def info_filepath(self):
+        return os.path.join(self.path, '.gs_sample')
+
+    def download(self, use_stubs=False, exists_ok=False):
+        if os.path.exists(self.info_filepath) and not exists_ok:
+            raise ValueError('Sample already exists at path: {}'.format(self.info_filepath))
+
+        # download result folders
+        for result_folder in self.sample.get_result_folders():
+            result_folder_local_path = os.path.join(self.path, result_folder.name)
+            os.makedirs(result_folder_local_path, exist_ok=True)
+            ResultFolderOnFilesystem(result_folder, result_folder_local_path, "sample")\
+                .download(use_stubs=use_stubs, exists_ok=exists_ok)
+
+        # Write the sample data
+        sample_info = {
+            "uuid": self.sample.uuid
+        }
+        with open(self.info_filepath, 'w') as f:
+            json.dump(sample_info, f)
+
+    def status_is_ok(self):
+        # check for an info file
+        if not os.path.exists(self.info_filepath):
+            return False
+
+        # check that all result folders are downloaded
+        for result_folder in self.sample.get_result_folders():
+            result_folder_local_path = os.path.join(self.path, result_folder.name)
+            result_folder_on_fs = ResultFolderOnFilesystem.from_path(result_folder_local_path, "sample")
+            if not result_folder_on_fs.status_is_ok():
+                return False
+
+        return True
+
+    @classmethod
+    def from_path(cls, path):
+        obj = cls(None, path)
+        try:
+            with open(os.path.join(path, '.gs_sample'), 'r') as f:
+                sample_info = json.load(f)
+            obj.sample = sample_from_id(sample_info['uuid'])
+        except FileNotFoundError:
+            pass
+        return obj
+
+    def list_abnormal_objects(self):
+        """Return a list of files that have been modified.
+
+        This function will return a list of tuples where the first element
+        is the status of the file and the second element is the path to the file.
+        """
+        modified_files = []
+        if not self.sample:
+            modified_files.append(('SAMPLE', FILE_STATUS_NEW_LOCAL, self.path, None))
+        if not os.path.exists(self.path):
+            modified_files.append(('SAMPLE', FILE_STATUS_NEW_REMOTE, self.path, self.sample))
+
+        # list local folders
+        if os.path.exists(self.path):
+            for local_folder in os.listdir(self.path):
+                local_folder_path = os.path.join(self.path, local_folder)
+                if not os.path.isdir(local_folder_path):
+                    continue
+                result_folder_on_fs = ResultFolderOnFilesystem.from_path(local_folder_path)
+                modified_files.extend(result_folder_on_fs.list_abnormal_objects())
+
+        # list remote folders
+        if self.sample:
+            for result_folder in self.sample.get_result_folders():
+                result_folder_path = os.path.join(self.path, result_folder.name)
+                result_folder_on_fs = ResultFolderOnFilesystem(result_folder, result_folder_path, "sample")
+                modified_files.extend(result_folder_on_fs.list_abnormal_objects())
+
+        return dedupe_modified_files(modified_files)
+
+
+class ProjectOnFilesystem:
+
+    def __init__(self, project, path):
+        self.project = project
+        self.path = path
+
+    @property
+    def info_filepath(self):
+        return os.path.join(self.path, '.gs_project')
+
+    def download(self, use_stubs=False, exists_ok=False):
+        if os.path.exists(self.info_filepath) and not exists_ok:
+            raise ValueError('Project already exists at path: {}'.format(self.info_filepath))
+
+        # download samples
+        for sample in self.project.get_samples():
+            sample_local_path = os.path.join(self.path, "sample_results", sample.name)
+            os.makedirs(sample_local_path, exist_ok=True)
+            SampleOnFilesystem(sample, sample_local_path)\
+                .download(use_stubs=use_stubs, exists_ok=exists_ok)
+
+        # download project result folders
+        for result_folder in self.project.get_result_folders():
+            result_folder_local_path = os.path.join(self.path, "project_results", result_folder.name)
+            os.makedirs(result_folder_local_path, exist_ok=True)
+            ResultFolderOnFilesystem(result_folder, result_folder_local_path, "project")\
+                .download(use_stubs=use_stubs, exists_ok=exists_ok)
+
+        # Write the project data
+        project_info = {
+            "uuid": self.project.uuid
+        }
+        with open(self.info_filepath, 'w') as f:
+            json.dump(project_info, f)
+
+    def status_is_ok(self):
+        # check for an info file
+        if not os.path.exists(self.info_filepath):
+            return False
+
+        # check that all samples are downloaded
+        for sample in self.project.get_samples():
+            sample_local_path = os.path.join(self.path, "sample_results", sample.name)
+            sample_on_fs = SampleOnFilesystem.from_path(sample_local_path)
+            if not sample_on_fs.status_is_ok():
+                return False
+
+        # check that all project result folders are downloaded
+        for result_folder in self.project.get_result_folders():
+            result_folder_local_path = os.path.join(self.path, "project_results", result_folder.name)
+            result_folder_on_fs = ResultFolderOnFilesystem.from_path(result_folder_local_path, "project")
+            if not result_folder_on_fs.status_is_ok():
+                return False
+
+        return True
+
+    @classmethod
+    def from_path(cls, path, recursive=False):
+        try:
+            with open(os.path.join(path, '.gs_project'), 'r') as f:
+                project_info = json.load(f)
+            project = project_from_id(project_info['uuid'])
+            return cls(project, path)
+        except FileNotFoundError:
+            if not recursive:
+                raise ValueError('No project found in path or parent directories')
+            updir = os.path.dirname(os.path.abspath(path))
+            if updir == path:
+                raise ValueError('No project found in path or parent directories')
+            return cls.from_path(updir, recursive=recursive)
+
+    def path_from_project_root(self, path):
+        if path[0] == "/":
+            return path.replace(self.path, "")[1:]
+        return path
+
+    def list_abnormal_objects(self):
+        """Return a list of files that have been modified.
+
+        This function will return a list of tuples where the first element
+        is the status of the file and the second element is the path to the file.
+        """
+        modified_files = []
+
+        # list remote samples
+        for sample in self.project.get_samples():
+            sample_path = os.path.join(self.path, "sample_results", sample.name)
+            sample_on_fs = SampleOnFilesystem(sample, sample_path)
+            modified_files.extend(sample_on_fs.list_abnormal_objects())
+
+        # list remote project result folders
+        for result_folder in self.project.get_result_folders():
+            result_folder_path = os.path.join(self.path, "project_results", result_folder.name)
+
+            result_folder_on_fs = ResultFolderOnFilesystem(result_folder, result_folder_path, "project")
+            modified_files.extend(result_folder_on_fs.list_abnormal_objects())
+
+        # list local samples
+        for local_sample in os.listdir(os.path.join(self.path, "sample_results")):
+            local_sample_path = os.path.join(self.path, "sample_results", local_sample)
+            if not os.path.isdir(local_sample_path):
+                continue
+            sample_on_fs = SampleOnFilesystem.from_path(local_sample_path)
+            modified_files.extend(sample_on_fs.list_abnormal_objects())
+
+        # list local project result folders
+        for local_result_folder in os.listdir(os.path.join(self.path, "project_results")):
+            local_result_folder_path = os.path.join(self.path, "project_results", local_result_folder)
+            if not os.path.isdir(local_result_folder_path):
+                continue
+            result_folder_on_fs = ResultFolderOnFilesystem.from_path(local_result_folder_path)
+            modified_files.extend(result_folder_on_fs.list_abnormal_objects())
+        return dedupe_modified_files(modified_files)
+
+
+
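
Example (not part of the diff): a minimal sketch of how the new filesystem classes appear intended to be used, assuming the module lands at geoseeq.file_system.filesystem_download as the RECORD suggests and that a GeoSeeq API token is already configured. The project ID and local path are placeholders.

    from geoseeq import project_from_id
    from geoseeq.file_system.filesystem_download import ProjectOnFilesystem

    project = project_from_id("my-org/my-project")            # placeholder project ID
    local = ProjectOnFilesystem(project, "/tmp/my-project")   # placeholder local path
    # write empty stub files instead of downloading full contents
    local.download(use_stubs=True, exists_ok=True)
    # report anything that differs between the local mirror and the server
    for kind, status, path, obj in local.list_abnormal_objects():
        print(kind, status, path)
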
geoseeq/file_system/main.py ADDED
@@ -0,0 +1,122 @@
+from fuse import FUSE, Operations
+import os
+
+
+class GeoSeeqProjectFileSystem(Operations):
+    """Mount a GeoSeeq project as a filesystem.
+
+    The project will automatically have this directory structure:
+     - <root>/project_results/<project_result_folder_name>/...
+     - <root>/sample_results/<sample_name>/...
+     - <root>/metadata/sample_metadata.csv
+     - <root>/.config/config.json
+    """
+
+    def __init__(self, root, project):
+        self.root = root
+        self.project = project
+
+    def access(self, path, mode):
+        pass
+
+    def chmod(self, path, mode):
+        pass
+
+    def chown(self, path, uid, gid):
+        pass
+
+    def getattr(self, path, fh=None):
+        pass
+
+    def readdir(self, path, fh):
+        pass
+
+    def readlink(self, path):
+        pass
+
+    def mknod(self, path, mode, dev):
+        pass
+
+    def rmdir(self, path):
+        pass
+
+    def mkdir(self, path, mode):
+        pass
+
+    def statfs(self, path):
+        pass
+
+    def unlink(self, path):
+        pass
+
+    def symlink(self, name, target):
+        pass
+
+    def rename(self, old, new):
+        pass
+
+    def link(self, target, name):
+        pass
+
+    def utimens(self, path, times=None):
+        pass
+
+    def open(self, path, flags):
+        tkns = path.split('/')
+        if tkns[0] == 'project_results':
+            result_folder_name, result_file_name = tkns[2], '/'.join(tkns[3:])
+            result_folder = self.project.get_result_folder(result_folder_name).get()
+            result_file = result_folder.get_file(result_file_name).get()
+            result_file.download(path)
+        elif tkns[0] == 'sample_results':
+            sample_name, result_folder_name, result_file_name = tkns[2], tkns[3], '/'.join(tkns[4:])
+            sample = self.project.get_sample(sample_name).get()
+            result_folder = sample.get_result_folder(result_folder_name).get()
+            result_file = result_folder.get_file(result_file_name).get()
+            result_file.download(path)
+        elif tkns[0] == 'metadata':
+            raise NotImplementedError('TODO')
+
+        return os.open(self._full_local_path(path), flags)
+
+    def create(self, path, mode, fi=None):
+        tkns = path.split('/')
+        if tkns[0] == 'project_results':
+            result_name, file_name = tkns[2], '/'.join(tkns[3:])
+            result_folder = self.project.get_result_folder(result_name).idem()
+            result_file = result_folder.get_file(file_name).create()
+            result_file.download(path)  # nothing to download at this point
+        elif tkns[0] == 'sample_results':
+            sample_name, result_folder_name, result_file_name = tkns[2], tkns[3], '/'.join(tkns[4:])
+            sample = self.project.get_sample(sample_name).idem()
+            result_folder = sample.get_result_folder(result_folder_name).idem()
+            result_file = result_folder.get_file(result_file_name).create()
+            result_file.download(path)  # nothing to download at this point
+        elif tkns[0] == 'metadata':
+            raise NotImplementedError('TODO')
+
+    def read(self, path, length, offset, fh):
+        os.lseek(fh, offset, os.SEEK_SET)
+        return os.read(fh, length)
+
+    def write(self, path, buf, offset, fh):
+        pass
+
+    def truncate(self, path, length, fh=None):
+        pass
+
+    def flush(self, path, fh):
+        pass
+
+    def release(self, path, fh):
+        pass
+
+    def fsync(self, path, fdatasync, fh):
+        pass
+
+    def _full_local_path(self, partial):
+        if partial.startswith("/"):
+            partial = partial[1:]
+        return os.path.join(self.root, partial)
+
+
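
Example (not part of the diff): mounting a class like this with fusepy generally looks like the sketch below. The cache root, mount point, and project ID are placeholders, a working FUSE installation is assumed, and note that most operations in this first version are still stubs.

    from fuse import FUSE
    from geoseeq import project_from_id
    from geoseeq.file_system.main import GeoSeeqProjectFileSystem

    project = project_from_id("my-org/my-project")            # placeholder project ID
    fs = GeoSeeqProjectFileSystem("/tmp/gs_cache", project)   # root dir holds locally cached copies
    FUSE(fs, "/mnt/geoseeq", nothreads=True, foreground=True) # standard fusepy mount call
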
geoseeq/result/file_download.py CHANGED
@@ -2,15 +2,22 @@
 import urllib.request
 import logging
 import requests
-from os.path import basename, getsize, join, isfile, getmtime
+import os
+from os.path import basename, getsize, join, isfile, getmtime, dirname
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 
 from geoseeq.utils import download_ftp
 from geoseeq.constants import FIVE_MB
+from hashlib import md5
+from .resumable_download_tracker import ResumableDownloadTracker
 
 logger = logging.getLogger("geoseeq_api")  # Same name as calling module
 
+def url_to_id(url):
+    url = url.split("?")[0]
+    return md5(url.encode()).hexdigest()[:16]
+
 
 def _download_head(url, filename, head=None, start=0, progress_tracker=None):
     headers = None
@@ -20,11 +27,43 @@ def _download_head(url, filename, head=None, start=0, progress_tracker=None):
     response.raise_for_status()
     total_size_in_bytes = int(response.headers.get('content-length', 0))
     if progress_tracker: progress_tracker.set_num_chunks(total_size_in_bytes)
-    block_size = FIVE_MB
+    if total_size_in_bytes > 10 * FIVE_MB:  # Use resumable download
+        print("Using resumable download")
+        return _download_resumable(response, filename, total_size_in_bytes, progress_tracker)
+    else:
+        block_size = FIVE_MB
+        with open(filename, 'wb') as file:
+            for data in response.iter_content(block_size):
+                if progress_tracker: progress_tracker.update(len(data))
+                file.write(data)
+        return filename
+
+
+def _download_resumable(response, filename, total_size_in_bytes, progress_tracker=None, chunk_size=5 * FIVE_MB, part_prefix=".gs_download_{}_{}."):
+    target_id = url_to_id(response.url)
+    tracker = ResumableDownloadTracker(chunk_size, target_id, filename)
+    if not tracker.download_started: tracker.start_download(response.url)
+    n_chunks = total_size_in_bytes // chunk_size
+    for i in range(n_chunks):
+        bytes_start, bytes_end = i * chunk_size, min((i + 1) * chunk_size - 1, total_size_in_bytes - 1)
+        if tracker.part_has_been_downloaded(i):
+            logger.debug(f"Part {i} has already been downloaded.")
+        else:
+            logger.debug(f"Downloading part {i} of {n_chunks - 1}")
+            part_filename = join(dirname(filename), part_prefix.format(i, n_chunks - 1) + basename(filename))
+            _download_head(response.url, part_filename, head=bytes_end, start=bytes_start, progress_tracker=None)
+            part_info = dict(part_number=i, start=bytes_start, end=bytes_end, part_filename=part_filename)
+            tracker.add_part(part_info)
+        if progress_tracker: progress_tracker.update(bytes_end - bytes_start + 1)
+
+    # at this point all parts have been downloaded
     with open(filename, 'wb') as file:
-        for data in response.iter_content(block_size):
-            if progress_tracker: progress_tracker.update(len(data))
-            file.write(data)
+        for i in range(n_chunks):
+            part_info = tracker.get_part_info(i)
+            part_filename = part_info["part_filename"]
+            with open(part_filename, 'rb') as part_file:
+                file.write(part_file.read())
+    tracker.cleanup()
     return filename
 
 
@@ -44,7 +83,7 @@ def guess_download_kind(url):
     return 'generic'
 
 
-def download_url(url, kind='guess', filename=None, head=None, progress_tracker=None):
+def download_url(url, kind='guess', filename=None, head=None, progress_tracker=None, target_uuid=None):
     """Return a local filepath to the downloaded file. Download the file."""
     if filename and isfile(filename):
         file_size = getsize(filename)
@@ -135,7 +174,7 @@ class ResultFileDownload:
         url = self.get_download_url()
         filepath = download_url(
             url, blob_type, filename,
-            head=head, progress_tracker=progress_tracker
+            head=head, progress_tracker=progress_tracker,
         )
         if cache and flag_suffix:
             # create flag file
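
Example (not part of the diff): the resumable path added above splits a large download into fixed-size byte ranges. A standalone check of that arithmetic, assuming FIVE_MB is 5 MiB, looks like:

    chunk_size = 5 * 5 * 1024 * 1024          # 5 * FIVE_MB, i.e. 25 MiB per part
    total_size_in_bytes = 60 * 1024 * 1024    # a hypothetical 60 MiB file
    n_chunks = total_size_in_bytes // chunk_size
    for i in range(n_chunks):
        bytes_start = i * chunk_size
        bytes_end = min((i + 1) * chunk_size - 1, total_size_in_bytes - 1)
        print(i, bytes_start, bytes_end)      # part 0: 0-26214399, part 1: 26214400-52428799
    # note: with floor division, bytes past n_chunks * chunk_size fall outside these ranges
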
geoseeq/result/resumable_download_tracker.py ADDED
@@ -0,0 +1,99 @@
+
+import time
+import json
+import os
+from os.path import basename, getsize, join, dirname, isfile, getctime
+from pathlib import Path
+from random import random
+import requests
+
+from geoseeq.knex import GeoseeqGeneralError
+from geoseeq.constants import FIVE_MB
+from geoseeq.utils import md5_checksum
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from .utils import *
+from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
+from .file_chunker import FileChunker
+
+
+
+class ResumableDownloadTracker:
+
+    def __init__(self, chunk_size, download_target_id, target_local_path, tracker_file_prefix="gs_resumable_download_tracker"):
+        self.open, self.download_started = True, False
+        self.download_target_id = download_target_id
+        self.target_local_path = target_local_path
+        self.tracker_file_dir = join(GEOSEEQ_CACHE_DIR, 'download')
+        self.tracker_file = join(
+            self.tracker_file_dir,
+            tracker_file_prefix + f".{download_target_id}.{chunk_size}." + basename(target_local_path)
+        )
+        try:
+            os.makedirs(self.tracker_file_dir, exist_ok=True)
+        except Exception as e:
+            logger.warning(f'Could not create resumable download tracker directory. {e}')
+            self.open = False
+        self._loaded_parts = {}
+        self._load_parts_from_file()
+
+    def start_download(self, download_url):
+        if not self.open:
+            return
+        if self.download_started:
+            raise GeoseeqGeneralError("Download has already started.")
+        self.download_started = True
+        blob = dict(download_url=download_url,
+                    download_target_id=self.download_target_id,
+                    start_time=time.time())
+        serialized = json.dumps(blob)
+        with open(self.tracker_file, "w") as f:
+            f.write(serialized + "\n")
+        self.download_url = download_url
+        return self
+
+    def add_part(self, part_download_info):
+        if not self.open:
+            assert False, "Cannot add part to closed ResumableDownloadTracker"
+        part_id = part_download_info["part_number"]
+        serialized = json.dumps(part_download_info)
+        with open(self.tracker_file, "a") as f:
+            f.write(serialized + "\n")
+        self._loaded_parts[part_id] = part_download_info
+
+    def _load_parts_from_file(self):
+        if not isfile(self.tracker_file):
+            return
+        with open(self.tracker_file, "r") as f:
+            header_blob = json.loads(f.readline())
+            self.download_url = header_blob["download_url"]
+            start_time = header_blob["start_time"]  # for now we don't expire resumable downloads
+            self.download_started = True
+            for line in f:
+                part_info = json.loads(line)
+                part_id = part_info["part_number"]
+                self._loaded_parts[part_id] = part_info
+
+    def part_has_been_downloaded(self, part_number):
+        if not self.open:
+            return False
+        if part_number not in self._loaded_parts:
+            return False
+        part_info = self._loaded_parts[part_number]
+        part_path = part_info["part_filename"]
+        return isfile(part_path)
+
+    def get_part_info(self, part_number):
+        if not self.open:
+            return None
+        return self._loaded_parts.get(part_number, None)
+
+    def cleanup(self):
+        if not self.open:
+            return
+        for part in self._loaded_parts.values():
+            part_path = part["part_filename"]
+            if isfile(part_path):
+                os.remove(part_path)
+        os.remove(self.tracker_file)
+        self.open = False
+
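
Example (not part of the diff): the tracker file written by start_download and add_part is a small JSON-lines log, one header record followed by one record per completed part. A hypothetical tracker file for a three-part download (URL, target ID, and paths are illustrative) would contain lines like:

    {"download_url": "https://example.com/reads.fastq.gz?sig=...", "download_target_id": "9a1b2c3d4e5f6a7b", "start_time": 1715000000.0}
    {"part_number": 0, "start": 0, "end": 26214399, "part_filename": "/data/.gs_download_0_2.reads.fastq.gz"}
    {"part_number": 1, "start": 26214400, "end": 52428799, "part_filename": "/data/.gs_download_1_2.reads.fastq.gz"}

On a rerun, _load_parts_from_file reads these records back and part_has_been_downloaded lets the downloader skip any part whose file is still on disk.
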
geoseeq/upload_download_manager.py CHANGED
@@ -194,7 +194,7 @@ class GeoSeeqDownloadManager:
         self._convert_result_files_to_urls()
         download_args = [(
             url, file_path,
-            self.progress_tracker_factory(url),
+            self.progress_tracker_factory(file_path),
             self.ignore_errors, self.head, self.log_level,
             self.n_parallel_downloads > 1
         ) for url, file_path in self._result_files]
{geoseeq-0.6.0.dist-info → geoseeq-0.6.2.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geoseeq
-Version: 0.6.0
+Version: 0.6.2
 Summary: GeoSeeq command line tools and python API
 Author: David C. Danko
 Author-email: "David C. Danko" <dcdanko@biotia.io>
{geoseeq-0.6.0.dist-info → geoseeq-0.6.2.dist-info}/RECORD RENAMED
@@ -11,7 +11,7 @@ geoseeq/project.py,sha256=-9Y2ik0-BpT3iqh89v8VQBbdadhI58oaUP9oZK8oetc,13741
 geoseeq/remote_object.py,sha256=Es-JlAz8iLRmCpAzh1MOwUh2MqtbuQM-p8wHIBAqNlQ,7131
 geoseeq/sample.py,sha256=whgEVk6GnDJJLjn5uTOqFqRtVxZD3BgjTo7brAC5noU,7981
 geoseeq/search.py,sha256=gawad6Cx5FxJBPlYkXWb-UKAO-UC0_yhvyU9Ca1kaNI,3388
-geoseeq/upload_download_manager.py,sha256=DNI4nce0MCds-wGoTA3fP_msz3kGOAoJNItoUv7L0uQ,8751
+geoseeq/upload_download_manager.py,sha256=FMRqLLg77o1qFbWZc5Yc86a2pjeZrrn1rHJr1iaxKCU,8757
 geoseeq/user.py,sha256=tol8i1UGLRrbMw5jeJDnna1ikRgrCDd50Jxz0a1lSgg,690
 geoseeq/utils.py,sha256=PDRiEQIZYTcfEV9AYvloQVvfqs5JaebcFZodAa2SUW8,3577
 geoseeq/work_orders.py,sha256=5uLVVfdKE8qh4gGaHkdBpXJGRTujuSg59knWCqEET4A,8071
@@ -22,7 +22,7 @@ geoseeq/cli/detail.py,sha256=q8Suu-j2k18knfSVFG-SWWGNsKM-n8y9RMA3LcIIi9Y,4132
 geoseeq/cli/download.py,sha256=N_Wrg9d1kY9eJ6C1l0xc_YFjiri8gkXBo9JiuHx9xxE,17766
 geoseeq/cli/fastq_utils.py,sha256=-bmeQLaiMBm57zWOF0R5OlWTU0_3sh1JBC1RYw2BOFM,3083
 geoseeq/cli/get_eula.py,sha256=79mbUwyiF7O1r0g6UTxG9kJGQEqKuH805E6eLkPC6Y4,997
-geoseeq/cli/main.py,sha256=Vze6p8cNGsMQmsr5bkhglOxWPIPqxk0BM6417iKvhb4,3791
+geoseeq/cli/main.py,sha256=y6OK6ryYf7TyMtufl0kGESro5Fy5Hu7_xzIc3aYDKCo,3791
 geoseeq/cli/manage.py,sha256=wGXAcVaXqE5JQEU8Jh6OlHr02nB396bpS_SFcOZdrEo,5929
 geoseeq/cli/progress_bar.py,sha256=p1Xl01nkYxSBZCB30ue2verIIi22W93m3ZAMAxipD0g,738
 geoseeq/cli/project.py,sha256=V5SdXm2Hwo2lxrkpwRDedw-mAE4XnM2uwT-Gj1D90VQ,3030
@@ -48,6 +48,9 @@ geoseeq/contrib/ncbi/api.py,sha256=WQeLoGA_-Zha-QeSO8_i7HpvXyD8UkV0qc5okm11KiA,1
 geoseeq/contrib/ncbi/bioproject.py,sha256=_oThTd_iLDOC8cLOlJKAatSr362OBYZCEV3YrqodhFg,4341
 geoseeq/contrib/ncbi/cli.py,sha256=j9zEcaZPTryK3a4xluRxigcJKDhRpRxbp3KZSx-Bfhk,2400
 geoseeq/contrib/ncbi/setup_logging.py,sha256=Tp1bY1U0f-o739aHpvVYriG2qdd1lFvCYBXZeXQgt-w,175
+geoseeq/file_system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+geoseeq/file_system/filesystem_download.py,sha256=8bcnxjWltekmCvb5N0b1guBIjLp4-CL2VtsEok-snv4,16963
+geoseeq/file_system/main.py,sha256=4HgYGq7WhlF96JlVIf16iFBTDujlBpxImmtoh4VCzDA,3627
 geoseeq/id_constructors/__init__.py,sha256=w5E0PNQ9UuAxBeZbDI7KBnUoERd85gGz3nScz45bd2o,126
 geoseeq/id_constructors/from_blobs.py,sha256=aj7M7NRpKGs3u3xUvuFJwmJdFeIcJPmaI2_bhwbFfEs,5702
 geoseeq/id_constructors/from_ids.py,sha256=bbAJX4LnuN70v9bny6N-jAwOudb2-ztHvlMBgRuSDz0,3151
@@ -66,10 +69,11 @@ geoseeq/plotting/map/overlay.py,sha256=4VmxqOESTQra9tPr8b8OLEUhJSit9lNipabeSznEY
 geoseeq/result/__init__.py,sha256=IFHIyRV8ZzuKIfwfze1SXgcKwNMcSgMAknLHMkwjXIU,356
 geoseeq/result/bioinfo.py,sha256=QQtbyogrdro9avJSN0713sxLVnVeA24mFw3hWtKDKyw,1782
 geoseeq/result/file_chunker.py,sha256=bXq1csuRtqMB5sbH-AfWo6gdPwrivv5DJPuHVj-h08w,1758
-geoseeq/result/file_download.py,sha256=gV9-C_CMPpOWYi21eagsoiri53yzRHQx351nLBUj4WM,5790
+geoseeq/result/file_download.py,sha256=2VFy_p20VxAu1ItNNM1PBcDKSp9dhRuyOhcb5UBwYEU,7805
 geoseeq/result/file_upload.py,sha256=xs1DrI-h4ZP7xN8HPBc3SFpcPAxR5HAolraP1Zu7tvE,10648
 geoseeq/result/result_file.py,sha256=1Yj9fkZhds3J-tay6eNH2-EHi00MovHGV1M80_ckHD8,8677
 geoseeq/result/result_folder.py,sha256=6porOXPh7Tpxw3oX5yMRPYQzNCGYqszqmFJd3SwQmTc,11122
+geoseeq/result/resumable_download_tracker.py,sha256=YEzqHBBnE7L3XokTvlTAhHZ8TcDTIE_pyTQ7YadOfbU,3667
 geoseeq/result/resumable_upload_tracker.py,sha256=2aI09gYz2yw63jEXqs8lmCRKQ79TIc3YuPETvP0Jeek,3811
 geoseeq/result/utils.py,sha256=C-CxGzB3WddlnRiqFSkrY78I_m0yFgNqsTBRzGU-y8Q,2772
 geoseeq/vc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -84,9 +88,9 @@ geoseeq/vc/vc_stub.py,sha256=IQr8dI0zsWKVAeY_5ybDD6n49_3othcgfHS3P0O9tuY,3110
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/test_api_client.py,sha256=TS5njc5pcPP_Ycy-ljcfPVT1hQRBsFVdQ0lCqBmoesU,12810
 tests/test_plotting.py,sha256=TcTu-2ARr8sxZJ7wPQxmbs3-gHw7uRvsgrhhhg0qKik,784
-geoseeq-0.6.0.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
-geoseeq-0.6.0.dist-info/METADATA,sha256=mDqowxeSFM0nNuY_354pumCtTUpztbhhRe1Dv2rqn5g,4803
-geoseeq-0.6.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-geoseeq-0.6.0.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
-geoseeq-0.6.0.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
-geoseeq-0.6.0.dist-info/RECORD,,
+geoseeq-0.6.2.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
+geoseeq-0.6.2.dist-info/METADATA,sha256=WSI2kZ4-2pMME7jQCQ3Hzg9dU3Gm7R6tFrKdTj1PCbg,4803
+geoseeq-0.6.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+geoseeq-0.6.2.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
+geoseeq-0.6.2.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
+geoseeq-0.6.2.dist-info/RECORD,,