geoseeq 0.6.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geoseeq/cli/main.py CHANGED
@@ -54,7 +54,7 @@ def version():
54
54
  Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
55
55
  Run `geoseeq eula show` to view the EULA.
56
56
  """
57
- click.echo('0.6.0') # remember to update setup
57
+ click.echo('0.6.1') # remember to update setup
58
58
 
59
59
 
60
60
  @main.group('advanced')
@@ -2,15 +2,22 @@
2
2
  import urllib.request
3
3
  import logging
4
4
  import requests
5
- from os.path import basename, getsize, join, isfile, getmtime
5
+ import os
6
+ from os.path import basename, getsize, join, isfile, getmtime, dirname
6
7
  from pathlib import Path
7
8
  from tempfile import NamedTemporaryFile
8
9
 
9
10
  from geoseeq.utils import download_ftp
10
11
  from geoseeq.constants import FIVE_MB
12
+ from hashlib import md5
13
+ from .resumable_download_tracker import ResumableDownloadTracker
11
14
 
12
15
  logger = logging.getLogger("geoseeq_api") # Same name as calling module
13
16
 
17
def url_to_id(url):
    """Return a short, stable identifier for a download URL.

    Everything from the first ``?`` onward is discarded before hashing, so
    URLs that differ only in their query string map to the same identifier.

    Args:
        url: The URL to identify.

    Returns:
        The first 16 hex characters of the MD5 digest of the query-less URL.
    """
    base_url, _, _ = url.partition("?")
    digest = md5(base_url.encode()).hexdigest()
    return digest[:16]
20
+
14
21
 
15
22
  def _download_head(url, filename, head=None, start=0, progress_tracker=None):
16
23
  headers = None
@@ -20,11 +27,43 @@ def _download_head(url, filename, head=None, start=0, progress_tracker=None):
20
27
  response.raise_for_status()
21
28
  total_size_in_bytes = int(response.headers.get('content-length', 0))
22
29
  if progress_tracker: progress_tracker.set_num_chunks(total_size_in_bytes)
23
- block_size = FIVE_MB
30
+ if total_size_in_bytes > 10 * FIVE_MB: # Use resumable download
31
+ print("Using resumable download")
32
+ return _download_resumable(response, filename, total_size_in_bytes, progress_tracker)
33
+ else:
34
+ block_size = FIVE_MB
35
+ with open(filename, 'wb') as file:
36
+ for data in response.iter_content(block_size):
37
+ if progress_tracker: progress_tracker.update(len(data))
38
+ file.write(data)
39
+ return filename
40
+
41
+
42
def _download_resumable(response, filename, total_size_in_bytes, progress_tracker=None, chunk_size=5 * FIVE_MB, part_prefix=".gs_download_{}_{}."):
    """Download a large file in separately stored parts so it can be resumed.

    Each part is fetched into its own file next to ``filename`` and recorded
    in a ``ResumableDownloadTracker``. Parts the tracker already knows about
    (and whose files still exist) are not fetched again. Once every part is
    present they are concatenated into ``filename`` and the part files are
    removed.

    Args:
        response: Response object for the target; only ``response.url`` is read.
        filename: Path of the final assembled file.
        total_size_in_bytes: Total size of the remote file.
        progress_tracker: Optional object with an ``update(n_bytes)`` method.
        chunk_size: Size in bytes of each part (the last part may be shorter).
        part_prefix: Format string for part file names; receives the part
            index and the index of the last part.

    Returns:
        ``filename``.
    """
    import shutil  # local import: only needed for the final assembly step

    target_id = url_to_id(response.url)
    tracker = ResumableDownloadTracker(chunk_size, target_id, filename)
    if not tracker.download_started:
        tracker.start_download(response.url)

    # Ceiling division: floor division would drop the trailing partial chunk
    # and silently truncate any file whose size is not a multiple of chunk_size.
    n_chunks = (total_size_in_bytes + chunk_size - 1) // chunk_size
    last_index = n_chunks - 1

    # Keep our own index -> path map so assembly and cleanup still work when
    # the tracker is disabled (it could not create its cache directory).
    part_paths = {}
    for i in range(n_chunks):
        bytes_start = i * chunk_size
        bytes_end = min(bytes_start + chunk_size, total_size_in_bytes) - 1  # inclusive
        if tracker.part_has_been_downloaded(i):
            logger.debug(f"Part {i} has already been downloaded.")
            part_paths[i] = tracker.get_part_info(i)["part_filename"]
        else:
            logger.debug(f"Downloading part {i} of {last_index}")
            part_filename = join(dirname(filename), part_prefix.format(i, last_index) + basename(filename))
            # NOTE(review): head/start are presumably turned into a byte-range
            # request by _download_head; that code is outside this view.
            _download_head(response.url, part_filename, head=bytes_end, start=bytes_start, progress_tracker=None)
            part_info = dict(part_number=i, start=bytes_start, end=bytes_end, part_filename=part_filename)
            part_paths[i] = part_filename
            if tracker.open:
                tracker.add_part(part_info)
        # Resumed parts count too, so progress reflects the whole file.
        if progress_tracker:
            progress_tracker.update(bytes_end - bytes_start + 1)

    # At this point all parts have been downloaded; stitch them together in order.
    with open(filename, 'wb') as file:
        for i in range(n_chunks):
            with open(part_paths[i], 'rb') as part_file:
                # Stream instead of read() so a whole part is never held in memory.
                shutil.copyfileobj(part_file, file)

    if tracker.open:
        tracker.cleanup()
    else:
        # The tracker cannot clean up when disabled; remove the parts ourselves.
        for part_path in part_paths.values():
            if isfile(part_path):
                os.remove(part_path)
    return filename
29
68
 
30
69
 
@@ -44,7 +83,7 @@ def guess_download_kind(url):
44
83
  return 'generic'
45
84
 
46
85
 
47
- def download_url(url, kind='guess', filename=None, head=None, progress_tracker=None):
86
+ def download_url(url, kind='guess', filename=None, head=None, progress_tracker=None, target_uuid=None):
48
87
  """Return a local filepath to the downloaded file. Download the file."""
49
88
  if filename and isfile(filename):
50
89
  file_size = getsize(filename)
@@ -135,7 +174,7 @@ class ResultFileDownload:
135
174
  url = self.get_download_url()
136
175
  filepath = download_url(
137
176
  url, blob_type, filename,
138
- head=head, progress_tracker=progress_tracker
177
+ head=head, progress_tracker=progress_tracker,
139
178
  )
140
179
  if cache and flag_suffix:
141
180
  # create flag file
@@ -0,0 +1,99 @@
1
+
2
+ import time
3
+ import json
4
+ import os
5
+ from os.path import basename, getsize, join, dirname, isfile, getctime
6
+ from pathlib import Path
7
+ from random import random
8
+ import requests
9
+
10
+ from geoseeq.knex import GeoseeqGeneralError
11
+ from geoseeq.constants import FIVE_MB
12
+ from geoseeq.utils import md5_checksum
13
+ from concurrent.futures import ThreadPoolExecutor, as_completed
14
+ from .utils import *
15
+ from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
16
+ from .file_chunker import FileChunker
17
+
18
+
19
+
20
class ResumableDownloadTracker:
    """Record which parts of a multi-part download have already been fetched.

    State is persisted as a JSON-lines file under ``GEOSEEQ_CACHE_DIR/download``:
    the first line is a header (download URL, target id, start time) and each
    following line describes one completed part. If the tracker directory
    cannot be created the tracker is marked closed (``open`` is False) and
    its query methods report that nothing has been downloaded.
    """

    def __init__(self, chunk_size, download_target_id, target_local_path, tracker_file_prefix="gs_resumable_download_tracker"):
        """Set up the tracker and load any state left by an earlier attempt.

        Args:
            chunk_size: Part size in bytes; part of the tracker file name.
            download_target_id: Identifier of the remote target; part of the
                tracker file name.
            target_local_path: Final local path of the download; its basename
                is part of the tracker file name.
            tracker_file_prefix: Leading component of the tracker file name.
        """
        self.open, self.download_started = True, False
        # Always defined, even when no tracker file exists yet.
        self.download_url = None
        self.download_target_id = download_target_id
        self.target_local_path = target_local_path
        self.tracker_file_dir = join(GEOSEEQ_CACHE_DIR, 'download')
        self.tracker_file = join(
            self.tracker_file_dir,
            tracker_file_prefix + f".{download_target_id}.{chunk_size}." + basename(target_local_path)
        )
        try:
            os.makedirs(self.tracker_file_dir, exist_ok=True)
        except Exception as e:
            # Best effort: without a tracker directory we simply cannot resume.
            logger.warning(f'Could not create resumable download tracker directory. {e}')
            self.open = False
        self._loaded_parts = {}
        self._load_parts_from_file()

    def start_download(self, download_url):
        """Write the header line of a fresh tracker file.

        Returns ``self``, or ``None`` when the tracker is closed.

        Raises:
            GeoseeqGeneralError: if the download has already been started.
        """
        if not self.open:
            return
        if self.download_started:
            raise GeoseeqGeneralError("Download has already started.")
        self.download_started = True
        blob = dict(download_url=download_url,
                    download_target_id=self.download_target_id,
                    start_time=time.time())
        serialized = json.dumps(blob)
        # Mode "w" deliberately replaces any stale or unreadable tracker file.
        with open(self.tracker_file, "w") as f:
            f.write(serialized + "\n")
        self.download_url = download_url
        return self

    def add_part(self, part_download_info):
        """Append a completed part to the tracker file and the in-memory index.

        Args:
            part_download_info: JSON-serialisable dict containing at least
                ``part_number``.

        Raises:
            GeoseeqGeneralError: if the tracker is closed.
        """
        if not self.open:
            # An explicit raise instead of ``assert`` so the check survives ``python -O``.
            raise GeoseeqGeneralError("Cannot add part to closed ResumableDownloadTracker")
        part_id = part_download_info["part_number"]
        serialized = json.dumps(part_download_info)
        with open(self.tracker_file, "a") as f:
            f.write(serialized + "\n")
        self._loaded_parts[part_id] = part_download_info

    def _load_parts_from_file(self):
        """Populate state from an existing tracker file, if there is one.

        A process killed mid-write can leave an unreadable header or a
        truncated final line. An unreadable header is treated as "download
        not started" (``start_download`` then rewrites the file). Parsing
        stops at the first unreadable part line, so that part is downloaded
        again rather than crashing every later attempt.
        """
        if not isfile(self.tracker_file):
            return
        with open(self.tracker_file, "r") as f:
            try:
                header_blob = json.loads(f.readline())
                download_url = header_blob["download_url"]
            except (ValueError, KeyError, TypeError):
                return
            # The header's start_time is not used: resumable downloads do not expire.
            self.download_url = download_url
            self.download_started = True
            for line in f:
                if not line.strip():
                    continue
                try:
                    part_info = json.loads(line)
                    part_id = part_info["part_number"]
                except (ValueError, KeyError, TypeError):
                    break
                self._loaded_parts[part_id] = part_info

    def part_has_been_downloaded(self, part_number):
        """Return True if the part is recorded and its file still exists."""
        if not self.open:
            return False
        part_info = self._loaded_parts.get(part_number)
        if part_info is None:
            return False
        return isfile(part_info["part_filename"])

    def get_part_info(self, part_number):
        """Return the recorded info dict for a part, or None if unknown or closed."""
        if not self.open:
            return None
        return self._loaded_parts.get(part_number, None)

    def cleanup(self):
        """Delete all recorded part files and the tracker file, then close."""
        if not self.open:
            return
        for part in self._loaded_parts.values():
            part_path = part["part_filename"]
            if isfile(part_path):
                os.remove(part_path)
        # The tracker file may be missing if start_download was never called.
        if isfile(self.tracker_file):
            os.remove(self.tracker_file)
        self.open = False
99
+
@@ -194,7 +194,7 @@ class GeoSeeqDownloadManager:
194
194
  self._convert_result_files_to_urls()
195
195
  download_args = [(
196
196
  url, file_path,
197
- self.progress_tracker_factory(url),
197
+ self.progress_tracker_factory(file_path),
198
198
  self.ignore_errors, self.head, self.log_level,
199
199
  self.n_parallel_downloads > 1
200
200
  ) for url, file_path in self._result_files]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geoseeq
3
- Version: 0.6.0
3
+ Version: 0.6.1
4
4
  Summary: GeoSeeq command line tools and python API
5
5
  Author: David C. Danko
6
6
  Author-email: "David C. Danko" <dcdanko@biotia.io>
@@ -11,7 +11,7 @@ geoseeq/project.py,sha256=-9Y2ik0-BpT3iqh89v8VQBbdadhI58oaUP9oZK8oetc,13741
11
11
  geoseeq/remote_object.py,sha256=Es-JlAz8iLRmCpAzh1MOwUh2MqtbuQM-p8wHIBAqNlQ,7131
12
12
  geoseeq/sample.py,sha256=whgEVk6GnDJJLjn5uTOqFqRtVxZD3BgjTo7brAC5noU,7981
13
13
  geoseeq/search.py,sha256=gawad6Cx5FxJBPlYkXWb-UKAO-UC0_yhvyU9Ca1kaNI,3388
14
- geoseeq/upload_download_manager.py,sha256=DNI4nce0MCds-wGoTA3fP_msz3kGOAoJNItoUv7L0uQ,8751
14
+ geoseeq/upload_download_manager.py,sha256=FMRqLLg77o1qFbWZc5Yc86a2pjeZrrn1rHJr1iaxKCU,8757
15
15
  geoseeq/user.py,sha256=tol8i1UGLRrbMw5jeJDnna1ikRgrCDd50Jxz0a1lSgg,690
16
16
  geoseeq/utils.py,sha256=PDRiEQIZYTcfEV9AYvloQVvfqs5JaebcFZodAa2SUW8,3577
17
17
  geoseeq/work_orders.py,sha256=5uLVVfdKE8qh4gGaHkdBpXJGRTujuSg59knWCqEET4A,8071
@@ -22,7 +22,7 @@ geoseeq/cli/detail.py,sha256=q8Suu-j2k18knfSVFG-SWWGNsKM-n8y9RMA3LcIIi9Y,4132
22
22
  geoseeq/cli/download.py,sha256=N_Wrg9d1kY9eJ6C1l0xc_YFjiri8gkXBo9JiuHx9xxE,17766
23
23
  geoseeq/cli/fastq_utils.py,sha256=-bmeQLaiMBm57zWOF0R5OlWTU0_3sh1JBC1RYw2BOFM,3083
24
24
  geoseeq/cli/get_eula.py,sha256=79mbUwyiF7O1r0g6UTxG9kJGQEqKuH805E6eLkPC6Y4,997
25
- geoseeq/cli/main.py,sha256=Vze6p8cNGsMQmsr5bkhglOxWPIPqxk0BM6417iKvhb4,3791
25
+ geoseeq/cli/main.py,sha256=omW-_xyCEL-rKyELtNu84Ofi-L4hm_17udQ4X6blr4I,3791
26
26
  geoseeq/cli/manage.py,sha256=wGXAcVaXqE5JQEU8Jh6OlHr02nB396bpS_SFcOZdrEo,5929
27
27
  geoseeq/cli/progress_bar.py,sha256=p1Xl01nkYxSBZCB30ue2verIIi22W93m3ZAMAxipD0g,738
28
28
  geoseeq/cli/project.py,sha256=V5SdXm2Hwo2lxrkpwRDedw-mAE4XnM2uwT-Gj1D90VQ,3030
@@ -66,10 +66,11 @@ geoseeq/plotting/map/overlay.py,sha256=4VmxqOESTQra9tPr8b8OLEUhJSit9lNipabeSznEY
66
66
  geoseeq/result/__init__.py,sha256=IFHIyRV8ZzuKIfwfze1SXgcKwNMcSgMAknLHMkwjXIU,356
67
67
  geoseeq/result/bioinfo.py,sha256=QQtbyogrdro9avJSN0713sxLVnVeA24mFw3hWtKDKyw,1782
68
68
  geoseeq/result/file_chunker.py,sha256=bXq1csuRtqMB5sbH-AfWo6gdPwrivv5DJPuHVj-h08w,1758
69
- geoseeq/result/file_download.py,sha256=gV9-C_CMPpOWYi21eagsoiri53yzRHQx351nLBUj4WM,5790
69
+ geoseeq/result/file_download.py,sha256=2VFy_p20VxAu1ItNNM1PBcDKSp9dhRuyOhcb5UBwYEU,7805
70
70
  geoseeq/result/file_upload.py,sha256=xs1DrI-h4ZP7xN8HPBc3SFpcPAxR5HAolraP1Zu7tvE,10648
71
71
  geoseeq/result/result_file.py,sha256=1Yj9fkZhds3J-tay6eNH2-EHi00MovHGV1M80_ckHD8,8677
72
72
  geoseeq/result/result_folder.py,sha256=6porOXPh7Tpxw3oX5yMRPYQzNCGYqszqmFJd3SwQmTc,11122
73
+ geoseeq/result/resumable_download_tracker.py,sha256=YEzqHBBnE7L3XokTvlTAhHZ8TcDTIE_pyTQ7YadOfbU,3667
73
74
  geoseeq/result/resumable_upload_tracker.py,sha256=2aI09gYz2yw63jEXqs8lmCRKQ79TIc3YuPETvP0Jeek,3811
74
75
  geoseeq/result/utils.py,sha256=C-CxGzB3WddlnRiqFSkrY78I_m0yFgNqsTBRzGU-y8Q,2772
75
76
  geoseeq/vc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -84,9 +85,9 @@ geoseeq/vc/vc_stub.py,sha256=IQr8dI0zsWKVAeY_5ybDD6n49_3othcgfHS3P0O9tuY,3110
84
85
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
85
86
  tests/test_api_client.py,sha256=TS5njc5pcPP_Ycy-ljcfPVT1hQRBsFVdQ0lCqBmoesU,12810
86
87
  tests/test_plotting.py,sha256=TcTu-2ARr8sxZJ7wPQxmbs3-gHw7uRvsgrhhhg0qKik,784
87
- geoseeq-0.6.0.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
88
- geoseeq-0.6.0.dist-info/METADATA,sha256=mDqowxeSFM0nNuY_354pumCtTUpztbhhRe1Dv2rqn5g,4803
89
- geoseeq-0.6.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
90
- geoseeq-0.6.0.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
91
- geoseeq-0.6.0.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
92
- geoseeq-0.6.0.dist-info/RECORD,,
88
+ geoseeq-0.6.1.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
89
+ geoseeq-0.6.1.dist-info/METADATA,sha256=aSgyentw6vNb53rZD6IlBy-ZlPSjPRGQK-dq9QnY1no,4803
90
+ geoseeq-0.6.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
91
+ geoseeq-0.6.1.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
92
+ geoseeq-0.6.1.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
93
+ geoseeq-0.6.1.dist-info/RECORD,,