geoseeq 0.5.6a10__py3-none-any.whl → 0.5.6a12__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
geoseeq/cli/main.py CHANGED
@@ -53,7 +53,7 @@ def version():
53
53
  Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
54
54
  Run `geoseeq eula show` to view the EULA.
55
55
  """
56
- click.echo('0.5.6a10') # remember to update setup
56
+ click.echo('0.5.6a12') # remember to update setup
57
57
 
58
58
 
59
59
  @main.group('advanced')
@@ -36,7 +36,9 @@ hidden_option = click.option('--hidden/--no-hidden', default=False, help='Upload
36
36
 
37
37
  @click.command('files')
38
38
  @use_common_state
39
- @click.option('--cores', default=1, help='Number of uploads to run in parallel')
39
+ @click.option('--cores', default=1, help='Number of uploads to run in parallel', show_default=True)
40
+ @click.option('--threads-per-upload', default=4, help='Number of threads used to upload each file', show_default=True)
41
+ @click.option('--num-retries', default=3, help='Number of times to retry a failed upload', show_default=True)
40
42
  @yes_option
41
43
  @private_option
42
44
  @link_option
@@ -44,10 +46,11 @@ hidden_option = click.option('--hidden/--no-hidden', default=False, help='Upload
44
46
  @hidden_option
45
47
  @no_new_versions_option
46
48
  @click.option('-n', '--geoseeq-file-name', default=None, multiple=True,
47
- help='Specify a different name for the file on GeoSeeq than the local file name.')
49
+ help='Specify a different name for the file on GeoSeeq than the local file name.',
50
+ show_default=True)
48
51
  @folder_id_arg
49
52
  @click.argument('file_paths', type=click.Path(exists=True), nargs=-1)
50
- def cli_upload_file(state, cores, yes, private, link_type, recursive, hidden, no_new_versions, geoseeq_file_name, folder_id, file_paths):
53
+ def cli_upload_file(state, cores, threads_per_upload, num_retries, yes, private, link_type, recursive, hidden, no_new_versions, geoseeq_file_name, folder_id, file_paths):
51
54
  """Upload files to GeoSeeq.
52
55
 
53
56
  This command uploads files to either a sample or project on GeoSeeq. It can be used to upload
@@ -92,6 +95,8 @@ def cli_upload_file(state, cores, yes, private, link_type, recursive, hidden, no
92
95
 
93
96
  ---
94
97
  """
98
+ if num_retries < 1:
99
+ raise click.UsageError('--num-retries must be at least 1')
95
100
  knex = state.get_knex()
96
101
  result_folder = handle_folder_id(knex, folder_id, yes=yes, private=private, create=True)
97
102
  if geoseeq_file_name:
@@ -106,11 +111,13 @@ def cli_upload_file(state, cores, yes, private, link_type, recursive, hidden, no
106
111
 
107
112
  upload_manager = GeoSeeqUploadManager(
108
113
  n_parallel_uploads=cores,
114
+ threads_per_upload=threads_per_upload,
109
115
  link_type=link_type,
110
116
  progress_tracker_factory=PBarManager().get_new_bar,
111
117
  log_level=state.log_level,
112
118
  no_new_versions=no_new_versions,
113
119
  use_cache=state.use_cache,
120
+ num_retries=num_retries,
114
121
  )
115
122
  for geoseeq_file_name, file_path in name_pairs:
116
123
  if isfile(file_path):
@@ -132,6 +132,28 @@ def flatten_list_of_fastqs(filepaths):
132
132
  return flattened
133
133
 
134
134
 
135
+ def _is_bam(path):
136
+ for ext in ['.bam', '.bai']:
137
+ if path.endswith(ext):
138
+ return True
139
+ return False
140
+
141
+
142
+ def flatten_list_of_bams(filepaths):
143
+ """Turn a list of bam filepaths and txt files containing bam filepaths into a single list of bam filepaths."""
144
+ flattened = []
145
+ for path in filepaths:
146
+ if _is_bam(path):
147
+ flattened.append(path)
148
+ else:
149
+ with open(path) as f:
150
+ for line in f:
151
+ line = line.strip()
152
+ if line and not line.startswith('#'):
153
+ flattened.append(line)
154
+ return flattened
155
+
156
+
135
157
 
136
158
  @click.command('reads')
137
159
  @use_common_state
@@ -200,3 +222,67 @@ def cli_upload_reads_wizard(state, cores, overwrite, yes, regex, private, link_t
200
222
  regex = _get_regex(knex, filepaths, module_name, proj, regex)
201
223
  groups = _group_files(knex, filepaths, module_name, regex, yes)
202
224
  _do_upload(groups, module_name, link_type, proj, filepaths, overwrite, no_new_versions, cores, state)
225
+
226
+
227
+ # @click.command('bam')
228
+ # @use_common_state
229
+ # @click.option('--genome', default=None, help='The genome the BAM files are aligned to. Should be in 2bit format.')
230
+ # @click.option('--cores', default=1, help='Number of uploads to run in parallel')
231
+ # @overwrite_option
232
+ # @yes_option
233
+ # @click.option('--regex', default=None, help='An optional regex to use to extract sample names from the file names')
234
+ # @private_option
235
+ # @link_option
236
+ # @no_new_versions_option
237
+ # @project_id_arg
238
+ # @click.argument('files', type=click.Path(exists=True), nargs=-1)
239
+ # def cli_upload_bams(state, genome, cores, overwrite, yes, regex, private, link_type, no_new_versions, project_id, files):
240
+ """Upload BAM files to GeoSeeq.
241
+
242
+ This command automatically groups bams with their index files.
243
+
244
+ ---
245
+
246
+ Example Usage:
247
+
248
+ \b
249
+ # Upload a list of BAM files to a project, useful if you have hundreds of files
250
+ $ ls -1 path/to/bam/files/*.bam > file_list.txt
251
+ $ geoseeq upload bams "GeoSeeq/Example CLI Project" file_list.txt
252
+
253
+ \b
254
+ # Upload all the BAM files in a directory to a project with BAM indexes
255
+ $ geoseeq upload bams ed59b913-91ec-489b-a1b9-4ea137a6e5cf path/to/bam/files/*.bam path/to/bam/files/*.bam.bai
256
+
257
+ \b
258
+ # Upload all the BAM files in a directory to a project, performing 4 uploads in parallel
259
+ $ geoseeq upload bams --cores 4 ed59b913-91ec-489b-a1b9-4ea137a6e5cf path/to/bam/files/*.bam
260
+
261
+ \b
262
+ # Upload a list of BAM files to a project, automatically creating a new project and overwriting existing files
263
+ $ ls -1 path/to/bam/files/*.bam > file_list.txt
264
+ $ geoseeq upload bams --yes --overwrite "GeoSeeq/Example CLI Project" file_list.txt
265
+
266
+ ---
267
+
268
+ Command Arguments:
269
+
270
+ [PROJECT_ID] Can be a project UUID, GeoSeeq Resource Number (GRN), or an
271
+ organization name and project name separated by a slash.
272
+
273
+ \b
274
+ Examples:
275
+ - Name pair: "GeoSeeq/Example CLI Project"
276
+ - UUID: "ed59b913-91ec-489b-a1b9-4ea137a6e5cf"
277
+ - GRN: "grn:gs1:project:ed59b913-91ec-489b-a1b9-4ea137a6e5cf"
278
+
279
+ \b
280
+ [FILES...] can be paths to BAM files or a file containing a list of paths, or a mix of both.
281
+ Example: "path/to/bam/files"
282
+ """
283
+ knex = state.get_knex()
284
+ proj = handle_project_id(knex, project_id, yes, private)
285
+ filepaths = {basename(line): line for line in flatten_list_of_bams(files)}
286
+ click.echo(f'Found {len(filepaths)} files to upload.', err=True)
287
+ groups = _group_files(knex, filepaths, 'bam::bam', regex, yes)
288
+ _do_upload(groups, 'bam::bam', link_type, proj, filepaths, overwrite, no_new_versions, cores, state)
@@ -46,6 +46,11 @@ def guess_download_kind(url):
46
46
 
47
47
  def download_url(url, kind='guess', filename=None, head=None, progress_tracker=None):
48
48
  """Return a local filepath to the downloaded file. Download the file."""
49
+ if filename and isfile(filename):
50
+ file_size = getsize(filename)
51
+ if file_size > 0:
52
+ logger.info(f"File already exists: {filename}. Not overwriting.")
53
+ return filename
49
54
  if kind == 'guess':
50
55
  kind = guess_download_kind(url)
51
56
  logger.info(f"Guessed download kind: {kind} for {url}")
@@ -31,14 +31,30 @@ class FileChunker:
31
31
  chunk = f.read(self.chunk_size)
32
32
  self.loaded_parts.append(chunk)
33
33
  return self # convenience for chaining
34
+
35
+ def chunk_is_preloaded(self, num):
36
+ return len(self.loaded_parts) > num and self.loaded_parts[num]
37
+
38
+ def read_one_chunk(self, num):
39
+ if not self.chunk_is_preloaded(num):
40
+ logger.debug(f"Reading chunk {num} from {self.filepath}")
41
+ with open(self.filepath, "rb") as f:
42
+ f.seek(num * self.chunk_size)
43
+ chunk = f.read(self.chunk_size)
44
+ return chunk
45
+ return self.loaded_parts[num]
34
46
 
35
47
  def get_chunk(self, num):
36
- self.load_all_chunks()
37
- return self.loaded_parts[num]
48
+ if self.chunk_is_preloaded(num):
49
+ return self.loaded_parts[num]
50
+ return self.read_one_chunk(num)
38
51
 
39
52
  def get_chunk_size(self, num):
40
- self.load_all_chunks()
41
- return len(self.loaded_parts[num])
53
+ if num < (self.n_parts - 1): # all but the last chunk
54
+ return self.chunk_size
55
+ if self.chunk_is_preloaded(num): # last chunk, pre-loaded
56
+ return len(self.loaded_parts[num])
57
+ return len(self.read_one_chunk(num)) # last chunk, not pre-loaded
42
58
 
43
59
 
44
60
  class ResumableUploadTracker:
@@ -159,6 +175,7 @@ class ResultFileUpload:
159
175
  attempts = 0
160
176
  while attempts < max_retries:
161
177
  try:
178
+ logger.debug(f"Uploading part {num + 1} to {url}. Size: {len(file_chunk)} bytes.")
162
179
  if session:
163
180
  http_response = session.put(url, data=file_chunk)
164
181
  else:
@@ -171,9 +188,11 @@ class ResultFileUpload:
171
188
  f"Upload for part {num + 1} failed. Attempt {attempts + 1} of {max_retries}."
172
189
  )
173
190
  attempts += 1
174
- if attempts == max_retries:
191
+ if attempts >= max_retries:
175
192
  raise e
176
- time.sleep(10**attempts) # exponential backoff, (10 ** 2)s default max
193
+
194
+ retry_time = min(8 ** attempts, 120) # exponential backoff, max 120s
195
+ time.sleep(retry_time)
177
196
 
178
197
  etag = http_response.headers["ETag"].replace('"', "")
179
198
  blob = {"ETag": etag, "PartNumber": num + 1}
@@ -251,7 +270,12 @@ class ResultFileUpload:
251
270
  resumable_upload_tracker.start_upload(upload_id, urls)
252
271
  logger.info(f'Starting upload for "{filepath}"')
253
272
  complete_parts = []
254
- file_chunker = FileChunker(filepath, chunk_size).load_all_chunks()
273
+ file_chunker = FileChunker(filepath, chunk_size)
274
+ if file_chunker.file_size < 10 * FIVE_MB:
275
+ file_chunker.load_all_chunks()
276
+ logger.debug(f"Preloaded all chunks for {filepath}")
277
+ else:
278
+ logger.debug(f"Did not preload chunks for {filepath}")
255
279
  if progress_tracker: progress_tracker.set_num_chunks(file_chunker.file_size)
256
280
  complete_parts = self._upload_parts(
257
281
  file_chunker,
@@ -21,7 +21,8 @@ def _make_in_process_logger(log_level):
21
21
  def _upload_one_file(args):
22
22
  (result_file, filepath, session, progress_tracker,
23
23
  link_type, overwrite, log_level, parallel_uploads,
24
- use_cache, no_new_versions) = args
24
+ use_cache, no_new_versions, threads_per_upload,
25
+ num_retries) = args
25
26
  if parallel_uploads:
26
27
  _make_in_process_logger(log_level)
27
28
  if link_type == 'upload':
@@ -29,7 +30,8 @@ def _upload_one_file(args):
29
30
  result_file.upload_file(
30
31
  filepath,
31
32
  session=session, overwrite=overwrite, progress_tracker=progress_tracker,
32
- threads=4, use_cache=use_cache, no_new_versions=no_new_versions
33
+ threads=threads_per_upload, use_cache=use_cache,
34
+ no_new_versions=no_new_versions, max_retries=num_retries,
33
35
  )
34
36
  else:
35
37
  result_file.link_file(link_type, filepath)
@@ -40,12 +42,14 @@ class GeoSeeqUploadManager:
40
42
 
41
43
  def __init__(self,
42
44
  n_parallel_uploads=1,
45
+ threads_per_upload=4,
43
46
  session=None,
44
47
  link_type='upload',
45
48
  progress_tracker_factory=None,
46
49
  log_level=logging.WARNING,
47
50
  overwrite=True,
48
51
  no_new_versions=False,
52
+ num_retries=3,
49
53
  use_cache=True):
50
54
  self.session = session
51
55
  self.n_parallel_uploads = n_parallel_uploads
@@ -56,6 +60,8 @@ class GeoSeeqUploadManager:
56
60
  self._result_files = []
57
61
  self.no_new_versions = no_new_versions
58
62
  self.use_cache = use_cache
63
+ self.threads_per_upload = threads_per_upload
64
+ self.num_retries = num_retries
59
65
 
60
66
  def add_result_file(self, result_file, local_path):
61
67
  self._result_files.append((result_file, local_path))
@@ -80,7 +86,8 @@ class GeoSeeqUploadManager:
80
86
  result_file, local_path,
81
87
  self.session, self.progress_tracker_factory(local_path),
82
88
  self.link_type, self.overwrite, self.log_level,
83
- self.n_parallel_uploads > 1, self.use_cache, self.no_new_versions
89
+ self.n_parallel_uploads > 1, self.use_cache, self.no_new_versions,
90
+ self.threads_per_upload, self.num_retries
84
91
  ) for result_file, local_path in self._result_files
85
92
  ]
86
93
  out = []
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geoseeq
3
- Version: 0.5.6a10
3
+ Version: 0.5.6a12
4
4
  Summary: GeoSeeq command line tools and python API
5
5
  Author: David C. Danko
6
6
  Author-email: "David C. Danko" <dcdanko@biotia.io>
@@ -11,7 +11,7 @@ geoseeq/project.py,sha256=-9Y2ik0-BpT3iqh89v8VQBbdadhI58oaUP9oZK8oetc,13741
11
11
  geoseeq/remote_object.py,sha256=Es-JlAz8iLRmCpAzh1MOwUh2MqtbuQM-p8wHIBAqNlQ,7131
12
12
  geoseeq/sample.py,sha256=whgEVk6GnDJJLjn5uTOqFqRtVxZD3BgjTo7brAC5noU,7981
13
13
  geoseeq/search.py,sha256=gawad6Cx5FxJBPlYkXWb-UKAO-UC0_yhvyU9Ca1kaNI,3388
14
- geoseeq/upload_download_manager.py,sha256=BGaEBAKu05CqftwRu3BjzL6FvcHp_w122yOS3LVVzd4,7423
14
+ geoseeq/upload_download_manager.py,sha256=-s2Zdru_O2YWThAdmXRksIx2q8RwSOEZ2nLvm1VM4EM,7738
15
15
  geoseeq/user.py,sha256=tol8i1UGLRrbMw5jeJDnna1ikRgrCDd50Jxz0a1lSgg,690
16
16
  geoseeq/utils.py,sha256=PDRiEQIZYTcfEV9AYvloQVvfqs5JaebcFZodAa2SUW8,3577
17
17
  geoseeq/work_orders.py,sha256=5uLVVfdKE8qh4gGaHkdBpXJGRTujuSg59knWCqEET4A,8071
@@ -22,7 +22,7 @@ geoseeq/cli/detail.py,sha256=q8Suu-j2k18knfSVFG-SWWGNsKM-n8y9RMA3LcIIi9Y,4132
22
22
  geoseeq/cli/download.py,sha256=_upzZo08K0fAPbEsyi1uN0HGNUaY1pl6OoGPcWmvSUY,17765
23
23
  geoseeq/cli/fastq_utils.py,sha256=-bmeQLaiMBm57zWOF0R5OlWTU0_3sh1JBC1RYw2BOFM,3083
24
24
  geoseeq/cli/get_eula.py,sha256=79mbUwyiF7O1r0g6UTxG9kJGQEqKuH805E6eLkPC6Y4,997
25
- geoseeq/cli/main.py,sha256=xbSTOVLf1iFdw-aUu6l_UEVeiVrPgi9Liylt5UjNRzU,3260
25
+ geoseeq/cli/main.py,sha256=lqRT8Y62SqLQs9J_P7TCZIZ9CGk6sMphKOALz9TSQ0o,3260
26
26
  geoseeq/cli/manage.py,sha256=wGXAcVaXqE5JQEU8Jh6OlHr02nB396bpS_SFcOZdrEo,5929
27
27
  geoseeq/cli/progress_bar.py,sha256=p1Xl01nkYxSBZCB30ue2verIIi22W93m3ZAMAxipD0g,738
28
28
  geoseeq/cli/run.py,sha256=bx2AV6VIqOSTlxUda78xl0XxcZ8TXlQx02-e7iLQPwI,3838
@@ -37,9 +37,9 @@ geoseeq/cli/shared_params/id_handlers.py,sha256=501K9sCVkI0YGDQ62vXk_DM5lMMDrdB5
37
37
  geoseeq/cli/shared_params/obj_getters.py,sha256=ZSkt6LnDkVFlNVYKgLrjzg60-6BthZMr3eeD3HNqzac,2741
38
38
  geoseeq/cli/shared_params/opts_and_args.py,sha256=LrDkv9WtUryM4uUMXPRk04-EBcTQ7q5V6Yu-XRDUvvA,2083
39
39
  geoseeq/cli/upload/__init__.py,sha256=3C9_S9t7chmYU-2ot89NV03x-EtmsjibulErKaU9w1k,627
40
- geoseeq/cli/upload/upload.py,sha256=vbUslOSPf8EDYFzFxukE4EFK_h8nfMYNbSczR2AJNxk,9203
40
+ geoseeq/cli/upload/upload.py,sha256=whhj-cX324_N_IQKuR3t-Zw4BFzDz7NHzMrmWg5aqiE,9688
41
41
  geoseeq/cli/upload/upload_advanced.py,sha256=Jq5eGe-wOdrzxGWVwaFPg0BAJcW0YSx_eHEmYjJeKuA,3434
42
- geoseeq/cli/upload/upload_reads.py,sha256=iInPDnfgEuLa8E4gfENFaZ_Uhxfv1Zfkc38iPvTsTwg,7450
42
+ geoseeq/cli/upload/upload_reads.py,sha256=EMGqyZf11xwN4v2j8gNxMagTbE4kaOd-_hwupmg5I-8,10670
43
43
  geoseeq/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  geoseeq/contrib/ncbi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
45
  geoseeq/contrib/ncbi/api.py,sha256=WQeLoGA_-Zha-QeSO8_i7HpvXyD8UkV0qc5okm11KiA,1056
@@ -63,8 +63,8 @@ geoseeq/plotting/map/map.py,sha256=h2QPLGqe-SamhfaTij53S9cQIiO8orCJUAUh0hRicSM,3
63
63
  geoseeq/plotting/map/overlay.py,sha256=4VmxqOESTQra9tPr8b8OLEUhJSit9lNipabeSznEYwE,1795
64
64
  geoseeq/result/__init__.py,sha256=IFHIyRV8ZzuKIfwfze1SXgcKwNMcSgMAknLHMkwjXIU,356
65
65
  geoseeq/result/bioinfo.py,sha256=QQtbyogrdro9avJSN0713sxLVnVeA24mFw3hWtKDKyw,1782
66
- geoseeq/result/file_download.py,sha256=XQA5bdQJJSZIgbFcN09OvVdLq12fnA98kPCIONAkLk0,5568
67
- geoseeq/result/file_upload.py,sha256=D8gbdsyzw5ztjT7Vmq4InxKhyl7l8n2leH4GYVD-seM,12109
66
+ geoseeq/result/file_download.py,sha256=vbYo2B4JshTIqLaklcgcBb7NY9cD5pMkas95GuQxW8s,5776
67
+ geoseeq/result/file_upload.py,sha256=qQ2F28W6ieLoEsbqCWr61cQ8hYlNaKkhn4DBQ5xqR-I,13214
68
68
  geoseeq/result/result_file.py,sha256=1Yj9fkZhds3J-tay6eNH2-EHi00MovHGV1M80_ckHD8,8677
69
69
  geoseeq/result/result_folder.py,sha256=6porOXPh7Tpxw3oX5yMRPYQzNCGYqszqmFJd3SwQmTc,11122
70
70
  geoseeq/result/utils.py,sha256=C-CxGzB3WddlnRiqFSkrY78I_m0yFgNqsTBRzGU-y8Q,2772
@@ -80,9 +80,9 @@ geoseeq/vc/vc_stub.py,sha256=IQr8dI0zsWKVAeY_5ybDD6n49_3othcgfHS3P0O9tuY,3110
80
80
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
81
81
  tests/test_api_client.py,sha256=TS5njc5pcPP_Ycy-ljcfPVT1hQRBsFVdQ0lCqBmoesU,12810
82
82
  tests/test_plotting.py,sha256=TcTu-2ARr8sxZJ7wPQxmbs3-gHw7uRvsgrhhhg0qKik,784
83
- geoseeq-0.5.6a10.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
84
- geoseeq-0.5.6a10.dist-info/METADATA,sha256=RKguQ4reNa4rH6Pv20JBMCbyo2inLjKcO5VFkcCgyxg,4806
85
- geoseeq-0.5.6a10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
86
- geoseeq-0.5.6a10.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
87
- geoseeq-0.5.6a10.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
88
- geoseeq-0.5.6a10.dist-info/RECORD,,
83
+ geoseeq-0.5.6a12.dist-info/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
84
+ geoseeq-0.5.6a12.dist-info/METADATA,sha256=DTEadRCMzytqJtg9vPNagyrMCY8l8gIslgIr7PaOA1o,4806
85
+ geoseeq-0.5.6a12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
86
+ geoseeq-0.5.6a12.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
87
+ geoseeq-0.5.6a12.dist-info/top_level.txt,sha256=zZk7mmeaqAYqFJG8nq2DTgSQPbflRjJwkDIhNURPDEU,14
88
+ geoseeq-0.5.6a12.dist-info/RECORD,,