geoseeq 0.7.4__py3-none-any.whl → 0.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
geoseeq/cli/download.py CHANGED
@@ -3,6 +3,7 @@ import logging
3
3
  from os import makedirs
4
4
  from os.path import dirname, join
5
5
 
6
+ import gzip
6
7
  import click
7
8
  import pandas as pd
8
9
  from multiprocessing import Pool
@@ -32,6 +33,7 @@ from .utils import convert_size
32
33
  from geoseeq.constants import FASTQ_MODULE_NAMES
33
34
  from geoseeq.result import ResultFile
34
35
  from geoseeq.upload_download_manager import GeoSeeqDownloadManager
36
+ import os
35
37
 
36
38
  logger = logging.getLogger('geoseeq_api')
37
39
 
@@ -378,7 +380,27 @@ def cli_download_ids(state, cores, target_dir, file_name, yes, download, head, i
378
380
  download_manager.download_files()
379
381
 
380
382
 
381
- def _get_sample_result_files_with_names(sample, module_name=None, first=False):
383
+ def _get_local_filename_for_fastq(sample, result_file, read_type, read_num, lane_num, file_name_mode):
384
+ """Return a local filename for a fastq file based on the specified naming mode."""
385
+ if file_name_mode == "original":
386
+ return result_file.get_stored_data_filename()
387
+ elif file_name_mode == "geoseeq":
388
+ sname = sample.name.replace(".", "-").replace(" ", "_").lower()
389
+ rtype = read_type.replace("::", "__").replace(".", "-").replace(" ", "_").lower()
390
+ filename = f"{sname}.{rtype}.R{read_num}.L{lane_num}.fastq.gz"
391
+ return filename
392
+ elif file_name_mode == "sample-uuid":
393
+ filename = f"{sample.uuid}.R{read_num}.L{lane_num}.fastq.gz"
394
+ return filename
395
+ elif file_name_mode == "file-uuid":
396
+ filename = f"{result_file.uuid}.fastq.gz"
397
+ return filename
398
+ else:
399
+ raise ValueError(f"Unknown file name mode: {file_name_mode}")
400
+
401
+
402
+ def _get_sample_result_files_with_names(sample, module_name=None, which_fastqs_mode='all', file_name_mode='original'):
403
+ """Return list of (result_file, filename, key) tuples for all fastq files in a sample."""
382
404
  result_files_with_names = []
383
405
  for read_type, folder in sample.get_all_fastqs().items():
384
406
  if module_name and module_name != read_type:
@@ -388,19 +410,18 @@ def _get_sample_result_files_with_names(sample, module_name=None, first=False):
388
410
  lane_num = lane_num + 1 # 1 indexed
389
411
  if read_type in ["short_read::paired_end"]:
390
412
  key = (sample, read_type, 1, lane_num) # sample name, read type, read number, lane number
391
- result_files_with_names.append(
392
- (result_file[0], result_file[0].get_referenced_filename(), key)
393
- )
413
+ fname = _get_local_filename_for_fastq(sample, result_file[0], read_type, 1, lane_num, file_name_mode)
414
+ result_files_with_names.append((result_file[0], fname, key))
415
+ if which_fastqs_mode == "first-r1":
416
+ break
394
417
  key = (sample, read_type, 2, lane_num)
395
- result_files_with_names.append(
396
- (result_file[1], result_file[1].get_referenced_filename(), key)
397
- )
418
+ fname = _get_local_filename_for_fastq(sample, result_file[1], read_type, 2, lane_num, file_name_mode)
419
+ result_files_with_names.append((result_file[1], fname, key))
398
420
  else:
399
421
  key = (sample, read_type, 1, lane_num)
400
- result_files_with_names.append(
401
- (result_file, result_file.get_referenced_filename(), key)
402
- )
403
- if first:
422
+ fname = _get_local_filename_for_fastq(sample, result_file, read_type, 1, lane_num, file_name_mode)
423
+ result_files_with_names.append((result_file, fname, key))
424
+ if which_fastqs_mode in ["first-all", "first-r1"]:
404
425
  break
405
426
 
406
427
  return result_files_with_names
@@ -442,14 +463,52 @@ def _make_read_configs(download_results, config_dir="."):
442
463
  with open(config_path, "w") as f:
443
464
  json.dump(config_blob, f, indent=4)
444
465
 
466
+ def _open_maybe_gzip(local_path):
467
+ """Open a file that may be gzipped. Do not rely on file extension."""
468
+ with open(local_path, "rb") as f:
469
+ magic_number = f.read(2)
470
+ if magic_number == b'\x1f\x8b':
471
+ return gzip.open(local_path, "rt")
472
+ else:
473
+ return open(local_path, "r")
474
+
475
+
476
+ def _trim_fastq_to_complete_reads(key, local_path):
477
+ """Trim a fastq file to the nearest complete read boundary under head_bytes.
478
+
479
+ Write the output as a gzipped file regardless of input compression.
480
+ """
481
+ temp_path = local_path + ".tmp"
482
+ with _open_maybe_gzip(local_path) as infile, gzip.open(temp_path, "wt") as outfile:
483
+ lines_written = 0
484
+ while True:
485
+ read_lines = []
486
+ for _ in range(4):
487
+ line = infile.readline()
488
+ if not line:
489
+ break
490
+ read_lines.append(line)
491
+ if len(read_lines) < 4:
492
+ break # end of file
493
+ if infile.tell() > key[4]: # key[4] is head_bytes
494
+ break # reached head limit
495
+ for line in read_lines:
496
+ outfile.write(line)
497
+ lines_written += 4
498
+ # Replace original file with trimmed file
499
+
500
+ os.replace(temp_path, local_path)
501
+
445
502
 
446
503
  @cli_download.command("fastqs")
447
504
  @use_common_state
448
505
  @cores_option
449
506
  @click.option("--target-dir", default=".")
450
507
  @yes_option
451
- @click.option("--first/--all", default=False, help="Download only the first folder of fastq files for each sample.")
508
+ @click.option('--file-name-mode', type=click.Choice(['original', 'geoseeq', 'sample-uuid', 'file-uuid']), help="Choose how the downloaded fastq files are named.", default='original')
509
+ @click.option("--which-fastqs-mode", type=click.Choice(["first-all", "first-r1", "all"]), default="all", help="Choose which fastq files to download per sample. ")
452
510
  @click.option("--download/--urls-only", default=True, help="Download files or just print urls")
511
+ @head_option
453
512
  @click.option("--config-dir", default=None, help="Directory to write read config files. If unset do not write config files.")
454
513
  @module_option(FASTQ_MODULE_NAMES, use_default=False)
455
514
  @ignore_errors_option
@@ -460,8 +519,10 @@ def cli_download_fastqs(state,
460
519
  cores,
461
520
  target_dir,
462
521
  yes,
463
- first,
522
+ file_name_mode,
523
+ which_fastqs_mode,
464
524
  download,
525
+ head,
465
526
  config_dir,
466
527
  module_name,
467
528
  ignore_errors,
@@ -474,6 +535,20 @@ def cli_download_fastqs(state,
474
535
  This command will download fastq files from a GeoSeeq project. You can filter
475
536
  files by sample name and by specific fastq read types.
476
537
 
538
+ The filenames of the downloaded fastq files can be controlled using the --file-name-mode option:
539
+ - original: Use the original filename as uploaded to GeoSeeq (default)
540
+ - geoseeq: Use a normalized GeoSeeq generated filename that includes the sample name, read type, read number, and lane number.
541
+ - sample-uuid: Use the GeoSeeq UUID of the sample along with lane number and read number.
542
+ - file-uuid: Use the GeoSeeq UUID of the result file only.
543
+
544
+ If the --head option is used to only download the first N bytes of each fastq file, this command
545
+ will automatically clip the fastq files at the nearest complete read boundary to avoid incomplete reads.
546
+
547
+ The --which-fastqs-mode option controls which fastq files are downloaded per sample:
548
+ - first-all: Download all fastq files but from the first fastq folder only.
549
+ - first-r1: Download only the first read (R1) fastq file from the first fastq folder.
550
+ - all: Download all fastq files from all folders.
551
+
477
552
  ---
478
553
 
479
554
  Example Usage:
@@ -523,7 +598,7 @@ def cli_download_fastqs(state,
523
598
  result_files_with_names = []
524
599
  for sample in samples:
525
600
  try:
526
- result_files_with_names += _get_sample_result_files_with_names(sample, module_name, first)
601
+ result_files_with_names += _get_sample_result_files_with_names(sample, module_name, which_fastqs_mode, file_name_mode)
527
602
  except Exception as e:
528
603
  logger.error(f"Error fetching fastq files for sample {sample.name}: {e}")
529
604
  if not ignore_errors:
@@ -538,9 +613,13 @@ def cli_download_fastqs(state,
538
613
  ignore_errors=ignore_errors,
539
614
  log_level=state.log_level,
540
615
  progress_tracker_factory=PBarManager().get_new_bar,
616
+ head=head,
541
617
  )
542
618
  for result_file, filename, key in result_files_with_names:
543
- download_manager.add_download(result_file, join(target_dir, filename), key=key)
619
+ callback = None
620
+ if head:
621
+ callback = _trim_fastq_to_complete_reads
622
+ download_manager.add_download(result_file, join(target_dir, filename), key=key, callback=callback)
544
623
  if not download:
545
624
  print(download_manager.get_url_string(), file=state.outfile)
546
625
  else:
geoseeq/cli/main.py CHANGED
@@ -55,7 +55,7 @@ def version():
55
55
  Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
56
56
  Run `geoseeq eula show` to view the EULA.
57
57
  """
58
- click.echo("0.7.4") # remember to update pyproject.toml
58
+ click.echo("0.7.6") # remember to update pyproject.toml
59
59
 
60
60
 
61
61
  @main.group("advanced")
@@ -103,6 +103,18 @@ class ResultFile(RemoteObject, ResultFileUpload, ResultFileDownload, ResultFileS
103
103
  # except TypeError:
104
104
  # return basename(self.get_blob_filename())
105
105
 
106
def get_stored_data_filename(self):
    """Return the base name of the path recorded in ``stored_data``.

    The first of the keys ``filename``, ``uri``, ``url`` present in
    ``self.stored_data`` is used; its value is reduced to the final path
    component. This is typically the name of the file that was originally
    uploaded to create this result file.

    Raises:
        TypeError: if ``stored_data`` contains none of those keys.
    """
    for candidate in ("filename", "uri", "url"):
        if candidate in self.stored_data:
            stored_path = self.stored_data[candidate]
            return basename(stored_path)
    raise TypeError("Cannot make a reference filename for a BLOB type result field.")
117
+
106
118
  def _save(self):
107
119
  data = {field: getattr(self, field) for field in self.remote_fields if hasattr(self, field)}
108
120
  data["analysis_result"] = self.parent.uuid
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: geoseeq
3
- Version: 0.7.4
3
+ Version: 0.7.6
4
4
  Summary: GeoSeeq command line tools and python API
5
5
  Project-URL: Homepage, https://github.com/biotia/geoseeq_api_client
6
6
  Project-URL: Issues, https://github.com/biotia/geoseeq_api_client/issues
@@ -21,11 +21,11 @@ geoseeq/cli/__init__.py,sha256=4WnK87K5seRK3SGJAxNWnQTqyg5uBhdhrOrzB1D4b3M,24
21
21
  geoseeq/cli/constants.py,sha256=NtRSNBuna42605LE0sVywTPfmzYQnG-3yrT_M7Ml5B0,213
22
22
  geoseeq/cli/copy.py,sha256=02U9kdrAIbbM8MlRMLL6p-LMYFSuRObE3h5jyvcL__M,2275
23
23
  geoseeq/cli/detail.py,sha256=q8Suu-j2k18knfSVFG-SWWGNsKM-n8y9RMA3LcIIi9Y,4132
24
- geoseeq/cli/download.py,sha256=Znjuc9IFOcIa5_Od9mFXHJdYAJtgw9Bc_wPPcOVXn7s,21298
24
+ geoseeq/cli/download.py,sha256=JHKd4VKAgJmvd3fVpSNlxJJT2goLB7eYkWY5UkfXESI,25347
25
25
  geoseeq/cli/fastq_utils.py,sha256=-bmeQLaiMBm57zWOF0R5OlWTU0_3sh1JBC1RYw2BOFM,3083
26
26
  geoseeq/cli/find_grn.py,sha256=oMDxkzGQBQb2_cCuvmwoeHOsFHqyO9RLeJzrB6bAe5M,439
27
27
  geoseeq/cli/get_eula.py,sha256=79mbUwyiF7O1r0g6UTxG9kJGQEqKuH805E6eLkPC6Y4,997
28
- geoseeq/cli/main.py,sha256=gHZzocvIrfigz9EP3V1BxPh5OFNsp6i6_jqbP6vb_a8,4133
28
+ geoseeq/cli/main.py,sha256=mDfD0R7d-FJJyKFMnsMIZDGprVoslE6Ts5QbJTTIBFY,4133
29
29
  geoseeq/cli/manage.py,sha256=wGXAcVaXqE5JQEU8Jh6OlHr02nB396bpS_SFcOZdrEo,5929
30
30
  geoseeq/cli/progress_bar.py,sha256=p1Xl01nkYxSBZCB30ue2verIIi22W93m3ZAMAxipD0g,738
31
31
  geoseeq/cli/project.py,sha256=V5SdXm2Hwo2lxrkpwRDedw-mAE4XnM2uwT-Gj1D90VQ,3030
@@ -76,7 +76,7 @@ geoseeq/result/bioinfo.py,sha256=QQtbyogrdro9avJSN0713sxLVnVeA24mFw3hWtKDKyw,178
76
76
  geoseeq/result/file_chunker.py,sha256=bXq1csuRtqMB5sbH-AfWo6gdPwrivv5DJPuHVj-h08w,1758
77
77
  geoseeq/result/file_download.py,sha256=5IXg_dIWlrRHBJQssO42da5_bIJOyH0_b8K2KWVAFBE,8210
78
78
  geoseeq/result/file_upload.py,sha256=xs1DrI-h4ZP7xN8HPBc3SFpcPAxR5HAolraP1Zu7tvE,10648
79
- geoseeq/result/result_file.py,sha256=mkFh2DpKO1-kEAARCMYjkc7TmkJh41azyauGIHl_VZo,9173
79
+ geoseeq/result/result_file.py,sha256=Mu_8cJYN3tVlkLYnbd_pyGinBoePlmQxLFIbeF-PWyo,9698
80
80
  geoseeq/result/result_folder.py,sha256=iyO0hwZWokrH6oWhBgHlunWMpCMpejKb8v2sHFhecws,11283
81
81
  geoseeq/result/resumable_download_tracker.py,sha256=YEzqHBBnE7L3XokTvlTAhHZ8TcDTIE_pyTQ7YadOfbU,3667
82
82
  geoseeq/result/resumable_upload_tracker.py,sha256=2aI09gYz2yw63jEXqs8lmCRKQ79TIc3YuPETvP0Jeek,3811
@@ -92,8 +92,8 @@ geoseeq/vc/vc_cache.py,sha256=P4LXTbq2zOIv1OhP7Iw5MmypR2vXuy29Pq5K6gRvi-M,730
92
92
  geoseeq/vc/vc_dir.py,sha256=A9CLTh2wWCRzZjiLyqXD1vhtsWZGD3OjaMT5KqlfAXI,457
93
93
  geoseeq/vc/vc_sample.py,sha256=qZeioWydXvfu4rGMs20nICfNcp46y_XkND-bHdV6P5M,3850
94
94
  geoseeq/vc/vc_stub.py,sha256=IQr8dI0zsWKVAeY_5ybDD6n49_3othcgfHS3P0O9tuY,3110
95
- geoseeq-0.7.4.dist-info/METADATA,sha256=cJnudtWL38eoOqQF1IfjmYaslq5beDTFMNPYP7Y2vio,5652
96
- geoseeq-0.7.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
97
- geoseeq-0.7.4.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
98
- geoseeq-0.7.4.dist-info/licenses/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
99
- geoseeq-0.7.4.dist-info/RECORD,,
95
+ geoseeq-0.7.6.dist-info/METADATA,sha256=yTvBpyBSN4eelbfdHrL3jxWYG38vN7uOL7fSUlFp5ok,5652
96
+ geoseeq-0.7.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
97
+ geoseeq-0.7.6.dist-info/entry_points.txt,sha256=yF-6KDM8zXib4Al0qn49TX-qM7PUkWUIcYtsgt36rjM,45
98
+ geoseeq-0.7.6.dist-info/licenses/LICENSE,sha256=IuhIl1XCxXLPLJT_coN1CNqQU4Khlq7x4IdW7ioOJD8,1067
99
+ geoseeq-0.7.6.dist-info/RECORD,,