PyPI - geoseeq - Versions diffs - 0.2.1__tar.gz → 0.2.3__tar.gz - Mend

geoseeq 0.2.1tar.gz → 0.2.3tar.gz

Files changed (75) hide show

{geoseeq-0.2.1 → geoseeq-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geoseeq
-Version: 0.2.1
+Version: 0.2.3
 Summary: # Geoseeq API Client
 Author: David C. Danko
 Author-email: dcdanko@biotia.io

{geoseeq-0.2.1 → geoseeq-0.2.3}/geoseeq/cli/download.py RENAMED Viewed

@@ -5,7 +5,7 @@ from os.path import dirname, join
 import click
 import pandas as pd
+from multiprocessing import Pool
 from .shared_params import (
     handle_project_id,
     project_id_arg,
@@ -14,13 +14,14 @@ from .shared_params import (
     use_common_state,
     flatten_list_of_els_and_files
 )
-from geoseeq.result.utils import _download_head
+from geoseeq.result.file_download import download_url
 from geoseeq.utils import download_ftp
 from geoseeq.blob_constructors import (
     sample_result_file_from_uuid,
     project_result_file_from_uuid,
 )
 from geoseeq.knex import GeoseeqNotFoundError
+from .progress_bar import PBarManager
 from .utils import convert_size
 logger = logging.getLogger('geoseeq_api')
@@ -83,8 +84,16 @@ def cli_download_metadata(state, sample_ids):
     click.echo("Metadata successfully downloaded for samples.", err=True)
+def _download_one_file(args):
+    """Download one queued file; worker entry point for `Pool.imap_unordered`.
+    `args` is a `(target, file_path, pbar)` tuple, packed that way because
+    pool workers receive a single argument. `target` is either a URL string
+    (queued by the `files` command) or a result-file object exposing
+    `.download()` (queued by the `ids` command). Returns the value of the
+    underlying download call.
+    """
+    target, file_path, pbar = args
+    if isinstance(target, str):
+        return download_url(target, filename=file_path, progress_tracker=pbar)
+    # The `ids` command queues result-file objects rather than URLs; passing
+    # one to `download_url` would fail as soon as the URL text is inspected.
+    # NOTE: the queued tuple carries no `head` value, so this path always
+    # fetches the whole file.
+    return target.download(file_path, progress_tracker=pbar)
+cores_option = click.option('--cores', default=1, help='Number of downloads to run in parallel')
 @cli_download.command("files")
 @use_common_state
+@cores_option
 @click.option("--target-dir", default=".")
 @click.option('--yes/--confirm', default=False, help='Skip confirmation prompts')
 @click.option("--download/--urls-only", default=True, help="Download files or just print urls")
@@ -98,6 +107,7 @@ def cli_download_metadata(state, sample_ids):
 @sample_ids_arg
 def cli_download_files(
     state,
+    cores,
     sample_name_includes,
     target_dir,
     yes,
@@ -186,23 +196,32 @@ def cli_download_files(
         if not yes:
             click.confirm('Do you want to download these files?', abort=True)
+        download_args = []
+        pbars = PBarManager()
         for fname, url in response["links"].items():
             click.echo(f"Downloading file {fname}")
             file_path = join(target_dir, fname)
             makedirs(dirname(file_path), exist_ok=True)
-            if url.startswith("ftp"):
-                download_ftp(url, file_path)
-            else:
-                _download_head(url, file_path)
+            pbar = pbars.get_new_bar(file_path)
+            download_args.append((url, file_path, pbar))
+            if cores == 1:
+                download_url(url, filename=file_path, progress_tracker=pbar)
+        if cores > 1:
+            with Pool(cores) as p:
+                for _ in p.imap_unordered(_download_one_file, download_args):
+                    pass
 @cli_download.command("ids")
 @use_common_state
+@cores_option
 @click.option("--target-dir", default=".")
 @click.option('--yes/--confirm', default=False, help='Skip confirmation prompts')
 @click.option("--download/--urls-only", default=True, help="Download files or just print urls")
+@click.option('--head', default=None, type=int, help='Download the first N bytes of each file')
 @click.argument("ids", nargs=-1)
-def cli_download_ids(state, target_dir, yes, download, ids):
+def cli_download_ids(state, cores, target_dir, yes, download, head, ids):
     """Download a files from GeoSeeq based on their UUID or GeoSeeq Resource Number (GRN).
     This command downloads files directly based on their ID. This is used for "manual"
@@ -228,6 +247,7 @@ def cli_download_ids(state, target_dir, yes, download, ids):
     ---
     """
     result_file_ids = flatten_list_of_els_and_files(ids)
+    # `min` caps the pool at the number of files; the floor of 1 keeps the
+    # serial path usable when the id list is empty.
+    cores = max(1, min(cores, len(result_file_ids)))  # don't use more cores than files
     knex = state.get_knex()
     result_files = []
     for result_id in result_file_ids:
@@ -249,8 +269,18 @@ def cli_download_ids(state, target_dir, yes, download, ids):
     if not yes:
         click.confirm('Do you want to download these files?', abort=True)
+    download_args = []
+    pbars = PBarManager()
     for result_file in result_files:
         click.echo(f"Downloading file {result_file.get_referenced_filename()}")
         file_path = join(target_dir, result_file.get_referenced_filename())
         makedirs(dirname(file_path), exist_ok=True)
-        result_file.download(file_path)
+        pbar = pbars.get_new_bar(file_path)
+        download_args.append((result_file, file_path, pbar))
+        if cores == 1:
+            result_file.download(file_path, progress_tracker=pbar, head=head)
+    if cores > 1:
+        with Pool(cores) as p:
+            for _ in p.imap_unordered(_download_one_file, download_args):
+                pass

{geoseeq-0.2.1 → geoseeq-0.2.3}/geoseeq/cli/main.py RENAMED Viewed

@@ -31,7 +31,7 @@ main.add_command(cli_upload)
 @main.command()
 def version():
     """Print the version of the Geoseeq API being used."""
-    click.echo('0.2.1')  # remember to update setup
+    click.echo('0.2.3')  # remember to update setup
 @main.group('advanced')

geoseeq-0.2.3/geoseeq/cli/progress_bar.py ADDED Viewed

@@ -0,0 +1,28 @@
+from tqdm import tqdm
+from os.path import basename
+class TQBar:
+    """Progress tracker backed by a single `tqdm` bar.
+    The bar is created lazily by `set_num_chunks`, once the total is known.
+    """
+    def __init__(self, pos, desc) -> None:
+        # NOTE: despite the name, `n_bars` stores the total handed to
+        # `set_num_chunks`, not a count of bars.
+        self.n_bars = 0
+        self.pos = pos  # forwarded to tqdm as `position`
+        self.desc = desc  # forwarded to tqdm as `desc`
+        self.bar = None  # created by `set_num_chunks`
+    def set_num_chunks(self, n_chunks):
+        """Create the underlying bar with `n_chunks` as its total."""
+        if self.bar is not None:
+            # Replace, rather than orphan, a bar left over from an earlier call.
+            self.bar.close()
+        self.n_bars = n_chunks
+        self.bar = tqdm(total=n_chunks, position=self.pos, desc=self.desc, leave=False)
+    def update(self, chunk_num):
+        """Advance the bar by `chunk_num`; a no-op until `set_num_chunks` has run."""
+        if self.bar is None:
+            return
+        self.bar.update(chunk_num)
+    def close(self):
+        """Close the underlying bar if one exists. Safe to call repeatedly."""
+        if self.bar is not None:
+            self.bar.close()
+            self.bar = None
+class PBarManager:
+    """Hands out `TQBar` trackers, giving each the next `position` index (starting at 1)."""
+    def __init__(self):
+        self.n_bars = 0  # number of bars handed out so far
+        self.pbars = []  # every bar created by `get_new_bar`, in creation order
+    def get_new_bar(self, filepath):
+        """Return a new `TQBar` labelled with the basename of `filepath`."""
+        self.n_bars += 1
+        bar = TQBar(self.n_bars, basename(filepath))
+        # Record the bar so `pbars` reflects what was handed out.
+        self.pbars.append(bar)
+        return bar

{geoseeq-0.2.1 → geoseeq-0.2.3}/geoseeq/cli/upload/upload_reads.py RENAMED Viewed

@@ -1,5 +1,4 @@
 import logging
-from tqdm import tqdm
 import click
 import requests
 from os.path import basename
@@ -20,39 +19,11 @@ from geoseeq.cli.shared_params import (
 )
 from geoseeq.constants import FASTQ_MODULE_NAMES
+from geoseeq.cli.progress_bar import PBarManager
 logger = logging.getLogger('geoseeq_api')
-class TQBar:
-    def __init__(self, pos, desc) -> None:
-        self.n_bars = 0
-        self.pos = pos
-        self.desc = desc
-        self.bar = None
-    def set_num_chunks(self, n_chunks):
-        self.n_bars = n_chunks
-        self.bar = tqdm(total=n_chunks, position=self.pos, desc=self.desc, leave=False)
-    def update(self, chunk_num):
-        self.bar.update(chunk_num)
-class PBarManager:
-    def __init__(self):
-        self.n_bars = 0
-        self.pbars = []
-    def get_new_bar(self, filepath):
-        self.n_bars += 1
-        return TQBar(self.n_bars, basename(filepath))
 def _make_in_process_logger(log_level):
     logger = logging.getLogger('geoseeq_api')
     logger.setLevel(log_level)

geoseeq-0.2.3/geoseeq/result/file_download.py ADDED Viewed

@@ -0,0 +1,102 @@
+import urllib.request
+import logging
+import requests
+from os.path import basename, getsize, join
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from geoseeq.utils import download_ftp
+from geoseeq.constants import FIVE_MB
+logger = logging.getLogger("geoseeq_api")  # Same name as calling module
+def _download_head(url, filename, head=None, progress_tracker=None):
+    """Stream `url` to `filename` over HTTP(S), optionally only its first `head` bytes.
+    `progress_tracker`, when given, is told the response's content-length via
+    `set_num_chunks` and then the size of each block written via `update`.
+    Returns `filename`. Raises `requests.HTTPError` when the server answers
+    with an error status, so an error page is never saved as the file.
+    """
+    headers = None
+    if head and head > 0:
+        # HTTP byte ranges are inclusive at both ends: the first `head` bytes
+        # are 0..head-1.
+        headers = {"Range": f"bytes=0-{head - 1}"}
+    response = requests.get(url, stream=True, headers=headers)
+    if headers and response.status_code == 416:
+        # The range cannot be satisfied for this file; fetch it whole instead.
+        response.close()
+        logger.warning("Range request failed, trying again without a range.")
+        response = requests.get(url, stream=True)
+    with response:  # releases the connection even if writing fails
+        response.raise_for_status()
+        total_size_in_bytes = int(response.headers.get('content-length', 0))
+        if progress_tracker: progress_tracker.set_num_chunks(total_size_in_bytes)
+        block_size = FIVE_MB
+        with open(filename, 'wb') as file:
+            for data in response.iter_content(block_size):
+                if progress_tracker: progress_tracker.update(len(data))
+                file.write(data)
+    return filename
+def _download_generic(url, filename, head=None):
+    """Download `url` to `filename` with urllib and return `filename`.
+    When `head` is a positive integer only the first `head` bytes are kept;
+    otherwise the whole resource is retrieved.
+    """
+    if head and head > 0:
+        # Ask for just the prefix (inclusive range), and also cap the copy
+        # locally in case the range header is ignored.
+        request = urllib.request.Request(url, headers={"Range": f"bytes=0-{head - 1}"})
+        with urllib.request.urlopen(request) as response, open(filename, 'wb') as out:
+            remaining = head
+            while remaining > 0:
+                chunk = response.read(min(remaining, 1 << 20))
+                if not chunk:
+                    break
+                out.write(chunk)
+                remaining -= len(chunk)
+        return filename
+    urllib.request.urlretrieve(url, filename)
+    return filename
+def guess_download_kind(url):
+    """Guess the download method for `url`: 'azure', 's3', 'ftp' or 'generic'."""
+    # An explicit FTP scheme wins. A bare substring test would route
+    # "ftp://host/reads3/x.fq" to S3 because its path contains "s3".
+    if url.lower().startswith(('ftp://', 'ftps://')):
+        return 'ftp'
+    if 'azure' in url:
+        return 'azure'
+    elif 's3' in url:
+        return 's3'
+    elif 'ftp' in url:
+        return 'ftp'
+    else:
+        return 'generic'
+def download_url(url, kind='guess', filename=None, head=None, progress_tracker=None):
+    """Download `url` to a local file and return that file's path.
+    `kind` selects the download method: 'generic', 'sra', 's3', 'azure' or
+    'ftp'; the default 'guess' derives it from the URL. When `filename` is
+    not given, a named temporary file is created and kept. `head` limits the
+    download to the first N bytes where the method supports it.
+    `progress_tracker` is used by the HTTP streaming path ('s3' and 'azure').
+    Raises `ValueError` for an unrecognised `kind`.
+    """
+    if kind == 'guess':
+        kind = guess_download_kind(url)
+        logger.info(f"Guessed download kind: {kind} for {url}")
+    if not filename:
+        # Without a destination, `open(None)` would fail further down.
+        tmp = NamedTemporaryFile(delete=False)
+        tmp.close()
+        filename = tmp.name
+    logger.info(f"Downloading {kind} file to {filename}")
+    if kind in ('generic', 'sra'):
+        # 'sra' results carry a plain URL, so they take the generic path.
+        return _download_generic(url, filename, head=head)
+    elif kind in ('s3', 'azure'):
+        return _download_head(url, filename, head=head, progress_tracker=progress_tracker)
+    elif kind == 'ftp':
+        # Return the path explicitly rather than relying on what
+        # `download_ftp` returns.
+        download_ftp(url, filename, head=head)
+        return filename
+    else:
+        raise ValueError(f"Unknown download kind: {kind}")
+class ResultFileDownload:
+    """Abstract class that handles download methods for result files.
+    Reads `self.stored_data` and `self._cached_filename`, which this class
+    does not set up itself.
+    """
+    def get_download_url(self):
+        """Return a URL that can be used to download the file for this result.
+        Raises `ValueError` when the stored `__type__` is not one of
+        s3, sra, ftp or azure.
+        """
+        blob_type = self.stored_data.get("__type__", "").lower()
+        if blob_type not in ["s3", "sra", "ftp", "azure"]:
+            raise ValueError(f'Unknown URL type: "{blob_type}"')
+        key = 'url' if 'url' in self.stored_data else 'uri'
+        if blob_type in ["s3", "azure"]:
+            # Prefer a presigned URL when one is stored.
+            try:
+                url = self.stored_data["presigned_url"]
+            except KeyError:
+                url = self.stored_data[key]
+            if url.startswith("s3://"):
+                # Rewrite s3://bucket/key against the stored endpoint.
+                url = self.stored_data["endpoint_url"] + "/" + url[5:]
+            return url
+        else:
+            return self.stored_data[key]
+    def download(self, filename=None, cache=True, head=None, progress_tracker=None):
+        """Return a local filepath to the file this result points to.
+        With `cache` set, a previously downloaded path is returned without
+        downloading again and the new path is remembered afterwards. When
+        `filename` is omitted, a named temporary file is created and kept.
+        `head` and `progress_tracker` are forwarded to `download_url`.
+        """
+        # Check the cache first so a hit neither creates a stray temp file
+        # nor flags the cached path as temporary.
+        if cache and self._cached_filename:
+            return self._cached_filename
+        if not filename:
+            self._temp_filename = True
+            myfile = NamedTemporaryFile(delete=False)
+            myfile.close()
+            filename = myfile.name
+        blob_type = self.stored_data.get("__type__", "").lower()
+        # `download_url` has no 'sra' kind; those results carry a plain URL.
+        kind = 'generic' if blob_type == 'sra' else blob_type
+        url = self.get_download_url()
+        filepath = download_url(
+            url, kind, filename,
+            head=head, progress_tracker=progress_tracker
+        )
+        if cache:
+            self._cached_filename = filepath
+        return filepath

{geoseeq-0.2.1 → geoseeq-0.2.3}/geoseeq/result/file_upload.py RENAMED Viewed

@@ -108,7 +108,7 @@ class ResultFileUpload:
             for num, url in enumerate(list(urls.values())):
                 response_part = self._upload_one_part(file_chunker, url, num, max_retries, session)
                 complete_parts.append(response_part)
-                progress_tracker.update(file_chunker.get_chunk_size(num))
+                if progress_tracker: progress_tracker.update(file_chunker.get_chunk_size(num))
                 logger.info(f'Uploaded part {num + 1} of {len(urls)} for "{file_chunker.filepath}"')
             return complete_parts
@@ -123,7 +123,7 @@ class ResultFileUpload:
             for future in as_completed(futures):
                 response_part = future.result()
                 complete_parts.append(response_part)
-                progress_tracker.update(file_chunker.get_chunk_size(response_part["PartNumber"] - 1))
+                if progress_tracker: progress_tracker.update(file_chunker.get_chunk_size(response_part["PartNumber"] - 1))
                 logger.info(
                     f'Uploaded part {response_part["PartNumber"]} of {len(urls)} for "{file_chunker.filepath}"'
                 )
@@ -137,7 +137,7 @@ class ResultFileUpload:
         chunk_size=FIVE_MB,
         max_retries=3,
         session=None,
-        progress_tracker=lambda x: None,
+        progress_tracker=None,
         threads=1,
     ):
         """Upload a file to S3 using the multipart upload process."""
@@ -146,7 +146,7 @@ class ResultFileUpload:
         logger.info(f'Starting upload for "{filepath}"')
         complete_parts = []
         file_chunker = FileChunker(filepath, chunk_size).load_all_chunks()
-        progress_tracker.set_num_chunks(file_chunker.file_size)
+        if progress_tracker: progress_tracker.set_num_chunks(file_chunker.file_size)
         complete_parts = self._upload_parts(file_chunker, urls, max_retries, session, progress_tracker, threads)
         self._finish_multipart_upload(upload_id, complete_parts)
         logger.info(f'Finished Upload for "{filepath}"')

{geoseeq-0.2.1 → geoseeq-0.2.3}/geoseeq/result/utils.py RENAMED Viewed

@@ -15,22 +15,6 @@ from geoseeq.utils import download_ftp, md5_checksum
 logger = logging.getLogger("geoseeq_api")  # Same name as calling module
 logger.addHandler(logging.NullHandler())  # No output unless configured by calling program
-def _download_head(url, filename, head=None):
-    if head and head > 0:
-        opener = urllib.request.build_opener()
-        if head:
-            opener.addheaders = [('Range', f'bytes=0-{head}')]
-        urllib.request.install_opener(opener)
-    try:
-        urllib.request.urlretrieve(url, filename)  # can throw 416 error if head is too large
-    except urllib.error.HTTPError as e:
-        if e.code == 416:
-            logger.warning(f"HEAD request failed, trying again without HEAD.")
-            _download_head(url, filename, head=None)
-        else:
-            raise e
 def diff_dicts(blob1, blob2):

{geoseeq-0.2.1 → geoseeq-0.2.3}/geoseeq.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geoseeq
-Version: 0.2.1
+Version: 0.2.3
 Summary: # Geoseeq API Client
 Author: David C. Danko
 Author-email: dcdanko@biotia.io

{geoseeq-0.2.1 → geoseeq-0.2.3}/geoseeq.egg-info/SOURCES.txt RENAMED Viewed

@@ -32,6 +32,7 @@ geoseeq/cli/download.py
 geoseeq/cli/fastq_utils.py
 geoseeq/cli/list.py
 geoseeq/cli/main.py
+geoseeq/cli/progress_bar.py
 geoseeq/cli/user.py
 geoseeq/cli/utils.py
 geoseeq/cli/view.py

{geoseeq-0.2.1 → geoseeq-0.2.3}/setup.py RENAMED Viewed

@@ -5,7 +5,7 @@ import setuptools
 setuptools.setup(
     name='geoseeq',
-    version='0.2.1',  # remember to update version string in CLI as well
+    version='0.2.3',  # remember to update version string in CLI as well
     author="David C. Danko",
     author_email='dcdanko@biotia.io',
     description=open('README.md').read(),

geoseeq-0.2.1/geoseeq/result/file_download.py DELETED Viewed

@@ -1,95 +0,0 @@
-import urllib.request
-from os.path import basename, getsize, join
-from pathlib import Path
-from tempfile import NamedTemporaryFile
-from geoseeq.utils import download_ftp
-from .utils import *
-class ResultFileDownload:
-    """Abstract class that handles download methods for result files."""
-    def get_download_url(self):
-        """Return a URL that can be used to download the file for this result."""
-        blob_type = self.stored_data.get("__type__", "").lower()
-        if blob_type not in ["s3", "sra"]:
-            raise TypeError("Cannot fetch a file for a BLOB type result field.")
-        if blob_type == "s3":
-            try:
-                url = self.stored_data["presigned_url"]
-            except KeyError:
-                url = self.stored_data["uri"]
-            if url.startswith("s3://"):
-                url = self.stored_data["endpoint_url"] + "/" + url[5:]
-            return url
-        elif blob_type == "sra":
-            url = self.stored_data["url"]
-            return url
-    def download_file(self, filename=None, cache=True, head=None):
-        """Return a local filepath to the file this result points to."""
-        if not filename:
-            self._temp_filename = True
-            myfile = NamedTemporaryFile(delete=False)
-            myfile.close()
-            filename = myfile.name
-        blob_type = self.stored_data.get("__type__", "").lower()
-        if cache and self._cached_filename:
-            return self._cached_filename
-        if blob_type == "s3":
-            return self._download_s3(filename, cache, head=head)
-        elif blob_type == "sra":
-            return self._download_sra(filename, cache)
-        elif blob_type == "ftp":
-            return self._download_ftp(filename, cache)
-        elif blob_type == "azure":
-            return self._download_azure(filename, cache, head=head)
-        else:
-            raise TypeError("Cannot fetch a file for a BLOB type result field.")
-    def _download_s3(self, filename, cache, head=None):
-        logger.info(f"Downloading S3 file to {filename}")
-        try:
-            url = self.stored_data["presigned_url"]
-        except KeyError:
-            key = 'uri' if 'uri' in self.stored_data else 'url'
-            url = self.stored_data[key]
-        if url.startswith("s3://"):
-            url = self.stored_data["endpoint_url"] + "/" + url[5:]
-        _download_head(url, filename, head=head)
-        if cache:
-            self._cached_filename = filename
-        return filename
-    def _download_azure(self, filename, cache, head=None):
-        logger.info(f"Downloading Azure file to {filename}")
-        try:
-            url = self.stored_data["presigned_url"]
-        except KeyError:
-            key = 'uri' if 'uri' in self.stored_data else 'url'
-            url = self.stored_data[key]
-        _download_head(url, filename, head=head)
-        if cache:
-            self._cached_filename = filename
-        return filename
-    def _download_sra(self, filename, cache):
-        return self._download_generic_url(filename, cache)
-    def _download_ftp(self, filename, cache, head=None):
-        logger.info(f"Downloading FTP file to {filename}")
-        key = 'url' if 'url' in self.stored_data else 'uri'
-        download_ftp(self.stored_data[key], filename, head=head)
-        return filename
-    def _download_generic_url(self, filename, cache):
-        logger.info(f"Downloading generic URL file to {filename}")
-        key = 'url' if 'url' in self.stored_data else 'uri'
-        url = self.stored_data[key]
-        urllib.request.urlretrieve(url, filename)
-        if cache:
-            self._cached_filename = filename
-        return filename