PyPI - geoseeq - Versions diffs - 0.2.22__tar.gz → 0.3.1__tar.gz - Mend

geoseeq 0.2.22tar.gz → 0.3.1tar.gz

Files changed (91) hide show

{geoseeq-0.2.22 → geoseeq-0.3.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geoseeq
-Version: 0.2.22
+Version: 0.3.1
 Summary: # Geoseeq API Client
 Author: David C. Danko
 Author-email: dcdanko@biotia.io

geoseeq-0.3.1/geoseeq/cli/detail.py ADDED Viewed

@@ -0,0 +1,39 @@
+import click
+import json
+from .shared_params import (
+    use_common_state,
+    project_id_arg,
+    sample_ids_arg,
+    yes_option,
+    private_option,
+    org_id_arg,
+    handle_project_id,
+    handle_multiple_sample_ids,
+    handle_org_id,
+)
+from geoseeq.id_constructors import resolve_id
+# Click command group that holds the `detail` subcommands (see `detail_folder`
+# below). The docstring doubles as the help text click shows for the group.
+@click.group('detail')
+def cli_detail():
+    """Detail objects on GeoSeeq."""
+    pass
+@cli_detail.command('folder')
+@use_common_state
+@click.argument('grn')
+def detail_folder(state, grn):
+    """Print a folder and each of its files with timestamps and stored data.
+
+    GRN identifies the folder to describe. It is resolved with `resolve_id`
+    and the command stops with a usage error if it does not name a folder.
+    """
+    kind, rfolder = resolve_id(state.get_knex(), grn)
+    # Raise a click error rather than `assert`: asserts are stripped under
+    # `python -O`, and a failed assert reaches the user as a bare traceback.
+    if kind != 'folder':
+        raise click.UsageError(f'Expected a folder but "{grn}" resolved to "{kind}".')
+    click.echo('Folder:')
+    click.echo(rfolder)
+    click.echo(f'Created at: {rfolder.created_at}')
+    click.echo(f'Updated at: {rfolder.updated_at}')
+    click.echo('Files:')
+    for rfile in rfolder.get_result_files():
+        click.echo(rfile)
+        click.echo(f'Created at: {rfile.created_at}')
+        click.echo(f'Updated at: {rfile.updated_at}')
+        click.echo(json.dumps(rfile.stored_data, indent=2))
+        # Separator line between consecutive files.
+        click.echo('--')

{geoseeq-0.2.22 → geoseeq-0.3.1}/geoseeq/cli/download.py RENAMED Viewed

@@ -166,22 +166,22 @@ def cli_download_files(
     """
     knex = state.get_knex()
     proj = handle_project_id(knex, project_id)
+    logger.info(f"Found project \"{proj.name}\"")
     samples = []
     if sample_ids:
         logger.info(f"Fetching info for {len(sample_ids)} samples.")
         samples = handle_multiple_sample_ids(knex, sample_ids, proj=proj)
-    data = {
-        "sample_uuids": [s.uuid for s in samples],
-        "sample_names": sample_name_includes,
-        "folder_type": folder_type,
-        "folder_names": folder_name,
-        "file_names": file_name,
-        "extensions": extension,
-        "with_versions": with_versions
-    }
-    url = f"sample_groups/{proj.uuid}/download"
-    response = knex.post(url, data)
+    response = proj.bulk_find_files(
+        sample_uuids=[s.uuid for s in samples],
+        sample_name_includes=sample_name_includes,
+        folder_types=folder_type,
+        folder_names=folder_name,
+        file_names=file_name,
+        extensions=extension,
+        with_versions=with_versions,
+    )
     if not download:
         data = json.dumps(response["links"])

{geoseeq-0.2.22 → geoseeq-0.3.1}/geoseeq/cli/main.py RENAMED Viewed

@@ -7,7 +7,7 @@ import click
 from .copy import cli_copy
 from .manage import cli_manage
 from .download import cli_download
-from .upload import cli_upload
+from .upload import cli_upload, cli_upload_advanced
 from .user import cli_user
 from .view import cli_view
 from .search import cli_search
@@ -15,6 +15,7 @@ from geoseeq.vc.cli import cli_vc
 from geoseeq.knex import DEFAULT_ENDPOINT
 from .shared_params.config import set_profile
 from .shared_params.opts_and_args import overwrite_option
+from .detail import cli_detail
 logger = logging.getLogger('geoseeq_api')
 handler = logging.StreamHandler()
@@ -35,7 +36,7 @@ main.add_command(cli_search)
 @main.command()
 def version():
     """Print the version of the Geoseeq API being used."""
-    click.echo('0.2.22')  # remember to update setup
+    click.echo('0.3.1')  # remember to update setup
 @main.group('advanced')
@@ -45,6 +46,8 @@ def cli_advanced():
 cli_advanced.add_command(cli_copy)
 cli_advanced.add_command(cli_user)
+cli_advanced.add_command(cli_detail)
+cli_advanced.add_command(cli_upload_advanced)
 @cli_advanced.group('experimental')
 def cli_experimental():

{geoseeq-0.2.22 → geoseeq-0.3.1}/geoseeq/cli/upload/__init__.py RENAMED Viewed

@@ -5,7 +5,7 @@ from .upload import (
     cli_metadata,
 )
 from .upload_reads import cli_upload_reads_wizard
+from .upload_advanced import cli_find_urls_for_reads
 @click.group('upload')
 def cli_upload():
@@ -15,3 +15,10 @@ def cli_upload():
 cli_upload.add_command(cli_upload_reads_wizard)
 cli_upload.add_command(cli_upload_file)
 cli_upload.add_command(cli_metadata)
+# A second click group that is also named 'upload'; it is a separate object
+# from `cli_upload` above and only carries the advanced upload commands.
+@click.group('upload')
+def cli_upload_advanced():
+    """Advanced tools to upload files to GeoSeeq."""
+    pass
+# Register the `read-links` command on the advanced group only.
+cli_upload_advanced.add_command(cli_find_urls_for_reads)

geoseeq-0.3.1/geoseeq/cli/upload/upload_advanced.py ADDED Viewed

@@ -0,0 +1,92 @@
+import logging
+import click
+import requests
+from os.path import basename, getsize
+from .upload_reads import (
+    _make_in_process_logger,
+    _get_regex,
+    _group_files,
+    flatten_list_of_fastqs,
+)
+from multiprocessing import Pool, current_process
+from geoseeq.cli.constants import *
+from geoseeq.cli.shared_params import (
+    handle_project_id,
+    private_option,
+    module_option,
+    project_id_arg,
+    overwrite_option,
+    yes_option,
+    use_common_state,
+)
+from geoseeq.constants import FASTQ_MODULE_NAMES
+from geoseeq.cli.progress_bar import PBarManager
+logger = logging.getLogger('geoseeq_api')
+def _keep_only_authentication_url_args(url):
+    """Return `url` with only the S3 authentication query arguments kept.
+
+    Arguments that start with `AWSAccessKeyId=` or `Signature=` are kept in
+    their original order and every other argument is dropped. The result
+    always has the form `<root>?<kept args>`, so a url with no matching
+    argument (or no query string at all) comes back as `<root>?`.
+    """
+    auth_prefixes = ('AWSAccessKeyId=', 'Signature=')
+    # partition() splits on the first '?' only, so a url without a query
+    # string, or with a literal '?' inside its query, no longer raises
+    # ValueError the way tuple-unpacking `url.split('?')` did.
+    root, _, query = url.partition('?')
+    kept = [arg for arg in query.split('&') if arg.startswith(auth_prefixes)]
+    return root + '?' + '&'.join(kept)
+def _get_url_for_one_file(args):
+    """Return `(filepath, url)` for one file, or None when the file is skipped.
+
+    `args` is a 4-tuple `(result_file, filepath, overwrite, log_level)` so the
+    function can be mapped over a process pool. A file whose result already
+    exists is skipped (None is returned) unless `overwrite` is true.
+    """
+    result_file, filepath, overwrite, log_level = args
+    _make_in_process_logger(log_level)
+    already_exists = result_file.exists()
+    if already_exists and not overwrite:
+        return None
+    result_file = result_file.idem()
+    n_bytes = getsize(filepath)
+    # The chunk size is one byte larger than the file; only the url stored
+    # under key '1' of the returned mapping is used.
+    _, part_urls = result_file._prep_multipart_upload(filepath, n_bytes, n_bytes + 1, {})
+    target_url = _keep_only_authentication_url_args(part_urls['1'])
+    return filepath, target_url
+def _find_target_urls(groups, module_name, lib, filepaths, overwrite, cores, state):
+    """Use GeoSeeq to get target urls for a set of files.
+
+    For every group, the sample named `group['sample_name']` and its
+    `module_name` result folder are fetched or created with `idem()`, and one
+    work item is queued per entry of `group['fields']`. The work items are
+    then processed by `_get_url_for_one_file` in a pool of `cores` processes.
+
+    Yields `(filepath, target_url)` tuples in completion order, which may
+    differ from input order. Files the worker skips produce no output.
+    """
+    find_url_args = []
+    for group in groups:
+        sample = lib.sample(group['sample_name']).idem()
+        read_folder = sample.result_folder(module_name).idem()
+        for field_name, path in group['fields'].items():
+            result_file = read_folder.read_file(field_name)
+            # `filepaths` maps the name stored in the group to the full path.
+            find_url_args.append((
+                result_file, filepaths[path], overwrite, state.log_level
+            ))
+    with Pool(cores) as p:
+        for result in p.imap_unordered(_get_url_for_one_file, find_url_args):
+            # The worker returns None for a file that already exists when
+            # `overwrite` is false; unpacking that None raised TypeError.
+            if result is None:
+                continue
+            yield result
+@click.command('read-links')
+@use_common_state
+@click.option('--cores', default=1, help='Number of uploads to run in parallel')
+@overwrite_option
+@yes_option
+@click.option('--regex', default=None, help='An optional regex to use to extract sample names from the file names')
+@private_option
+@module_option(FASTQ_MODULE_NAMES)
+@project_id_arg
+@click.argument('fastq_files', type=click.Path(exists=True), nargs=-1)
+def cli_find_urls_for_reads(state, cores, overwrite, yes, regex, private, module_name, project_id, fastq_files):
+    """Print a two column list with filenames and a target storage URL
+    """
+    knex = state.get_knex()
+    project = handle_project_id(knex, project_id, yes, private)
+    # Index every fastq path by its basename; a later path with the same
+    # basename replaces an earlier one.
+    filepaths = {}
+    for fastq_path in flatten_list_of_fastqs(fastq_files):
+        filepaths[basename(fastq_path)] = fastq_path
+    # The count goes to stderr so it stays out of the two-column output.
+    click.echo(f'Found {len(filepaths)} files to upload.', err=True)
+    sample_regex = _get_regex(knex, filepaths, module_name, project, regex)
+    file_groups = _group_files(knex, filepaths, module_name, sample_regex, yes)
+    url_pairs = _find_target_urls(file_groups, module_name, project, filepaths, overwrite, cores, state)
+    for file_name, target_url in url_pairs:
+        print(f'{file_name}\t{target_url}', file=state.outfile)

{geoseeq-0.2.22 → geoseeq-0.3.1}/geoseeq/project.py RENAMED Viewed

@@ -4,7 +4,9 @@ from .sample import Sample
 from .utils import paginated_iterator
 import json
 import pandas as pd
+import logging
+logger = logging.getLogger("geoseeq_api")
 class Project(RemoteObject):
@@ -14,9 +16,11 @@ class Project(RemoteObject):
         "name",
         "privacy_level",
         "description",
+        "samples_count",
     ]
     optional_remote_fields = [
         "privacy_level",
+        "samples_count",
     ]
     parent_field = "org"
     url_prefix = "sample_groups"
@@ -195,6 +199,18 @@ class Project(RemoteObject):
         for sample_blob in paginated_iterator(self.knex, url, error_handler=error_handler):
             yield sample_blob['uuid']
+    def _batch_sample_uuids(self, batch_size, input_sample_uuids=[]):
+        """Yield lists of at most `batch_size` sample uuids.
+
+        The uuids come from `input_sample_uuids` when it is non-empty and from
+        `self.get_sample_uuids()` otherwise. The last list may be shorter than
+        `batch_size`; nothing is yielded when there are no uuids at all.
+        """
+        source = input_sample_uuids or self.get_sample_uuids()
+        current_batch = []
+        for uuid in source:
+            current_batch.append(uuid)
+            if len(current_batch) != batch_size:
+                continue
+            yield current_batch
+            # Start a new list so an already-yielded batch is never mutated.
+            current_batch = []
+        if current_batch:
+            yield current_batch
     def get_analysis_results(self, cache=True):
         """Yield ProjectResultFolder objects for this project fetched from the server.
@@ -239,6 +255,76 @@ class Project(RemoteObject):
         url = f"sample_groups/{self.uuid}/metadata"
         blob = self.knex.get(url)
         return pd.DataFrame.from_dict(blob, orient="index")
+    @property
+    def n_samples(self):
+        """Return the number of samples in this project.
+
+        Uses the `samples_count` field when it is present and not None;
+        otherwise counts the uuids yielded by `get_sample_uuids()`.
+        """
+        # Builtin getattr instead of `self.hasattr(...)`: no `hasattr` method
+        # is defined in the visible code. NOTE(review): confirm RemoteObject
+        # does not provide one with different semantics.
+        samples_count = getattr(self, 'samples_count', None)
+        if samples_count is not None:
+            return samples_count
+        # Count lazily instead of building a throwaway list of every uuid.
+        return sum(1 for _ in self.get_sample_uuids())
+    def bulk_find_files(self,
+                        sample_uuids=[],
+                        sample_name_includes=[],
+                        folder_types="all",
+                        folder_names=[],
+                        file_names=[],
+                        extensions=[],
+                        with_versions=False,
+                        use_batches_cutoff=500):
+        """Return a dict with links to download files that match the given criteria.
+
+        Options:
+        - sample_uuids: list of sample uuids; if blank search all samples in project
+        - sample_name_includes: list of strings; finds samples with names that include these strings
+        - folder_types: "all", "project", "sample"; finds files in folders of these types
+        - folder_names: list of strings; finds files in folders that have these strings in their names
+        - file_names: list of strings; finds files that have these strings in their names
+        - extensions: list of strings; finds files with these file extensions
+        - with_versions: bool; if True, include all versions of files in results
+        - use_batches_cutoff: int; when the number of samples to search is at least this
+          value the search is split into requests of `use_batches_cutoff - 1` samples
+          and the responses are merged
+
+        With fewer samples than the cutoff, the response of a single request is
+        returned unchanged. Otherwise the result is a dict with the keys
+        `file_size_bytes` and `no_size_info_count` (summed over the batches) and
+        `links` (merged over the batches). The list defaults are only read here.
+        """
+        def _my_bulk_find(batch_uuids):  # curry to save typing
+            return self._bulk_find_files_batch(sample_uuids=batch_uuids,
+                                               sample_name_includes=sample_name_includes,
+                                               folder_types=folder_types,
+                                               folder_names=folder_names,
+                                               file_names=file_names,
+                                               extensions=extensions,
+                                               with_versions=with_versions)
+        n_samples = len(sample_uuids) if sample_uuids else self.n_samples
+        if n_samples < use_batches_cutoff:
+            logger.debug(f"Using single batch bulk_find for {n_samples} samples")
+            # Forward the caller's uuids. Calling the helper with no argument
+            # sent an empty list, which dropped the filter and searched every
+            # sample in the project.
+            return _my_bulk_find(sample_uuids)
+        logger.debug(f"Using multi batch bulk_find for {n_samples} samples")
+        merged_response = {'file_size_bytes': 0, 'links': {}, 'no_size_info_count': 0}
+        # Batches are one smaller than the cutoff, matching the `<` test above.
+        for batch in self._batch_sample_uuids(use_batches_cutoff - 1, input_sample_uuids=sample_uuids):
+            response = _my_bulk_find(batch)
+            merged_response['file_size_bytes'] += response['file_size_bytes']
+            merged_response['links'].update(response['links'])
+            merged_response['no_size_info_count'] += response['no_size_info_count']
+        return merged_response
+    def _bulk_find_files_batch(self,
+                               sample_uuids=[],
+                               sample_name_includes=[],
+                               folder_types=[],
+                               folder_names=[],
+                               file_names=[],
+                               extensions=[],
+                               with_versions=False):
+        """POST one file search to this project's `download` endpoint.
+
+        Every argument is placed in the request body as given. Two request
+        keys differ from the argument names: `sample_name_includes` is sent as
+        `sample_names` and `folder_types` as `folder_type`. Returns whatever
+        `self.knex.post` returns.
+        """
+        payload = dict(
+            sample_uuids=sample_uuids,
+            sample_names=sample_name_includes,
+            folder_type=folder_types,
+            folder_names=folder_names,
+            file_names=file_names,
+            extensions=extensions,
+            with_versions=with_versions,
+        )
+        endpoint = f"sample_groups/{self.uuid}/download"
+        return self.knex.post(endpoint, payload)
     def __str__(self):
         return f"<Geoseeq::Project {self.name} {self.uuid} />"

{geoseeq-0.2.22 → geoseeq-0.3.1}/geoseeq/result/file_upload.py RENAMED Viewed

@@ -41,14 +41,13 @@ class FileChunker:
 class ResultFileUpload:
     """Abstract class that handles upload methods for result files."""
-    def _prep_multipart_upload(self, filepath, file_size, chunk_size, optional_fields):
-        n_parts = int(file_size / chunk_size) + 1
+    def _create_multipart_upload(self, filepath, file_size, optional_fields):
         optional_fields = optional_fields if optional_fields else {}
         optional_fields.update(
             {
                 "md5_checksum": md5_checksum(filepath),
-                "file_size_bytes": getsize(filepath),
+                "file_size_bytes": file_size,
             }
         )
         data = {
@@ -57,6 +56,11 @@ class ResultFileUpload:
             "result_type": "sample" if self.is_sample_result else "group",
         }
         response = self.knex.post(f"/ar_fields/{self.uuid}/create_upload", json=data)
+        return response
+    def _prep_multipart_upload(self, filepath, file_size, chunk_size, optional_fields):
+        n_parts = int(file_size / chunk_size) + 1
+        response = self._create_multipart_upload(filepath, file_size, optional_fields)
         upload_id = response["upload_id"]
         parts = list(range(1, n_parts + 1))
         data = {
@@ -105,6 +109,7 @@ class ResultFileUpload:
     def _upload_parts(self, file_chunker, urls, max_retries, session, progress_tracker, threads):
         if threads == 1:
+            logger.info(f"Uploading parts in series for {file_chunker.filepath}")
             complete_parts = []
             for num, url in enumerate(list(urls.values())):
                 response_part = self._upload_one_part(file_chunker, url, num, max_retries, session)
@@ -114,6 +119,7 @@ class ResultFileUpload:
             return complete_parts
         with ThreadPoolExecutor(max_workers=threads) as executor:
+            logger.info(f"Uploading parts in parallel for {file_chunker.filepath} with {threads} threads.")
             futures = []
             for num, url in enumerate(list(urls.values())):
                 future = executor.submit(
@@ -128,6 +134,7 @@ class ResultFileUpload:
                 logger.info(
                     f'Uploaded part {response_part["PartNumber"]} of {len(urls)} for "{file_chunker.filepath}"'
                 )
+        complete_parts = sorted(complete_parts, key=lambda x: x["PartNumber"])
         return complete_parts
     def multipart_upload_file(

{geoseeq-0.2.22 → geoseeq-0.3.1}/geoseeq.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geoseeq
-Version: 0.2.22
+Version: 0.3.1
 Summary: # Geoseeq API Client
 Author: David C. Danko
 Author-email: dcdanko@biotia.io

{geoseeq-0.2.22 → geoseeq-0.3.1}/geoseeq.egg-info/SOURCES.txt RENAMED Viewed

@@ -27,6 +27,7 @@ geoseeq.egg-info/top_level.txt
 geoseeq/cli/__init__.py
 geoseeq/cli/constants.py
 geoseeq/cli/copy.py
+geoseeq/cli/detail.py
 geoseeq/cli/download.py
 geoseeq/cli/fastq_utils.py
 geoseeq/cli/main.py
@@ -44,6 +45,7 @@ geoseeq/cli/shared_params/obj_getters.py
 geoseeq/cli/shared_params/opts_and_args.py
 geoseeq/cli/upload/__init__.py
 geoseeq/cli/upload/upload.py
+geoseeq/cli/upload/upload_advanced.py
 geoseeq/cli/upload/upload_reads.py
 geoseeq/contrib/__init__.py
 geoseeq/contrib/ncbi/__init__.py

{geoseeq-0.2.22 → geoseeq-0.3.1}/setup.py RENAMED Viewed

@@ -5,7 +5,7 @@ import setuptools
 setuptools.setup(
     name='geoseeq',
-    version='0.2.22',  # remember to update version string in CLI as well
+    version='0.3.1',  # remember to update version string in CLI as well
     author="David C. Danko",
     author_email='dcdanko@biotia.io',
     description=open('README.md').read(),