geoseeq 0.5.6a15__tar.gz → 0.6.0__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/PKG-INFO +1 -1
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/download.py +1 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/main.py +20 -2
- geoseeq-0.6.0/geoseeq/cli/project.py +96 -0
- geoseeq-0.6.0/geoseeq/cli/raw.py +59 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/upload/upload.py +5 -3
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/upload/upload_reads.py +1 -0
- geoseeq-0.6.0/geoseeq/result/file_chunker.py +50 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/result/file_download.py +2 -3
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/result/file_upload.py +63 -144
- geoseeq-0.6.0/geoseeq/result/resumable_upload_tracker.py +100 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/upload_download_manager.py +12 -4
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq.egg-info/PKG-INFO +1 -1
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq.egg-info/SOURCES.txt +4 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/pyproject.toml +1 -1
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/LICENSE +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/README.md +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/__init__.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/app.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/blob_constructors.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/bulk_creators.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/__init__.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/constants.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/copy.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/detail.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/fastq_utils.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/get_eula.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/manage.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/progress_bar.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/run.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/search.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/shared_params/__init__.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/shared_params/common_state.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/shared_params/config.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/shared_params/id_handlers.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/shared_params/obj_getters.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/shared_params/opts_and_args.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/upload/__init__.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/upload/upload_advanced.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/user.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/utils.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/cli/view.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/constants.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/contrib/__init__.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/contrib/ncbi/__init__.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/contrib/ncbi/api.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/contrib/ncbi/bioproject.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/contrib/ncbi/cli.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/contrib/ncbi/setup_logging.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/file_system_cache.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/id_constructors/__init__.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/id_constructors/from_blobs.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/id_constructors/from_ids.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/id_constructors/from_names.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/id_constructors/from_uuids.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/id_constructors/resolvers.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/id_constructors/utils.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/knex.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/organization.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/pipeline.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/plotting/__init__.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/plotting/constants.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/plotting/highcharts.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/plotting/map/__init__.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/plotting/map/base_layer.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/plotting/map/map.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/plotting/map/overlay.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/plotting/selectable.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/project.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/remote_object.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/result/__init__.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/result/bioinfo.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/result/result_file.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/result/result_folder.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/result/utils.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/sample.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/search.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/user.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/utils.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/vc/__init__.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/vc/checksum.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/vc/cli.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/vc/clone.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/vc/constants.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/vc/vc_cache.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/vc/vc_dir.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/vc/vc_sample.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/vc/vc_stub.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq/work_orders.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq.egg-info/dependency_links.txt +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq.egg-info/entry_points.txt +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/geoseeq.egg-info/top_level.txt +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/setup.cfg +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/setup.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/tests/__init__.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/tests/test_api_client.py +0 -0
- {geoseeq-0.5.6a15 → geoseeq-0.6.0}/tests/test_plotting.py +0 -0
@@ -18,6 +18,7 @@ from .shared_params.opts_and_args import overwrite_option, yes_option
|
|
18
18
|
from .detail import cli_detail
|
19
19
|
from .run import cli_app
|
20
20
|
from .get_eula import cli_eula
|
21
|
+
from .project import cli_project
|
21
22
|
|
22
23
|
logger = logging.getLogger('geoseeq_api')
|
23
24
|
handler = logging.StreamHandler()
|
@@ -53,7 +54,7 @@ def version():
|
|
53
54
|
Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
|
54
55
|
Run `geoseeq eula show` to view the EULA.
|
55
56
|
"""
|
56
|
-
click.echo('0.
|
57
|
+
click.echo('0.6.0') # remember to update setup
|
57
58
|
|
58
59
|
|
59
60
|
@main.group('advanced')
|
@@ -65,6 +66,7 @@ cli_advanced.add_command(cli_copy)
|
|
65
66
|
cli_advanced.add_command(cli_user)
|
66
67
|
cli_advanced.add_command(cli_detail)
|
67
68
|
cli_advanced.add_command(cli_upload_advanced)
|
69
|
+
cli_advanced.add_command(cli_project)
|
68
70
|
|
69
71
|
@cli_advanced.group('experimental')
|
70
72
|
def cli_experimental():
|
@@ -101,4 +103,20 @@ def cli_config(yes, api_token, endpoint, profile, overwrite):
|
|
101
103
|
click.echo('You must accept the EULA to use the GeoSeeq API.')
|
102
104
|
return
|
103
105
|
set_profile(api_token, endpoint=endpoint, profile=profile, overwrite=overwrite)
|
104
|
-
click.echo(f'Profile configured.')
|
106
|
+
click.echo(f'Profile configured.')
|
107
|
+
|
108
|
+
|
109
|
+
@main.command('clear-cache')
|
110
|
+
@yes_option
|
111
|
+
def cli_clear_cache(yes):
|
112
|
+
"""Clear the local cache.
|
113
|
+
|
114
|
+
---
|
115
|
+
|
116
|
+
Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
|
117
|
+
Run `geoseeq eula show` to view the EULA.
|
118
|
+
"""
|
119
|
+
from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
|
120
|
+
import shutil
|
121
|
+
if yes or click.confirm('Are you sure you want to clear the cache?'):
|
122
|
+
shutil.rmtree(GEOSEEQ_CACHE_DIR, ignore_errors=True)
|
@@ -0,0 +1,96 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
from os import makedirs, getcwd
|
4
|
+
from os.path import dirname, join
|
5
|
+
|
6
|
+
import click
|
7
|
+
import pandas as pd
|
8
|
+
from multiprocessing import Pool
|
9
|
+
from .shared_params import (
|
10
|
+
handle_project_id,
|
11
|
+
handle_folder_id,
|
12
|
+
project_id_arg,
|
13
|
+
sample_ids_arg,
|
14
|
+
handle_multiple_sample_ids,
|
15
|
+
handle_multiple_result_file_ids,
|
16
|
+
use_common_state,
|
17
|
+
flatten_list_of_els_and_files,
|
18
|
+
yes_option,
|
19
|
+
module_option,
|
20
|
+
ignore_errors_option,
|
21
|
+
folder_ids_arg,
|
22
|
+
)
|
23
|
+
from geoseeq.result.file_download import download_url
|
24
|
+
from geoseeq.utils import download_ftp
|
25
|
+
from geoseeq.id_constructors import (
|
26
|
+
result_file_from_uuid,
|
27
|
+
result_file_from_name,
|
28
|
+
)
|
29
|
+
from geoseeq.knex import GeoseeqNotFoundError
|
30
|
+
from .progress_bar import PBarManager
|
31
|
+
from .utils import convert_size
|
32
|
+
from geoseeq.constants import FASTQ_MODULE_NAMES
|
33
|
+
from geoseeq.result import ResultFile
|
34
|
+
from geoseeq.upload_download_manager import GeoSeeqDownloadManager
|
35
|
+
from geoseeq.file_system.filesystem_download import (
|
36
|
+
ProjectOnFilesystem,
|
37
|
+
FILE_STATUS_MODIFIED_REMOTE,
|
38
|
+
FILE_STATUS_MODIFIED_LOCAL,
|
39
|
+
FILE_STATUS_NEW_LOCAL,
|
40
|
+
FILE_STATUS_NEW_REMOTE,
|
41
|
+
FILE_STATUS_IS_LOCAL_STUB,
|
42
|
+
)
|
43
|
+
|
44
|
+
|
45
|
+
logger = logging.getLogger('geoseeq_api')
|
46
|
+
|
47
|
+
|
48
|
+
@click.group("project")
|
49
|
+
def cli_project():
|
50
|
+
"""Download data from GeoSeeq."""
|
51
|
+
pass
|
52
|
+
|
53
|
+
|
54
|
+
@cli_project.command("clone")
|
55
|
+
@use_common_state
|
56
|
+
@click.option('--use-stubs/--full-files', default=True, help='Download full files or stubs')
|
57
|
+
@click.option('--target-dir', '-d', default=None, help='Directory to download the project to')
|
58
|
+
@project_id_arg
|
59
|
+
def cli_clone_project(state, use_stubs, target_dir, project_id):
|
60
|
+
"""Clone a project to the local filesystem.
|
61
|
+
"""
|
62
|
+
knex = state.get_knex().set_auth_required()
|
63
|
+
proj = handle_project_id(knex, project_id)
|
64
|
+
logger.info(f"Found project \"{proj.name}\"")
|
65
|
+
if target_dir is None:
|
66
|
+
target_dir = proj.name
|
67
|
+
|
68
|
+
project = ProjectOnFilesystem(proj, target_dir)
|
69
|
+
project.download(use_stubs=use_stubs)
|
70
|
+
|
71
|
+
|
72
|
+
@cli_project.command("status")
|
73
|
+
@use_common_state
|
74
|
+
def cli_project_status(state):
|
75
|
+
"""Check the status of a project on the local filesystem.
|
76
|
+
"""
|
77
|
+
project = ProjectOnFilesystem.from_path(getcwd(), recursive=True)
|
78
|
+
|
79
|
+
objs_by_status = {
|
80
|
+
FILE_STATUS_MODIFIED_LOCAL: [],
|
81
|
+
FILE_STATUS_MODIFIED_REMOTE: [],
|
82
|
+
FILE_STATUS_NEW_LOCAL: [],
|
83
|
+
FILE_STATUS_NEW_REMOTE: [],
|
84
|
+
FILE_STATUS_IS_LOCAL_STUB: [],
|
85
|
+
}
|
86
|
+
for obj_type, status, local_path, obj in project.list_abnormal_objects():
|
87
|
+
objs_by_status[status].append((obj_type, local_path, obj))
|
88
|
+
|
89
|
+
print(f"Project: {project.project.name}")
|
90
|
+
for status, objs in objs_by_status.items():
|
91
|
+
print(f"Status: {status}")
|
92
|
+
for obj_type, local_path, obj in objs:
|
93
|
+
if status in (FILE_STATUS_MODIFIED_LOCAL, FILE_STATUS_NEW_LOCAL):
|
94
|
+
print(f" {obj_type}: {project.path_from_project_root(local_path)} -> {obj}")
|
95
|
+
else:
|
96
|
+
print(f" {obj_type}: {obj} -> {project.path_from_project_root(local_path)}")
|
@@ -0,0 +1,59 @@
|
|
1
|
+
import click
|
2
|
+
import json
|
3
|
+
from .shared_params import use_common_state, overwrite_option
|
4
|
+
from geoseeq import GeoseeqNotFoundError
|
5
|
+
from geoseeq.blob_constructors import (
|
6
|
+
sample_result_file_from_uuid,
|
7
|
+
project_result_file_from_uuid,
|
8
|
+
sample_result_folder_from_uuid,
|
9
|
+
project_result_folder_from_uuid,
|
10
|
+
)
|
11
|
+
|
12
|
+
|
13
|
+
@click.group('raw')
|
14
|
+
def cli_raw():
|
15
|
+
"""Low-level commands for interacting with the API."""
|
16
|
+
pass
|
17
|
+
|
18
|
+
|
19
|
+
@cli_raw.command('get-file-data')
|
20
|
+
@use_common_state
|
21
|
+
@click.argument('file_ids', nargs=-1)
|
22
|
+
def cli_get_file_data(state, file_ids):
|
23
|
+
"""Print the raw stored data in a result file object."""
|
24
|
+
knex = state.get_knex()
|
25
|
+
for file_id in file_ids:
|
26
|
+
file_id = file_id.split(':')[-1]
|
27
|
+
try:
|
28
|
+
result_file = sample_result_file_from_uuid(knex, file_id)
|
29
|
+
except GeoseeqNotFoundError:
|
30
|
+
result_file = project_result_file_from_uuid(knex, file_id)
|
31
|
+
print(json.dumps(result_file.stored_data, indent=2), file=state.outfile)
|
32
|
+
|
33
|
+
|
34
|
+
@cli_raw.command('create-raw-file')
|
35
|
+
@use_common_state
|
36
|
+
@overwrite_option
|
37
|
+
@click.argument('folder_id')
|
38
|
+
@click.argument('result_filename')
|
39
|
+
@click.argument('filename', type=click.File('r'))
|
40
|
+
def cli_get_file_data(state, overwrite, folder_id, result_filename, filename):
|
41
|
+
"""Print the raw stored data in a result file object."""
|
42
|
+
knex = state.get_knex()
|
43
|
+
|
44
|
+
folder_id = folder_id.split(':')[-1]
|
45
|
+
try:
|
46
|
+
result_folder = sample_result_folder_from_uuid(knex, folder_id)
|
47
|
+
except GeoseeqNotFoundError:
|
48
|
+
result_folder = project_result_folder_from_uuid(knex, folder_id)
|
49
|
+
blob = json.load(filename)
|
50
|
+
result_file = result_folder.result_file(result_filename)
|
51
|
+
if overwrite:
|
52
|
+
result_file.idem()
|
53
|
+
result_file.stored_data = blob
|
54
|
+
result_file.save()
|
55
|
+
else:
|
56
|
+
result_file.create()
|
57
|
+
click.echo(f'Created file {result_file.uuid}', file=state.outfile)
|
58
|
+
|
59
|
+
|
@@ -40,7 +40,7 @@ hidden_option = click.option('--hidden/--no-hidden', default=False, help='Upload
|
|
40
40
|
@click.option('--cores', default=1, help='Number of uploads to run in parallel', show_default=True)
|
41
41
|
@click.option('--threads-per-upload', default=4, help='Number of threads used to upload each file', show_default=True)
|
42
42
|
@click.option('--num-retries', default=3, help='Number of times to retry a failed upload', show_default=True)
|
43
|
-
@click.option('--chunk-size-mb', default
|
43
|
+
@click.option('--chunk-size-mb', default=-1, help='Size of chunks to upload in MB', show_default=True)
|
44
44
|
@ignore_errors_option
|
45
45
|
@yes_option
|
46
46
|
@private_option
|
@@ -122,8 +122,9 @@ def cli_upload_file(state, cores, threads_per_upload, num_retries, chunk_size_mb
|
|
122
122
|
use_cache=state.use_cache,
|
123
123
|
num_retries=num_retries,
|
124
124
|
ignore_errors=ignore_errors,
|
125
|
-
|
126
|
-
|
125
|
+
use_atomic_upload=True,
|
126
|
+
session=None, #knex.new_session(),
|
127
|
+
chunk_size_mb=chunk_size_mb if chunk_size_mb > 0 else None,
|
127
128
|
)
|
128
129
|
for geoseeq_file_name, file_path in name_pairs:
|
129
130
|
if isfile(file_path):
|
@@ -160,6 +161,7 @@ def cli_upload_folder(state, cores, yes, private, recursive, hidden, no_new_vers
|
|
160
161
|
overwrite=True,
|
161
162
|
use_cache=state.use_cache,
|
162
163
|
no_new_versions=no_new_versions,
|
164
|
+
use_atomic_upload=True,
|
163
165
|
)
|
164
166
|
for folder_name in folder_names:
|
165
167
|
result_folder = root_obj.result_folder(folder_name).idem()
|
@@ -98,6 +98,7 @@ def _do_upload(groups, module_name, link_type, lib, filepaths, overwrite, no_new
|
|
98
98
|
progress_tracker_factory=PBarManager().get_new_bar,
|
99
99
|
use_cache=state.use_cache,
|
100
100
|
no_new_versions=no_new_versions,
|
101
|
+
use_atomic_upload=True,
|
101
102
|
)
|
102
103
|
for group in groups:
|
103
104
|
sample = lib.sample(group['sample_name']).idem()
|
@@ -0,0 +1,50 @@
|
|
1
|
+
|
2
|
+
from os.path import getsize
|
3
|
+
import logging
|
4
|
+
|
5
|
+
logger = logging.getLogger("geoseeq_api") # Same name as calling module
|
6
|
+
logger.addHandler(logging.NullHandler())
|
7
|
+
|
8
|
+
|
9
|
+
class FileChunker:
|
10
|
+
|
11
|
+
def __init__(self, filepath, chunk_size):
|
12
|
+
self.filepath = filepath
|
13
|
+
self.chunk_size = chunk_size
|
14
|
+
self.file_size = getsize(filepath)
|
15
|
+
self.n_parts = int(self.file_size / self.chunk_size) + 1
|
16
|
+
self.loaded_parts = []
|
17
|
+
|
18
|
+
def load_all_chunks(self):
|
19
|
+
if len(self.loaded_parts) != self.n_parts:
|
20
|
+
with open(self.filepath, "rb") as f:
|
21
|
+
f.seek(0)
|
22
|
+
for i in range(self.n_parts):
|
23
|
+
chunk = f.read(self.chunk_size)
|
24
|
+
self.loaded_parts.append(chunk)
|
25
|
+
return self # convenience for chaining
|
26
|
+
|
27
|
+
def chunk_is_preloaded(self, num):
|
28
|
+
return len(self.loaded_parts) > num and self.loaded_parts[num]
|
29
|
+
|
30
|
+
def read_one_chunk(self, num):
|
31
|
+
if not self.chunk_is_preloaded(num):
|
32
|
+
logger.debug(f"Reading chunk {num} from {self.filepath}")
|
33
|
+
with open(self.filepath, "rb") as f:
|
34
|
+
f.seek(num * self.chunk_size)
|
35
|
+
chunk = f.read(self.chunk_size)
|
36
|
+
return chunk
|
37
|
+
return self.loaded_parts[num]
|
38
|
+
|
39
|
+
def get_chunk(self, num):
|
40
|
+
if self.chunk_is_preloaded(num):
|
41
|
+
return self.loaded_parts[num]
|
42
|
+
return self.read_one_chunk(num)
|
43
|
+
|
44
|
+
def get_chunk_size(self, num):
|
45
|
+
if num < (self.n_parts - 1): # all but the last chunk
|
46
|
+
return self.chunk_size
|
47
|
+
if self.chunk_is_preloaded(num): # last chunk, pre-loaded
|
48
|
+
return len(self.loaded_parts[num])
|
49
|
+
return len(self.read_one_chunk(num)) # last chunk, not pre-loaded
|
50
|
+
|
@@ -12,10 +12,10 @@ from geoseeq.constants import FIVE_MB
|
|
12
12
|
logger = logging.getLogger("geoseeq_api") # Same name as calling module
|
13
13
|
|
14
14
|
|
15
|
-
def _download_head(url, filename, head=None, progress_tracker=None):
|
15
|
+
def _download_head(url, filename, head=None, start=0, progress_tracker=None):
|
16
16
|
headers = None
|
17
17
|
if head and head > 0:
|
18
|
-
headers = {"Range": f"bytes=
|
18
|
+
headers = {"Range": f"bytes={start}-{head}"}
|
19
19
|
response = requests.get(url, stream=True, headers=headers)
|
20
20
|
response.raise_for_status()
|
21
21
|
total_size_in_bytes = int(response.headers.get('content-length', 0))
|
@@ -67,7 +67,6 @@ def download_url(url, kind='guess', filename=None, head=None, progress_tracker=N
|
|
67
67
|
raise ValueError(f"Unknown download kind: {kind}")
|
68
68
|
|
69
69
|
|
70
|
-
|
71
70
|
class ResultFileDownload:
|
72
71
|
"""Abstract class that handles download methods for result files."""
|
73
72
|
|
@@ -13,130 +13,21 @@ from geoseeq.utils import md5_checksum
|
|
13
13
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
14
14
|
from .utils import *
|
15
15
|
from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
def __init__(self, filepath, chunk_size):
|
20
|
-
self.filepath = filepath
|
21
|
-
self.chunk_size = chunk_size
|
22
|
-
self.file_size = getsize(filepath)
|
23
|
-
self.n_parts = int(self.file_size / self.chunk_size) + 1
|
24
|
-
self.loaded_parts = []
|
25
|
-
|
26
|
-
def load_all_chunks(self):
|
27
|
-
if len(self.loaded_parts) != self.n_parts:
|
28
|
-
with open(self.filepath, "rb") as f:
|
29
|
-
f.seek(0)
|
30
|
-
for i in range(self.n_parts):
|
31
|
-
chunk = f.read(self.chunk_size)
|
32
|
-
self.loaded_parts.append(chunk)
|
33
|
-
return self # convenience for chaining
|
34
|
-
|
35
|
-
def chunk_is_preloaded(self, num):
|
36
|
-
return len(self.loaded_parts) > num and self.loaded_parts[num]
|
37
|
-
|
38
|
-
def read_one_chunk(self, num):
|
39
|
-
if not self.chunk_is_preloaded(num):
|
40
|
-
logger.debug(f"Reading chunk {num} from {self.filepath}")
|
41
|
-
with open(self.filepath, "rb") as f:
|
42
|
-
f.seek(num * self.chunk_size)
|
43
|
-
chunk = f.read(self.chunk_size)
|
44
|
-
return chunk
|
45
|
-
return self.loaded_parts[num]
|
46
|
-
|
47
|
-
def get_chunk(self, num):
|
48
|
-
if self.chunk_is_preloaded(num):
|
49
|
-
return self.loaded_parts[num]
|
50
|
-
return self.read_one_chunk(num)
|
51
|
-
|
52
|
-
def get_chunk_size(self, num):
|
53
|
-
if num < (self.n_parts - 1): # all but the last chunk
|
54
|
-
return self.chunk_size
|
55
|
-
if self.chunk_is_preloaded(num): # last chunk, pre-loaded
|
56
|
-
return len(self.loaded_parts[num])
|
57
|
-
return len(self.read_one_chunk(num)) # last chunk, not pre-loaded
|
58
|
-
|
59
|
-
|
60
|
-
class ResumableUploadTracker:
|
61
|
-
|
62
|
-
def __init__(self, filepath, chunk_size, tracker_file_prefix="gs_resumable_upload_tracker"):
|
63
|
-
self.open, self.upload_started = True, False
|
64
|
-
self.upload_id, self.urls = None, None
|
65
|
-
self.filepath = filepath
|
66
|
-
self.tracker_file = join(
|
67
|
-
GEOSEEQ_CACHE_DIR, 'upload',
|
68
|
-
tracker_file_prefix + f".{chunk_size}.{getsize(filepath)}." + basename(filepath)
|
69
|
-
)
|
70
|
-
try:
|
71
|
-
os.makedirs(dirname(self.tracker_file), exist_ok=True)
|
72
|
-
except Exception as e:
|
73
|
-
logger.warning(f'Could not create resumable upload tracker directory. {e}')
|
74
|
-
self.open = False
|
75
|
-
self._loaded_parts = {}
|
76
|
-
self._load_parts_from_file()
|
77
|
-
|
78
|
-
def start_upload(self, upload_id, urls):
|
79
|
-
if not self.open:
|
80
|
-
return
|
81
|
-
if self.upload_started:
|
82
|
-
raise GeoseeqGeneralError("Upload has already started.")
|
83
|
-
blob = dict(upload_id=upload_id, urls=urls, start_time=time.time())
|
84
|
-
serialized = json.dumps(blob)
|
85
|
-
with open(self.tracker_file, "w") as f:
|
86
|
-
f.write(serialized + "\n")
|
87
|
-
self.upload_id, self.urls = upload_id, urls
|
88
|
-
self.upload_started = True
|
89
|
-
|
90
|
-
def add_part(self, part_upload_info):
|
91
|
-
if not self.open:
|
92
|
-
return
|
93
|
-
part_id = part_upload_info["PartNumber"]
|
94
|
-
serialized = json.dumps(part_upload_info)
|
95
|
-
with open(self.tracker_file, "a") as f:
|
96
|
-
f.write(serialized + "\n")
|
97
|
-
self._loaded_parts[part_id] = part_upload_info
|
98
|
-
if len(self._loaded_parts) == len(self.urls):
|
99
|
-
self.cleanup()
|
100
|
-
self.open = False
|
101
|
-
|
102
|
-
def _load_parts_from_file(self):
|
103
|
-
if not isfile(self.tracker_file):
|
104
|
-
return
|
105
|
-
with open(self.tracker_file, "r") as f:
|
106
|
-
header_blob = json.loads(f.readline())
|
107
|
-
self.upload_id, self.urls = header_blob["upload_id"], header_blob["urls"]
|
108
|
-
start_time = header_blob["start_time"]
|
109
|
-
if (time.time() - start_time) > (60 * 60 * 23):
|
110
|
-
logger.warning(f"Tracker file {self.tracker_file} is too old. Deleting.")
|
111
|
-
os.remove(self.tracker_file)
|
112
|
-
return
|
113
|
-
self.upload_started = True
|
114
|
-
for line in f:
|
115
|
-
blob = json.loads(line)
|
116
|
-
part_id = blob["PartNumber"]
|
117
|
-
self._loaded_parts[part_id] = blob
|
118
|
-
|
119
|
-
def part_has_been_uploaded(self, part_number):
|
120
|
-
if not self.open:
|
121
|
-
return False
|
122
|
-
return part_number in self._loaded_parts
|
123
|
-
|
124
|
-
def get_part_info(self, part_number):
|
125
|
-
return self._loaded_parts[part_number]
|
126
|
-
|
127
|
-
def cleanup(self):
|
128
|
-
if not self.open:
|
129
|
-
return
|
130
|
-
try:
|
131
|
-
os.remove(self.tracker_file)
|
132
|
-
except FileNotFoundError:
|
133
|
-
pass
|
16
|
+
from .file_chunker import FileChunker
|
17
|
+
from .resumable_upload_tracker import ResumableUploadTracker
|
134
18
|
|
135
19
|
|
136
20
|
class ResultFileUpload:
|
137
21
|
"""Abstract class that handles upload methods for result files."""
|
138
22
|
|
139
|
-
def
|
23
|
+
def _result_type(self, atomic=False):
|
24
|
+
if self.is_sample_result:
|
25
|
+
return "sample"
|
26
|
+
if atomic:
|
27
|
+
return "project"
|
28
|
+
return "group"
|
29
|
+
|
30
|
+
def _create_multipart_upload(self, filepath, file_size, optional_fields, atomic=False):
|
140
31
|
optional_fields = optional_fields if optional_fields else {}
|
141
32
|
optional_fields.update(
|
142
33
|
{
|
@@ -147,23 +38,31 @@ class ResultFileUpload:
|
|
147
38
|
data = {
|
148
39
|
"filename": basename(filepath),
|
149
40
|
"optional_fields": optional_fields,
|
150
|
-
"result_type":
|
41
|
+
"result_type": self._result_type(atomic),
|
151
42
|
}
|
152
|
-
|
43
|
+
url = f"/ar_fields/{self.uuid}/create_upload"
|
44
|
+
if atomic:
|
45
|
+
data["fieldname"] = self.name
|
46
|
+
url = f"/ars/{self.parent.uuid}/create_atomic_upload"
|
47
|
+
response = self.knex.post(url, json=data)
|
153
48
|
return response
|
154
49
|
|
155
|
-
def _prep_multipart_upload(self, filepath, file_size, chunk_size, optional_fields):
|
50
|
+
def _prep_multipart_upload(self, filepath, file_size, chunk_size, optional_fields, atomic=False):
|
156
51
|
n_parts = int(file_size / chunk_size) + 1
|
157
|
-
response = self._create_multipart_upload(filepath, file_size, optional_fields)
|
52
|
+
response = self._create_multipart_upload(filepath, file_size, optional_fields, atomic=atomic)
|
158
53
|
upload_id = response["upload_id"]
|
159
|
-
parts = list(range(1, n_parts + 1))
|
160
54
|
data = {
|
161
|
-
"parts":
|
55
|
+
"parts": list(range(1, n_parts + 1)),
|
162
56
|
"stance": "upload-multipart",
|
163
57
|
"upload_id": upload_id,
|
164
|
-
"result_type":
|
58
|
+
"result_type": self._result_type(atomic),
|
165
59
|
}
|
166
|
-
|
60
|
+
url = f"/ar_fields/{self.uuid}/create_upload_urls"
|
61
|
+
if atomic:
|
62
|
+
data["uuid"] = response["uuid"]
|
63
|
+
data["fieldname"] = self.name
|
64
|
+
url = f"ars/{self.parent.uuid}/create_atomic_upload_urls"
|
65
|
+
response = self.knex.post(url, json=data)
|
167
66
|
urls = response
|
168
67
|
return upload_id, urls
|
169
68
|
|
@@ -175,6 +74,7 @@ class ResultFileUpload:
|
|
175
74
|
attempts = 0
|
176
75
|
while attempts < max_retries:
|
177
76
|
try:
|
77
|
+
# url = url.replace("s3.wasabisys.com", "s3.us-east-1.wasabisys.com")
|
178
78
|
logger.debug(f"Uploading part {num + 1} to {url}. Size: {len(file_chunk)} bytes.")
|
179
79
|
if session:
|
180
80
|
http_response = session.put(url, data=file_chunk)
|
@@ -192,7 +92,7 @@ class ResultFileUpload:
|
|
192
92
|
raise e
|
193
93
|
|
194
94
|
retry_time = min(8 ** attempts, 120) # exponential backoff, max 120s
|
195
|
-
retry_time *= 0.
|
95
|
+
retry_time *= 0.6 + (random() * 0.8) # randomize to avoid thundering herd
|
196
96
|
logger.debug(f"Retrying upload for part {num + 1} in {retry_time} seconds.")
|
197
97
|
time.sleep(retry_time)
|
198
98
|
|
@@ -203,16 +103,17 @@ class ResultFileUpload:
|
|
203
103
|
resumable_upload_tracker.add_part(blob)
|
204
104
|
return blob
|
205
105
|
|
206
|
-
def _finish_multipart_upload(self, upload_id, complete_parts):
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
106
|
+
def _finish_multipart_upload(self, upload_id, complete_parts, atomic=False):
|
107
|
+
data = {
|
108
|
+
"parts": complete_parts,
|
109
|
+
"upload_id": upload_id,
|
110
|
+
"result_type": self._result_type(atomic),
|
111
|
+
}
|
112
|
+
url = f"/ar_fields/{self.uuid}/complete_upload"
|
113
|
+
if atomic:
|
114
|
+
data["fieldname"] = self.name
|
115
|
+
url = f"/ars/{self.parent.uuid}/complete_atomic_upload"
|
116
|
+
response = self.knex.post(url, json=data, json_response=False)
|
216
117
|
response.raise_for_status()
|
217
118
|
|
218
119
|
def _upload_parts(self, file_chunker, urls, max_retries, session, progress_tracker, threads, resumable_upload_tracker=None):
|
@@ -250,26 +151,38 @@ class ResultFileUpload:
|
|
250
151
|
filepath,
|
251
152
|
file_size,
|
252
153
|
optional_fields=None,
|
253
|
-
chunk_size=
|
154
|
+
chunk_size=None,
|
254
155
|
max_retries=3,
|
255
156
|
session=None,
|
256
157
|
progress_tracker=None,
|
257
158
|
threads=1,
|
258
159
|
use_cache=True,
|
160
|
+
use_atomic_upload=False,
|
259
161
|
):
|
260
162
|
"""Upload a file to S3 using the multipart upload process."""
|
261
163
|
logger.info(f"Uploading {filepath} to S3 using multipart upload.")
|
164
|
+
if not chunk_size:
|
165
|
+
chunk_size = FIVE_MB
|
166
|
+
if file_size >= 10 * FIVE_MB:
|
167
|
+
chunk_size = 5 * FIVE_MB
|
168
|
+
logger.debug(f"Using chunk size of {chunk_size} bytes.")
|
262
169
|
resumable_upload_tracker = None
|
263
170
|
if use_cache and file_size > 10 * FIVE_MB: # only use resumable upload tracker for larger files
|
264
|
-
|
171
|
+
upload_target_uuid = self.parent.uuid if use_atomic_upload else self.uuid
|
172
|
+
resumable_upload_tracker = ResumableUploadTracker(filepath, chunk_size, upload_target_uuid)
|
173
|
+
|
265
174
|
if resumable_upload_tracker and resumable_upload_tracker.upload_started:
|
175
|
+
# a resumable upload for this file has already started
|
176
|
+
resumable_upload_exists_and_is_valid = True
|
266
177
|
upload_id, urls = resumable_upload_tracker.upload_id, resumable_upload_tracker.urls
|
178
|
+
use_atomic_upload = resumable_upload_tracker.is_atomic_upload
|
267
179
|
logger.info(f'Resuming upload for "{filepath}", upload_id: "{upload_id}"')
|
268
180
|
else:
|
269
|
-
upload_id, urls = self._prep_multipart_upload(filepath, file_size, chunk_size, optional_fields)
|
181
|
+
upload_id, urls = self._prep_multipart_upload(filepath, file_size, chunk_size, optional_fields, atomic=use_atomic_upload)
|
270
182
|
if resumable_upload_tracker:
|
271
183
|
logger.info(f'Creating new resumable upload for "{filepath}", upload_id: "{upload_id}"')
|
272
|
-
resumable_upload_tracker.start_upload(upload_id, urls)
|
184
|
+
resumable_upload_tracker.start_upload(upload_id, urls, is_atomic_upload=use_atomic_upload)
|
185
|
+
|
273
186
|
logger.info(f'Starting upload for "{filepath}"')
|
274
187
|
complete_parts = []
|
275
188
|
file_chunker = FileChunker(filepath, chunk_size)
|
@@ -288,14 +201,20 @@ class ResultFileUpload:
|
|
288
201
|
threads,
|
289
202
|
resumable_upload_tracker=resumable_upload_tracker
|
290
203
|
)
|
291
|
-
self._finish_multipart_upload(upload_id, complete_parts)
|
204
|
+
self._finish_multipart_upload(upload_id, complete_parts, atomic=use_atomic_upload)
|
292
205
|
logger.info(f'Finished Upload for "{filepath}"')
|
206
|
+
if use_atomic_upload:
|
207
|
+
# if this was an atomic upload then this result may not have existed on the server before
|
208
|
+
self.get()
|
293
209
|
return self
|
294
210
|
|
295
211
|
def upload_file(self, filepath, multipart_thresh=FIVE_MB, overwrite=True, no_new_versions=False, **kwargs):
|
296
212
|
if self.exists() and not overwrite:
|
297
213
|
raise GeoseeqGeneralError(f"Overwrite is set to False and file {self.uuid} already exists.")
|
298
|
-
|
214
|
+
if not kwargs.get("use_atomic_upload", False):
|
215
|
+
self.idem()
|
216
|
+
else:
|
217
|
+
self.parent.idem()
|
299
218
|
if no_new_versions and self.has_downloadable_file():
|
300
219
|
raise GeoseeqGeneralError(f"File {self} already has a downloadable file. Not uploading a new version.")
|
301
220
|
resolved_path = Path(filepath).resolve()
|
@@ -0,0 +1,100 @@
|
|
1
|
+
|
2
|
+
import time
|
3
|
+
import json
|
4
|
+
import os
|
5
|
+
from os.path import basename, getsize, join, dirname, isfile, getctime
|
6
|
+
from pathlib import Path
|
7
|
+
from random import random
|
8
|
+
import requests
|
9
|
+
|
10
|
+
from geoseeq.knex import GeoseeqGeneralError
|
11
|
+
from geoseeq.constants import FIVE_MB
|
12
|
+
from geoseeq.utils import md5_checksum
|
13
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
14
|
+
from .utils import *
|
15
|
+
from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
|
16
|
+
from .file_chunker import FileChunker
|
17
|
+
|
18
|
+
|
19
|
+
class ResumableUploadTracker:
    """Persist multipart-upload progress so an interrupted upload can resume.

    State is stored as JSON lines in a per-upload tracker file: the first
    line is a header (upload id, presigned part URLs, atomic flag) and each
    subsequent line records one completed part. The tracker file name encodes
    the upload target uuid, chunk size, and source file size so a changed
    source file or chunking scheme never resumes a stale upload.
    """

    def __init__(self, filepath, chunk_size, upload_target_uuid, tracker_file_prefix="gs_resumable_upload_tracker"):
        # `open` gates all tracker I/O; it is cleared permanently if the
        # cache directory cannot be created, or once the upload completes.
        self.open, self.upload_started = True, False
        self.upload_id, self.urls, self.is_atomic_upload = None, None, None
        self.upload_target_uuid = upload_target_uuid
        self.filepath = filepath
        self.tracker_file_dir = join(GEOSEEQ_CACHE_DIR, 'upload')
        self.tracker_file = join(
            self.tracker_file_dir,
            tracker_file_prefix + f".{upload_target_uuid}.{chunk_size}.{getsize(filepath)}." + basename(filepath)
        )
        try:
            os.makedirs(self.tracker_file_dir, exist_ok=True)
        except Exception as e:
            logger.warning(f'Could not create resumable upload tracker directory. {e}')
            self.open = False
        self._loaded_parts = {}
        self._load_parts_from_file()

    def start_upload(self, upload_id, urls, is_atomic_upload=False):
        """Record the header for a brand-new multipart upload.

        No-op when tracking is disabled. Raises GeoseeqGeneralError if an
        upload was already started (including one resumed from disk).
        """
        if not self.open:
            return
        if self.upload_started:
            raise GeoseeqGeneralError("Upload has already started.")
        self.upload_started = True
        blob = dict(upload_id=upload_id,
                    urls=urls,
                    is_atomic_upload=is_atomic_upload,
                    upload_target_uuid=self.upload_target_uuid,
                    start_time=time.time())
        serialized = json.dumps(blob)
        with open(self.tracker_file, "w") as f:
            f.write(serialized + "\n")
        self.upload_id, self.urls, self.is_atomic_upload = upload_id, urls, is_atomic_upload

    def add_part(self, part_upload_info):
        """Append one completed-part record to the tracker file.

        When every part (one per presigned URL) is accounted for, the
        tracker file is deleted and the tracker closes itself.
        """
        if not self.open:
            return
        part_id = part_upload_info["PartNumber"]
        serialized = json.dumps(part_upload_info)
        with open(self.tracker_file, "a") as f:
            f.write(serialized + "\n")
        self._loaded_parts[part_id] = part_upload_info
        if len(self._loaded_parts) == len(self.urls):
            self.cleanup()
            self.open = False

    def _load_parts_from_file(self):
        """Restore upload state from an existing tracker file, if present.

        Expired (> ~23h old, i.e. past the presumed presigned-URL lifetime)
        and corrupt tracker files are deleted and ignored so a damaged cache
        can never block a fresh upload. A truncated trailing part record
        (from a crash mid-write) drops only that record; the file is then
        rewritten with the valid records so future appends stay parseable.
        """
        if not isfile(self.tracker_file):
            return
        # Read everything up front so the handle is closed before any
        # os.remove below (removing an open file fails on Windows).
        with open(self.tracker_file, "r") as f:
            lines = f.readlines()
        try:
            header_blob = json.loads(lines[0])
            upload_id, urls, is_atomic_upload = (
                header_blob["upload_id"], header_blob["urls"], header_blob["is_atomic_upload"]
            )
            start_time = header_blob["start_time"]
        except (IndexError, ValueError, KeyError):
            # json.JSONDecodeError is a ValueError subclass.
            logger.warning(f"Tracker file {self.tracker_file} is corrupt. Deleting.")
            self._remove_tracker_file()
            return
        if (time.time() - start_time) > (60 * 60 * 23):
            logger.warning(f"Tracker file {self.tracker_file} is too old. Deleting.")
            self._remove_tracker_file()
            return
        self.upload_id, self.urls, self.is_atomic_upload = upload_id, urls, is_atomic_upload
        self.upload_started = True
        n_bad = 0
        for line in lines[1:]:
            try:
                blob = json.loads(line)
                self._loaded_parts[blob["PartNumber"]] = blob
            except (ValueError, KeyError):
                n_bad += 1
        if n_bad:
            # Rewrite header + valid parts so the file is clean again; a
            # truncated line without a newline would otherwise merge with
            # the next appended record.
            logger.warning(f"Dropped {n_bad} corrupt record(s) from tracker file {self.tracker_file}.")
            with open(self.tracker_file, "w") as f:
                f.write(json.dumps(header_blob) + "\n")
                for blob in self._loaded_parts.values():
                    f.write(json.dumps(blob) + "\n")

    def _remove_tracker_file(self):
        """Best-effort deletion of the on-disk tracker file."""
        try:
            os.remove(self.tracker_file)
        except FileNotFoundError:
            pass

    def part_has_been_uploaded(self, part_number):
        """True if this part was already uploaded in a previous (or the
        current) session. Always False when tracking is disabled."""
        if not self.open:
            return False
        return part_number in self._loaded_parts

    def get_part_info(self, part_number):
        """Return the stored record for an already-uploaded part."""
        return self._loaded_parts[part_number]

    def cleanup(self):
        """Delete the tracker file once the upload has fully completed."""
        if not self.open:
            return
        self._remove_tracker_file()
|
@@ -22,8 +22,8 @@ def _upload_one_file(args):
|
|
22
22
|
(result_file, filepath, session, progress_tracker,
|
23
23
|
link_type, overwrite, log_level, parallel_uploads,
|
24
24
|
use_cache, no_new_versions, threads_per_upload,
|
25
|
-
num_retries, ignore_errors, chunk_size_mb) = args
|
26
|
-
chunk_size = chunk_size_mb * 1024 * 1024
|
25
|
+
num_retries, ignore_errors, chunk_size_mb, use_atomic_upload) = args
|
26
|
+
chunk_size = chunk_size_mb * 1024 * 1024 if chunk_size_mb else None
|
27
27
|
if parallel_uploads:
|
28
28
|
_make_in_process_logger(log_level)
|
29
29
|
try:
|
@@ -34,6 +34,7 @@ def _upload_one_file(args):
|
|
34
34
|
session=session, overwrite=overwrite, progress_tracker=progress_tracker,
|
35
35
|
threads=threads_per_upload, use_cache=use_cache, chunk_size=chunk_size,
|
36
36
|
no_new_versions=no_new_versions, max_retries=num_retries,
|
37
|
+
use_atomic_upload=use_atomic_upload
|
37
38
|
)
|
38
39
|
else:
|
39
40
|
result_file.link_file(link_type, filepath)
|
@@ -59,6 +60,7 @@ class GeoSeeqUploadManager:
|
|
59
60
|
num_retries=3,
|
60
61
|
ignore_errors=False,
|
61
62
|
chunk_size_mb=5,
|
63
|
+
use_atomic_upload=True,
|
62
64
|
use_cache=True):
|
63
65
|
self.session = session
|
64
66
|
self.n_parallel_uploads = n_parallel_uploads
|
@@ -73,12 +75,18 @@ class GeoSeeqUploadManager:
|
|
73
75
|
self.num_retries = num_retries
|
74
76
|
self.ignore_errors = ignore_errors
|
75
77
|
self.chunk_size_mb = chunk_size_mb
|
78
|
+
self.use_atomic_upload = use_atomic_upload
|
76
79
|
|
77
80
|
def add_result_file(self, result_file, local_path):
    """Register a (result_file, local_path) pair for a later upload pass."""
    pair = (result_file, local_path)
    self._result_files.append(pair)
|
79
82
|
|
80
83
|
def add_local_file_to_result_folder(self, result_folder, local_path, geoseeq_file_name=None):
    """Queue a local file for upload into `result_folder`.

    If `geoseeq_file_name` is not given it is derived from `local_path`:
    an absolute path contributes only its basename, while a relative path
    has any *leading* "./" or "../" components stripped so the remote name
    mirrors the local layout.
    """
    if not geoseeq_file_name:
        if local_path.startswith("/"):  # if local path is an absolute path use the basename
            geoseeq_file_name = basename(local_path)
        else:
            # Strip only leading "./" and "../" components. A blanket
            # str.replace would also mangle interior occurrences
            # (e.g. "a/../b" -> "a/.b") or dots inside filenames.
            geoseeq_file_name = local_path
            while geoseeq_file_name.startswith(("./", "../")):
                geoseeq_file_name = geoseeq_file_name.split("/", 1)[1]
    result_file = result_folder.result_file(geoseeq_file_name)
    self.add_result_file(result_file, local_path)
|
84
92
|
|
@@ -99,7 +107,7 @@ class GeoSeeqUploadManager:
|
|
99
107
|
self.link_type, self.overwrite, self.log_level,
|
100
108
|
self.n_parallel_uploads > 1, self.use_cache, self.no_new_versions,
|
101
109
|
self.threads_per_upload, self.num_retries, self.ignore_errors,
|
102
|
-
self.chunk_size_mb,
|
110
|
+
self.chunk_size_mb, self.use_atomic_upload
|
103
111
|
) for result_file, local_path in self._result_files
|
104
112
|
]
|
105
113
|
out = []
|
@@ -34,6 +34,8 @@ geoseeq/cli/get_eula.py
|
|
34
34
|
geoseeq/cli/main.py
|
35
35
|
geoseeq/cli/manage.py
|
36
36
|
geoseeq/cli/progress_bar.py
|
37
|
+
geoseeq/cli/project.py
|
38
|
+
geoseeq/cli/raw.py
|
37
39
|
geoseeq/cli/run.py
|
38
40
|
geoseeq/cli/search.py
|
39
41
|
geoseeq/cli/user.py
|
@@ -72,10 +74,12 @@ geoseeq/plotting/map/map.py
|
|
72
74
|
geoseeq/plotting/map/overlay.py
|
73
75
|
geoseeq/result/__init__.py
|
74
76
|
geoseeq/result/bioinfo.py
|
77
|
+
geoseeq/result/file_chunker.py
|
75
78
|
geoseeq/result/file_download.py
|
76
79
|
geoseeq/result/file_upload.py
|
77
80
|
geoseeq/result/result_file.py
|
78
81
|
geoseeq/result/result_folder.py
|
82
|
+
geoseeq/result/resumable_upload_tracker.py
|
79
83
|
geoseeq/result/utils.py
|
80
84
|
geoseeq/vc/__init__.py
|
81
85
|
geoseeq/vc/checksum.py
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|