geoseeq 0.5.6a16__tar.gz → 0.6.1__tar.gz

Files changed (98)
  1. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/PKG-INFO +1 -1
  2. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/download.py +1 -0
  3. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/main.py +3 -1
  4. geoseeq-0.6.1/geoseeq/cli/project.py +96 -0
  5. geoseeq-0.6.1/geoseeq/cli/raw.py +59 -0
  6. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/upload/upload.py +2 -0
  7. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/upload/upload_reads.py +1 -0
  8. geoseeq-0.6.1/geoseeq/result/file_chunker.py +50 -0
  9. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/result/file_download.py +48 -10
  10. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/result/file_upload.py +55 -142
  11. geoseeq-0.6.1/geoseeq/result/resumable_download_tracker.py +99 -0
  12. geoseeq-0.6.1/geoseeq/result/resumable_upload_tracker.py +100 -0
  13. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/upload_download_manager.py +12 -4
  14. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq.egg-info/PKG-INFO +1 -1
  15. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq.egg-info/SOURCES.txt +5 -0
  16. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/pyproject.toml +1 -1
  17. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/LICENSE +0 -0
  18. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/README.md +0 -0
  19. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/__init__.py +0 -0
  20. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/app.py +0 -0
  21. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/blob_constructors.py +0 -0
  22. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/bulk_creators.py +0 -0
  23. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/__init__.py +0 -0
  24. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/constants.py +0 -0
  25. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/copy.py +0 -0
  26. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/detail.py +0 -0
  27. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/fastq_utils.py +0 -0
  28. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/get_eula.py +0 -0
  29. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/manage.py +0 -0
  30. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/progress_bar.py +0 -0
  31. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/run.py +0 -0
  32. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/search.py +0 -0
  33. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/shared_params/__init__.py +0 -0
  34. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/shared_params/common_state.py +0 -0
  35. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/shared_params/config.py +0 -0
  36. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/shared_params/id_handlers.py +0 -0
  37. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/shared_params/obj_getters.py +0 -0
  38. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/shared_params/opts_and_args.py +0 -0
  39. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/upload/__init__.py +0 -0
  40. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/upload/upload_advanced.py +0 -0
  41. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/user.py +0 -0
  42. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/utils.py +0 -0
  43. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/view.py +0 -0
  44. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/constants.py +0 -0
  45. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/contrib/__init__.py +0 -0
  46. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/contrib/ncbi/__init__.py +0 -0
  47. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/contrib/ncbi/api.py +0 -0
  48. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/contrib/ncbi/bioproject.py +0 -0
  49. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/contrib/ncbi/cli.py +0 -0
  50. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/contrib/ncbi/setup_logging.py +0 -0
  51. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/file_system_cache.py +0 -0
  52. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/id_constructors/__init__.py +0 -0
  53. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/id_constructors/from_blobs.py +0 -0
  54. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/id_constructors/from_ids.py +0 -0
  55. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/id_constructors/from_names.py +0 -0
  56. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/id_constructors/from_uuids.py +0 -0
  57. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/id_constructors/resolvers.py +0 -0
  58. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/id_constructors/utils.py +0 -0
  59. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/knex.py +0 -0
  60. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/organization.py +0 -0
  61. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/pipeline.py +0 -0
  62. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/plotting/__init__.py +0 -0
  63. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/plotting/constants.py +0 -0
  64. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/plotting/highcharts.py +0 -0
  65. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/plotting/map/__init__.py +0 -0
  66. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/plotting/map/base_layer.py +0 -0
  67. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/plotting/map/map.py +0 -0
  68. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/plotting/map/overlay.py +0 -0
  69. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/plotting/selectable.py +0 -0
  70. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/project.py +0 -0
  71. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/remote_object.py +0 -0
  72. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/result/__init__.py +0 -0
  73. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/result/bioinfo.py +0 -0
  74. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/result/result_file.py +0 -0
  75. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/result/result_folder.py +0 -0
  76. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/result/utils.py +0 -0
  77. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/sample.py +0 -0
  78. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/search.py +0 -0
  79. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/user.py +0 -0
  80. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/utils.py +0 -0
  81. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/vc/__init__.py +0 -0
  82. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/vc/checksum.py +0 -0
  83. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/vc/cli.py +0 -0
  84. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/vc/clone.py +0 -0
  85. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/vc/constants.py +0 -0
  86. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/vc/vc_cache.py +0 -0
  87. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/vc/vc_dir.py +0 -0
  88. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/vc/vc_sample.py +0 -0
  89. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/vc/vc_stub.py +0 -0
  90. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/work_orders.py +0 -0
  91. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq.egg-info/dependency_links.txt +0 -0
  92. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq.egg-info/entry_points.txt +0 -0
  93. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq.egg-info/top_level.txt +0 -0
  94. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/setup.cfg +0 -0
  95. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/setup.py +0 -0
  96. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/tests/__init__.py +0 -0
  97. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/tests/test_api_client.py +0 -0
  98. {geoseeq-0.5.6a16 → geoseeq-0.6.1}/tests/test_plotting.py +0 -0
{geoseeq-0.5.6a16 → geoseeq-0.6.1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: geoseeq
- Version: 0.5.6a16
+ Version: 0.6.1
  Summary: GeoSeeq command line tools and python API
  Author: David C. Danko
  Author-email: "David C. Danko" <dcdanko@biotia.io>
{geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/download.py
@@ -468,3 +468,4 @@ def cli_download_fastqs(state, cores, target_dir, yes, first, download, module_n
      click.confirm('Continue?', abort=True)
      logger.info(f'Downloading {len(download_manager)} files to {target_dir}')
      download_manager.download_files()
+
{geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/main.py
@@ -18,6 +18,7 @@ from .shared_params.opts_and_args import overwrite_option, yes_option
  from .detail import cli_detail
  from .run import cli_app
  from .get_eula import cli_eula
+ from .project import cli_project
 
  logger = logging.getLogger('geoseeq_api')
  handler = logging.StreamHandler()
@@ -53,7 +54,7 @@ def version():
      Use of this tool implies acceptance of the GeoSeeq End User License Agreement.
      Run `geoseeq eula show` to view the EULA.
      """
-     click.echo('0.5.6a16') # remember to update setup
+     click.echo('0.6.1') # remember to update setup
 
 
  @main.group('advanced')
@@ -65,6 +66,7 @@ cli_advanced.add_command(cli_copy)
  cli_advanced.add_command(cli_user)
  cli_advanced.add_command(cli_detail)
  cli_advanced.add_command(cli_upload_advanced)
+ cli_advanced.add_command(cli_project)
 
  @cli_advanced.group('experimental')
  def cli_experimental():
geoseeq-0.6.1/geoseeq/cli/project.py (new file)
@@ -0,0 +1,96 @@
+ import json
+ import logging
+ from os import makedirs, getcwd
+ from os.path import dirname, join
+
+ import click
+ import pandas as pd
+ from multiprocessing import Pool
+ from .shared_params import (
+     handle_project_id,
+     handle_folder_id,
+     project_id_arg,
+     sample_ids_arg,
+     handle_multiple_sample_ids,
+     handle_multiple_result_file_ids,
+     use_common_state,
+     flatten_list_of_els_and_files,
+     yes_option,
+     module_option,
+     ignore_errors_option,
+     folder_ids_arg,
+ )
+ from geoseeq.result.file_download import download_url
+ from geoseeq.utils import download_ftp
+ from geoseeq.id_constructors import (
+     result_file_from_uuid,
+     result_file_from_name,
+ )
+ from geoseeq.knex import GeoseeqNotFoundError
+ from .progress_bar import PBarManager
+ from .utils import convert_size
+ from geoseeq.constants import FASTQ_MODULE_NAMES
+ from geoseeq.result import ResultFile
+ from geoseeq.upload_download_manager import GeoSeeqDownloadManager
+ from geoseeq.file_system.filesystem_download import (
+     ProjectOnFilesystem,
+     FILE_STATUS_MODIFIED_REMOTE,
+     FILE_STATUS_MODIFIED_LOCAL,
+     FILE_STATUS_NEW_LOCAL,
+     FILE_STATUS_NEW_REMOTE,
+     FILE_STATUS_IS_LOCAL_STUB,
+ )
+
+
+ logger = logging.getLogger('geoseeq_api')
+
+
+ @click.group("project")
+ def cli_project():
+     """Download data from GeoSeeq."""
+     pass
+
+
+ @cli_project.command("clone")
+ @use_common_state
+ @click.option('--use-stubs/--full-files', default=True, help='Download full files or stubs')
+ @click.option('--target-dir', '-d', default=None, help='Directory to download the project to')
+ @project_id_arg
+ def cli_clone_project(state, use_stubs, target_dir, project_id):
+     """Clone a project to the local filesystem.
+     """
+     knex = state.get_knex().set_auth_required()
+     proj = handle_project_id(knex, project_id)
+     logger.info(f"Found project \"{proj.name}\"")
+     if target_dir is None:
+         target_dir = proj.name
+
+     project = ProjectOnFilesystem(proj, target_dir)
+     project.download(use_stubs=use_stubs)
+
+
+ @cli_project.command("status")
+ @use_common_state
+ def cli_project_status(state):
+     """Check the status of a project on the local filesystem.
+     """
+     project = ProjectOnFilesystem.from_path(getcwd(), recursive=True)
+
+     objs_by_status = {
+         FILE_STATUS_MODIFIED_LOCAL: [],
+         FILE_STATUS_MODIFIED_REMOTE: [],
+         FILE_STATUS_NEW_LOCAL: [],
+         FILE_STATUS_NEW_REMOTE: [],
+         FILE_STATUS_IS_LOCAL_STUB: [],
+     }
+     for obj_type, status, local_path, obj in project.list_abnormal_objects():
+         objs_by_status[status].append((obj_type, local_path, obj))
+
+     print(f"Project: {project.project.name}")
+     for status, objs in objs_by_status.items():
+         print(f"Status: {status}")
+         for obj_type, local_path, obj in objs:
+             if status in (FILE_STATUS_MODIFIED_LOCAL, FILE_STATUS_NEW_LOCAL):
+                 print(f" {obj_type}: {project.path_from_project_root(local_path)} -> {obj}")
+             else:
+                 print(f" {obj_type}: {obj} -> {project.path_from_project_root(local_path)}")
geoseeq-0.6.1/geoseeq/cli/raw.py (new file)
@@ -0,0 +1,59 @@
+ import click
+ import json
+ from .shared_params import use_common_state, overwrite_option
+ from geoseeq import GeoseeqNotFoundError
+ from geoseeq.blob_constructors import (
+     sample_result_file_from_uuid,
+     project_result_file_from_uuid,
+     sample_result_folder_from_uuid,
+     project_result_folder_from_uuid,
+ )
+
+
+ @click.group('raw')
+ def cli_raw():
+     """Low-level commands for interacting with the API."""
+     pass
+
+
+ @cli_raw.command('get-file-data')
+ @use_common_state
+ @click.argument('file_ids', nargs=-1)
+ def cli_get_file_data(state, file_ids):
+     """Print the raw stored data in a result file object."""
+     knex = state.get_knex()
+     for file_id in file_ids:
+         file_id = file_id.split(':')[-1]
+         try:
+             result_file = sample_result_file_from_uuid(knex, file_id)
+         except GeoseeqNotFoundError:
+             result_file = project_result_file_from_uuid(knex, file_id)
+         print(json.dumps(result_file.stored_data, indent=2), file=state.outfile)
+
+
+ @cli_raw.command('create-raw-file')
+ @use_common_state
+ @overwrite_option
+ @click.argument('folder_id')
+ @click.argument('result_filename')
+ @click.argument('filename', type=click.File('r'))
+ def cli_get_file_data(state, overwrite, folder_id, result_filename, filename):
+     """Print the raw stored data in a result file object."""
+     knex = state.get_knex()
+
+     folder_id = folder_id.split(':')[-1]
+     try:
+         result_folder = sample_result_folder_from_uuid(knex, folder_id)
+     except GeoseeqNotFoundError:
+         result_folder = project_result_folder_from_uuid(knex, folder_id)
+     blob = json.load(filename)
+     result_file = result_folder.result_file(result_filename)
+     if overwrite:
+         result_file.idem()
+         result_file.stored_data = blob
+         result_file.save()
+     else:
+         result_file.create()
+     click.echo(f'Created file {result_file.uuid}', file=state.outfile)
+
+
{geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/upload/upload.py
@@ -122,6 +122,7 @@ def cli_upload_file(state, cores, threads_per_upload, num_retries, chunk_size_mb
          use_cache=state.use_cache,
          num_retries=num_retries,
          ignore_errors=ignore_errors,
+         use_atomic_upload=True,
          session=None, #knex.new_session(),
          chunk_size_mb=chunk_size_mb if chunk_size_mb > 0 else None,
      )
@@ -160,6 +161,7 @@ def cli_upload_folder(state, cores, yes, private, recursive, hidden, no_new_vers
          overwrite=True,
          use_cache=state.use_cache,
          no_new_versions=no_new_versions,
+         use_atomic_upload=True,
      )
      for folder_name in folder_names:
          result_folder = root_obj.result_folder(folder_name).idem()
{geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/cli/upload/upload_reads.py
@@ -98,6 +98,7 @@ def _do_upload(groups, module_name, link_type, lib, filepaths, overwrite, no_new
          progress_tracker_factory=PBarManager().get_new_bar,
          use_cache=state.use_cache,
          no_new_versions=no_new_versions,
+         use_atomic_upload=True,
      )
      for group in groups:
          sample = lib.sample(group['sample_name']).idem()
geoseeq-0.6.1/geoseeq/result/file_chunker.py (new file)
@@ -0,0 +1,50 @@
+
+ from os.path import getsize
+ import logging
+
+ logger = logging.getLogger("geoseeq_api") # Same name as calling module
+ logger.addHandler(logging.NullHandler())
+
+
+ class FileChunker:
+
+     def __init__(self, filepath, chunk_size):
+         self.filepath = filepath
+         self.chunk_size = chunk_size
+         self.file_size = getsize(filepath)
+         self.n_parts = int(self.file_size / self.chunk_size) + 1
+         self.loaded_parts = []
+
+     def load_all_chunks(self):
+         if len(self.loaded_parts) != self.n_parts:
+             with open(self.filepath, "rb") as f:
+                 f.seek(0)
+                 for i in range(self.n_parts):
+                     chunk = f.read(self.chunk_size)
+                     self.loaded_parts.append(chunk)
+         return self # convenience for chaining
+
+     def chunk_is_preloaded(self, num):
+         return len(self.loaded_parts) > num and self.loaded_parts[num]
+
+     def read_one_chunk(self, num):
+         if not self.chunk_is_preloaded(num):
+             logger.debug(f"Reading chunk {num} from {self.filepath}")
+             with open(self.filepath, "rb") as f:
+                 f.seek(num * self.chunk_size)
+                 chunk = f.read(self.chunk_size)
+                 return chunk
+         return self.loaded_parts[num]
+
+     def get_chunk(self, num):
+         if self.chunk_is_preloaded(num):
+             return self.loaded_parts[num]
+         return self.read_one_chunk(num)
+
+     def get_chunk_size(self, num):
+         if num < (self.n_parts - 1): # all but the last chunk
+             return self.chunk_size
+         if self.chunk_is_preloaded(num): # last chunk, pre-loaded
+             return len(self.loaded_parts[num])
+         return len(self.read_one_chunk(num)) # last chunk, not pre-loaded
+
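
As a quick illustration of the chunking arithmetic above, a minimal sketch; the file path is a made-up example and FIVE_MB mirrors geoseeq.constants.FIVE_MB:

    from geoseeq.result.file_chunker import FileChunker

    FIVE_MB = 5 * 1024 * 1024
    chunker = FileChunker("reads.fastq.gz", FIVE_MB)    # hypothetical local file
    print(chunker.n_parts)                              # int(file_size / chunk_size) + 1 parts
    first = chunker.get_chunk(0)                        # read on demand; nothing is preloaded
    last = chunker.get_chunk_size(chunker.n_parts - 1)  # only the final part can be short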
{geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/result/file_download.py
@@ -2,29 +2,68 @@
  import urllib.request
  import logging
  import requests
- from os.path import basename, getsize, join, isfile, getmtime
+ import os
+ from os.path import basename, getsize, join, isfile, getmtime, dirname
  from pathlib import Path
  from tempfile import NamedTemporaryFile
 
  from geoseeq.utils import download_ftp
  from geoseeq.constants import FIVE_MB
+ from hashlib import md5
+ from .resumable_download_tracker import ResumableDownloadTracker
 
  logger = logging.getLogger("geoseeq_api") # Same name as calling module
 
+ def url_to_id(url):
+     url = url.split("?")[0]
+     return md5(url.encode()).hexdigest()[:16]
 
- def _download_head(url, filename, head=None, progress_tracker=None):
+
+ def _download_head(url, filename, head=None, start=0, progress_tracker=None):
      headers = None
      if head and head > 0:
-         headers = {"Range": f"bytes=0-{head}"}
+         headers = {"Range": f"bytes={start}-{head}"}
      response = requests.get(url, stream=True, headers=headers)
      response.raise_for_status()
      total_size_in_bytes = int(response.headers.get('content-length', 0))
      if progress_tracker: progress_tracker.set_num_chunks(total_size_in_bytes)
-     block_size = FIVE_MB
+     if total_size_in_bytes > 10 * FIVE_MB: # Use resumable download
+         print("Using resumable download")
+         return _download_resumable(response, filename, total_size_in_bytes, progress_tracker)
+     else:
+         block_size = FIVE_MB
+         with open(filename, 'wb') as file:
+             for data in response.iter_content(block_size):
+                 if progress_tracker: progress_tracker.update(len(data))
+                 file.write(data)
+         return filename
+
+
+ def _download_resumable(response, filename, total_size_in_bytes, progress_tracker=None, chunk_size=5 * FIVE_MB, part_prefix=".gs_download_{}_{}."):
+     target_id = url_to_id(response.url)
+     tracker = ResumableDownloadTracker(chunk_size, target_id, filename)
+     if not tracker.download_started: tracker.start_download(response.url)
+     n_chunks = total_size_in_bytes // chunk_size
+     for i in range(n_chunks):
+         bytes_start, bytes_end = i * chunk_size, min((i + 1) * chunk_size - 1, total_size_in_bytes - 1)
+         if tracker.part_has_been_downloaded(i):
+             logger.debug(f"Part {i} has already been downloaded.")
+         else:
+             logger.debug(f"Downloading part {i} of {n_chunks - 1}")
+             part_filename = join(dirname(filename), part_prefix.format(i, n_chunks - 1) + basename(filename))
+             _download_head(response.url, part_filename, head=bytes_end, start=bytes_start, progress_tracker=None)
+             part_info = dict(part_number=i, start=bytes_start, end=bytes_end, part_filename=part_filename)
+             tracker.add_part(part_info)
+         if progress_tracker: progress_tracker.update(bytes_end - bytes_start + 1)
+
+     # at this point all parts have been downloaded
      with open(filename, 'wb') as file:
-         for data in response.iter_content(block_size):
-             if progress_tracker: progress_tracker.update(len(data))
-             file.write(data)
+         for i in range(n_chunks):
+             part_info = tracker.get_part_info(i)
+             part_filename = part_info["part_filename"]
+             with open(part_filename, 'rb') as part_file:
+                 file.write(part_file.read())
+     tracker.cleanup()
      return filename
 
 
@@ -44,7 +83,7 @@ def guess_download_kind(url):
      return 'generic'
 
 
- def download_url(url, kind='guess', filename=None, head=None, progress_tracker=None):
+ def download_url(url, kind='guess', filename=None, head=None, progress_tracker=None, target_uuid=None):
      """Return a local filepath to the downloaded file. Download the file."""
      if filename and isfile(filename):
          file_size = getsize(filename)
@@ -67,7 +106,6 @@ def download_url(url, kind='guess', filename=None, head=None, progress_tracker=N
      raise ValueError(f"Unknown download kind: {kind}")
 
 
-
  class ResultFileDownload:
      """Abstract class that handles download methods for result files."""
 
@@ -136,7 +174,7 @@ class ResultFileDownload:
          url = self.get_download_url()
          filepath = download_url(
              url, blob_type, filename,
-             head=head, progress_tracker=progress_tracker
+             head=head, progress_tracker=progress_tracker,
          )
          if cache and flag_suffix:
              # create flag file
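
For intuition, the Range arithmetic in the new _download_resumable splits an object into inclusive byte windows. A small worked sketch with a made-up 60 MB object, mirroring n_chunks = total_size_in_bytes // chunk_size above:

    FIVE_MB = 5 * 1024 * 1024
    chunk_size = 5 * FIVE_MB                  # 25 MB parts, as in the diff
    total = 60 * 1024 * 1024                  # hypothetical object size
    n_chunks = total // chunk_size            # floor division, as upstream
    for i in range(n_chunks):
        start = i * chunk_size
        end = min((i + 1) * chunk_size - 1, total - 1)
        print(f"Range: bytes={start}-{end}")  # the header _download_head sends per part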
{geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/result/file_upload.py
@@ -13,130 +13,21 @@ from geoseeq.utils import md5_checksum
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from .utils import *
  from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
-
- class FileChunker:
-
-     def __init__(self, filepath, chunk_size):
-         self.filepath = filepath
-         self.chunk_size = chunk_size
-         self.file_size = getsize(filepath)
-         self.n_parts = int(self.file_size / self.chunk_size) + 1
-         self.loaded_parts = []
-
-     def load_all_chunks(self):
-         if len(self.loaded_parts) != self.n_parts:
-             with open(self.filepath, "rb") as f:
-                 f.seek(0)
-                 for i in range(self.n_parts):
-                     chunk = f.read(self.chunk_size)
-                     self.loaded_parts.append(chunk)
-         return self # convenience for chaining
-
-     def chunk_is_preloaded(self, num):
-         return len(self.loaded_parts) > num and self.loaded_parts[num]
-
-     def read_one_chunk(self, num):
-         if not self.chunk_is_preloaded(num):
-             logger.debug(f"Reading chunk {num} from {self.filepath}")
-             with open(self.filepath, "rb") as f:
-                 f.seek(num * self.chunk_size)
-                 chunk = f.read(self.chunk_size)
-                 return chunk
-         return self.loaded_parts[num]
-
-     def get_chunk(self, num):
-         if self.chunk_is_preloaded(num):
-             return self.loaded_parts[num]
-         return self.read_one_chunk(num)
-
-     def get_chunk_size(self, num):
-         if num < (self.n_parts - 1): # all but the last chunk
-             return self.chunk_size
-         if self.chunk_is_preloaded(num): # last chunk, pre-loaded
-             return len(self.loaded_parts[num])
-         return len(self.read_one_chunk(num)) # last chunk, not pre-loaded
-
-
- class ResumableUploadTracker:
-
-     def __init__(self, filepath, chunk_size, tracker_file_prefix="gs_resumable_upload_tracker"):
-         self.open, self.upload_started = True, False
-         self.upload_id, self.urls = None, None
-         self.filepath = filepath
-         self.tracker_file = join(
-             GEOSEEQ_CACHE_DIR, 'upload',
-             tracker_file_prefix + f".{chunk_size}.{getsize(filepath)}." + basename(filepath)
-         )
-         try:
-             os.makedirs(dirname(self.tracker_file), exist_ok=True)
-         except Exception as e:
-             logger.warning(f'Could not create resumable upload tracker directory. {e}')
-             self.open = False
-         self._loaded_parts = {}
-         self._load_parts_from_file()
-
-     def start_upload(self, upload_id, urls):
-         if not self.open:
-             return
-         if self.upload_started:
-             raise GeoseeqGeneralError("Upload has already started.")
-         blob = dict(upload_id=upload_id, urls=urls, start_time=time.time())
-         serialized = json.dumps(blob)
-         with open(self.tracker_file, "w") as f:
-             f.write(serialized + "\n")
-         self.upload_id, self.urls = upload_id, urls
-         self.upload_started = True
-
-     def add_part(self, part_upload_info):
-         if not self.open:
-             return
-         part_id = part_upload_info["PartNumber"]
-         serialized = json.dumps(part_upload_info)
-         with open(self.tracker_file, "a") as f:
-             f.write(serialized + "\n")
-         self._loaded_parts[part_id] = part_upload_info
-         if len(self._loaded_parts) == len(self.urls):
-             self.cleanup()
-             self.open = False
-
-     def _load_parts_from_file(self):
-         if not isfile(self.tracker_file):
-             return
-         with open(self.tracker_file, "r") as f:
-             header_blob = json.loads(f.readline())
-             self.upload_id, self.urls = header_blob["upload_id"], header_blob["urls"]
-             start_time = header_blob["start_time"]
-             if (time.time() - start_time) > (60 * 60 * 23):
-                 logger.warning(f"Tracker file {self.tracker_file} is too old. Deleting.")
-                 os.remove(self.tracker_file)
-                 return
-             self.upload_started = True
-             for line in f:
-                 blob = json.loads(line)
-                 part_id = blob["PartNumber"]
-                 self._loaded_parts[part_id] = blob
-
-     def part_has_been_uploaded(self, part_number):
-         if not self.open:
-             return False
-         return part_number in self._loaded_parts
-
-     def get_part_info(self, part_number):
-         return self._loaded_parts[part_number]
-
-     def cleanup(self):
-         if not self.open:
-             return
-         try:
-             os.remove(self.tracker_file)
-         except FileNotFoundError:
-             pass
+ from .file_chunker import FileChunker
+ from .resumable_upload_tracker import ResumableUploadTracker
 
 
  class ResultFileUpload:
      """Abstract class that handles upload methods for result files."""
 
-     def _create_multipart_upload(self, filepath, file_size, optional_fields):
+     def _result_type(self, atomic=False):
+         if self.is_sample_result:
+             return "sample"
+         if atomic:
+             return "project"
+         return "group"
+
+     def _create_multipart_upload(self, filepath, file_size, optional_fields, atomic=False):
          optional_fields = optional_fields if optional_fields else {}
         optional_fields.update(
              {
@@ -147,23 +38,31 @@ class ResultFileUpload:
          data = {
              "filename": basename(filepath),
              "optional_fields": optional_fields,
-             "result_type": "sample" if self.is_sample_result else "group",
+             "result_type": self._result_type(atomic),
          }
-         response = self.knex.post(f"/ar_fields/{self.uuid}/create_upload", json=data)
+         url = f"/ar_fields/{self.uuid}/create_upload"
+         if atomic:
+             data["fieldname"] = self.name
+             url = f"/ars/{self.parent.uuid}/create_atomic_upload"
+         response = self.knex.post(url, json=data)
          return response
 
-     def _prep_multipart_upload(self, filepath, file_size, chunk_size, optional_fields):
+     def _prep_multipart_upload(self, filepath, file_size, chunk_size, optional_fields, atomic=False):
          n_parts = int(file_size / chunk_size) + 1
-         response = self._create_multipart_upload(filepath, file_size, optional_fields)
+         response = self._create_multipart_upload(filepath, file_size, optional_fields, atomic=atomic)
          upload_id = response["upload_id"]
-         parts = list(range(1, n_parts + 1))
          data = {
-             "parts": parts,
+             "parts": list(range(1, n_parts + 1)),
              "stance": "upload-multipart",
              "upload_id": upload_id,
-             "result_type": "sample" if self.is_sample_result else "group",
+             "result_type": self._result_type(atomic),
          }
-         response = self.knex.post(f"/ar_fields/{self.uuid}/create_upload_urls", json=data)
+         url = f"/ar_fields/{self.uuid}/create_upload_urls"
+         if atomic:
+             data["uuid"] = response["uuid"]
+             data["fieldname"] = self.name
+             url = f"ars/{self.parent.uuid}/create_atomic_upload_urls"
+         response = self.knex.post(url, json=data)
          urls = response
          return upload_id, urls
 
@@ -204,16 +103,17 @@ class ResultFileUpload:
             resumable_upload_tracker.add_part(blob)
          return blob
 
-     def _finish_multipart_upload(self, upload_id, complete_parts):
-         response = self.knex.post(
-             f"/ar_fields/{self.uuid}/complete_upload",
-             json={
-                 "parts": complete_parts,
-                 "upload_id": upload_id,
-                 "result_type": "sample" if self.is_sample_result else "group",
-             },
-             json_response=False,
-         )
+     def _finish_multipart_upload(self, upload_id, complete_parts, atomic=False):
+         data = {
+             "parts": complete_parts,
+             "upload_id": upload_id,
+             "result_type": self._result_type(atomic),
+         }
+         url = f"/ar_fields/{self.uuid}/complete_upload"
+         if atomic:
+             data["fieldname"] = self.name
+             url = f"/ars/{self.parent.uuid}/complete_atomic_upload"
+         response = self.knex.post(url, json=data, json_response=False)
          response.raise_for_status()
 
      def _upload_parts(self, file_chunker, urls, max_retries, session, progress_tracker, threads, resumable_upload_tracker=None):
@@ -257,6 +157,7 @@ class ResultFileUpload:
          progress_tracker=None,
          threads=1,
          use_cache=True,
+         use_atomic_upload=False,
      ):
          """Upload a file to S3 using the multipart upload process."""
          logger.info(f"Uploading {filepath} to S3 using multipart upload.")
@@ -267,15 +168,21 @@ class ResultFileUpload:
          logger.debug(f"Using chunk size of {chunk_size} bytes.")
          resumable_upload_tracker = None
          if use_cache and file_size > 10 * FIVE_MB: # only use resumable upload tracker for larger files
-             resumable_upload_tracker = ResumableUploadTracker(filepath, chunk_size)
+             upload_target_uuid = self.parent.uuid if use_atomic_upload else self.uuid
+             resumable_upload_tracker = ResumableUploadTracker(filepath, chunk_size, upload_target_uuid)
+
          if resumable_upload_tracker and resumable_upload_tracker.upload_started:
+             # a resumable upload for this file has already started
+             resumable_upload_exists_and_is_valid = True
              upload_id, urls = resumable_upload_tracker.upload_id, resumable_upload_tracker.urls
+             use_atomic_upload = resumable_upload_tracker.is_atomic_upload
              logger.info(f'Resuming upload for "{filepath}", upload_id: "{upload_id}"')
          else:
-             upload_id, urls = self._prep_multipart_upload(filepath, file_size, chunk_size, optional_fields)
+             upload_id, urls = self._prep_multipart_upload(filepath, file_size, chunk_size, optional_fields, atomic=use_atomic_upload)
              if resumable_upload_tracker:
                  logger.info(f'Creating new resumable upload for "{filepath}", upload_id: "{upload_id}"')
-                 resumable_upload_tracker.start_upload(upload_id, urls)
+                 resumable_upload_tracker.start_upload(upload_id, urls, is_atomic_upload=use_atomic_upload)
+
          logger.info(f'Starting upload for "{filepath}"')
          complete_parts = []
          file_chunker = FileChunker(filepath, chunk_size)
@@ -294,14 +201,20 @@ class ResultFileUpload:
              threads,
              resumable_upload_tracker=resumable_upload_tracker
          )
-         self._finish_multipart_upload(upload_id, complete_parts)
+         self._finish_multipart_upload(upload_id, complete_parts, atomic=use_atomic_upload)
          logger.info(f'Finished Upload for "{filepath}"')
+         if use_atomic_upload:
+             # if this was an atomic upload then this result may not have existed on the server before
+             self.get()
          return self
 
      def upload_file(self, filepath, multipart_thresh=FIVE_MB, overwrite=True, no_new_versions=False, **kwargs):
          if self.exists() and not overwrite:
              raise GeoseeqGeneralError(f"Overwrite is set to False and file {self.uuid} already exists.")
-         self.idem()
+         if not kwargs.get("use_atomic_upload", False):
+             self.idem()
+         else:
+             self.parent.idem()
          if no_new_versions and self.has_downloadable_file():
              raise GeoseeqGeneralError(f"File {self} already has a downloadable file. Not uploading a new version.")
          resolved_path = Path(filepath).resolve()
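
The net effect of the atomic switches above is endpoint selection. A condensed, hypothetical helper showing the mapping, with paths copied from the diff (the second atomic path has no leading slash upstream):

    def upload_endpoints(field_uuid, parent_uuid, atomic):  # hypothetical helper, not in the package
        if atomic:
            return (f"/ars/{parent_uuid}/create_atomic_upload",
                    f"ars/{parent_uuid}/create_atomic_upload_urls",  # no leading slash, as in the diff
                    f"/ars/{parent_uuid}/complete_atomic_upload")
        return (f"/ar_fields/{field_uuid}/create_upload",
                f"/ar_fields/{field_uuid}/create_upload_urls",
                f"/ar_fields/{field_uuid}/complete_upload")

Atomic requests also carry "fieldname" in the payload, and non-sample results report result_type "project" instead of "group".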
geoseeq-0.6.1/geoseeq/result/resumable_download_tracker.py (new file)
@@ -0,0 +1,99 @@
+
+ import time
+ import json
+ import os
+ from os.path import basename, getsize, join, dirname, isfile, getctime
+ from pathlib import Path
+ from random import random
+ import requests
+
+ from geoseeq.knex import GeoseeqGeneralError
+ from geoseeq.constants import FIVE_MB
+ from geoseeq.utils import md5_checksum
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from .utils import *
+ from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
+ from .file_chunker import FileChunker
+
+
+
+ class ResumableDownloadTracker:
+
+     def __init__(self, chunk_size, download_target_id, target_local_path, tracker_file_prefix="gs_resumable_download_tracker"):
+         self.open, self.download_started = True, False
+         self.download_target_id = download_target_id
+         self.target_local_path = target_local_path
+         self.tracker_file_dir = join(GEOSEEQ_CACHE_DIR, 'download')
+         self.tracker_file = join(
+             self.tracker_file_dir,
+             tracker_file_prefix + f".{download_target_id}.{chunk_size}." + basename(target_local_path)
+         )
+         try:
+             os.makedirs(self.tracker_file_dir, exist_ok=True)
+         except Exception as e:
+             logger.warning(f'Could not create resumable download tracker directory. {e}')
+             self.open = False
+         self._loaded_parts = {}
+         self._load_parts_from_file()
+
+     def start_download(self, download_url):
+         if not self.open:
+             return
+         if self.download_started:
+             raise GeoseeqGeneralError("Download has already started.")
+         self.download_started = True
+         blob = dict(download_url=download_url,
+                     download_target_id=self.download_target_id,
+                     start_time=time.time())
+         serialized = json.dumps(blob)
+         with open(self.tracker_file, "w") as f:
+             f.write(serialized + "\n")
+         self.download_url = download_url
+         return self
+
+     def add_part(self, part_download_info):
+         if not self.open:
+             assert False, "Cannot add part to closed ResumableDownloadTracker"
+         part_id = part_download_info["part_number"]
+         serialized = json.dumps(part_download_info)
+         with open(self.tracker_file, "a") as f:
+             f.write(serialized + "\n")
+         self._loaded_parts[part_id] = part_download_info
+
+     def _load_parts_from_file(self):
+         if not isfile(self.tracker_file):
+             return
+         with open(self.tracker_file, "r") as f:
+             header_blob = json.loads(f.readline())
+             self.download_url = header_blob["download_url"]
+             start_time = header_blob["start_time"] # for now we don't expire resumable downloads
+             self.download_started = True
+             for line in f:
+                 part_info = json.loads(line)
+                 part_id = part_info["part_number"]
+                 self._loaded_parts[part_id] = part_info
+
+     def part_has_been_downloaded(self, part_number):
+         if not self.open:
+             return False
+         if part_number not in self._loaded_parts:
+             return False
+         part_info = self._loaded_parts[part_number]
+         part_path = part_info["part_filename"]
+         return isfile(part_path)
+
+     def get_part_info(self, part_number):
+         if not self.open:
+             return None
+         return self._loaded_parts.get(part_number, None)
+
+     def cleanup(self):
+         if not self.open:
+             return
+         for part in self._loaded_parts.values():
+             part_path = part["part_filename"]
+             if isfile(part_path):
+                 os.remove(part_path)
+         os.remove(self.tracker_file)
+         self.open = False
+
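
The tracker file this class writes is plain JSON lines: one header blob, then one blob per completed part. A hypothetical sketch of how such a file comes to exist (all values made up; the real filename includes the prefix, target id, and chunk size):

    import json, time

    header = dict(download_url="https://example.com/presigned",
                  download_target_id="9f2a0c1d2e3f4a5b", start_time=time.time())
    part = dict(part_number=0, start=0, end=26214399,
                part_filename=".gs_download_0_2.reads.fastq.gz")
    with open("tracker.jsonl", "w") as f:   # illustrative name only
        f.write(json.dumps(header) + "\n")  # first line: the header blob
        f.write(json.dumps(part) + "\n")    # following lines: one blob per finished part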
geoseeq-0.6.1/geoseeq/result/resumable_upload_tracker.py (new file)
@@ -0,0 +1,100 @@
+
+ import time
+ import json
+ import os
+ from os.path import basename, getsize, join, dirname, isfile, getctime
+ from pathlib import Path
+ from random import random
+ import requests
+
+ from geoseeq.knex import GeoseeqGeneralError
+ from geoseeq.constants import FIVE_MB
+ from geoseeq.utils import md5_checksum
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from .utils import *
+ from geoseeq.file_system_cache import GEOSEEQ_CACHE_DIR
+ from .file_chunker import FileChunker
+
+
+ class ResumableUploadTracker:
+
+     def __init__(self, filepath, chunk_size, upload_target_uuid, tracker_file_prefix="gs_resumable_upload_tracker"):
+         self.open, self.upload_started = True, False
+         self.upload_id, self.urls, self.is_atomic_upload = None, None, None
+         self.upload_target_uuid = upload_target_uuid
+         self.filepath = filepath
+         self.tracker_file_dir = join(GEOSEEQ_CACHE_DIR, 'upload')
+         self.tracker_file = join(
+             self.tracker_file_dir,
+             tracker_file_prefix + f".{upload_target_uuid}.{chunk_size}.{getsize(filepath)}." + basename(filepath)
+         )
+         try:
+             os.makedirs(self.tracker_file_dir, exist_ok=True)
+         except Exception as e:
+             logger.warning(f'Could not create resumable upload tracker directory. {e}')
+             self.open = False
+         self._loaded_parts = {}
+         self._load_parts_from_file()
+
+     def start_upload(self, upload_id, urls, is_atomic_upload=False):
+         if not self.open:
+             return
+         if self.upload_started:
+             raise GeoseeqGeneralError("Upload has already started.")
+         self.upload_started = True
+         blob = dict(upload_id=upload_id,
+                     urls=urls,
+                     is_atomic_upload=is_atomic_upload,
+                     upload_target_uuid=self.upload_target_uuid,
+                     start_time=time.time())
+         serialized = json.dumps(blob)
+         with open(self.tracker_file, "w") as f:
+             f.write(serialized + "\n")
+         self.upload_id, self.urls, self.is_atomic_upload = upload_id, urls, is_atomic_upload
+
+     def add_part(self, part_upload_info):
+         if not self.open:
+             return
+         part_id = part_upload_info["PartNumber"]
+         serialized = json.dumps(part_upload_info)
+         with open(self.tracker_file, "a") as f:
+             f.write(serialized + "\n")
+         self._loaded_parts[part_id] = part_upload_info
+         if len(self._loaded_parts) == len(self.urls):
+             self.cleanup()
+             self.open = False
+
+     def _load_parts_from_file(self):
+         if not isfile(self.tracker_file):
+             return
+         with open(self.tracker_file, "r") as f:
+             header_blob = json.loads(f.readline())
+             self.upload_id, self.urls, self.is_atomic_upload = (
+                 header_blob["upload_id"], header_blob["urls"], header_blob["is_atomic_upload"]
+             )
+             start_time = header_blob["start_time"]
+             if (time.time() - start_time) > (60 * 60 * 23):
+                 logger.warning(f"Tracker file {self.tracker_file} is too old. Deleting.")
+                 os.remove(self.tracker_file)
+                 return
+             self.upload_started = True
+             for line in f:
+                 blob = json.loads(line)
+                 part_id = blob["PartNumber"]
+                 self._loaded_parts[part_id] = blob
+
+     def part_has_been_uploaded(self, part_number):
+         if not self.open:
+             return False
+         return part_number in self._loaded_parts
+
+     def get_part_info(self, part_number):
+         return self._loaded_parts[part_number]
+
+     def cleanup(self):
+         if not self.open:
+             return
+         try:
+             os.remove(self.tracker_file)
+         except FileNotFoundError:
+             pass
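
Putting the tracker in context, a condensed sketch of the resume handshake as _upload_multipart_file uses it; `prep` stands in for the private _prep_multipart_upload call and the wrapper itself is hypothetical:

    from geoseeq.result.resumable_upload_tracker import ResumableUploadTracker

    def resume_or_start(filepath, chunk_size, upload_target_uuid, prep, n_parts, use_atomic_upload=True):
        tracker = ResumableUploadTracker(filepath, chunk_size, upload_target_uuid)
        if tracker.upload_started:              # a fresh tracker file (< 23 hours old) was found
            upload_id, urls = tracker.upload_id, tracker.urls
            use_atomic_upload = tracker.is_atomic_upload   # the original mode wins on resume
        else:
            upload_id, urls = prep()
            tracker.start_upload(upload_id, urls, is_atomic_upload=use_atomic_upload)
        todo = [n for n in range(1, n_parts + 1) if not tracker.part_has_been_uploaded(n)]
        return upload_id, urls, todo            # only the parts not yet recorded get uploaded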
{geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq/upload_download_manager.py
@@ -22,7 +22,7 @@ def _upload_one_file(args):
      (result_file, filepath, session, progress_tracker,
       link_type, overwrite, log_level, parallel_uploads,
       use_cache, no_new_versions, threads_per_upload,
-      num_retries, ignore_errors, chunk_size_mb) = args
+      num_retries, ignore_errors, chunk_size_mb, use_atomic_upload) = args
      chunk_size = chunk_size_mb * 1024 * 1024 if chunk_size_mb else None
      if parallel_uploads:
          _make_in_process_logger(log_level)
@@ -34,6 +34,7 @@ def _upload_one_file(args):
              session=session, overwrite=overwrite, progress_tracker=progress_tracker,
              threads=threads_per_upload, use_cache=use_cache, chunk_size=chunk_size,
              no_new_versions=no_new_versions, max_retries=num_retries,
+             use_atomic_upload=use_atomic_upload
          )
      else:
          result_file.link_file(link_type, filepath)
@@ -59,6 +60,7 @@ class GeoSeeqUploadManager:
                   num_retries=3,
                   ignore_errors=False,
                   chunk_size_mb=5,
+                  use_atomic_upload=True,
                   use_cache=True):
          self.session = session
          self.n_parallel_uploads = n_parallel_uploads
@@ -73,12 +75,18 @@ class GeoSeeqUploadManager:
          self.num_retries = num_retries
          self.ignore_errors = ignore_errors
          self.chunk_size_mb = chunk_size_mb
+         self.use_atomic_upload = use_atomic_upload
 
      def add_result_file(self, result_file, local_path):
          self._result_files.append((result_file, local_path))
 
      def add_local_file_to_result_folder(self, result_folder, local_path, geoseeq_file_name=None):
-         geoseeq_file_name = geoseeq_file_name if geoseeq_file_name else local_path
+         if not geoseeq_file_name:
+             if local_path.startswith("/"): # if local path is an absolute path use the basename
+                 geoseeq_file_name = basename(local_path)
+             else:
+                 # remove "./" and "../" from local path to get a geoseeq file name
+                 geoseeq_file_name = local_path.replace("./", "").replace("../", "")
          result_file = result_folder.result_file(geoseeq_file_name)
          self.add_result_file(result_file, local_path)
 
@@ -99,7 +107,7 @@ class GeoSeeqUploadManager:
              self.link_type, self.overwrite, self.log_level,
              self.n_parallel_uploads > 1, self.use_cache, self.no_new_versions,
              self.threads_per_upload, self.num_retries, self.ignore_errors,
-             self.chunk_size_mb,
+             self.chunk_size_mb, self.use_atomic_upload
          ) for result_file, local_path in self._result_files
          ]
          out = []
@@ -186,7 +194,7 @@ class GeoSeeqDownloadManager:
          self._convert_result_files_to_urls()
          download_args = [(
              url, file_path,
-             self.progress_tracker_factory(url),
+             self.progress_tracker_factory(file_path),
              self.ignore_errors, self.head, self.log_level,
              self.n_parallel_downloads > 1
          ) for url, file_path in self._result_files]
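
The new name mapping in add_local_file_to_result_folder behaves like this; a standalone mirror of that logic with hypothetical paths:

    from os.path import basename

    def geoseeq_name(local_path):  # illustration only, not a package function
        if local_path.startswith("/"):
            return basename(local_path)        # "/data/runs/s1.fastq" -> "s1.fastq"
        return local_path.replace("./", "").replace("../", "")  # "./b2/s1.fq" -> "b2/s1.fq"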
{geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: geoseeq
- Version: 0.5.6a16
+ Version: 0.6.1
  Summary: GeoSeeq command line tools and python API
  Author: David C. Danko
  Author-email: "David C. Danko" <dcdanko@biotia.io>
{geoseeq-0.5.6a16 → geoseeq-0.6.1}/geoseeq.egg-info/SOURCES.txt
@@ -34,6 +34,8 @@ geoseeq/cli/get_eula.py
  geoseeq/cli/main.py
  geoseeq/cli/manage.py
  geoseeq/cli/progress_bar.py
+ geoseeq/cli/project.py
+ geoseeq/cli/raw.py
  geoseeq/cli/run.py
  geoseeq/cli/search.py
  geoseeq/cli/user.py
@@ -72,10 +74,13 @@ geoseeq/plotting/map/map.py
  geoseeq/plotting/map/overlay.py
  geoseeq/result/__init__.py
  geoseeq/result/bioinfo.py
+ geoseeq/result/file_chunker.py
  geoseeq/result/file_download.py
  geoseeq/result/file_upload.py
  geoseeq/result/result_file.py
  geoseeq/result/result_folder.py
+ geoseeq/result/resumable_download_tracker.py
+ geoseeq/result/resumable_upload_tracker.py
  geoseeq/result/utils.py
  geoseeq/vc/__init__.py
  geoseeq/vc/checksum.py
{geoseeq-0.5.6a16 → geoseeq-0.6.1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "geoseeq"
- version = "0.5.6a16"
+ version = "0.6.1"
  authors = [
      { name="David C. Danko", email="dcdanko@biotia.io" },
  ]