pybioos-0.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pybioos has been flagged as possibly problematic.

bioos/internal/tos.py ADDED
@@ -0,0 +1,306 @@
+ import math
+ import os
+ import re
+ from typing import List
+
+ import tos
+ from tos import DataTransferType, HttpMethodType
+ from tos.exceptions import TosClientError
+ from tos.models2 import DeleteError, ListedObject, ObjectTobeDeleted
+
+ from bioos.config import Config
+ from bioos.errors import ParameterError
+ from bioos.log import Logger
+
+ DEFAULT_THREAD = 10
+ LIST_OBJECT_MAX_KEYS = 1000
+ SIMPLE_UPLOAD_LIMITATION = 1024 * 1024 * 100
+ ONE_BATCH_WRITE_SIZE = 1024 * 1024 * 10
+ MAX_ALLOWED_PARTS = 10000
+ MIN_PART_SIZE = 1024 * 1024 * 5
+ ONE_BATCH_REQUEST = 50
+ ONE_BATCH_MAX_DELETE = 1000
+ REFRESH_TOKEN_TIME_BEFORE_EXPIRE = 20 * 60
+
+ CRC_CHECK_ERROR_PREFIX = "Check CRC failed"
+
+
+ def tos_percentage(consumed_bytes, total_bytes, rw_once_bytes,
+                    type_: DataTransferType):
+     # log progress roughly every 10% of the transfer
+     if rw_once_bytes == 0:
+         return
+     parts_num = math.ceil(float(total_bytes) / float(rw_once_bytes))
+     cur_part = math.ceil(float(consumed_bytes) / float(rw_once_bytes))
+     notify_num = int(parts_num / 10)
+     if total_bytes and notify_num and cur_part % notify_num == 0:
+         rate = int(100 * float(consumed_bytes) / float(total_bytes))
+         Config.Logger.info(
+             "rate:{}, consumed_bytes:{}, total_bytes:{}, rw_once_bytes:{}".
+             format(rate, consumed_bytes, total_bytes, rw_once_bytes))
+
+
+ class TOSHandler:
+
+     def __init__(self,
+                  client: tos.TosClientV2,
+                  bucket: str,
+                  logger: Logger = Config.Logger):
+         # the client is expected to carry a federation credential
+         self._client = client
+         self._bucket = bucket
+
+         self._debug_logging = logger.debug
+         self._info_logging = logger.info
+         self._warn_logging = logger.warn
+         self._error_logging = logger.error
+
+     def _is_crc_check_error(self, error: TosClientError) -> bool:
+         if not isinstance(error, TosClientError):
+             return False
+         if not error.message:
+             return False
+         return error.message.startswith(CRC_CHECK_ERROR_PREFIX)
+
+     def presign_download_url(self, file_path: str, duration: int) -> str:
+         return self._client.pre_signed_url(HttpMethodType.Http_Method_Get,
+                                            self._bucket, file_path,
+                                            duration).signed_url
+
+     def list_objects(self, target_path: str, num: int) -> List[ListedObject]:
+         # num == 0 means "list everything under the prefix"
+         object_list = []
+         if num != 0:
+             if num <= LIST_OBJECT_MAX_KEYS:
+                 resp = self._client.list_objects(bucket=self._bucket,
+                                                  prefix=target_path,
+                                                  max_keys=num)
+                 object_list = resp.contents
+             else:
+                 remain = num
+                 cur_marker = None
+                 while True:
+                     if remain <= LIST_OBJECT_MAX_KEYS:
+                         object_list += self._client.list_objects(
+                             bucket=self._bucket,
+                             prefix=target_path,
+                             marker=cur_marker,
+                             max_keys=remain).contents
+                         break
+                     else:
+                         resp = self._client.list_objects(
+                             bucket=self._bucket,
+                             prefix=target_path,
+                             marker=cur_marker,
+                             max_keys=LIST_OBJECT_MAX_KEYS)
+                         object_list += resp.contents
+                         if not resp.is_truncated:
+                             break
+                         cur_marker = resp.next_marker
+                         remain = remain - LIST_OBJECT_MAX_KEYS
+         else:
+             cur_marker = None
+             while True:
+                 resp = self._client.list_objects(bucket=self._bucket,
+                                                  prefix=target_path,
+                                                  marker=cur_marker,
+                                                  max_keys=LIST_OBJECT_MAX_KEYS)
+                 object_list += resp.contents
+                 if not resp.is_truncated:
+                     break
+                 cur_marker = resp.next_marker
+         return object_list
+
+     def upload_objects(
+         self,
+         files_to_upload: List[str],
+         target_path: str,
+         flatten: bool,
+         ignore: str = "",
+         include: str = "",
+     ) -> List[str]:
+
+         def _upload_fail(error_list_: List[str], file_path_: str):
+             error_list_.append(file_path_)
+
+         def _upload_small_file(file_path_, tos_target_path_):
+             self._client.put_object_from_file(
+                 bucket=self._bucket,
+                 key=tos_target_path_,
+                 file_path=file_path_,
+                 # don't show progress while uploading small files
+                 # data_transfer_listener=tos_percentage
+             )
+
+         def _upload_big_file(file_path_, tos_target_path_, fsize_):
+             # pick a part size that keeps the part count within the limit
+             part_size = max(int(fsize_ / MAX_ALLOWED_PARTS) + 1, MIN_PART_SIZE)
+             self._client.upload_file(bucket=self._bucket,
+                                      key=tos_target_path_,
+                                      file_path=file_path_,
+                                      part_size=part_size,
+                                      task_num=DEFAULT_THREAD,
+                                      data_transfer_listener=tos_percentage)
+
+         files_to_upload = self.files_filter(files_to_upload, include, ignore)
+         if len(files_to_upload) == 0:
+             self._info_logging("no files to upload")
+             return []
+
+         error_list = []
+         for file_path in files_to_upload:
+             if not os.path.isfile(file_path):
+                 error_list.append(file_path)
+                 self._error_logging(f"'{file_path}' is not a file")
+                 continue
+             fsize = os.path.getsize(file_path)
+
+             if flatten:
+                 to_upload_path = os.path.basename(file_path)
+             else:
+                 to_upload_path = os.path.normpath(file_path)
+
+             if os.path.isabs(to_upload_path):
+                 to_upload_path = to_upload_path.lstrip("/")
+
+             tos_target_path = os.path.normpath(
+                 os.path.join(target_path, to_upload_path))
+
+             self._debug_logging(
+                 f"[{file_path}] begins to upload to [{tos_target_path}]")
+
+             try:
+                 if fsize == 0:
+                     self._error_logging(
+                         f"can not upload empty file {tos_target_path}")
+                     _upload_fail(error_list, file_path)
+                     continue
+                 if fsize <= SIMPLE_UPLOAD_LIMITATION:
+                     _upload_small_file(file_path, tos_target_path)
+                 else:
+                     _upload_big_file(file_path, tos_target_path, fsize)
+             except Exception as err_:
+                 if self._is_crc_check_error(err_):
+                     self._warn_logging(f"CRC check {tos_target_path} failed, "
+                                        f"please delete the uploaded file by hand")
+                 self._error_logging(f"upload {tos_target_path} failed: {err_}")
+                 _upload_fail(error_list, file_path)
+                 continue
+
+             self._debug_logging(f"{file_path} uploaded successfully")
+
+         if error_list:
+             self._error_logging(
+                 f"{len(error_list)} uploads failed, please upload them again: "
+                 f"\n{error_list}")
+
+         return error_list
+
+     def download_objects(self,
+                          files_to_download: List[str],
+                          local_path: str,
+                          flatten: bool,
+                          ignore: str = "",
+                          include: str = "",
+                          force: bool = True) -> List[str]:
+         files_to_download = self.files_filter(files_to_download, include,
+                                               ignore)
+
+         files_failed = []
+         if len(files_to_download) == 0:
+             self._info_logging("no files to download")
+             return files_failed
+
+         for f in files_to_download:
+             # skip objects whose keys end with "/": they are directory
+             # placeholders, not downloadable files
+             if len(f) > 0 and f[-1] == "/":
+                 self._warn_logging(
+                     "can't download an object whose key ends with '/'")
+                 continue
+
+             local_target_path = os.path.basename(
+                 f) if flatten else os.path.normpath(f)
+
+             if not force:
+                 if os.path.isfile(local_target_path):
+                     self._debug_logging(
+                         f"skip downloading {local_target_path}")
+                     continue
+
+             try:
+                 resp = self._client.head_object(bucket=self._bucket, key=f)
+                 fsize_ = resp.content_length
+                 part_size = max(
+                     int(fsize_ / MAX_ALLOWED_PARTS) + 1, MIN_PART_SIZE)
+
+                 actual_file_path = os.path.join(local_path, local_target_path)
+                 self._client.download_file(
+                     bucket=self._bucket,
+                     key=f,
+                     file_path=actual_file_path,
+                     part_size=part_size,
+                     task_num=DEFAULT_THREAD,
+                     data_transfer_listener=tos_percentage)
+             except tos.exceptions.TosServerError as e:
+                 if e.status_code == 404:
+                     self._warn_logging(f"'{f}' not found")
+                 files_failed.append(f)
+             except Exception as err_:
+                 if self._is_crc_check_error(err_):
+                     self._warn_logging(
+                         f"CRC check {actual_file_path} failed, file will be removed"
+                     )
+                     os.remove(actual_file_path)
+                 self._error_logging(f"download {f} failed: {err_}")
+                 files_failed.append(f)
+
+         if len(files_failed) > 0:
+             self._warn_logging(f"failed to download {files_failed}")
+         return files_failed
+
+     def delete_objects(self, files_to_delete: List[str], ignore: str = "", include: str = "") \
+             -> List[DeleteError]:
+         files_to_delete = self.files_filter(files_to_delete, include, ignore)
+
+         if len(files_to_delete) == 0:
+             self._info_logging("no files to delete")
+             return []
+
+         # delete in batches of ONE_BATCH_MAX_DELETE keys
+         cur = 0
+         cur_end = min((cur + ONE_BATCH_MAX_DELETE), len(files_to_delete))
+         error_list = []
+         while cur < len(files_to_delete):
+             # default quiet mode only returns the per-object errors
+             resp = self._client.delete_multi_objects(
+                 bucket=self._bucket,
+                 objects=[
+                     ObjectTobeDeleted(f) for f in files_to_delete[cur:cur_end]
+                 ])
+             cur = cur_end
+             cur_end = min((cur + ONE_BATCH_MAX_DELETE), len(files_to_delete))
+             if len(resp.error) != 0:
+                 error_list += resp.error
+         if len(error_list) > 0:
+             self._info_logging(
+                 f"{len(error_list)} files left undeleted: {[err.key for err in error_list]}."
+             )
+         return error_list
+
+     def files_filter(self,
+                      files: List[str],
+                      include: str = "",
+                      ignore: str = "") -> List[str]:
+         # keep a file when its basename fully matches `include` (if given)
+         # and does not fully match `ignore` (if given)
+         file_lst = []
+         for f in files:
+             if f.endswith("/"):
+                 raise ParameterError("tos files path")
+             basename = os.path.basename(os.path.normpath(f))
+             if include != "":
+                 if not re.fullmatch(include, basename) or (
+                         ignore != "" and re.fullmatch(ignore, basename)):
+                     continue
+             else:
+                 if ignore != "" and re.fullmatch(ignore, basename):
+                     continue
+
+             file_lst.append(f)
+         return file_lst
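For orientation, a minimal usage sketch of TOSHandler (a hedged example, not part of the package: the access key, endpoint, bucket, and file paths are hypothetical, and a real pybioos session would build the client from a federation credential rather than static keys)::

    import tos

    from bioos.internal.tos import TOSHandler

    # hypothetical static credentials; see the note above
    client = tos.TosClientV2("your-ak", "your-sk",
                             "tos-cn-beijing.volces.com", "cn-beijing")
    handler = TOSHandler(client, "my-bucket")

    # upload two local files under the "inputs/" prefix, keeping relative paths
    failed = handler.upload_objects(["data/a.fastq", "data/b.fastq"],
                                    target_path="inputs",
                                    flatten=False)

    # pre-sign a GET URL valid for one hour
    url = handler.presign_download_url("inputs/data/a.fastq", duration=3600)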
bioos/log.py ADDED
@@ -0,0 +1,125 @@
+ import logging
+
+ import click
+ from colorama import Fore
+
+
+ class Logger:
+     _ERROR_LEVEL = 30
+     _WARNING_LEVEL = 20
+     _INFO_LEVEL = 10
+     _DEBUG_LEVEL = 0
+
+     _nameToLevel = {
+         'ERROR': _ERROR_LEVEL,
+         'WARN': _WARNING_LEVEL,
+         'INFO': _INFO_LEVEL,
+         'DEBUG': _DEBUG_LEVEL,
+     }
+
+     DEFAULT_LOGGER_LEVEL = _INFO_LEVEL
+     _CUR_LEVEL = DEFAULT_LOGGER_LEVEL
+
+     @classmethod
+     def _check_level(cls, level):
+         return cls._CUR_LEVEL <= level
+
+     @classmethod
+     def set_level(cls, level):
+         if isinstance(level, int):
+             cls._CUR_LEVEL = level
+         elif str(level) == level:
+             if level not in cls._nameToLevel:
+                 raise ValueError("Unknown level: %r" % level)
+             cls._CUR_LEVEL = cls._nameToLevel[level]
+         else:
+             raise TypeError("Level not an integer or a valid string: %r" %
+                             level)
+         return cls._CUR_LEVEL
+
+     @classmethod
+     def debug(cls, content):
+         pass
+
+     @classmethod
+     def info(cls, content):
+         pass
+
+     @classmethod
+     def warn(cls, content):
+         pass
+
+     @classmethod
+     def error(cls, content):
+         pass
+
+
+ # TODO: will be used for the CLI in the future
+ class ClickLogger(Logger):
+
+     @classmethod
+     def debug(cls, content):
+         if cls._check_level(cls._DEBUG_LEVEL):
+             click.secho(f"[DEBUG]:{content}", fg="green")
+
+     @classmethod
+     def info(cls, content):
+         if cls._check_level(cls._INFO_LEVEL):
+             click.secho(f"[INFO]:{content}")
+
+     @classmethod
+     def warn(cls, content):
+         if cls._check_level(cls._WARNING_LEVEL):
+             click.secho(f"[WARN]:{content}", fg="yellow")
+
+     @classmethod
+     def error(cls, content):
+         if cls._check_level(cls._ERROR_LEVEL):
+             click.secho(f"[ERROR]:{content}", fg="red")
+
+
+ class PyLogger(Logger):
+
+     class CustomFormatter(logging.Formatter):
+
+         reset = "\x1b[0m"
+         format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+         FORMATS = {
+             logging.DEBUG: Fore.GREEN + format + reset,
+             logging.INFO: Fore.LIGHTWHITE_EX + format + reset,
+             logging.WARNING: Fore.YELLOW + format + reset,
+             logging.ERROR: Fore.RED + format + reset,
+         }
+
+         def format(self, record):
+             log_fmt = self.FORMATS.get(record.levelno)
+             formatter = logging.Formatter(log_fmt)
+             return formatter.format(record)
+
+     name = "bioos-sdk"
+     logger = logging.getLogger(name)
+     handler = logging.StreamHandler()
+     handler.setFormatter(CustomFormatter())
+     logger.addHandler(handler)
+     logger.setLevel(logging.DEBUG)
+
+     @classmethod
+     def debug(cls, content):
+         if cls._check_level(cls._DEBUG_LEVEL):
+             cls.logger.debug(content)
+
+     @classmethod
+     def info(cls, content):
+         if cls._check_level(cls._INFO_LEVEL):
+             cls.logger.info(content)
+
+     @classmethod
+     def warn(cls, content):
+         if cls._check_level(cls._WARNING_LEVEL):
+             cls.logger.warning(content)
+
+     @classmethod
+     def error(cls, content):
+         if cls._check_level(cls._ERROR_LEVEL):
+             cls.logger.error(content)
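The level gate shared by these loggers is simple: a message is emitted only when the class-level _CUR_LEVEL is at or below the message's level. A short illustrative sequence (the calls are hypothetical; the class and level names come from the code above)::

    from bioos.log import PyLogger

    PyLogger.set_level("DEBUG")  # accepts an int or 'ERROR'/'WARN'/'INFO'/'DEBUG'
    PyLogger.debug("visible: _CUR_LEVEL (0) <= _DEBUG_LEVEL (0)")

    PyLogger.set_level("ERROR")
    PyLogger.info("suppressed: _CUR_LEVEL (30) > _INFO_LEVEL (10)")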
@@ -0,0 +1 @@
+
bioos/models/models.py ADDED
@@ -0,0 +1,13 @@
+ from tos.models2 import ListedObject
+
+
+ class DisplayListedObject:
+
+     def __init__(self, o: ListedObject, s3_url: str, https_url: str):
+         self.key = o.key
+         self.last_modified = o.last_modified
+         self.size = o.size
+         self.owner = o.owner.display_name
+         # self.hash_crc64_ecma = o.hash_crc64_ecma
+         self.s3_url = s3_url
+         self.https_url = https_url
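DisplayListedObject just flattens a ListedObject plus two pre-computed URLs into plain display fields. A hedged sketch of how it might be populated (reusing the hypothetical handler and bucket from the tos.py sketch above; the URL formats are illustrative, not taken from the package)::

    from bioos.models.models import DisplayListedObject

    objs = handler.list_objects("inputs/", num=0)  # handler: TOSHandler from above
    display = [
        DisplayListedObject(
            o,
            s3_url=f"s3://my-bucket/{o.key}",
            https_url=f"https://my-bucket.tos-cn-beijing.volces.com/{o.key}")
        for o in objs
    ]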
@@ -0,0 +1 @@
+
@@ -0,0 +1,157 @@
+ from typing import Dict, Iterable, Union
+
+ import pandas as pd
+ from cachetools import TTLCache, cached
+ from pandas import DataFrame
+
+ from bioos.config import Config
+ from bioos.errors import ConflictError, NotFoundError
+ from bioos.utils.common_tools import SingletonType
+
+
+ class DataModelResource(metaclass=SingletonType):
+
+     def __init__(self, workspace_id: str):
+         self.workspace_id = workspace_id
+
+     def __repr__(self):
+         return f"DataModelInfo:\n{self._entities_with_cache()}"
+
+     @cached(cache=TTLCache(maxsize=100, ttl=1))
+     def _entities_with_cache(self) -> pd.DataFrame:
+         return self.list()
+
+     def list(self) -> pd.DataFrame:
+         """Returns all 'normal' data models.
+
+         :return: table of 'normal' data models
+         :rtype: DataFrame
+         """
+         models = Config.service().list_data_models({
+             'WorkspaceID': self.workspace_id,
+         }).get("Items")
+         df = pd.DataFrame.from_records(models)
+         return df[df.Type == "normal"].reset_index(drop=True)
+
+     def write(self, sources: Dict[str, DataFrame], force: bool = True):
+         """Writes the given data to the remote 'normal' data models.
+
+         *Example*:
+         ::
+
+             import pandas as pd
+             ws = bioos.workspace("foo")
+             data = pd.DataFrame({"aaa": ["bbb"], "ccc": ["ddd"]})
+             ws.data_models.write(sources={"bar": data}, force=False)
+
+         :param sources: one or more data model contents, keyed by name
+         :type sources: Dict[str, DataFrame]
+         :param force: whether to overwrite a data model with the same name
+         :type force: bool
+         """
+         if not force:
+             entities = self.list()
+             all_normal_models_set = set()
+             for _, entity in entities.iterrows():
+                 all_normal_models_set.add(entity.Name)
+             duplicate_models_set = all_normal_models_set.intersection(
+                 set(sources.keys()))
+             if len(duplicate_models_set) > 0:
+                 raise ConflictError(
+                     "sources", f"{duplicate_models_set} already exists, "
+                     f"please use force=True to overwrite")
+
+         for name, data in sources.items():
+             Config.service().create_data_model({
+                 'WorkspaceID': self.workspace_id,
+                 'Name': name,
+                 'Headers': list(data.columns),
+                 'Rows': data.values.tolist(),
+             })
+
+     def read(
+         self,
+         sources: Union[str, Iterable[str],
+                        None] = None) -> Dict[str, DataFrame]:
+         """Reads the data from the remote 'normal' data models.
+
+         Returns all data models if `sources` is not set.
+
+         *Example*:
+         ::
+
+             ws = bioos.workspace("foo")
+             ws.data_models.read(sources="bar")  # output: {"bar": DataFrame}
+
+         :param sources: name(s) of the data models to read
+         :type sources: Union[str, Iterable[str], None]
+         :return: reading result
+         :rtype: Dict[str, DataFrame]
+         """
+         if sources is not None:
+             sources = {sources} if isinstance(sources, str) else set(sources)
+
+         entities = self.list()
+         all_normal_models = {}
+         for _, entity in entities.iterrows():
+             all_normal_models[entity.Name] = entity.ID
+         # return all data models if `sources` is empty
+         if not sources:
+             models_to_find = all_normal_models.keys()
+         else:
+             models_to_find = sources.intersection(set(
+                 all_normal_models.keys()))
+
+         if len(models_to_find) == 0:
+             raise NotFoundError("sources", sources)
+
+         models_res = {}
+         for model in models_to_find:
+             content = Config.service().list_data_model_rows({
+                 'WorkspaceID': self.workspace_id,
+                 'ID': all_normal_models[model],
+                 'PageSize': 0,
+             })
+             if content and content["TotalCount"] > 0:
+                 res_df = pd.DataFrame.from_records(content['Rows'])
+                 res_df.columns = content['Headers']
+                 models_res[model] = res_df
+         return models_res
+
+     def delete(self, target: str):
+         """Deletes the remote 'normal' data model with the given name.
+
+         *Example*:
+         ::
+
+             ws = bioos.workspace("foo")
+             ws.data_models.delete(target="bar")
+
+         :param target: name of the data model to delete
+         :type target: str
+         """
+         entities = self.list()
+
+         entity_row = entities[entities["Name"] == target]
+         if entity_row.empty:
+             raise NotFoundError("target", target)
+
+         ids = Config.service().list_data_model_row_ids({
+             'WorkspaceID': self.workspace_id,
+             'ID': entity_row.ID.iloc[0],
+         })
+
+         Config.service().delete_data_model_rows_and_headers({
+             'WorkspaceID': self.workspace_id,
+             'ID': entity_row.ID.iloc[0],
+             'RowIDs': ids["RowIDs"]
+         })
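Taken together, the docstring examples above suggest the following round trip through a workspace's data models (a sketch assuming the `bioos.workspace` entry point the docstrings use; the workspace name and table contents are hypothetical)::

    import pandas as pd

    import bioos

    ws = bioos.workspace("foo")

    # write one 'normal' data model named "sample", overwriting any existing one
    df = pd.DataFrame({"sample_id": ["s1", "s2"], "fastq": ["a.fq", "b.fq"]})
    ws.data_models.write(sources={"sample": df}, force=True)

    # read it back; omitting `sources` returns every 'normal' data model
    tables = ws.data_models.read(sources="sample")
    print(tables["sample"])

    # delete it by name
    ws.data_models.delete(target="sample")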