pybioos-0.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pybioos has been flagged as possibly problematic.

bioos/internal/tos.py ADDED
@@ -0,0 +1,306 @@
+ import math
+ import os
+ import re
+ from typing import List
+
+ import tos
+ from tos import DataTransferType, HttpMethodType
+ from tos.exceptions import TosClientError
+ from tos.models2 import DeleteError, ListedObject, ObjectTobeDeleted
+
+ from bioos.config import Config
+ from bioos.errors import ParameterError
+ from bioos.log import Logger
+
+ DEFAULT_THREAD = 10
+ LIST_OBJECT_MAX_KEYS = 1000
+ SIMPLE_UPLOAD_LIMITATION = 1024 * 1024 * 100
+ ONE_BATCH_WRITE_SIZE = 1024 * 1024 * 10
+ MAX_ALLOWED_PARTS = 10000
+ MIN_PART_SIZE = 1024 * 1024 * 5
+ ONE_BATCH_REQUEST = 50
+ ONE_BATCH_MAX_DELETE = 1000
+ REFRESH_TOKEN_TIME_BEFORE_EXPIRE = 20 * 60
+
+ CRC_CHECK_ERROR_PREFIX = "Check CRC failed"
+
+
+ def tos_percentage(consumed_bytes, total_bytes, rw_once_bytes,
+                    type_: DataTransferType):
+     # log progress roughly every 10% of the transfer
+     if rw_once_bytes == 0:
+         return
+     parts_num = math.ceil(float(total_bytes) / float(rw_once_bytes))
+     cur_part = math.ceil(float(consumed_bytes) / float(rw_once_bytes))
+     notify_num = int(parts_num / 10)
+     if total_bytes and notify_num and cur_part % notify_num == 0:
+         rate = int(100 * float(consumed_bytes) / float(total_bytes))
+         Config.Logger.info(
+             "rate:{}, consumed_bytes:{}, total_bytes:{}, rw_once_bytes:{}".
+             format(rate, consumed_bytes, total_bytes, rw_once_bytes))
+
+
+ class TOSHandler:
+
+     def __init__(self,
+                  client: tos.TosClientV2,
+                  bucket: str,
+                  logger: Logger = Config.Logger):
+         # the client is expected to carry a federation credential
+         self._client = client
+         self._bucket = bucket
+
+         self._debug_logging = logger.debug
+         self._info_logging = logger.info
+         self._warn_logging = logger.warn
+         self._error_logging = logger.error
+
+     def _is_crc_check_error(self, error: TosClientError) -> bool:
+         if not isinstance(error, TosClientError):
+             return False
+         if not error.message:
+             return False
+         return error.message.startswith(CRC_CHECK_ERROR_PREFIX)
+
+     def presign_download_url(self, file_path: str, duration: int) -> str:
+         return self._client.pre_signed_url(HttpMethodType.Http_Method_Get,
+                                            self._bucket, file_path,
+                                            duration).signed_url
+
+     def list_objects(self, target_path: str, num: int) -> List[ListedObject]:
+         # num == 0 means "list everything under the prefix"
+         object_list = []
+         if num != 0:
+             if num <= LIST_OBJECT_MAX_KEYS:
+                 resp = self._client.list_objects(bucket=self._bucket,
+                                                  prefix=target_path,
+                                                  max_keys=num)
+                 object_list = resp.contents
+             else:
+                 remain = num
+                 cur_marker = None
+                 while True:
+                     if remain <= LIST_OBJECT_MAX_KEYS:
+                         object_list += self._client.list_objects(
+                             bucket=self._bucket,
+                             prefix=target_path,
+                             marker=cur_marker,
+                             max_keys=remain).contents
+                         break
+                     else:
+                         resp = self._client.list_objects(
+                             bucket=self._bucket,
+                             prefix=target_path,
+                             marker=cur_marker,
+                             max_keys=LIST_OBJECT_MAX_KEYS)
+                         object_list += resp.contents
+                         if not resp.is_truncated:
+                             break
+                         cur_marker = resp.next_marker
+                         remain = remain - LIST_OBJECT_MAX_KEYS
+         else:
+             cur_marker = None
+             while True:
+                 resp = self._client.list_objects(bucket=self._bucket,
+                                                  prefix=target_path,
+                                                  marker=cur_marker,
+                                                  max_keys=LIST_OBJECT_MAX_KEYS)
+                 object_list += resp.contents
+                 if not resp.is_truncated:
+                     break
+                 cur_marker = resp.next_marker
+         return object_list
+
+     def upload_objects(
+         self,
+         files_to_upload: List[str],
+         target_path: str,
+         flatten: bool,
+         ignore: str = "",
+         include: str = "",
+     ) -> List[str]:
+
+         def _upload_fail(error_list_: List[str], file_path_: str):
+             error_list_.append(file_path_)
+
+         def _upload_small_file(file_path_, tos_target_path_):
+             self._client.put_object_from_file(
+                 bucket=self._bucket,
+                 key=tos_target_path_,
+                 file_path=file_path_,
+                 # don't show progress while uploading small files
+                 # data_transfer_listener=tos_percentage
+             )
+
+         def _upload_big_file(file_path_, tos_target_path_, fsize_):
+             # pick a part size that keeps the part count within the limit
+             part_size = max(int(fsize_ / MAX_ALLOWED_PARTS) + 1, MIN_PART_SIZE)
+             self._client.upload_file(bucket=self._bucket,
+                                      key=tos_target_path_,
+                                      file_path=file_path_,
+                                      part_size=part_size,
+                                      task_num=DEFAULT_THREAD,
+                                      data_transfer_listener=tos_percentage)
+
+         files_to_upload = self.files_filter(files_to_upload, include, ignore)
+         if len(files_to_upload) == 0:
+             self._info_logging("no files to upload")
+             return []
+
+         error_list = []
+         for file_path in files_to_upload:
+             if not os.path.isfile(file_path):
+                 error_list.append(file_path)
+                 self._error_logging(f"'{file_path}' is not a file")
+                 continue
+             fsize = os.path.getsize(file_path)
+
+             if flatten:
+                 to_upload_path = os.path.basename(file_path)
+             else:
+                 to_upload_path = os.path.normpath(file_path)
+
+             if os.path.isabs(to_upload_path):
+                 to_upload_path = to_upload_path.lstrip("/")
+
+             tos_target_path = os.path.normpath(
+                 os.path.join(target_path, to_upload_path))
+
+             self._debug_logging(
+                 f"[{file_path}] begins to upload to [{tos_target_path}]")
+
+             try:
+                 if fsize == 0:
+                     self._error_logging(
+                         f"can not upload empty file {tos_target_path}")
+                     _upload_fail(error_list, file_path)
+                     continue
+                 if fsize <= SIMPLE_UPLOAD_LIMITATION:
+                     _upload_small_file(file_path, tos_target_path)
+                 else:
+                     _upload_big_file(file_path, tos_target_path, fsize)
+             except Exception as err_:
+                 if self._is_crc_check_error(err_):
+                     self._warn_logging(f"CRC check {tos_target_path} failed, "
+                                        f"please delete the uploaded file by hand")
+                 self._error_logging(f"upload {tos_target_path} failed: {err_}")
+                 _upload_fail(error_list, file_path)
+                 continue
+
+             self._debug_logging(f"{file_path} uploaded successfully")
+
+         if error_list:
+             self._error_logging(
+                 f"{len(error_list)} uploads failed, please upload them again: "
+                 f"\n{error_list}")
+
+         return error_list
+
+     def download_objects(self,
+                          files_to_download: List[str],
+                          local_path: str,
+                          flatten: bool,
+                          ignore: str = "",
+                          include: str = "",
+                          force: bool = True) -> List[str]:
+         files_to_download = self.files_filter(files_to_download, include,
+                                               ignore)
+
+         files_failed = []
+         if len(files_to_download) == 0:
+             self._info_logging("no files to download")
+             return files_failed
+
+         for f in files_to_download:
+             # skip objects whose keys end with "/": they are directory
+             # placeholders, not downloadable files
+             if len(f) > 0 and f[-1] == "/":
+                 self._warn_logging(
+                     "can't download an object whose key ends with '/'")
+                 continue
+
+             local_target_path = os.path.basename(
+                 f) if flatten else os.path.normpath(f)
+
+             if not force:
+                 if os.path.isfile(local_target_path):
+                     self._debug_logging(
+                         f"skip downloading {local_target_path}")
+                     continue
+
+             try:
+                 resp = self._client.head_object(bucket=self._bucket, key=f)
+                 fsize_ = resp.content_length
+                 part_size = max(
+                     int(fsize_ / MAX_ALLOWED_PARTS) + 1, MIN_PART_SIZE)
+
+                 actual_file_path = os.path.join(local_path, local_target_path)
+                 self._client.download_file(
+                     bucket=self._bucket,
+                     key=f,
+                     file_path=actual_file_path,
+                     part_size=part_size,
+                     task_num=DEFAULT_THREAD,
+                     data_transfer_listener=tos_percentage)
+             except tos.exceptions.TosServerError as e:
+                 if e.status_code == 404:
+                     self._warn_logging(f"'{f}' not found")
+                 files_failed.append(f)
+             except Exception as err_:
+                 if self._is_crc_check_error(err_):
+                     self._warn_logging(
+                         f"CRC check {actual_file_path} failed, file will be removed"
+                     )
+                     os.remove(actual_file_path)
+                 self._error_logging(f"download {f} failed: {err_}")
+                 files_failed.append(f)
+
+         if len(files_failed) > 0:
+             self._warn_logging(f"failed to download {files_failed}")
+         return files_failed
+
+     def delete_objects(self, files_to_delete: List[str], ignore: str = "", include: str = "") \
+             -> List[DeleteError]:
+         files_to_delete = self.files_filter(files_to_delete, include, ignore)
+
+         if len(files_to_delete) == 0:
+             self._info_logging("no files to delete")
+             return []
+
+         # delete in batches of ONE_BATCH_MAX_DELETE keys
+         cur = 0
+         cur_end = min((cur + ONE_BATCH_MAX_DELETE), len(files_to_delete))
+         error_list = []
+         while cur < len(files_to_delete):
+             # default quiet mode only returns the per-object errors
+             resp = self._client.delete_multi_objects(
+                 bucket=self._bucket,
+                 objects=[
+                     ObjectTobeDeleted(f) for f in files_to_delete[cur:cur_end]
+                 ])
+             cur = cur_end
+             cur_end = min((cur + ONE_BATCH_MAX_DELETE), len(files_to_delete))
+             if len(resp.error) != 0:
+                 error_list += resp.error
+         if len(error_list) > 0:
+             self._info_logging(
+                 f"{len(error_list)} files left undeleted: {[err.key for err in error_list]}."
+             )
+         return error_list
+
+     def files_filter(self,
+                      files: List[str],
+                      include: str = "",
+                      ignore: str = "") -> List[str]:
+         # keep a file when its basename fully matches `include` (if given)
+         # and does not fully match `ignore` (if given)
+         file_lst = []
+         for f in files:
+             if f.endswith("/"):
+                 raise ParameterError("tos files path")
+             basename = os.path.basename(os.path.normpath(f))
+             if include != "":
+                 if not re.fullmatch(include, basename) or (
+                         ignore != "" and re.fullmatch(ignore, basename)):
+                     continue
+             else:
+                 if ignore != "" and re.fullmatch(ignore, basename):
+                     continue
+
+             file_lst.append(f)
+         return file_lst
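For orientation, a minimal usage sketch of TOSHandler (a hedged example, not part of the package: the access key, endpoint, bucket, and file paths are hypothetical, and a real pybioos session would build the client from a federation credential rather than static keys)::

    import tos

    from bioos.internal.tos import TOSHandler

    # hypothetical static credentials; see the note above
    client = tos.TosClientV2("your-ak", "your-sk",
                             "tos-cn-beijing.volces.com", "cn-beijing")
    handler = TOSHandler(client, "my-bucket")

    # upload two local files under the "inputs/" prefix, keeping relative paths
    failed = handler.upload_objects(["data/a.fastq", "data/b.fastq"],
                                    target_path="inputs",
                                    flatten=False)

    # pre-sign a GET URL valid for one hour
    url = handler.presign_download_url("inputs/data/a.fastq", duration=3600)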
bioos/log.py ADDED
@@ -0,0 +1,125 @@
+ import logging
+
+ import click
+ from colorama import Fore
+
+
+ class Logger:
+     _ERROR_LEVEL = 30
+     _WARNING_LEVEL = 20
+     _INFO_LEVEL = 10
+     _DEBUG_LEVEL = 0
+
+     _nameToLevel = {
+         'ERROR': _ERROR_LEVEL,
+         'WARN': _WARNING_LEVEL,
+         'INFO': _INFO_LEVEL,
+         'DEBUG': _DEBUG_LEVEL,
+     }
+
+     DEFAULT_LOGGER_LEVEL = _INFO_LEVEL
+     _CUR_LEVEL = DEFAULT_LOGGER_LEVEL
+
+     @classmethod
+     def _check_level(cls, level):
+         return cls._CUR_LEVEL <= level
+
+     @classmethod
+     def set_level(cls, level):
+         if isinstance(level, int):
+             cls._CUR_LEVEL = level
+         elif str(level) == level:
+             if level not in cls._nameToLevel:
+                 raise ValueError("Unknown level: %r" % level)
+             cls._CUR_LEVEL = cls._nameToLevel[level]
+         else:
+             raise TypeError("Level not an integer or a valid string: %r" %
+                             level)
+         return cls._CUR_LEVEL
+
+     @classmethod
+     def debug(cls, content):
+         pass
+
+     @classmethod
+     def info(cls, content):
+         pass
+
+     @classmethod
+     def warn(cls, content):
+         pass
+
+     @classmethod
+     def error(cls, content):
+         pass
+
+
+ # TODO: will be used for the CLI in the future
+ class ClickLogger(Logger):
+
+     @classmethod
+     def debug(cls, content):
+         if cls._check_level(cls._DEBUG_LEVEL):
+             click.secho(f"[DEBUG]:{content}", fg="green")
+
+     @classmethod
+     def info(cls, content):
+         if cls._check_level(cls._INFO_LEVEL):
+             click.secho(f"[INFO]:{content}")
+
+     @classmethod
+     def warn(cls, content):
+         if cls._check_level(cls._WARNING_LEVEL):
+             click.secho(f"[WARN]:{content}", fg="yellow")
+
+     @classmethod
+     def error(cls, content):
+         if cls._check_level(cls._ERROR_LEVEL):
+             click.secho(f"[ERROR]:{content}", fg="red")
+
+
+ class PyLogger(Logger):
+
+     class CustomFormatter(logging.Formatter):
+
+         reset = "\x1b[0m"
+         format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+         FORMATS = {
+             logging.DEBUG: Fore.GREEN + format + reset,
+             logging.INFO: Fore.LIGHTWHITE_EX + format + reset,
+             logging.WARNING: Fore.YELLOW + format + reset,
+             logging.ERROR: Fore.RED + format + reset,
+         }
+
+         def format(self, record):
+             log_fmt = self.FORMATS.get(record.levelno)
+             formatter = logging.Formatter(log_fmt)
+             return formatter.format(record)
+
+     name = "bioos-sdk"
+     logger = logging.getLogger(name)
+     handler = logging.StreamHandler()
+     handler.setFormatter(CustomFormatter())
+     logger.addHandler(handler)
+     logger.setLevel(logging.DEBUG)
+
+     @classmethod
+     def debug(cls, content):
+         if cls._check_level(cls._DEBUG_LEVEL):
+             cls.logger.debug(content)
+
+     @classmethod
+     def info(cls, content):
+         if cls._check_level(cls._INFO_LEVEL):
+             cls.logger.info(content)
+
+     @classmethod
+     def warn(cls, content):
+         if cls._check_level(cls._WARNING_LEVEL):
+             cls.logger.warning(content)
+
+     @classmethod
+     def error(cls, content):
+         if cls._check_level(cls._ERROR_LEVEL):
+             cls.logger.error(content)
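The level gate shared by these loggers is simple: a message is emitted only when the class-level _CUR_LEVEL is at or below the message's level. A short illustrative sequence (the calls are hypothetical; the class and level names come from the code above)::

    from bioos.log import PyLogger

    PyLogger.set_level("DEBUG")  # accepts an int or 'ERROR'/'WARN'/'INFO'/'DEBUG'
    PyLogger.debug("visible: _CUR_LEVEL (0) <= _DEBUG_LEVEL (0)")

    PyLogger.set_level("ERROR")
    PyLogger.info("suppressed: _CUR_LEVEL (30) > _INFO_LEVEL (10)")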
@@ -0,0 +1 @@
+
bioos/models/models.py ADDED
@@ -0,0 +1,13 @@
+ from tos.models2 import ListedObject
+
+
+ class DisplayListedObject:
+
+     def __init__(self, o: ListedObject, s3_url: str, https_url: str):
+         self.key = o.key
+         self.last_modified = o.last_modified
+         self.size = o.size
+         self.owner = o.owner.display_name
+         # self.hash_crc64_ecma = o.hash_crc64_ecma
+         self.s3_url = s3_url
+         self.https_url = https_url
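DisplayListedObject just flattens a ListedObject plus two pre-computed URLs into plain display fields. A hedged sketch of how it might be populated (reusing the hypothetical handler and bucket from the tos.py sketch above; the URL formats are illustrative, not taken from the package)::

    from bioos.models.models import DisplayListedObject

    objs = handler.list_objects("inputs/", num=0)  # handler: TOSHandler from above
    display = [
        DisplayListedObject(
            o,
            s3_url=f"s3://my-bucket/{o.key}",
            https_url=f"https://my-bucket.tos-cn-beijing.volces.com/{o.key}")
        for o in objs
    ]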
@@ -0,0 +1 @@
+
@@ -0,0 +1,157 @@
+ from typing import Dict, Iterable, Union
+
+ import pandas as pd
+ from cachetools import TTLCache, cached
+ from pandas import DataFrame
+
+ from bioos.config import Config
+ from bioos.errors import ConflictError, NotFoundError
+ from bioos.utils.common_tools import SingletonType
+
+
+ class DataModelResource(metaclass=SingletonType):
+
+     def __init__(self, workspace_id: str):
+         self.workspace_id = workspace_id
+
+     def __repr__(self):
+         return f"DataModelInfo:\n{self._entities_with_cache()}"
+
+     @cached(cache=TTLCache(maxsize=100, ttl=1))
+     def _entities_with_cache(self) -> pd.DataFrame:
+         return self.list()
+
+     def list(self) -> pd.DataFrame:
+         """Returns all 'normal' data models.
+
+         :return: table of 'normal' data models
+         :rtype: DataFrame
+         """
+         models = Config.service().list_data_models({
+             'WorkspaceID': self.workspace_id,
+         }).get("Items")
+         df = pd.DataFrame.from_records(models)
+         return df[df.Type == "normal"].reset_index(drop=True)
+
+     def write(self, sources: Dict[str, DataFrame], force: bool = True):
+         """Writes the given data to the remote 'normal' data models.
+
+         *Example*:
+         ::
+
+             import pandas as pd
+             ws = bioos.workspace("foo")
+             data = pd.DataFrame({"aaa": ["bbb"], "ccc": ["ddd"]})
+             ws.data_models.write(sources={"bar": data}, force=False)
+
+         :param sources: one or more data model contents, keyed by name
+         :type sources: Dict[str, DataFrame]
+         :param force: whether to overwrite a data model with the same name
+         :type force: bool
+         """
+         if not force:
+             entities = self.list()
+             all_normal_models_set = set()
+             for _, entity in entities.iterrows():
+                 all_normal_models_set.add(entity.Name)
+             duplicate_models_set = all_normal_models_set.intersection(
+                 set(sources.keys()))
+             if len(duplicate_models_set) > 0:
+                 raise ConflictError(
+                     "sources", f"{duplicate_models_set} already exists, "
+                     f"please use force=True to overwrite")
+
+         for name, data in sources.items():
+             Config.service().create_data_model({
+                 'WorkspaceID': self.workspace_id,
+                 'Name': name,
+                 'Headers': list(data.columns),
+                 'Rows': data.values.tolist(),
+             })
+
+     def read(
+         self,
+         sources: Union[str, Iterable[str],
+                        None] = None) -> Dict[str, DataFrame]:
+         """Reads the data from the remote 'normal' data models.
+
+         Returns all data models if `sources` is not set.
+
+         *Example*:
+         ::
+
+             ws = bioos.workspace("foo")
+             ws.data_models.read(sources="bar")  # output: {"bar": DataFrame}
+
+         :param sources: name(s) of the data models to read
+         :type sources: Union[str, Iterable[str], None]
+         :return: reading result
+         :rtype: Dict[str, DataFrame]
+         """
+         if sources is not None:
+             sources = {sources} if isinstance(sources, str) else set(sources)
+
+         entities = self.list()
+         all_normal_models = {}
+         for _, entity in entities.iterrows():
+             all_normal_models[entity.Name] = entity.ID
+         # return all data models if `sources` is empty
+         if not sources:
+             models_to_find = all_normal_models.keys()
+         else:
+             models_to_find = sources.intersection(set(
+                 all_normal_models.keys()))
+
+         if len(models_to_find) == 0:
+             raise NotFoundError("sources", sources)
+
+         models_res = {}
+         for model in models_to_find:
+             content = Config.service().list_data_model_rows({
+                 'WorkspaceID': self.workspace_id,
+                 'ID': all_normal_models[model],
+                 'PageSize': 0,
+             })
+             if content and content["TotalCount"] > 0:
+                 res_df = pd.DataFrame.from_records(content['Rows'])
+                 res_df.columns = content['Headers']
+                 models_res[model] = res_df
+         return models_res
+
+     def delete(self, target: str):
+         """Deletes the remote 'normal' data model with the given name.
+
+         *Example*:
+         ::
+
+             ws = bioos.workspace("foo")
+             ws.data_models.delete(target="bar")
+
+         :param target: name of the data model to delete
+         :type target: str
+         """
+         entities = self.list()
+
+         entity_row = entities[entities["Name"] == target]
+         if entity_row.empty:
+             raise NotFoundError("target", target)
+
+         ids = Config.service().list_data_model_row_ids({
+             'WorkspaceID': self.workspace_id,
+             'ID': entity_row.ID.iloc[0],
+         })
+
+         Config.service().delete_data_model_rows_and_headers({
+             'WorkspaceID': self.workspace_id,
+             'ID': entity_row.ID.iloc[0],
+             'RowIDs': ids["RowIDs"]
+         })
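Taken together, the docstring examples above suggest the following round trip through a workspace's data models (a sketch assuming the `bioos.workspace` entry point the docstrings use; the workspace name and table contents are hypothetical)::

    import pandas as pd

    import bioos

    ws = bioos.workspace("foo")

    # write one 'normal' data model named "sample", overwriting any existing one
    df = pd.DataFrame({"sample_id": ["s1", "s2"], "fastq": ["a.fq", "b.fq"]})
    ws.data_models.write(sources={"sample": df}, force=True)

    # read it back; omitting `sources` returns every 'normal' data model
    tables = ws.data_models.read(sources="sample")
    print(tables["sample"])

    # delete it by name
    ws.data_models.delete(target="sample")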