pybioos 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pybioos might be problematic. Click here for more details.

@@ -0,0 +1,590 @@
1
+ from datetime import datetime
2
+ from typing import List
3
+
4
+ import pandas as pd
5
+ from cachetools import TTLCache, cached
6
+ from pandas import DataFrame
7
+ from typing_extensions import Literal
8
+
9
+ from bioos.config import Config
10
+ from bioos.errors import ConflictError, NotFoundError, ParameterError
11
+ from bioos.resource.data_models import DataModelResource
12
+ from bioos.utils import workflows
13
+ from bioos.utils.common_tools import SingletonType, dict_str, is_json
14
+
15
# Placeholder assigned to fields whose value has not been fetched yet.
UNKNOWN = "Unknown"

# Status values used to annotate Submission.status and Run.status.
SUBMISSION_STATUS = Literal["Succeeded", "Failed", "Running", "Pending"]
RUN_STATUS = Literal["Succeeded", "Failed", "Running", "Pending"]

# Workflow languages used to annotate the ``language`` parameter.
WORKFLOW_LANGUAGE = Literal["WDL"]
19
+
20
+
21
class Run(metaclass=SingletonType):  # singleton pattern -- why?
    """A single run that belongs to a submission.

    Remote-backed fields start out as placeholders (``UNKNOWN``, ``0`` or
    ``None``) and are filled in by :meth:`sync`.
    """

    def __init__(self, workspace_id: str, id_: str, submission_id: str):
        """
        :param workspace_id: Workspace id
        :type workspace_id: str
        :param id_: Run id
        :type id_: str
        :param submission_id: Submission ID
        :type submission_id: str
        """
        # Identity of the run.
        self.id = id_
        self.workspace_id = workspace_id
        self.submission = submission_id

        # Placeholders, overwritten by sync().
        self.inputs = UNKNOWN
        self.outputs = UNKNOWN
        self.start_time = 0
        self._status = UNKNOWN
        self._engine_run_id = UNKNOWN
        self._log: str = UNKNOWN
        self._error: str = UNKNOWN
        self._finish_time = 0
        self._duration = 0
        self._tasks: pd.DataFrame = None

        # Replace the placeholders above with the remote values.
        self.sync()

    def __repr__(self):
        fields = {
            "id": self.id,
            "workspace_id": self.workspace_id,
            "submission_id": self.submission,
            "engine_run_id": self.engine_run_id,
            "inputs": self.inputs,
            "outputs": self.outputs,
            "log_path": self.log,
            "error_message": self.error,
            "duration": self.duration,
            "start_time": self.start_time,
            "finish_time": self.finish_time,
            "status": self.status,
        }
        info_dict = dict_str(fields)
        # TODO: the output format of the task info still needs improving
        tasks_info = self.tasks if self.tasks is not None else []
        return f"RunInfo:\n[\n{info_dict}]\nTasksInfo:\n{tasks_info}"

    @property
    def status(self) -> RUN_STATUS:
        """Returns the Run status.

        A query is only triggered while the run has not reached
        "Succeeded" or "Failed".

        :return: Run status
        :rtype: Literal["Succeeded", "Failed", "Running", "Pending"]
        """
        if self._status not in ("Succeeded", "Failed"):
            self.sync()
        return self._status

    @property
    def finish_time(self) -> int:
        """Returns the finish time of the Run, syncing first if it is unset.

        :return: The finish time of the run
        :rtype: int
        """
        if not self._finish_time:
            self.sync()
        return self._finish_time

    @property
    def duration(self) -> int:
        """Returns the running duration of the Run, syncing first if unset.

        :return: The running duration of the run
        :rtype: int
        """
        if not self._duration:
            self.sync()
        return self._duration

    @property
    def log(self) -> str:
        """Returns the log s3 path of the Run, syncing first if unknown.

        :return: The log s3 path of the run
        :rtype: str
        """
        if self._log == UNKNOWN:
            self.sync()
        return self._log

    @property
    def error(self) -> str:
        """Returns the error message of the Run, syncing first if unknown.

        :return: The error message of the run
        :rtype: str
        """
        if self._error == UNKNOWN:
            self.sync()
        return self._error

    @property
    def tasks(self) -> pd.DataFrame:
        """Returns the information of the tasks bound to the Run.

        The stored frame is reused once none of its rows has Status
        "Running"; otherwise the task list is fetched again.

        :return: The information of the tasks bound to the Run, or ``None``
            when the service reports no tasks
        :rtype: pd.DataFrame
        """
        known = self._tasks
        if known is not None and known.query("Status=='Running'").empty:
            return known

        response = Config.service().list_tasks({
            "RunID": self.id,
            "WorkspaceID": self.workspace_id
        })
        items = response.get("Items")
        if len(items) == 0:
            return None
        self._tasks = pd.DataFrame.from_records(items)
        return self._tasks

    @property
    def engine_run_id(self) -> str:
        """Returns the workflow engine id of the Run, syncing first if unknown.

        :return: The workflow engine id of the Run
        :rtype: str
        """
        if self._engine_run_id == UNKNOWN:
            self.sync()
        return self._engine_run_id

    # Calls go through a TTLCache (ttl=1, maxsize=100).
    @cached(cache=TTLCache(maxsize=100, ttl=1))
    def sync(self):
        """Synchronizes with the remote end
        """
        request = {
            "SubmissionID": self.submission,
            "WorkspaceID": self.workspace_id,
            "Filter": {
                "IDs": [self.id]
            },
        }
        items = Config.service().list_runs(request).get("Items")
        # Anything other than exactly one match is treated as "not found".
        if len(items) != 1:
            return
        item = items[0]

        self._status = item.get("Status")
        self.start_time = item.get("StartTime")
        self.inputs = item.get("Inputs")
        self.outputs = item.get("Outputs")
        # These fields are only taken over when the run is not "Running".
        if item.get("Status") != "Running":
            self._engine_run_id = item.get("EngineRunID")
            self._finish_time = item.get("FinishTime")
            self._duration = item.get("Duration")
            self._log = item.get("Log")
            self._error = item.get("Message")
186
+
187
+
188
class Submission(metaclass=SingletonType):  # behaves the same way as the Run class
    """A submission of a workflow together with the runs it produced.

    Remote-backed fields start out as placeholders and are filled in by
    :meth:`sync`.
    """

    def __init__(self, workspace_id: str, id_: str):
        """
        :param workspace_id: Workspace id
        :type workspace_id: str
        :param id_: Submission id
        :type id_: str
        """
        # Identity of the submission.
        self.id = id_
        self.workspace_id = workspace_id

        # Placeholders, overwritten by sync().
        self.name = UNKNOWN
        self.owner = UNKNOWN
        self.description = UNKNOWN
        self.data_model = UNKNOWN
        self.data_model_rows: List[str] = []
        self.call_cache = False
        self.inputs = UNKNOWN
        self.outputs = UNKNOWN
        self.start_time = 0
        self._finish_time = 0
        self._status = UNKNOWN

        # Wrap every run reported for this submission in a Run object.
        run_items = Config.service().list_runs({
            "WorkspaceID": self.workspace_id,
            "SubmissionID": self.id,
            'PageSize': 0
        }).get("Items")
        self.runs = [
            Run(self.workspace_id, run_item.get("ID"), self.id)
            for run_item in run_items
        ]
        self.sync()

    def __repr__(self):
        fields = {
            "id": self.id,
            "workspace_id": self.workspace_id,
            "name": self.name,
            "owner": self.owner,
            "description": self.description,
            "data_model": self.data_model,
            "data_model_rows": self.data_model_rows,
            "call_cache": self.call_cache,
            "inputs": self.inputs,
            "outputs": self.outputs,
            "start_time": self.start_time,
            "finish_time": self.finish_time,
            "status": self.status,
        }
        info_dict = dict_str(fields)
        return f"SubmissionInfo:\n{info_dict}\n"

    @property
    def finish_time(self) -> int:
        """Returns the finish time of the submission, syncing first if unset.

        :return: The finish time of submission
        :rtype: int
        """
        if not self._finish_time:
            self.sync()
        return self._finish_time

    @property
    def status(self) -> SUBMISSION_STATUS:  # Literal is used here as a type annotation
        """Returns the Submission status.

        A query is only triggered while the submission has not reached
        "Succeeded" or "Failed".

        :return: Submission status
        :rtype: Literal["Succeeded", "Failed", "Running", "Pending"]
        """
        if self._status not in ("Succeeded", "Failed"):
            self.sync()
        return self._status

    # Calls go through a TTLCache (ttl=1, maxsize=100).
    @cached(cache=TTLCache(maxsize=100, ttl=1))
    def sync(self):
        """Synchronizes with the remote end
        """
        request = {
            "WorkspaceID": self.workspace_id,
            "Filter": {
                "IDs": [self.id]
            },
            # "ID": self.id,
        }
        items = Config.service().list_submissions(request).get("Items")
        # Anything other than exactly one match is treated as "not found".
        if len(items) != 1:
            return
        item = items[0]

        # The data entity rows are derived from the runs of this submission.
        run_items = Config.service().list_runs({
            'WorkspaceID': self.workspace_id,
            "SubmissionID": self.id,
        }).get("Items")
        row_ids = {
            run_item.get("DataEntityRowID")
            for run_item in run_items
            if run_item.get("DataEntityRowID") != ""
        }
        self.data_model_rows = list(row_ids)

        # The data model name is resolved through the list of data models.
        models = Config.service().list_data_models({
            'WorkspaceID':
            self.workspace_id,
        }).get("Items")
        matching_model = next(
            (model for model in models
             if model["ID"] == item["DataModelID"]),
            None,
        )
        if matching_model is not None:
            self.data_model = matching_model.get("Name")

        self.call_cache = item.get("ExposedOptions").get("ReadFromCache")
        self.outputs = item.get("Outputs")
        self.inputs = item.get("Inputs")
        self.owner = item.get("OwnerName")
        self.name = item.get("Name")
        self.description = item.get("Description")
        self.start_time = item.get("StartTime")
        self._status = item.get("Status")

        # The finish time is only taken over once the submission is neither
        # "Running" nor "Pending".
        if item.get("Status") not in ("Running", "Pending"):
            self._finish_time = item.get("FinishTime")
311
+
312
+
313
class WorkflowResource(metaclass=SingletonType):
    """Workspace-scoped operations on workflows: import, list and delete."""

    def __init__(self, workspace_id: str):
        """
        :param workspace_id: Workspace id
        :type workspace_id: str
        """
        self.workspace_id = workspace_id

    def __repr__(self):
        info_dict = dict_str({
            "cluster": self.get_cluster,
        })
        return f"WorkflowsInfo:\n{info_dict}\n{self.list()}"

    @property
    def get_cluster(self) -> str:  # check which machine is running?
        """Gets the bound cluster id supporting running workflow

        :return: The bound cluster id
        :rtype: str
        :raises NotFoundError: if no listed cluster has Status "Running"
        """
        workflow_env_info = Config.service().list_cluster(
            params={
                'Type': "workflow",
                "ID": self.workspace_id
            })
        for cluster in workflow_env_info.get('Items'):
            info = cluster["ClusterInfo"]
            if info['Status'] == "Running":
                # one workspace will only be bind to one cluster so far
                return info['ID']
        raise NotFoundError("cluster", "workflow")

    # TODO: an offline/simple import path for WDL files or archives is
    # needed here
    def import_workflow(self,
                        source: str,
                        name: str,
                        language: WORKFLOW_LANGUAGE,
                        tag: str,
                        main_workflow_path: str,
                        description: str = "",
                        token: str = "") -> str:
        """Imports a workflow .

        *Example*:
        ::

            ws = bioos.workspace("foo")
            ws.workflows.import_workflow(source = "http://foo.git", name = "bar", description = "",
            language = "WDL", tag = "baz", token = "xxxxxxxx", main_workflow_path = "aaa/bbb.wdl")

        :param source: git source of workflow
        :type source: str
        :param name: The name of specified workflow
        :type name: str
        :param description: The description of specified workflow
        :type description: str
        :param language: The language of specified workflow
        :type language: str
        :param tag: The tag of specified workflow
        :type tag: str
        :param token: The token of specified workflow
        :type token: str
        :param main_workflow_path: Main path of specified workflow
        :type main_workflow_path: str
        :return: Workflow ID
        :rtype: str
        :raises ParameterError: if ``name`` is empty, ``source`` is not an
            http(s) URL, or ``language`` is not "WDL"
        :raises ConflictError: if a workflow with this name already exists
        """
        if not name:
            raise ParameterError("name", name)

        # Check whether a workflow with this name already exists.
        exist = Config.service().check_workflow({
            "WorkspaceID": self.workspace_id,
            "Name": name,
        }).get("IsNameExist")
        if exist:
            raise ConflictError("name", f"{name} already exists")

        if not source.startswith(("http://", "https://")):
            raise ParameterError("source", source)
        if language != "WDL":
            raise ParameterError("language", language)

        params = {
            "WorkspaceID": self.workspace_id,
            "Name": name,
            "Description": description,
            "Language": language,
            "Source": source,
            "Tag": tag,
            "MainWorkflowPath": main_workflow_path,
        }
        # The token is only sent when one was supplied.
        if token:
            params["Token"] = token
        return Config.service().create_workflow(params).get("ID")

    def list(self) -> DataFrame:
        """Lists all workflows' information .

        *Example*:
        ::

            ws = bioos.workspace("foo")
            ws.workflows.list()

        :return: all workflows information
        :rtype: DataFrame
        """
        content = Config.service().list_workflows({
            'WorkspaceID': self.workspace_id,
            'SortBy': 'CreateTime',
            'PageSize': 0,
        })
        res_df = pd.DataFrame.from_records(content['Items'])
        if res_df.empty:
            return res_df
        # NOTE(review): the millisecond values are offset from 2018-07-01
        # rather than the Unix epoch -- confirm this matches the service.
        res_df['CreateTime'] = pd.to_datetime(
            res_df['CreateTime'], unit='ms', origin=pd.Timestamp('2018-07-01'))
        res_df['UpdateTime'] = pd.to_datetime(
            res_df['UpdateTime'], unit='ms', origin=pd.Timestamp('2018-07-01'))

        return res_df.drop("Status", axis=1)

    def delete(self, target: str):
        """Deletes a workflow from the workspace .
        Considering security issues, user can only delete a single workflow currently

        *Example*:
        ::

            ws = bioos.workspace("foo")
            ws.workflows.delete(target = "bar")

        :param target: workflow name
        :type target: str
        :raises ParameterError: if no workflow is named ``target``
        """
        workflows_df = self.list()
        # list() returns a frame without columns when there are no
        # workflows, so the column has to be checked before it is compared.
        if "Name" not in workflows_df.columns:
            raise ParameterError("target")

        # Compare by value instead of building a DataFrame.query() string:
        # a quote inside ``target`` would otherwise alter the expression and
        # could select a workflow other than the requested one.
        matched = workflows_df[workflows_df["Name"] == target]
        if matched.empty:
            raise ParameterError("target")

        Config.service().delete_workflow({
            "WorkspaceID": self.workspace_id,
            "ID": matched["ID"].iloc[0]
        })
455
+
456
+
457
class Workflow(metaclass=SingletonType):
    """A named workflow of a workspace that can be submitted for execution."""

    def __init__(self,
                 name: str,
                 workspace_id: str,
                 bucket: str,
                 check: bool = False):
        """
        :param name: Workflow name
        :type name: str
        :param workspace_id: Workspace id
        :type workspace_id: str
        :param bucket: Bucket used to build the execution root directory
        :type bucket: str
        :param check: When true, resolve the workflow id right away so that
            an unknown name raises ParameterError at construction time
        :type check: bool
        """
        self.name = name
        self.workspace_id = workspace_id
        self.bucket = bucket
        if check:
            # Accessed only for its side effect: raises if the name is unknown.
            self.id

    @staticmethod
    def _id_by_name(frame, name: str):
        """Returns the ID of the first row whose Name equals ``name``.

        Rows are matched by value rather than through a DataFrame.query()
        string, so quotes inside ``name`` cannot alter the match, and a
        frame without a "Name" column simply yields no match.

        :return: the matching ID, or ``None`` when nothing matches
        """
        if "Name" not in frame.columns:
            return None
        matched = frame[frame["Name"] == name]
        if matched.empty:
            return None
        return matched["ID"].iloc[0]

    @property
    @cached(cache=TTLCache(maxsize=100, ttl=1))
    def id(self) -> str:
        """Gets the id of workflow itself

        :return: the id of workflow itself
        :rtype: str
        :raises ParameterError: if the workspace has no workflow of this name
        """
        workflow_id = self._id_by_name(
            WorkflowResource(self.workspace_id).list(), self.name)
        if workflow_id is None:
            raise ParameterError("name")
        return workflow_id

    @property
    @cached(cache=TTLCache(maxsize=100, ttl=1))
    def get_cluster(self):
        """Gets the bound cluster id supporting running workflow

        :return: The bound cluster id
        :rtype: str
        :raises NotFoundError: if no listed cluster has Status "Running"
        """
        # Same lookup as WorkflowResource.get_cluster; reuse it instead of
        # keeping a second copy of the loop.
        return WorkflowResource(self.workspace_id).get_cluster

    def query_data_model_id(self, name: str) -> str:
        """Gets the id of given data_models among those accessible

        :param name: The name of the data model
        :type name: str
        :return: The data model id, or an empty string if none has that name
        :rtype: str
        """
        data_model_id = self._id_by_name(
            DataModelResource(self.workspace_id).list(), name)
        return "" if data_model_id is None else data_model_id

    def submit(self, data_model_name: str, row_ids: List[str], inputs: str, outputs: str,
               submission_desc: str, call_caching: bool, submission_name_suffix: str = "") \
            -> List[Run]:
        """Submit an existed workflow.

        *Example*:
        ::

            ws = bioos.workspace("foo")
            wf = ws.workflow(name="123456788")
            wf.submit(inputs = '{"aaa": "bbb"}',
                      outputs = '{"ccc": "ddd"}',
                      data_model_name = "bar",
                      row_ids = ["1a","2b"],
                      submission_desc = "baz",
                      call_caching = True)

        :param data_model_name: The name of data_model to be used
        :type data_model_name: str
        :param row_ids: Rows to be used of specified data_model
        :type row_ids: List[str]
        :param inputs: Workflow inputs
        :type inputs: str
        :param outputs: Workflow outputs
        :type outputs: str
        :param submission_name_suffix: The suffix of this submission's name, defaults to yyyy-mm-dd-HH-MM-ss. The name format is {workflow_name}-history-{submission_name_suffix}
        :type submission_name_suffix: str
        :param submission_desc: The description of this submission
        :type submission_desc: str
        :param call_caching: CallCaching searches in the cache of previously running tasks with exactly the same commands and exactly the same input tasks. If the cache hit, the results of the previous task will be used instead of reorganizing, thereby saving time and resources.
        :type call_caching: bool
        :return: Result Runs corresponding to submitted workflows
        :rtype: List[Run]
        :raises ParameterError: if ``row_ids`` is empty, ``inputs`` or
            ``outputs`` is empty or not JSON, or the data model is unknown
        """
        if not row_ids:
            raise ParameterError("row_ids")
        # Reject both empty and non-JSON values. With ``and`` the JSON check
        # only ever ran on empty values, so malformed JSON slipped through.
        if not inputs or not is_json(inputs):
            raise ParameterError('inputs')
        if not outputs or not is_json(outputs):
            raise ParameterError('outputs')

        data_model_id = self.query_data_model_id(data_model_name)
        if not data_model_id:
            raise ParameterError("data_model_name")

        # Default suffix: the current local time.
        if not submission_name_suffix:
            submission_name_suffix = datetime.now().strftime(
                '%Y-%m-%d-%H-%M-%S')

        submission_id = Config.service().create_submission({
            "ClusterID":
            self.get_cluster,
            'WorkspaceID':
            self.workspace_id,
            'WorkflowID':
            self.id,
            'Name':
            workflows.submission_name(self.name, submission_name_suffix),
            'Description':
            submission_desc,
            'DataModelID':
            data_model_id,
            'DataModelRowIDs':
            row_ids,
            'Inputs':
            inputs,
            'ExposedOptions': {
                "ReadFromCache": call_caching,
                # TODO this may change in the future
                "ExecutionRootDir": f"s3://{self.bucket}"
            },
            'Outputs':
            outputs,
        }).get("ID")

        return Submission(self.workspace_id, submission_id).runs