toil 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +18 -13
- toil/batchSystems/abstractBatchSystem.py +39 -13
- toil/batchSystems/abstractGridEngineBatchSystem.py +24 -24
- toil/batchSystems/awsBatch.py +14 -14
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/contained_executor.py +3 -3
- toil/batchSystems/htcondor.py +0 -1
- toil/batchSystems/kubernetes.py +34 -31
- toil/batchSystems/local_support.py +3 -1
- toil/batchSystems/lsf.py +7 -7
- toil/batchSystems/mesos/batchSystem.py +7 -7
- toil/batchSystems/options.py +32 -83
- toil/batchSystems/registry.py +104 -23
- toil/batchSystems/singleMachine.py +16 -13
- toil/batchSystems/slurm.py +87 -16
- toil/batchSystems/torque.py +0 -1
- toil/bus.py +44 -8
- toil/common.py +544 -753
- toil/cwl/__init__.py +28 -32
- toil/cwl/cwltoil.py +595 -574
- toil/cwl/utils.py +55 -10
- toil/exceptions.py +1 -1
- toil/fileStores/__init__.py +2 -2
- toil/fileStores/abstractFileStore.py +88 -14
- toil/fileStores/cachingFileStore.py +610 -549
- toil/fileStores/nonCachingFileStore.py +46 -22
- toil/job.py +182 -101
- toil/jobStores/abstractJobStore.py +161 -95
- toil/jobStores/aws/jobStore.py +23 -9
- toil/jobStores/aws/utils.py +6 -6
- toil/jobStores/fileJobStore.py +116 -18
- toil/jobStores/googleJobStore.py +16 -7
- toil/jobStores/utils.py +5 -6
- toil/leader.py +87 -56
- toil/lib/accelerators.py +10 -5
- toil/lib/aws/__init__.py +3 -14
- toil/lib/aws/ami.py +22 -9
- toil/lib/aws/iam.py +21 -13
- toil/lib/aws/session.py +2 -16
- toil/lib/aws/utils.py +4 -5
- toil/lib/compatibility.py +1 -1
- toil/lib/conversions.py +26 -3
- toil/lib/docker.py +22 -23
- toil/lib/ec2.py +10 -6
- toil/lib/ec2nodes.py +106 -100
- toil/lib/encryption/_nacl.py +2 -1
- toil/lib/generatedEC2Lists.py +325 -18
- toil/lib/io.py +49 -2
- toil/lib/misc.py +1 -1
- toil/lib/resources.py +9 -2
- toil/lib/threading.py +101 -38
- toil/options/common.py +736 -0
- toil/options/cwl.py +336 -0
- toil/options/wdl.py +37 -0
- toil/provisioners/abstractProvisioner.py +9 -4
- toil/provisioners/aws/__init__.py +3 -6
- toil/provisioners/aws/awsProvisioner.py +6 -0
- toil/provisioners/clusterScaler.py +3 -2
- toil/provisioners/gceProvisioner.py +2 -2
- toil/realtimeLogger.py +2 -1
- toil/resource.py +24 -18
- toil/server/app.py +2 -3
- toil/server/cli/wes_cwl_runner.py +4 -4
- toil/server/utils.py +1 -1
- toil/server/wes/abstract_backend.py +3 -2
- toil/server/wes/amazon_wes_utils.py +5 -4
- toil/server/wes/tasks.py +2 -3
- toil/server/wes/toil_backend.py +2 -10
- toil/server/wsgi_app.py +2 -0
- toil/serviceManager.py +12 -10
- toil/statsAndLogging.py +41 -9
- toil/test/__init__.py +29 -54
- toil/test/batchSystems/batchSystemTest.py +11 -111
- toil/test/batchSystems/test_slurm.py +24 -8
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +438 -223
- toil/test/cwl/glob_dir.cwl +15 -0
- toil/test/cwl/preemptible.cwl +21 -0
- toil/test/cwl/preemptible_expression.cwl +28 -0
- toil/test/cwl/revsort.cwl +1 -1
- toil/test/cwl/revsort2.cwl +1 -1
- toil/test/docs/scriptsTest.py +2 -3
- toil/test/jobStores/jobStoreTest.py +34 -21
- toil/test/lib/aws/test_iam.py +4 -14
- toil/test/lib/aws/test_utils.py +0 -3
- toil/test/lib/dockerTest.py +4 -4
- toil/test/lib/test_ec2.py +12 -17
- toil/test/mesos/helloWorld.py +4 -5
- toil/test/mesos/stress.py +1 -1
- toil/test/{wdl/conftest.py → options/__init__.py} +0 -10
- toil/test/options/options.py +37 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
- toil/test/provisioners/clusterScalerTest.py +6 -4
- toil/test/provisioners/clusterTest.py +23 -11
- toil/test/provisioners/gceProvisionerTest.py +0 -6
- toil/test/provisioners/restartScript.py +3 -2
- toil/test/server/serverTest.py +1 -1
- toil/test/sort/restart_sort.py +2 -1
- toil/test/sort/sort.py +2 -1
- toil/test/sort/sortTest.py +2 -13
- toil/test/src/autoDeploymentTest.py +45 -45
- toil/test/src/busTest.py +5 -5
- toil/test/src/checkpointTest.py +2 -2
- toil/test/src/deferredFunctionTest.py +1 -1
- toil/test/src/fileStoreTest.py +32 -16
- toil/test/src/helloWorldTest.py +1 -1
- toil/test/src/importExportFileTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +2 -1
- toil/test/src/jobServiceTest.py +1 -1
- toil/test/src/jobTest.py +18 -18
- toil/test/src/miscTests.py +5 -3
- toil/test/src/promisedRequirementTest.py +3 -3
- toil/test/src/realtimeLoggerTest.py +1 -1
- toil/test/src/resourceTest.py +2 -2
- toil/test/src/restartDAGTest.py +1 -1
- toil/test/src/resumabilityTest.py +36 -2
- toil/test/src/retainTempDirTest.py +1 -1
- toil/test/src/systemTest.py +2 -2
- toil/test/src/toilContextManagerTest.py +2 -2
- toil/test/src/userDefinedJobArgTypeTest.py +1 -1
- toil/test/utils/toilDebugTest.py +98 -32
- toil/test/utils/toilKillTest.py +2 -2
- toil/test/utils/utilsTest.py +23 -3
- toil/test/wdl/wdltoil_test.py +223 -45
- toil/toilState.py +7 -6
- toil/utils/toilClean.py +1 -1
- toil/utils/toilConfig.py +36 -0
- toil/utils/toilDebugFile.py +60 -33
- toil/utils/toilDebugJob.py +39 -12
- toil/utils/toilDestroyCluster.py +1 -1
- toil/utils/toilKill.py +1 -1
- toil/utils/toilLaunchCluster.py +13 -2
- toil/utils/toilMain.py +3 -2
- toil/utils/toilRsyncCluster.py +1 -1
- toil/utils/toilSshCluster.py +1 -1
- toil/utils/toilStats.py +445 -305
- toil/utils/toilStatus.py +2 -5
- toil/version.py +10 -10
- toil/wdl/utils.py +2 -122
- toil/wdl/wdltoil.py +1257 -492
- toil/worker.py +55 -46
- toil-6.1.0.dist-info/METADATA +124 -0
- toil-6.1.0.dist-info/RECORD +241 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/WHEEL +1 -1
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -1
- toil/batchSystems/parasol.py +0 -379
- toil/batchSystems/tes.py +0 -459
- toil/test/batchSystems/parasolTestSupport.py +0 -117
- toil/test/wdl/builtinTest.py +0 -506
- toil/test/wdl/toilwdlTest.py +0 -522
- toil/wdl/toilwdl.py +0 -141
- toil/wdl/versions/dev.py +0 -107
- toil/wdl/versions/draft2.py +0 -980
- toil/wdl/versions/v1.py +0 -794
- toil/wdl/wdl_analysis.py +0 -116
- toil/wdl/wdl_functions.py +0 -997
- toil/wdl/wdl_synthesis.py +0 -1011
- toil/wdl/wdl_types.py +0 -243
- toil-5.12.0.dist-info/METADATA +0 -118
- toil-5.12.0.dist-info/RECORD +0 -244
- /toil/{wdl/versions → options}/__init__.py +0 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
- {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/tes.py
DELETED
|
@@ -1,459 +0,0 @@
|
|
|
1
|
-
# Copyright (C) 2015-2021 Regents of the University of California
|
|
2
|
-
#
|
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
-
# you may not use this file except in compliance with the License.
|
|
5
|
-
# You may obtain a copy of the License at
|
|
6
|
-
#
|
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
-
#
|
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
-
# See the License for the specific language governing permissions and
|
|
13
|
-
# limitations under the License.
|
|
14
|
-
"""
|
|
15
|
-
Batch system for running Toil workflows on GA4GH TES.
|
|
16
|
-
|
|
17
|
-
Useful with network-based job stores when the TES server provides tasks with
|
|
18
|
-
credentials, and filesystem-based job stores when the TES server lets tasks
|
|
19
|
-
mount the job store.
|
|
20
|
-
|
|
21
|
-
Additional containers should be launched with Singularity, not Docker.
|
|
22
|
-
"""
|
|
23
|
-
import datetime
|
|
24
|
-
import logging
|
|
25
|
-
import math
|
|
26
|
-
import os
|
|
27
|
-
import pickle
|
|
28
|
-
import time
|
|
29
|
-
from argparse import ArgumentParser, _ArgumentGroup
|
|
30
|
-
from typing import Any, Callable, Dict, List, Optional, Union
|
|
31
|
-
|
|
32
|
-
import tes
|
|
33
|
-
from requests.exceptions import HTTPError
|
|
34
|
-
|
|
35
|
-
from toil import applianceSelf
|
|
36
|
-
from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE,
|
|
37
|
-
BatchJobExitReason,
|
|
38
|
-
UpdatedBatchJobInfo)
|
|
39
|
-
from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
|
|
40
|
-
from toil.batchSystems.contained_executor import pack_job
|
|
41
|
-
from toil.batchSystems.options import OptionSetter
|
|
42
|
-
from toil.common import Config, Toil
|
|
43
|
-
from toil.job import JobDescription
|
|
44
|
-
from toil.lib.misc import get_public_ip, slow_down, utc_now
|
|
45
|
-
from toil.resource import Resource
|
|
46
|
-
|
|
47
|
-
logger = logging.getLogger(__name__)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
# Map from TES terminal states to Toil batch job exit reasons
|
|
51
|
-
STATE_TO_EXIT_REASON: Dict[str, BatchJobExitReason] = {
|
|
52
|
-
'COMPLETE': BatchJobExitReason.FINISHED,
|
|
53
|
-
'CANCELED': BatchJobExitReason.KILLED,
|
|
54
|
-
'EXECUTOR_ERROR': BatchJobExitReason.FAILED,
|
|
55
|
-
'SYSTEM_ERROR': BatchJobExitReason.ERROR,
|
|
56
|
-
'UNKNOWN': BatchJobExitReason.ERROR
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
class TESBatchSystem(BatchSystemCleanupSupport):
|
|
61
|
-
@classmethod
|
|
62
|
-
def supportsAutoDeployment(cls) -> bool:
|
|
63
|
-
return True
|
|
64
|
-
|
|
65
|
-
@classmethod
|
|
66
|
-
def get_default_tes_endpoint(cls) -> str:
|
|
67
|
-
"""
|
|
68
|
-
Get the default TES endpoint URL to use.
|
|
69
|
-
|
|
70
|
-
(unless overridden by an option or environment variable)
|
|
71
|
-
"""
|
|
72
|
-
return f'http://{get_public_ip()}:8000'
|
|
73
|
-
|
|
74
|
-
def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int) -> None:
|
|
75
|
-
super().__init__(config, maxCores, maxMemory, maxDisk)
|
|
76
|
-
# Connect to TES, using Funnel-compatible environment variables to fill in credentials if not specified.
|
|
77
|
-
self.tes = tes.HTTPClient(config.tes_endpoint,
|
|
78
|
-
user=config.tes_user,
|
|
79
|
-
password=config.tes_password,
|
|
80
|
-
token=config.tes_bearer_token)
|
|
81
|
-
|
|
82
|
-
# Get service info from the TES server and pull out supported storages.
|
|
83
|
-
# We need this so we can tell if the server is likely to be able to
|
|
84
|
-
# mount any of our local files. These are URL bases that the server
|
|
85
|
-
# supports.
|
|
86
|
-
server_info = self.tes.get_service_info()
|
|
87
|
-
logger.debug("Detected TES server info: %s", server_info)
|
|
88
|
-
self.server_storages = server_info.storage or []
|
|
89
|
-
|
|
90
|
-
# Define directories to mount for each task, as py-tes Input objects
|
|
91
|
-
self.mounts: List[tes.Input] = []
|
|
92
|
-
|
|
93
|
-
if config.jobStore:
|
|
94
|
-
job_store_type, job_store_path = Toil.parseLocator(config.jobStore)
|
|
95
|
-
if job_store_type == 'file':
|
|
96
|
-
# If we have a file job store, we want to mount it at the same path, if we can
|
|
97
|
-
self._mount_local_path_if_possible(job_store_path, job_store_path)
|
|
98
|
-
|
|
99
|
-
# If we have AWS credentials, we want to mount them in our home directory if we can.
|
|
100
|
-
aws_credentials_path = os.path.join(os.path.expanduser("~"), '.aws')
|
|
101
|
-
if os.path.isdir(aws_credentials_path):
|
|
102
|
-
self._mount_local_path_if_possible(aws_credentials_path, '/root/.aws')
|
|
103
|
-
|
|
104
|
-
# We assign job names based on a numerical job ID. This functionality
|
|
105
|
-
# is managed by the BatchSystemLocalSupport.
|
|
106
|
-
|
|
107
|
-
# Here is where we will store the user script resource object if we get one.
|
|
108
|
-
self.user_script: Optional[Resource] = None
|
|
109
|
-
|
|
110
|
-
# Ge the image to deploy from Toil's configuration
|
|
111
|
-
self.docker_image = applianceSelf()
|
|
112
|
-
|
|
113
|
-
# We need a way to map between our batch system ID numbers, and TES task IDs from the server.
|
|
114
|
-
self.bs_id_to_tes_id: Dict[int, str] = {}
|
|
115
|
-
self.tes_id_to_bs_id: Dict[str, int] = {}
|
|
116
|
-
|
|
117
|
-
def _server_can_mount(self, url: str) -> bool:
|
|
118
|
-
"""
|
|
119
|
-
Internal function. Should not be called outside this class.
|
|
120
|
-
|
|
121
|
-
Return true if the given URL is under a supported storage location for
|
|
122
|
-
the TES server, and false otherwise.
|
|
123
|
-
"""
|
|
124
|
-
# TODO: build some kind of fast matcher in case there are a lot of
|
|
125
|
-
# storages supported.
|
|
126
|
-
|
|
127
|
-
for base_url in self.server_storages:
|
|
128
|
-
if url.startswith(base_url):
|
|
129
|
-
return True
|
|
130
|
-
return False
|
|
131
|
-
|
|
132
|
-
def _mount_local_path_if_possible(self, local_path: str, container_path: str) -> None:
|
|
133
|
-
"""
|
|
134
|
-
Internal function. Should not be called outside this class.
|
|
135
|
-
|
|
136
|
-
If a local path is somewhere the server thinks it can access, mount it
|
|
137
|
-
into all the tasks.
|
|
138
|
-
"""
|
|
139
|
-
# TODO: We aren't going to work well with linked imports if we're mounting the job store into the container...
|
|
140
|
-
|
|
141
|
-
path_url = 'file://' + os.path.abspath(local_path)
|
|
142
|
-
if os.path.exists(local_path) and self._server_can_mount(path_url):
|
|
143
|
-
# We can access this file from the server. Probably.
|
|
144
|
-
self.mounts.append(tes.Input(url=path_url,
|
|
145
|
-
path=container_path,
|
|
146
|
-
type="DIRECTORY" if os.path.isdir(local_path) else "FILE"))
|
|
147
|
-
|
|
148
|
-
def setUserScript(self, user_script: Resource) -> None:
|
|
149
|
-
logger.debug(f'Setting user script for deployment: {user_script}')
|
|
150
|
-
self.user_script = user_script
|
|
151
|
-
|
|
152
|
-
# setEnv is provided by BatchSystemSupport, updates self.environment
|
|
153
|
-
|
|
154
|
-
def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
|
|
155
|
-
# TODO: get a sensible self.maxCores, etc. so we can check_resource_request.
|
|
156
|
-
# How do we know if the cluster will autoscale?
|
|
157
|
-
|
|
158
|
-
# Try the job as local
|
|
159
|
-
local_id = self.handleLocalJob(job_desc)
|
|
160
|
-
if local_id is not None:
|
|
161
|
-
# It is a local job
|
|
162
|
-
return local_id
|
|
163
|
-
else:
|
|
164
|
-
# We actually want to send to the cluster
|
|
165
|
-
|
|
166
|
-
# Check resource requirements (managed by BatchSystemSupport)
|
|
167
|
-
self.check_resource_request(job_desc)
|
|
168
|
-
|
|
169
|
-
# Make a batch system scope job ID
|
|
170
|
-
bs_id = self.getNextJobID()
|
|
171
|
-
# Make a vaguely human-readable name.
|
|
172
|
-
# TES does not require it to be unique.
|
|
173
|
-
# We could add a per-workflow prefix to use with ListTasks, but
|
|
174
|
-
# ListTasks doesn't let us filter for newly done tasks, so it's not
|
|
175
|
-
# actually useful for us over polling each task.
|
|
176
|
-
job_name = str(job_desc)
|
|
177
|
-
|
|
178
|
-
# Launch the job on TES
|
|
179
|
-
|
|
180
|
-
# Determine job environment
|
|
181
|
-
environment = self.environment.copy()
|
|
182
|
-
if job_environment:
|
|
183
|
-
environment.update(job_environment)
|
|
184
|
-
if 'TOIL_WORKDIR' not in environment:
|
|
185
|
-
# The appliance container defaults TOIL_WORKDIR to
|
|
186
|
-
# /var/lib/toil, but TES doesn't (always?) give us a writable
|
|
187
|
-
# /, so we need to use the writable space in /tmp by default
|
|
188
|
-
# instead when running on TES.
|
|
189
|
-
environment['TOIL_WORKDIR'] = '/tmp'
|
|
190
|
-
|
|
191
|
-
# Make a command to run it in the executor
|
|
192
|
-
command_list = pack_job(job_desc, self.user_script)
|
|
193
|
-
|
|
194
|
-
# Make the sequence of TES containers ("executors") to run.
|
|
195
|
-
# We just run one which is the Toil executor to grab the user
|
|
196
|
-
# script and do the job.
|
|
197
|
-
task_executors = [tes.Executor(image=self.docker_image,
|
|
198
|
-
command=command_list,
|
|
199
|
-
env=environment
|
|
200
|
-
)]
|
|
201
|
-
|
|
202
|
-
# Prepare inputs.
|
|
203
|
-
task_inputs = list(self.mounts)
|
|
204
|
-
# If we had any per-job input files they would come in here.
|
|
205
|
-
|
|
206
|
-
# Prepare resource requirements
|
|
207
|
-
task_resources = tes.Resources(cpu_cores=math.ceil(job_desc.cores),
|
|
208
|
-
ram_gb=job_desc.memory / (1024**3),
|
|
209
|
-
disk_gb=job_desc.disk / (1024**3),
|
|
210
|
-
# TODO: py-tes spells this differently than Toil
|
|
211
|
-
preemptible=job_desc.preemptible)
|
|
212
|
-
|
|
213
|
-
# Package into a TES Task
|
|
214
|
-
task = tes.Task(name=job_name,
|
|
215
|
-
executors=task_executors,
|
|
216
|
-
inputs=task_inputs,
|
|
217
|
-
resources=task_resources)
|
|
218
|
-
|
|
219
|
-
# Launch it and get back the TES ID that we can use to poll the task
|
|
220
|
-
tes_id = self.tes.create_task(task)
|
|
221
|
-
|
|
222
|
-
# Tie it to the numeric ID
|
|
223
|
-
self.bs_id_to_tes_id[bs_id] = tes_id
|
|
224
|
-
self.tes_id_to_bs_id[tes_id] = bs_id
|
|
225
|
-
|
|
226
|
-
logger.debug('Launched job: %s', job_name)
|
|
227
|
-
|
|
228
|
-
return bs_id
|
|
229
|
-
|
|
230
|
-
def _get_runtime(self, task: tes.Task) -> Optional[float]:
|
|
231
|
-
"""
|
|
232
|
-
Internal function. Should not be called outside this class.
|
|
233
|
-
|
|
234
|
-
Get the time that the given job ran/has been running for, in seconds,
|
|
235
|
-
or None if that time is not available. Never returns 0.
|
|
236
|
-
"""
|
|
237
|
-
start_time = None
|
|
238
|
-
end_time = utc_now()
|
|
239
|
-
for log in (task.logs or []):
|
|
240
|
-
if log.start_time:
|
|
241
|
-
# Find the first start time that is set
|
|
242
|
-
start_time = log.start_time
|
|
243
|
-
break
|
|
244
|
-
|
|
245
|
-
if not start_time:
|
|
246
|
-
# It hasn't been running for a measurable amount of time.
|
|
247
|
-
return None
|
|
248
|
-
|
|
249
|
-
for log in reversed(task.logs or []):
|
|
250
|
-
if log.end_time:
|
|
251
|
-
# Find the last end time that is set, and override now
|
|
252
|
-
end_time = log.end_time
|
|
253
|
-
break
|
|
254
|
-
# We have a set start time, so it is/was running. Return the time
|
|
255
|
-
# it has been running for.
|
|
256
|
-
return slow_down((end_time - start_time).total_seconds())
|
|
257
|
-
|
|
258
|
-
def _get_exit_code(self, task: tes.Task) -> int:
|
|
259
|
-
"""
|
|
260
|
-
Internal function. Should not be called outside this class.
|
|
261
|
-
|
|
262
|
-
Get the exit code of the last executor with a log in the task, or
|
|
263
|
-
EXIT_STATUS_UNAVAILABLE_VALUE if no executor has a log.
|
|
264
|
-
"""
|
|
265
|
-
for task_log in reversed(task.logs or []):
|
|
266
|
-
for executor_log in reversed(task_log.logs or []):
|
|
267
|
-
if isinstance(executor_log.exit_code, int):
|
|
268
|
-
# Find the last executor exit code that is a number and return it
|
|
269
|
-
return executor_log.exit_code
|
|
270
|
-
|
|
271
|
-
if task.state == 'COMPLETE':
|
|
272
|
-
# If the task completes without error but has no code logged, the
|
|
273
|
-
# code must be 0.
|
|
274
|
-
return 0
|
|
275
|
-
|
|
276
|
-
# If we get here we couldn't find an exit code.
|
|
277
|
-
return EXIT_STATUS_UNAVAILABLE_VALUE
|
|
278
|
-
|
|
279
|
-
def __get_log_text(self, task: tes.Task) -> Optional[str]:
|
|
280
|
-
"""
|
|
281
|
-
Get the log text (standard error) of the last executor with a log in
|
|
282
|
-
the task, or None.
|
|
283
|
-
"""
|
|
284
|
-
|
|
285
|
-
for task_log in reversed(task.logs or []):
|
|
286
|
-
for executor_log in reversed(task_log.logs or []):
|
|
287
|
-
if isinstance(executor_log.stderr, str):
|
|
288
|
-
# Find the last executor log code that is a string and return it
|
|
289
|
-
return executor_log.stderr
|
|
290
|
-
|
|
291
|
-
# If we get here we couldn't find a log.
|
|
292
|
-
return None
|
|
293
|
-
|
|
294
|
-
def getUpdatedBatchJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]:
|
|
295
|
-
# Remember when we started, for respecting the timeout
|
|
296
|
-
entry = datetime.datetime.now()
|
|
297
|
-
# This is the updated job we have found, if any
|
|
298
|
-
result = None
|
|
299
|
-
while result is None and ((datetime.datetime.now() - entry).total_seconds() < maxWait or not maxWait):
|
|
300
|
-
result = self.getUpdatedLocalJob(0)
|
|
301
|
-
|
|
302
|
-
if result:
|
|
303
|
-
return result
|
|
304
|
-
|
|
305
|
-
# Collect together the list of TES and batch system IDs for tasks we
|
|
306
|
-
# are acknowledging and don't care about anymore.
|
|
307
|
-
acknowledged = []
|
|
308
|
-
|
|
309
|
-
for tes_id, bs_id in self.tes_id_to_bs_id.items():
|
|
310
|
-
# Immediately poll all the jobs we issued.
|
|
311
|
-
# TODO: There's no way to acknowledge a finished job, so there's no
|
|
312
|
-
# faster way to find the newly finished jobs than polling
|
|
313
|
-
task = self.tes.get_task(tes_id, view="MINIMAL")
|
|
314
|
-
if task.state in ["COMPLETE", "CANCELED", "EXECUTOR_ERROR", "SYSTEM_ERROR"]:
|
|
315
|
-
# This task is done!
|
|
316
|
-
logger.debug("Found stopped task: %s", task)
|
|
317
|
-
|
|
318
|
-
# Acknowledge it
|
|
319
|
-
acknowledged.append((tes_id, bs_id))
|
|
320
|
-
|
|
321
|
-
if task.state == "CANCELED":
|
|
322
|
-
# Killed jobs aren't allowed to appear as updated.
|
|
323
|
-
continue
|
|
324
|
-
|
|
325
|
-
# Otherwise, it stopped running and it wasn't our fault.
|
|
326
|
-
|
|
327
|
-
# Fetch the task's full info, including logs.
|
|
328
|
-
task = self.tes.get_task(tes_id, view="FULL")
|
|
329
|
-
|
|
330
|
-
# Record runtime
|
|
331
|
-
runtime = self._get_runtime(task)
|
|
332
|
-
|
|
333
|
-
# Determine if it succeeded
|
|
334
|
-
exit_reason = STATE_TO_EXIT_REASON[task.state]
|
|
335
|
-
|
|
336
|
-
# Get its exit code
|
|
337
|
-
exit_code = self._get_exit_code(task)
|
|
338
|
-
|
|
339
|
-
if task.state == "EXECUTOR_ERROR":
|
|
340
|
-
# The task failed, so report executor logs.
|
|
341
|
-
logger.warning('Log from failed executor: %s', self.__get_log_text(task))
|
|
342
|
-
|
|
343
|
-
# Compose a result
|
|
344
|
-
result = UpdatedBatchJobInfo(jobID=bs_id, exitStatus=exit_code, wallTime=runtime, exitReason=exit_reason)
|
|
345
|
-
|
|
346
|
-
# No more iteration needed, we found a result.
|
|
347
|
-
break
|
|
348
|
-
|
|
349
|
-
# After the iteration, drop all the records for tasks we acknowledged
|
|
350
|
-
for (tes_id, bs_id) in acknowledged:
|
|
351
|
-
del self.tes_id_to_bs_id[tes_id]
|
|
352
|
-
del self.bs_id_to_tes_id[bs_id]
|
|
353
|
-
|
|
354
|
-
if not maxWait:
|
|
355
|
-
# Don't wait at all
|
|
356
|
-
break
|
|
357
|
-
elif result is None:
|
|
358
|
-
# Wait a bit and poll again
|
|
359
|
-
time.sleep(min(maxWait/2, 1.0))
|
|
360
|
-
|
|
361
|
-
# When we get here we have all the result we can get
|
|
362
|
-
return result
|
|
363
|
-
|
|
364
|
-
def shutdown(self) -> None:
|
|
365
|
-
|
|
366
|
-
# Shutdown local processes first
|
|
367
|
-
self.shutdownLocal()
|
|
368
|
-
|
|
369
|
-
for tes_id in self.tes_id_to_bs_id.keys():
|
|
370
|
-
# Shut down all the TES jobs we issued.
|
|
371
|
-
self._try_cancel(tes_id)
|
|
372
|
-
|
|
373
|
-
def _try_cancel(self, tes_id: str) -> None:
|
|
374
|
-
"""
|
|
375
|
-
Internal function. Should not be called outside this class.
|
|
376
|
-
|
|
377
|
-
Try to cancel a TES job.
|
|
378
|
-
|
|
379
|
-
Succeed if it can't be canceled because it has stopped,
|
|
380
|
-
but fail if it can't be canceled for some other reason.
|
|
381
|
-
"""
|
|
382
|
-
try:
|
|
383
|
-
# Kill each of our tasks in TES
|
|
384
|
-
self.tes.cancel_task(tes_id)
|
|
385
|
-
except HTTPError as e:
|
|
386
|
-
if e.response is not None and e.response.status_code in [409, 500]:
|
|
387
|
-
# TODO: This is what we probably get when trying to cancel
|
|
388
|
-
# something that is actually done. But can we rely on that?
|
|
389
|
-
pass
|
|
390
|
-
elif '500' in str(e) or '409' in str(e):
|
|
391
|
-
# TODO: drop this after <https://github.com/ohsu-comp-bio/py-tes/pull/36> merges.
|
|
392
|
-
# py-tes might be hiding the actual code and just putting it in a string
|
|
393
|
-
pass
|
|
394
|
-
else:
|
|
395
|
-
raise
|
|
396
|
-
|
|
397
|
-
def getIssuedBatchJobIDs(self) -> List[int]:
|
|
398
|
-
return self.getIssuedLocalJobIDs() + list(self.bs_id_to_tes_id.keys())
|
|
399
|
-
|
|
400
|
-
def getRunningBatchJobIDs(self) -> Dict[int, float]:
|
|
401
|
-
# We need a dict from job_id (integer) to seconds it has been running
|
|
402
|
-
bs_id_to_runtime = {}
|
|
403
|
-
|
|
404
|
-
for tes_id, bs_id in self.tes_id_to_bs_id.items():
|
|
405
|
-
# Poll every issued task, and get the runtime info right away in
|
|
406
|
-
# the default BASIC view.
|
|
407
|
-
# TODO: use list_tasks filtering by name prefix and running state!
|
|
408
|
-
task = self.tes.get_task(tes_id)
|
|
409
|
-
logger.debug("Observed task: %s", task)
|
|
410
|
-
if task.state in ["INITIALIZING", "RUNNING"]:
|
|
411
|
-
# We count INITIALIZING tasks because they may be e.g. pulling
|
|
412
|
-
# Docker containers, and we don't want to time out on them in
|
|
413
|
-
# the tests. But they may not have any runtimes, so it might
|
|
414
|
-
# not really help.
|
|
415
|
-
runtime = self._get_runtime(task)
|
|
416
|
-
if runtime:
|
|
417
|
-
# We can measure a runtime
|
|
418
|
-
bs_id_to_runtime[bs_id] = runtime
|
|
419
|
-
# If we can't find a runtime, we can't say it's running
|
|
420
|
-
# because we can't say how long it has been running for.
|
|
421
|
-
|
|
422
|
-
# Give back the times all our running jobs have been running for.
|
|
423
|
-
return bs_id_to_runtime
|
|
424
|
-
|
|
425
|
-
def killBatchJobs(self, job_ids: List[int]) -> None:
|
|
426
|
-
# Kill all the ones that are local
|
|
427
|
-
self.killLocalJobs(job_ids)
|
|
428
|
-
|
|
429
|
-
for bs_id in job_ids:
|
|
430
|
-
if bs_id in self.bs_id_to_tes_id:
|
|
431
|
-
# We sent this to TES. So try to cancel it.
|
|
432
|
-
self._try_cancel(self.bs_id_to_tes_id[bs_id])
|
|
433
|
-
# But don't forget the mapping until we actually get the finish
|
|
434
|
-
# notification for the job.
|
|
435
|
-
|
|
436
|
-
# TODO: If the kill races the collection of a finished update, do we
|
|
437
|
-
# have to censor the finished update even if the kill never took
|
|
438
|
-
# effect??? That's not implemented.
|
|
439
|
-
|
|
440
|
-
@classmethod
|
|
441
|
-
def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
|
|
442
|
-
parser.add_argument("--tesEndpoint", dest="tes_endpoint", default=cls.get_default_tes_endpoint(),
|
|
443
|
-
help="The http(s) URL of the TES server. (default: %(default)s)")
|
|
444
|
-
parser.add_argument("--tesUser", dest="tes_user", default=None,
|
|
445
|
-
help="User name to use for basic authentication to TES server.")
|
|
446
|
-
parser.add_argument("--tesPassword", dest="tes_password", default=None,
|
|
447
|
-
help="Password to use for basic authentication to TES server.")
|
|
448
|
-
parser.add_argument("--tesBearerToken", dest="tes_bearer_token", default=None,
|
|
449
|
-
help="Bearer token to use for authentication to TES server.")
|
|
450
|
-
|
|
451
|
-
@classmethod
|
|
452
|
-
def setOptions(cls, setOption: OptionSetter) -> None:
|
|
453
|
-
# Because we use the keyword arguments, we can't specify a type for setOption without using Protocols.
|
|
454
|
-
# TODO: start using Protocols, or just start returning objects to represent the options.
|
|
455
|
-
# When actually parsing options, remember to check the environment variables
|
|
456
|
-
setOption("tes_endpoint", default=cls.get_default_tes_endpoint(), env=["TOIL_TES_ENDPOINT"])
|
|
457
|
-
setOption("tes_user", default=None, env=["TOIL_TES_USER"])
|
|
458
|
-
setOption("tes_password", default=None, env=["TOIL_TES_PASSWORD"])
|
|
459
|
-
setOption("tes_bearer_token", default=None, env=["TOIL_TES_BEARER_TOKEN"])
|
|
@@ -1,117 +0,0 @@
|
|
|
1
|
-
# Copyright (C) 2015-2021 Regents of the University of California
|
|
2
|
-
#
|
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
-
# you may not use this file except in compliance with the License.
|
|
5
|
-
# You may obtain a copy of the License at
|
|
6
|
-
#
|
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
-
#
|
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
-
# See the License for the specific language governing permissions and
|
|
13
|
-
# limitations under the License.
|
|
14
|
-
import logging
|
|
15
|
-
import os
|
|
16
|
-
import signal
|
|
17
|
-
import subprocess
|
|
18
|
-
import tempfile
|
|
19
|
-
import threading
|
|
20
|
-
import time
|
|
21
|
-
|
|
22
|
-
from toil import physicalMemory
|
|
23
|
-
from toil.lib.objects import InnerClass
|
|
24
|
-
from toil.lib.threading import cpu_count
|
|
25
|
-
|
|
26
|
-
log = logging.getLogger(__name__)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class ParasolTestSupport:
|
|
30
|
-
"""
|
|
31
|
-
For test cases that need a running Parasol leader and worker on the local host
|
|
32
|
-
"""
|
|
33
|
-
|
|
34
|
-
def _startParasol(self, numCores=None, memory=None):
|
|
35
|
-
if numCores is None:
|
|
36
|
-
numCores = cpu_count()
|
|
37
|
-
if memory is None:
|
|
38
|
-
memory = physicalMemory()
|
|
39
|
-
self.numCores = numCores
|
|
40
|
-
self.memory = memory
|
|
41
|
-
self.leader = self.ParasolLeaderThread()
|
|
42
|
-
self.leader.start()
|
|
43
|
-
self.worker = self.ParasolWorkerThread()
|
|
44
|
-
self.worker.start()
|
|
45
|
-
while self.leader.popen is None or self.worker.popen is None:
|
|
46
|
-
log.info('Waiting for leader and worker processes')
|
|
47
|
-
time.sleep(.1)
|
|
48
|
-
|
|
49
|
-
def _stopParasol(self):
|
|
50
|
-
self.worker.popen.kill()
|
|
51
|
-
self.worker.join()
|
|
52
|
-
self.leader.popen.kill()
|
|
53
|
-
self.leader.join()
|
|
54
|
-
for path in ('para.results', 'parasol.jid'):
|
|
55
|
-
if os.path.exists(path):
|
|
56
|
-
os.remove(path)
|
|
57
|
-
|
|
58
|
-
class ParasolThread(threading.Thread):
|
|
59
|
-
|
|
60
|
-
# Lock is used because subprocess is NOT thread safe: http://tinyurl.com/pkp5pgq
|
|
61
|
-
lock = threading.Lock()
|
|
62
|
-
|
|
63
|
-
def __init__(self):
|
|
64
|
-
threading.Thread.__init__(self)
|
|
65
|
-
self.popen = None
|
|
66
|
-
|
|
67
|
-
def parasolCommand(self):
|
|
68
|
-
raise NotImplementedError
|
|
69
|
-
|
|
70
|
-
def run(self):
|
|
71
|
-
command = self.parasolCommand()
|
|
72
|
-
with self.lock:
|
|
73
|
-
self.popen = subprocess.Popen(command)
|
|
74
|
-
status = self.popen.wait()
|
|
75
|
-
if status != 0 and status != -signal.SIGKILL:
|
|
76
|
-
log.error("Command '%s' failed with %i.", command, status)
|
|
77
|
-
raise subprocess.CalledProcessError(status, command)
|
|
78
|
-
log.info('Exiting %s', self.__class__.__name__)
|
|
79
|
-
|
|
80
|
-
@InnerClass
|
|
81
|
-
class ParasolLeaderThread(ParasolThread):
|
|
82
|
-
|
|
83
|
-
def __init__(self):
|
|
84
|
-
super().__init__()
|
|
85
|
-
self.machineList = None
|
|
86
|
-
|
|
87
|
-
def run(self):
|
|
88
|
-
with tempfile.NamedTemporaryFile(prefix='machineList.txt', mode='w') as f:
|
|
89
|
-
self.machineList = f.name
|
|
90
|
-
# name - Network name
|
|
91
|
-
# cpus - Number of CPUs we can use
|
|
92
|
-
# ramSize - Megabytes of memory
|
|
93
|
-
# tempDir - Location of (local) temp dir
|
|
94
|
-
# localDir - Location of local data dir
|
|
95
|
-
# localSize - Megabytes of local disk
|
|
96
|
-
# switchName - Name of switch this is on
|
|
97
|
-
f.write('localhost {numCores} {ramSize} {tempDir} {tempDir} 1024 foo'.format(
|
|
98
|
-
numCores=self.outer.numCores,
|
|
99
|
-
tempDir=tempfile.gettempdir(),
|
|
100
|
-
ramSize=self.outer.memory / 1024 / 1024))
|
|
101
|
-
f.flush()
|
|
102
|
-
super().run()
|
|
103
|
-
|
|
104
|
-
def parasolCommand(self):
|
|
105
|
-
return ['paraHub',
|
|
106
|
-
'-spokes=1',
|
|
107
|
-
'-debug',
|
|
108
|
-
self.machineList]
|
|
109
|
-
|
|
110
|
-
@InnerClass
|
|
111
|
-
class ParasolWorkerThread(ParasolThread):
|
|
112
|
-
def parasolCommand(self):
|
|
113
|
-
return ['paraNode',
|
|
114
|
-
'-cpu=%i' % self.outer.numCores,
|
|
115
|
-
'-randomDelay=0',
|
|
116
|
-
'-debug',
|
|
117
|
-
'start']
|