toil 5.12.0__py3-none-any.whl → 6.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. toil/__init__.py +18 -13
  2. toil/batchSystems/abstractBatchSystem.py +21 -10
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +2 -2
  4. toil/batchSystems/awsBatch.py +14 -14
  5. toil/batchSystems/contained_executor.py +3 -3
  6. toil/batchSystems/htcondor.py +0 -1
  7. toil/batchSystems/kubernetes.py +34 -31
  8. toil/batchSystems/local_support.py +3 -1
  9. toil/batchSystems/mesos/batchSystem.py +7 -7
  10. toil/batchSystems/options.py +32 -83
  11. toil/batchSystems/registry.py +104 -23
  12. toil/batchSystems/singleMachine.py +16 -13
  13. toil/batchSystems/slurm.py +3 -3
  14. toil/batchSystems/torque.py +0 -1
  15. toil/bus.py +6 -8
  16. toil/common.py +532 -743
  17. toil/cwl/__init__.py +28 -32
  18. toil/cwl/cwltoil.py +523 -520
  19. toil/cwl/utils.py +55 -10
  20. toil/fileStores/__init__.py +2 -2
  21. toil/fileStores/abstractFileStore.py +36 -11
  22. toil/fileStores/cachingFileStore.py +607 -530
  23. toil/fileStores/nonCachingFileStore.py +43 -10
  24. toil/job.py +140 -75
  25. toil/jobStores/abstractJobStore.py +147 -79
  26. toil/jobStores/aws/jobStore.py +23 -9
  27. toil/jobStores/aws/utils.py +1 -2
  28. toil/jobStores/fileJobStore.py +117 -19
  29. toil/jobStores/googleJobStore.py +16 -7
  30. toil/jobStores/utils.py +5 -6
  31. toil/leader.py +71 -43
  32. toil/lib/accelerators.py +10 -5
  33. toil/lib/aws/__init__.py +3 -14
  34. toil/lib/aws/ami.py +22 -9
  35. toil/lib/aws/iam.py +21 -13
  36. toil/lib/aws/session.py +2 -16
  37. toil/lib/aws/utils.py +4 -5
  38. toil/lib/compatibility.py +1 -1
  39. toil/lib/conversions.py +7 -3
  40. toil/lib/docker.py +22 -23
  41. toil/lib/ec2.py +10 -6
  42. toil/lib/ec2nodes.py +106 -100
  43. toil/lib/encryption/_nacl.py +2 -1
  44. toil/lib/generatedEC2Lists.py +325 -18
  45. toil/lib/io.py +21 -0
  46. toil/lib/misc.py +1 -1
  47. toil/lib/resources.py +1 -1
  48. toil/lib/threading.py +74 -26
  49. toil/options/common.py +738 -0
  50. toil/options/cwl.py +336 -0
  51. toil/options/wdl.py +32 -0
  52. toil/provisioners/abstractProvisioner.py +1 -4
  53. toil/provisioners/aws/__init__.py +3 -6
  54. toil/provisioners/aws/awsProvisioner.py +6 -0
  55. toil/provisioners/clusterScaler.py +3 -2
  56. toil/provisioners/gceProvisioner.py +2 -2
  57. toil/realtimeLogger.py +2 -1
  58. toil/resource.py +24 -18
  59. toil/server/app.py +2 -3
  60. toil/server/cli/wes_cwl_runner.py +4 -4
  61. toil/server/utils.py +1 -1
  62. toil/server/wes/abstract_backend.py +3 -2
  63. toil/server/wes/amazon_wes_utils.py +5 -4
  64. toil/server/wes/tasks.py +2 -3
  65. toil/server/wes/toil_backend.py +2 -10
  66. toil/server/wsgi_app.py +2 -0
  67. toil/serviceManager.py +12 -10
  68. toil/statsAndLogging.py +5 -1
  69. toil/test/__init__.py +29 -54
  70. toil/test/batchSystems/batchSystemTest.py +11 -111
  71. toil/test/batchSystems/test_slurm.py +3 -2
  72. toil/test/cwl/cwlTest.py +213 -90
  73. toil/test/cwl/glob_dir.cwl +15 -0
  74. toil/test/cwl/preemptible.cwl +21 -0
  75. toil/test/cwl/preemptible_expression.cwl +28 -0
  76. toil/test/cwl/revsort.cwl +1 -1
  77. toil/test/cwl/revsort2.cwl +1 -1
  78. toil/test/docs/scriptsTest.py +0 -1
  79. toil/test/jobStores/jobStoreTest.py +27 -16
  80. toil/test/lib/aws/test_iam.py +4 -14
  81. toil/test/lib/aws/test_utils.py +0 -3
  82. toil/test/lib/dockerTest.py +4 -4
  83. toil/test/lib/test_ec2.py +11 -16
  84. toil/test/mesos/helloWorld.py +4 -5
  85. toil/test/mesos/stress.py +1 -1
  86. toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
  87. toil/test/provisioners/clusterScalerTest.py +6 -4
  88. toil/test/provisioners/clusterTest.py +14 -3
  89. toil/test/provisioners/gceProvisionerTest.py +0 -6
  90. toil/test/provisioners/restartScript.py +3 -2
  91. toil/test/server/serverTest.py +1 -1
  92. toil/test/sort/restart_sort.py +2 -1
  93. toil/test/sort/sort.py +2 -1
  94. toil/test/sort/sortTest.py +2 -13
  95. toil/test/src/autoDeploymentTest.py +45 -45
  96. toil/test/src/busTest.py +5 -5
  97. toil/test/src/checkpointTest.py +2 -2
  98. toil/test/src/deferredFunctionTest.py +1 -1
  99. toil/test/src/fileStoreTest.py +32 -16
  100. toil/test/src/helloWorldTest.py +1 -1
  101. toil/test/src/importExportFileTest.py +1 -1
  102. toil/test/src/jobDescriptionTest.py +2 -1
  103. toil/test/src/jobServiceTest.py +1 -1
  104. toil/test/src/jobTest.py +18 -18
  105. toil/test/src/miscTests.py +5 -3
  106. toil/test/src/promisedRequirementTest.py +3 -3
  107. toil/test/src/realtimeLoggerTest.py +1 -1
  108. toil/test/src/resourceTest.py +2 -2
  109. toil/test/src/restartDAGTest.py +1 -1
  110. toil/test/src/resumabilityTest.py +36 -2
  111. toil/test/src/retainTempDirTest.py +1 -1
  112. toil/test/src/systemTest.py +2 -2
  113. toil/test/src/toilContextManagerTest.py +2 -2
  114. toil/test/src/userDefinedJobArgTypeTest.py +1 -1
  115. toil/test/utils/toilDebugTest.py +98 -32
  116. toil/test/utils/toilKillTest.py +2 -2
  117. toil/test/utils/utilsTest.py +20 -0
  118. toil/test/wdl/wdltoil_test.py +148 -45
  119. toil/toilState.py +7 -6
  120. toil/utils/toilClean.py +1 -1
  121. toil/utils/toilConfig.py +36 -0
  122. toil/utils/toilDebugFile.py +60 -33
  123. toil/utils/toilDebugJob.py +39 -12
  124. toil/utils/toilDestroyCluster.py +1 -1
  125. toil/utils/toilKill.py +1 -1
  126. toil/utils/toilLaunchCluster.py +13 -2
  127. toil/utils/toilMain.py +3 -2
  128. toil/utils/toilRsyncCluster.py +1 -1
  129. toil/utils/toilSshCluster.py +1 -1
  130. toil/utils/toilStats.py +240 -143
  131. toil/utils/toilStatus.py +1 -4
  132. toil/version.py +11 -11
  133. toil/wdl/utils.py +2 -122
  134. toil/wdl/wdltoil.py +999 -386
  135. toil/worker.py +25 -31
  136. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/METADATA +60 -53
  137. toil-6.1.0a1.dist-info/RECORD +237 -0
  138. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/WHEEL +1 -1
  139. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/entry_points.txt +0 -1
  140. toil/batchSystems/parasol.py +0 -379
  141. toil/batchSystems/tes.py +0 -459
  142. toil/test/batchSystems/parasolTestSupport.py +0 -117
  143. toil/test/wdl/builtinTest.py +0 -506
  144. toil/test/wdl/conftest.py +0 -23
  145. toil/test/wdl/toilwdlTest.py +0 -522
  146. toil/wdl/toilwdl.py +0 -141
  147. toil/wdl/versions/dev.py +0 -107
  148. toil/wdl/versions/draft2.py +0 -980
  149. toil/wdl/versions/v1.py +0 -794
  150. toil/wdl/wdl_analysis.py +0 -116
  151. toil/wdl/wdl_functions.py +0 -997
  152. toil/wdl/wdl_synthesis.py +0 -1011
  153. toil/wdl/wdl_types.py +0 -243
  154. toil-5.12.0.dist-info/RECORD +0 -244
  155. /toil/{wdl/versions → options}/__init__.py +0 -0
  156. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/LICENSE +0 -0
  157. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/top_level.txt +0 -0
toil/batchSystems/tes.py DELETED
@@ -1,459 +0,0 @@
1
- # Copyright (C) 2015-2021 Regents of the University of California
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- """
15
- Batch system for running Toil workflows on GA4GH TES.
16
-
17
- Useful with network-based job stores when the TES server provides tasks with
18
- credentials, and filesystem-based job stores when the TES server lets tasks
19
- mount the job store.
20
-
21
- Additional containers should be launched with Singularity, not Docker.
22
- """
23
- import datetime
24
- import logging
25
- import math
26
- import os
27
- import pickle
28
- import time
29
- from argparse import ArgumentParser, _ArgumentGroup
30
- from typing import Any, Callable, Dict, List, Optional, Union
31
-
32
- import tes
33
- from requests.exceptions import HTTPError
34
-
35
- from toil import applianceSelf
36
- from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE,
37
- BatchJobExitReason,
38
- UpdatedBatchJobInfo)
39
- from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
40
- from toil.batchSystems.contained_executor import pack_job
41
- from toil.batchSystems.options import OptionSetter
42
- from toil.common import Config, Toil
43
- from toil.job import JobDescription
44
- from toil.lib.misc import get_public_ip, slow_down, utc_now
45
- from toil.resource import Resource
46
-
47
- logger = logging.getLogger(__name__)
48
-
49
-
50
- # Map from TES terminal states to Toil batch job exit reasons
51
- STATE_TO_EXIT_REASON: Dict[str, BatchJobExitReason] = {
52
- 'COMPLETE': BatchJobExitReason.FINISHED,
53
- 'CANCELED': BatchJobExitReason.KILLED,
54
- 'EXECUTOR_ERROR': BatchJobExitReason.FAILED,
55
- 'SYSTEM_ERROR': BatchJobExitReason.ERROR,
56
- 'UNKNOWN': BatchJobExitReason.ERROR
57
- }
58
-
59
-
60
- class TESBatchSystem(BatchSystemCleanupSupport):
61
- @classmethod
62
- def supportsAutoDeployment(cls) -> bool:
63
- return True
64
-
65
- @classmethod
66
- def get_default_tes_endpoint(cls) -> str:
67
- """
68
- Get the default TES endpoint URL to use.
69
-
70
- (unless overridden by an option or environment variable)
71
- """
72
- return f'http://{get_public_ip()}:8000'
73
-
74
- def __init__(self, config: Config, maxCores: float, maxMemory: int, maxDisk: int) -> None:
75
- super().__init__(config, maxCores, maxMemory, maxDisk)
76
- # Connect to TES, using Funnel-compatible environment variables to fill in credentials if not specified.
77
- self.tes = tes.HTTPClient(config.tes_endpoint,
78
- user=config.tes_user,
79
- password=config.tes_password,
80
- token=config.tes_bearer_token)
81
-
82
- # Get service info from the TES server and pull out supported storages.
83
- # We need this so we can tell if the server is likely to be able to
84
- # mount any of our local files. These are URL bases that the server
85
- # supports.
86
- server_info = self.tes.get_service_info()
87
- logger.debug("Detected TES server info: %s", server_info)
88
- self.server_storages = server_info.storage or []
89
-
90
- # Define directories to mount for each task, as py-tes Input objects
91
- self.mounts: List[tes.Input] = []
92
-
93
- if config.jobStore:
94
- job_store_type, job_store_path = Toil.parseLocator(config.jobStore)
95
- if job_store_type == 'file':
96
- # If we have a file job store, we want to mount it at the same path, if we can
97
- self._mount_local_path_if_possible(job_store_path, job_store_path)
98
-
99
- # If we have AWS credentials, we want to mount them in our home directory if we can.
100
- aws_credentials_path = os.path.join(os.path.expanduser("~"), '.aws')
101
- if os.path.isdir(aws_credentials_path):
102
- self._mount_local_path_if_possible(aws_credentials_path, '/root/.aws')
103
-
104
- # We assign job names based on a numerical job ID. This functionality
105
- # is managed by the BatchSystemLocalSupport.
106
-
107
- # Here is where we will store the user script resource object if we get one.
108
- self.user_script: Optional[Resource] = None
109
-
110
- # Ge the image to deploy from Toil's configuration
111
- self.docker_image = applianceSelf()
112
-
113
- # We need a way to map between our batch system ID numbers, and TES task IDs from the server.
114
- self.bs_id_to_tes_id: Dict[int, str] = {}
115
- self.tes_id_to_bs_id: Dict[str, int] = {}
116
-
117
- def _server_can_mount(self, url: str) -> bool:
118
- """
119
- Internal function. Should not be called outside this class.
120
-
121
- Return true if the given URL is under a supported storage location for
122
- the TES server, and false otherwise.
123
- """
124
- # TODO: build some kind of fast matcher in case there are a lot of
125
- # storages supported.
126
-
127
- for base_url in self.server_storages:
128
- if url.startswith(base_url):
129
- return True
130
- return False
131
-
132
- def _mount_local_path_if_possible(self, local_path: str, container_path: str) -> None:
133
- """
134
- Internal function. Should not be called outside this class.
135
-
136
- If a local path is somewhere the server thinks it can access, mount it
137
- into all the tasks.
138
- """
139
- # TODO: We aren't going to work well with linked imports if we're mounting the job store into the container...
140
-
141
- path_url = 'file://' + os.path.abspath(local_path)
142
- if os.path.exists(local_path) and self._server_can_mount(path_url):
143
- # We can access this file from the server. Probably.
144
- self.mounts.append(tes.Input(url=path_url,
145
- path=container_path,
146
- type="DIRECTORY" if os.path.isdir(local_path) else "FILE"))
147
-
148
- def setUserScript(self, user_script: Resource) -> None:
149
- logger.debug(f'Setting user script for deployment: {user_script}')
150
- self.user_script = user_script
151
-
152
- # setEnv is provided by BatchSystemSupport, updates self.environment
153
-
154
- def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
155
- # TODO: get a sensible self.maxCores, etc. so we can check_resource_request.
156
- # How do we know if the cluster will autoscale?
157
-
158
- # Try the job as local
159
- local_id = self.handleLocalJob(job_desc)
160
- if local_id is not None:
161
- # It is a local job
162
- return local_id
163
- else:
164
- # We actually want to send to the cluster
165
-
166
- # Check resource requirements (managed by BatchSystemSupport)
167
- self.check_resource_request(job_desc)
168
-
169
- # Make a batch system scope job ID
170
- bs_id = self.getNextJobID()
171
- # Make a vaguely human-readable name.
172
- # TES does not require it to be unique.
173
- # We could add a per-workflow prefix to use with ListTasks, but
174
- # ListTasks doesn't let us filter for newly done tasks, so it's not
175
- # actually useful for us over polling each task.
176
- job_name = str(job_desc)
177
-
178
- # Launch the job on TES
179
-
180
- # Determine job environment
181
- environment = self.environment.copy()
182
- if job_environment:
183
- environment.update(job_environment)
184
- if 'TOIL_WORKDIR' not in environment:
185
- # The appliance container defaults TOIL_WORKDIR to
186
- # /var/lib/toil, but TES doesn't (always?) give us a writable
187
- # /, so we need to use the writable space in /tmp by default
188
- # instead when running on TES.
189
- environment['TOIL_WORKDIR'] = '/tmp'
190
-
191
- # Make a command to run it in the executor
192
- command_list = pack_job(job_desc, self.user_script)
193
-
194
- # Make the sequence of TES containers ("executors") to run.
195
- # We just run one which is the Toil executor to grab the user
196
- # script and do the job.
197
- task_executors = [tes.Executor(image=self.docker_image,
198
- command=command_list,
199
- env=environment
200
- )]
201
-
202
- # Prepare inputs.
203
- task_inputs = list(self.mounts)
204
- # If we had any per-job input files they would come in here.
205
-
206
- # Prepare resource requirements
207
- task_resources = tes.Resources(cpu_cores=math.ceil(job_desc.cores),
208
- ram_gb=job_desc.memory / (1024**3),
209
- disk_gb=job_desc.disk / (1024**3),
210
- # TODO: py-tes spells this differently than Toil
211
- preemptible=job_desc.preemptible)
212
-
213
- # Package into a TES Task
214
- task = tes.Task(name=job_name,
215
- executors=task_executors,
216
- inputs=task_inputs,
217
- resources=task_resources)
218
-
219
- # Launch it and get back the TES ID that we can use to poll the task
220
- tes_id = self.tes.create_task(task)
221
-
222
- # Tie it to the numeric ID
223
- self.bs_id_to_tes_id[bs_id] = tes_id
224
- self.tes_id_to_bs_id[tes_id] = bs_id
225
-
226
- logger.debug('Launched job: %s', job_name)
227
-
228
- return bs_id
229
-
230
- def _get_runtime(self, task: tes.Task) -> Optional[float]:
231
- """
232
- Internal function. Should not be called outside this class.
233
-
234
- Get the time that the given job ran/has been running for, in seconds,
235
- or None if that time is not available. Never returns 0.
236
- """
237
- start_time = None
238
- end_time = utc_now()
239
- for log in (task.logs or []):
240
- if log.start_time:
241
- # Find the first start time that is set
242
- start_time = log.start_time
243
- break
244
-
245
- if not start_time:
246
- # It hasn't been running for a measurable amount of time.
247
- return None
248
-
249
- for log in reversed(task.logs or []):
250
- if log.end_time:
251
- # Find the last end time that is set, and override now
252
- end_time = log.end_time
253
- break
254
- # We have a set start time, so it is/was running. Return the time
255
- # it has been running for.
256
- return slow_down((end_time - start_time).total_seconds())
257
-
258
- def _get_exit_code(self, task: tes.Task) -> int:
259
- """
260
- Internal function. Should not be called outside this class.
261
-
262
- Get the exit code of the last executor with a log in the task, or
263
- EXIT_STATUS_UNAVAILABLE_VALUE if no executor has a log.
264
- """
265
- for task_log in reversed(task.logs or []):
266
- for executor_log in reversed(task_log.logs or []):
267
- if isinstance(executor_log.exit_code, int):
268
- # Find the last executor exit code that is a number and return it
269
- return executor_log.exit_code
270
-
271
- if task.state == 'COMPLETE':
272
- # If the task completes without error but has no code logged, the
273
- # code must be 0.
274
- return 0
275
-
276
- # If we get here we couldn't find an exit code.
277
- return EXIT_STATUS_UNAVAILABLE_VALUE
278
-
279
- def __get_log_text(self, task: tes.Task) -> Optional[str]:
280
- """
281
- Get the log text (standard error) of the last executor with a log in
282
- the task, or None.
283
- """
284
-
285
- for task_log in reversed(task.logs or []):
286
- for executor_log in reversed(task_log.logs or []):
287
- if isinstance(executor_log.stderr, str):
288
- # Find the last executor log code that is a string and return it
289
- return executor_log.stderr
290
-
291
- # If we get here we couldn't find a log.
292
- return None
293
-
294
- def getUpdatedBatchJob(self, maxWait: int) -> Optional[UpdatedBatchJobInfo]:
295
- # Remember when we started, for respecting the timeout
296
- entry = datetime.datetime.now()
297
- # This is the updated job we have found, if any
298
- result = None
299
- while result is None and ((datetime.datetime.now() - entry).total_seconds() < maxWait or not maxWait):
300
- result = self.getUpdatedLocalJob(0)
301
-
302
- if result:
303
- return result
304
-
305
- # Collect together the list of TES and batch system IDs for tasks we
306
- # are acknowledging and don't care about anymore.
307
- acknowledged = []
308
-
309
- for tes_id, bs_id in self.tes_id_to_bs_id.items():
310
- # Immediately poll all the jobs we issued.
311
- # TODO: There's no way to acknowledge a finished job, so there's no
312
- # faster way to find the newly finished jobs than polling
313
- task = self.tes.get_task(tes_id, view="MINIMAL")
314
- if task.state in ["COMPLETE", "CANCELED", "EXECUTOR_ERROR", "SYSTEM_ERROR"]:
315
- # This task is done!
316
- logger.debug("Found stopped task: %s", task)
317
-
318
- # Acknowledge it
319
- acknowledged.append((tes_id, bs_id))
320
-
321
- if task.state == "CANCELED":
322
- # Killed jobs aren't allowed to appear as updated.
323
- continue
324
-
325
- # Otherwise, it stopped running and it wasn't our fault.
326
-
327
- # Fetch the task's full info, including logs.
328
- task = self.tes.get_task(tes_id, view="FULL")
329
-
330
- # Record runtime
331
- runtime = self._get_runtime(task)
332
-
333
- # Determine if it succeeded
334
- exit_reason = STATE_TO_EXIT_REASON[task.state]
335
-
336
- # Get its exit code
337
- exit_code = self._get_exit_code(task)
338
-
339
- if task.state == "EXECUTOR_ERROR":
340
- # The task failed, so report executor logs.
341
- logger.warning('Log from failed executor: %s', self.__get_log_text(task))
342
-
343
- # Compose a result
344
- result = UpdatedBatchJobInfo(jobID=bs_id, exitStatus=exit_code, wallTime=runtime, exitReason=exit_reason)
345
-
346
- # No more iteration needed, we found a result.
347
- break
348
-
349
- # After the iteration, drop all the records for tasks we acknowledged
350
- for (tes_id, bs_id) in acknowledged:
351
- del self.tes_id_to_bs_id[tes_id]
352
- del self.bs_id_to_tes_id[bs_id]
353
-
354
- if not maxWait:
355
- # Don't wait at all
356
- break
357
- elif result is None:
358
- # Wait a bit and poll again
359
- time.sleep(min(maxWait/2, 1.0))
360
-
361
- # When we get here we have all the result we can get
362
- return result
363
-
364
- def shutdown(self) -> None:
365
-
366
- # Shutdown local processes first
367
- self.shutdownLocal()
368
-
369
- for tes_id in self.tes_id_to_bs_id.keys():
370
- # Shut down all the TES jobs we issued.
371
- self._try_cancel(tes_id)
372
-
373
- def _try_cancel(self, tes_id: str) -> None:
374
- """
375
- Internal function. Should not be called outside this class.
376
-
377
- Try to cancel a TES job.
378
-
379
- Succeed if it can't be canceled because it has stopped,
380
- but fail if it can't be canceled for some other reason.
381
- """
382
- try:
383
- # Kill each of our tasks in TES
384
- self.tes.cancel_task(tes_id)
385
- except HTTPError as e:
386
- if e.response is not None and e.response.status_code in [409, 500]:
387
- # TODO: This is what we probably get when trying to cancel
388
- # something that is actually done. But can we rely on that?
389
- pass
390
- elif '500' in str(e) or '409' in str(e):
391
- # TODO: drop this after <https://github.com/ohsu-comp-bio/py-tes/pull/36> merges.
392
- # py-tes might be hiding the actual code and just putting it in a string
393
- pass
394
- else:
395
- raise
396
-
397
- def getIssuedBatchJobIDs(self) -> List[int]:
398
- return self.getIssuedLocalJobIDs() + list(self.bs_id_to_tes_id.keys())
399
-
400
- def getRunningBatchJobIDs(self) -> Dict[int, float]:
401
- # We need a dict from job_id (integer) to seconds it has been running
402
- bs_id_to_runtime = {}
403
-
404
- for tes_id, bs_id in self.tes_id_to_bs_id.items():
405
- # Poll every issued task, and get the runtime info right away in
406
- # the default BASIC view.
407
- # TODO: use list_tasks filtering by name prefix and running state!
408
- task = self.tes.get_task(tes_id)
409
- logger.debug("Observed task: %s", task)
410
- if task.state in ["INITIALIZING", "RUNNING"]:
411
- # We count INITIALIZING tasks because they may be e.g. pulling
412
- # Docker containers, and we don't want to time out on them in
413
- # the tests. But they may not have any runtimes, so it might
414
- # not really help.
415
- runtime = self._get_runtime(task)
416
- if runtime:
417
- # We can measure a runtime
418
- bs_id_to_runtime[bs_id] = runtime
419
- # If we can't find a runtime, we can't say it's running
420
- # because we can't say how long it has been running for.
421
-
422
- # Give back the times all our running jobs have been running for.
423
- return bs_id_to_runtime
424
-
425
- def killBatchJobs(self, job_ids: List[int]) -> None:
426
- # Kill all the ones that are local
427
- self.killLocalJobs(job_ids)
428
-
429
- for bs_id in job_ids:
430
- if bs_id in self.bs_id_to_tes_id:
431
- # We sent this to TES. So try to cancel it.
432
- self._try_cancel(self.bs_id_to_tes_id[bs_id])
433
- # But don't forget the mapping until we actually get the finish
434
- # notification for the job.
435
-
436
- # TODO: If the kill races the collection of a finished update, do we
437
- # have to censor the finished update even if the kill never took
438
- # effect??? That's not implemented.
439
-
440
- @classmethod
441
- def add_options(cls, parser: Union[ArgumentParser, _ArgumentGroup]) -> None:
442
- parser.add_argument("--tesEndpoint", dest="tes_endpoint", default=cls.get_default_tes_endpoint(),
443
- help="The http(s) URL of the TES server. (default: %(default)s)")
444
- parser.add_argument("--tesUser", dest="tes_user", default=None,
445
- help="User name to use for basic authentication to TES server.")
446
- parser.add_argument("--tesPassword", dest="tes_password", default=None,
447
- help="Password to use for basic authentication to TES server.")
448
- parser.add_argument("--tesBearerToken", dest="tes_bearer_token", default=None,
449
- help="Bearer token to use for authentication to TES server.")
450
-
451
- @classmethod
452
- def setOptions(cls, setOption: OptionSetter) -> None:
453
- # Because we use the keyword arguments, we can't specify a type for setOption without using Protocols.
454
- # TODO: start using Protocols, or just start returning objects to represent the options.
455
- # When actually parsing options, remember to check the environment variables
456
- setOption("tes_endpoint", default=cls.get_default_tes_endpoint(), env=["TOIL_TES_ENDPOINT"])
457
- setOption("tes_user", default=None, env=["TOIL_TES_USER"])
458
- setOption("tes_password", default=None, env=["TOIL_TES_PASSWORD"])
459
- setOption("tes_bearer_token", default=None, env=["TOIL_TES_BEARER_TOKEN"])
@@ -1,117 +0,0 @@
1
- # Copyright (C) 2015-2021 Regents of the University of California
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- import logging
15
- import os
16
- import signal
17
- import subprocess
18
- import tempfile
19
- import threading
20
- import time
21
-
22
- from toil import physicalMemory
23
- from toil.lib.objects import InnerClass
24
- from toil.lib.threading import cpu_count
25
-
26
- log = logging.getLogger(__name__)
27
-
28
-
29
- class ParasolTestSupport:
30
- """
31
- For test cases that need a running Parasol leader and worker on the local host
32
- """
33
-
34
- def _startParasol(self, numCores=None, memory=None):
35
- if numCores is None:
36
- numCores = cpu_count()
37
- if memory is None:
38
- memory = physicalMemory()
39
- self.numCores = numCores
40
- self.memory = memory
41
- self.leader = self.ParasolLeaderThread()
42
- self.leader.start()
43
- self.worker = self.ParasolWorkerThread()
44
- self.worker.start()
45
- while self.leader.popen is None or self.worker.popen is None:
46
- log.info('Waiting for leader and worker processes')
47
- time.sleep(.1)
48
-
49
- def _stopParasol(self):
50
- self.worker.popen.kill()
51
- self.worker.join()
52
- self.leader.popen.kill()
53
- self.leader.join()
54
- for path in ('para.results', 'parasol.jid'):
55
- if os.path.exists(path):
56
- os.remove(path)
57
-
58
- class ParasolThread(threading.Thread):
59
-
60
- # Lock is used because subprocess is NOT thread safe: http://tinyurl.com/pkp5pgq
61
- lock = threading.Lock()
62
-
63
- def __init__(self):
64
- threading.Thread.__init__(self)
65
- self.popen = None
66
-
67
- def parasolCommand(self):
68
- raise NotImplementedError
69
-
70
- def run(self):
71
- command = self.parasolCommand()
72
- with self.lock:
73
- self.popen = subprocess.Popen(command)
74
- status = self.popen.wait()
75
- if status != 0 and status != -signal.SIGKILL:
76
- log.error("Command '%s' failed with %i.", command, status)
77
- raise subprocess.CalledProcessError(status, command)
78
- log.info('Exiting %s', self.__class__.__name__)
79
-
80
- @InnerClass
81
- class ParasolLeaderThread(ParasolThread):
82
-
83
- def __init__(self):
84
- super().__init__()
85
- self.machineList = None
86
-
87
- def run(self):
88
- with tempfile.NamedTemporaryFile(prefix='machineList.txt', mode='w') as f:
89
- self.machineList = f.name
90
- # name - Network name
91
- # cpus - Number of CPUs we can use
92
- # ramSize - Megabytes of memory
93
- # tempDir - Location of (local) temp dir
94
- # localDir - Location of local data dir
95
- # localSize - Megabytes of local disk
96
- # switchName - Name of switch this is on
97
- f.write('localhost {numCores} {ramSize} {tempDir} {tempDir} 1024 foo'.format(
98
- numCores=self.outer.numCores,
99
- tempDir=tempfile.gettempdir(),
100
- ramSize=self.outer.memory / 1024 / 1024))
101
- f.flush()
102
- super().run()
103
-
104
- def parasolCommand(self):
105
- return ['paraHub',
106
- '-spokes=1',
107
- '-debug',
108
- self.machineList]
109
-
110
- @InnerClass
111
- class ParasolWorkerThread(ParasolThread):
112
- def parasolCommand(self):
113
- return ['paraNode',
114
- '-cpu=%i' % self.outer.numCores,
115
- '-randomDelay=0',
116
- '-debug',
117
- 'start']