parsl 2023.6.5__py3-none-any.whl → 2023.6.12__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
- parsl/addresses.py +2 -1
- parsl/configs/vineex_local.py +26 -0
- parsl/data_provider/data_manager.py +2 -1
- parsl/data_provider/files.py +1 -1
- parsl/dataflow/memoization.py +1 -1
- parsl/executors/taskvine/__init__.py +3 -0
- parsl/executors/taskvine/errors.py +22 -0
- parsl/executors/taskvine/exec_parsl_function.py +207 -0
- parsl/executors/taskvine/executor.py +1055 -0
- parsl/executors/workqueue/executor.py +4 -5
- parsl/launchers/base.py +17 -0
- parsl/launchers/launchers.py +1 -16
- parsl/monitoring/monitoring.py +19 -8
- parsl/providers/cluster_provider.py +2 -2
- parsl/providers/condor/condor.py +1 -1
- parsl/providers/kubernetes/kube.py +2 -1
- parsl/providers/slurm/slurm.py +1 -1
- parsl/tests/configs/taskvine_ex.py +11 -0
- parsl/tests/conftest.py +6 -6
- parsl/tests/scaling_tests/vineex_condor.py +10 -0
- parsl/tests/scaling_tests/vineex_local.py +10 -0
- parsl/tests/test_bash_apps/test_pipeline.py +2 -2
- parsl/tests/test_error_handling/test_retry_handler.py +1 -1
- parsl/utils.py +2 -2
- parsl/version.py +1 -1
- {parsl-2023.6.5.dist-info → parsl-2023.6.12.dist-info}/METADATA +2 -2
- {parsl-2023.6.5.dist-info → parsl-2023.6.12.dist-info}/RECORD +40 -32
- parsl/tests/configs/workqueue_blocks.py +0 -12
- /parsl/tests/{workqueue_tests → scaling_tests}/__init__.py +0 -0
- /parsl/tests/{workqueue_tests → scaling_tests}/htex_local.py +0 -0
- /parsl/tests/{workqueue_tests → scaling_tests}/local_threads.py +0 -0
- /parsl/tests/{workqueue_tests → scaling_tests}/test_scale.py +0 -0
- /parsl/tests/{workqueue_tests → scaling_tests}/wqex_condor.py +0 -0
- /parsl/tests/{workqueue_tests → scaling_tests}/wqex_local.py +0 -0
- {parsl-2023.6.5.data → parsl-2023.6.12.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2023.6.5.data → parsl-2023.6.12.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2023.6.5.data → parsl-2023.6.12.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2023.6.5.dist-info → parsl-2023.6.12.dist-info}/LICENSE +0 -0
- {parsl-2023.6.5.dist-info → parsl-2023.6.12.dist-info}/WHEEL +0 -0
- {parsl-2023.6.5.dist-info → parsl-2023.6.12.dist-info}/entry_points.txt +0 -0
- {parsl-2023.6.5.dist-info → parsl-2023.6.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1055 @@
""" TaskVineExecutor utilizes the TaskVine distributed framework developed by the
Cooperative Computing Lab (CCL) at Notre Dame to provide a fault-tolerant,
high-throughput system for delegating Parsl tasks to thousands of remote machines
"""

import threading
import multiprocessing
import logging
from concurrent.futures import Future
from ctypes import c_bool

import tempfile
import hashlib
import subprocess
import os
import socket
import time
import pickle
import queue
import inspect
import shutil
import itertools

from parsl.serialize import pack_apply_message
import parsl.utils as putils
from parsl.executors.errors import ExecutorError
from parsl.data_provider.files import File
from parsl.errors import OptionalModuleMissing
from parsl.executors.status_handling import BlockProviderExecutor
from parsl.providers.base import ExecutionProvider
from parsl.providers import LocalProvider, CondorProvider
from parsl.executors.taskvine import exec_parsl_function
from parsl.process_loggers import wrap_with_logs
from parsl.utils import setproctitle

import typeguard
from typing import Dict, List, Optional, Union
from parsl.data_provider.staging import Staging

from .errors import TaskVineTaskFailure
from .errors import TaskVineFailure

from collections import namedtuple

try:
    from ndcctools.taskvine import cvine
    from ndcctools.taskvine import Manager
    from ndcctools.taskvine import Task
    from ndcctools.taskvine.cvine import VINE_DEFAULT_PORT
    from ndcctools.taskvine.cvine import VINE_ALLOCATION_MODE_MAX_THROUGHPUT
except ImportError:
    _taskvine_enabled = False
    VINE_DEFAULT_PORT = 0
else:
    _taskvine_enabled = True

package_analyze_script = shutil.which("poncho_package_analyze")
package_create_script = shutil.which("poncho_package_create")
package_run_script = shutil.which("poncho_package_run")

logger = logging.getLogger(__name__)


# Support structure to communicate parsl tasks to the taskvine submit thread.
ParslTaskToVine = namedtuple('ParslTaskToVine',
                             'id category cores memory disk gpus priority running_time_min \
                              env_pkg map_file function_file result_file input_files output_files')

# Support structure to communicate final status of taskvine tasks to parsl
# result is only valid if result_received is True
# reason and status are only valid if result_received is False
VineTaskToParsl = namedtuple('VineTaskToParsl', 'id result_received result reason status')

# Support structure to report parsl filenames to taskvine.
# parsl_name is the local_name or filepath attribute of a parsl file object.
# stage tells whether the file should be copied by taskvine to the workers.
# cache tells whether the file should be cached at workers. Only valid if stage == True
ParslFileToVine = namedtuple('ParslFileToVine', 'parsl_name stage cache')


|
82
|
+
"""Executor to use TaskVine batch system
|
83
|
+
|
84
|
+
The TaskVineExecutor system utilizes the TaskVine framework to
|
85
|
+
efficiently delegate Parsl apps to remote machines in clusters and
|
86
|
+
grids using a fault-tolerant system. Users can run the
|
87
|
+
vine_worker program on remote machines to connect to the
|
88
|
+
TaskVineExecutor, and Parsl apps will then be sent out to these
|
89
|
+
machines for execution and retrieval.
|
90
|
+
|
91
|
+
|
92
|
+
Parameters
|
93
|
+
----------
|
94
|
+
|
95
|
+
label: str
|
96
|
+
A human readable label for the executor, unique
|
97
|
+
with respect to other TaskVine master programs.
|
98
|
+
Default is "TaskVineExecutor".
|
99
|
+
|
100
|
+
project_name: str
|
101
|
+
If a project_name is given, then TaskVine will periodically
|
102
|
+
report its status and performance back to the global TaskVine catalog,
|
103
|
+
which can be viewed here: http://ccl.cse.nd.edu/software/taskvine/status
|
104
|
+
Default is None. Overrides address.
|
105
|
+
|
106
|
+
project_password_file: str
|
107
|
+
Optional password file for the taskvine project. Default is None.
|
108
|
+
|
109
|
+
address: str
|
110
|
+
The ip to contact this taskvine master process.
|
111
|
+
If not given, uses the address of the current machine as returned
|
112
|
+
by socket.gethostname().
|
113
|
+
Ignored if project_name is specified.
|
114
|
+
|
115
|
+
port: int
|
116
|
+
TCP port on Parsl submission machine for TaskVine workers
|
117
|
+
to connect to. Workers will connect to Parsl using this port.
|
118
|
+
|
119
|
+
If 0, TaskVine will allocate a port number automatically.
|
120
|
+
In this case, environment variables can be used to influence the
|
121
|
+
choice of port, documented here:
|
122
|
+
https://cctools.readthedocs.io/en/stable/api/html/taskvine_8h.html#a47ac70464e357e4dfcb0722fee6c44a0
|
123
|
+
Default is VINE_DEFAULT_PORT.
|
124
|
+
|
125
|
+
env: dict{str}
|
126
|
+
Dictionary that contains the environmental variables that
|
127
|
+
need to be set on the TaskVine worker machine.
|
128
|
+
|
129
|
+
shared_fs: bool
|
130
|
+
Define if working in a shared file system or not. If Parsl
|
131
|
+
and the TaskVine workers are on a shared file system, TaskVine
|
132
|
+
does not need to transfer and rename files for execution.
|
133
|
+
Default is False.
|
134
|
+
|
135
|
+
use_cache: bool
|
136
|
+
Whether workers should cache files of tasks.
|
137
|
+
Default is True.
|
138
|
+
|
139
|
+
source: bool
|
140
|
+
Choose whether to transfer parsl app information as
|
141
|
+
source code. (Note: this increases throughput for
|
142
|
+
@python_apps, but the implementation does not include
|
143
|
+
functionality for @bash_apps, and thus source=False
|
144
|
+
must be used for programs utilizing @bash_apps.)
|
145
|
+
Default is False. Set to True if pack is True.
|
146
|
+
|
147
|
+
pack: bool
|
148
|
+
Use conda-pack to prepare a self-contained Python evironment for
|
149
|
+
each task. This greatly increases task latency, but does not
|
150
|
+
require a common environment or shared FS on execution nodes.
|
151
|
+
Implies source=True. Default is False.
|
152
|
+
|
153
|
+
pack_env: str
|
154
|
+
An already prepared environment tarball using poncho_package_create
|
155
|
+
to attach to each task.
|
156
|
+
Default is "".
|
157
|
+
|
158
|
+
extra_pkgs: list
|
159
|
+
List of extra pip/conda package names to include when packing
|
160
|
+
the environment. This may be useful if the app executes other
|
161
|
+
(possibly non-Python) programs provided via pip or conda.
|
162
|
+
Scanning the app source for imports would not detect these
|
163
|
+
dependencies, so they need to be manually specified.
|
164
|
+
|
165
|
+
autolabel: bool
|
166
|
+
Use the Resource Monitor to automatically determine resource
|
167
|
+
labels based on observed task behavior.
|
168
|
+
|
169
|
+
autolabel_window: int
|
170
|
+
Set the number of tasks considered for autolabeling. TaskVine
|
171
|
+
will wait for a series of N tasks with steady resource
|
172
|
+
requirements before making a decision on labels. Increasing
|
173
|
+
this parameter will reduce the number of failed tasks due to
|
174
|
+
resource exhaustion when autolabeling, at the cost of increased
|
175
|
+
resources spent collecting stats.
|
176
|
+
|
177
|
+
autocategory: bool
|
178
|
+
Place each app in its own category by default. If all
|
179
|
+
invocations of an app have similar performance characteristics,
|
180
|
+
this will provide a reasonable set of categories automatically.
|
181
|
+
Default is True.
|
182
|
+
|
183
|
+
max_retries: int
|
184
|
+
Set the number of retries that TaskVine will make when a task
|
185
|
+
fails. This is distinct from Parsl level retries configured in
|
186
|
+
parsl.config.Config. Set to None to allow TaskVine to retry
|
187
|
+
tasks forever. By default, this is set to 1, so that all retries
|
188
|
+
will be managed by Parsl.
|
189
|
+
|
190
|
+
init_command: str
|
191
|
+
Command line to run before executing a task in a worker.
|
192
|
+
Default is ''.
|
193
|
+
|
194
|
+
worker_options: str
|
195
|
+
Extra options passed to vine_worker. Default is ''.
|
196
|
+
|
197
|
+
worker_executable: str
|
198
|
+
The command used to invoke vine_worker. This can be used
|
199
|
+
when the worker needs to be wrapped inside some other command
|
200
|
+
(for example, to run the worker inside a container). Default is
|
201
|
+
'vine_worker'.
|
202
|
+
|
203
|
+
function_dir: str
|
204
|
+
The directory where serialized function invocations are placed
|
205
|
+
to be sent to workers. If undefined, this defaults to a directory
|
206
|
+
under runinfo/. If shared_filesystem=True, then this directory
|
207
|
+
must be visible from both the submitting side and workers.
|
208
|
+
|
209
|
+
wait_for_workers: int
|
210
|
+
The number of workers to wait for before running any task.
|
211
|
+
Default is 0, so the manager won't wait for workers to connect.
|
212
|
+
|
213
|
+
enable_peer_transfers: bool
|
214
|
+
Option to enable transferring files between workers.
|
215
|
+
Default is True.
|
216
|
+
|
217
|
+
full_debug: bool
|
218
|
+
Whether to enable full debug mode for monitoring in TaskVine.
|
219
|
+
Default is False.
|
220
|
+
|
221
|
+
provider: ExecutionProvider
|
222
|
+
The Parsl provider that will spawn worker processes.
|
223
|
+
Default to spawning one local vine worker process.
|
224
|
+
|
225
|
+
storage_access: Optional[List[Staging]]
|
226
|
+
Define Parsl file staging providers for this executor.
|
227
|
+
Default is None.
|
228
|
+
"""
|
229
|
+
|
230
|
+
radio_mode = "filesystem"
|
231
|
+
|
232
|
+
@typeguard.typechecked
|
233
|
+
def __init__(self,
|
234
|
+
label: str = "TaskVineExecutor",
|
235
|
+
project_name: Optional[str] = None,
|
236
|
+
project_password_file: Optional[str] = None,
|
237
|
+
address: Optional[str] = None,
|
238
|
+
port: int = VINE_DEFAULT_PORT,
|
239
|
+
env: Optional[Dict] = None,
|
240
|
+
shared_fs: bool = False,
|
241
|
+
use_cache: bool = True,
|
242
|
+
source: bool = False,
|
243
|
+
pack: bool = False,
|
244
|
+
pack_env: str = "",
|
245
|
+
extra_pkgs: Optional[List[str]] = None,
|
246
|
+
autolabel: bool = False,
|
247
|
+
autolabel_window: int = 1,
|
248
|
+
autocategory: bool = True,
|
249
|
+
max_retries: int = 1,
|
250
|
+
init_command: str = "",
|
251
|
+
worker_options: str = "",
|
252
|
+
worker_executable: str = 'vine_worker',
|
253
|
+
function_dir: Optional[str] = None,
|
254
|
+
wait_for_workers: int = 0,
|
255
|
+
enable_peer_transfers: bool = True,
|
256
|
+
full_debug: bool = False,
|
257
|
+
provider: ExecutionProvider = LocalProvider(),
|
258
|
+
storage_access: Optional[List[Staging]] = None):
|
259
|
+
BlockProviderExecutor.__init__(self, provider=provider,
|
260
|
+
block_error_handler=True)
|
261
|
+
if not _taskvine_enabled:
|
262
|
+
raise OptionalModuleMissing(['taskvine'], "TaskVineExecutor requires the taskvine module.")
|
263
|
+
|
264
|
+
self.label = label
|
265
|
+
self.project_name = project_name
|
266
|
+
self.project_password_file = project_password_file
|
267
|
+
self.address = address
|
268
|
+
self.port = port
|
269
|
+
self.env = env
|
270
|
+
self.shared_fs = shared_fs
|
271
|
+
self.use_cache = use_cache
|
272
|
+
self.source = True if pack else source
|
273
|
+
self.pack = pack
|
274
|
+
self.pack_env = pack_env
|
275
|
+
self.extra_pkgs = extra_pkgs or []
|
276
|
+
self.autolabel = autolabel
|
277
|
+
self.autolabel_window = autolabel_window
|
278
|
+
self.autocategory = autocategory
|
279
|
+
self.max_retries = max_retries
|
280
|
+
self.init_command = init_command
|
281
|
+
self.worker_options = worker_options
|
282
|
+
self.worker_executable = worker_executable
|
283
|
+
self.function_dir = function_dir
|
284
|
+
self.wait_for_workers = wait_for_workers
|
285
|
+
self.enable_peer_transfers = enable_peer_transfers
|
286
|
+
self.full_debug = full_debug
|
287
|
+
self.storage_access = storage_access
|
288
|
+
|
289
|
+
# Queue to send tasks from TaskVine executor process to TaskVine manager process
|
290
|
+
self.task_queue: multiprocessing.Queue = multiprocessing.Queue()
|
291
|
+
|
292
|
+
# Queue to send tasks from TaskVine manager process to TaskVine executor process
|
293
|
+
self.collector_queue: multiprocessing.Queue = multiprocessing.Queue()
|
294
|
+
|
295
|
+
self.blocks: Dict[str, str] = {} # track Parsl blocks
|
296
|
+
self.executor_task_counter = -1 # task id starts from 0
|
297
|
+
self.should_stop = multiprocessing.Value(c_bool, False)
|
298
|
+
|
299
|
+
# mapping of function's unique memory address to its solved environment
|
300
|
+
self.cached_envs: Dict[int, str] = {}
|
301
|
+
|
302
|
+
if not self.address:
|
303
|
+
self.address = socket.gethostname()
|
304
|
+
|
305
|
+
if self.project_password_file is not None and not os.path.exists(self.project_password_file):
|
306
|
+
raise TaskVineFailure('Could not find password file: {}'.format(self.project_password_file))
|
307
|
+
|
308
|
+
# Build foundations of the launch command
|
309
|
+
self.launch_cmd = ("python3 exec_parsl_function.py {mapping} {function} {result}")
|
310
|
+
if self.init_command != "":
|
311
|
+
self.launch_cmd = self.init_command + "; " + self.launch_cmd
|
312
|
+
|
313
|
+
def _get_launch_command(self, block_id):
|
314
|
+
# this executor uses different terminology for worker/launch
|
315
|
+
# commands than in htex
|
316
|
+
return f"PARSL_WORKER_BLOCK_ID={block_id} {self.worker_command}"
|
317
|
+
|
318
|
+
def start(self):
|
319
|
+
"""Create submit process and collector thread to create, send, and
|
320
|
+
retrieve Parsl tasks within the TaskVine system.
|
321
|
+
"""
|
322
|
+
self.tasks_lock = threading.Lock()
|
323
|
+
|
324
|
+
# Create directories for data and results
|
325
|
+
if not self.function_dir:
|
326
|
+
self.function_data_dir = os.path.join(self.run_dir, self.label, "function_data")
|
327
|
+
else:
|
328
|
+
tp = str(time.time())
|
329
|
+
tx = os.path.join(self.function_dir, tp)
|
330
|
+
os.makedirs(tx)
|
331
|
+
self.function_data_dir = os.path.join(self.function_dir, tp, self.label, "function_data")
|
332
|
+
self.vine_log_dir = os.path.join(self.run_dir, self.label)
|
333
|
+
logger.debug("function data directory: {}\nlog directory: {}".format(self.function_data_dir, self.vine_log_dir))
|
334
|
+
os.makedirs(self.vine_log_dir)
|
335
|
+
os.makedirs(self.function_data_dir)
|
336
|
+
|
337
|
+
logger.debug("Starting TaskVineExecutor")
|
338
|
+
|
339
|
+
# Create a Process to perform TaskVine submissions
|
340
|
+
submit_process_kwargs = {"task_queue": self.task_queue,
|
341
|
+
"launch_cmd": self.launch_cmd,
|
342
|
+
"env": self.env,
|
343
|
+
"collector_queue": self.collector_queue,
|
344
|
+
"full": self.full_debug,
|
345
|
+
"shared_fs": self.shared_fs,
|
346
|
+
"autolabel": self.autolabel,
|
347
|
+
"autolabel_window": self.autolabel_window,
|
348
|
+
"autocategory": self.autocategory,
|
349
|
+
"max_retries": self.max_retries,
|
350
|
+
"should_stop": self.should_stop,
|
351
|
+
"port": self.port,
|
352
|
+
"vine_log_dir": self.vine_log_dir,
|
353
|
+
"project_password_file": self.project_password_file,
|
354
|
+
"project_name": self.project_name,
|
355
|
+
"wait_for_workers": self.wait_for_workers,
|
356
|
+
"enable_peer_transfers": self.enable_peer_transfers}
|
357
|
+
self.submit_process = multiprocessing.Process(target=_taskvine_submit_wait,
|
358
|
+
name="TaskVine-Submit-Process",
|
359
|
+
kwargs=submit_process_kwargs)
|
360
|
+
|
361
|
+
self.collector_thread = threading.Thread(target=self._collect_taskvine_results,
|
362
|
+
name="TaskVine-collector-thread")
|
363
|
+
self.collector_thread.daemon = True
|
364
|
+
|
365
|
+
# Begin both processes
|
366
|
+
self.submit_process.start()
|
367
|
+
self.collector_thread.start()
|
368
|
+
|
369
|
+
# Initialize scaling for the provider
|
370
|
+
self.initialize_scaling()
|
371
|
+
|
372
|
+
def _path_in_task(self, executor_task_id, *path_components):
|
373
|
+
"""Returns a filename fixed and specific to a task.
|
374
|
+
It is used for the following filename's:
|
375
|
+
(not given): The subdirectory per task that contains function, result, etc.
|
376
|
+
'function': Pickled file that contains the function to be executed.
|
377
|
+
'result': Pickled file that (will) contain the result of the function.
|
378
|
+
'map': Pickled file with a dict between local parsl names, and remote taskvine names.
|
379
|
+
"""
|
380
|
+
task_dir = "{:04d}".format(executor_task_id)
|
381
|
+
return os.path.join(self.function_data_dir, task_dir, *path_components)
|
382
|
+
|
383
|
+
def submit(self, func, resource_specification, *args, **kwargs):
|
384
|
+
"""Processes the Parsl app by its arguments and submits the function
|
385
|
+
information to the task queue, to be executed using the TaskVine
|
386
|
+
system. The args and kwargs are processed for input and output files to
|
387
|
+
the Parsl app, so that the files are appropriately specified for the TaskVine task.
|
388
|
+
|
389
|
+
Parameters
|
390
|
+
----------
|
391
|
+
|
392
|
+
func : function
|
393
|
+
Parsl app to be submitted to the TaskVine system
|
394
|
+
args : list
|
395
|
+
Arguments to the Parsl app
|
396
|
+
kwargs : dict
|
397
|
+
Keyword arguments to the Parsl app
|
398
|
+
"""
|
399
|
+
cores = None
|
400
|
+
memory = None
|
401
|
+
disk = None
|
402
|
+
gpus = None
|
403
|
+
priority = None
|
404
|
+
category = None
|
405
|
+
running_time_min = None
|
406
|
+
if resource_specification and isinstance(resource_specification, dict):
|
407
|
+
logger.debug("Got resource specification: {}".format(resource_specification))
|
408
|
+
|
409
|
+
required_resource_types = set(['cores', 'memory', 'disk'])
|
410
|
+
acceptable_resource_types = set(['cores', 'memory', 'disk', 'gpus', 'priority', 'running_time_min'])
|
411
|
+
keys = set(resource_specification.keys())
|
412
|
+
|
413
|
+
if not keys.issubset(acceptable_resource_types):
|
414
|
+
message = "Task resource specification only accepts these types of resources: {}".format(
|
415
|
+
', '.join(acceptable_resource_types))
|
416
|
+
logger.error(message)
|
417
|
+
raise ExecutorError(self, message)
|
418
|
+
|
419
|
+
# this checks that either all of the required resource types are specified, or
|
420
|
+
# that none of them are: the `required_resource_types` are not actually required,
|
421
|
+
# but if one is specified, then they all must be.
|
422
|
+
key_check = required_resource_types.intersection(keys)
|
423
|
+
required_keys_ok = len(key_check) == 0 or key_check == required_resource_types
|
424
|
+
if not self.autolabel and not required_keys_ok:
|
425
|
+
logger.error("Running with `autolabel=False`. In this mode, "
|
426
|
+
"task resource specification requires "
|
427
|
+
"three resources to be specified simultaneously: cores, memory, and disk")
|
428
|
+
raise ExecutorError(self, "Task resource specification requires "
|
429
|
+
"three resources to be specified simultaneously: cores, memory, and disk. "
|
430
|
+
"Try setting autolabel=True if you are unsure of the resource usage")
|
431
|
+
|
432
|
+
for k in keys:
|
433
|
+
if k == 'cores':
|
434
|
+
cores = resource_specification[k]
|
435
|
+
elif k == 'memory':
|
436
|
+
memory = resource_specification[k]
|
437
|
+
elif k == 'disk':
|
438
|
+
disk = resource_specification[k]
|
439
|
+
elif k == 'gpus':
|
440
|
+
gpus = resource_specification[k]
|
441
|
+
elif k == 'priority':
|
442
|
+
priority = resource_specification[k]
|
443
|
+
elif k == 'category':
|
444
|
+
category = resource_specification[k]
|
445
|
+
elif k == 'running_time_min':
|
446
|
+
running_time_min = resource_specification[k]
|
447
|
+
|
448
|
+
self.executor_task_counter += 1
|
449
|
+
executor_task_id = self.executor_task_counter
|
450
|
+
|
451
|
+
# Create a per task directory for the function, result, map, and result files
|
452
|
+
os.mkdir(self._path_in_task(executor_task_id))
|
453
|
+
|
454
|
+
input_files = []
|
455
|
+
output_files = []
|
456
|
+
|
457
|
+
# Determine the input and output files that will exist at the workers:
|
458
|
+
input_files += [self._register_file(f) for f in kwargs.get("inputs", []) if isinstance(f, File)]
|
459
|
+
output_files += [self._register_file(f) for f in kwargs.get("outputs", []) if isinstance(f, File)]
|
460
|
+
|
461
|
+
# Also consider any *arg that looks like a file as an input:
|
462
|
+
input_files += [self._register_file(f) for f in args if isinstance(f, File)]
|
463
|
+
|
464
|
+
for kwarg, maybe_file in kwargs.items():
|
465
|
+
# Add appropriate input and output files from "stdout" and "stderr" keyword arguments
|
466
|
+
if kwarg == "stdout" or kwarg == "stderr":
|
467
|
+
if maybe_file:
|
468
|
+
output_files.append(self._std_output_to_vine(kwarg, maybe_file))
|
469
|
+
# For any other keyword that looks like a file, assume it is an input file
|
470
|
+
elif isinstance(maybe_file, File):
|
471
|
+
input_files.append(self._register_file(maybe_file))
|
472
|
+
|
473
|
+
# Create a Future object and have it be mapped from the task ID in the tasks dictionary
|
474
|
+
fu = Future()
|
475
|
+
fu.parsl_executor_task_id = executor_task_id
|
476
|
+
logger.debug("Getting tasks_lock to set vine-level task entry")
|
477
|
+
with self.tasks_lock:
|
478
|
+
logger.debug("Got tasks_lock to set vine-level task entry")
|
479
|
+
self.tasks[str(executor_task_id)] = fu
|
480
|
+
|
481
|
+
logger.debug("Creating task {} for function {} with args {}".format(executor_task_id, func, args))
|
482
|
+
|
483
|
+
# Pickle the result into object to pass into message buffer
|
484
|
+
function_file = self._path_in_task(executor_task_id, "function")
|
485
|
+
result_file = self._path_in_task(executor_task_id, "result")
|
486
|
+
map_file = self._path_in_task(executor_task_id, "map")
|
487
|
+
|
488
|
+
logger.debug("Creating Task {} with function at: {}".format(executor_task_id, function_file))
|
489
|
+
logger.debug("Creating Executor Task {} with result to be found at: {}".format(executor_task_id, result_file))
|
490
|
+
|
491
|
+
self._serialize_function(function_file, func, args, kwargs)
|
492
|
+
|
493
|
+
if self.pack:
|
494
|
+
env_pkg = self._prepare_package(func, self.extra_pkgs)
|
495
|
+
else:
|
496
|
+
env_pkg = None
|
497
|
+
|
498
|
+
if self.pack_env:
|
499
|
+
env_pkg = self.pack_env
|
500
|
+
|
501
|
+
logger.debug("Constructing map for local filenames at worker for task {}".format(executor_task_id))
|
502
|
+
self._construct_map_file(map_file, input_files, output_files)
|
503
|
+
|
504
|
+
if not self.submit_process.is_alive():
|
505
|
+
raise ExecutorError(self, "taskvine Submit Process is not alive")
|
506
|
+
|
507
|
+
# Create message to put into the message queue
|
508
|
+
logger.debug("Placing task {} on message queue".format(executor_task_id))
|
509
|
+
if category is None:
|
510
|
+
category = func.__name__ if self.autocategory else 'parsl-default'
|
511
|
+
self.task_queue.put_nowait(ParslTaskToVine(executor_task_id,
|
512
|
+
category,
|
513
|
+
cores,
|
514
|
+
memory,
|
515
|
+
disk,
|
516
|
+
gpus,
|
517
|
+
priority,
|
518
|
+
running_time_min,
|
519
|
+
env_pkg,
|
520
|
+
map_file,
|
521
|
+
function_file,
|
522
|
+
result_file,
|
523
|
+
input_files,
|
524
|
+
output_files))
|
525
|
+
|
526
|
+
return fu
|
527
|
+
|
528
|
+
def _construct_worker_command(self):
|
529
|
+
worker_command = self.worker_executable
|
530
|
+
if self.project_password_file:
|
531
|
+
worker_command += ' --password {}'.format(self.project_password_file)
|
532
|
+
if self.worker_options:
|
533
|
+
worker_command += ' {}'.format(self.worker_options)
|
534
|
+
if self.project_name:
|
535
|
+
worker_command += ' -M {}'.format(self.project_name)
|
536
|
+
else:
|
537
|
+
worker_command += ' {} {}'.format(self.address, self.port)
|
538
|
+
|
539
|
+
logger.debug("Using worker command: {}".format(worker_command))
|
540
|
+
return worker_command
|
541
|
+
|
542
|
+
def _patch_providers(self):
|
543
|
+
# Add the worker and password file to files that the provider needs to stage.
|
544
|
+
# (Currently only for the CondorProvider)
|
545
|
+
if isinstance(self.provider, CondorProvider):
|
546
|
+
path_to_worker = shutil.which('vine_worker')
|
547
|
+
self.worker_command = './' + self.worker_command
|
548
|
+
self.provider.transfer_input_files.append(path_to_worker)
|
549
|
+
if self.project_password_file:
|
550
|
+
self.provider.transfer_input_files.append(self.project_password_file)
|
551
|
+
|
552
|
+
def _serialize_function(self, fn_path, parsl_fn, parsl_fn_args, parsl_fn_kwargs):
|
553
|
+
"""Takes the function application parsl_fn(*parsl_fn_args, **parsl_fn_kwargs)
|
554
|
+
and serializes it to the file fn_path."""
|
555
|
+
|
556
|
+
# Either build a dictionary with the source of the function, or pickle
|
557
|
+
# the function directly:
|
558
|
+
if self.source:
|
559
|
+
function_info = {"source code": inspect.getsource(parsl_fn),
|
560
|
+
"name": parsl_fn.__name__,
|
561
|
+
"args": parsl_fn_args,
|
562
|
+
"kwargs": parsl_fn_kwargs}
|
563
|
+
else:
|
564
|
+
function_info = {"byte code": pack_apply_message(parsl_fn, parsl_fn_args, parsl_fn_kwargs,
|
565
|
+
buffer_threshold=1024 * 1024)}
|
566
|
+
|
567
|
+
with open(fn_path, "wb") as f_out:
|
568
|
+
pickle.dump(function_info, f_out)
|
569
|
+
|
570
|
+
def _construct_map_file(self, map_file, input_files, output_files):
|
571
|
+
""" Map local filepath of parsl files to the filenames at the execution worker.
|
572
|
+
If using a shared filesystem, the filepath is mapped to its absolute filename.
|
573
|
+
Otherwise, to its original relative filename. In this later case, taskvine
|
574
|
+
recreates any directory hierarchy needed."""
|
575
|
+
file_translation_map = {}
|
576
|
+
for spec in itertools.chain(input_files, output_files):
|
577
|
+
local_name = spec[0]
|
578
|
+
if self.shared_fs:
|
579
|
+
remote_name = os.path.abspath(local_name)
|
580
|
+
else:
|
581
|
+
remote_name = local_name
|
582
|
+
file_translation_map[local_name] = remote_name
|
583
|
+
with open(map_file, "wb") as f_out:
|
584
|
+
pickle.dump(file_translation_map, f_out)
|
585
|
+
|
586
|
+
def _register_file(self, parsl_file):
|
587
|
+
"""Generates a tuple (parsl_file.filepath, stage, cache) to give to
|
588
|
+
taskvine. cache is always True if self.use_cache is True.
|
589
|
+
Otherwise, it is set to False.
|
590
|
+
stage is True if the file needs to be copied by taskvine. (i.e., not
|
591
|
+
a URL or an absolute path)"""
|
592
|
+
|
593
|
+
to_cache = True
|
594
|
+
if not self.use_cache:
|
595
|
+
to_cache = False
|
596
|
+
|
597
|
+
to_stage = False
|
598
|
+
if parsl_file.scheme == 'file' or (parsl_file.local_path and os.path.exists(parsl_file.local_path)):
|
599
|
+
to_stage = not os.path.isabs(parsl_file.filepath)
|
600
|
+
|
601
|
+
return ParslFileToVine(parsl_file.filepath, to_stage, to_cache)
|
602
|
+
|
603
|
+
def _std_output_to_vine(self, fdname, stdfspec):
|
604
|
+
"""Find the name of the file that will contain stdout or stderr and
|
605
|
+
return a ParslFileToVine with it. These files are never cached"""
|
606
|
+
fname, mode = putils.get_std_fname_mode(fdname, stdfspec)
|
607
|
+
to_stage = not os.path.isabs(fname)
|
608
|
+
return ParslFileToVine(fname, stage=to_stage, cache=False)
|
609
|
+
|
610
|
+
def _prepare_package(self, fn, extra_pkgs):
|
611
|
+
fn_id = id(fn)
|
612
|
+
fn_name = fn.__name__
|
613
|
+
if fn_id in self.cached_envs:
|
614
|
+
logger.debug("Skipping analysis of %s, previously got %s", fn_name, self.cached_envs[fn_id])
|
615
|
+
return self.cached_envs[fn_id]
|
616
|
+
source_code = inspect.getsource(fn).encode()
|
617
|
+
pkg_dir = os.path.join(tempfile.gettempdir(), "python_package-{}".format(os.geteuid()))
|
618
|
+
os.makedirs(pkg_dir, exist_ok=True)
|
619
|
+
with tempfile.NamedTemporaryFile(suffix='.yaml') as spec:
|
620
|
+
logger.info("Analyzing dependencies of %s", fn_name)
|
621
|
+
analyze_cmdline = [package_analyze_script, exec_parsl_function.__file__, '-', spec.name]
|
622
|
+
for p in extra_pkgs:
|
623
|
+
analyze_cmdline += ["--extra-pkg", p]
|
624
|
+
subprocess.run(analyze_cmdline, input=source_code, check=True)
|
625
|
+
with open(spec.name, mode='rb') as f:
|
626
|
+
spec_hash = hashlib.sha256(f.read()).hexdigest()
|
627
|
+
logger.debug("Spec hash for %s is %s", fn_name, spec_hash)
|
628
|
+
pkg = os.path.join(pkg_dir, "pack-{}.tar.gz".format(spec_hash))
|
629
|
+
if os.access(pkg, os.R_OK):
|
630
|
+
self.cached_envs[fn_id] = pkg
|
631
|
+
logger.debug("Cached package for %s found at %s", fn_name, pkg)
|
632
|
+
return pkg
|
633
|
+
(fd, tarball) = tempfile.mkstemp(dir=pkg_dir, prefix='.tmp', suffix='.tar.gz')
|
634
|
+
os.close(fd)
|
635
|
+
logger.info("Creating dependency package for %s", fn_name)
|
636
|
+
logger.debug("Writing deps for %s to %s", fn_name, tarball)
|
637
|
+
subprocess.run([package_create_script, spec.name, tarball], stdout=subprocess.DEVNULL, check=True)
|
638
|
+
logger.debug("Done with conda-pack; moving %s to %s", tarball, pkg)
|
639
|
+
os.rename(tarball, pkg)
|
640
|
+
self.cached_envs[fn_id] = pkg
|
641
|
+
return pkg
|
642
|
+
|
643
|
+
def initialize_scaling(self):
|
644
|
+
""" Compose the launch command and call scale out
|
645
|
+
|
646
|
+
Scales the workers to the appropriate nodes with provider
|
647
|
+
"""
|
648
|
+
# Start scaling in/out
|
649
|
+
logger.debug("Starting TaskVineExecutor with provider: %s", self.provider)
|
650
|
+
self.worker_command = self._construct_worker_command()
|
651
|
+
self._patch_providers()
|
652
|
+
|
653
|
+
if hasattr(self.provider, 'init_blocks'):
|
654
|
+
try:
|
655
|
+
self.scale_out(blocks=self.provider.init_blocks)
|
656
|
+
except Exception as e:
|
657
|
+
logger.error("Initial block scaling out failed: {}".format(e))
|
658
|
+
raise e
|
659
|
+
|
660
|
+
@property
|
661
|
+
def outstanding(self) -> int:
|
662
|
+
"""Count the number of outstanding tasks. This is inefficiently
|
663
|
+
implemented and probably could be replaced with a counter.
|
664
|
+
"""
|
665
|
+
outstanding = 0
|
666
|
+
with self.tasks_lock:
|
667
|
+
for fut in self.tasks.values():
|
668
|
+
if not fut.done():
|
669
|
+
outstanding += 1
|
670
|
+
logger.debug(f"Counted {outstanding} outstanding tasks")
|
671
|
+
return outstanding
|
672
|
+
|
673
|
+
@property
|
674
|
+
def workers_per_node(self) -> Union[int, float]:
|
675
|
+
return 1
|
676
|
+
|
677
|
+
def scale_in(self, count):
|
678
|
+
"""Scale in method. Cancel a given number of blocks
|
679
|
+
"""
|
680
|
+
# Obtain list of blocks to kill
|
681
|
+
to_kill = list(self.blocks.keys())[:count]
|
682
|
+
kill_ids = [self.blocks[block] for block in to_kill]
|
683
|
+
|
684
|
+
# Cancel the blocks provisioned
|
685
|
+
if self.provider:
|
686
|
+
self.provider.cancel(kill_ids)
|
687
|
+
else:
|
688
|
+
logger.error("No execution provider available to scale")
|
689
|
+
|
690
|
+
def shutdown(self, *args, **kwargs):
|
691
|
+
"""Shutdown the executor. Sets flag to cancel the submit process and
|
692
|
+
collector thread, which shuts down the TaskVine system submission.
|
693
|
+
"""
|
694
|
+
logger.debug("TaskVine shutdown started")
|
695
|
+
self.should_stop.value = True
|
696
|
+
|
697
|
+
# Remove the workers that are still going
|
698
|
+
kill_ids = [self.blocks[block] for block in self.blocks.keys()]
|
699
|
+
if self.provider:
|
700
|
+
logger.debug("Cancelling blocks")
|
701
|
+
self.provider.cancel(kill_ids)
|
702
|
+
|
703
|
+
logger.debug("Joining on submit process")
|
704
|
+
self.submit_process.join()
|
705
|
+
logger.debug("Joining on collector thread")
|
706
|
+
self.collector_thread.join()
|
707
|
+
|
708
|
+
logger.debug("TaskVine shutdown completed")
|
709
|
+
return True
|
710
|
+
|
711
|
+
@wrap_with_logs
|
712
|
+
def _collect_taskvine_results(self):
|
713
|
+
"""Sets the values of tasks' futures of tasks completed by taskvine.
|
714
|
+
"""
|
715
|
+
logger.debug("Starting Collector Thread")
|
716
|
+
try:
|
717
|
+
while not self.should_stop.value:
|
718
|
+
if not self.submit_process.is_alive():
|
719
|
+
raise ExecutorError(self, "taskvine Submit Process is not alive")
|
720
|
+
|
721
|
+
# Get the result message from the collector_queue
|
722
|
+
try:
|
723
|
+
task_report = self.collector_queue.get(timeout=1)
|
724
|
+
except queue.Empty:
|
725
|
+
continue
|
726
|
+
|
727
|
+
# Obtain the future from the tasks dictionary
|
728
|
+
with self.tasks_lock:
|
729
|
+
future = self.tasks.pop(task_report.id)
|
730
|
+
|
731
|
+
logger.debug("Updating Future for Parsl Task {}".format(task_report.id))
|
732
|
+
if task_report.result_received:
|
733
|
+
future.set_result(task_report.result)
|
734
|
+
else:
|
735
|
+
# If there are no results, then the task failed according to one of
|
736
|
+
# taskvine modes, such as resource exhaustion.
|
737
|
+
future.set_exception(TaskVineTaskFailure(task_report.reason, task_report.result))
|
738
|
+
finally:
|
739
|
+
logger.debug("Marking all outstanding tasks as failed")
|
740
|
+
logger.debug("Acquiring tasks_lock")
|
741
|
+
with self.tasks_lock:
|
742
|
+
logger.debug("Acquired tasks_lock")
|
743
|
+
# set exception for tasks waiting for results that taskvine did not execute
|
744
|
+
for fu in self.tasks.values():
|
745
|
+
if not fu.done():
|
746
|
+
fu.set_exception(TaskVineFailure("taskvine executor failed to execute the task."))
|
747
|
+
logger.debug("Exiting Collector Thread")
|
748
|
+
|
749
|
+
|
750
|
+
@wrap_with_logs
def _taskvine_submit_wait(task_queue=multiprocessing.Queue(),
                          launch_cmd=None,
                          env=None,
                          collector_queue=multiprocessing.Queue(),
                          full=False,
                          shared_fs=False,
                          autolabel=False,
                          autolabel_window=None,
                          autocategory=True,
                          max_retries=None,
                          should_stop=None,
                          port=VINE_DEFAULT_PORT,
                          vine_log_dir=None,
                          project_password_file=None,
                          project_name=None,
                          wait_for_workers=0,
                          enable_peer_transfers=True):
    """Thread to handle Parsl app submissions to the TaskVine objects.
    Takes in Parsl functions submitted using submit(), and creates a
    TaskVine task with the appropriate specifications, which is then
    submitted to TaskVine. After tasks are completed, processes the
    exit status and exit code of the task, and sends results to the
    TaskVine collector thread.
    To avoid python's global interpreter lock with taskvine's wait, this
    function should be launched as a process, not as a lightweight thread. This
    means that any communication should be done using the multiprocessing
    module capabilities, rather than shared memory.
    """
    logger.debug("Starting TaskVine Submit/Wait Process")
    setproctitle("parsl: TaskVine submit/wait")

    # Enable debugging flags and create logging file
    if vine_log_dir is not None:
        logger.debug("Setting debugging flags and creating logging file at {}".format(vine_log_dir))

    # Create TaskVine queue object
    logger.debug("Creating TaskVine Object")
    try:
        logger.debug("Listening on port {}".format(port))
        m = Manager(port=port,
                    name=project_name,
                    run_info_path=vine_log_dir)
    except Exception as e:
        logger.error("Unable to create TaskVine object: {}".format(e))
        raise e

    # Specify TaskVine queue attributes
    if project_password_file:
        m.set_password_file(project_password_file)

    if autolabel:
        m.enable_monitoring()
        if autolabel_window is not None:
            m.tune('category-steady-n-tasks', autolabel_window)

    if wait_for_workers:
        m.tune("wait-for-workers", wait_for_workers)

    if enable_peer_transfers:
        m.enable_peer_transfers()

    # Only write logs when the vine_log_dir is specified, which it most likely will be
    # if vine_log_dir is not None:
    if full and autolabel:
        m.enable_monitoring_full(dirname=vine_log_dir)

    orig_ppid = os.getppid()

    result_file_of_task_id = {}  # Mapping executor task id -> result file for active tasks.

    poncho_env_to_file = {}  # Mapping poncho_env file to File object in TaskVine

    # Mapping of parsl local file name to TaskVine File object
    # dict[str] -> vine File object
    parsl_file_name_to_vine_file = {}

    # Declare helper scripts as cache-able and peer-transferable
    package_run_script_file = m.declare_file(package_run_script, cache=True, peer_transfer=True)
    exec_parsl_function_file = m.declare_file(exec_parsl_function.__file__, cache=True, peer_transfer=True)

    # Mapping of tasks from vine id to parsl id
    # Dict[str] -> str
    vine_id_to_executor_task_id = {}

    while not should_stop.value:
        # Monitor the task queue
        ppid = os.getppid()
        if ppid != orig_ppid:
            logger.debug("new Process")
            break

        # Submit tasks
        while task_queue.qsize() > 0 and not should_stop.value:
            # Obtain task from task_queue
            try:
                task = task_queue.get(timeout=1)
                logger.debug("Removing executor task from queue")
            except queue.Empty:
                continue

            # Create command string
            command_str = launch_cmd.format(mapping=os.path.basename(task.map_file),
                                            function=os.path.basename(task.function_file),
                                            result=os.path.basename(task.result_file))

            # Create TaskVine task for the command
            logger.debug("Sending executor task {} with command: {}".format(task.id, command_str))
            try:
                t = Task(command_str)
            except Exception as e:
                logger.error("Unable to create executor task: {}".format(e))
                collector_queue.put_nowait(VineTaskToParsl(id=task.id,
                                                           result_received=False,
                                                           result=None,
                                                           reason="task could not be created by taskvine",
                                                           status=-1))
                continue

            poncho_env_file = None
            if task.env_pkg is not None:
                if task.env_pkg not in poncho_env_to_file:
                    poncho_env_file = m.declare_poncho(task.env_pkg, cache=True, peer_transfer=True)
                    poncho_env_to_file[task.env_pkg] = poncho_env_file
                else:
                    poncho_env_file = poncho_env_to_file[task.env_pkg]

            if poncho_env_file is not None:
                t.add_environment(poncho_env_file)
                t.add_input(package_run_script_file, "poncho_package_run")

            t.set_category(task.category)
            if autolabel:
                m.set_category_mode(task.category, VINE_ALLOCATION_MODE_MAX_THROUGHPUT)

            if task.cores is not None:
                t.set_cores(task.cores)
            if task.memory is not None:
                t.set_memory(task.memory)
            if task.disk is not None:
                t.set_disk(task.disk)
            if task.gpus is not None:
                t.set_gpus(task.gpus)
            if task.priority is not None:
                t.set_priority(task.priority)
            if task.running_time_min is not None:
                t.set_time_min(task.running_time_min)

            if max_retries is not None:
                logger.debug(f"Specifying max_retries {max_retries}")
                t.set_retries(max_retries)
            else:
                logger.debug("Not specifying max_retries")

            # Specify environment variables for the task
            if env is not None:
                for var in env:
                    t.set_env_var(str(var), str(env[var]))

            # Add helper function that execute parsl functions on remote nodes
            t.add_input(exec_parsl_function_file, "exec_parsl_function.py")

            # Declare and add task-specific function, data, and result files to task
            task_function_file = m.declare_file(task.function_file, cache=False, peer_transfer=False)
            t.add_input(task_function_file, "function")

            task_map_file = m.declare_file(task.map_file, cache=False, peer_transfer=False)
            t.add_input(task_map_file, "map")

            task_result_file = m.declare_file(task.result_file, cache=False, peer_transfer=False)
            t.add_output(task_result_file, "result")

            result_file_of_task_id[str(task.id)] = task.result_file

            logger.debug("Executor task id: {}".format(task.id))

            # Specify input/output files that need to be staged.
            # Absolute paths are assumed to be in shared filesystem, and thus
            # not staged by taskvine.
            # Files that share the same local path are assumed to be the same
            # and thus use the same Vine File object if detected.
            if not shared_fs:
                for spec in task.input_files:
                    if spec.stage:
                        if spec.parsl_name in parsl_file_name_to_vine_file:
                            task_in_file = parsl_file_name_to_vine_file[spec.parsl_name]
                        else:
                            task_in_file = m.declare_file(spec.parsl_name, cache=spec.cache, peer_transfer=True)
                            parsl_file_name_to_vine_file[spec.parsl_name] = task_in_file
                        t.add_input(task_in_file, spec.parsl_name)

                for spec in task.output_files:
                    if spec.stage:
                        if spec.parsl_name in parsl_file_name_to_vine_file:
                            task_out_file = parsl_file_name_to_vine_file[spec.parsl_name]
                        else:
                            task_out_file = m.declare_file(spec.parsl_name, cache=spec.cache, peer_transfer=True)
                        t.add_output(task_out_file, spec.parsl_name)

            # Submit the task to the TaskVine object
            logger.debug("Submitting executor task {} to TaskVine".format(task.id))
            try:
                vine_id = m.submit(t)
                vine_id_to_executor_task_id[str(vine_id)] = str(task.id)
            except Exception as e:
                logger.error("Unable to submit task to taskvine: {}".format(e))
                collector_queue.put_nowait(VineTaskToParsl(id=task.id,
                                                           result_received=False,
                                                           result=None,
                                                           reason="task could not be submited to taskvine",
                                                           status=-1))
                continue
            logger.info("Executor task {} submitted as TaskVine task with id {}".format(task.id, vine_id))

        # If the queue is not empty wait on the TaskVine queue for a task
        task_found = True
        if not m.empty():
            while task_found and not should_stop.value:
                # Obtain the task from the queue
                t = m.wait(1)
                if t is None:
                    task_found = False
                    continue
                # When a task is found:
                executor_task_id = vine_id_to_executor_task_id[str(t.id)]
                logger.debug("Completed TaskVine task {}, executor task {}".format(t.id, executor_task_id))
                result_file = result_file_of_task_id.pop(executor_task_id)
                vine_id_to_executor_task_id.pop(str(t.id))

                logger.debug(f"completed executor task info: {executor_task_id}, {t.category}, {t.command}, {t.std_output}")

                # A tasks completes 'succesfully' if it has result file,
                # and it can be loaded. This may mean that the 'success' is
                # an exception.
                logger.debug("Looking for result in {}".format(result_file))
                try:
                    with open(result_file, "rb") as f_in:
                        result = pickle.load(f_in)
                    logger.debug("Found result in {}".format(result_file))
                    collector_queue.put_nowait(VineTaskToParsl(id=executor_task_id,
                                                               result_received=True,
                                                               result=result,
                                                               reason=None,
                                                               status=t.exit_code))
                # If a result file could not be generated, explain the
                # failure according to taskvine error codes. We generate
                # an exception and wrap it with RemoteExceptionWrapper, to
                # match the positive case.
                except Exception as e:
                    reason = _explain_taskvine_result(t)
                    logger.debug("Did not find result in {}".format(result_file))
                    logger.debug("Wrapper Script status: {}\nTaskVine Status: {}"
                                 .format(t.exit_code, t.result))
                    logger.debug("Task with executor id {} / vine id {} failed because:\n{}"
                                 .format(executor_task_id, t.id, reason))
                    collector_queue.put_nowait(VineTaskToParsl(id=executor_task_id,
                                                               result_received=False,
                                                               result=e,
                                                               reason=reason,
                                                               status=t.exit_code))

    logger.debug("Exiting TaskVine Monitoring Process")
    return 0


def _explain_taskvine_result(vine_task):
    """Returns a string with the reason why a task failed according to taskvine."""

    vine_result = vine_task.result

    reason = "taskvine result: "
    if vine_result == cvine.VINE_RESULT_SUCCESS:
        reason += "succesful execution with exit code {}".format(vine_task.return_status)
    elif vine_result == cvine.VINE_RESULT_OUTPUT_MISSING:
        reason += "The result file was not transfered from the worker.\n"
        reason += "This usually means that there is a problem with the python setup,\n"
        reason += "or the wrapper that executes the function."
        reason += "\nTrace:\n" + str(vine_task.output)
    elif vine_result == cvine.VINE_RESULT_INPUT_MISSING:
        reason += "missing input file"
    elif vine_result == cvine.VINE_RESULT_STDOUT_MISSING:
        reason += "stdout has been truncated"
    elif vine_result == cvine.VINE_RESULT_SIGNAL:
        reason += "task terminated with a signal"
    elif vine_result == cvine.VINE_RESULT_RESOURCE_EXHAUSTION:
        reason += "task used more resources than requested"
    elif vine_result == cvine.VINE_RESULT_MAX_END_TIME:
        reason += "task ran past the specified end time"
    elif vine_result == cvine.VINE_RESULT_UNKNOWN:
        reason += "result could not be classified"
    elif vine_result == cvine.VINE_RESULT_FORSAKEN:
        reason += "task failed, but not a task error"
    elif vine_result == cvine.VINE_RESULT_MAX_RETRIES:
        reason += "unable to complete after specified number of retries"
    elif vine_result == cvine.VINE_RESULT_MAX_WALL_TIME:
        reason += "task ran for more than the specified time"
    elif vine_result == cvine.VINE_RESULT_RMONITOR_ERROR:
        reason += "task failed because the monitor did not produce an output"
    elif vine_result == cvine.VINE_RESULT_OUTPUT_TRANSFER_ERROR:
        reason += "task failed because output transfer fails"
    elif vine_result == cvine.VINE_RESULT_FIXED_LOCATION_MISSING:
        reason += "task failed because no worker could satisfy the fixed \n"
        reason += "location input file requirements"
    else:
        reason += "unable to process TaskVine system failure"
    return reason
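For orientation, the block below is a minimal usage sketch, not part of the diff, showing how the new executor could be wired into a Parsl configuration. It assumes TaskVineExecutor is exported from parsl.executors.taskvine (the package introduced by the new __init__.py listed above) and that the ndcctools taskvine module is installed; parameter names follow the class docstring, and the port value is an arbitrary example. The upstream examples added in this release are parsl/configs/vineex_local.py and parsl/tests/configs/taskvine_ex.py.

# Hypothetical sketch, not taken from the diff: run Parsl apps through TaskVine.
import parsl
from parsl.config import Config
from parsl.executors.taskvine import TaskVineExecutor  # assumed export path

config = Config(
    executors=[
        TaskVineExecutor(
            label="TaskVineExecutor",
            port=9123,        # example fixed port; 0 lets TaskVine choose (VINE_DEFAULT_PORT)
            shared_fs=False,  # let TaskVine stage files to workers instead of assuming a shared FS
            use_cache=True,   # allow workers to cache staged task files
        )
    ]
)

parsl.load(config)
# Remote workers then connect with:  vine_worker <address> <port>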