parsl 2025.1.13__py3-none-any.whl → 2025.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/configs/gc_multisite.py +27 -0
- parsl/configs/gc_tutorial.py +18 -0
- parsl/dataflow/dflow.py +23 -103
- parsl/dataflow/errors.py +60 -18
- parsl/dataflow/memoization.py +76 -2
- parsl/dataflow/taskrecord.py +1 -3
- parsl/executors/__init__.py +3 -1
- parsl/executors/globus_compute.py +125 -0
- parsl/executors/high_throughput/errors.py +1 -1
- parsl/executors/high_throughput/executor.py +16 -15
- parsl/executors/high_throughput/interchange.py +74 -96
- parsl/executors/high_throughput/zmq_pipes.py +0 -1
- parsl/tests/configs/globus_compute.py +20 -0
- parsl/tests/conftest.py +4 -0
- parsl/tests/test_checkpointing/test_python_checkpoint_1.py +0 -3
- parsl/tests/test_error_handling/test_resource_spec.py +3 -0
- parsl/tests/test_htex/test_interchange_exit_bad_registration.py +120 -0
- parsl/tests/test_htex/test_resource_spec_validation.py +0 -7
- parsl/tests/test_python_apps/test_dep_standard_futures.py +3 -0
- parsl/tests/test_python_apps/test_fail.py +23 -8
- parsl/tests/test_python_apps/test_join.py +6 -0
- parsl/tests/test_python_apps/test_memoize_1.py +0 -1
- parsl/tests/unit/test_globus_compute_executor.py +104 -0
- parsl/usage_tracking/usage.py +13 -8
- parsl/version.py +1 -1
- {parsl-2025.1.13.data → parsl-2025.1.27.data}/scripts/interchange.py +74 -96
- {parsl-2025.1.13.dist-info → parsl-2025.1.27.dist-info}/METADATA +5 -2
- {parsl-2025.1.13.dist-info → parsl-2025.1.27.dist-info}/RECORD +35 -30
- parsl/tests/test_checkpointing/test_python_checkpoint_3.py +0 -42
- {parsl-2025.1.13.data → parsl-2025.1.27.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2025.1.13.data → parsl-2025.1.27.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2025.1.13.data → parsl-2025.1.27.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2025.1.13.dist-info → parsl-2025.1.27.dist-info}/LICENSE +0 -0
- {parsl-2025.1.13.dist-info → parsl-2025.1.27.dist-info}/WHEEL +0 -0
- {parsl-2025.1.13.dist-info → parsl-2025.1.27.dist-info}/entry_points.txt +0 -0
- {parsl-2025.1.13.dist-info → parsl-2025.1.27.dist-info}/top_level.txt +0 -0

parsl/executors/high_throughput/executor.py CHANGED

@@ -8,7 +8,7 @@ import warnings
 from collections import defaultdict
 from concurrent.futures import Future
 from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
+from typing import Callable, Dict, List, Optional, Sequence, Set, Tuple, Union
 
 import typeguard
 
@@ -357,10 +357,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         return self.logdir
 
     def validate_resource_spec(self, resource_specification: dict):
-        """HTEX supports the following *Optional* resource specifications:
-        priority: lower value is higher priority"""
         if resource_specification:
-            acceptable_fields =
+            acceptable_fields: Set[str] = set()  # add new resource spec field names here to make htex accept them
             keys = set(resource_specification.keys())
            invalid_keys = keys - acceptable_fields
            if invalid_keys:
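For context, not part of the diff itself: with acceptable_fields now an empty set, validate_resource_spec rejects every key supplied in a non-empty resource specification, including the previously documented priority field. A minimal sketch of that behaviour, assuming only what this hunk shows; the exact exception type is not named here, so it is caught generically:

    from parsl.executors import HighThroughputExecutor

    htex = HighThroughputExecutor()

    # An empty resource specification still validates (the method returns None)...
    assert htex.validate_resource_spec({}) is None

    # ...but no field names are acceptable any more, so even the previously
    # supported "priority" key is now rejected.
    try:
        htex.validate_resource_spec({"priority": 2})
    except Exception as exc:  # parsl raises an executor-specific validation error here
        print(f"rejected: {exc}")
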
@@ -433,8 +431,6 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         self._start_result_queue_thread()
         self._start_local_interchange_process()
 
-        logger.debug("Created result queue thread: %s", self._result_queue_thread)
-
         self.initialize_scaling()
 
     @wrap_with_logs
@@ -531,6 +527,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         get the worker task and result ports that the interchange has bound to.
         """
 
+        assert self.interchange_proc is None, f"Already exists! {self.interchange_proc!r}"
+
         interchange_config = {"client_address": self.loopback_address,
                               "client_ports": (self.outgoing_q.port,
                                                self.incoming_q.port,
@@ -565,7 +563,12 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         except CommandClientTimeoutError:
             logger.error("Interchange has not completed initialization. Aborting")
             raise Exception("Interchange failed to start")
-        logger.debug(
+        logger.debug(
+            "Interchange process started (%r). Worker ports: %d, %d",
+            self.interchange_proc,
+            self.worker_task_port,
+            self.worker_result_port
+        )
 
     def _start_result_queue_thread(self):
         """Method to start the result queue thread as a daemon.
@@ -573,15 +576,13 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
         Checks if a thread already exists, then starts it.
         Could be used later as a restart if the result queue thread dies.
         """
-
-        logger.debug("Starting result queue thread")
-        self._result_queue_thread = threading.Thread(target=self._result_queue_worker, name="HTEX-Result-Queue-Thread")
-        self._result_queue_thread.daemon = True
-        self._result_queue_thread.start()
-        logger.debug("Started result queue thread")
+        assert self._result_queue_thread is None, f"Already exists! {self._result_queue_thread!r}"
 
-
-
+        logger.debug("Starting result queue thread")
+        self._result_queue_thread = threading.Thread(target=self._result_queue_worker, name="HTEX-Result-Queue-Thread")
+        self._result_queue_thread.daemon = True
+        self._result_queue_thread.start()
+        logger.debug("Started result queue thread: %r", self._result_queue_thread)
 
     def hold_worker(self, worker_id: str) -> None:
         """Puts a worker on hold, preventing scheduling of additional tasks to it.

parsl/executors/high_throughput/interchange.py CHANGED (the packaged scripts/interchange.py carries the identical +74 -96 change)

@@ -9,7 +9,7 @@ import queue
 import sys
 import threading
 import time
-from typing import Any, Dict, List,
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, cast
 
 import zmq
 
@@ -132,6 +132,11 @@ class Interchange:
         self.hub_zmq_port = hub_zmq_port
 
         self.pending_task_queue: queue.Queue[Any] = queue.Queue(maxsize=10 ** 6)
+
+        # count of tasks that have been received from the submit side
+        self.task_counter = 0
+
+        # count of tasks that have been sent out to worker pools
         self.count = 0
 
         self.worker_ports = worker_ports
@@ -201,28 +206,6 @@ class Interchange:
 
         return tasks
 
-    @wrap_with_logs(target="interchange")
-    def task_puller(self) -> NoReturn:
-        """Pull tasks from the incoming tasks zmq pipe onto the internal
-        pending task queue
-        """
-        logger.info("Starting")
-        task_counter = 0
-
-        while True:
-            logger.debug("launching recv_pyobj")
-            try:
-                msg = self.task_incoming.recv_pyobj()
-            except zmq.Again:
-                # We just timed out while attempting to receive
-                logger.debug("zmq.Again with {} tasks in internal queue".format(self.pending_task_queue.qsize()))
-                continue
-
-            logger.debug("putting message onto pending_task_queue")
-            self.pending_task_queue.put(msg)
-            task_counter += 1
-            logger.debug(f"Fetched {task_counter} tasks so far")
-
     def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender], manager: ManagerRecord) -> None:
         if monitoring_radio:
             logger.info("Sending message {} to MonitoringHub".format(manager))
@@ -234,79 +217,68 @@ class Interchange:
 
             monitoring_radio.send((MessageType.NODE_INFO, d))
 
-
-    def _command_server(self) -> NoReturn:
+    def process_command(self, monitoring_radio: Optional[MonitoringRadioSender]) -> None:
         """ Command server to run async command to the interchange
         """
-        logger.debug("
-
-        if self.hub_address is not None and self.hub_zmq_port is not None:
-            logger.debug("Creating monitoring radio to %s:%s", self.hub_address, self.hub_zmq_port)
-            monitoring_radio = ZMQRadioSender(self.hub_address, self.hub_zmq_port)
-        else:
-            monitoring_radio = None
+        logger.debug("entering command_server section")
 
         reply: Any  # the type of reply depends on the command_req received (aka this needs dependent types...)
 
 [20 removed lines (old lines 251-270) appear blank in this rendering]
-                else:
-                    idle_duration = 0.0
-                resp = {'manager': manager_id.decode('utf-8'),
-                        'block_id': m['block_id'],
-                        'worker_count': m['worker_count'],
-                        'tasks': len(m['tasks']),
-                        'idle_duration': idle_duration,
-                        'active': m['active'],
-                        'parsl_version': m['parsl_version'],
-                        'python_version': m['python_version'],
-                        'draining': m['draining']}
-                reply.append(resp)
-
-            elif command_req.startswith("HOLD_WORKER"):
-                cmd, s_manager = command_req.split(';')
-                manager_id = s_manager.encode('utf-8')
-                logger.info("Received HOLD_WORKER for {!r}".format(manager_id))
-                if manager_id in self._ready_managers:
-                    m = self._ready_managers[manager_id]
-                    m['active'] = False
-                    self._send_monitoring_info(monitoring_radio, m)
+        if self.command_channel in self.socks and self.socks[self.command_channel] == zmq.POLLIN:
+
+            command_req = self.command_channel.recv_pyobj()
+            logger.debug("Received command request: {}".format(command_req))
+            if command_req == "CONNECTED_BLOCKS":
+                reply = self.connected_block_history
+
+            elif command_req == "WORKERS":
+                num_workers = 0
+                for manager in self._ready_managers.values():
+                    num_workers += manager['worker_count']
+                reply = num_workers
+
+            elif command_req == "MANAGERS":
+                reply = []
+                for manager_id in self._ready_managers:
+                    m = self._ready_managers[manager_id]
+                    idle_since = m['idle_since']
+                    if idle_since is not None:
+                        idle_duration = time.time() - idle_since
                     else:
-
-
-
+                        idle_duration = 0.0
+                    resp = {'manager': manager_id.decode('utf-8'),
+                            'block_id': m['block_id'],
+                            'worker_count': m['worker_count'],
+                            'tasks': len(m['tasks']),
+                            'idle_duration': idle_duration,
+                            'active': m['active'],
+                            'parsl_version': m['parsl_version'],
+                            'python_version': m['python_version'],
+                            'draining': m['draining']}
+                    reply.append(resp)
+
+            elif command_req.startswith("HOLD_WORKER"):
+                cmd, s_manager = command_req.split(';')
+                manager_id = s_manager.encode('utf-8')
+                logger.info("Received HOLD_WORKER for {!r}".format(manager_id))
+                if manager_id in self._ready_managers:
+                    m = self._ready_managers[manager_id]
+                    m['active'] = False
+                    self._send_monitoring_info(monitoring_radio, m)
+                else:
+                    logger.warning("Worker to hold was not in ready managers list")
 
-
-                reply = (self.worker_task_port, self.worker_result_port)
+                reply = None
 
-
-
-                reply = None
+            elif command_req == "WORKER_PORTS":
+                reply = (self.worker_task_port, self.worker_result_port)
 
-
-
+            else:
+                logger.error(f"Received unknown command: {command_req}")
+                reply = None
 
-
-
-                continue
+            logger.debug("Reply: {}".format(reply))
+            self.command_channel.send_pyobj(reply)
 
     @wrap_with_logs
     def start(self) -> None:
@@ -326,21 +298,13 @@ class Interchange:
 
         start = time.time()
 
-        self._task_puller_thread = threading.Thread(target=self.task_puller,
-                                                    name="Interchange-Task-Puller",
-                                                    daemon=True)
-        self._task_puller_thread.start()
-
-        self._command_thread = threading.Thread(target=self._command_server,
-                                                name="Interchange-Command",
-                                                daemon=True)
-        self._command_thread.start()
-
         kill_event = threading.Event()
 
         poller = zmq.Poller()
         poller.register(self.task_outgoing, zmq.POLLIN)
         poller.register(self.results_incoming, zmq.POLLIN)
+        poller.register(self.task_incoming, zmq.POLLIN)
+        poller.register(self.command_channel, zmq.POLLIN)
 
         # These are managers which we should examine in an iteration
         # for scheduling a job (or maybe any other attention?).
@@ -351,6 +315,8 @@ class Interchange:
         while not kill_event.is_set():
             self.socks = dict(poller.poll(timeout=poll_period))
 
+            self.process_command(monitoring_radio)
+            self.process_task_incoming()
             self.process_task_outgoing_incoming(interesting_managers, monitoring_radio, kill_event)
             self.process_results_incoming(interesting_managers, monitoring_radio)
             self.expire_bad_managers(interesting_managers, monitoring_radio)
@@ -362,6 +328,18 @@ class Interchange:
         logger.info(f"Processed {self.count} tasks in {delta} seconds")
         logger.warning("Exiting")
 
+    def process_task_incoming(self) -> None:
+        """Process incoming task message(s).
+        """
+
+        if self.task_incoming in self.socks and self.socks[self.task_incoming] == zmq.POLLIN:
+            logger.debug("start task_incoming section")
+            msg = self.task_incoming.recv_pyobj()
+            logger.debug("putting message onto pending_task_queue")
+            self.pending_task_queue.put(msg)
+            self.task_counter += 1
+            logger.debug(f"Fetched {self.task_counter} tasks so far")
+
     def process_task_outgoing_incoming(
             self,
             interesting_managers: Set[bytes],
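Taken together, these interchange hunks replace the dedicated "Interchange-Task-Puller" and "Interchange-Command" threads with per-socket handlers (process_task_incoming, process_command) driven from the existing single poll loop, which now also registers task_incoming and command_channel with the poller. As a standalone illustration of that single-threaded poll-and-dispatch pattern (this is not parsl code; socket types, addresses, and handler bodies are made up):

    import zmq

    context = zmq.Context()
    task_incoming = context.socket(zmq.DEALER)    # illustrative stand-ins for the
    command_channel = context.socket(zmq.REP)     # interchange's real sockets
    task_incoming.bind("tcp://127.0.0.1:5555")
    command_channel.bind("tcp://127.0.0.1:5556")

    # One poller, one thread: register every socket of interest...
    poller = zmq.Poller()
    poller.register(task_incoming, zmq.POLLIN)
    poller.register(command_channel, zmq.POLLIN)

    while True:
        socks = dict(poller.poll(timeout=10))  # poll period in milliseconds

        # ...then dispatch to a handler only for the sockets that are readable.
        if socks.get(task_incoming) == zmq.POLLIN:
            task = task_incoming.recv_pyobj()
            # queue the task for a worker pool here

        if socks.get(command_channel) == zmq.POLLIN:
            request = command_channel.recv_pyobj()
            command_channel.send_pyobj(None)  # every command gets a reply
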

parsl/executors/high_throughput/zmq_pipes.py CHANGED

@@ -213,7 +213,6 @@ class ResultsIncoming:
         """Get a message from the queue, returning None if timeout expires
         without a message. timeout is measured in milliseconds.
         """
-        logger.debug("Waiting for ResultsIncoming message")
         socks = dict(self.poller.poll(timeout=timeout_ms))
         if self.results_receiver in socks and socks[self.results_receiver] == zmq.POLLIN:
             m = self.results_receiver.recv_multipart()

parsl/tests/configs/globus_compute.py ADDED

@@ -0,0 +1,20 @@
+import os
+
+from globus_compute_sdk import Executor
+
+from parsl.config import Config
+from parsl.executors import GlobusComputeExecutor
+
+
+def fresh_config():
+
+    endpoint_id = os.environ["GLOBUS_COMPUTE_ENDPOINT"]
+
+    return Config(
+        executors=[
+            GlobusComputeExecutor(
+                executor=Executor(endpoint_id=endpoint_id),
+                label="globus_compute",
+            )
+        ]
+    )
parsl/tests/conftest.py CHANGED

@@ -163,6 +163,10 @@ def pytest_configure(config):
         'markers',
         'shared_fs: Marks tests that require a shared_fs between the workers are the test client'
     )
+    config.addinivalue_line(
+        'markers',
+        'issue_3620: Marks tests that do not work correctly on GlobusComputeExecutor (ref: issue 3620)'
+    )
 
 
 @pytest.fixture(autouse=True, scope='session')

parsl/tests/test_checkpointing/test_python_checkpoint_1.py CHANGED

@@ -27,8 +27,5 @@ def test_initial_checkpoint_write():
 
     cpt_dir = parsl.dfk().checkpoint()
 
-    cptpath = cpt_dir + '/dfk.pkl'
-    assert os.path.exists(cptpath), f"DFK checkpoint missing: {cptpath}"
-
     cptpath = cpt_dir + '/tasks.pkl'
     assert os.path.exists(cptpath), f"Tasks checkpoint missing: {cptpath}"

parsl/tests/test_error_handling/test_resource_spec.py CHANGED

@@ -1,3 +1,5 @@
+import pytest
+
 import parsl
 from parsl.app.app import python_app
 from parsl.executors import WorkQueueExecutor
@@ -11,6 +13,7 @@ def double(x, parsl_resource_specification={}):
     return x * 2
 
 
+@pytest.mark.issue_3620
 def test_resource(n=2):
     executors = parsl.dfk().executors
     executor = None

parsl/tests/test_htex/test_interchange_exit_bad_registration.py ADDED

@@ -0,0 +1,120 @@
+import json
+import logging
+import os
+import pickle
+import platform
+import subprocess
+import time
+
+import psutil
+import pytest
+import zmq
+
+import parsl.executors.high_throughput.zmq_pipes as zmq_pipes
+from parsl.executors.high_throughput.executor import DEFAULT_INTERCHANGE_LAUNCH_CMD
+from parsl.executors.high_throughput.manager_selector import RandomManagerSelector
+from parsl.version import VERSION as PARSL_VERSION
+
+P_ms = 10
+
+
+@pytest.mark.local
+def test_exit_with_bad_registration(tmpd_cwd, try_assert):
+    """Test that the interchange exits when it receives a bad registration message.
+    This complements parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py
+    which tests that the interchange is resistent to other forms of bad message.
+    """
+
+    outgoing_q = zmq_pipes.TasksOutgoing(
+        "127.0.0.1", (49152, 65535), None
+    )
+    incoming_q = zmq_pipes.ResultsIncoming(
+        "127.0.0.1", (49152, 65535), None
+    )
+    command_client = zmq_pipes.CommandClient(
+        "127.0.0.1", (49152, 65535), None
+    )
+
+    interchange_config = {"client_address": "127.0.0.1",
+                          "client_ports": (outgoing_q.port,
+                                           incoming_q.port,
+                                           command_client.port),
+                          "interchange_address": "127.0.0.1",
+                          "worker_ports": None,
+                          "worker_port_range": (50000, 60000),
+                          "hub_address": None,
+                          "hub_zmq_port": None,
+                          "logdir": tmpd_cwd,
+                          "heartbeat_threshold": 120,
+                          "poll_period": P_ms,
+                          "logging_level": logging.DEBUG,
+                          "cert_dir": None,
+                          "manager_selector": RandomManagerSelector(),
+                          "run_id": "test"
+                          }
+
+    config_pickle = pickle.dumps(interchange_config)
+
+    interchange_proc = subprocess.Popen(DEFAULT_INTERCHANGE_LAUNCH_CMD, stdin=subprocess.PIPE)
+    stdin = interchange_proc.stdin
+    assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode"
+
+    stdin.write(config_pickle)
+    stdin.flush()
+    stdin.close()
+
+    # wait for interchange to be alive, by waiting for the command thread to become
+    # responsive. if the interchange process didn't start enough to get the command
+    # thread running, this will time out.
+
+    (task_port, result_port) = command_client.run("WORKER_PORTS", timeout_s=120)
+
+    # now we'll assume that if the interchange command thread is responding,
+    # then the worker polling code is also running and that the interchange has
+    # started successfully.
+
+    # send bad registration message as if from a new worker pool. The badness here
+    # is that the Python version does not match the real Python version - which
+    # unlike some other bad interchange messages, should cause the interchange
+    # to shut down.
+
+    msg = {'type': 'registration',
+           'parsl_v': PARSL_VERSION,
+           'python_v': "{}.{}.{}".format(1, 1, 1),  # this is the bad bit
+           'worker_count': 1,
+           'uid': 'testuid',
+           'block_id': 0,
+           'start_time': time.time(),
+           'prefetch_capacity': 0,
+           'max_capacity': 1,
+           'os': platform.system(),
+           'hostname': platform.node(),
+           'dir': os.getcwd(),
+           'cpu_count': psutil.cpu_count(logical=False),
+           'total_memory': psutil.virtual_memory().total,
+           }
+
+    # connect to worker port and send this message.
+
+    context = zmq.Context()
+    channel_timeout = 10000  # in milliseconds
+    task_channel = context.socket(zmq.DEALER)
+    task_channel.setsockopt(zmq.LINGER, 0)
+    task_channel.setsockopt(zmq.IDENTITY, b'testid')
+
+    task_channel.set_hwm(0)
+    task_channel.setsockopt(zmq.SNDTIMEO, channel_timeout)
+    task_channel.connect(f"tcp://127.0.0.1:{task_port}")
+
+    b_msg = json.dumps(msg).encode('utf-8')
+
+    task_channel.send(b_msg)
+
+    # check that the interchange exits within some reasonable time
+    try_assert(lambda: interchange_proc.poll() is not None, "Interchange did not exit after killing watched client process", timeout_ms=5000)
+
+    # See issue #3697 - ideally the interchange would exit cleanly, but it does not.
+    # assert interchange_proc.poll() == 0, "Interchange exited with an error code, not 0"
+
+    task_channel.close()
+    context.term()

parsl/tests/test_htex/test_resource_spec_validation.py CHANGED

@@ -30,13 +30,6 @@ def test_resource_spec_validation():
     assert ret_val is None
 
 
-@pytest.mark.local
-def test_resource_spec_validation_one_key():
-    htex = HighThroughputExecutor()
-    ret_val = htex.validate_resource_spec({"priority": 2})
-    assert ret_val is None
-
-
 @pytest.mark.local
 def test_resource_spec_validation_bad_keys():
     htex = HighThroughputExecutor()

parsl/tests/test_python_apps/test_dep_standard_futures.py CHANGED

@@ -43,3 +43,6 @@ def test_future_fail_dependency():
     # Future, plain_fut, somewhere in its str
 
     assert repr(plain_fut) in str(ex)
+    assert len(ex.dependent_exceptions_tids) == 1
+    assert isinstance(ex.dependent_exceptions_tids[0][0], ValueError)
+    assert ex.dependent_exceptions_tids[0][1].startswith("<Future ")

parsl/tests/test_python_apps/test_fail.py CHANGED

@@ -27,17 +27,32 @@ def test_no_deps():
         pass
 
 
-
-
-
+def test_fail_sequence_first():
+    t1 = random_fail(fail_prob=1)
+    t2 = random_fail(fail_prob=0, inputs=[t1])
+    t_final = random_fail(fail_prob=0, inputs=[t2])
 
-
-
+    with pytest.raises(DependencyError):
+        t_final.result()
+
+    assert len(t_final.exception().dependent_exceptions_tids) == 1
+    assert isinstance(t_final.exception().dependent_exceptions_tids[0][0], DependencyError)
+    assert t_final.exception().dependent_exceptions_tids[0][1].startswith("task ")
 
-
-
-
+    assert hasattr(t_final.exception(), '__cause__')
+    assert t_final.exception().__cause__ == t1.exception()
+
+
+def test_fail_sequence_middle():
+    t1 = random_fail(fail_prob=0)
+    t2 = random_fail(fail_prob=1, inputs=[t1])
     t_final = random_fail(fail_prob=0, inputs=[t2])
 
     with pytest.raises(DependencyError):
         t_final.result()
+
+    assert len(t_final.exception().dependent_exceptions_tids) == 1
+    assert isinstance(t_final.exception().dependent_exceptions_tids[0][0], ManufacturedTestFailure)
+
+    assert hasattr(t_final.exception(), '__cause__')
+    assert t_final.exception().__cause__ == t2.exception()

parsl/tests/test_python_apps/test_join.py CHANGED

@@ -97,7 +97,10 @@ def test_error():
     f = outer_error()
     e = f.exception()
     assert isinstance(e, JoinError)
+
+    assert len(e.dependent_exceptions_tids) == 1
     assert isinstance(e.dependent_exceptions_tids[0][0], InnerError)
+    assert e.dependent_exceptions_tids[0][1].startswith("task ")
 
 
 def test_two_errors():
@@ -109,10 +112,12 @@ def test_two_errors():
     de0 = e.dependent_exceptions_tids[0][0]
     assert isinstance(de0, InnerError)
     assert de0.args[0] == "Error A"
+    assert e.dependent_exceptions_tids[0][1].startswith("task ")
 
     de1 = e.dependent_exceptions_tids[1][0]
     assert isinstance(de1, InnerError)
     assert de1.args[0] == "Error B"
+    assert e.dependent_exceptions_tids[1][1].startswith("task ")
 
 
 def test_one_error_one_result():
@@ -125,6 +130,7 @@ def test_one_error_one_result():
     de0 = e.dependent_exceptions_tids[0][0]
     assert isinstance(de0, InnerError)
     assert de0.args[0] == "Error A"
+    assert e.dependent_exceptions_tids[0][1].startswith("task ")
 
 
 @join_app
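The assertions added across these test files all inspect dependent_exceptions_tids, the list of (exception, task identifier) pairs carried by dependency and join errors. A rough sketch of examining those pairs on a failed downstream app, assuming the default local configuration and using illustrative app names:

    import parsl
    from parsl.app.app import python_app
    from parsl.dataflow.errors import DependencyError

    parsl.load()  # default local ThreadPool configuration


    @python_app
    def fail():
        raise ValueError("boom")


    @python_app
    def consume(x):
        return x


    downstream = consume(fail())
    err = downstream.exception()
    assert isinstance(err, DependencyError)

    # Each entry pairs a dependency's exception with an identifier for the failed
    # dependency (the tests above show "task ..." for parsl tasks, "<Future ..." otherwise).
    for dep_exc, tid in err.dependent_exceptions_tids:
        print(type(dep_exc).__name__, tid)

    parsl.dfk().cleanup()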