parsl 2024.11.25__py3-none-any.whl → 2024.12.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsl/configs/ASPIRE1.py +0 -1
- parsl/configs/cc_in2p3.py +0 -2
- parsl/configs/frontera.py +0 -2
- parsl/configs/htex_local.py +0 -2
- parsl/dataflow/dflow.py +0 -2
- parsl/executors/base.py +1 -1
- parsl/executors/high_throughput/executor.py +15 -2
- parsl/executors/high_throughput/interchange.py +2 -1
- parsl/executors/high_throughput/zmq_pipes.py +13 -4
- parsl/monitoring/monitoring.py +1 -1
- parsl/monitoring/radios/base.py +13 -0
- parsl/monitoring/radios/filesystem.py +52 -0
- parsl/monitoring/radios/htex.py +57 -0
- parsl/monitoring/radios/multiprocessing.py +17 -0
- parsl/monitoring/radios/udp.py +56 -0
- parsl/monitoring/radios/zmq.py +17 -0
- parsl/monitoring/remote.py +4 -6
- parsl/monitoring/router.py +1 -1
- parsl/providers/cluster_provider.py +2 -5
- parsl/providers/condor/condor.py +4 -13
- parsl/providers/grid_engine/grid_engine.py +3 -9
- parsl/providers/local/local.py +6 -23
- parsl/providers/lsf/lsf.py +3 -18
- parsl/providers/pbspro/pbspro.py +3 -10
- parsl/providers/slurm/slurm.py +4 -20
- parsl/providers/torque/torque.py +2 -10
- parsl/tests/configs/cc_in2p3.py +0 -2
- parsl/tests/configs/frontera.py +0 -2
- parsl/tests/configs/htex_local.py +0 -2
- parsl/tests/configs/htex_local_alternate.py +0 -3
- parsl/tests/configs/htex_local_intask_staging.py +0 -2
- parsl/tests/configs/htex_local_rsync_staging.py +0 -2
- parsl/tests/configs/local_threads_monitoring.py +0 -1
- parsl/tests/configs/slurm_local.py +0 -2
- parsl/tests/manual_tests/htex_local.py +0 -2
- parsl/tests/manual_tests/test_memory_limits.py +0 -2
- parsl/tests/manual_tests/test_udp_simple.py +0 -1
- parsl/tests/scaling_tests/htex_local.py +0 -2
- parsl/tests/sites/test_affinity.py +0 -2
- parsl/tests/sites/test_worker_info.py +0 -2
- parsl/tests/test_htex/test_drain.py +0 -2
- parsl/tests/test_htex/test_manager_selector_by_block.py +0 -2
- parsl/tests/test_monitoring/test_htex_init_blocks_vs_monitoring.py +0 -3
- parsl/tests/test_monitoring/test_stdouterr.py +0 -1
- parsl/tests/test_providers/test_local_provider.py +1 -2
- parsl/tests/test_providers/test_pbspro_template.py +1 -3
- parsl/tests/test_providers/test_slurm_template.py +1 -3
- parsl/tests/test_scaling/test_regression_1621.py +0 -2
- parsl/tests/test_scaling/test_regression_3568_scaledown_vs_MISSING.py +0 -1
- parsl/tests/test_scaling/test_scale_down.py +0 -2
- parsl/tests/test_scaling/test_scale_down_htex_auto_scale.py +0 -2
- parsl/tests/test_scaling/test_scale_down_htex_unregistered.py +0 -2
- parsl/tests/test_scaling/test_shutdown_scalein.py +0 -2
- parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py +0 -2
- parsl/tests/test_staging/test_zip_in.py +0 -1
- parsl/tests/test_staging/test_zip_out.py +0 -1
- parsl/tests/test_staging/test_zip_to_zip.py +0 -1
- parsl/tests/test_utils/test_execute_wait.py +35 -0
- parsl/utils.py +35 -0
- parsl/version.py +1 -1
- {parsl-2024.11.25.data → parsl-2024.12.9.data}/scripts/interchange.py +2 -1
- {parsl-2024.11.25.dist-info → parsl-2024.12.9.dist-info}/METADATA +2 -2
- {parsl-2024.11.25.dist-info → parsl-2024.12.9.dist-info}/RECORD +71 -73
- parsl/channels/__init__.py +0 -4
- parsl/channels/base.py +0 -82
- parsl/channels/errors.py +0 -30
- parsl/channels/local/local.py +0 -102
- parsl/monitoring/radios.py +0 -191
- parsl/tests/integration/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/__init__.py +0 -0
- parsl/tests/test_channels/test_large_output.py +0 -22
- parsl/tests/test_channels/test_local_channel.py +0 -19
- /parsl/{channels/local → monitoring/radios}/__init__.py +0 -0
- {parsl-2024.11.25.data → parsl-2024.12.9.data}/scripts/exec_parsl_function.py +0 -0
- {parsl-2024.11.25.data → parsl-2024.12.9.data}/scripts/parsl_coprocess.py +0 -0
- {parsl-2024.11.25.data → parsl-2024.12.9.data}/scripts/process_worker_pool.py +0 -0
- {parsl-2024.11.25.dist-info → parsl-2024.12.9.dist-info}/LICENSE +0 -0
- {parsl-2024.11.25.dist-info → parsl-2024.12.9.dist-info}/WHEEL +0 -0
- {parsl-2024.11.25.dist-info → parsl-2024.12.9.dist-info}/entry_points.txt +0 -0
- {parsl-2024.11.25.dist-info → parsl-2024.12.9.dist-info}/top_level.txt +0 -0
parsl/configs/ASPIRE1.py
CHANGED
parsl/configs/cc_in2p3.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
from parsl.channels import LocalChannel
|
2
1
|
from parsl.config import Config
|
3
2
|
from parsl.executors import HighThroughputExecutor
|
4
3
|
from parsl.providers import GridEngineProvider
|
@@ -10,7 +9,6 @@ config = Config(
|
|
10
9
|
label='cc_in2p3_htex',
|
11
10
|
max_workers_per_node=2,
|
12
11
|
provider=GridEngineProvider(
|
13
|
-
channel=LocalChannel(),
|
14
12
|
nodes_per_block=1,
|
15
13
|
init_blocks=2,
|
16
14
|
max_blocks=2,
|
parsl/configs/frontera.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
from parsl.channels import LocalChannel
|
2
1
|
from parsl.config import Config
|
3
2
|
from parsl.executors import HighThroughputExecutor
|
4
3
|
from parsl.launchers import SrunLauncher
|
@@ -15,7 +14,6 @@ config = Config(
|
|
15
14
|
max_workers_per_node=1, # Set number of workers per node
|
16
15
|
provider=SlurmProvider(
|
17
16
|
cmd_timeout=60, # Add extra time for slow scheduler responses
|
18
|
-
channel=LocalChannel(),
|
19
17
|
nodes_per_block=2,
|
20
18
|
init_blocks=1,
|
21
19
|
min_blocks=1,
|
parsl/configs/htex_local.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
from parsl.channels import LocalChannel
|
2
1
|
from parsl.config import Config
|
3
2
|
from parsl.executors import HighThroughputExecutor
|
4
3
|
from parsl.providers import LocalProvider
|
@@ -10,7 +9,6 @@ config = Config(
|
|
10
9
|
label="htex_local",
|
11
10
|
cores_per_worker=1,
|
12
11
|
provider=LocalProvider(
|
13
|
-
channel=LocalChannel(),
|
14
12
|
init_blocks=1,
|
15
13
|
max_blocks=1,
|
16
14
|
),
|
parsl/dataflow/dflow.py
CHANGED
@@ -1151,8 +1151,6 @@ class DataFlowKernel:
|
|
1151
1151
|
executor.provider.script_dir = os.path.join(self.run_dir, 'submit_scripts')
|
1152
1152
|
os.makedirs(executor.provider.script_dir, exist_ok=True)
|
1153
1153
|
|
1154
|
-
executor.provider.channel.script_dir = executor.provider.script_dir
|
1155
|
-
|
1156
1154
|
self.executors[executor.label] = executor
|
1157
1155
|
executor.start()
|
1158
1156
|
block_executors = [e for e in executors if isinstance(e, BlockProviderExecutor)]
|
parsl/executors/base.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, Optional
|
|
5
5
|
|
6
6
|
from typing_extensions import Literal, Self
|
7
7
|
|
8
|
-
from parsl.monitoring.radios import MonitoringRadioSender
|
8
|
+
from parsl.monitoring.radios.base import MonitoringRadioSender
|
9
9
|
|
10
10
|
|
11
11
|
class ParslExecutor(metaclass=ABCMeta):
|
parsl/executors/high_throughput/executor.py
CHANGED
@@ -331,6 +331,9 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
331
331
|
interchange_launch_cmd = DEFAULT_INTERCHANGE_LAUNCH_CMD
|
332
332
|
self.interchange_launch_cmd = interchange_launch_cmd
|
333
333
|
|
334
|
+
self._result_queue_thread_exit = threading.Event()
|
335
|
+
self._result_queue_thread: Optional[threading.Thread] = None
|
336
|
+
|
334
337
|
radio_mode = "htex"
|
335
338
|
enable_mpi_mode: bool = False
|
336
339
|
mpi_launcher: str = "mpiexec"
|
@@ -455,9 +458,11 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
455
458
|
"""
|
456
459
|
logger.debug("Result queue worker starting")
|
457
460
|
|
458
|
-
while not self.bad_state_is_set:
|
461
|
+
while not self.bad_state_is_set and not self._result_queue_thread_exit.is_set():
|
459
462
|
try:
|
460
|
-
msgs = self.incoming_q.get()
|
463
|
+
msgs = self.incoming_q.get(timeout_ms=self.poll_period)
|
464
|
+
if msgs is None: # timeout
|
465
|
+
continue
|
461
466
|
|
462
467
|
except IOError as e:
|
463
468
|
logger.exception("Caught broken queue with exception code {}: {}".format(e.errno, e))
|
@@ -515,6 +520,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
515
520
|
else:
|
516
521
|
raise BadMessage("Message received with unknown type {}".format(msg['type']))
|
517
522
|
|
523
|
+
logger.info("Closing result ZMQ pipe")
|
524
|
+
self.incoming_q.close()
|
518
525
|
logger.info("Result queue worker finished")
|
519
526
|
|
520
527
|
def _start_local_interchange_process(self) -> None:
|
@@ -817,6 +824,8 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
817
824
|
|
818
825
|
logger.info("Attempting HighThroughputExecutor shutdown")
|
819
826
|
|
827
|
+
logger.info("Terminating interchange and result queue thread")
|
828
|
+
self._result_queue_thread_exit.set()
|
820
829
|
self.interchange_proc.terminate()
|
821
830
|
try:
|
822
831
|
self.interchange_proc.wait(timeout=timeout)
|
@@ -841,6 +850,10 @@ class HighThroughputExecutor(BlockProviderExecutor, RepresentationMixin, UsageIn
|
|
841
850
|
logger.info("Closing command client")
|
842
851
|
self.command_client.close()
|
843
852
|
|
853
|
+
logger.info("Waiting for result queue thread exit")
|
854
|
+
if self._result_queue_thread:
|
855
|
+
self._result_queue_thread.join()
|
856
|
+
|
844
857
|
logger.info("Finished HighThroughputExecutor shutdown attempt")
|
845
858
|
|
846
859
|
def get_usage_information(self):
|
parsl/executors/high_throughput/interchange.py
CHANGED
@@ -20,7 +20,8 @@ from parsl.executors.high_throughput.errors import ManagerLost, VersionMismatch
|
|
20
20
|
from parsl.executors.high_throughput.manager_record import ManagerRecord
|
21
21
|
from parsl.executors.high_throughput.manager_selector import ManagerSelector
|
22
22
|
from parsl.monitoring.message_type import MessageType
|
23
|
-
from parsl.monitoring.radios import MonitoringRadioSender
|
23
|
+
from parsl.monitoring.radios.base import MonitoringRadioSender
|
24
|
+
from parsl.monitoring.radios.zmq import ZMQRadioSender
|
24
25
|
from parsl.process_loggers import wrap_with_logs
|
25
26
|
from parsl.serialize import serialize as serialize_object
|
26
27
|
from parsl.utils import setproctitle
|
parsl/executors/high_throughput/zmq_pipes.py
CHANGED
@@ -206,12 +206,21 @@ class ResultsIncoming:
|
|
206
206
|
self.port = self.results_receiver.bind_to_random_port(tcp_url(ip_address),
|
207
207
|
min_port=port_range[0],
|
208
208
|
max_port=port_range[1])
|
209
|
+
self.poller = zmq.Poller()
|
210
|
+
self.poller.register(self.results_receiver, zmq.POLLIN)
|
209
211
|
|
210
|
-
def get(self):
|
212
|
+
def get(self, timeout_ms=None):
|
213
|
+
"""Get a message from the queue, returning None if timeout expires
|
214
|
+
without a message. timeout is measured in milliseconds.
|
215
|
+
"""
|
211
216
|
logger.debug("Waiting for ResultsIncoming message")
|
212
|
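-
m = self.results_receiver.recv_multipart()
|
213
|
-
logger.debug("Received ResultsIncoming message")
|
214
|
-
return m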
|
217
|
+
socks = dict(self.poller.poll(timeout=timeout_ms))
|
218
|
+
if self.results_receiver in socks and socks[self.results_receiver] == zmq.POLLIN:
|
219
|
+
m = self.results_receiver.recv_multipart()
|
220
|
+
logger.debug("Received ResultsIncoming message")
|
221
|
+
return m
|
222
|
+
else:
|
223
|
+
return None
|
215
224
|
|
216
225
|
def close(self):
|
217
226
|
self.results_receiver.close()
|
parsl/monitoring/monitoring.py
CHANGED
@@ -14,7 +14,7 @@ import typeguard
|
|
14
14
|
|
15
15
|
from parsl.log_utils import set_file_logger
|
16
16
|
from parsl.monitoring.errors import MonitoringHubStartError
|
17
|
-
from parsl.monitoring.radios import MultiprocessingQueueRadioSender
|
17
|
+
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
18
18
|
from parsl.monitoring.router import router_starter
|
19
19
|
from parsl.monitoring.types import TaggedMonitoringMessage
|
20
20
|
from parsl.multiprocessing import ForkProcess, SizedQueue
|
parsl/monitoring/radios/base.py
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
import logging
|
2
|
+
from abc import ABCMeta, abstractmethod
|
3
|
+
from typing import Optional
|
4
|
+
|
5
|
+
_db_manager_excepts: Optional[Exception]
|
6
|
+
|
7
|
+
logger = logging.getLogger(__name__)
|
8
|
+
|
9
|
+
|
10
|
+
class MonitoringRadioSender(metaclass=ABCMeta):
|
11
|
+
@abstractmethod
|
12
|
+
def send(self, message: object) -> None:
|
13
|
+
pass
|
parsl/monitoring/radios/filesystem.py
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
import pickle
|
4
|
+
import uuid
|
5
|
+
|
6
|
+
from parsl.monitoring.radios.base import MonitoringRadioSender
|
7
|
+
|
8
|
+
logger = logging.getLogger(__name__)
|
9
|
+
|
10
|
+
|
11
|
+
class FilesystemRadioSender(MonitoringRadioSender):
|
12
|
+
"""A MonitoringRadioSender that sends messages over a shared filesystem.
|
13
|
+
|
14
|
+
The message directory structure is based on maildir,
|
15
|
+
https://en.wikipedia.org/wiki/Maildir
|
16
|
+
|
17
|
+
The writer creates a message in tmp/ and then when it is fully
|
18
|
+
written, moves it atomically into new/
|
19
|
+
|
20
|
+
The reader ignores tmp/ and only reads and deletes messages from
|
21
|
+
new/
|
22
|
+
|
23
|
+
This avoids a race condition of reading partially written messages.
|
24
|
+
|
25
|
+
This radio is likely to give higher shared filesystem load compared to
|
26
|
+
the UDP radio, but should be much more reliable.
|
27
|
+
"""
|
28
|
+
|
29
|
+
def __init__(self, *, monitoring_url: str, timeout: int = 10, run_dir: str):
|
30
|
+
logger.info("filesystem based monitoring channel initializing")
|
31
|
+
self.base_path = f"{run_dir}/monitor-fs-radio/"
|
32
|
+
self.tmp_path = f"{self.base_path}/tmp"
|
33
|
+
self.new_path = f"{self.base_path}/new"
|
34
|
+
|
35
|
+
os.makedirs(self.tmp_path, exist_ok=True)
|
36
|
+
os.makedirs(self.new_path, exist_ok=True)
|
37
|
+
|
38
|
+
def send(self, message: object) -> None:
|
39
|
+
logger.info("Sending a monitoring message via filesystem")
|
40
|
+
|
41
|
+
unique_id = str(uuid.uuid4())
|
42
|
+
|
43
|
+
tmp_filename = f"{self.tmp_path}/{unique_id}"
|
44
|
+
new_filename = f"{self.new_path}/{unique_id}"
|
45
|
+
buffer = message
|
46
|
+
|
47
|
+
# this will write the message out then atomically
|
48
|
+
# move it into new/, so that a partially written
|
49
|
+
# file will never be observed in new/
|
50
|
+
with open(tmp_filename, "wb") as f:
|
51
|
+
pickle.dump(buffer, f)
|
52
|
+
os.rename(tmp_filename, new_filename)
|
parsl/monitoring/radios/htex.py
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
import logging
|
2
|
+
import pickle
|
3
|
+
|
4
|
+
from parsl.monitoring.radios.base import MonitoringRadioSender
|
5
|
+
|
6
|
+
logger = logging.getLogger(__name__)
|
7
|
+
|
8
|
+
|
9
|
+
class HTEXRadioSender(MonitoringRadioSender):
|
10
|
+
|
11
|
+
def __init__(self, monitoring_url: str, timeout: int = 10):
|
12
|
+
"""
|
13
|
+
Parameters
|
14
|
+
----------
|
15
|
+
|
16
|
+
monitoring_url : str
|
17
|
+
URL of the form <scheme>://<IP>:<PORT>
|
18
|
+
timeout : int
|
19
|
+
timeout, default=10s
|
20
|
+
"""
|
21
|
+
logger.info("htex-based monitoring channel initialising")
|
22
|
+
|
23
|
+
def send(self, message: object) -> None:
|
24
|
+
""" Sends a message to the UDP receiver
|
25
|
+
|
26
|
+
Parameter
|
27
|
+
---------
|
28
|
+
|
29
|
+
message: object
|
30
|
+
Arbitrary pickle-able object that is to be sent
|
31
|
+
|
32
|
+
Returns:
|
33
|
+
None
|
34
|
+
"""
|
35
|
+
|
36
|
+
import parsl.executors.high_throughput.monitoring_info
|
37
|
+
|
38
|
+
result_queue = parsl.executors.high_throughput.monitoring_info.result_queue
|
39
|
+
|
40
|
+
# this message needs to go in the result queue tagged so that it is treated
|
41
|
+
# i) as a monitoring message by the interchange, and then further more treated
|
42
|
+
# as a RESOURCE_INFO message when received by monitoring (rather than a NODE_INFO
|
43
|
+
# which is the implicit default for messages from the interchange)
|
44
|
+
|
45
|
+
# for the interchange, the outer wrapper, this needs to be a dict:
|
46
|
+
|
47
|
+
interchange_msg = {
|
48
|
+
'type': 'monitoring',
|
49
|
+
'payload': message
|
50
|
+
}
|
51
|
+
|
52
|
+
if result_queue:
|
53
|
+
result_queue.put(pickle.dumps(interchange_msg))
|
54
|
+
else:
|
55
|
+
logger.error("result_queue is uninitialized - cannot put monitoring message")
|
56
|
+
|
57
|
+
return
|
parsl/monitoring/radios/multiprocessing.py
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
from multiprocessing.queues import Queue
|
2
|
+
|
3
|
+
from parsl.monitoring.radios.base import MonitoringRadioSender
|
4
|
+
|
5
|
+
|
6
|
+
class MultiprocessingQueueRadioSender(MonitoringRadioSender):
|
7
|
+
"""A monitoring radio which connects over a multiprocessing Queue.
|
8
|
+
This radio is intended to be used on the submit side, where components
|
9
|
+
in the submit process, or processes launched by multiprocessing, will have
|
10
|
+
access to a Queue shared with the monitoring database code (bypassing the
|
11
|
+
monitoring router).
|
12
|
+
"""
|
13
|
+
def __init__(self, queue: Queue) -> None:
|
14
|
+
self.queue = queue
|
15
|
+
|
16
|
+
def send(self, message: object) -> None:
|
17
|
+
self.queue.put(message)
|
parsl/monitoring/radios/udp.py
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
import logging
|
2
|
+
import pickle
|
3
|
+
import socket
|
4
|
+
|
5
|
+
from parsl.monitoring.radios.base import MonitoringRadioSender
|
6
|
+
|
7
|
+
|
8
|
+
class UDPRadioSender(MonitoringRadioSender):
|
9
|
+
|
10
|
+
def __init__(self, monitoring_url: str, timeout: int = 10):
|
11
|
+
"""
|
12
|
+
Parameters
|
13
|
+
----------
|
14
|
+
|
15
|
+
monitoring_url : str
|
16
|
+
URL of the form <scheme>://<IP>:<PORT>
|
17
|
+
timeout : int
|
18
|
+
timeout, default=10s
|
19
|
+
"""
|
20
|
+
self.monitoring_url = monitoring_url
|
21
|
+
self.sock_timeout = timeout
|
22
|
+
try:
|
23
|
+
self.scheme, self.ip, port = (x.strip('/') for x in monitoring_url.split(':'))
|
24
|
+
self.port = int(port)
|
25
|
+
except Exception:
|
26
|
+
raise Exception("Failed to parse monitoring url: {}".format(monitoring_url))
|
27
|
+
|
28
|
+
self.sock = socket.socket(socket.AF_INET,
|
29
|
+
socket.SOCK_DGRAM,
|
30
|
+
socket.IPPROTO_UDP) # UDP
|
31
|
+
self.sock.settimeout(self.sock_timeout)
|
32
|
+
|
33
|
+
def send(self, message: object) -> None:
|
34
|
+
""" Sends a message to the UDP receiver
|
35
|
+
|
36
|
+
Parameter
|
37
|
+
---------
|
38
|
+
|
39
|
+
message: object
|
40
|
+
Arbitrary pickle-able object that is to be sent
|
41
|
+
|
42
|
+
Returns:
|
43
|
+
None
|
44
|
+
"""
|
45
|
+
try:
|
46
|
+
buffer = pickle.dumps(message)
|
47
|
+
except Exception:
|
48
|
+
logging.exception("Exception during pickling", exc_info=True)
|
49
|
+
return
|
50
|
+
|
51
|
+
try:
|
52
|
+
self.sock.sendto(buffer, (self.ip, self.port))
|
53
|
+
except socket.timeout:
|
54
|
+
logging.error("Could not send message within timeout limit")
|
55
|
+
return
|
56
|
+
return
|
parsl/monitoring/radios/zmq.py
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
import zmq
|
2
|
+
|
3
|
+
from parsl.monitoring.radios.base import MonitoringRadioSender
|
4
|
+
|
5
|
+
|
6
|
+
class ZMQRadioSender(MonitoringRadioSender):
|
7
|
+
"""A monitoring radio which connects over ZMQ. This radio is not
|
8
|
+
thread-safe, because its use of ZMQ is not thread-safe.
|
9
|
+
"""
|
10
|
+
|
11
|
+
def __init__(self, hub_address: str, hub_zmq_port: int) -> None:
|
12
|
+
self._hub_channel = zmq.Context().socket(zmq.DEALER)
|
13
|
+
self._hub_channel.set_hwm(0)
|
14
|
+
self._hub_channel.connect(f"tcp://{hub_address}:{hub_zmq_port}")
|
15
|
+
|
16
|
+
def send(self, message: object) -> None:
|
17
|
+
self._hub_channel.send_pyobj(message)
|
parsl/monitoring/remote.py
CHANGED
@@ -7,12 +7,10 @@ from multiprocessing import Event
|
|
7
7
|
from typing import Any, Callable, Dict, List, Sequence, Tuple
|
8
8
|
|
9
9
|
from parsl.monitoring.message_type import MessageType
|
10
|
-
from parsl.monitoring.radios import (
|
11
|
-
FilesystemRadioSender,
|
12
|
-
HTEXRadioSender,
|
13
|
-
MonitoringRadioSender,
|
14
|
-
UDPRadioSender,
|
15
|
-
)
|
10
|
+
from parsl.monitoring.radios.base import MonitoringRadioSender
|
11
|
+
from parsl.monitoring.radios.filesystem import FilesystemRadioSender
|
12
|
+
from parsl.monitoring.radios.htex import HTEXRadioSender
|
13
|
+
from parsl.monitoring.radios.udp import UDPRadioSender
|
16
14
|
from parsl.multiprocessing import ForkProcess
|
17
15
|
from parsl.process_loggers import wrap_with_logs
|
18
16
|
|
parsl/monitoring/router.py
CHANGED
@@ -14,7 +14,7 @@ import typeguard
|
|
14
14
|
import zmq
|
15
15
|
|
16
16
|
from parsl.log_utils import set_file_logger
|
17
|
-
from parsl.monitoring.radios import MultiprocessingQueueRadioSender
|
17
|
+
from parsl.monitoring.radios.multiprocessing import MultiprocessingQueueRadioSender
|
18
18
|
from parsl.monitoring.types import TaggedMonitoringMessage
|
19
19
|
from parsl.process_loggers import wrap_with_logs
|
20
20
|
from parsl.utils import setproctitle
|
parsl/providers/cluster_provider.py
CHANGED
@@ -6,6 +6,7 @@ from parsl.launchers.base import Launcher
|
|
6
6
|
from parsl.launchers.errors import BadLauncher
|
7
7
|
from parsl.providers.base import ExecutionProvider
|
8
8
|
from parsl.providers.errors import SchedulerMissingArgs, ScriptPathError
|
9
|
+
from parsl.utils import execute_wait
|
9
10
|
|
10
11
|
logger = logging.getLogger(__name__)
|
11
12
|
|
@@ -17,8 +18,6 @@ class ClusterProvider(ExecutionProvider):
|
|
17
18
|
----------
|
18
19
|
label : str
|
19
20
|
Label for this provider.
|
20
|
-
channel : Channel
|
21
|
-
Channel for accessing this provider.
|
22
21
|
walltime : str
|
23
22
|
Walltime requested per block in HH:MM:SS.
|
24
23
|
launcher : Launcher
|
@@ -44,7 +43,6 @@ class ClusterProvider(ExecutionProvider):
|
|
44
43
|
|
45
44
|
def __init__(self,
|
46
45
|
label,
|
47
|
-
channel,
|
48
46
|
nodes_per_block,
|
49
47
|
init_blocks,
|
50
48
|
min_blocks,
|
@@ -55,7 +53,6 @@ class ClusterProvider(ExecutionProvider):
|
|
55
53
|
cmd_timeout=10):
|
56
54
|
|
57
55
|
self._label = label
|
58
|
-
self.channel = channel
|
59
56
|
self.nodes_per_block = nodes_per_block
|
60
57
|
self.init_blocks = init_blocks
|
61
58
|
self.min_blocks = min_blocks
|
@@ -76,7 +73,7 @@ class ClusterProvider(ExecutionProvider):
|
|
76
73
|
t = self.cmd_timeout
|
77
74
|
if timeout is not None:
|
78
75
|
t = timeout
|
79
|
-
return self.channel.execute_wait(cmd, t)
|
76
|
+
return execute_wait(cmd, t)
|
80
77
|
|
81
78
|
def _write_submit_script(self, template, script_filename, job_name, configs):
|
82
79
|
"""Generate submit script and write it to a file.
|
parsl/providers/condor/condor.py
CHANGED
@@ -5,7 +5,6 @@ import time
|
|
5
5
|
|
6
6
|
import typeguard
|
7
7
|
|
8
|
-
from parsl.channels import LocalChannel
|
9
8
|
from parsl.jobs.states import JobState, JobStatus
|
10
9
|
from parsl.launchers import SingleNodeLauncher
|
11
10
|
from parsl.launchers.base import Launcher
|
@@ -18,8 +17,6 @@ logger = logging.getLogger(__name__)
|
|
18
17
|
|
19
18
|
from typing import Dict, List, Optional
|
20
19
|
|
21
|
-
from parsl.channels.base import Channel
|
22
|
-
|
23
20
|
# See http://pages.cs.wisc.edu/~adesmet/status.html
|
24
21
|
translate_table = {
|
25
22
|
'1': JobState.PENDING,
|
@@ -36,8 +33,6 @@ class CondorProvider(RepresentationMixin, ClusterProvider):
|
|
36
33
|
|
37
34
|
Parameters
|
38
35
|
----------
|
39
|
-
channel : Channel
|
40
|
-
Channel for accessing this provider.
|
41
36
|
nodes_per_block : int
|
42
37
|
Nodes to provision per block.
|
43
38
|
cores_per_slot : int
|
@@ -79,7 +74,6 @@ class CondorProvider(RepresentationMixin, ClusterProvider):
|
|
79
74
|
"""
|
80
75
|
@typeguard.typechecked
|
81
76
|
def __init__(self,
|
82
|
-
channel: Channel = LocalChannel(),
|
83
77
|
nodes_per_block: int = 1,
|
84
78
|
cores_per_slot: Optional[int] = None,
|
85
79
|
mem_per_slot: Optional[float] = None,
|
@@ -100,7 +94,6 @@ class CondorProvider(RepresentationMixin, ClusterProvider):
|
|
100
94
|
|
101
95
|
label = 'condor'
|
102
96
|
super().__init__(label,
|
103
|
-
channel,
|
104
97
|
nodes_per_block,
|
105
98
|
init_blocks,
|
106
99
|
min_blocks,
|
@@ -226,7 +219,7 @@ class CondorProvider(RepresentationMixin, ClusterProvider):
|
|
226
219
|
|
227
220
|
job_config = {}
|
228
221
|
job_config["job_name"] = job_name
|
229
|
-
job_config["submit_script_dir"] = self.channel.script_dir
|
222
|
+
job_config["submit_script_dir"] = self.script_dir
|
230
223
|
job_config["project"] = self.project
|
231
224
|
job_config["nodes"] = self.nodes_per_block
|
232
225
|
job_config["scheduler_options"] = scheduler_options
|
@@ -245,16 +238,14 @@ class CondorProvider(RepresentationMixin, ClusterProvider):
|
|
245
238
|
with open(userscript_path, 'w') as f:
|
246
239
|
f.write(job_config["worker_init"] + '\n' + wrapped_command)
|
247
240
|
|
248
|
-
user_script_path = self.channel.push_file(userscript_path, self.channel.script_dir)
|
249
|
-
the_input_files = [user_script_path] + self.transfer_input_files
|
241
|
+
the_input_files = [userscript_path] + self.transfer_input_files
|
250
242
|
job_config["input_files"] = ','.join(the_input_files)
|
251
|
-
job_config["job_script"] = os.path.basename(user_script_path)
|
243
|
+
job_config["job_script"] = os.path.basename(userscript_path)
|
252
244
|
|
253
245
|
# Construct and move the submit script
|
254
246
|
self._write_submit_script(template_string, script_path, job_name, job_config)
|
255
|
-
channel_script_path = self.channel.push_file(script_path, self.channel.script_dir)
|
256
247
|
|
257
|
-
cmd = "condor_submit {0}".format(channel_script_path)
|
248
|
+
cmd = "condor_submit {0}".format(script_path)
|
258
249
|
try:
|
259
250
|
retcode, stdout, stderr = self.execute_wait(cmd)
|
260
251
|
except Exception as e:
|
parsl/providers/grid_engine/grid_engine.py
CHANGED
@@ -2,7 +2,6 @@ import logging
|
|
2
2
|
import os
|
3
3
|
import time
|
4
4
|
|
5
|
-
from parsl.channels import LocalChannel
|
6
5
|
from parsl.jobs.states import JobState, JobStatus
|
7
6
|
from parsl.launchers import SingleNodeLauncher
|
8
7
|
from parsl.providers.cluster_provider import ClusterProvider
|
@@ -36,8 +35,6 @@ class GridEngineProvider(ClusterProvider, RepresentationMixin):
|
|
36
35
|
|
37
36
|
Parameters
|
38
37
|
----------
|
39
|
-
channel : Channel
|
40
|
-
Channel for accessing this provider.
|
41
38
|
nodes_per_block : int
|
42
39
|
Nodes to provision per block.
|
43
40
|
min_blocks : int
|
@@ -62,7 +59,6 @@ class GridEngineProvider(ClusterProvider, RepresentationMixin):
|
|
62
59
|
"""
|
63
60
|
|
64
61
|
def __init__(self,
|
65
|
-
channel=LocalChannel(),
|
66
62
|
nodes_per_block=1,
|
67
63
|
init_blocks=1,
|
68
64
|
min_blocks=0,
|
@@ -76,7 +72,6 @@ class GridEngineProvider(ClusterProvider, RepresentationMixin):
|
|
76
72
|
queue=None):
|
77
73
|
label = 'grid_engine'
|
78
74
|
super().__init__(label,
|
79
|
-
channel,
|
80
75
|
nodes_per_block,
|
81
76
|
init_blocks,
|
82
77
|
min_blocks,
|
@@ -100,7 +95,7 @@ class GridEngineProvider(ClusterProvider, RepresentationMixin):
|
|
100
95
|
self.nodes_per_block, tasks_per_node))
|
101
96
|
|
102
97
|
job_config = {}
|
103
|
-
job_config["submit_script_dir"] = self.channel.script_dir
|
98
|
+
job_config["submit_script_dir"] = self.script_dir
|
104
99
|
job_config["nodes"] = self.nodes_per_block
|
105
100
|
job_config["walltime"] = self.walltime
|
106
101
|
job_config["scheduler_options"] = self.scheduler_options
|
@@ -142,11 +137,10 @@ class GridEngineProvider(ClusterProvider, RepresentationMixin):
|
|
142
137
|
logger.debug("Writing submit script")
|
143
138
|
self._write_submit_script(template_string, script_path, job_name, job_config)
|
144
139
|
|
145
|
-
channel_script_path = self.channel.push_file(script_path, self.channel.script_dir)
|
146
140
|
if self.queue is not None:
|
147
|
-
cmd = "qsub -q {0} -terse {1}".format(self.queue, channel_script_path)
|
141
|
+
cmd = "qsub -q {0} -terse {1}".format(self.queue, script_path)
|
148
142
|
else:
|
149
|
-
cmd = "qsub -terse {0}".format(channel_script_path)
|
143
|
+
cmd = "qsub -terse {0}".format(script_path)
|
150
144
|
retcode, stdout, stderr = self.execute_wait(cmd)
|
151
145
|
|
152
146
|
if retcode == 0:
|