tracdap-runtime 0.6.2__py3-none-any.whl → 0.6.3__py3-none-any.whl
- tracdap/rt/_exec/actors.py +87 -10
- tracdap/rt/_exec/dev_mode.py +9 -17
- tracdap/rt/_exec/engine.py +79 -14
- tracdap/rt/_exec/runtime.py +83 -40
- tracdap/rt/_exec/server.py +306 -29
- tracdap/rt/_impl/config_parser.py +219 -49
- tracdap/rt/_impl/grpc/codec.py +60 -5
- tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2.py +19 -19
- tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2.pyi +11 -9
- tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2_grpc.py +25 -25
- tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.py +28 -16
- tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.pyi +33 -6
- tracdap/rt/_impl/grpc/tracdap/metadata/object_pb2.py +8 -3
- tracdap/rt/_impl/grpc/tracdap/metadata/object_pb2.pyi +13 -2
- tracdap/rt/_impl/guard_rails.py +21 -0
- tracdap/rt/_impl/models.py +25 -0
- tracdap/rt/_impl/static_api.py +23 -9
- tracdap/rt/_impl/type_system.py +17 -0
- tracdap/rt/_impl/validation.py +10 -0
- tracdap/rt/_plugins/config_local.py +49 -0
- tracdap/rt/_version.py +1 -1
- tracdap/rt/api/hook.py +6 -3
- tracdap/rt/api/static_api.py +71 -21
- tracdap/rt/config/__init__.py +4 -4
- tracdap/rt/config/common.py +10 -0
- tracdap/rt/config/platform.py +0 -10
- tracdap/rt/config/runtime.py +2 -0
- tracdap/rt/ext/config.py +34 -0
- tracdap/rt/ext/embed.py +1 -3
- tracdap/rt/ext/plugins.py +47 -6
- tracdap/rt/launch/cli.py +4 -0
- tracdap/rt/launch/launch.py +34 -9
- tracdap/rt/metadata/__init__.py +17 -17
- tracdap/rt/metadata/model.py +6 -0
- tracdap/rt/metadata/object.py +3 -0
- {tracdap_runtime-0.6.2.dist-info → tracdap_runtime-0.6.3.dist-info}/METADATA +4 -4
- {tracdap_runtime-0.6.2.dist-info → tracdap_runtime-0.6.3.dist-info}/RECORD +40 -49
- {tracdap_runtime-0.6.2.dist-info → tracdap_runtime-0.6.3.dist-info}/WHEEL +1 -1
- tracdap/rt/_impl/grpc/tracdap/config/common_pb2.py +0 -55
- tracdap/rt/_impl/grpc/tracdap/config/common_pb2.pyi +0 -103
- tracdap/rt/_impl/grpc/tracdap/config/job_pb2.py +0 -42
- tracdap/rt/_impl/grpc/tracdap/config/job_pb2.pyi +0 -44
- tracdap/rt/_impl/grpc/tracdap/config/platform_pb2.py +0 -71
- tracdap/rt/_impl/grpc/tracdap/config/platform_pb2.pyi +0 -197
- tracdap/rt/_impl/grpc/tracdap/config/result_pb2.py +0 -37
- tracdap/rt/_impl/grpc/tracdap/config/result_pb2.pyi +0 -35
- tracdap/rt/_impl/grpc/tracdap/config/runtime_pb2.py +0 -42
- tracdap/rt/_impl/grpc/tracdap/config/runtime_pb2.pyi +0 -46
- tracdap/rt/ext/_guard.py +0 -37
- {tracdap_runtime-0.6.2.dist-info → tracdap_runtime-0.6.3.dist-info}/LICENSE +0 -0
- {tracdap_runtime-0.6.2.dist-info → tracdap_runtime-0.6.3.dist-info}/top_level.txt +0 -0
tracdap/rt/_exec/actors.py
CHANGED
```diff
@@ -25,6 +25,7 @@ import queue
 import time
 
 import tracdap.rt._impl.util as util # noqa
+import tracdap.rt._impl.validation as _val # noqa
 import tracdap.rt.exceptions as _ex
 
 
@@ -180,6 +181,49 @@ class ActorContext:
         return self.__error or self.__node.error
 
 
+class ThreadsafeActor(Actor):
+
+    def __init__(self):
+        super().__init__()
+        self.__threadsafe: tp.Optional[ThreadsafeContext] = None
+
+    def threadsafe(self) -> ThreadsafeContext:
+        return self.__threadsafe
+
+
+class ThreadsafeContext:
+
+    def __init__(self, node: ActorNode):
+        self.__node = node
+        self.__id = node.actor_id
+        self.__parent = node.parent.actor_id if node.parent is not None else None
+
+    def spawn(self, actor: Actor):
+        self.__node.event_loop.post_message(
+            None, lambda _:
+            self.__node.spawn(actor) and None)
+
+    def send(self, target_id: ActorId, message: str, *args, **kwargs):
+        self.__node.event_loop.post_message(
+            None, lambda _:
+            self.__node.send_message(self.__id, target_id, message, args, kwargs))
+
+    def send_parent(self, message: str, *args, **kwargs):
+        self.__node.event_loop.post_message(
+            None, lambda _:
+            self.__node.send_message(self.__id, self.__parent, message, args, kwargs))
+
+    def stop(self):
+        self.__node.event_loop.post_message(
+            None, lambda _:
+            self.__node.send_signal(self.__id, self.__id, SignalNames.STOP))
+
+    def fail(self, error: Exception):
+        self.__node.event_loop.post_message(
+            None, lambda _:
+            self.__node.send_signal(self.__id, self.__id, SignalNames.STOP, error))
+
+
 class EventLoop:
 
     _T_MSG = tp.TypeVar("_T_MSG")
@@ -340,7 +384,7 @@ class ActorNode:
         self.state: ActorState = ActorState.NOT_STARTED
         self.error: tp.Optional[Exception] = None
 
-    def spawn(self, child_actor: Actor):
+    def spawn(self, child_actor: Actor) -> ActorId:
 
         if self._log.isEnabledFor(logging.DEBUG):
             self._log.debug(f"spawn [{self.actor_id}]: [{type(child_actor)}]")
@@ -355,6 +399,11 @@ class ActorNode:
         child_node = ActorNode(child_id, child_actor, self, self.system, event_loop)
         self.children[child_id] = child_node
 
+        # If this is a threadsafe actor, set up the threadsafe context
+        if isinstance(child_actor, ThreadsafeActor):
+            threadsafe = ThreadsafeContext(child_node)
+            child_actor._ThreadsafeActor__threadsafe = threadsafe
+
         child_node.send_signal(self.actor_id, child_id, SignalNames.START)
 
         return child_id
@@ -542,6 +591,12 @@ class ActorNode:
         if not self._check_message_target(signal):
             return
 
+        # Do not process signals after the actor has stopped
+        # This is common with e.g. STOP signals that propagate up and down the tree
+
+        if self.state in [ActorState.STOPPED, ActorState.FAILED]:
+            return
+
         # Call the signal receiver function
         # This gives the actor a chance to respond to the signal
 
@@ -768,10 +823,12 @@ class ActorNode:
         # Positional arg types
         for pos_param, pos_arg in zip(pos_params, args):
 
+            # If no type hint is available, allow anything through
+            # Otherwise, reuse the validator logic to type check individual args
             type_hint = type_hints.get(pos_param.name)
+            type_check = type_hint is None or _val.check_type(type_hint, pos_arg)
 
-
-            if type_hint is not None and not isinstance(pos_arg, type_hint):
+            if not type_check:
                 error = f"Invalid message: [{message}] -> {target_id} (wrong parameter type for '{pos_param.name}')"
                 self._log.error(error)
                 raise EBadActor(error)
@@ -780,20 +837,20 @@ class ActorNode:
        for kw_param in kw_params:
 
            kw_arg = kwargs.get(kw_param.name)
-           type_hint = type_hints.get(kw_param.name)
 
            # If param has taken a default value, no type check is needed
            if kw_arg is None:
                continue
 
-           #
-
+           # Otherwise use the same type-validation logic as positional args
+           type_hint = type_hints.get(kw_param.name)
+           type_check = type_hint is None or _val.check_type(type_hint, kw_arg)
+
+           if not type_check:
                error = f"Invalid message: [{message}] -> {target_id} (wrong parameter type for '{kw_param.name}')"
                self._log.error(error)
                raise EBadActor(error)
 
-       # TODO: Verify generics for both args and kwargs
-
 
 class RootActor(Actor):
 
@@ -864,11 +921,17 @@ class ActorSystem:
 
         self.__root_started = threading.Event()
         self.__root_stopped = threading.Event()
+
         self.__root_actor = RootActor(main_actor, self.__root_started, self.__root_stopped)
         self.__root_node = ActorNode(self.ROOT_ID, self.__root_actor, None, self, self.__system_event_loop)
 
     # Public API
 
+    def main_id(self) -> ActorId:
+        if not self.__root_started.is_set():
+            raise EBadActor("System has not started yet")
+        return self.__root_actor.main_id
+
     def start(self, wait=True):
 
         self.__system_thread.start()
@@ -913,12 +976,26 @@ class ActorSystem:
 
         return self.__root_node.error
 
-    def
+    def spawn_agent(self, agent: Actor) -> ActorId:
+
+        if not self.__root_started.is_set():
+            raise EBadActor("System has not started yet")
+
+        return self.__root_node.spawn(agent)
+
+    def send_main(self, message: str, *args, **kwargs):
 
         if self.__root_actor.main_id is None:
             raise EBadActor("System has not started yet")
 
-        self.__root_node.send_message("/external", self.__root_actor.main_id, message, args, kwargs)
+        self.__root_node.send_message("/external", self.__root_actor.main_id, message, args, kwargs) # TODO
+
+    def send(self, actor_id: ActorId, message: str, *args, **kwargs):
+
+        if not self.__root_started.is_set():
+            raise EBadActor("System has not started yet")
+
+        self.__root_node.send_message("/external", actor_id, message, args, kwargs)
 
     def _setup_event_loops(self, thread_pools: tp.Dict[str, int]):
 
```
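The new `ThreadsafeActor` / `ThreadsafeContext` pair gives code running outside the event loop a safe way to talk back to the actor system: each context method just posts a closure onto the owning node's event loop. A minimal sketch of how that might be used, assuming the module-level classes shown in this diff; the `ReportWorker` actor and the `work_done` message are illustrative names, not part of the package:

```python
import threading

import tracdap.rt._exec.actors as actors


class ReportWorker(actors.ThreadsafeActor):
    """Illustrative actor that hands work off to a plain Python thread."""

    def on_start(self):
        # Capture the threadsafe context while on the event loop,
        # then let a background thread post results back safely
        ctx = self.threadsafe()
        threading.Thread(target=self._run, args=(ctx,), daemon=True).start()

    @staticmethod
    def _run(ctx: actors.ThreadsafeContext):
        result = sum(range(1000))              # stand-in for real work
        ctx.send_parent("work_done", result)   # marshalled onto the event loop
        ctx.stop()                             # stop this actor via a posted signal
```

The worker thread never touches actor state directly; the parent would receive `work_done` through an `@Message`-decorated handler in the usual way. External callers get a similar capability through the new `ActorSystem.spawn_agent()`, `send()` and `main_id()` methods.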
tracdap/rt/_exec/dev_mode.py
CHANGED
```diff
@@ -46,7 +46,7 @@ class DevModeTranslator:
     _log: tp.Optional[_util.logging.Logger] = None
 
     @classmethod
-    def translate_sys_config(cls, sys_config: _cfg.RuntimeConfig,
+    def translate_sys_config(cls, sys_config: _cfg.RuntimeConfig, config_mgr: _cfg_p.ConfigManager):
 
         cls._log.info(f"Applying dev mode config translation to system config")
 
@@ -56,7 +56,7 @@ class DevModeTranslator:
             sys_config.storage = _cfg.StorageConfig()
 
         sys_config = cls._add_integrated_repo(sys_config)
-        sys_config = cls._resolve_relative_storage_root(sys_config,
+        sys_config = cls._resolve_relative_storage_root(sys_config, config_mgr)
 
         return sys_config
 
@@ -66,7 +66,7 @@ class DevModeTranslator:
             sys_config: _cfg.RuntimeConfig,
             job_config: _cfg.JobConfig,
             scratch_dir: pathlib.Path,
-
+            config_mgr: _cfg_p.ConfigManager,
             model_class: tp.Optional[_api.TracModel.__class__]) \
             -> _cfg.JobConfig:
 
@@ -84,7 +84,7 @@ class DevModeTranslator:
 
         # Fow flows, load external flow definitions then perform auto-wiring and type inference
         if job_config.job.jobType == _meta.JobType.RUN_FLOW:
-            job_config = cls._process_flow_definition(job_config,
+            job_config = cls._process_flow_definition(job_config, config_mgr)
 
         # For run (model|flow) jobs, apply processing to the parameters, inputs and outputs
         if job_config.job.jobType in [_meta.JobType.RUN_MODEL, _meta.JobType.RUN_FLOW]:
@@ -109,7 +109,7 @@ class DevModeTranslator:
     @classmethod
     def _resolve_relative_storage_root(
             cls, sys_config: _cfg.RuntimeConfig,
-
+            config_mgr: _cfg_p.ConfigManager):
 
         storage_config = copy.deepcopy(sys_config.storage)
 
@@ -128,6 +128,7 @@ class DevModeTranslator:
 
             cls._log.info(f"Resolving relative path for [{bucket_key}] local storage...")
 
+            sys_config_path = config_mgr.config_dir_path()
             if sys_config_path is not None:
                 absolute_path = sys_config_path.joinpath(root_path).resolve()
                 if absolute_path.exists():
@@ -291,7 +292,7 @@ class DevModeTranslator:
         return model_id, model_object
 
     @classmethod
-    def _process_flow_definition(cls, job_config: _cfg.JobConfig,
+    def _process_flow_definition(cls, job_config: _cfg.JobConfig, config_mgr: _cfg_p.ConfigManager) -> _cfg.JobConfig:
 
         flow_details = job_config.job.runFlow.flow
 
@@ -305,21 +306,12 @@ class DevModeTranslator:
             cls._log.error(err)
             raise _ex.EConfigParse(err)
 
-        flow_path = config_dir.joinpath(flow_details) if config_dir is not None else pathlib.Path(flow_details)
-
-        if not flow_path.exists():
-            err = f"Flow definition not available for [{flow_details}]: File not found ({flow_path})"
-            cls._log.error(err)
-            raise _ex.EConfigParse(err)
-
         flow_id = _util.new_object_id(_meta.ObjectType.FLOW)
         flow_key = _util.object_key(flow_id)
 
-        cls._log.info(f"Generating flow definition
+        cls._log.info(f"Generating flow definition from [{flow_details}] with ID = [{flow_key}]")
 
-
-        flow_raw_data = flow_parser.load_raw_config(flow_path, flow_path.name)
-        flow_def = flow_parser.parse(flow_raw_data, flow_path.name)
+        flow_def = config_mgr.load_config_object(flow_details, _meta.FlowDefinition)
 
         # Auto-wiring and inference only applied to externally loaded flows for now
         flow_def = cls._autowire_flow(flow_def, job_config)
```
tracdap/rt/_exec/engine.py
CHANGED
```diff
@@ -19,6 +19,7 @@ import dataclasses as dc
 import enum
 import typing as tp
 
+import tracdap.rt.metadata as _meta
 import tracdap.rt.config as _cfg
 import tracdap.rt.exceptions as _ex
 import tracdap.rt._exec.actors as _actors
@@ -28,7 +29,6 @@ import tracdap.rt._impl.models as _models # noqa
 import tracdap.rt._impl.data as _data # noqa
 import tracdap.rt._impl.storage as _storage # noqa
 import tracdap.rt._impl.util as _util # noqa
-from .actors import Signal
 
 from .graph import NodeId
 
@@ -66,6 +66,18 @@ class _EngineContext:
     failed_nodes: tp.Set[NodeId] = dc.field(default_factory=set)
 
 
+@dc.dataclass
+class _JobState:
+
+    job_id: _meta.TagHeader
+    job_config: _cfg.JobConfig
+
+    actor_id: _actors.ActorId = None
+
+    job_result: _cfg.JobResult = None
+    job_error: Exception = None
+
+
 class TracEngine(_actors.Actor):
 
     """
@@ -88,7 +100,7 @@ class TracEngine(_actors.Actor):
         self._storage = storage
         self._notify_callback = notify_callback
 
-        self.
+        self._jobs: tp.Dict[str, _JobState] = dict()
 
     def on_start(self):
 
@@ -98,7 +110,7 @@ class TracEngine(_actors.Actor):
 
         self._log.info("Engine shutdown complete")
 
-    def on_signal(self, signal: Signal) -> tp.Optional[bool]:
+    def on_signal(self, signal: _actors.Signal) -> tp.Optional[bool]:
 
         # Failed signals can propagate from leaf nodes up the actor tree for a job
         # If the failure goes all the way up the tree without being handled, it will reach the engine node
@@ -110,8 +122,8 @@
             failed_job_key = None
 
             # Look for the job key corresponding to the failed actor
-            for job_key,
-                if
+            for job_key, job_state in self._jobs.items():
+                if job_state.actor_id == signal.sender:
                     failed_job_key = job_key
 
             # If the job is still live, call job_failed explicitly
@@ -147,19 +159,34 @@
         job_processor = JobProcessor(job_key, job_config, result_spec,self._models, self._storage)
         job_actor_id = self.actors().spawn(job_processor)
 
-
-
+        job_state = _JobState(job_config.jobId, job_config)
+        job_state.actor_id = job_actor_id
+
+        self._jobs[job_key] = job_state
+
+    @_actors.Message
+    def get_job_list(self):
+
+        job_list = list(map(self._get_job_info, self._jobs.keys()))
+        self.actors().reply("job_list", job_list)
+
+    @_actors.Message
+    def get_job_details(self, job_key: str, details: bool):
+
+        details = self._get_job_info(job_key, details)
+        self.actors().reply("job_details", details)
 
     @_actors.Message
     def job_succeeded(self, job_key: str, job_result: _cfg.JobResult):
 
         # Ignore duplicate messages from the job processor (can happen in unusual error cases)
-        if job_key not in self.
+        if job_key not in self._jobs:
            self._log.warning(f"Ignoring [job_succeeded] message, job [{job_key}] has already completed")
            return
 
        self._log.info(f"Recording job as successful: {job_key}")
 
+       self._jobs[job_key].job_result = job_result
        self._finalize_job(job_key)
 
        if self._notify_callback is not None:
@@ -169,12 +196,13 @@
     def job_failed(self, job_key: str, error: Exception):
 
         # Ignore duplicate messages from the job processor (can happen in unusual error cases)
-        if job_key not in self.
+        if job_key not in self._jobs:
            self._log.warning(f"Ignoring [job_failed] message, job [{job_key}] has already completed")
            return
 
        self._log.error(f"Recording job as failed: {job_key}")
 
+       self._jobs[job_key].job_error = error
        self._finalize_job(job_key)
 
        if self._notify_callback is not None:
@@ -182,10 +210,47 @@
 
     def _finalize_job(self, job_key: str):
 
-
-
-
-
+        # Stop the actor but keep the job state available for status / results queries
+
+        # In the future, job state will need to be expunged after some period of time
+        # For now each instance of the runtime only processes one job so no need to worry
+
+        job_state = self._jobs.get(job_key)
+        job_actor_id = job_state.actor_id if job_state is not None else None
+
+        if job_actor_id is not None:
+            self.actors().stop(job_actor_id)
+            job_state.actor_id = None
+
+    def _get_job_info(self, job_key: str, details: bool = False) -> tp.Optional[_cfg.JobResult]:
+
+        job_state = self._jobs.get(job_key)
+
+        if job_state is None:
+            return None
+
+        job_result = _cfg.JobResult()
+        job_result.jobId = job_state.job_id
+
+        if job_state.actor_id is not None:
+            job_result.statusCode = _meta.JobStatusCode.RUNNING
+
+        elif job_state.job_result is not None:
+            job_result.statusCode = job_state.job_result.statusCode
+            job_result.statusMessage = job_state.job_result.statusMessage
+            if details:
+                job_result.results = job_state.job_result.results or dict()
+
+        elif job_state.job_error is not None:
+            job_result.statusCode = _meta.JobStatusCode.FAILED
+            job_result.statusMessage = str(job_state.job_error.args[0])
+
+        else:
+            # Alternatively return UNKNOWN status or throw an error here
+            job_result.statusCode = _meta.JobStatusCode.FAILED
+            job_result.statusMessage = "No details available"
+
+        return job_result
 
 
 class JobProcessor(_actors.Actor):
@@ -218,7 +283,7 @@ class JobProcessor(_actors.Actor):
         self._log.info(f"Cleaning up job [{self.job_key}]")
         self._models.destroy_scope(self.job_key)
 
-    def on_signal(self, signal: Signal) -> tp.Optional[bool]:
+    def on_signal(self, signal: _actors.Signal) -> tp.Optional[bool]:
 
         if signal.message == _actors.SignalNames.FAILED and isinstance(signal, _actors.ErrorSignal):
 
```
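Because job state now lives in `_JobState` records rather than dying with the `JobProcessor` actor, the engine can answer `get_job_list` / `get_job_details` queries after a job completes. A hedged sketch of how a caller might drive those messages through the new external actor API; the `StatusQuery` agent is illustrative, and the handler names follow the `reply("job_list", ...)` / `reply("job_details", ...)` calls in the diff:

```python
import typing as tp

import tracdap.rt.config as cfg
import tracdap.rt._exec.actors as actors


class StatusQuery(actors.ThreadsafeActor):
    """Illustrative agent that asks the engine for its job list."""

    def __init__(self, engine_id):
        super().__init__()
        self._engine_id = engine_id

    def on_start(self):
        # ThreadsafeContext.send posts the request onto the event loop
        self.threadsafe().send(self._engine_id, "get_job_list")

    @actors.Message
    def job_list(self, jobs: tp.List[cfg.JobResult]):
        for job in jobs:
            print(job.jobId, job.statusCode)
        self.threadsafe().stop()


# system is a running ActorSystem whose main actor is the TracEngine,
# as the runtime wires it; spawn_agent() and main_id() come from actors.py
# agent_id = system.spawn_agent(StatusQuery(system.main_id()))
```

This is presumably the path the expanded runtime API server (`server.py`, +306 lines in this release) uses to serve status requests, though that code is not shown here.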
tracdap/rt/_exec/runtime.py
CHANGED
```diff
@@ -16,6 +16,7 @@ from __future__ import annotations
 
 import dataclasses as dc
 import datetime as dt
+import signal
 import threading
 
 import sys
@@ -54,6 +55,8 @@ class TracRuntime:
         _engine.ModelNodeProcessor: "model",
         _engine.DataNodeProcessor: "data"}
 
+    __DEFAULT_API_PORT = 9000
+
     def __init__(
             self,
             sys_config: tp.Union[str, pathlib.Path, _cfg.RuntimeConfig],
@@ -61,6 +64,7 @@
             job_result_format: tp.Optional[str] = None,
             scratch_dir: tp.Union[str, pathlib.Path, None] = None,
             scratch_dir_persist: bool = False,
+            plugin_packages: tp.List[str] = None,
             dev_mode: bool = False):
 
         trac_version = _version.__version__
@@ -83,28 +87,34 @@
         self._log.info(f"TRAC D.A.P. Python Runtime {trac_version}")
 
         self._sys_config = sys_config if isinstance(sys_config, _cfg.RuntimeConfig) else None
-        self._sys_config_path =
+        self._sys_config_path = sys_config if not self._sys_config else None
         self._job_result_dir = job_result_dir
         self._job_result_format = job_result_format
         self._scratch_dir = scratch_dir
         self._scratch_dir_provided = True if scratch_dir is not None else False
         self._scratch_dir_persist = scratch_dir_persist
+        self._plugin_packages = plugin_packages or []
         self._dev_mode = dev_mode
-        self._server_enabled = False
-        self._server_port = 0
 
+        # Runtime control
+        self._runtime_lock = threading.Lock()
+        self._runtime_event = threading.Condition(self._runtime_lock)
         self._pre_start_complete = False
+        self._shutdown_requested = False
+        self._oneshot_job = None
 
         # Top level resources
+        self._config_mgr: tp.Optional[_cparse.ConfigManager] = None
         self._models: tp.Optional[_models.ModelLoader] = None
         self._storage: tp.Optional[_storage.StorageManager] = None
 
         # The execution engine
         self._system: tp.Optional[_actors.ActorSystem] = None
         self._engine: tp.Optional[_engine.TracEngine] = None
-        self._engine_event = threading.Condition()
 
         # Runtime API server
+        self._server_enabled = False
+        self._server_port = 0
         self._server = None
 
         self._jobs: tp.Dict[str, _RuntimeJobInfo] = dict()
@@ -134,21 +144,28 @@
 
         self._prepare_scratch_dir()
 
-        # Plugin manager
-        #
+        # Plugin manager, static API and guard rails are singletons
+        # Calling these methods multiple times is safe (e.g. for embedded or testing scenarios)
+        # However, plugins are never un-registered for the lifetime of the processes
 
         _plugins.PluginManager.register_core_plugins()
+
+        for plugin_package in self._plugin_packages:
+            _plugins.PluginManager.register_plugin_package(plugin_package)
+
         _static_api.StaticApiImpl.register_impl()
         _guard.PythonGuardRails.protect_dangerous_functions()
 
         # Load sys config (or use embedded), config errors are detected before start()
         # Job config can also be checked before start() by using load_job_config()
 
+        self._config_mgr = _cparse.ConfigManager.for_root_config(self._sys_config_path)
+
         if self._sys_config is None:
             sys_config_dev_mode = _dev_mode.DEV_MODE_SYS_CONFIG if self._dev_mode else None
-
-
-
+            self._sys_config = self._config_mgr.load_root_object(
+                _cfg.RuntimeConfig, sys_config_dev_mode,
+                config_file_name="system")
         else:
             self._log.info("Using embedded system config")
 
@@ -156,8 +173,15 @@
         # I.e. it can be applied to embedded configs
 
         if self._dev_mode:
-
-
+            self._sys_config = _dev_mode.DevModeTranslator.translate_sys_config(self._sys_config, self._config_mgr)
+
+        # Runtime API server is controlled by the sys config
+
+        if self._sys_config.runtimeApi is not None:
+            api_config = self._sys_config.runtimeApi
+            if api_config.enabled:
+                self._server_enabled = True
+                self._server_port = api_config.port or self.__DEFAULT_API_PORT
 
         self._pre_start_complete = True
 
@@ -196,7 +220,7 @@
             # The server module pulls in all the gRPC dependencies, don't import it unless we have to
             import tracdap.rt._exec.server as _server
 
-            self._server = _server.RuntimeApiServer(self._server_port)
+            self._server = _server.RuntimeApiServer(self._system, self._server_port)
             self._server.start()
 
         except Exception as e:
@@ -237,6 +261,28 @@
         else:
             self._log.info("TRAC runtime has gone down cleanly")
 
+    def is_oneshot(self):
+        return not self._server_enabled
+
+    def run_until_done(self):
+
+        if self._server_enabled == False and len(self._jobs) == 0:
+            self._log.error("No job config supplied, TRAC runtime will not run")
+            raise _ex.EStartup("No job config supplied")
+
+        signal.signal(signal.SIGTERM, self._request_shutdown)
+        signal.signal(signal.SIGINT, self._request_shutdown)
+
+        with self._runtime_lock:
+            while not self._shutdown_requested:
+                self._runtime_event.wait()
+
+    def _request_shutdown(self, _signum = None, _frame = None):
+
+        with self._runtime_lock:
+            self._shutdown_requested = True
+            self._runtime_event.notify()
+
     def _prepare_scratch_dir(self):
 
         if not self._scratch_dir_provided:
@@ -274,20 +320,18 @@
 
         if isinstance(job_config, _cfg.JobConfig):
             self._log.info("Using embedded job config")
-            job_config_path = None
 
         else:
-            job_config_path = job_config
             job_config_dev_mode = _dev_mode.DEV_MODE_JOB_CONFIG if self._dev_mode else None
-
-
-
+            job_config = self._config_mgr.load_config_object(
+                job_config, _cfg.JobConfig,
+                job_config_dev_mode,
+                config_file_name="job")
 
         if self._dev_mode:
-            config_dir = job_config_path.parent if job_config_path is not None else None
             job_config = _dev_mode.DevModeTranslator.translate_job_config(
                 self._sys_config, job_config,
-                self._scratch_dir,
+                self._scratch_dir, self._config_mgr,
                 model_class)
 
         return job_config
@@ -297,7 +341,7 @@
         job_key = _util.object_key(job_config.jobId)
         self._jobs[job_key] = _RuntimeJobInfo()
 
-        self._system.
+        self._system.send_main(
             "submit_job", job_config,
             str(self._job_result_dir) if self._job_result_dir else "",
             self._job_result_format if self._job_result_format else "")
@@ -309,35 +353,34 @@
         if job_key not in self._jobs:
             raise _ex.ETracInternal(f"Attempt to wait for a job that was never started")
 
-
-        while True:
+        self._oneshot_job = job_key
 
-
+        self.run_until_done()
 
-
-            raise job_info.error
+        job_info = self._jobs[job_key]
 
-
-
+        if job_info.error is not None:
+            raise job_info.error
 
-
+        elif job_info.result is not None:
+            return job_info.result
 
-
+        else:
+            err = f"No result or error information is available for job [{job_key}]"
+            self._log.error(err)
+            raise _ex.ETracInternal(err)
 
     def _engine_callback(self, job_key, job_result, job_error):
 
-
-
-
-
-
-
-        self._jobs[job_key].done = True
-        self._jobs[job_key].error = job_error
-        else:
-            pass
+        if job_result is not None:
+            self._jobs[job_key].done = True
+            self._jobs[job_key].result = job_result
+        elif job_error is not None:
+            self._jobs[job_key].done = True
+            self._jobs[job_key].error = job_error
 
-
+        if self._oneshot_job == job_key:
+            self._request_shutdown()
 
 # ------------------------------------------------------------------------------------------------------------------
 # Error handling
```
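The new `run_until_done()` / `_request_shutdown()` pair replaces the old polling loop with a condition variable that is signalled either by SIGTERM / SIGINT (server mode) or by the engine callback when a one-shot job completes. The pattern in isolation, as a small self-contained sketch rather than TRAC code (class and attribute names here mirror the diff but are otherwise made up):

```python
import signal
import threading


class ShutdownGate:
    """Minimal version of the wait/notify pattern used by TracRuntime.run_until_done()."""

    def __init__(self):
        self._lock = threading.Lock()
        self._event = threading.Condition(self._lock)
        self._shutdown_requested = False

    def run_until_done(self):
        # SIGTERM and SIGINT both route into the same shutdown request
        signal.signal(signal.SIGTERM, self._request_shutdown)
        signal.signal(signal.SIGINT, self._request_shutdown)

        with self._lock:
            while not self._shutdown_requested:
                self._event.wait()

    def _request_shutdown(self, _signum=None, _frame=None):
        # Also callable directly, e.g. when a one-shot job finishes
        with self._lock:
            self._shutdown_requested = True
            self._event.notify()
```

In the runtime, `_engine_callback` calls `_request_shutdown()` when the tracked one-shot job completes, which is what lets the one-shot wait path return, while in server mode the process simply blocks here until a termination signal arrives.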