tracdap-runtime 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracdap/rt/_exec/actors.py +87 -10
- tracdap/rt/_exec/context.py +207 -100
- tracdap/rt/_exec/dev_mode.py +52 -20
- tracdap/rt/_exec/engine.py +79 -14
- tracdap/rt/_exec/functions.py +14 -17
- tracdap/rt/_exec/runtime.py +83 -40
- tracdap/rt/_exec/server.py +306 -29
- tracdap/rt/_impl/config_parser.py +219 -49
- tracdap/rt/_impl/data.py +70 -5
- tracdap/rt/_impl/grpc/codec.py +60 -5
- tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2.py +19 -19
- tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2.pyi +11 -9
- tracdap/rt/_impl/grpc/tracdap/api/internal/runtime_pb2_grpc.py +25 -25
- tracdap/rt/_impl/grpc/tracdap/metadata/data_pb2.py +18 -18
- tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.py +28 -16
- tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.pyi +37 -6
- tracdap/rt/_impl/grpc/tracdap/metadata/object_pb2.py +8 -3
- tracdap/rt/_impl/grpc/tracdap/metadata/object_pb2.pyi +13 -2
- tracdap/rt/_impl/guard_rails.py +21 -0
- tracdap/rt/_impl/models.py +25 -0
- tracdap/rt/_impl/static_api.py +43 -13
- tracdap/rt/_impl/type_system.py +17 -0
- tracdap/rt/_impl/validation.py +47 -4
- tracdap/rt/_plugins/config_local.py +49 -0
- tracdap/rt/_version.py +1 -1
- tracdap/rt/api/hook.py +6 -5
- tracdap/rt/api/model_api.py +50 -7
- tracdap/rt/api/static_api.py +81 -23
- tracdap/rt/config/__init__.py +4 -4
- tracdap/rt/config/common.py +25 -15
- tracdap/rt/config/job.py +2 -2
- tracdap/rt/config/platform.py +25 -35
- tracdap/rt/config/result.py +2 -2
- tracdap/rt/config/runtime.py +4 -2
- tracdap/rt/ext/config.py +34 -0
- tracdap/rt/ext/embed.py +1 -3
- tracdap/rt/ext/plugins.py +47 -6
- tracdap/rt/launch/cli.py +11 -4
- tracdap/rt/launch/launch.py +53 -12
- tracdap/rt/metadata/__init__.py +17 -17
- tracdap/rt/metadata/common.py +2 -2
- tracdap/rt/metadata/custom.py +3 -3
- tracdap/rt/metadata/data.py +12 -12
- tracdap/rt/metadata/file.py +6 -6
- tracdap/rt/metadata/flow.py +6 -6
- tracdap/rt/metadata/job.py +8 -8
- tracdap/rt/metadata/model.py +21 -11
- tracdap/rt/metadata/object.py +3 -0
- tracdap/rt/metadata/object_id.py +8 -8
- tracdap/rt/metadata/search.py +5 -5
- tracdap/rt/metadata/stoarge.py +6 -6
- tracdap/rt/metadata/tag.py +1 -1
- tracdap/rt/metadata/tag_update.py +1 -1
- tracdap/rt/metadata/type.py +4 -4
- {tracdap_runtime-0.6.2.dist-info → tracdap_runtime-0.6.4.dist-info}/METADATA +4 -4
- tracdap_runtime-0.6.4.dist-info/RECORD +112 -0
- {tracdap_runtime-0.6.2.dist-info → tracdap_runtime-0.6.4.dist-info}/WHEEL +1 -1
- tracdap/rt/_impl/grpc/tracdap/config/common_pb2.py +0 -55
- tracdap/rt/_impl/grpc/tracdap/config/common_pb2.pyi +0 -103
- tracdap/rt/_impl/grpc/tracdap/config/job_pb2.py +0 -42
- tracdap/rt/_impl/grpc/tracdap/config/job_pb2.pyi +0 -44
- tracdap/rt/_impl/grpc/tracdap/config/platform_pb2.py +0 -71
- tracdap/rt/_impl/grpc/tracdap/config/platform_pb2.pyi +0 -197
- tracdap/rt/_impl/grpc/tracdap/config/result_pb2.py +0 -37
- tracdap/rt/_impl/grpc/tracdap/config/result_pb2.pyi +0 -35
- tracdap/rt/_impl/grpc/tracdap/config/runtime_pb2.py +0 -42
- tracdap/rt/_impl/grpc/tracdap/config/runtime_pb2.pyi +0 -46
- tracdap/rt/ext/_guard.py +0 -37
- tracdap_runtime-0.6.2.dist-info/RECORD +0 -121
- {tracdap_runtime-0.6.2.dist-info → tracdap_runtime-0.6.4.dist-info}/LICENSE +0 -0
- {tracdap_runtime-0.6.2.dist-info → tracdap_runtime-0.6.4.dist-info}/top_level.txt +0 -0
tracdap/rt/_exec/dev_mode.py
CHANGED
@@ -46,7 +46,7 @@ class DevModeTranslator:
     _log: tp.Optional[_util.logging.Logger] = None
 
     @classmethod
-    def translate_sys_config(cls, sys_config: _cfg.RuntimeConfig,
+    def translate_sys_config(cls, sys_config: _cfg.RuntimeConfig, config_mgr: _cfg_p.ConfigManager):
 
         cls._log.info(f"Applying dev mode config translation to system config")
 
@@ -56,7 +56,7 @@ class DevModeTranslator:
             sys_config.storage = _cfg.StorageConfig()
 
         sys_config = cls._add_integrated_repo(sys_config)
-        sys_config = cls._resolve_relative_storage_root(sys_config,
+        sys_config = cls._resolve_relative_storage_root(sys_config, config_mgr)
 
         return sys_config
 
@@ -66,7 +66,7 @@ class DevModeTranslator:
             sys_config: _cfg.RuntimeConfig,
             job_config: _cfg.JobConfig,
             scratch_dir: pathlib.Path,
-
+            config_mgr: _cfg_p.ConfigManager,
             model_class: tp.Optional[_api.TracModel.__class__]) \
             -> _cfg.JobConfig:
 
@@ -84,7 +84,7 @@ class DevModeTranslator:
 
         # Fow flows, load external flow definitions then perform auto-wiring and type inference
         if job_config.job.jobType == _meta.JobType.RUN_FLOW:
-            job_config = cls._process_flow_definition(job_config,
+            job_config = cls._process_flow_definition(job_config, config_mgr)
 
         # For run (model|flow) jobs, apply processing to the parameters, inputs and outputs
         if job_config.job.jobType in [_meta.JobType.RUN_MODEL, _meta.JobType.RUN_FLOW]:
@@ -109,7 +109,7 @@ class DevModeTranslator:
     @classmethod
     def _resolve_relative_storage_root(
             cls, sys_config: _cfg.RuntimeConfig,
-
+            config_mgr: _cfg_p.ConfigManager):
 
         storage_config = copy.deepcopy(sys_config.storage)
 
@@ -128,6 +128,7 @@ class DevModeTranslator:
 
             cls._log.info(f"Resolving relative path for [{bucket_key}] local storage...")
 
+            sys_config_path = config_mgr.config_dir_path()
             if sys_config_path is not None:
                 absolute_path = sys_config_path.joinpath(root_path).resolve()
                 if absolute_path.exists():
@@ -291,7 +292,7 @@ class DevModeTranslator:
         return model_id, model_object
 
     @classmethod
-    def _process_flow_definition(cls, job_config: _cfg.JobConfig,
+    def _process_flow_definition(cls, job_config: _cfg.JobConfig, config_mgr: _cfg_p.ConfigManager) -> _cfg.JobConfig:
 
         flow_details = job_config.job.runFlow.flow
 
@@ -305,21 +306,15 @@ class DevModeTranslator:
             cls._log.error(err)
             raise _ex.EConfigParse(err)
 
-        flow_path = config_dir.joinpath(flow_details) if config_dir is not None else pathlib.Path(flow_details)
-
-        if not flow_path.exists():
-            err = f"Flow definition not available for [{flow_details}]: File not found ({flow_path})"
-            cls._log.error(err)
-            raise _ex.EConfigParse(err)
-
         flow_id = _util.new_object_id(_meta.ObjectType.FLOW)
         flow_key = _util.object_key(flow_id)
 
-        cls._log.info(f"Generating flow definition
+        cls._log.info(f"Generating flow definition from [{flow_details}] with ID = [{flow_key}]")
+
+        flow_def = config_mgr.load_config_object(flow_details, _meta.FlowDefinition)
 
-
-
-        flow_def = flow_parser.parse(flow_raw_data, flow_path.name)
+        # Validate models against the flow (this could move to _impl.validation and check prod jobs as well)
+        cls._check_models_for_flow(flow_def, job_config)
 
         # Auto-wiring and inference only applied to externally loaded flows for now
         flow_def = cls._autowire_flow(flow_def, job_config)
@@ -339,6 +334,37 @@ class DevModeTranslator:
 
         return job_config
 
+    @classmethod
+    def _check_models_for_flow(cls, flow: _meta.FlowDefinition, job_config: _cfg.JobConfig):
+
+        model_nodes = dict(filter(lambda n: n[1].nodeType == _meta.FlowNodeType.MODEL_NODE, flow.nodes.items()))
+
+        missing_models = list(filter(lambda m: m not in job_config.job.runFlow.models, model_nodes.keys()))
+        extra_models = list(filter(lambda m: m not in model_nodes, job_config.job.runFlow.models.keys()))
+
+        if any(missing_models):
+            error = f"Missing models in job definition: {', '.join(missing_models)}"
+            cls._log.error(error)
+            raise _ex.EJobValidation(error)
+
+        if any (extra_models):
+            error = f"Extra models in job definition: {', '.join(extra_models)}"
+            cls._log.error(error)
+            raise _ex.EJobValidation(error)
+
+        for model_name, model_node in model_nodes.items():
+
+            model_selector = job_config.job.runFlow.models[model_name]
+            model_obj = _util.get_job_resource(model_selector, job_config)
+
+            model_inputs = set(model_obj.model.inputs.keys())
+            model_outputs = set(model_obj.model.outputs.keys())
+
+            if model_inputs != set(model_node.inputs) or model_outputs != set(model_node.outputs):
+                error = f"The model supplied for [{model_name}] does not match the flow definition"
+                cls._log.error(error)
+                raise _ex.EJobValidation(error)
+
     @classmethod
     def _autowire_flow(cls, flow: _meta.FlowDefinition, job_config: _cfg.JobConfig):
 
@@ -629,11 +655,13 @@ class DevModeTranslator:
             job_details = job_config.job.runModel
             model_obj = _util.get_job_resource(job_details.model, job_config)
             required_inputs = model_obj.model.inputs
+            required_outputs = model_obj.model.outputs
 
         elif job_config.job.jobType == _meta.JobType.RUN_FLOW:
             job_details = job_config.job.runFlow
             flow_obj = _util.get_job_resource(job_details.flow, job_config)
             required_inputs = flow_obj.flow.inputs
+            required_outputs = flow_obj.flow.outputs
 
         else:
             return job_config
@@ -645,7 +673,8 @@ class DevModeTranslator:
         for input_key, input_value in job_inputs.items():
             if not (isinstance(input_value, str) and input_value in job_resources):
 
-
+                model_input = required_inputs[input_key]
+                input_schema = model_input.schema if model_input and not model_input.dynamic else None
 
                 input_id = cls._process_input_or_output(
                     sys_config, input_key, input_value, job_resources,
@@ -656,9 +685,12 @@ class DevModeTranslator:
         for output_key, output_value in job_outputs.items():
             if not (isinstance(output_value, str) and output_value in job_resources):
 
+                model_output= required_outputs[output_key]
+                output_schema = model_output.schema if model_output and not model_output.dynamic else None
+
                 output_id = cls._process_input_or_output(
                     sys_config, output_key, output_value, job_resources,
-                    new_unique_file=True, schema=
+                    new_unique_file=True, schema=output_schema)
 
                 job_outputs[output_key] = _util.selector_for(output_id)
 
@@ -776,7 +808,7 @@ class DevModeTranslator:
         if schema is not None:
            data_def.schema = schema
         else:
-            data_def.schema =
+            data_def.schema = None
 
         data_def.storageId = _meta.TagSelector(
             _meta.ObjectType.STORAGE, storage_id.objectId,
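A note on the dev_mode.py hunks above: external file access is now routed through the new ConfigManager rather than being resolved by hand from a config directory. The minimal sketch below shows that call pattern using only the calls visible in this diff; the module path for ConfigManager and the file names are assumptions, not part of the package.

import tracdap.rt.metadata as _meta
import tracdap.rt._impl.config_parser as _cfg_p  # assumed location of ConfigManager

# The root config file anchors all relative references (path below is hypothetical)
config_mgr = _cfg_p.ConfigManager.for_root_config("etc/sys_config.yaml")

# Flow definitions referenced by a RUN_FLOW job are loaded through the config manager
flow_def = config_mgr.load_config_object("flows/my_flow.yaml", _meta.FlowDefinition)

# Relative storage roots are resolved against the directory of the root config file
config_dir = config_mgr.config_dir_path()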
tracdap/rt/_exec/engine.py
CHANGED
@@ -19,6 +19,7 @@ import dataclasses as dc
 import enum
 import typing as tp
 
+import tracdap.rt.metadata as _meta
 import tracdap.rt.config as _cfg
 import tracdap.rt.exceptions as _ex
 import tracdap.rt._exec.actors as _actors
@@ -28,7 +29,6 @@ import tracdap.rt._impl.models as _models # noqa
 import tracdap.rt._impl.data as _data # noqa
 import tracdap.rt._impl.storage as _storage # noqa
 import tracdap.rt._impl.util as _util # noqa
-from .actors import Signal
 
 from .graph import NodeId
 
@@ -66,6 +66,18 @@ class _EngineContext:
     failed_nodes: tp.Set[NodeId] = dc.field(default_factory=set)
 
 
+@dc.dataclass
+class _JobState:
+
+    job_id: _meta.TagHeader
+    job_config: _cfg.JobConfig
+
+    actor_id: _actors.ActorId = None
+
+    job_result: _cfg.JobResult = None
+    job_error: Exception = None
+
+
 class TracEngine(_actors.Actor):
 
     """
@@ -88,7 +100,7 @@ class TracEngine(_actors.Actor):
         self._storage = storage
         self._notify_callback = notify_callback
 
-        self.
+        self._jobs: tp.Dict[str, _JobState] = dict()
 
     def on_start(self):
 
@@ -98,7 +110,7 @@ class TracEngine(_actors.Actor):
 
         self._log.info("Engine shutdown complete")
 
-    def on_signal(self, signal: Signal) -> tp.Optional[bool]:
+    def on_signal(self, signal: _actors.Signal) -> tp.Optional[bool]:
 
         # Failed signals can propagate from leaf nodes up the actor tree for a job
         # If the failure goes all the way up the tree without being handled, it will reach the engine node
@@ -110,8 +122,8 @@ class TracEngine(_actors.Actor):
         failed_job_key = None
 
         # Look for the job key corresponding to the failed actor
-        for job_key,
-            if
+        for job_key, job_state in self._jobs.items():
+            if job_state.actor_id == signal.sender:
                 failed_job_key = job_key
 
         # If the job is still live, call job_failed explicitly
@@ -147,19 +159,34 @@ class TracEngine(_actors.Actor):
         job_processor = JobProcessor(job_key, job_config, result_spec,self._models, self._storage)
         job_actor_id = self.actors().spawn(job_processor)
 
-
-
+        job_state = _JobState(job_config.jobId, job_config)
+        job_state.actor_id = job_actor_id
+
+        self._jobs[job_key] = job_state
+
+    @_actors.Message
+    def get_job_list(self):
+
+        job_list = list(map(self._get_job_info, self._jobs.keys()))
+        self.actors().reply("job_list", job_list)
+
+    @_actors.Message
+    def get_job_details(self, job_key: str, details: bool):
+
+        details = self._get_job_info(job_key, details)
+        self.actors().reply("job_details", details)
 
     @_actors.Message
     def job_succeeded(self, job_key: str, job_result: _cfg.JobResult):
 
         # Ignore duplicate messages from the job processor (can happen in unusual error cases)
-        if job_key not in self.
+        if job_key not in self._jobs:
             self._log.warning(f"Ignoring [job_succeeded] message, job [{job_key}] has already completed")
             return
 
         self._log.info(f"Recording job as successful: {job_key}")
 
+        self._jobs[job_key].job_result = job_result
         self._finalize_job(job_key)
 
         if self._notify_callback is not None:
@@ -169,12 +196,13 @@ class TracEngine(_actors.Actor):
     def job_failed(self, job_key: str, error: Exception):
 
         # Ignore duplicate messages from the job processor (can happen in unusual error cases)
-        if job_key not in self.
+        if job_key not in self._jobs:
             self._log.warning(f"Ignoring [job_failed] message, job [{job_key}] has already completed")
             return
 
         self._log.error(f"Recording job as failed: {job_key}")
 
+        self._jobs[job_key].job_error = error
         self._finalize_job(job_key)
 
         if self._notify_callback is not None:
@@ -182,10 +210,47 @@ class TracEngine(_actors.Actor):
 
     def _finalize_job(self, job_key: str):
 
-
-
-
-
+        # Stop the actor but keep the job state available for status / results queries
+
+        # In the future, job state will need to be expunged after some period of time
+        # For now each instance of the runtime only processes one job so no need to worry
+
+        job_state = self._jobs.get(job_key)
+        job_actor_id = job_state.actor_id if job_state is not None else None
+
+        if job_actor_id is not None:
+            self.actors().stop(job_actor_id)
+            job_state.actor_id = None
+
+    def _get_job_info(self, job_key: str, details: bool = False) -> tp.Optional[_cfg.JobResult]:
+
+        job_state = self._jobs.get(job_key)
+
+        if job_state is None:
+            return None
+
+        job_result = _cfg.JobResult()
+        job_result.jobId = job_state.job_id
+
+        if job_state.actor_id is not None:
+            job_result.statusCode = _meta.JobStatusCode.RUNNING
+
+        elif job_state.job_result is not None:
+            job_result.statusCode = job_state.job_result.statusCode
+            job_result.statusMessage = job_state.job_result.statusMessage
+            if details:
+                job_result.results = job_state.job_result.results or dict()
+
+        elif job_state.job_error is not None:
+            job_result.statusCode = _meta.JobStatusCode.FAILED
+            job_result.statusMessage = str(job_state.job_error.args[0])
+
+        else:
+            # Alternatively return UNKNOWN status or throw an error here
+            job_result.statusCode = _meta.JobStatusCode.FAILED
+            job_result.statusMessage = "No details available"
+
+        return job_result
 
 
 class JobProcessor(_actors.Actor):
@@ -218,7 +283,7 @@ class JobProcessor(_actors.Actor):
         self._log.info(f"Cleaning up job [{self.job_key}]")
         self._models.destroy_scope(self.job_key)
 
-    def on_signal(self, signal: Signal) -> tp.Optional[bool]:
+    def on_signal(self, signal: _actors.Signal) -> tp.Optional[bool]:
 
         if signal.message == _actors.SignalNames.FAILED and isinstance(signal, _actors.ErrorSignal):
 
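The engine.py hunks above keep a per-job _JobState record after the JobProcessor actor is stopped, so the new get_job_list / get_job_details messages can answer status queries. The standalone sketch below mirrors the status mapping in _get_job_info using only the fields and status codes referenced in the hunks; the class and function names here are illustrative, not the tracdap API.

import dataclasses as dc
import typing as tp

@dc.dataclass
class JobStateSketch:
    # Mirrors the fields of the new _JobState dataclass above
    actor_id: tp.Optional[str] = None           # set while the JobProcessor actor is alive
    job_result: tp.Optional[tp.Any] = None      # recorded by job_succeeded
    job_error: tp.Optional[Exception] = None    # recorded by job_failed

def describe(state: JobStateSketch) -> str:
    # Same precedence as _get_job_info: live actor, then stored result, then stored error
    if state.actor_id is not None:
        return "RUNNING"
    if state.job_result is not None:
        return str(state.job_result.statusCode)  # status copied from the stored JobResult
    if state.job_error is not None:
        return f"FAILED: {state.job_error.args[0]}"
    return "FAILED: No details available"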
tracdap/rt/_exec/functions.py
CHANGED
@@ -252,7 +252,13 @@ class DataViewFunc(NodeFunction[_data.DataView]):
         if root_item.is_empty():
             return _data.DataView.create_empty()
 
-
+        if self.node.schema is not None and len(self.node.schema.table.fields) > 0:
+            trac_schema = self.node.schema
+        else:
+            arrow_schema = root_item.schema
+            trac_schema = _data.DataMapping.arrow_to_trac_schema(arrow_schema)
+
+        data_view = _data.DataView.for_trac_schema(trac_schema)
         data_view = _data.DataMapping.add_item_to_view(data_view, root_part_key, root_item)
 
         return data_view
@@ -544,7 +550,6 @@ class RunModelFunc(NodeFunction[Bundle[_data.DataView]]):
         # Still, if any nodes are missing or have the wrong type TracContextImpl will raise ERuntimeValidation
 
         local_ctx = {}
-        static_schemas = {}
 
         for node_id, node_result in _ctx_iter_items(ctx):
 
@@ -558,22 +563,10 @@ class RunModelFunc(NodeFunction[Bundle[_data.DataView]]):
             if node_id.name in model_def.inputs:
                 input_name = node_id.name
                 local_ctx[input_name] = node_result
-                # At the moment, all model inputs have static schemas
-                static_schemas[input_name] = model_def.inputs[input_name].schema
-
-        # Add empty data views to the local context to hold model outputs
-        # Assuming outputs are all defined with static schemas
-
-        for output_name in model_def.outputs:
-            output_schema = self.node.model_def.outputs[output_name].schema
-            empty_data_view = _data.DataView.for_trac_schema(output_schema)
-            local_ctx[output_name] = empty_data_view
-            # At the moment, all model outputs have static schemas
-            static_schemas[output_name] = output_schema
 
         # Run the model against the mapped local context
 
-        trac_ctx = _ctx.TracContextImpl(self.node.model_def, self.model_class, local_ctx,
+        trac_ctx = _ctx.TracContextImpl(self.node.model_def, self.model_class, local_ctx, self.checkout_directory)
 
         try:
             model = self.model_class()
@@ -594,12 +587,16 @@ class RunModelFunc(NodeFunction[Bundle[_data.DataView]]):
             result: _data.DataView = local_ctx.get(output_name)
 
             if result is None or result.is_empty():
+
                 if not output_schema.optional:
                     model_name = self.model_class.__name__
                     raise _ex.ERuntimeValidation(f"Missing required output [{output_name}] from model [{model_name}]")
 
-
-
+                # Create a placeholder for optional outputs that were not emitted
+                elif result is None:
+                    result = _data.DataView.create_empty()
+
+            results[output_name] = result
 
         return results
 
tracdap/rt/_exec/runtime.py
CHANGED
@@ -16,6 +16,7 @@ from __future__ import annotations
 
 import dataclasses as dc
 import datetime as dt
+import signal
 import threading
 
 import sys
@@ -54,6 +55,8 @@ class TracRuntime:
         _engine.ModelNodeProcessor: "model",
         _engine.DataNodeProcessor: "data"}
 
+    __DEFAULT_API_PORT = 9000
+
     def __init__(
             self,
             sys_config: tp.Union[str, pathlib.Path, _cfg.RuntimeConfig],
@@ -61,6 +64,7 @@ class TracRuntime:
             job_result_format: tp.Optional[str] = None,
             scratch_dir: tp.Union[str, pathlib.Path, None] = None,
             scratch_dir_persist: bool = False,
+            plugin_packages: tp.List[str] = None,
             dev_mode: bool = False):
 
         trac_version = _version.__version__
@@ -83,28 +87,34 @@ class TracRuntime:
         self._log.info(f"TRAC D.A.P. Python Runtime {trac_version}")
 
         self._sys_config = sys_config if isinstance(sys_config, _cfg.RuntimeConfig) else None
-        self._sys_config_path =
+        self._sys_config_path = sys_config if not self._sys_config else None
         self._job_result_dir = job_result_dir
         self._job_result_format = job_result_format
         self._scratch_dir = scratch_dir
         self._scratch_dir_provided = True if scratch_dir is not None else False
         self._scratch_dir_persist = scratch_dir_persist
+        self._plugin_packages = plugin_packages or []
         self._dev_mode = dev_mode
-        self._server_enabled = False
-        self._server_port = 0
 
+        # Runtime control
+        self._runtime_lock = threading.Lock()
+        self._runtime_event = threading.Condition(self._runtime_lock)
         self._pre_start_complete = False
+        self._shutdown_requested = False
+        self._oneshot_job = None
 
         # Top level resources
+        self._config_mgr: tp.Optional[_cparse.ConfigManager] = None
         self._models: tp.Optional[_models.ModelLoader] = None
         self._storage: tp.Optional[_storage.StorageManager] = None
 
         # The execution engine
         self._system: tp.Optional[_actors.ActorSystem] = None
         self._engine: tp.Optional[_engine.TracEngine] = None
-        self._engine_event = threading.Condition()
 
         # Runtime API server
+        self._server_enabled = False
+        self._server_port = 0
         self._server = None
 
         self._jobs: tp.Dict[str, _RuntimeJobInfo] = dict()
@@ -134,21 +144,28 @@ class TracRuntime:
 
         self._prepare_scratch_dir()
 
-        # Plugin manager
-        #
+        # Plugin manager, static API and guard rails are singletons
+        # Calling these methods multiple times is safe (e.g. for embedded or testing scenarios)
+        # However, plugins are never un-registered for the lifetime of the processes
 
         _plugins.PluginManager.register_core_plugins()
+
+        for plugin_package in self._plugin_packages:
+            _plugins.PluginManager.register_plugin_package(plugin_package)
+
         _static_api.StaticApiImpl.register_impl()
         _guard.PythonGuardRails.protect_dangerous_functions()
 
         # Load sys config (or use embedded), config errors are detected before start()
         # Job config can also be checked before start() by using load_job_config()
 
+        self._config_mgr = _cparse.ConfigManager.for_root_config(self._sys_config_path)
+
         if self._sys_config is None:
             sys_config_dev_mode = _dev_mode.DEV_MODE_SYS_CONFIG if self._dev_mode else None
-
-
-
+            self._sys_config = self._config_mgr.load_root_object(
+                _cfg.RuntimeConfig, sys_config_dev_mode,
+                config_file_name="system")
         else:
             self._log.info("Using embedded system config")
 
@@ -156,8 +173,15 @@ class TracRuntime:
         # I.e. it can be applied to embedded configs
 
         if self._dev_mode:
-
-
+            self._sys_config = _dev_mode.DevModeTranslator.translate_sys_config(self._sys_config, self._config_mgr)
+
+        # Runtime API server is controlled by the sys config
+
+        if self._sys_config.runtimeApi is not None:
+            api_config = self._sys_config.runtimeApi
+            if api_config.enabled:
+                self._server_enabled = True
+                self._server_port = api_config.port or self.__DEFAULT_API_PORT
 
         self._pre_start_complete = True
 
@@ -196,7 +220,7 @@
             # The server module pulls in all the gRPC dependencies, don't import it unless we have to
             import tracdap.rt._exec.server as _server
 
-            self._server = _server.RuntimeApiServer(self._server_port)
+            self._server = _server.RuntimeApiServer(self._system, self._server_port)
             self._server.start()
 
         except Exception as e:
@@ -237,6 +261,28 @@
         else:
             self._log.info("TRAC runtime has gone down cleanly")
 
+    def is_oneshot(self):
+        return not self._server_enabled
+
+    def run_until_done(self):
+
+        if self._server_enabled == False and len(self._jobs) == 0:
+            self._log.error("No job config supplied, TRAC runtime will not run")
+            raise _ex.EStartup("No job config supplied")
+
+        signal.signal(signal.SIGTERM, self._request_shutdown)
+        signal.signal(signal.SIGINT, self._request_shutdown)
+
+        with self._runtime_lock:
+            while not self._shutdown_requested:
+                self._runtime_event.wait()
+
+    def _request_shutdown(self, _signum = None, _frame = None):
+
+        with self._runtime_lock:
+            self._shutdown_requested = True
+            self._runtime_event.notify()
+
     def _prepare_scratch_dir(self):
 
         if not self._scratch_dir_provided:
@@ -274,20 +320,18 @@
 
         if isinstance(job_config, _cfg.JobConfig):
             self._log.info("Using embedded job config")
-            job_config_path = None
 
         else:
-            job_config_path = job_config
             job_config_dev_mode = _dev_mode.DEV_MODE_JOB_CONFIG if self._dev_mode else None
-
-
-
+            job_config = self._config_mgr.load_config_object(
+                job_config, _cfg.JobConfig,
+                job_config_dev_mode,
+                config_file_name="job")
 
         if self._dev_mode:
-            config_dir = job_config_path.parent if job_config_path is not None else None
             job_config = _dev_mode.DevModeTranslator.translate_job_config(
                 self._sys_config, job_config,
-                self._scratch_dir,
+                self._scratch_dir, self._config_mgr,
                 model_class)
 
         return job_config
@@ -297,7 +341,7 @@
         job_key = _util.object_key(job_config.jobId)
         self._jobs[job_key] = _RuntimeJobInfo()
 
-        self._system.
+        self._system.send_main(
             "submit_job", job_config,
             str(self._job_result_dir) if self._job_result_dir else "",
             self._job_result_format if self._job_result_format else "")
@@ -309,35 +353,34 @@
         if job_key not in self._jobs:
             raise _ex.ETracInternal(f"Attempt to wait for a job that was never started")
 
-
-        while True:
+        self._oneshot_job = job_key
 
-
+        self.run_until_done()
 
-
-            raise job_info.error
+        job_info = self._jobs[job_key]
 
-
-
+        if job_info.error is not None:
+            raise job_info.error
 
-
+        elif job_info.result is not None:
+            return job_info.result
 
-
+        else:
+            err = f"No result or error information is available for job [{job_key}]"
+            self._log.error(err)
+            raise _ex.ETracInternal(err)
 
     def _engine_callback(self, job_key, job_result, job_error):
 
-
-
-
-
-
-
-            self._jobs[job_key].done = True
-            self._jobs[job_key].error = job_error
-        else:
-            pass
+        if job_result is not None:
+            self._jobs[job_key].done = True
+            self._jobs[job_key].result = job_result
+        elif job_error is not None:
+            self._jobs[job_key].done = True
+            self._jobs[job_key].error = job_error
 
-
+        if self._oneshot_job == job_key:
+            self._request_shutdown()
 
 # ------------------------------------------------------------------------------------------------------------------
 # Error handling
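The runtime.py hunks above replace the old engine-event polling with a shutdown latch: run_until_done() blocks on a Condition until _request_shutdown() fires, either from a SIGTERM/SIGINT handler or from the engine callback once a one-shot job completes. The self-contained sketch below shows that mechanism in isolation; the names here are illustrative and not the tracdap API.

import signal
import threading

_lock = threading.Lock()
_event = threading.Condition(_lock)
_shutdown_requested = False

def _request_shutdown(_signum=None, _frame=None):
    # Can be called from a signal handler or directly, e.g. when a one-shot job finishes
    global _shutdown_requested
    with _lock:
        _shutdown_requested = True
        _event.notify()

def run_until_done():
    # Wake up on SIGTERM / SIGINT (e.g. Ctrl-C), or when _request_shutdown() is called directly
    signal.signal(signal.SIGTERM, _request_shutdown)
    signal.signal(signal.SIGINT, _request_shutdown)
    with _lock:
        while not _shutdown_requested:
            _event.wait()

if __name__ == "__main__":
    run_until_done()  # blocks until a signal arrives or _request_shutdown() is called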