tracdap-runtime 0.6.5__py3-none-any.whl → 0.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -39,8 +39,9 @@ class _EngineNode:
39
39
  """
40
40
 
41
41
  node: _graph.Node
42
- dependencies: tp.Dict[NodeId, _graph.DependencyType]
43
42
  function: tp.Optional[_func.NodeFunction] = None
43
+
44
+ dependencies: tp.Dict[NodeId, _graph.DependencyType] = dc.field(default_factory=dict)
44
45
  complete: bool = False
45
46
  result: tp.Optional[tp.Any] = None
46
47
  error: tp.Optional[str] = None
@@ -57,21 +58,35 @@ class _EngineContext:
57
58
  Represents the state of an execution graph being processed by the TRAC engine
58
59
  """
59
60
 
61
+ engine_id: _actors.ActorId
62
+ job_key: str
63
+ root_id: NodeId
64
+
60
65
  nodes: tp.Dict[NodeId, _EngineNode]
61
66
  pending_nodes: tp.Set[NodeId] = dc.field(default_factory=set)
62
67
  active_nodes: tp.Set[NodeId] = dc.field(default_factory=set)
63
68
  succeeded_nodes: tp.Set[NodeId] = dc.field(default_factory=set)
64
69
  failed_nodes: tp.Set[NodeId] = dc.field(default_factory=set)
65
70
 
71
+ def with_updates(
72
+ self, nodes,
73
+ pending_nodes, active_nodes,
74
+ succeeded_nodes, failed_nodes) -> "_EngineContext":
75
+
76
+ return _EngineContext(
77
+ self.engine_id, self.job_key, self.root_id, nodes,
78
+ pending_nodes, active_nodes, succeeded_nodes, failed_nodes)
79
+
66
80
 
67
81
  @dc.dataclass
68
82
  class _JobState:
69
83
 
70
84
  job_id: _meta.TagHeader
71
- job_config: _cfg.JobConfig
72
-
73
85
  actor_id: _actors.ActorId = None
74
86
 
87
+ monitors: tp.List[_actors.ActorId] = dc.field(default_factory=list)
88
+
89
+ job_config: _cfg.JobConfig = None
75
90
  job_result: _cfg.JobResult = None
76
91
  job_error: Exception = None
77
92
 
@@ -154,14 +169,35 @@ class TracEngine(_actors.Actor):
154
169
 
155
170
  self._log.info(f"Job submitted: [{job_key}]")
156
171
 
157
- job_processor = JobProcessor(job_key, job_config, result_spec,self._models, self._storage)
172
+ job_processor = JobProcessor(self._models, self._storage, job_key, job_config, result_spec, graph_spec=None)
158
173
  job_actor_id = self.actors().spawn(job_processor)
159
174
 
160
- job_state = _JobState(job_config.jobId, job_config)
175
+ job_monitor_success = lambda ctx, key, result: self._notify_callback(key, result, None)
176
+ job_monitor_failure = lambda ctx, key, error: self._notify_callback(key, None, error)
177
+ job_monitor = JobMonitor(job_key, job_monitor_success, job_monitor_failure)
178
+ job_monitor_id = self.actors().spawn(job_monitor)
179
+
180
+ job_state = _JobState(job_config.jobId)
161
181
  job_state.actor_id = job_actor_id
182
+ job_state.monitors.append(job_monitor_id)
183
+ job_state.job_config = job_config
162
184
 
163
185
  self._jobs[job_key] = job_state
164
186
 
187
+ @_actors.Message
188
+ def submit_child_job(self, child_id: _meta.TagHeader, child_graph: _graph.Graph, monitor_id: _actors.ActorId):
189
+
190
+ child_key = _util.object_key(child_id)
191
+
192
+ child_processor = JobProcessor(self._models, self._storage, child_key, None, None, graph_spec=child_graph) # noqa
193
+ child_actor_id = self.actors().spawn(child_processor)
194
+
195
+ child_state = _JobState(child_id)
196
+ child_state.actor_id = child_actor_id
197
+ child_state.monitors.append(monitor_id)
198
+
199
+ self._jobs[child_key] = child_state
200
+
165
201
  @_actors.Message
166
202
  def get_job_list(self):
167
203
 
@@ -184,11 +220,13 @@ class TracEngine(_actors.Actor):
184
220
 
185
221
  self._log.info(f"Recording job as successful: {job_key}")
186
222
 
187
- self._jobs[job_key].job_result = job_result
188
- self._finalize_job(job_key)
223
+ job_state = self._jobs[job_key]
224
+ job_state.job_result = job_result
225
+
226
+ for monitor_id in job_state.monitors:
227
+ self.actors().send(monitor_id, "job_succeeded", job_result)
189
228
 
190
- if self._notify_callback is not None:
191
- self._notify_callback(job_key, job_result, None)
229
+ self._finalize_job(job_key)
192
230
 
193
231
  @_actors.Message
194
232
  def job_failed(self, job_key: str, error: Exception):
@@ -200,11 +238,13 @@ class TracEngine(_actors.Actor):
200
238
 
201
239
  self._log.error(f"Recording job as failed: {job_key}")
202
240
 
203
- self._jobs[job_key].job_error = error
204
- self._finalize_job(job_key)
241
+ job_state = self._jobs[job_key]
242
+ job_state.job_error = error
243
+
244
+ for monitor_id in job_state.monitors:
245
+ self.actors().send(monitor_id, "job_failed", error)
205
246
 
206
- if self._notify_callback is not None:
207
- self._notify_callback(job_key, None, error)
247
+ self._finalize_job(job_key)
208
248
 
209
249
  def _finalize_job(self, job_key: str):
210
250
 
@@ -214,10 +254,17 @@ class TracEngine(_actors.Actor):
214
254
  # For now each instance of the runtime only processes one job so no need to worry
215
255
 
216
256
  job_state = self._jobs.get(job_key)
217
- job_actor_id = job_state.actor_id if job_state is not None else None
218
257
 
219
- if job_actor_id is not None:
220
- self.actors().stop(job_actor_id)
258
+ # Stop any monitors that were created directly by the engine
259
+ # (Other actors are responsible for stopping their own monitors)
260
+ while job_state.monitors:
261
+ monitor_id = job_state.monitors.pop()
262
+ monitor_parent = monitor_id[:monitor_id.rfind('/')]
263
+ if self.actors().id == monitor_parent:
264
+ self.actors().stop(monitor_id)
265
+
266
+ if job_state.actor_id is not None:
267
+ self.actors().stop(job_state.actor_id )
221
268
  job_state.actor_id = None
222
269
 
223
270
  def _get_job_info(self, job_key: str, details: bool = False) -> tp.Optional[_cfg.JobResult]:
@@ -251,6 +298,35 @@ class TracEngine(_actors.Actor):
251
298
  return job_result
252
299
 
253
300
 
301
+ class JobMonitor(_actors.Actor):
302
+
303
+ def __init__(
304
+ self, job_key: str,
305
+ success_func: tp.Callable[[_actors.ActorContext, str, _cfg.JobResult], None],
306
+ failure_func: tp.Callable[[_actors.ActorContext, str, Exception], None]):
307
+
308
+ super().__init__()
309
+ self._job_key = job_key
310
+ self._success_func = success_func
311
+ self._failure_func = failure_func
312
+ self._signal_sent = False
313
+
314
+ @_actors.Message
315
+ def job_succeeded(self, job_result: _cfg.JobResult):
316
+ self._success_func(self.actors(), self._job_key, job_result)
317
+ self._signal_sent = True
318
+
319
+ @_actors.Message
320
+ def job_failed(self, error: Exception):
321
+ self._failure_func(self.actors(), self._job_key, error)
322
+ self._signal_sent = True
323
+
324
+ def on_stop(self):
325
+ if not self._signal_sent:
326
+ error = _ex.ETracInternal(f"No result was received for job [{self._job_key}]")
327
+ self._failure_func(self.actors(), self._job_key, error)
328
+
329
+
254
330
  class JobProcessor(_actors.Actor):
255
331
 
256
332
  """
@@ -259,26 +335,32 @@ class JobProcessor(_actors.Actor):
259
335
  """
260
336
 
261
337
  def __init__(
262
- self, job_key, job_config: _cfg.JobConfig,
263
- result_spec: _graph.JobResultSpec,
264
- models: _models.ModelLoader,
265
- storage: _storage.StorageManager):
338
+ self, models: _models.ModelLoader, storage: _storage.StorageManager,
339
+ job_key: str, job_config: _cfg.JobConfig, result_spec: _graph.JobResultSpec,
340
+ graph_spec: tp.Optional[_graph.Graph]):
266
341
 
267
342
  super().__init__()
268
343
  self.job_key = job_key
269
344
  self.job_config = job_config
270
345
  self.result_spec = result_spec
346
+ self.graph_spec = graph_spec
271
347
  self._models = models
272
348
  self._storage = storage
273
349
  self._resolver = _func.FunctionResolver(models, storage)
274
350
  self._log = _util.logger_for_object(self)
275
351
 
276
352
  def on_start(self):
353
+
277
354
  self._log.info(f"Starting job [{self.job_key}]")
278
355
  self._models.create_scope(self.job_key)
279
- self.actors().spawn(GraphBuilder(self.job_config, self.result_spec, self._resolver))
356
+
357
+ if self.graph_spec is not None:
358
+ self.actors().send(self.actors().id, "build_graph_succeeded", self.graph_spec)
359
+ else:
360
+ self.actors().spawn(GraphBuilder(self.job_config, self.result_spec))
280
361
 
281
362
  def on_stop(self):
363
+
282
364
  self._log.info(f"Cleaning up job [{self.job_key}]")
283
365
  self._models.destroy_scope(self.job_key)
284
366
 
@@ -303,9 +385,26 @@ class JobProcessor(_actors.Actor):
303
385
  return super().on_signal(signal)
304
386
 
305
387
  @_actors.Message
306
- def job_graph(self, graph: _EngineContext, root_id: NodeId):
307
- self.actors().spawn(GraphProcessor(graph, root_id, self._resolver))
308
- self.actors().stop(self.actors().sender)
388
+ def build_graph_succeeded(self, graph_spec: _graph.Graph):
389
+
390
+ # Build a new engine context graph from the graph spec
391
+ engine_id = self.actors().parent
392
+ nodes = dict((node_id, _EngineNode(node)) for node_id, node in graph_spec.nodes.items())
393
+ graph = _EngineContext(engine_id, self.job_key, graph_spec.root_id, nodes)
394
+
395
+ # Add all the nodes as pending nodes to start
396
+ graph.pending_nodes.update(graph.nodes.keys())
397
+
398
+ self.actors().spawn(FunctionResolver(self._resolver, graph))
399
+ if self.actors().sender != self.actors().id and self.actors().sender != self.actors().parent:
400
+ self.actors().stop(self.actors().sender)
401
+
402
+ @_actors.Message
403
+ def resolve_functions_succeeded(self, graph: _EngineContext):
404
+
405
+ self.actors().spawn(GraphProcessor(graph, self._resolver))
406
+ if self.actors().sender != self.actors().id and self.actors().sender != self.actors().parent:
407
+ self.actors().stop(self.actors().sender)
309
408
 
310
409
  @_actors.Message
311
410
  def job_succeeded(self, job_result: _cfg.JobResult):
@@ -323,44 +422,54 @@ class JobProcessor(_actors.Actor):
323
422
  class GraphBuilder(_actors.Actor):
324
423
 
325
424
  """
326
- GraphBuilder is a worker (actors.Worker) responsible for building the execution graph for a job
327
- The logic for graph building is provided in graph_builder.py
425
+ GraphBuilder is a worker (actor) to wrap the GraphBuilder logic from graph_builder.py
328
426
  """
329
427
 
330
- def __init__(
331
- self, job_config: _cfg.JobConfig,
332
- result_spec: _graph.JobResultSpec,
333
- resolver: _func.FunctionResolver):
334
-
428
+ def __init__(self, job_config: _cfg.JobConfig, result_spec: _graph.JobResultSpec):
335
429
  super().__init__()
336
430
  self.job_config = job_config
337
431
  self.result_spec = result_spec
338
- self.graph: tp.Optional[_EngineContext] = None
339
-
340
- self._resolver = resolver
341
432
  self._log = _util.logger_for_object(self)
342
433
 
343
434
  def on_start(self):
435
+ self.build_graph(self, self.job_config)
436
+
437
+ @_actors.Message
438
+ def build_graph(self, job_config: _cfg.JobConfig):
344
439
 
345
440
  self._log.info("Building execution graph")
346
441
 
347
442
  # TODO: Get sys config, or find a way to pass storage settings
348
- graph_data = _graph.GraphBuilder.build_job(self.job_config, self.result_spec)
349
- graph_nodes = {node_id: _EngineNode(node, {}) for node_id, node in graph_data.nodes.items()}
350
- graph = _EngineContext(graph_nodes, pending_nodes=set(graph_nodes.keys()))
443
+ graph_builder = _graph.GraphBuilder(job_config, self.result_spec)
444
+ graph_spec = graph_builder.build_job(job_config.job)
351
445
 
352
- self._log.info("Resolving graph nodes to executable code")
446
+ self.actors().reply("build_graph_succeeded", graph_spec)
353
447
 
354
- for node_id, node in graph.nodes.items():
355
- node.function = self._resolver.resolve_node(node.node)
356
448
 
449
+ class FunctionResolver(_actors.Actor):
450
+
451
+ """
452
+ GraphResolver is a worker (actors) to wrap the FunctionResolver logic in functions.py
453
+ """
454
+
455
+ def __init__(self, resolver: _func.FunctionResolver, graph: _EngineContext):
456
+ super().__init__()
357
457
  self.graph = graph
358
- self.actors().send_parent("job_graph", self.graph, graph_data.root_id)
458
+ self._resolver = resolver
459
+ self._log = _util.logger_for_object(self)
460
+
461
+ def on_start(self):
462
+ self.resolve_functions(self, self.graph)
359
463
 
360
464
  @_actors.Message
361
- def get_execution_graph(self):
465
+ def resolve_functions(self, graph: _EngineContext):
362
466
 
363
- self.actors().send(self.actors().sender, "job_graph", self.graph)
467
+ self._log.info("Resolving graph nodes to executable code")
468
+
469
+ for node_id, node in graph.nodes.items():
470
+ node.function = self._resolver.resolve_node(node.node)
471
+
472
+ self.actors().reply("resolve_functions_succeeded", graph)
364
473
 
365
474
 
366
475
  class GraphProcessor(_actors.Actor):
@@ -376,10 +485,10 @@ class GraphProcessor(_actors.Actor):
376
485
  Once all running nodes are stopped, an error is reported to the parent
377
486
  """
378
487
 
379
- def __init__(self, graph: _EngineContext, root_id: NodeId, resolver: _func.FunctionResolver):
488
+ def __init__(self, graph: _EngineContext, resolver: _func.FunctionResolver):
380
489
  super().__init__()
381
490
  self.graph = graph
382
- self.root_id = root_id
491
+ self.root_id_ = graph.root_id
383
492
  self.processors: tp.Dict[NodeId, _actors.ActorId] = dict()
384
493
  self._resolver = resolver
385
494
  self._log = _util.logger_for_object(self)
@@ -427,12 +536,14 @@ class GraphProcessor(_actors.Actor):
427
536
  # Model and data nodes map to different thread pools in the actors engine
428
537
  # There is scope for a much more sophisticated approach, with prioritized scheduling
429
538
 
430
- if isinstance(node.node, _graph.RunModelNode) or isinstance(node.node, _graph.ImportModelNode):
431
- processor = ModelNodeProcessor(processed_graph, node_id, node)
539
+ if isinstance(node.node, _graph.ChildJobNode):
540
+ processor = ChildJobNodeProcessor(processed_graph, node)
541
+ elif isinstance(node.node, _graph.RunModelNode) or isinstance(node.node, _graph.ImportModelNode):
542
+ processor = ModelNodeProcessor(processed_graph, node)
432
543
  elif isinstance(node.node, _graph.LoadDataNode) or isinstance(node.node, _graph.SaveDataNode):
433
- processor = DataNodeProcessor(processed_graph, node_id, node)
544
+ processor = DataNodeProcessor(processed_graph, node)
434
545
  else:
435
- processor = NodeProcessor(processed_graph, node_id, node)
546
+ processor = NodeProcessor(processed_graph, node)
436
547
 
437
548
  # New nodes can be launched with the updated graph
438
549
  # Anything that was pruned is not needed by the new node
@@ -502,7 +613,7 @@ class GraphProcessor(_actors.Actor):
502
613
  for node_id, node in new_nodes.items():
503
614
  GraphLogger.log_node_add(node)
504
615
  node_func = self._resolver.resolve_node(node)
505
- new_node = _EngineNode(node, {}, function=node_func)
616
+ new_node = _EngineNode(node, node_func)
506
617
  new_graph.nodes[node_id] = new_node
507
618
  new_graph.pending_nodes.add(node_id)
508
619
 
@@ -625,9 +736,10 @@ class GraphProcessor(_actors.Actor):
625
736
  for node_id in list(filter(lambda n: n.namespace == context_pop, nodes)):
626
737
  nodes.pop(node_id)
627
738
 
628
- graph = _EngineContext(nodes, pending_nodes, active_nodes, succeeded_nodes, failed_nodes)
739
+ self.graph = self.graph.with_updates(
740
+ nodes, pending_nodes, active_nodes,
741
+ succeeded_nodes, failed_nodes)
629
742
 
630
- self.graph = graph
631
743
  self.check_job_status()
632
744
 
633
745
  def check_job_status(self, do_submit=True):
@@ -657,7 +769,7 @@ class GraphProcessor(_actors.Actor):
657
769
  self.actors().send_parent("job_failed", _ex.EModelExec("Job suffered multiple errors", errors))
658
770
 
659
771
  else:
660
- job_result = self.graph.nodes[self.root_id].result
772
+ job_result = self.graph.nodes[self.graph.root_id].result
661
773
  self.actors().send_parent("job_succeeded", job_result)
662
774
 
663
775
 
@@ -669,11 +781,12 @@ class NodeProcessor(_actors.Actor):
669
781
 
670
782
  __NONE_TYPE = type(None)
671
783
 
672
- def __init__(self, graph: _EngineContext, node_id: NodeId, node: _EngineNode):
784
+ def __init__(self, graph: _EngineContext, node: _EngineNode):
673
785
  super().__init__()
674
786
  self.graph = graph
675
- self.node_id = node_id
676
787
  self.node = node
788
+ self.node_id = node.node.id
789
+
677
790
 
678
791
  def on_start(self):
679
792
 
@@ -782,14 +895,59 @@ class NodeProcessor(_actors.Actor):
782
895
 
783
896
  class ModelNodeProcessor(NodeProcessor):
784
897
 
785
- def __init__(self, graph: _EngineContext, node_id: NodeId, node: _EngineNode):
786
- super().__init__(graph, node_id, node)
898
+ def __init__(self, graph: _EngineContext, node: _EngineNode):
899
+ super().__init__(graph, node)
787
900
 
788
901
 
789
902
  class DataNodeProcessor(NodeProcessor):
790
903
 
791
- def __init__(self, graph: _EngineContext, node_id: NodeId, node: _EngineNode):
792
- super().__init__(graph, node_id, node)
904
+ def __init__(self, graph: _EngineContext, node: _EngineNode):
905
+ super().__init__(graph, node)
906
+
907
+
908
+ class ChildJobNodeProcessor(NodeProcessor):
909
+
910
+ def __init__(self, graph: _EngineContext, node: _EngineNode):
911
+ super().__init__(graph, node)
912
+
913
+ @_actors.Message
914
+ def evaluate_node(self):
915
+
916
+ NodeLogger.log_node_start(self.node)
917
+
918
+ job_id = self.node.node.job_id # noqa
919
+ job_key = _util.object_key(job_id)
920
+
921
+ node_id = self.actors().id
922
+
923
+ def success_callback(ctx, _, result):
924
+ ctx.send(node_id, "child_job_succeeded", result)
925
+
926
+ def failure_callback(ctx, _, error):
927
+ ctx.send(node_id, "child_job_failed", error)
928
+
929
+ monitor = JobMonitor(job_key, success_callback, failure_callback)
930
+ monitor_id = self.actors().spawn(monitor)
931
+
932
+ graph_spec: _graph.Graph = self.node.node.graph # noqa
933
+
934
+ self.actors().send(self.graph.engine_id, "submit_child_job", job_id, graph_spec, monitor_id)
935
+
936
+ @_actors.Message
937
+ def child_job_succeeded(self, job_result: _cfg.JobResult):
938
+
939
+ self._check_result_type(job_result)
940
+
941
+ NodeLogger.log_node_succeeded(self.node)
942
+
943
+ self.actors().send_parent("node_succeeded", self.node_id, job_result)
944
+
945
+ @_actors.Message
946
+ def child_job_failed(self, job_error: Exception):
947
+
948
+ NodeLogger.log_node_failed(self.node, job_error)
949
+
950
+ self.actors().send_parent("node_failed", self.node_id, job_error)
793
951
 
794
952
 
795
953
  class GraphLogger:
@@ -623,6 +623,17 @@ class RunModelFunc(NodeFunction[Bundle[_data.DataView]]):
623
623
  storage_impl = self.storage_manager.get_file_storage(storage_key, external=True)
624
624
  storage = _ctx.TracFileStorageImpl(storage_key, storage_impl, write_access, self.checkout_directory)
625
625
  storage_map[storage_key] = storage
626
+ elif self.storage_manager.has_data_storage(storage_key, external=True):
627
+ storage_impl = self.storage_manager.get_data_storage(storage_key, external=True)
628
+ # This is a work-around until the storage extension API can be updated / unified
629
+ if not isinstance(storage_impl, _storage.IDataStorageBase):
630
+ raise _ex.EStorageConfig(f"External storage for [{storage_key}] is using the legacy storage framework]")
631
+ converter = _data.DataConverter.noop()
632
+ storage = _ctx.TracDataStorageImpl(storage_key, storage_impl, converter, write_access, self.checkout_directory)
633
+ storage_map[storage_key] = storage
634
+ else:
635
+ raise _ex.EStorageConfig(f"External storage is not available: [{storage_key}]")
636
+
626
637
 
627
638
  # Run the model against the mapped local context
628
639
 
@@ -688,7 +699,7 @@ class RunModelFunc(NodeFunction[Bundle[_data.DataView]]):
688
699
  output_section = _graph.GraphBuilder.build_runtime_outputs(dynamic_outputs, self.node.id.namespace)
689
700
  new_nodes.update(output_section.nodes)
690
701
 
691
- ctx_id = NodeId.of("trac_build_result", self.node.id.namespace, result_type=None)
702
+ ctx_id = NodeId.of("trac_job_result", self.node.id.namespace, result_type=None)
692
703
  new_deps[ctx_id] = list(_graph.Dependency(nid, _graph.DependencyType.HARD) for nid in output_section.outputs)
693
704
 
694
705
  self.node_callback.send_graph_updates(new_nodes, new_deps)
@@ -696,6 +707,18 @@ class RunModelFunc(NodeFunction[Bundle[_data.DataView]]):
696
707
  return results
697
708
 
698
709
 
710
+ class ChildJobFunction(NodeFunction[None]):
711
+
712
+ def __init__(self, node: ChildJobNode):
713
+ super().__init__()
714
+ self.node = node
715
+
716
+ def _execute(self, ctx: NodeContext):
717
+ # This node should never execute, the engine intercepts child job nodes and provides special handling
718
+ raise _ex.ETracInternal("Child job was not processed correctly (this is a bug)")
719
+
720
+
721
+
699
722
  # ----------------------------------------------------------------------------------------------------------------------
700
723
  # FUNCTION RESOLUTION
701
724
  # ----------------------------------------------------------------------------------------------------------------------
@@ -779,6 +802,7 @@ class FunctionResolver:
779
802
  DataResultNode: DataResultFunc,
780
803
  StaticValueNode: StaticValueFunc,
781
804
  RuntimeOutputsNode: RuntimeOutputsFunc,
805
+ ChildJobNode: ChildJobFunction,
782
806
  BundleItemNode: NoopFunc,
783
807
  NoopNode: NoopFunc,
784
808
  RunModelResultNode: NoopFunc
tracdap/rt/_exec/graph.py CHANGED
@@ -414,3 +414,12 @@ class SaveJobResultNode(Node[None]):
414
414
 
415
415
  def _node_dependencies(self) -> tp.Dict[NodeId, DependencyType]:
416
416
  return {self.job_result_id: DependencyType.HARD}
417
+
418
+
419
+ @_node_type
420
+ class ChildJobNode(Node[cfg.JobResult]):
421
+
422
+ job_id: meta.TagHeader
423
+ job_def: meta.JobDefinition
424
+
425
+ graph: Graph