tracdap-runtime 0.7.0__py3-none-any.whl → 0.8.0b1__py3-none-any.whl

This diff compares the content of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
Files changed (34)
  1. tracdap/rt/_exec/context.py +140 -64
  2. tracdap/rt/_exec/dev_mode.py +144 -69
  3. tracdap/rt/_exec/engine.py +9 -7
  4. tracdap/rt/_exec/functions.py +95 -33
  5. tracdap/rt/_exec/graph.py +22 -15
  6. tracdap/rt/_exec/graph_builder.py +221 -98
  7. tracdap/rt/_exec/runtime.py +19 -6
  8. tracdap/rt/_impl/data.py +86 -13
  9. tracdap/rt/_impl/grpc/tracdap/metadata/file_pb2.py +3 -1
  10. tracdap/rt/_impl/grpc/tracdap/metadata/file_pb2.pyi +8 -0
  11. tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.py +27 -25
  12. tracdap/rt/_impl/grpc/tracdap/metadata/model_pb2.pyi +14 -4
  13. tracdap/rt/_impl/models.py +9 -7
  14. tracdap/rt/_impl/static_api.py +53 -33
  15. tracdap/rt/_impl/util.py +1 -1
  16. tracdap/rt/_impl/validation.py +54 -28
  17. tracdap/rt/_version.py +1 -1
  18. tracdap/rt/api/__init__.py +6 -3
  19. tracdap/rt/api/file_types.py +29 -0
  20. tracdap/rt/api/hook.py +15 -7
  21. tracdap/rt/api/model_api.py +16 -0
  22. tracdap/rt/api/static_api.py +211 -125
  23. tracdap/rt/config/__init__.py +6 -6
  24. tracdap/rt/config/common.py +11 -1
  25. tracdap/rt/config/platform.py +4 -6
  26. tracdap/rt/launch/launch.py +9 -11
  27. tracdap/rt/metadata/__init__.py +10 -9
  28. tracdap/rt/metadata/file.py +8 -0
  29. tracdap/rt/metadata/model.py +12 -2
  30. {tracdap_runtime-0.7.0.dist-info → tracdap_runtime-0.8.0b1.dist-info}/METADATA +15 -15
  31. {tracdap_runtime-0.7.0.dist-info → tracdap_runtime-0.8.0b1.dist-info}/RECORD +34 -33
  32. {tracdap_runtime-0.7.0.dist-info → tracdap_runtime-0.8.0b1.dist-info}/WHEEL +1 -1
  33. {tracdap_runtime-0.7.0.dist-info → tracdap_runtime-0.8.0b1.dist-info}/LICENSE +0 -0
  34. {tracdap_runtime-0.7.0.dist-info → tracdap_runtime-0.8.0b1.dist-info}/top_level.txt +0 -0
tracdap/rt/_exec/dev_mode.py
@@ -137,11 +137,14 @@ class DevModeTranslator:
  raise _ex.EConfigParse(msg)


- def __init__(self, sys_config: _cfg.RuntimeConfig, config_mgr: _cfg_p.ConfigManager, scratch_dir: pathlib.Path):
+ def __init__(
+ self, sys_config: _cfg.RuntimeConfig, config_mgr: _cfg_p.ConfigManager, scratch_dir: pathlib.Path = None,
+ model_loader: _models.ModelLoader = None, storage_manager: _storage.StorageManager = None):
+
  self._sys_config = sys_config
  self._config_mgr = config_mgr
- self._scratch_dir = scratch_dir
- self._model_loader: tp.Optional[_models.ModelLoader] = None
+ self._model_loader = model_loader or _models.ModelLoader(self._sys_config, scratch_dir)
+ self._storage_manager = storage_manager or _storage.StorageManager(self._sys_config)

  def translate_job_config(
  self, job_config: _cfg.JobConfig,
@@ -150,8 +153,6 @@ class DevModeTranslator:

  try:
  self._log.info(f"Applying dev mode config translation to job config")
-
- self._model_loader = _models.ModelLoader(self._sys_config, self._scratch_dir)
  self._model_loader.create_scope("DEV_MODE_TRANSLATION")

  job_config = copy.deepcopy(job_config)
@@ -168,7 +169,6 @@ class DevModeTranslator:

  finally:
  self._model_loader.destroy_scope("DEV_MODE_TRANSLATION")
- self._model_loader = None

  def translate_job_def(
  self, job_config: _cfg.JobConfig, job_def: _meta.JobDefinition,
@@ -694,7 +694,7 @@ class DevModeTranslator:

  model_selector = job_def.runFlow.models.get(source.node)
  model_obj = _util.get_job_resource(model_selector, job_config)
- model_input = model_obj.model.inputs.get(source.socket)
+ model_input = model_obj.model.outputs.get(source.socket)
  model_outputs.append(model_input)

  if len(model_outputs) == 0:
@@ -764,7 +764,7 @@ class DevModeTranslator:
  else:
  p_spec = param_specs[p_name]

- cls._log.info(f"Encoding parameter [{p_name}] as {p_spec.paramType.basicType}")
+ cls._log.info(f"Encoding parameter [{p_name}] as {p_spec.paramType.basicType.name}")

  encoded_value = _types.MetadataCodec.convert_value(p_value, p_spec.paramType)
  encoded_values[p_name] = encoded_value
@@ -798,38 +798,46 @@ class DevModeTranslator:
  if not (isinstance(input_value, str) and input_value in job_resources):

  model_input = required_inputs[input_key]
- input_schema = model_input.schema if model_input and not model_input.dynamic else None

- input_id = self._process_input_or_output(
- input_key, input_value, job_resources,
- new_unique_file=False, schema=input_schema)
+ if model_input.objectType == _meta.ObjectType.DATA:
+ schema = model_input.schema if model_input and not model_input.dynamic else None
+ input_id = self._process_data_socket(input_key, input_value, schema, job_resources, new_unique_file=False)
+ elif model_input.objectType == _meta.ObjectType.FILE:
+ file_type = model_input.fileType
+ input_id = self._process_file_socket(input_key, input_value, file_type, job_resources, new_unique_file=False)
+ else:
+ raise _ex.EUnexpected()

  job_inputs[input_key] = _util.selector_for(input_id)

  for output_key, output_value in job_outputs.items():
  if not (isinstance(output_value, str) and output_value in job_resources):

- model_output= required_outputs[output_key]
- output_schema = model_output.schema if model_output and not model_output.dynamic else None
+ model_output = required_outputs[output_key]

- output_id = self._process_input_or_output(
- output_key, output_value, job_resources,
- new_unique_file=True, schema=output_schema)
+ if model_output.objectType == _meta.ObjectType.DATA:
+ schema = model_output.schema if model_output and not model_output.dynamic else None
+ output_id = self._process_data_socket(output_key, output_value, schema, job_resources, new_unique_file=True)
+ elif model_output.objectType == _meta.ObjectType.FILE:
+ file_type = model_output.fileType
+ output_id = self._process_file_socket(output_key, output_value, file_type, job_resources, new_unique_file=True)
+ else:
+ raise _ex.EUnexpected()

  job_outputs[output_key] = _util.selector_for(output_id)

  return job_config, job_def

- def _process_input_or_output(
- self, data_key, data_value,
- resources: tp.Dict[str, _meta.ObjectDefinition],
- new_unique_file=False,
- schema: tp.Optional[_meta.SchemaDefinition] = None) \
+ def _process_data_socket(
+ self, data_key, data_value, schema: tp.Optional[_meta.SchemaDefinition],
+ resources: tp.Dict[str, _meta.ObjectDefinition], new_unique_file=False) \
  -> _meta.TagHeader:

  data_id = _util.new_object_id(_meta.ObjectType.DATA)
  storage_id = _util.new_object_id(_meta.ObjectType.STORAGE)

+ self._log.info(f"Generating data definition for [{data_key}] with ID = [{_util.object_key(data_id)}]")
+
  if isinstance(data_value, str):
  storage_path = data_value
  storage_key = self._sys_config.storage.defaultBucket
@@ -850,43 +858,85 @@ class DevModeTranslator:
  else:
  raise _ex.EConfigParse(f"Invalid configuration for input '{data_key}'")

- self._log.info(f"Generating data definition for [{data_key}] with ID = [{_util.object_key(data_id)}]")
-
  # For unique outputs, increment the snap number to find a new unique snap
  # These are not incarnations, bc likely in dev mode model code and inputs are changing
  # Incarnations are for recreation of a dataset using the exact same code path and inputs

  if new_unique_file:
+ storage_path, snap_version = self._new_unique_file(data_key, storage_key, storage_path, snap_version)

- x_storage_mgr = _storage.StorageManager(self._sys_config)
- x_storage = x_storage_mgr.get_file_storage(storage_key)
- x_orig_path = pathlib.PurePath(storage_path)
- x_name = x_orig_path.name
-
- if x_storage.exists(str(x_orig_path.parent)):
- listing = x_storage.ls(str(x_orig_path.parent))
- existing_files = list(map(lambda stat: stat.file_name, listing))
- else:
- existing_files = []
-
- while x_name in existing_files:
+ part_key = _meta.PartKey(opaqueKey="part-root", partType=_meta.PartType.PART_ROOT)
+ delta_index = 1
+ incarnation_index = 1

- snap_version += 1
- x_name = f"{x_orig_path.stem}-{snap_version}"
- storage_path = str(x_orig_path.parent.joinpath(x_name))
+ # This is also defined in functions.DynamicDataSpecFunc, maybe centralize?
+ data_item = f"data/table/{data_id.objectId}/{part_key.opaqueKey}/snap-{snap_version}/delta-{delta_index}"

- self._log.info(f"Output for [{data_key}] will be snap version {snap_version}")
+ data_obj = self._generate_data_definition(
+ part_key, snap_version, delta_index, data_item,
+ schema, storage_id)

- data_obj, storage_obj = self._generate_input_definition(
- data_id, storage_id, storage_key, storage_path, storage_format,
- snap_index=snap_version, delta_index=1, incarnation_index=1,
- schema=schema)
+ storage_obj = self._generate_storage_definition(
+ storage_id, storage_key, storage_path, storage_format,
+ data_item, incarnation_index)

  resources[_util.object_key(data_id)] = data_obj
  resources[_util.object_key(storage_id)] = storage_obj

  return data_id

+ def _process_file_socket(
+ self, file_key, file_value, file_type: _meta.FileType,
+ resources: tp.Dict[str, _meta.ObjectDefinition], new_unique_file=False) \
+ -> _meta.TagHeader:
+
+ file_id = _util.new_object_id(_meta.ObjectType.FILE)
+ storage_id = _util.new_object_id(_meta.ObjectType.STORAGE)
+
+ self._log.info(f"Generating file definition for [{file_key}] with ID = [{_util.object_key(file_id)}]")
+
+ if isinstance(file_value, str):
+
+ storage_key = self._sys_config.storage.defaultBucket
+ storage_path = file_value
+
+ elif isinstance(file_value, dict):
+
+ storage_key = file_value.get("storageKey") or self._sys_config.storage.defaultBucket
+ storage_path = file_value.get("path")
+
+ if not storage_path:
+ raise _ex.EConfigParse(f"Invalid configuration for input [{file_key}] (missing required value 'path'")
+
+ else:
+ raise _ex.EConfigParse(f"Invalid configuration for input '{file_key}'")
+
+ storage_format = "application/x-binary"
+ file_version = 1
+
+ if new_unique_file:
+ storage_path, file_version = self._new_unique_file(file_key, storage_key, storage_path, file_version)
+ file_size = 0
+ else:
+ storage = self._storage_manager.get_file_storage(storage_key)
+ file_size = storage.size(storage_path)
+
+ data_item = f"file/{file_id.objectId}/version-{file_version}"
+ file_name = f"{file_key}.{file_type.extension}"
+
+ file_obj = self._generate_file_definition(
+ file_name, file_type, file_size,
+ storage_id, data_item)
+
+ storage_obj = self._generate_storage_definition(
+ storage_id, storage_key, storage_path, storage_format,
+ data_item, incarnation_index=1)
+
+ resources[_util.object_key(file_id)] = file_obj
+ resources[_util.object_key(storage_id)] = storage_obj
+
+ return file_id
+
  @staticmethod
  def infer_format(storage_path: str, storage_config: _cfg.StorageConfig):

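Note: as the _process_data_socket / _process_file_socket split above shows, a dev-mode input or output value can be given either as a plain storage path (resolved against the default bucket) or as a dict with an optional "storageKey" and a required "path". A rough standalone illustration of the two value shapes and how they resolve, using made-up socket names, bucket names and paths rather than a real job config:

```python
# Hypothetical dev-mode socket values, mirroring the str / dict handling above
job_inputs = {
    "customer_data": "inputs/customer_data.csv",   # plain path -> default bucket
    "raw_document": {                              # dict form -> explicit bucket
        "storageKey": "staging_bucket",
        "path": "documents/raw_document.pdf",
    },
}

DEFAULT_BUCKET = "default_bucket"  # stands in for sys_config.storage.defaultBucket

for socket, value in job_inputs.items():
    if isinstance(value, str):
        storage_key, storage_path = DEFAULT_BUCKET, value
    elif isinstance(value, dict):
        storage_key = value.get("storageKey") or DEFAULT_BUCKET
        storage_path = value.get("path")
        if not storage_path:
            raise ValueError(f"Missing required value 'path' for [{socket}]")
    else:
        raise ValueError(f"Invalid configuration for input '{socket}'")
    print(f"{socket}: storageKey={storage_key}, path={storage_path}")
```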
@@ -898,20 +948,33 @@ class DevModeTranslator:
  else:
  return storage_config.defaultFormat

- @classmethod
- def _generate_input_definition(
- cls, data_id: _meta.TagHeader, storage_id: _meta.TagHeader,
- storage_key: str, storage_path: str, storage_format: str,
- snap_index: int, delta_index: int, incarnation_index: int,
- schema: tp.Optional[_meta.SchemaDefinition] = None) \
- -> (_meta.ObjectDefinition, _meta.ObjectDefinition):
+ def _new_unique_file(self, socket_name, storage_key, storage_path, version):

- part_key = _meta.PartKey(
- opaqueKey="part-root",
- partType=_meta.PartType.PART_ROOT)
+ x_storage = self._storage_manager.get_file_storage(storage_key)
+ x_orig_path = pathlib.PurePath(storage_path)
+ x_name = x_orig_path.name

- # This is also defined in functions.DynamicDataSpecFunc, maybe centralize?
- data_item = f"data/table/{data_id.objectId}/{part_key.opaqueKey}/snap-{snap_index}/delta-{delta_index}"
+ if x_storage.exists(str(x_orig_path.parent)):
+ listing = x_storage.ls(str(x_orig_path.parent))
+ existing_files = list(map(lambda stat: stat.file_name, listing))
+ else:
+ existing_files = []
+
+ while x_name in existing_files:
+
+ version += 1
+ x_name = f"{x_orig_path.stem}-{version}{x_orig_path.suffix}"
+ storage_path = str(x_orig_path.parent.joinpath(x_name))
+
+ self._log.info(f"Output for [{socket_name}] will be version {version}")
+
+ return storage_path, version
+
+ @classmethod
+ def _generate_data_definition(
+ cls, part_key: _meta.PartKey, snap_index: int, delta_index: int, data_item: str,
+ schema: tp.Optional[_meta.SchemaDefinition], storage_id: _meta.TagHeader) \
+ -> (_meta.ObjectDefinition, _meta.ObjectDefinition):

  delta = _meta.DataDefinition.Delta(
  deltaIndex=delta_index,
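Note: the new _new_unique_file helper is shared by data and file outputs. It lists the target directory and bumps a version suffix, re-appending the original file extension via x_orig_path.suffix (which the old snap-only loop dropped), until the name no longer clashes. A standalone sketch of the same loop, assuming a plain list of existing names in place of the storage manager:

```python
import pathlib

def pick_unique_name(storage_path, existing_files, version=1):
    # Mirrors the loop in _new_unique_file: bump the version and re-append the
    # original suffix until the candidate name no longer clashes with a file
    orig_path = pathlib.PurePath(storage_path)
    name = orig_path.name
    while name in existing_files:
        version += 1
        name = f"{orig_path.stem}-{version}{orig_path.suffix}"
        storage_path = str(orig_path.parent.joinpath(name))
    return storage_path, version

# outputs/report.csv and outputs/report-2.csv already exist
print(pick_unique_name("outputs/report.csv", ["report.csv", "report-2.csv"]))
# ('outputs/report-3.csv', 3)
```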
@@ -925,17 +988,31 @@ class DevModeTranslator:
  partKey=part_key,
  snap=snap)

- data_def = _meta.DataDefinition(parts={})
+ data_def = _meta.DataDefinition()
  data_def.parts[part_key.opaqueKey] = part
+ data_def.schema = schema
+ data_def.storageId = _util.selector_for(storage_id)

- if schema is not None:
- data_def.schema = schema
- else:
- data_def.schema = None
+ return _meta.ObjectDefinition(objectType=_meta.ObjectType.DATA, data=data_def)

- data_def.storageId = _meta.TagSelector(
- _meta.ObjectType.STORAGE, storage_id.objectId,
- objectVersion=storage_id.objectVersion, latestTag=True)
+ @classmethod
+ def _generate_file_definition(
+ cls, file_name: str, file_type: _meta.FileType, file_size: int,
+ storage_id: _meta.TagHeader, data_item: str) \
+ -> _meta.ObjectDefinition:
+
+ file_def = _meta.FileDefinition(
+ name=file_name, extension=file_type.extension, mimeType=file_type.mimeType,
+ storageId=_util.selector_for(storage_id), dataItem=data_item, size=file_size)
+
+ return _meta.ObjectDefinition(objectType=_meta.ObjectType.FILE, file=file_def)
+
+ @classmethod
+ def _generate_storage_definition(
+ cls, storage_id: _meta.TagHeader,
+ storage_key: str, storage_path: str, storage_format: str,
+ data_item: str, incarnation_index: int) \
+ -> _meta.ObjectDefinition:

  storage_copy = _meta.StorageCopy(
  storageKey=storage_key,
@@ -952,16 +1029,14 @@ class DevModeTranslator:
  storage_item = _meta.StorageItem(
  incarnations=[storage_incarnation])

- storage_def = _meta.StorageDefinition(dataItems={})
- storage_def.dataItems[delta.dataItem] = storage_item
+ storage_def = _meta.StorageDefinition()
+ storage_def.dataItems[data_item] = storage_item

  if storage_format.lower() == "csv":
  storage_def.storageOptions["lenient_csv_parser"] = _types.MetadataCodec.encode_value(True)

- data_obj = _meta.ObjectDefinition(objectType=_meta.ObjectType.DATA, data=data_def)
- storage_obj = _meta.ObjectDefinition(objectType=_meta.ObjectType.STORAGE, storage=storage_def)
+ return _meta.ObjectDefinition(objectType=_meta.ObjectType.STORAGE, storage=storage_def)

- return data_obj, storage_obj


  DevModeTranslator._log = _util.logger_for_class(DevModeTranslator)
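Note: the data item keys generated by the translator (and, per the comment in the diff, also by functions.DynamicDataSpecFunc) follow two fixed formats: a snap/delta path for table data and a version path for files. A small illustration with made-up object IDs:

```python
import uuid

# Hypothetical object IDs, only to show the key formats used above
data_object_id = uuid.uuid4()
file_object_id = uuid.uuid4()

part_key, snap_version, delta_index = "part-root", 1, 1
file_version = 1

data_item = f"data/table/{data_object_id}/{part_key}/snap-{snap_version}/delta-{delta_index}"
file_item = f"file/{file_object_id}/version-{file_version}"

print(data_item)  # data/table/<uuid>/part-root/snap-1/delta-1
print(file_item)  # file/<uuid>/version-1
```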
tracdap/rt/_exec/engine.py
@@ -170,7 +170,7 @@ class TracEngine(_actors.Actor):

  self._log.info(f"Job submitted: [{job_key}]")

- job_processor = JobProcessor(self._models, self._storage, job_key, job_config, result_spec, graph_spec=None)
+ job_processor = JobProcessor(self._sys_config, self._models, self._storage, job_key, job_config, result_spec, graph_spec=None)
  job_actor_id = self.actors().spawn(job_processor)

  job_monitor_success = lambda ctx, key, result: self._notify_callback(key, result, None)
@@ -190,7 +190,7 @@ class TracEngine(_actors.Actor):

  child_key = _util.object_key(child_id)

- child_processor = JobProcessor(self._models, self._storage, child_key, None, None, graph_spec=child_graph) # noqa
+ child_processor = JobProcessor(self._sys_config, self._models, self._storage, child_key, None, None, graph_spec=child_graph) # noqa
  child_actor_id = self.actors().spawn(child_processor)

  child_state = _JobState(child_id)
@@ -336,7 +336,8 @@ class JobProcessor(_actors.Actor):
  """

  def __init__(
- self, models: _models.ModelLoader, storage: _storage.StorageManager,
+ self, sys_config: _cfg.RuntimeConfig,
+ models: _models.ModelLoader, storage: _storage.StorageManager,
  job_key: str, job_config: _cfg.JobConfig, result_spec: _graph.JobResultSpec,
  graph_spec: tp.Optional[_graph.Graph]):

@@ -345,6 +346,7 @@ class JobProcessor(_actors.Actor):
  self.job_config = job_config
  self.result_spec = result_spec
  self.graph_spec = graph_spec
+ self._sys_config = sys_config
  self._models = models
  self._storage = storage
  self._resolver = _func.FunctionResolver(models, storage)
@@ -358,7 +360,7 @@ class JobProcessor(_actors.Actor):
  if self.graph_spec is not None:
  self.actors().send(self.actors().id, "build_graph_succeeded", self.graph_spec)
  else:
- self.actors().spawn(GraphBuilder(self.job_config, self.result_spec))
+ self.actors().spawn(GraphBuilder(self._sys_config, self.job_config, self.result_spec))

  def on_stop(self):

@@ -426,8 +428,9 @@ class GraphBuilder(_actors.Actor):
  GraphBuilder is a worker (actor) to wrap the GraphBuilder logic from graph_builder.py
  """

- def __init__(self, job_config: _cfg.JobConfig, result_spec: _graph.JobResultSpec):
+ def __init__(self, sys_config: _cfg.RuntimeConfig, job_config: _cfg.JobConfig, result_spec: _graph.JobResultSpec):
  super().__init__()
+ self.sys_config = sys_config
  self.job_config = job_config
  self.result_spec = result_spec
  self._log = _util.logger_for_object(self)
@@ -440,8 +443,7 @@ class GraphBuilder(_actors.Actor):

  self._log.info("Building execution graph")

- # TODO: Get sys config, or find a way to pass storage settings
- graph_builder = _graph.GraphBuilder(job_config, self.result_spec)
+ graph_builder = _graph.GraphBuilder(self.sys_config, job_config, self.result_spec)
  graph_spec = graph_builder.build_job(job_config.job)

  self.actors().reply("build_graph_succeeded", graph_spec)
tracdap/rt/_exec/functions.py
@@ -15,6 +15,7 @@

  from __future__ import annotations

+ import copy
  import datetime
  import abc
  import random
@@ -296,8 +297,13 @@ class DataViewFunc(NodeFunction[_data.DataView]):

  # Map empty item -> emtpy view (for optional inputs not supplied)
  if root_item.is_empty():
- return _data.DataView.create_empty()
+ return _data.DataView.create_empty(root_item.object_type)

+ # Handle file data views
+ if root_item.object_type == meta.ObjectType.FILE:
+ return _data.DataView.for_file_item(root_item)
+
+ # Everything else is a regular data view
  if self.node.schema is not None and len(self.node.schema.table.fields) > 0:
  trac_schema = self.node.schema
  else:
@@ -322,7 +328,11 @@ class DataItemFunc(NodeFunction[_data.DataItem]):

  # Map empty view -> emtpy item (for optional outputs not supplied)
  if data_view.is_empty():
- return _data.DataItem.create_empty()
+ return _data.DataItem.create_empty(data_view.object_type)
+
+ # Handle file data views
+ if data_view.object_type == meta.ObjectType.FILE:
+ return data_view.file_item

  # TODO: Support selecting data item described by self.node

@@ -342,25 +352,24 @@ class DataResultFunc(NodeFunction[ObjectBundle]):

  def _execute(self, ctx: NodeContext) -> ObjectBundle:

- data_item = _ctx_lookup(self.node.data_item_id, ctx)
+ data_spec = _ctx_lookup(self.node.data_save_id, ctx)

- # Do not record output metadata for optional outputs that are empty
- if data_item.is_empty():
- return {}
+ result_bundle = dict()

- data_spec = _ctx_lookup(self.node.data_spec_id, ctx)
+ # Do not record output metadata for optional outputs that are empty
+ if data_spec.is_empty():
+ return result_bundle

- # TODO: Check result of save operation
- # save_result = _ctx_lookup(self.node.data_save_id, ctx)
+ if self.node.data_key is not None:
+ result_bundle[self.node.data_key] = meta.ObjectDefinition(objectType=meta.ObjectType.DATA, data=data_spec.data_def)

- data_result = meta.ObjectDefinition(objectType=meta.ObjectType.DATA, data=data_spec.data_def)
- storage_result = meta.ObjectDefinition(objectType=meta.ObjectType.STORAGE, storage=data_spec.storage_def)
+ if self.node.file_key is not None:
+ result_bundle[self.node.file_key] = meta.ObjectDefinition(objectType=meta.ObjectType.FILE, file=data_spec.file_def)

- bundle = {
- self.node.data_key: data_result,
- self.node.storage_key: storage_result}
+ if self.node.storage_key is not None:
+ result_bundle[self.node.storage_key] = meta.ObjectDefinition(objectType=meta.ObjectType.STORAGE, storage=data_spec.storage_def)

- return bundle
+ return result_bundle


  class DynamicDataSpecFunc(NodeFunction[_data.DataSpec]):
@@ -443,11 +452,7 @@ class DynamicDataSpecFunc(NodeFunction[_data.DataSpec]):

  # Dynamic data def will always use an embedded schema (this is no ID for an external schema)

- return _data.DataSpec(
- data_item,
- data_def,
- storage_def,
- schema_def=None)
+ return _data.DataSpec.create_data_spec(data_item, data_def, storage_def, schema_def=None)


  class _LoadSaveDataFunc(abc.ABC):
@@ -455,6 +460,16 @@
  def __init__(self, storage: _storage.StorageManager):
  self.storage = storage

+ @classmethod
+ def _choose_data_spec(cls, spec_id, spec, ctx: NodeContext):
+
+ if spec_id is not None:
+ return _ctx_lookup(spec_id, ctx)
+ elif spec is not None:
+ return spec
+ else:
+ raise _ex.EUnexpected()
+
  def _choose_copy(self, data_item: str, storage_def: meta.StorageDefinition) -> meta.StorageCopy:

  # Metadata should be checked for consistency before a job is accepted
@@ -491,9 +506,19 @@ class LoadDataFunc( _LoadSaveDataFunc, NodeFunction[_data.DataItem],):

  def _execute(self, ctx: NodeContext) -> _data.DataItem:

- data_spec = _ctx_lookup(self.node.spec_id, ctx)
+ data_spec = self._choose_data_spec(self.node.spec_id, self.node.spec, ctx)
  data_copy = self._choose_copy(data_spec.data_item, data_spec.storage_def)
- data_storage = self.storage.get_data_storage(data_copy.storageKey)
+
+ if data_spec.object_type == _api.ObjectType.DATA:
+ return self._load_data(data_spec, data_copy)
+
+ elif data_spec.object_type == _api.ObjectType.FILE:
+ return self._load_file(data_copy)
+
+ else:
+ raise _ex.EUnexpected()
+
+ def _load_data(self, data_spec, data_copy):

  trac_schema = data_spec.schema_def if data_spec.schema_def else data_spec.data_def.schema
  arrow_schema = _data.DataMapping.trac_to_arrow_schema(trac_schema) if trac_schema else None
@@ -503,36 +528,52 @@ class LoadDataFunc( _LoadSaveDataFunc, NodeFunction[_data.DataItem],):
  for opt_key, opt_value in data_spec.storage_def.storageOptions.items():
  options[opt_key] = _types.MetadataCodec.decode_value(opt_value)

- table = data_storage.read_table(
+ storage = self.storage.get_data_storage(data_copy.storageKey)
+ table = storage.read_table(
  data_copy.storagePath,
  data_copy.storageFormat,
  arrow_schema,
  storage_options=options)

- return _data.DataItem(table.schema, table)
+ return _data.DataItem(_api.ObjectType.DATA, table.schema, table)
+
+ def _load_file(self, data_copy):
+
+ storage = self.storage.get_file_storage(data_copy.storageKey)
+ raw_bytes = storage.read_bytes(data_copy.storagePath)

+ return _data.DataItem(_api.ObjectType.FILE, raw_bytes=raw_bytes)

- class SaveDataFunc(_LoadSaveDataFunc, NodeFunction[None]):
+
+ class SaveDataFunc(_LoadSaveDataFunc, NodeFunction[_data.DataSpec]):

  def __init__(self, node: SaveDataNode, storage: _storage.StorageManager):
  super().__init__(storage)
  self.node = node

- def _execute(self, ctx: NodeContext):
+ def _execute(self, ctx: NodeContext) -> _data.DataSpec:

  # Item to be saved should exist in the current context
  data_item = _ctx_lookup(self.node.data_item_id, ctx)

+ # Metadata already exists as data_spec but may not contain schema, row count, file size etc.
+ data_spec = self._choose_data_spec(self.node.spec_id, self.node.spec, ctx)
+ data_copy = self._choose_copy(data_spec.data_item, data_spec.storage_def)
+
  # Do not save empty outputs (optional outputs that were not produced)
  if data_item.is_empty():
- return
+ return _data.DataSpec.create_empty_spec(data_item.object_type)

- # This function assumes that metadata has already been generated as the data_spec
- # i.e. it is already known which incarnation / copy of the data will be created
+ if data_item.object_type == _api.ObjectType.DATA:
+ return self._save_data(data_item, data_spec, data_copy)

- data_spec = _ctx_lookup(self.node.spec_id, ctx)
- data_copy = self._choose_copy(data_spec.data_item, data_spec.storage_def)
- data_storage = self.storage.get_data_storage(data_copy.storageKey)
+ elif data_item.object_type == _api.ObjectType.FILE:
+ return self._save_file(data_item, data_spec, data_copy)
+
+ else:
+ raise _ex.EUnexpected()
+
+ def _save_data(self, data_item, data_spec, data_copy):

  # Current implementation will always put an Arrow table in the data item
  # Empty tables are allowed, so explicitly check if table is None
@@ -546,11 +587,32 @@ class SaveDataFunc(_LoadSaveDataFunc, NodeFunction[None]):
  for opt_key, opt_value in data_spec.storage_def.storageOptions.items():
  options[opt_key] = _types.MetadataCodec.decode_value(opt_value)

- data_storage.write_table(
+ storage = self.storage.get_data_storage(data_copy.storageKey)
+ storage.write_table(
  data_copy.storagePath, data_copy.storageFormat,
  data_item.table,
  storage_options=options, overwrite=False)

+ data_spec = copy.deepcopy(data_spec)
+ # TODO: Save row count in metadata
+
+ if data_spec.data_def.schema is None and data_spec.data_def.schemaId is None:
+ data_spec.data_def.schema = _data.DataMapping.arrow_to_trac_schema(data_item.table.schema)
+
+ return data_spec
+
+ def _save_file(self, data_item, data_spec, data_copy):
+
+ if data_item.raw_bytes is None:
+ raise _ex.EUnexpected()
+
+ storage = self.storage.get_file_storage(data_copy.storageKey)
+ storage.write_bytes(data_copy.storagePath, data_item.raw_bytes)
+
+ data_spec = copy.deepcopy(data_spec)
+ data_spec.file_def.size = len(data_item.raw_bytes)
+
+ return data_spec

  def _model_def_for_import(import_details: meta.ImportModelJob):
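Note: taken together, SaveDataFunc now returns a DataSpec instead of None. After the physical write it deep-copies the incoming spec and fills in details that are only known post-save (an inferred schema for data outputs, the byte size for file outputs), and DataResultFunc builds the output metadata bundle from that returned spec. A rough standalone sketch of the copy-and-enrich pattern, using placeholder classes rather than the real DataSpec types:

```python
import copy
from dataclasses import dataclass

@dataclass
class FakeFileDef:      # stand-in for the real FileDefinition
    name: str
    size: int = 0

@dataclass
class FakeDataSpec:     # stand-in for the real DataSpec
    file_def: FakeFileDef

def save_file(spec, raw_bytes):
    # Mirror the pattern in _save_file: never mutate the incoming spec,
    # copy it and record details only known after the save has happened
    updated = copy.deepcopy(spec)
    updated.file_def.size = len(raw_bytes)
    return updated

spec = FakeDataSpec(file_def=FakeFileDef(name="report.pdf"))
new_spec = save_file(spec, b"%PDF-1.7 example bytes")
print(spec.file_def.size, new_spec.file_def.size)  # 0 22
```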