deriva-ml 1.8.2__tar.gz → 1.8.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {deriva_ml-1.8.2/src/deriva_ml.egg-info → deriva_ml-1.8.5}/PKG-INFO +6 -3
  2. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/pyproject.toml +4 -2
  3. deriva_ml-1.8.5/src/deriva_ml/VERSION.py +1 -0
  4. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/__init__.py +2 -1
  5. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/dataset.py +149 -93
  6. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/deriva_definitions.py +0 -1
  7. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/deriva_ml_base.py +199 -33
  8. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/execution.py +45 -4
  9. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/execution_configuration.py +5 -5
  10. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/upload.py +4 -1
  11. {deriva_ml-1.8.2 → deriva_ml-1.8.5/src/deriva_ml.egg-info}/PKG-INFO +6 -3
  12. deriva_ml-1.8.5/src/deriva_ml.egg-info/requires.txt +7 -0
  13. deriva_ml-1.8.2/src/deriva_ml/VERSION.py +0 -1
  14. deriva_ml-1.8.2/src/deriva_ml.egg-info/requires.txt +0 -5
  15. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/LICENSE +0 -0
  16. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/README.md +0 -0
  17. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/setup.cfg +0 -0
  18. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/build/lib/schema_setup/__init__.py +0 -0
  19. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -0
  20. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -0
  21. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/build/lib/schema_setup/create_schema.py +0 -0
  22. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/build/lib/schema_setup/table_comments_utils.py +0 -0
  23. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/database_model.py +0 -0
  24. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/dataset_aux_classes.py +0 -0
  25. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/dataset_bag.py +0 -0
  26. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/demo_catalog.py +0 -0
  27. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/deriva_model.py +0 -0
  28. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/execution_environment.py +0 -0
  29. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/feature.py +0 -0
  30. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/history.py +0 -0
  31. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/schema_setup/__init__.py +0 -0
  32. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/schema_setup/alter_annotation.py +0 -0
  33. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/schema_setup/annotations.py +0 -0
  34. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/schema_setup/create_schema.py +0 -0
  35. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/schema_setup/policy.json +0 -0
  36. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/schema_setup/table_comments_utils.py +0 -0
  37. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/test_functions.py +0 -0
  38. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml.egg-info/SOURCES.txt +0 -0
  39. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  40. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml.egg-info/entry_points.txt +0 -0
  41. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml.egg-info/top_level.txt +0 -0
  42. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/tests/test_basic_tables.py +0 -0
  43. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/tests/test_dataset.py +0 -0
  44. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/tests/test_download.py +0 -0
  45. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/tests/test_execution.py +0 -0
  46. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/tests/test_features.py +0 -0
  47. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/tests/test_upload.py +0 -0
@@ -1,16 +1,19 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: deriva-ml
- Version: 1.8.2
+ Version: 1.8.5
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
  Author-email: ISRD <isrd-dev@isi.edu>
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: deriva~=1.7.6
+ Requires-Dist: deriva~=1.7.7
  Requires-Dist: pandas
  Requires-Dist: regex~=2024.7.24
  Requires-Dist: pydantic>=2.10.6
  Requires-Dist: semver>3.0.0
+ Requires-Dist: setuptools-git-versioning<3,>=2.0
+ Requires-Dist: nbstripout
+ Dynamic: license-file

  Deriva-ML is a python libary to simplify the process of creating and executing reproducible machine learning workflows
  using a deriva catalog.
@@ -13,11 +13,13 @@ description = "Utilities to simplify use of Dervia and Pandas to create reproduc
  readme = "README.md"
  requires-python = ">=3.10"
  dependencies = [
- "deriva~=1.7.6",
+ "deriva~=1.7.7",
  "pandas",
  "regex~=2024.7.24",
  "pydantic>=2.10.6",
- "semver>3.0.0"
+ "semver>3.0.0",
+ "setuptools-git-versioning>=2.0,<3",
+ "nbstripout",
  ]

  [tool.setuptools.package-data]
@@ -0,0 +1 @@
+ __version__ = "1.8.5"
@@ -4,6 +4,7 @@ __all__ = [
  "FileUploadState",
  "FileSpec",
  "ExecutionConfiguration",
+ "Execution",
  "Workflow",
  "DatasetBag",
  "DatasetVersion",
@@ -39,4 +40,4 @@ from .execution_configuration import (
  ExecutionConfiguration,
  Workflow,
  )
-
+ from .execution import Execution
@@ -6,6 +6,7 @@ accessible via a DerivaML class instance.

  """

+ from __future__ import annotations
  from bdbag.fetch.fetcher import fetch_single_file
  from bdbag import bdbag_api as bdb
  from collections import defaultdict
@@ -37,7 +38,7 @@ from pydantic import (
  import requests

  from tempfile import TemporaryDirectory, NamedTemporaryFile
- from typing import Any, Callable, Optional, Iterable, Iterator
+ from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING

  from deriva_ml import DatasetBag
  from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
@@ -52,6 +53,9 @@ from .dataset_aux_classes import (
  DatasetSpec,
  )

+ if TYPE_CHECKING:
+ from .deriva_ml_base import DerivaML
+

  class Dataset:
  """
@@ -83,29 +87,32 @@ class Dataset:
  else:
  return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]

- def _insert_dataset_version(
+ def _insert_dataset_versions(
  self,
- dataset_rid: RID,
- dataset_version: DatasetVersion,
+ dataset_list: list[DatasetSpec],
  description: Optional[str] = "",
  execution_rid: Optional[RID] = None,
  ) -> RID:
  schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
- version_path = schema_path.tables["Dataset_Version"]
- version_rid = version_path.insert(
- [
- {
- "Dataset": dataset_rid,
- "Version": str(dataset_version),
- "Description": description,
- "Execution": execution_rid,
- }
- ]
- )[0]["RID"]
- schema_path.tables["Dataset"].update(
- [{"RID": dataset_rid, "Version": version_rid}]
- )
- return version_rid
+
+ # Construct version records for insert
+ version_records = [
+ {
+ "Dataset": dataset.rid,
+ "Version": str(dataset.version),
+ "Description": description,
+ "Execution": execution_rid,
+ }
+ for dataset in dataset_list
+ ]
+
+ # Insert version records and construct entities for updating the dataset version column.
+ version_rids = [
+ {"Version": v["RID"], "RID": v["Dataset"]}
+ for v in schema_path.tables["Dataset_Version"].insert(version_records)
+ ]
+ schema_path.tables["Dataset"].update(version_rids)
+ return version_rids

  def _bootstrap_versions(self):
  datasets = [ds["RID"] for ds in self.find_datasets()]
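
The refactor above replaces the per-dataset _insert_dataset_version with a batched _insert_dataset_versions: all Dataset_Version rows go in with a single insert and the Dataset table is then pointed at the new version rows with a single update. A minimal sketch of the same datapath pattern, assuming `ml` is an already connected DerivaML instance; the RIDs and version strings below are hypothetical:

    # Batched version bump: one insert, one update (RIDs and versions are hypothetical).
    schema_path = ml.pathBuilder.schemas["deriva-ml"]
    version_records = [
        {"Dataset": "1-A1B2", "Version": "1.2.0", "Description": "bump", "Execution": None},
        {"Dataset": "1-C3D4", "Version": "2.0.1", "Description": "bump", "Execution": None},
    ]
    # Each inserted row comes back with its new RID.
    inserted = schema_path.tables["Dataset_Version"].insert(version_records)
    # Point each Dataset at its new version row in a single update call.
    schema_path.tables["Dataset"].update(
        [{"RID": row["Dataset"], "Version": row["RID"]} for row in inserted]
    )
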
@@ -237,16 +244,20 @@
  Raises:
  DerivaMLException: if provided RID is not to a dataset_table.
  """
- for dataset in self._build_dataset_graph(dataset_rid=dataset_rid):
- version = self.dataset_version(dataset)
- new_version = version.increment_version(component)
- self._insert_dataset_version(
- dataset,
- new_version,
- description=description,
- execution_rid=execution_rid,
+
+ # Find all of the datasets that are reachable from this dataset and determine their new version numbers.
+ related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
+ version_update_list = [
+ DatasetSpec(
+ rid=ds_rid,
+ version=self.dataset_version(ds_rid).increment_version(component),
  )
- return self.dataset_version(dataset_rid)
+ for ds_rid in related_datasets
+ ]
+ updated_versions = self._insert_dataset_versions(
+ version_update_list, description=description, execution_rid=execution_rid
+ )
+ return [d.version for d in version_update_list if d.rid == dataset_rid][0]

  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def create_dataset(
@@ -323,9 +334,8 @@
  pb.schemas[self._ml_schema].Dataset_Execution.insert(
  [{"Dataset": dataset_rid, "Execution": execution_rid}]
  )
- self._insert_dataset_version(
- dataset_rid,
- dataset_version=version,
+ self._insert_dataset_versions(
+ [DatasetSpec(rid=dataset_rid, version=version)],
  execution_rid=execution_rid,
  description="Initial dataset creation.",
  )
@@ -455,7 +465,7 @@
  dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
  dataset_rid: RID:
  recurse: (Default value = False)
- limit: If provided, the maxiumum number of members to return for each element type.
+ limit: If provided, the maximum number of members to return for each element type.

  Returns:
  Dictionary of entities associated with a specific dataset_table. Key is the table from which the elements
@@ -697,11 +707,25 @@
  list of RIDs of nested datasets.

  """
- children = [d["RID"] for d in self.list_dataset_members(dataset_rid)["Dataset"]]
- if recurse:
- for child in children.copy():
- children.extend(self.list_dataset_children(child, recurse=recurse))
- return children
+ dataset_dataset_path = (
+ self._model.catalog.getPathBuilder()
+ .schemas[self._ml_schema]
+ .tables["Dataset_Dataset"]
+ )
+ nested_datasets = list(dataset_dataset_path.entities().fetch())
+
+ def find_children(rid: RID):
+ children = [
+ child["Nested_Dataset"]
+ for child in nested_datasets
+ if child["Dataset"] == rid
+ ]
+ if recurse:
+ for child in children.copy():
+ children.extend(find_children(child))
+ return children
+
+ return find_children(dataset_rid)

  def _vocabulary_specification(
  self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
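
The rewritten list_dataset_children above fetches the Dataset_Dataset association table once and then walks the nesting entirely in memory, instead of issuing a catalog query per nesting level. A self-contained sketch of that traversal, using hypothetical rows shaped like the association table ({"Dataset": parent RID, "Nested_Dataset": child RID}):

    # Hypothetical Dataset_Dataset rows: parent dataset -> nested dataset.
    nested_datasets = [
        {"Dataset": "1-P0", "Nested_Dataset": "1-C1"},
        {"Dataset": "1-C1", "Nested_Dataset": "1-C2"},
    ]

    def find_children(rid, recurse=True):
        # One pass over the pre-fetched rows per level; no further catalog queries.
        children = [row["Nested_Dataset"] for row in nested_datasets if row["Dataset"] == rid]
        if recurse:
            for child in list(children):
                children.extend(find_children(child))
        return children

    print(find_children("1-P0"))  # ['1-C1', '1-C2']
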
@@ -727,20 +751,19 @@
  ]

  def _table_paths(
- self, dataset: DatasetSpec = None
- ) -> Iterator[tuple[list[str], list[str], list[Table]]]:
+ self, dataset: DatasetSpec = None, snapshot_catalog: Optional[DerivaML] = None
+ ) -> Iterator[tuple[str, str, Table]]:

- dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
-
- paths = self._collect_paths(dataset and dataset.rid)
+ paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)

  def source_path(path: tuple[Table, ...]):
+ """Convert a tuple representing a path into a source path component with FK linkage"""
  path = list(path)
  p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
  for table in path[1:]:
- if table == dataset_dataset:
+ if table.name == "Dataset_Dataset":
  p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
- elif table == self.dataset_table:
+ elif table.name == "Dataset":
  p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
  elif table.name == "Dataset_Version":
  p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
@@ -751,63 +774,76 @@
  src_paths = ["/".join(source_path(p)) for p in paths]
  dest_paths = ["/".join([t.name for t in p]) for p in paths]
  target_tables = [p[-1] for p in paths]
-
  return zip(src_paths, dest_paths, target_tables)

  def _collect_paths(
  self,
  dataset_rid: Optional[RID] = None,
+ snapshot_catalog: Optional[DerivaML] = None,
  dataset_nesting_depth: Optional[int] = None,
  ) -> set[tuple[Table, ...]]:

- dataset_nesting_depth = (
- self._dataset_nesting_depth()
- if dataset_nesting_depth is None
- else dataset_nesting_depth
- )
- dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
-
- # Figure out which paths we don't need to query for this dataset. If no dataset is provided, use them all.
- dataset_elements = (
- [
- self._model.name_to_table(e)
- for e, m in self.list_dataset_members(
+ snapshot_catalog = snapshot_catalog or self
+ dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
+ "Dataset"
+ ]
+ dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
+ "Dataset_Dataset"
+ ]
+ dataset_associations = [
+ a
+ for a in self.dataset_table.find_associations()
+ if a.table.schema.name != self._ml_schema
+ or a.table.name == "Dataset_Dataset"
+ ]
+ if dataset_rid:
+ # Get a list of the members of the dataset so we can figure out which tables to query.
+ dataset_elements = [
+ snapshot_catalog._model.name_to_table(e)
+ for e, m in snapshot_catalog.list_dataset_members(
  dataset_rid=dataset_rid, limit=1
  ).items()
  if m
  ]
- if dataset_rid
- else self.list_dataset_element_types()
- )
-
- dataset_associations = [a.table for a in self.dataset_table.find_associations()]
- included_associations = [
- a.table
- for a in self.dataset_table.find_associations()
- if a.other_fkeys.pop().pk_table in dataset_elements
- ]
+ included_associations = [
+ a.table
+ for a in dataset_table.find_associations()
+ if a.other_fkeys.pop().pk_table in dataset_elements
+ ]
+ else:
+ included_associations = dataset_associations
  # Get the paths through the schema and filter out all of dataset paths not used by this dataset.
  paths = {
  tuple(p)
- for p in self._model._schema_to_paths()
+ for p in snapshot_catalog._model._schema_to_paths()
  if (len(p) == 1)
- or (p[1] not in dataset_associations)
- or (p[1] in included_associations)
+ or (p[1] not in dataset_associations) # Tables in the domain schema
+ or (
+ p[1] in included_associations
+ ) # Tables that include members of the dataset
  }
  # Now get paths for nested datasets
  nested_paths = set()
  if dataset_rid:
- for c in self.list_dataset_children(dataset_rid=dataset_rid):
- nested_paths |= self._collect_paths(c)
+ for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
+ nested_paths |= self._collect_paths(
+ c, snapshot_catalog=snapshot_catalog
+ )
  else:
+ # Initialize nesting depth if not already provided.
+ dataset_nesting_depth = (
+ self._dataset_nesting_depth()
+ if dataset_nesting_depth is None
+ else dataset_nesting_depth
+ )
  if dataset_nesting_depth:
  nested_paths = self._collect_paths(
  dataset_nesting_depth=dataset_nesting_depth - 1
  )
  if nested_paths:
  paths |= {
- tuple([self.dataset_table]),
- (self.dataset_table, dataset_dataset),
+ tuple([dataset_table]),
+ (dataset_table, dataset_dataset),
  }
  paths |= {(self.dataset_table, dataset_dataset) + p for p in nested_paths}
  return paths
@@ -863,6 +899,7 @@
  self,
  writer: Callable[[str, str, Table], list[dict[str, Any]]],
  dataset: DatasetSpec,
+ snapshot_catalog: Optional[DerivaML] = None,
  ) -> list[dict[str, Any]]:
  """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
  The top level data directory of the resulting BDBag will have one subdirectory for element type. the subdirectory
@@ -902,21 +939,24 @@
  A dataset_table specification.
  """
  element_spec = []
- for path in self._table_paths(dataset=dataset):
+ for path in self._table_paths(
+ dataset=dataset, snapshot_catalog=snapshot_catalog
+ ):
  element_spec.extend(writer(*path))
  return self._vocabulary_specification(writer) + element_spec

- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
- def download_dataset_bag(
+ def _download_dataset_bag(
  self,
  dataset: DatasetSpec,
  execution_rid: Optional[RID] = None,
+ snapshot_catalog: Optional[DerivaML] = None,
  ) -> DatasetBag:
  """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.

  Args:
  dataset: Specification of the dataset to be downloaded.
  execution_rid: Execution RID for the dataset.
+ snapshot_catalog: Snapshot catalog for the dataset version if specified.

  Returns:
  Tuple consisting of the path to the dataset, the RID of the dataset that was downloaded and the MINID
@@ -927,16 +967,17 @@
  and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
  ):
  raise DerivaMLException(f"RID {execution_rid} is not an execution")
- minid = self.get_dataset_minid(dataset)
+ minid = self._get_dataset_minid(dataset, snapshot_catalog=snapshot_catalog)

  bag_path = (
  self._materialize_dataset_bag(minid, execution_rid=execution_rid)
  if dataset.materialize
- else self._download_dataset_bag(minid)
+ else self._download_dataset_minid(minid)
  )
  return DatabaseModel(minid, bag_path).get_dataset()

  def _version_snapshot(self, dataset: DatasetSpec) -> str:
+ """Return a catalog with snapshot for the specified dataset version"""
  version_record = [
  h
  for h in self.dataset_history(dataset_rid=dataset.rid)
@@ -944,13 +985,17 @@
  ][0]
  return f"{self._model.catalog.catalog_id}@{iso_to_snap(version_record.timestamp.isoformat())}"

- def _create_dataset_minid(self, dataset: DatasetSpec) -> str:
+ def _create_dataset_minid(
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
+ ) -> str:
  with TemporaryDirectory() as tmp_dir:
  # Generate a download specification file for the current catalog schema. By default, this spec
  # will generate a minid and place the bag into S3 storage.
  spec_file = f"{tmp_dir}/download_spec.json"
  with open(spec_file, "w", encoding="utf-8") as ds:
- json.dump(self._generate_dataset_download_spec(dataset), ds)
+ json.dump(
+ self._generate_dataset_download_spec(dataset, snapshot_catalog), ds
+ )
  try:
  self._logger.info(
  f"Downloading dataset minid for catalog: {dataset.rid}@{str(dataset.version)}"
@@ -987,14 +1032,17 @@
  version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
  return minid_page_url

- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
- def get_dataset_minid(
- self, dataset: DatasetSpec, create: bool = True
+ def _get_dataset_minid(
+ self,
+ dataset: DatasetSpec,
+ snapshot_catalog: Optional[DerivaML] = None,
+ create: bool = True,
  ) -> DatasetMinid:
  """Return a MINID to the specified dataset. If no version is specified, use the latest.

  Args:
  dataset: Specification of the dataset.
+ snapshot_catalog: Snapshot catalog for the dataset version if specified.
  create: Create a new MINID if one doesn't already exist.

  Returns:
@@ -1025,12 +1073,12 @@
  f"Minid for dataset {dataset.rid} doesn't exist"
  )
  self._logger.info("Creating new MINID for dataset %s", dataset.rid)
- minid_url = self._create_dataset_minid(dataset)
+ minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
  # If provided a MINID, use the MINID metadata to get the checksum and download the bag.
  r = requests.get(minid_url, headers={"accept": "application/json"})
  return DatasetMinid(dataset_version=dataset.version, **r.json())

- def _download_dataset_bag(self, minid: DatasetMinid) -> Path:
+ def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
  """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
  that all the metadata is correct

@@ -1097,7 +1145,7 @@
  return True

  # request metadata
- bag_path = self._download_dataset_bag(minid)
+ bag_path = self._download_dataset_minid(minid)
  bag_dir = bag_path.parent
  validated_check = bag_dir / "validated_check.txt"

@@ -1112,7 +1160,9 @@
  return Path(bag_path)

  def _export_outputs(
- self, dataset: Optional[DatasetSpec] = None
+ self,
+ dataset: Optional[DatasetSpec] = None,
+ snapshot_catalog: Optional[DerivaML] = None,
  ) -> list[dict[str, Any]]:
  """Return and output specification for the datasets in the provided model

@@ -1150,9 +1200,13 @@
  "source": {"api": "schema", "skip_root_path": True},
  "destination": {"type": "json", "name": "schema"},
  },
- ] + self._dataset_specification(writer, dataset)
+ ] + self._dataset_specification(
+ writer, dataset, snapshot_catalog=snapshot_catalog
+ )

- def _processor_params(self, dataset: DatasetSpec) -> list[dict[str, Any]]:
+ def _processor_params(
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
+ ) -> list[dict[str, Any]]:
  """
  Returns:
  a download specification for the datasets in the provided model.
@@ -1178,7 +1232,7 @@
  "processor": "json",
  "processor_params": {"query_path": "/schema", "output_path": "schema"},
  }
- ] + self._dataset_specification(writer, dataset)
+ ] + self._dataset_specification(writer, dataset, snapshot_catalog)

  @staticmethod
  def _download_dataset_element(
@@ -1257,7 +1311,9 @@
  )
  return exports

- def _generate_dataset_download_spec(self, dataset: DatasetSpec) -> dict[str, Any]:
+ def _generate_dataset_download_spec(
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML]
+ ) -> dict[str, Any]:
  """

  Returns:
@@ -1315,7 +1371,7 @@
  },
  },
  ]
- + self._processor_params(dataset),
+ + self._processor_params(dataset, snapshot_catalog),
  },
  }

@@ -139,7 +139,6 @@ class FileSpec(BaseModel):
  if url_parts.scheme == "tag":
  return v
  elif not url_parts.scheme:
- print(v)
  return f'tag://{gethostname()},{date.today()}:file://{v}'
  else:
  raise ValidationError("url is not a file URL")
@@ -15,8 +15,12 @@ import logging
  from datetime import datetime
  import hashlib
  from itertools import chain
+ import inspect
  from pathlib import Path
  import requests
+ from setuptools_git_versioning import get_latest_file_commit
+ import subprocess
+ import shutil
  from typing import Optional, Any, Iterable, TYPE_CHECKING
  from deriva.core import (
  ErmrestCatalog,
@@ -27,6 +31,7 @@ from deriva.core import (
  )
  import deriva.core.datapath as datapath
  from deriva.core.datapath import DataPathException
+ from deriva.core.deriva_server import DerivaServer
  from deriva.core.ermrest_catalog import ResolveRidResult
  from deriva.core.ermrest_model import Key, Table
  from deriva.core.hatrac_store import HatracStore
@@ -35,6 +40,8 @@ from pydantic import validate_call, ConfigDict
  from .execution_configuration import ExecutionConfiguration, Workflow
  from .feature import Feature, FeatureRecord
  from .dataset import Dataset
+ from .dataset_aux_classes import DatasetSpec
+ from .dataset_bag import DatasetBag
  from .deriva_model import DerivaModel
  from .upload import (
  table_path,
@@ -56,6 +63,18 @@ from .deriva_definitions import (
  FileSpec,
  )

+ try:
+ from icecream import ic
+ except ImportError: # Graceful fallback if IceCream isn't installed.
+ ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
+
+
+ try:
+ from IPython import get_ipython
+ except ImportError: # Graceful fallback if IPython isn't installed.
+ get_ipython = lambda: None
+
+
  if TYPE_CHECKING:
  from .execution import Execution

@@ -98,13 +117,13 @@ class DerivaML(Dataset):
  model_version: A string that indicates the version model. Typically passed in via
  """
  self.credential = get_credential(hostname)
- self.catalog = ErmrestCatalog(
+ server = DerivaServer(
  "https",
  hostname,
- catalog_id,
- self.credential,
+ credentials=self.credential,
  session_config=self._get_session_config(),
  )
+ self.catalog = server.connect_ermrest(catalog_id)
  self.model = DerivaModel(
  self.catalog.getCatalogModel(), domain_schema=domain_schema
  )
@@ -132,6 +151,29 @@ class DerivaML(Dataset):
  self.version = model_version
  self.configuration = None
  self._execution: Optional[Execution] = None
+ self._notebook = None
+ try:
+ from IPython import get_ipython
+
+ ipython = get_ipython()
+ # Check if running in Jupyter's ZMQ kernel (used by notebooks)
+ if ipython is not None and "IPKernelApp" in ipython.config:
+ self._notebook = Path(ipython.user_ns.get("__session__"))
+ # Check if running in Jupyter's ZMQ kernel (used by notebooks)
+ try:
+ if subprocess.run(
+ [shutil.which("nbstripout"), "--is-installed"],
+ check=False,
+ capture_output=True,
+ ).returncode:
+ self._logger.warn(
+ "nbstripout is not installed in repository. Please run nbstripout --install"
+ )
+ except subprocess.CalledProcessError:
+ self._logger.error("nbstripout is not found.")
+
+ except (ImportError, AttributeError):
+ pass

  self.domain_schema = self.model.domain_schema
  self.project_name = project_name or self.domain_schema
@@ -705,6 +747,28 @@ class DerivaML(Dataset):
  for v in pb.schemas[table.schema.name].tables[table.name].entities().fetch()
  ]

+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+ def download_dataset_bag(
+ self,
+ dataset: DatasetSpec,
+ execution_rid: Optional[RID] = None,
+ ) -> DatasetBag:
+ """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.
+
+ Args:
+ dataset: Specification of the dataset to be downloaded.
+ execution_rid: Execution RID for the dataset.
+
+ Returns:
+ Tuple consisting of the path to the dataset, the RID of the dataset that was downloaded and the MINID
+ for the dataset.
+ """
+ return self._download_dataset_bag(
+ dataset=dataset,
+ execution_rid=execution_rid,
+ snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
+ )
+

  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def download_asset(self, asset_rid: RID, dest_dir: Path) -> Path:
@@ -808,8 +872,10 @@
  Iterable of the RIDs of the files that were added.
  """
  defined_types = self.list_vocabulary_terms(MLVocab.file_type)
- if execution_rid and self.resolve_rid(execution_rid).table.name != 'Execution':
- raise DerivaMLException(f'RID {execution_rid} is not for an execution table.')
+ if execution_rid and self.resolve_rid(execution_rid).table.name != "Execution":
+ raise DerivaMLException(
+ f"RID {execution_rid} is not for an execution table."
+ )

  def check_file_type(dtype: str) -> bool:
  for term in defined_types:
@@ -862,18 +928,11 @@
  self, file_types: Optional[list[str]] = None
  ) -> list[dict[str, Any]]:
  """Return the contents of the file table. Denormalized file types into the file record."""
- atable = next(
- self._model.schemas[self._ml_schema]
- .tables[MLVocab.dataset_type]
- .find_associations()
- ).name
  ml_path = self.pathBuilder.schemas[self._ml_schema]
- atable_path = ml_path.tables[atable]
  file_path = ml_path.File
  type_path = ml_path.File_File_Type

  # Get a list of all the dataset_type values associated with this dataset_table.
- files = []
  path = file_path.link(type_path)
  path = path.attributes(
  path.File.RID,
@@ -885,10 +944,12 @@
  )
  file_map = {}
  for f in path.fetch():
- file_map.setdefault(f['RID'], f | {'File_Types': []})['File_Types'].append(f['File_Type'])
+ file_map.setdefault(f["RID"], f | {"File_Types": []})["File_Types"].append(
+ f["File_Type"]
+ )

  # Now get rid of the File_Type key and return the result
- return [ (f, f.pop('File_Type'))[0] for f in file_map.values()]
+ return [(f, f.pop("File_Type"))[0] for f in file_map.values()]

  def list_workflows(self) -> list[Workflow]:
  """Return a list of all the workflows in the catalog."""
@@ -901,6 +962,7 @@
  version=w["Version"],
  description=w["Description"],
  rid=w["RID"],
+ checksum=w["Checksum"],
  )
  for w in workflow_path.entities().fetch()
  ]
@@ -917,33 +979,18 @@
  """

  # Check to make sure that the workflow is not already in the table. If it's not, add it.
- def get_checksum(url) -> str:
- """Get the checksum of a file from a URL."""
- try:
- response = requests.get(url)
- response.raise_for_status()
- except Exception:
- raise DerivaMLException(f"Invalid URL: {url}")
- else:
- sha256_hash = hashlib.sha256()
- sha256_hash.update(response.content)
- checksum = "SHA-256: " + sha256_hash.hexdigest()
- return checksum
+
+ if workflow_rid := self.lookup_workflow(workflow.url):
+ return workflow_rid

  ml_schema_path = self.pathBuilder.schemas[self.ml_schema]
  try:
- url_column = ml_schema_path.Workflow.URL
- workflow_record = list(
- ml_schema_path.Workflow.filter(url_column == workflow.url).entities()
- )[0]
- workflow_rid = workflow_record["RID"]
- except IndexError:
  # Record doesn't exist already
  workflow_record = {
  "URL": workflow.url,
  "Name": workflow.name,
  "Description": workflow.description,
- "Checksum": get_checksum(workflow.url),
+ "Checksum": workflow.checksum,
  "Version": workflow.version,
  MLVocab.workflow_type: self.lookup_term(
  MLVocab.workflow_type, workflow.workflow_type
@@ -955,6 +1002,125 @@
  raise DerivaMLException(f"Failed to insert workflow. Error: {error}")
  return workflow_rid

+ def lookup_workflow(self, url: str) -> Optional[RID]:
+ workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
+ try:
+ url_column = workflow_path.URL
+ return list(workflow_path.filter(url_column == url).entities())[0]["RID"]
+ except IndexError:
+ return None
+
+ def create_workflow(
+ self, name: str, workflow_type: str, description: str = "", create: bool = True
+ ) -> RID:
+ """Identify current executing program and return a workflow RID for it
+
+ Determine the notebook or script that is currently being executed. Assume that this is
+ being executed from a cloned GitHub repository. Determine the remote repository name for
+ this object. Then either retrieve an existing workflow for this executable or create
+ a new one.
+
+ Args:
+ name: The name of the workflow.
+ workflow_type: The type of the workflow.
+ description: The description of the workflow.
+ create: Whether to create a new workflow.
+ """
+ # Make sure type is correct.
+ self.lookup_term(MLVocab.workflow_type, workflow_type)
+ filename, github_url, is_dirty = self._github_url()
+
+ if is_dirty:
+ self._logger.warning(
+ f"File {filename} has been modified since last commit. Consider commiting before executing"
+ )
+
+ sha256_hash = hashlib.sha256()
+ if self._notebook:
+ # If you are in a notebook, strip out the outputs before computing the checksum.
+ result = subprocess.run(
+ ["nbstripout", "-t", filename],
+ capture_output=True,
+ text=False,
+ check=True,
+ )
+ sha256_hash.update(result.stdout)
+ else:
+ with open(filename, "rb") as f:
+ sha256_hash.update(f.read())
+ checksum = "SHA-256:" + sha256_hash.hexdigest()
+
+ workflow = Workflow(
+ name=name,
+ url=github_url,
+ checksum=checksum,
+ description=description,
+ workflow_type=workflow_type,
+ )
+ return self.add_workflow(workflow) if create else None
+
+ def _github_url(self) -> tuple[Path, str, bool]:
+ """Return a GitHUB URL for the latest commit of the script from which this routine is called.
+
+ This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
+ the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
+ file in GitHUB.
+
+ Returns: A tuple with the filename, gethub_url and a boolean to indicated if uncommited changes
+ have been made to the file.
+
+ """
+
+ # Get the name of the script that is calling this function.
+ if self._notebook:
+ # Try to get the __session__ variable from the user namespace.
+ filename = Path("").absolute().parent / self._notebook
+ else:
+ stack = inspect.stack()
+ if len(stack) > 1:
+ filename = Path(
+ stack[2].filename
+ ) # Get the caller's filename, which is two up the stack from here.
+ else:
+ raise DerivaMLException(
+ f"Looking for caller failed"
+ ) # Stack is too shallow
+
+ # Get repo URL from local github repo.
+ try:
+ result = subprocess.run(
+ ["git", "remote", "get-url", "origin"], capture_output=True, text=True
+ )
+ github_url = result.stdout.strip().removesuffix(".git")
+ except subprocess.CalledProcessError:
+ raise DerivaMLException(f"No GIT remote found")
+
+ # Find the root directory for the repository
+ repo_root = filename
+ while repo_root != repo_root.root:
+ if (repo_root / ".git").exists():
+ break
+ else:
+ repo_root = repo_root.parent
+
+ # Now check to see if file has been modified since the last commit.
+ try:
+ result = subprocess.run(
+ ["git", "status", "--porcelain"],
+ capture_output=True,
+ text=True,
+ check=True,
+ )
+ is_dirty = bool(
+ "M " in result.stdout.strip()
+ ) # Returns True if output indicates a modified file
+ except subprocess.CalledProcessError:
+ is_dirty = False # If Git command fails, assume no changes
+
+ sha = get_latest_file_commit(filename)
+ url = f"{github_url}/blob/{sha}/{filename.relative_to(repo_root)}"
+ return filename, url, is_dirty
+
  # @validate_call
  def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
  """Create an execution object
@@ -12,6 +12,7 @@ import os
  import shutil
  from datetime import datetime
  from pathlib import Path
+ import requests
  from tempfile import NamedTemporaryFile
  from typing import Iterable, Any, Optional
  from deriva.core import format_exception
@@ -28,7 +29,6 @@ from .deriva_definitions import (
  )
  from .deriva_ml_base import DerivaML, FeatureRecord
  from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
- from .dataset import Dataset
  from .dataset_bag import DatasetBag
  from .execution_configuration import ExecutionConfiguration
  from .execution_environment import get_execution_environment
@@ -51,6 +51,12 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa


+ try:
+ from jupyter_server.serverapp import list_running_servers
+ except ImportError:
+ list_running_servers = lambda: []
+
+
  class Execution:
  """The Execution class is used to capture the context of an activity within DerivaML. While these are primarily
  computational, manual processes can be represented by an execution as well.
@@ -100,6 +106,7 @@
  self.configuration = configuration
  self._ml_object = ml_object
  self.start_time = None
+ self.stop_time = None
  self.status = Status.created
  self.uploaded_assets: list[Path] = []

@@ -221,8 +228,9 @@
  Returns:
  the location of the unpacked and validated dataset_table bag and the RID of the bag
  """
- ds = Dataset(self._ml_object.model, cache_dir=self._cache_dir)
- return ds.download_dataset_bag(dataset, execution_rid=self.execution_rid)
+ return self._ml_object.download_dataset_bag(
+ dataset, execution_rid=self.execution_rid
+ )

  @validate_call
  def update_status(self, status: Status, msg: str) -> None:
@@ -243,6 +251,35 @@
  ]
  )

+ def _create_notebook_checkpoint(self):
+ """Trigger a checkpoint creation using Jupyter's API."""
+ notebook_name = self._ml_object._notebook
+
+ # Look for the server running this notebook.
+ root = Path("").absolute().parent.as_posix()
+ servers = list(list_running_servers())
+ # Jupyterhub seems to handle root_dir differently then server case.
+ server = (
+ servers
+ if len(servers) == 1
+ else [s for s in servers if s["root_dir"] == root]
+ )[0]
+ notebook_url = f"{server['url']}api/contents/{notebook_name}"
+
+ # Get notebook content
+ response = requests.get(
+ notebook_url, headers={"Authorization": f"Token {server['token']}"}
+ )
+ if response.status_code == 200:
+ notebook_content = response.json()["content"]
+ # Execution metadata cannot be in a directory, so map path into filename.
+ checkpoint_path = (
+ self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
+ / f"{notebook_name.as_posix().replace('/','_')}.checkpoint"
+ )
+ with open(checkpoint_path, "w", encoding="utf-8") as f:
+ json.dump(notebook_content, f)
+
  def execution_start(self) -> None:
  """Start an execution, uploading status to catalog"""

@@ -252,11 +289,15 @@

  def execution_stop(self) -> None:
  """Finish the execution and update the duration and status of execution."""
- duration = datetime.now() - self.start_time
+ self.stop_time = datetime.now()
+ duration = self.stop_time - self.start_time
  hours, remainder = divmod(duration.total_seconds(), 3600)
  minutes, seconds = divmod(remainder, 60)
  duration = f"{round(hours, 0)}H {round(minutes, 0)}min {round(seconds, 4)}sec"

+ if self._ml_object._notebook:
+ self._create_notebook_checkpoint()
+
  self.update_status(Status.completed, "Algorithm execution ended.")
  self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
  [{"RID": self.execution_rid, "Duration": duration}]
@@ -33,18 +33,18 @@ class Workflow(BaseModel):
  version: Optional[str] = None
  description: Optional[str] = ""
  rid: Optional[RID] = None
+ checksum: Optional[str]
+


  class ExecutionConfiguration(BaseModel):
  """Define the parameters that are used to configure a specific execution.

  Attributes:
- datasets: List of dataset_table RIDS, MINIDS for datasets to be downloaded prior to execution. By default,
- all the datasets are materialized. However, if the assets associated with a dataset_table are not
- needed, a dictionary that defines the rid and the materialization parameter for the
- download_dataset_bag method can be specified, e.g. datasets=[{'rid': RID, 'materialize': True}].
+ datasets: List of dataset specifications which specify the dataset RID, version and if the dataset
+ should be materialized.
  assets: List of assets to be downloaded prior to execution. The values must be RIDs in an asset table
- workflow: A workflow instance. Must have a name, URI to the workflow instance, and a type.
+ workflow: A RID for a workflow instance. Must have a name, URI to the workflow instance, and a type.
  description: A description of the execution. Can use Markdown format.
  """
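
The updated docstring above says a configuration now takes a workflow RID and a list of dataset specifications. Combined with the create_workflow and create_execution methods shown earlier, a hedged end-to-end sketch looks like the following; it assumes the code runs from a committed clone of a GitHub repository (as _github_url requires), that a "python-script" term exists in the Workflow_Type vocabulary, and that the dataset RID and version are hypothetical:

    from deriva_ml import ExecutionConfiguration
    from deriva_ml.dataset_aux_classes import DatasetSpec

    # `ml` is a connected DerivaML instance (see the earlier sketch).
    workflow_rid = ml.create_workflow(
        name="train-classifier",        # hypothetical workflow name
        workflow_type="python-script",  # must be an existing Workflow_Type term
        description="Nightly training run",
    )
    config = ExecutionConfiguration(
        workflow=workflow_rid,
        datasets=[DatasetSpec(rid="1-ABCD", version="1.2.0")],  # hypothetical dataset
        description="Training execution",
    )
    execution = ml.create_execution(config)
    execution.execution_start()
    # ... run the model ...
    execution.execution_stop()
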
 
@@ -70,8 +70,11 @@
  exec_metadata_dir_regex = (
  exec_dir_regex + r"/execution-metadata/(?P<execution_metadata_type>[-\w]+)"
  )
+
+ # May have more than one suffix
  exec_metadata_regex = (
- exec_metadata_dir_regex + r"/(?P<filename>[-\w]+)[.](?P<file_ext>[a-z0-9]*)$"
+ exec_metadata_dir_regex
+ + r"/(?P<filename>[-\w]+([.][\w]+)*)[.](?P<file_ext>[a-z0-9]*)$"
  )
  feature_dir_regex = exec_dir_regex + r"/feature"
  feature_table_dir_regex = (
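
The relaxed exec_metadata_regex above lets execution-metadata filenames carry more than one suffix, such as the notebook .checkpoint files written by Execution. A quick standalone check of just the new filename portion of the pattern; the directory-prefix regexes are unchanged and omitted here, and the sample filename is illustrative:

    import re

    # Dotted middle suffixes now land in the "filename" group; only the last one is the extension.
    # The pre-1.8.5 pattern rejected this name because its filename group could not contain dots.
    filename_part = r"(?P<filename>[-\w]+([.][\w]+)*)[.](?P<file_ext>[a-z0-9]*)$"
    m = re.search(filename_part, "my_notebook.ipynb.checkpoint")
    print(m.group("filename"), m.group("file_ext"))  # my_notebook.ipynb checkpoint
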
@@ -1,16 +1,19 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: deriva-ml
- Version: 1.8.2
+ Version: 1.8.5
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
  Author-email: ISRD <isrd-dev@isi.edu>
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: deriva~=1.7.6
+ Requires-Dist: deriva~=1.7.7
  Requires-Dist: pandas
  Requires-Dist: regex~=2024.7.24
  Requires-Dist: pydantic>=2.10.6
  Requires-Dist: semver>3.0.0
+ Requires-Dist: setuptools-git-versioning<3,>=2.0
+ Requires-Dist: nbstripout
+ Dynamic: license-file

  Deriva-ML is a python libary to simplify the process of creating and executing reproducible machine learning workflows
  using a deriva catalog.
@@ -0,0 +1,7 @@
+ deriva~=1.7.7
+ pandas
+ regex~=2024.7.24
+ pydantic>=2.10.6
+ semver>3.0.0
+ setuptools-git-versioning<3,>=2.0
+ nbstripout
@@ -1 +0,0 @@
- __version__ = "1.8.2"
@@ -1,5 +0,0 @@
- deriva~=1.7.6
- pandas
- regex~=2024.7.24
- pydantic>=2.10.6
- semver>3.0.0