deriva-ml 1.8.1__py3-none-any.whl → 1.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/VERSION.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.8.1"
1
+ __version__ = "1.8.4"
deriva_ml/dataset.py CHANGED
@@ -6,6 +6,7 @@ accessible via a DerivaML class instance.
6
6
 
7
7
  """
8
8
 
9
+ from __future__ import annotations
9
10
  from bdbag.fetch.fetcher import fetch_single_file
10
11
  from bdbag import bdbag_api as bdb
11
12
  from collections import defaultdict
@@ -37,7 +38,7 @@ from pydantic import (
37
38
  import requests
38
39
 
39
40
  from tempfile import TemporaryDirectory, NamedTemporaryFile
40
- from typing import Any, Callable, Optional, Iterable, Iterator
41
+ from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
41
42
 
42
43
  from deriva_ml import DatasetBag
43
44
  from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
@@ -52,6 +53,9 @@ from .dataset_aux_classes import (
52
53
  DatasetSpec,
53
54
  )
54
55
 
56
+ if TYPE_CHECKING:
57
+ from .deriva_ml_base import DerivaML
58
+
55
59
 
56
60
  class Dataset:
57
61
  """
@@ -83,29 +87,32 @@ class Dataset:
83
87
  else:
84
88
  return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]
85
89
 
86
- def _insert_dataset_version(
90
+ def _insert_dataset_versions(
87
91
  self,
88
- dataset_rid: RID,
89
- dataset_version: DatasetVersion,
92
+ dataset_list: list[DatasetSpec],
90
93
  description: Optional[str] = "",
91
94
  execution_rid: Optional[RID] = None,
92
95
  ) -> RID:
93
96
  schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
94
- version_path = schema_path.tables["Dataset_Version"]
95
- version_rid = version_path.insert(
96
- [
97
- {
98
- "Dataset": dataset_rid,
99
- "Version": str(dataset_version),
100
- "Description": description,
101
- "Execution": execution_rid,
102
- }
103
- ]
104
- )[0]["RID"]
105
- schema_path.tables["Dataset"].update(
106
- [{"RID": dataset_rid, "Version": version_rid}]
107
- )
108
- return version_rid
97
+
98
+ # Construct version records for insert
99
+ version_records = [
100
+ {
101
+ "Dataset": dataset.rid,
102
+ "Version": str(dataset.version),
103
+ "Description": description,
104
+ "Execution": execution_rid,
105
+ }
106
+ for dataset in dataset_list
107
+ ]
108
+
109
+ # Insert version records and construct entities for updating the dataset version column.
110
+ version_rids = [
111
+ {"Version": v["RID"], "RID": v["Dataset"]}
112
+ for v in schema_path.tables["Dataset_Version"].insert(version_records)
113
+ ]
114
+ schema_path.tables["Dataset"].update(version_rids)
115
+ return version_rids
109
116
 
110
117
  def _bootstrap_versions(self):
111
118
  datasets = [ds["RID"] for ds in self.find_datasets()]
@@ -237,16 +244,20 @@ class Dataset:
237
244
  Raises:
238
245
  DerivaMLException: if provided RID is not to a dataset_table.
239
246
  """
240
- for dataset in self._build_dataset_graph(dataset_rid=dataset_rid):
241
- version = self.dataset_version(dataset)
242
- new_version = version.increment_version(component)
243
- self._insert_dataset_version(
244
- dataset,
245
- new_version,
246
- description=description,
247
- execution_rid=execution_rid,
247
+
248
+ # Find all of the datasets that are reachable from this dataset and determine their new version numbers.
249
+ related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
250
+ version_update_list = [
251
+ DatasetSpec(
252
+ rid=ds_rid,
253
+ version=self.dataset_version(ds_rid).increment_version(component),
248
254
  )
249
- return self.dataset_version(dataset_rid)
255
+ for ds_rid in related_datasets
256
+ ]
257
+ updated_versions = self._insert_dataset_versions(
258
+ version_update_list, description=description, execution_rid=execution_rid
259
+ )
260
+ return [d.version for d in version_update_list if d.rid == dataset_rid][0]
250
261
 
251
262
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
252
263
  def create_dataset(
@@ -323,9 +334,8 @@ class Dataset:
323
334
  pb.schemas[self._ml_schema].Dataset_Execution.insert(
324
335
  [{"Dataset": dataset_rid, "Execution": execution_rid}]
325
336
  )
326
- self._insert_dataset_version(
327
- dataset_rid,
328
- dataset_version=version,
337
+ self._insert_dataset_versions(
338
+ [DatasetSpec(rid=dataset_rid, version=version)],
329
339
  execution_rid=execution_rid,
330
340
  description="Initial dataset creation.",
331
341
  )
@@ -447,7 +457,7 @@ class Dataset:
447
457
 
448
458
  # @validate_call
449
459
  def list_dataset_members(
450
- self, dataset_rid: RID, recurse: bool = False
460
+ self, dataset_rid: RID, recurse: bool = False, limit: Optional[int] = None
451
461
  ) -> dict[str, list[dict[str, Any]]]:
452
462
  """Return a list of entities associated with a specific dataset_table.
453
463
 
@@ -455,6 +465,7 @@ class Dataset:
455
465
  dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
456
466
  dataset_rid: RID:
457
467
  recurse: (Default value = False)
468
+ limit: If provided, the maximum number of members to return for each element type.
458
469
 
459
470
  Returns:
460
471
  Dictionary of entities associated with a specific dataset_table. Key is the table from which the elements
@@ -492,7 +503,9 @@ class Dataset:
492
503
  target_path,
493
504
  on=(member_path.columns[member_column] == target_path.columns["RID"]),
494
505
  )
495
- target_entities = list(path.entities().fetch())
506
+ target_entities = list(
507
+ path.entities().fetch(limit=limit) if limit else path.entities().fetch()
508
+ )
496
509
  members[target_table.name].extend(target_entities)
497
510
  if recurse and target_table == self.dataset_table:
498
511
  # Get the members for all the nested datasets and add to the member list.
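
`list_dataset_members` now takes an optional `limit`, forwarded to `fetch(limit=...)`; the path-pruning code later in this file uses `limit=1` as a cheap probe for which member types a dataset actually has. A hedged usage sketch, where `ml` is an existing `DerivaML` instance and `"1-ABCD"` is a placeholder dataset RID:

```python
# "ml" is assumed to be a connected DerivaML instance; "1-ABCD" is illustrative.
members = ml.list_dataset_members("1-ABCD", limit=5)   # at most 5 rows per member table
for table_name, rows in members.items():
    if rows:                                           # empty lists mean no members of that type
        print(table_name, len(rows))

# limit=1 is enough to test which member types are present at all,
# which is how the dataset-specific path pruning uses it.
present_types = [t for t, rows in ml.list_dataset_members("1-ABCD", limit=1).items() if rows]
```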
@@ -694,11 +707,25 @@ class Dataset:
694
707
  list of RIDs of nested datasets.
695
708
 
696
709
  """
697
- children = [d["RID"] for d in self.list_dataset_members(dataset_rid)["Dataset"]]
698
- if recurse:
699
- for child in children.copy():
700
- children.extend(self.list_dataset_children(child, recurse=recurse))
701
- return children
710
+ dataset_dataset_path = (
711
+ self._model.catalog.getPathBuilder()
712
+ .schemas[self._ml_schema]
713
+ .tables["Dataset_Dataset"]
714
+ )
715
+ nested_datasets = list(dataset_dataset_path.entities().fetch())
716
+
717
+ def find_children(rid: RID):
718
+ children = [
719
+ child["Nested_Dataset"]
720
+ for child in nested_datasets
721
+ if child["Dataset"] == rid
722
+ ]
723
+ if recurse:
724
+ for child in children.copy():
725
+ children.extend(find_children(child))
726
+ return children
727
+
728
+ return find_children(dataset_rid)
702
729
 
703
730
  def _vocabulary_specification(
704
731
  self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
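
The rewritten `list_dataset_children` fetches the `Dataset_Dataset` association table once and then recurses over it in memory, instead of issuing one `list_dataset_members` query per nesting level. A small sketch of that fetch-once, recurse-locally pattern, with hard-coded rows standing in for the catalog fetch:

```python
# Hard-coded rows stand in for a single Dataset_Dataset fetch.
rows = [
    {"Dataset": "root", "Nested_Dataset": "a"},
    {"Dataset": "root", "Nested_Dataset": "b"},
    {"Dataset": "a", "Nested_Dataset": "c"},
]

def find_children(rid: str, recurse: bool = True) -> list[str]:
    children = [r["Nested_Dataset"] for r in rows if r["Dataset"] == rid]
    if recurse:
        for child in list(children):          # copy so extension doesn't re-visit new entries
            children.extend(find_children(child, recurse))
    return children

assert find_children("root") == ["a", "b", "c"]
```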
@@ -723,26 +750,20 @@ class Dataset:
723
750
  for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
724
751
  ]
725
752
 
726
- def _table_paths(self) -> Iterator[tuple[list[str], list[str], list[Table]]]:
753
+ def _table_paths(
754
+ self, dataset: DatasetSpec = None, snapshot_catalog: Optional[DerivaML] = None
755
+ ) -> Iterator[tuple[str, str, Table]]:
727
756
 
728
- dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
729
- paths = self._model._schema_to_paths()
730
- nested_paths = paths
757
+ paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
731
758
 
732
- for i in range(self._dataset_nesting_depth()):
733
- if i == 0:
734
- paths.extend([[self.dataset_table, dataset_dataset]])
735
- nested_paths = [
736
- [self.dataset_table, dataset_dataset] + p for p in nested_paths
737
- ]
738
- paths.extend(nested_paths)
739
-
740
- def source_path(path):
759
+ def source_path(path: tuple[Table, ...]):
760
+ """Convert a tuple representing a path into a source path component with FK linkage"""
761
+ path = list(path)
741
762
  p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
742
763
  for table in path[1:]:
743
- if table == dataset_dataset:
764
+ if table.name == "Dataset_Dataset":
744
765
  p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
745
- elif table == self.dataset_table:
766
+ elif table.name == "Dataset":
746
767
  p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
747
768
  elif table.name == "Dataset_Version":
748
769
  p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
@@ -753,10 +774,81 @@ class Dataset:
753
774
  src_paths = ["/".join(source_path(p)) for p in paths]
754
775
  dest_paths = ["/".join([t.name for t in p]) for p in paths]
755
776
  target_tables = [p[-1] for p in paths]
756
-
757
777
  return zip(src_paths, dest_paths, target_tables)
758
778
 
759
- def _dataset_nesting_depth(self):
779
+ def _collect_paths(
780
+ self,
781
+ dataset_rid: Optional[RID] = None,
782
+ snapshot_catalog: Optional[DerivaML] = None,
783
+ dataset_nesting_depth: Optional[int] = None,
784
+ ) -> set[tuple[Table, ...]]:
785
+
786
+ snapshot_catalog = snapshot_catalog or self
787
+ dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
788
+ "Dataset"
789
+ ]
790
+ dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
791
+ "Dataset_Dataset"
792
+ ]
793
+ dataset_associations = [
794
+ a
795
+ for a in self.dataset_table.find_associations()
796
+ if a.table.schema.name != self._ml_schema
797
+ or a.table.name == "Dataset_Dataset"
798
+ ]
799
+ if dataset_rid:
800
+ # Get a list of the members of the dataset so we can figure out which tables to query.
801
+ dataset_elements = [
802
+ snapshot_catalog._model.name_to_table(e)
803
+ for e, m in snapshot_catalog.list_dataset_members(
804
+ dataset_rid=dataset_rid, limit=1
805
+ ).items()
806
+ if m
807
+ ]
808
+ included_associations = [
809
+ a.table
810
+ for a in dataset_table.find_associations()
811
+ if a.other_fkeys.pop().pk_table in dataset_elements
812
+ ]
813
+ else:
814
+ included_associations = dataset_associations
815
+ # Get the paths through the schema and filter out all of dataset paths not used by this dataset.
816
+ paths = {
817
+ tuple(p)
818
+ for p in snapshot_catalog._model._schema_to_paths()
819
+ if (len(p) == 1)
820
+ or (p[1] not in dataset_associations) # Tables in the domain schema
821
+ or (
822
+ p[1] in included_associations
823
+ ) # Tables that include members of the dataset
824
+ }
825
+ # Now get paths for nested datasets
826
+ nested_paths = set()
827
+ if dataset_rid:
828
+ for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
829
+ nested_paths |= self._collect_paths(
830
+ c, snapshot_catalog=snapshot_catalog
831
+ )
832
+ else:
833
+ # Initialize nesting depth if not already provided.
834
+ dataset_nesting_depth = (
835
+ self._dataset_nesting_depth()
836
+ if dataset_nesting_depth is None
837
+ else dataset_nesting_depth
838
+ )
839
+ if dataset_nesting_depth:
840
+ nested_paths = self._collect_paths(
841
+ dataset_nesting_depth=dataset_nesting_depth - 1
842
+ )
843
+ if nested_paths:
844
+ paths |= {
845
+ tuple([dataset_table]),
846
+ (dataset_table, dataset_dataset),
847
+ }
848
+ paths |= {(self.dataset_table, dataset_dataset) + p for p in nested_paths}
849
+ return paths
850
+
851
+ def _dataset_nesting_depth(self, dataset_rid: Optional[RID] = None) -> int:
760
852
  """Determine the maximum dataset nesting depth in the current catalog.
761
853
 
762
854
  Returns:
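
One detail of the `_collect_paths` signature above: paths are returned as a `set[tuple[Table, ...]]`, and each path from `_schema_to_paths` is converted with `tuple(p)` because lists are unhashable; the nested-dataset expansion can produce the same sub-path repeatedly, and the set is what deduplicates them. A tiny illustration of that design choice:

```python
# Lists can't go into a set; tuples can, so converting each path enables deduplication.
paths = [["Dataset", "Dataset_Dataset", "Image"],
         ["Dataset", "Dataset_Dataset", "Image"]]
unique = {tuple(p) for p in paths}
assert len(unique) == 1
# set(paths) would raise TypeError: unhashable type: 'list'
```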
@@ -766,7 +858,7 @@ class Dataset:
766
858
  def children_depth(
767
859
  dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
768
860
  ) -> int:
769
- """Return the number of nested datasets in the current catalog"""
861
+ """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
770
862
  try:
771
863
  children = nested_datasets[dataset_rid]
772
864
  return (
@@ -783,8 +875,19 @@ class Dataset:
783
875
  .schemas[self._ml_schema]
784
876
  .tables["Dataset_Dataset"]
785
877
  )
878
+ dataset_children = (
879
+ [
880
+ {
881
+ "Dataset": dataset_rid,
882
+ "Nested_Dataset": c,
883
+ } # Make uniform with return from datapath
884
+ for c in self.list_dataset_children(dataset_rid)
885
+ ]
886
+ if dataset_rid
887
+ else pb.entities().fetch()
888
+ )
786
889
  nested_dataset = defaultdict(list)
787
- for ds in pb.entities().fetch():
890
+ for ds in dataset_children:
788
891
  nested_dataset[ds["Dataset"]].append(ds["Nested_Dataset"])
789
892
  return (
790
893
  max(map(lambda d: children_depth(d, dict(nested_dataset)), nested_dataset))
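
`_dataset_nesting_depth` builds a parent-to-children map (from `list_dataset_children` when a RID is given, otherwise from the whole `Dataset_Dataset` table) and takes the maximum recursion depth over it. A hedged sketch of that depth computation on a plain adjacency dict, in the spirit of `children_depth` above rather than a copy of it:

```python
from collections import defaultdict

# Stand-in for the fetched Dataset_Dataset rows (parent -> nested child).
links = [("d1", "d2"), ("d2", "d3"), ("d1", "d4")]

nested = defaultdict(list)
for parent, child in links:
    nested[parent].append(child)

def children_depth(rid: str, nested_datasets: dict[str, list[str]]) -> int:
    children = nested_datasets.get(rid, [])
    if not children:
        return 0
    return 1 + max(children_depth(c, nested_datasets) for c in children)

# Depth of the whole forest: d1 -> d2 -> d3 is the longest chain.
assert max(children_depth(d, dict(nested)) for d in nested) == 2
```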
@@ -793,7 +896,10 @@ class Dataset:
793
896
  )
794
897
 
795
898
  def _dataset_specification(
796
- self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
899
+ self,
900
+ writer: Callable[[str, str, Table], list[dict[str, Any]]],
901
+ dataset: DatasetSpec,
902
+ snapshot_catalog: Optional[DerivaML] = None,
797
903
  ) -> list[dict[str, Any]]:
798
904
  """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
799
905
  The top level data directory of the resulting BDBag will have one subdirectory for element type. the subdirectory
@@ -833,21 +939,24 @@ class Dataset:
833
939
  A dataset_table specification.
834
940
  """
835
941
  element_spec = []
836
- for path in self._table_paths():
942
+ for path in self._table_paths(
943
+ dataset=dataset, snapshot_catalog=snapshot_catalog
944
+ ):
837
945
  element_spec.extend(writer(*path))
838
946
  return self._vocabulary_specification(writer) + element_spec
839
947
 
840
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
841
- def download_dataset_bag(
948
+ def _download_dataset_bag(
842
949
  self,
843
950
  dataset: DatasetSpec,
844
951
  execution_rid: Optional[RID] = None,
952
+ snapshot_catalog: Optional[DerivaML] = None,
845
953
  ) -> DatasetBag:
846
954
  """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.
847
955
 
848
956
  Args:
849
957
  dataset: Specification of the dataset to be downloaded.
850
958
  execution_rid: Execution RID for the dataset.
959
+ snapshot_catalog: Snapshot catalog for the dataset version if specified.
851
960
 
852
961
  Returns:
853
962
  Tuple consisting of the path to the dataset, the RID of the dataset that was downloaded and the MINID
@@ -858,16 +967,17 @@ class Dataset:
858
967
  and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
859
968
  ):
860
969
  raise DerivaMLException(f"RID {execution_rid} is not an execution")
861
- minid = self.get_dataset_minid(dataset)
970
+ minid = self._get_dataset_minid(dataset, snapshot_catalog=snapshot_catalog)
862
971
 
863
972
  bag_path = (
864
973
  self._materialize_dataset_bag(minid, execution_rid=execution_rid)
865
974
  if dataset.materialize
866
- else self._download_dataset_bag(minid)
975
+ else self._download_dataset_minid(minid)
867
976
  )
868
977
  return DatabaseModel(minid, bag_path).get_dataset()
869
978
 
870
979
  def _version_snapshot(self, dataset: DatasetSpec) -> str:
980
+ """Return a catalog with snapshot for the specified dataset version"""
871
981
  version_record = [
872
982
  h
873
983
  for h in self.dataset_history(dataset_rid=dataset.rid)
@@ -875,13 +985,17 @@ class Dataset:
875
985
  ][0]
876
986
  return f"{self._model.catalog.catalog_id}@{iso_to_snap(version_record.timestamp.isoformat())}"
877
987
 
878
- def _create_dataset_minid(self, dataset: DatasetSpec) -> str:
988
+ def _create_dataset_minid(
989
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
990
+ ) -> str:
879
991
  with TemporaryDirectory() as tmp_dir:
880
992
  # Generate a download specification file for the current catalog schema. By default, this spec
881
993
  # will generate a minid and place the bag into S3 storage.
882
994
  spec_file = f"{tmp_dir}/download_spec.json"
883
995
  with open(spec_file, "w", encoding="utf-8") as ds:
884
- json.dump(self._generate_dataset_download_spec(dataset), ds)
996
+ json.dump(
997
+ self._generate_dataset_download_spec(dataset, snapshot_catalog), ds
998
+ )
885
999
  try:
886
1000
  self._logger.info(
887
1001
  f"Downloading dataset minid for catalog: {dataset.rid}@{str(dataset.version)}"
@@ -918,14 +1032,17 @@ class Dataset:
918
1032
  version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
919
1033
  return minid_page_url
920
1034
 
921
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
922
- def get_dataset_minid(
923
- self, dataset: DatasetSpec, create: bool = True
1035
+ def _get_dataset_minid(
1036
+ self,
1037
+ dataset: DatasetSpec,
1038
+ snapshot_catalog: Optional[DerivaML] = None,
1039
+ create: bool = True,
924
1040
  ) -> DatasetMinid:
925
1041
  """Return a MINID to the specified dataset. If no version is specified, use the latest.
926
1042
 
927
1043
  Args:
928
1044
  dataset: Specification of the dataset.
1045
+ snapshot_catalog: Snapshot catalog for the dataset version if specified.
929
1046
  create: Create a new MINID if one doesn't already exist.
930
1047
 
931
1048
  Returns:
@@ -956,12 +1073,12 @@ class Dataset:
956
1073
  f"Minid for dataset {dataset.rid} doesn't exist"
957
1074
  )
958
1075
  self._logger.info("Creating new MINID for dataset %s", dataset.rid)
959
- minid_url = self._create_dataset_minid(dataset)
1076
+ minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
960
1077
  # If provided a MINID, use the MINID metadata to get the checksum and download the bag.
961
1078
  r = requests.get(minid_url, headers={"accept": "application/json"})
962
1079
  return DatasetMinid(dataset_version=dataset.version, **r.json())
963
1080
 
964
- def _download_dataset_bag(self, minid: DatasetMinid) -> Path:
1081
+ def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
965
1082
  """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
966
1083
  that all the metadata is correct
967
1084
 
@@ -1028,7 +1145,7 @@ class Dataset:
1028
1145
  return True
1029
1146
 
1030
1147
  # request metadata
1031
- bag_path = self._download_dataset_bag(minid)
1148
+ bag_path = self._download_dataset_minid(minid)
1032
1149
  bag_dir = bag_path.parent
1033
1150
  validated_check = bag_dir / "validated_check.txt"
1034
1151
 
@@ -1042,7 +1159,11 @@ class Dataset:
1042
1159
  validated_check.touch()
1043
1160
  return Path(bag_path)
1044
1161
 
1045
- def _export_outputs(self) -> list[dict[str, Any]]:
1162
+ def _export_outputs(
1163
+ self,
1164
+ dataset: Optional[DatasetSpec] = None,
1165
+ snapshot_catalog: Optional[DerivaML] = None,
1166
+ ) -> list[dict[str, Any]]:
1046
1167
  """Return and output specification for the datasets in the provided model
1047
1168
 
1048
1169
  Returns:
@@ -1079,9 +1200,13 @@ class Dataset:
1079
1200
  "source": {"api": "schema", "skip_root_path": True},
1080
1201
  "destination": {"type": "json", "name": "schema"},
1081
1202
  },
1082
- ] + self._dataset_specification(writer)
1203
+ ] + self._dataset_specification(
1204
+ writer, dataset, snapshot_catalog=snapshot_catalog
1205
+ )
1083
1206
 
1084
- def _processor_params(self) -> list[dict[str, Any]]:
1207
+ def _processor_params(
1208
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
1209
+ ) -> list[dict[str, Any]]:
1085
1210
  """
1086
1211
  Returns:
1087
1212
  a download specification for the datasets in the provided model.
@@ -1107,7 +1232,7 @@ class Dataset:
1107
1232
  "processor": "json",
1108
1233
  "processor_params": {"query_path": "/schema", "output_path": "schema"},
1109
1234
  }
1110
- ] + self._dataset_specification(writer)
1235
+ ] + self._dataset_specification(writer, dataset, snapshot_catalog)
1111
1236
 
1112
1237
  @staticmethod
1113
1238
  def _download_dataset_element(
@@ -1186,7 +1311,9 @@ class Dataset:
1186
1311
  )
1187
1312
  return exports
1188
1313
 
1189
- def _generate_dataset_download_spec(self, dataset: DatasetSpec) -> dict[str, Any]:
1314
+ def _generate_dataset_download_spec(
1315
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML]
1316
+ ) -> dict[str, Any]:
1190
1317
  """
1191
1318
 
1192
1319
  Returns:
@@ -1244,7 +1371,7 @@ class Dataset:
1244
1371
  },
1245
1372
  },
1246
1373
  ]
1247
- + self._processor_params(),
1374
+ + self._processor_params(dataset, snapshot_catalog),
1248
1375
  },
1249
1376
  }
1250
1377
 
@@ -139,7 +139,6 @@ class FileSpec(BaseModel):
139
139
  if url_parts.scheme == "tag":
140
140
  return v
141
141
  elif not url_parts.scheme:
142
- print(v)
143
142
  return f'tag://{gethostname()},{date.today()}:file://{v}'
144
143
  else:
145
144
  raise ValidationError("url is not a file URL")
@@ -15,8 +15,11 @@ import logging
15
15
  from datetime import datetime
16
16
  import hashlib
17
17
  from itertools import chain
18
+ import inspect
18
19
  from pathlib import Path
19
20
  import requests
21
+ from setuptools_git_versioning import get_latest_file_commit
22
+ import subprocess
20
23
  from typing import Optional, Any, Iterable, TYPE_CHECKING
21
24
  from deriva.core import (
22
25
  ErmrestCatalog,
@@ -35,6 +38,8 @@ from pydantic import validate_call, ConfigDict
35
38
  from .execution_configuration import ExecutionConfiguration, Workflow
36
39
  from .feature import Feature, FeatureRecord
37
40
  from .dataset import Dataset
41
+ from .dataset_aux_classes import DatasetSpec
42
+ from .dataset_bag import DatasetBag
38
43
  from .deriva_model import DerivaModel
39
44
  from .upload import (
40
45
  table_path,
@@ -56,6 +61,18 @@ from .deriva_definitions import (
56
61
  FileSpec,
57
62
  )
58
63
 
64
+ try:
65
+ from icecream import ic
66
+ except ImportError: # Graceful fallback if IceCream isn't installed.
67
+ ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
68
+
69
+
70
+ try:
71
+ from IPython import get_ipython
72
+ except ImportError: # Graceful fallback if IPython isn't installed.
73
+ get_ipython = lambda: None
74
+
75
+
59
76
  if TYPE_CHECKING:
60
77
  from .execution import Execution
61
78
 
@@ -132,6 +149,17 @@ class DerivaML(Dataset):
132
149
  self.version = model_version
133
150
  self.configuration = None
134
151
  self._execution: Optional[Execution] = None
152
+ self._notebook = None
153
+ try:
154
+ from IPython import get_ipython
155
+
156
+ ipython = get_ipython()
157
+ # Check if running in Jupyter's ZMQ kernel (used by notebooks)
158
+ if ipython is not None and "IPKernelApp" in ipython.config:
159
+ self._notebook = Path(ipython.user_ns.get("__session__"))
160
+ # Check if running in Jupyter's ZMQ kernel (used by notebooks)
161
+ except (ImportError, AttributeError):
162
+ pass
135
163
 
136
164
  self.domain_schema = self.model.domain_schema
137
165
  self.project_name = project_name or self.domain_schema
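
The constructor now records whether it is running inside a Jupyter kernel by checking `get_ipython()` for an `IPKernelApp` config entry and reading the `__session__` notebook path; that `_notebook` value later drives the notebook-specific checksum and checkpoint behavior. A hedged standalone sketch of the same detection:

```python
from pathlib import Path
from typing import Optional

def current_notebook() -> Optional[Path]:
    """Best-effort detection of the executing notebook (None for plain scripts)."""
    try:
        from IPython import get_ipython  # optional dependency
    except ImportError:
        return None
    ipython = get_ipython()
    # "IPKernelApp" only shows up in the config when running under a Jupyter kernel.
    if ipython is not None and "IPKernelApp" in ipython.config:
        session = ipython.user_ns.get("__session__")
        return Path(session) if session else None
    return None
```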
@@ -705,6 +733,28 @@ class DerivaML(Dataset):
705
733
  for v in pb.schemas[table.schema.name].tables[table.name].entities().fetch()
706
734
  ]
707
735
 
736
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
737
+ def download_dataset_bag(
738
+ self,
739
+ dataset: DatasetSpec,
740
+ execution_rid: Optional[RID] = None,
741
+ ) -> DatasetBag:
742
+ """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.
743
+
744
+ Args:
745
+ dataset: Specification of the dataset to be downloaded.
746
+ execution_rid: Execution RID for the dataset.
747
+
748
+ Returns:
749
+ Tuple consisting of the path to the dataset, the RID of the dataset that was downloaded and the MINID
750
+ for the dataset.
751
+ """
752
+ return self._download_dataset_bag(
753
+ dataset=dataset,
754
+ execution_rid=execution_rid,
755
+ snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
756
+ )
757
+
708
758
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
709
759
  def download_asset(self, asset_rid: RID, dest_dir: Path) -> Path:
710
760
  """Download an asset from a URL and place it in a local directory.
@@ -808,8 +858,10 @@ class DerivaML(Dataset):
808
858
  Iterable of the RIDs of the files that were added.
809
859
  """
810
860
  defined_types = self.list_vocabulary_terms(MLVocab.file_type)
811
- if execution_rid and self.resolve_rid(execution_rid).table.name != 'Execution':
812
- raise DerivaMLException(f'RID {execution_rid} is not for an execution table.')
861
+ if execution_rid and self.resolve_rid(execution_rid).table.name != "Execution":
862
+ raise DerivaMLException(
863
+ f"RID {execution_rid} is not for an execution table."
864
+ )
813
865
 
814
866
  def check_file_type(dtype: str) -> bool:
815
867
  for term in defined_types:
@@ -862,18 +914,11 @@ class DerivaML(Dataset):
862
914
  self, file_types: Optional[list[str]] = None
863
915
  ) -> list[dict[str, Any]]:
864
916
  """Return the contents of the file table. Denormalized file types into the file record."""
865
- atable = next(
866
- self._model.schemas[self._ml_schema]
867
- .tables[MLVocab.dataset_type]
868
- .find_associations()
869
- ).name
870
917
  ml_path = self.pathBuilder.schemas[self._ml_schema]
871
- atable_path = ml_path.tables[atable]
872
918
  file_path = ml_path.File
873
919
  type_path = ml_path.File_File_Type
874
920
 
875
921
  # Get a list of all the dataset_type values associated with this dataset_table.
876
- files = []
877
922
  path = file_path.link(type_path)
878
923
  path = path.attributes(
879
924
  path.File.RID,
@@ -885,10 +930,12 @@ class DerivaML(Dataset):
885
930
  )
886
931
  file_map = {}
887
932
  for f in path.fetch():
888
- file_map.setdefault(f['RID'], f | {'File_Types': []})['File_Types'].append(f['File_Type'])
933
+ file_map.setdefault(f["RID"], f | {"File_Types": []})["File_Types"].append(
934
+ f["File_Type"]
935
+ )
889
936
 
890
937
  # Now get rid of the File_Type key and return the result
891
- return [ (f, f.pop('File_Type'))[0] for f in file_map.values()]
938
+ return [(f, f.pop("File_Type"))[0] for f in file_map.values()]
892
939
 
893
940
  def list_workflows(self) -> list[Workflow]:
894
941
  """Return a list of all the workflows in the catalog."""
@@ -901,6 +948,7 @@ class DerivaML(Dataset):
901
948
  version=w["Version"],
902
949
  description=w["Description"],
903
950
  rid=w["RID"],
951
+ checksum=w["Checksum"],
904
952
  )
905
953
  for w in workflow_path.entities().fetch()
906
954
  ]
@@ -917,33 +965,18 @@ class DerivaML(Dataset):
917
965
  """
918
966
 
919
967
  # Check to make sure that the workflow is not already in the table. If it's not, add it.
920
- def get_checksum(url) -> str:
921
- """Get the checksum of a file from a URL."""
922
- try:
923
- response = requests.get(url)
924
- response.raise_for_status()
925
- except Exception:
926
- raise DerivaMLException(f"Invalid URL: {url}")
927
- else:
928
- sha256_hash = hashlib.sha256()
929
- sha256_hash.update(response.content)
930
- checksum = "SHA-256: " + sha256_hash.hexdigest()
931
- return checksum
968
+
969
+ if workflow_rid := self.lookup_workflow(workflow.url):
970
+ return workflow_rid
932
971
 
933
972
  ml_schema_path = self.pathBuilder.schemas[self.ml_schema]
934
973
  try:
935
- url_column = ml_schema_path.Workflow.URL
936
- workflow_record = list(
937
- ml_schema_path.Workflow.filter(url_column == workflow.url).entities()
938
- )[0]
939
- workflow_rid = workflow_record["RID"]
940
- except IndexError:
941
974
  # Record doesn't exist already
942
975
  workflow_record = {
943
976
  "URL": workflow.url,
944
977
  "Name": workflow.name,
945
978
  "Description": workflow.description,
946
- "Checksum": get_checksum(workflow.url),
979
+ "Checksum": workflow.checksum,
947
980
  "Version": workflow.version,
948
981
  MLVocab.workflow_type: self.lookup_term(
949
982
  MLVocab.workflow_type, workflow.workflow_type
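
`add_workflow` no longer downloads the workflow URL to compute a checksum; it short-circuits through `lookup_workflow` and stores the `checksum` carried on the `Workflow` object, which `create_workflow` computes from the local file (piping notebooks through `nbstripout` first). A hedged sketch of producing that local checksum for a plain script; the path is illustrative:

```python
import hashlib
from pathlib import Path

def file_checksum(path: Path) -> str:
    """SHA-256 over the file contents, in the 'SHA-256:<hex>' form stored in the Checksum column."""
    digest = hashlib.sha256(path.read_bytes()).hexdigest()
    return f"SHA-256:{digest}"

# Hypothetical script path; for notebooks the code above strips cell outputs
# with `nbstripout -t` before hashing so transient outputs don't change the checksum.
print(file_checksum(Path("train_model.py")))
```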
@@ -955,6 +988,125 @@ class DerivaML(Dataset):
955
988
  raise DerivaMLException(f"Failed to insert workflow. Error: {error}")
956
989
  return workflow_rid
957
990
 
991
+ def lookup_workflow(self, url: str) -> Optional[RID]:
992
+ workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
993
+ try:
994
+ url_column = workflow_path.URL
995
+ return list(workflow_path.filter(url_column == url).entities())[0]["RID"]
996
+ except IndexError:
997
+ return None
998
+
999
+ def create_workflow(
1000
+ self, name: str, workflow_type: str, description: str = "", create: bool = True
1001
+ ) -> RID:
1002
+ """Identify current executing program and return a workflow RID for it
1003
+
1004
+ Determine the notebook or script that is currently being executed. Assume that this is
1005
+ being executed from a cloned GitHub repository. Determine the remote repository name for
1006
+ this object. Then either retrieve an existing workflow for this executable or create
1007
+ a new one.
1008
+
1009
+ Args:
1010
+ name: The name of the workflow.
1011
+ workflow_type: The type of the workflow.
1012
+ description: The description of the workflow.
1013
+ create: Whether or not to create a new workflow.
1014
+ """
1015
+ # Make sure type is correct.
1016
+ self.lookup_term(MLVocab.workflow_type, workflow_type)
1017
+ filename, github_url, is_dirty = self._github_url()
1018
+
1019
+ if is_dirty:
1020
+ self._logger.warning(
1021
+ f"File {filename} has been modified since last commit. Consider commiting before executing"
1022
+ )
1023
+
1024
+ sha256_hash = hashlib.sha256()
1025
+ if self._notebook:
1026
+ # If you are in a notebook, strip out the outputs before computing the checksum.
1027
+ result = subprocess.run(
1028
+ ["nbstripout", "-t", filename],
1029
+ capture_output=True,
1030
+ text=False,
1031
+ check=True,
1032
+ )
1033
+ sha256_hash.update(result.stdout)
1034
+ else:
1035
+ with open(filename, "rb") as f:
1036
+ sha256_hash.update(f.read())
1037
+ checksum = "SHA-256:" + sha256_hash.hexdigest()
1038
+
1039
+ workflow = Workflow(
1040
+ name=name,
1041
+ url=github_url,
1042
+ checksum=checksum,
1043
+ description=description,
1044
+ workflow_type=workflow_type,
1045
+ )
1046
+ return self.add_workflow(workflow) if create else None
1047
+
1048
+ def _github_url(self) -> tuple[str, str, bool]:
1049
+ """Return a GitHUB URL for the latest commit of the script from which this routine is called.
1050
+
1051
+ This routine is meant to be called from a script or notebook (e.g. python -m file). It assumes that
1052
+ the file is in a GitHub repository and committed. It returns a URL to the last committed version of this
1053
+ file in GitHub.
1054
+
1055
+ Returns: A tuple with the filename, github_url and a boolean indicating whether uncommitted changes
1056
+ have been made to the file.
1057
+
1058
+ """
1059
+
1060
+ # Get the name of the script that is calling this function.
1061
+ if self._notebook:
1062
+ # Try to get the __session__ variable from the user namespace.
1063
+ filename = Path("").absolute().parent / self._notebook
1064
+ else:
1065
+ stack = inspect.stack()
1066
+ if len(stack) > 1:
1067
+ filename = Path(
1068
+ stack[2].filename
1069
+ ) # Get the caller's filename, which is two up the stack from here.
1070
+ else:
1071
+ raise DerivaMLException(
1072
+ f"Looking for caller failed"
1073
+ ) # Stack is too shallow
1074
+
1075
+ # Get repo URL from local github repo.
1076
+ try:
1077
+ result = subprocess.run(
1078
+ ["git", "remote", "get-url", "origin"], capture_output=True, text=True
1079
+ )
1080
+ github_url = result.stdout.strip().removesuffix(".git")
1081
+ except subprocess.CalledProcessError:
1082
+ raise DerivaMLException(f"No GIT remote found")
1083
+
1084
+ # Find the root directory for the repository
1085
+ repo_root = filename
1086
+ while repo_root != repo_root.root:
1087
+ if (repo_root / ".git").exists():
1088
+ break
1089
+ else:
1090
+ repo_root = repo_root.parent
1091
+
1092
+ # Now check to see if file has been modified since the last commit.
1093
+ try:
1094
+ result = subprocess.run(
1095
+ ["git", "status", "--porcelain"],
1096
+ capture_output=True,
1097
+ text=True,
1098
+ check=True,
1099
+ )
1100
+ is_dirty = bool(
1101
+ " M " in result.stdout.strip()
1102
+ ) # Returns True if output indicates a modified file
1103
+ except subprocess.CalledProcessError:
1104
+ is_dirty = False # If Git command fails, assume no changes
1105
+
1106
+ sha = get_latest_file_commit(filename)
1107
+ url = f"{github_url}/blob/{sha}/{filename.relative_to(repo_root)}"
1108
+ return filename, url, is_dirty
1109
+
958
1110
  # @validate_call
959
1111
  def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
960
1112
  """Create an execution object
@@ -0,0 +1,104 @@
1
+ from sympy import cxxcode
2
+
3
+ from deriva_ml import DerivaML, execution_configuration
4
+
5
+ def execute(host, catalog, script):
6
+ workflow_rid = foobar
7
+ execution_configuration = cxxcode(
8
+
9
+ )
10
+ ml_instance = DerivaML()
11
+ ml_instance.create_execution(configuration)
12
+ script
13
+
14
+
15
+ from deriva_ml import DerivaML, ExecutionConfiguration, DatasetSpec, RID, DerivaMLException
16
+ import os
17
+ import sys
18
+ import json
19
+ import traceback
20
+ import argparse
21
+ import requests
22
+ from requests.exceptions import HTTPError, ConnectionError
23
+ from deriva.transfer import GenericDownloader
24
+ from deriva.transfer.download import DerivaDownloadError, DerivaDownloadConfigurationError, \
25
+ DerivaDownloadAuthenticationError, DerivaDownloadAuthorizationError, DerivaDownloadTimeoutError, \
26
+ DerivaDownloadBaggingError
27
+ from deriva.core import BaseCLI, KeyValuePairArgs, format_credential, format_exception, urlparse
28
+
29
+
30
+ class DerivaMLExecCLI(BaseCLI):
31
+ def __init__(self, description, epilog, **kwargs):
32
+
33
+ BaseCLI.__init__(self, description, epilog, **kwargs)
34
+ self.parser.add_argument("--catalog", default=1, metavar="<1>", help="Catalog number. Default: 1")
35
+ self.parser.add_argument("--timeout", metavar="<seconds>",
36
+ help="Total number of seconds elapsed before the download is aborted.")
37
+ self.parser.add_argument("output_dir", metavar="<output dir>", help="Path to an output directory.")
38
+ self.parser.add_argument("envars", metavar="[key=value key=value ...]",
39
+ nargs=argparse.REMAINDER, action=KeyValuePairArgs, default={},
40
+ help="Variable length of whitespace-delimited key=value pair arguments used for "
41
+ "string interpolation in specific parts of the configuration file. "
42
+ "For example: key1=value1 key2=value2")
43
+
44
+ def main(self):
45
+ try:
46
+ args = self.parse_cli()
47
+ except ValueError as e:
48
+ sys.stderr.write(str(e))
49
+ return 2
50
+ if not args.quiet:
51
+ sys.stderr.write("\n")
52
+
53
+ try:
54
+ try:
55
+ ml_instance = DerivaML(args.hostname, args.catalog)
56
+ downloaded = self.execute()
57
+ sys.stdout.write("\n%s\n" % (json.dumps(downloaded)))
58
+ except ConnectionError as e:
59
+ raise DerivaDownloadError("Connection error occurred. %s" % format_exception(e))
60
+ except HTTPError as e:
61
+ if e.response.status_code == requests.codes.unauthorized:
62
+ raise DerivaDownloadAuthenticationError(
63
+ "The requested service requires authentication and a valid login session could "
64
+ "not be found for the specified host. Server responded: %s" % e)
65
+ elif e.response.status_code == requests.codes.forbidden:
66
+ raise DerivaDownloadAuthorizationError(
67
+ "A requested operation was forbidden. Server responded: %s" % e)
68
+ except (DerivaDownloadError, DerivaDownloadConfigurationError, DerivaDownloadAuthenticationError,
69
+ DerivaDownloadAuthorizationError, DerivaDownloadTimeoutError, DerivaDownloadBaggingError) as e:
70
+ sys.stderr.write(("\n" if not args.quiet else "") + format_exception(e))
71
+ if args.debug:
72
+ traceback.print_exc()
73
+ return 1
74
+ except:
75
+ sys.stderr.write("An unexpected error occurred.")
76
+ traceback.print_exc()
77
+ return 1
78
+ finally:
79
+ if not args.quiet:
80
+ sys.stderr.write("\n\n")
81
+ return 0
82
+
83
+
84
+ def do_stuff():
85
+ pass
86
+
87
+ def main(datasets: list[RID], model: list[RID], hostname: str, catalog_id: str):
88
+ my_url = DerivaML.github_url()
89
+ ml_instance = DerivaML(hostname, catalog_id)
90
+ ml_instance.lookup_workflow(my_url)
91
+ config = ExecutionConfiguration(
92
+ datasets=[DatasetSpec(rid=dataset,
93
+ version=ml_instance.dataset_version(dataset)) for dataset in datasets],
94
+ assets=model,
95
+ workflow= ml_instance.lookup_workflow(my_url)
96
+ )
97
+ execution = ml_instance.create_execution(config)
98
+ with execution as e:
99
+ do_stuff()
100
+ execution.upload_execution_outputs()
101
+
102
+ if __name__ == "__main__":
103
+ main(datasets, model, hostname, catalog_id)
104
+ if __file__ == matplotlib_inline
deriva_ml/deriva_model.py CHANGED
@@ -265,7 +265,9 @@ class DerivaModel:
265
265
  return relationships[0]
266
266
 
267
267
  def _schema_to_paths(
268
- self, root: Table = None, path: list[Table] = None
268
+ self,
269
+ root: Table = None,
270
+ path: list[Table] = None,
269
271
  ) -> list[list[Table]]:
270
272
  """Recursively walk over the domain schema graph and extend the current path.
271
273
 
@@ -278,6 +280,7 @@ class DerivaModel:
278
280
  A list of all the paths through the graph. Each path is a list of tables.
279
281
 
280
282
  """
283
+
281
284
  root = root or self.model.schemas[self.ml_schema].tables["Dataset"]
282
285
  path = path.copy() if path else []
283
286
  parent = path[-1] if path else None # Table that we are coming from.
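
`_schema_to_paths` recursively walks the schema graph from `Dataset`, extending the current path at each table; the dataset-specific filtering added in `_collect_paths` then prunes the resulting paths. A hedged, generic sketch of that style of path enumeration, using table names and an adjacency map in place of real ERMrest `Table` objects:

```python
# Generic path enumeration of the kind _schema_to_paths performs (illustrative graph).
graph = {
    "Dataset": ["Dataset_Image", "Dataset_Subject"],
    "Dataset_Image": ["Image"],
    "Dataset_Subject": ["Subject"],
}

def schema_paths(root: str, path: list[str] | None = None) -> list[list[str]]:
    path = (path or []) + [root]
    paths = [path]                      # every prefix is itself a path
    for child in graph.get(root, []):
        paths.extend(schema_paths(child, path))
    return paths

for p in schema_paths("Dataset"):
    print("/".join(p))
# Dataset, Dataset/Dataset_Image, Dataset/Dataset_Image/Image, ...
```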
deriva_ml/execution.py CHANGED
@@ -12,6 +12,7 @@ import os
12
12
  import shutil
13
13
  from datetime import datetime
14
14
  from pathlib import Path
15
+ import requests
15
16
  from tempfile import NamedTemporaryFile
16
17
  from typing import Iterable, Any, Optional
17
18
  from deriva.core import format_exception
@@ -28,7 +29,6 @@ from .deriva_definitions import (
28
29
  )
29
30
  from .deriva_ml_base import DerivaML, FeatureRecord
30
31
  from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
31
- from .dataset import Dataset
32
32
  from .dataset_bag import DatasetBag
33
33
  from .execution_configuration import ExecutionConfiguration
34
34
  from .execution_environment import get_execution_environment
@@ -51,6 +51,12 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
51
51
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
52
52
 
53
53
 
54
+ try:
55
+ from jupyter_server.serverapp import list_running_servers
56
+ except ImportError:
57
+ list_running_servers = lambda: []
58
+
59
+
54
60
  class Execution:
55
61
  """The Execution class is used to capture the context of an activity within DerivaML. While these are primarily
56
62
  computational, manual processes can be represented by an execution as well.
@@ -100,6 +106,7 @@ class Execution:
100
106
  self.configuration = configuration
101
107
  self._ml_object = ml_object
102
108
  self.start_time = None
109
+ self.stop_time = None
103
110
  self.status = Status.created
104
111
  self.uploaded_assets: list[Path] = []
105
112
 
@@ -221,8 +228,9 @@ class Execution:
221
228
  Returns:
222
229
  the location of the unpacked and validated dataset_table bag and the RID of the bag
223
230
  """
224
- ds = Dataset(self._ml_object.model, cache_dir=self._cache_dir)
225
- return ds.download_dataset_bag(dataset, execution_rid=self.execution_rid)
231
+ return self._ml_object.download_dataset_bag(
232
+ dataset, execution_rid=self.execution_rid
233
+ )
226
234
 
227
235
  @validate_call
228
236
  def update_status(self, status: Status, msg: str) -> None:
@@ -243,6 +251,35 @@ class Execution:
243
251
  ]
244
252
  )
245
253
 
254
+ def _create_notebook_checkpoint(self):
255
+ """Trigger a checkpoint creation using Jupyter's API."""
256
+ notebook_name = self._ml_object._notebook
257
+ servers = list_running_servers()
258
+ # Look for the server running this notebook.
259
+ root = Path("").absolute().parent.as_posix()
260
+ servers = list(list_running_servers())
261
+ # JupyterHub seems to handle root_dir differently than the plain server case.
262
+ server = (
263
+ servers
264
+ if len(servers) == 1
265
+ else [s for s in servers if s["root_dir"] == root]
266
+ )[0]
267
+ notebook_url = f"{server['url']}api/contents/{notebook_name}"
268
+
269
+ # Get notebook content
270
+ response = requests.get(
271
+ notebook_url, headers={"Authorization": f"Token {server['token']}"}
272
+ )
273
+ if response.status_code == 200:
274
+ notebook_content = response.json()["content"]
275
+ # Execution metadata cannot be in a directory, so map path into filename.
276
+ checkpoint_path = (
277
+ self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
278
+ / f"{notebook_name.as_posix().replace('/','_')}.checkpoint"
279
+ )
280
+ with open(checkpoint_path, "w", encoding="utf-8") as f:
281
+ json.dump(notebook_content, f)
282
+
246
283
  def execution_start(self) -> None:
247
284
  """Start an execution, uploading status to catalog"""
248
285
 
@@ -252,11 +289,15 @@ class Execution:
252
289
 
253
290
  def execution_stop(self) -> None:
254
291
  """Finish the execution and update the duration and status of execution."""
255
- duration = datetime.now() - self.start_time
292
+ self.stop_time = datetime.now()
293
+ duration = self.stop_time - self.start_time
256
294
  hours, remainder = divmod(duration.total_seconds(), 3600)
257
295
  minutes, seconds = divmod(remainder, 60)
258
296
  duration = f"{round(hours, 0)}H {round(minutes, 0)}min {round(seconds, 4)}sec"
259
297
 
298
+ if self._ml_object._notebook:
299
+ self._create_notebook_checkpoint()
300
+
260
301
  self.update_status(Status.completed, "Algorithm execution ended.")
261
302
  self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
262
303
  [{"RID": self.execution_rid, "Duration": duration}]
@@ -33,18 +33,18 @@ class Workflow(BaseModel):
33
33
  version: Optional[str] = None
34
34
  description: Optional[str] = ""
35
35
  rid: Optional[RID] = None
36
+ checksum: Optional[str]
37
+
36
38
 
37
39
 
38
40
  class ExecutionConfiguration(BaseModel):
39
41
  """Define the parameters that are used to configure a specific execution.
40
42
 
41
43
  Attributes:
42
- datasets: List of dataset_table RIDS, MINIDS for datasets to be downloaded prior to execution. By default,
43
- all the datasets are materialized. However, if the assets associated with a dataset_table are not
44
- needed, a dictionary that defines the rid and the materialization parameter for the
45
- download_dataset_bag method can be specified, e.g. datasets=[{'rid': RID, 'materialize': True}].
44
+ datasets: List of dataset specifications, each giving the dataset RID, the version, and whether the dataset
45
+ should be materialized.
46
46
  assets: List of assets to be downloaded prior to execution. The values must be RIDs in an asset table
47
- workflow: A workflow instance. Must have a name, URI to the workflow instance, and a type.
47
+ workflow: A RID for a workflow instance. The workflow must have a name, a URI to the workflow instance, and a type.
48
48
  description: A description of the execution. Can use Markdown format.
49
49
  """
50
50
 
deriva_ml/upload.py CHANGED
@@ -70,8 +70,11 @@ exec_asset_regex = (
70
70
  exec_metadata_dir_regex = (
71
71
  exec_dir_regex + r"/execution-metadata/(?P<execution_metadata_type>[-\w]+)"
72
72
  )
73
+
74
+ # May have more than one suffix
73
75
  exec_metadata_regex = (
74
- exec_metadata_dir_regex + r"/(?P<filename>[-\w]+)[.](?P<file_ext>[a-z0-9]*)$"
76
+ exec_metadata_dir_regex
77
+ + r"/(?P<filename>[-\w]+([.][\w]+)*)[.](?P<file_ext>[a-z0-9]*)$"
75
78
  )
76
79
  feature_dir_regex = exec_dir_regex + r"/feature"
77
80
  feature_table_dir_regex = (
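
The execution-metadata filename pattern now allows multiple dotted suffixes, which is what lets files such as the notebook checkpoints written by `execution.py` (`<name>.ipynb.checkpoint`) match. A quick check of just the filename tail of the old and new patterns (the directory prefix built from `exec_metadata_dir_regex` is omitted here):

```python
import re

old_tail = r"/(?P<filename>[-\w]+)[.](?P<file_ext>[a-z0-9]*)$"
new_tail = r"/(?P<filename>[-\w]+([.][\w]+)*)[.](?P<file_ext>[a-z0-9]*)$"

name = "/My-Notebook.ipynb.checkpoint"
assert re.search(old_tail, name) is None            # old pattern rejects the extra suffix
m = re.search(new_tail, name)
assert m and m.group("filename") == "My-Notebook.ipynb" and m.group("file_ext") == "checkpoint"
```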
@@ -1,16 +1,18 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: deriva-ml
3
- Version: 1.8.1
3
+ Version: 1.8.4
4
4
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
6
  Requires-Python: >=3.10
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE
9
- Requires-Dist: deriva~=1.7.6
9
+ Requires-Dist: deriva~=1.7.7
10
10
  Requires-Dist: pandas
11
11
  Requires-Dist: regex~=2024.7.24
12
12
  Requires-Dist: pydantic>=2.10.6
13
13
  Requires-Dist: semver>3.0.0
14
+ Requires-Dist: setuptools-git-versioning<3,>=2.0
15
+ Requires-Dist: nbstripout
14
16
 
15
17
  Deriva-ML is a python libary to simplify the process of creating and executing reproducible machine learning workflows
16
18
  using a deriva catalog.
@@ -1,20 +1,21 @@
1
- deriva_ml/VERSION.py,sha256=6cud5pVpwnMsz6fxZVA6qWcQVSpLQXd0ddN6ip-1M_8,22
1
+ deriva_ml/VERSION.py,sha256=8kdJa8mgK7VES73y02oBbzwoXZCUs42GzbJ4UU-L_3I,22
2
2
  deriva_ml/__init__.py,sha256=0PHNB8gRDALLtaffRmU7wCUgWbRHVQZcjuPJxMLNEco,856
3
3
  deriva_ml/database_model.py,sha256=uhoyVyd8MQmY8J9ovCH8fjxhZDxxXNkdJyYdeyEGPXA,13898
4
- deriva_ml/dataset.py,sha256=f6eFgaTYjW-iZwqpVPZH9hSOmZP2C4_elkWhtLOjW9E,55524
4
+ deriva_ml/dataset.py,sha256=xC6QPUp4MZcJiEnOEU3NnzoLBL9RcJWtPTyzIQP0Ivw,60666
5
5
  deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
6
6
  deriva_ml/dataset_bag.py,sha256=e6IHv3saZUnZRfl0EjfnlV2NnmPeOagYYv3PuZqS1l0,11501
7
7
  deriva_ml/demo_catalog.py,sha256=xQPhFlflqwJskNQrQ-jdBSnGzBm2-aONBgcRxfsdNKM,11045
8
- deriva_ml/deriva_definitions.py,sha256=MGl29ogCzqrlRilMhSuR5tECo4NSHP4CLbJAXRtPH6E,8914
9
- deriva_ml/deriva_ml_base.py,sha256=ShDZlG9F4XrGRUcUINT3bb_P_UdvV1FqSnnPsjGTCLU,36443
10
- deriva_ml/deriva_model.py,sha256=F5zDw-MDV55POAjKB6dorpa7P4KTSN9hjDCN4E9zB9A,11986
11
- deriva_ml/execution.py,sha256=UcXWY1W5Mt_Yzuayd3Pjd-lKzLlMV5QXZFcLvE6Lt0E,28390
12
- deriva_ml/execution_configuration.py,sha256=nMeaG1qYdIgu4BV5atSUlcL8VZ3O6ohGY5iBhtD_LQ4,3700
8
+ deriva_ml/deriva_definitions.py,sha256=pZLPoUxiuJ-uGglmQ6sF9oVXsSUuOnPEqywoec78XNM,8893
9
+ deriva_ml/deriva_ml_base.py,sha256=3iA1OaPU-6Q7ixt87uDmPuHHZ5P-FyHvX0AKfi4tKp0,42224
10
+ deriva_ml/deriva_ml_execute.py,sha256=y_rGjc97eidBuzy-AaQGe93vuTbWbkNkK9rpReqV0IY,4433
11
+ deriva_ml/deriva_model.py,sha256=LV3FjIhIlz13ckZSmu0aOJhT9EVE0-M9oVMudfkxb0g,12004
12
+ deriva_ml/execution.py,sha256=c7dbk4HvEh7E4BLlBrf_azUxxhRSUmLQa_6G8t8OKVY,29929
13
+ deriva_ml/execution_configuration.py,sha256=bjnZwXN6M7YPy5dFQwoGEBU8YjhQRSe1FW0rL0V9TaM,3422
13
14
  deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
14
15
  deriva_ml/feature.py,sha256=7e8WYPCfJSrGxJh9oUTduYSnB5ekybRhXa_0HIigS_w,5459
15
16
  deriva_ml/history.py,sha256=qTDLDs8Ow_6r7mDO0gZm0Fg81SWKOAgtCU5pzZoDRgM,2828
16
17
  deriva_ml/test_functions.py,sha256=-eqLHjjCQCLBNAr1ofbZekNiCOfMISSACRxT_YHER8I,4396
17
- deriva_ml/upload.py,sha256=HCOChW6bALW_gt0sWUs_81bNPsb72TNs4o0FQsGSLM4,22222
18
+ deriva_ml/upload.py,sha256=CKtT-gBln3pnAll9TFaiPhFSHC-bzg9oE4ruh_OSOqY,22270
18
19
  deriva_ml/build/lib/schema_setup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
20
  deriva_ml/build/lib/schema_setup/alter_annotation.py,sha256=pkwk0WystN69JfAFK4iBJZAZVQKbRs-gN9IFYuS9rfg,1739
20
21
  deriva_ml/build/lib/schema_setup/annotation_temp.py,sha256=Euygu8wNklZFUbR6mz-pDWJemlzdsIn9d6j0f6fCfgE,9102
@@ -26,9 +27,9 @@ deriva_ml/schema_setup/annotations.py,sha256=Uogm9YkRtoKSdgfQlICqRywbCATppwBO-Xr
26
27
  deriva_ml/schema_setup/create_schema.py,sha256=jwziMWJPbjRgjiRBT-KtidnXI8YNEFO74A9fwfptjHY,10626
27
28
  deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
28
29
  deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
29
- deriva_ml-1.8.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
30
- deriva_ml-1.8.1.dist-info/METADATA,sha256=wIwGjDbMrZ88z1y2ohD8vXJQliQLSPmKRoLO86l6c8c,556
31
- deriva_ml-1.8.1.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
32
- deriva_ml-1.8.1.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
33
- deriva_ml-1.8.1.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
34
- deriva_ml-1.8.1.dist-info/RECORD,,
30
+ deriva_ml-1.8.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
31
+ deriva_ml-1.8.4.dist-info/METADATA,sha256=F14U7NvY310NBB4wGp3-OVmAUXvMy_sDNuS1ZmRjwek,631
32
+ deriva_ml-1.8.4.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
33
+ deriva_ml-1.8.4.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
34
+ deriva_ml-1.8.4.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
35
+ deriva_ml-1.8.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.2)
2
+ Generator: setuptools (76.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5