deriva-ml 1.6.7__tar.gz → 1.6.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {deriva_ml-1.6.7/src/deriva_ml.egg-info → deriva_ml-1.6.8}/PKG-INFO +1 -1
  2. deriva_ml-1.6.8/src/deriva_ml/VERSION.py +1 -0
  3. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/deriva_definitions.py +4 -1
  4. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/deriva_ml_base.py +99 -8
  5. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/execution.py +64 -245
  6. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/execution_configuration.py +1 -1
  7. {deriva_ml-1.6.7 → deriva_ml-1.6.8/src/deriva_ml.egg-info}/PKG-INFO +1 -1
  8. deriva_ml-1.6.8/tests/test_execution.py +148 -0
  9. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/tests/test_upload.py +9 -7
  10. deriva_ml-1.6.7/src/deriva_ml/VERSION.py +0 -1
  11. deriva_ml-1.6.7/tests/test_execution.py +0 -118
  12. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/LICENSE +0 -0
  13. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/README.md +0 -0
  14. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/pyproject.toml +0 -0
  15. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/setup.cfg +0 -0
  16. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/__init__.py +0 -0
  17. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/build/lib/schema_setup/__init__.py +0 -0
  18. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -0
  19. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -0
  20. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/build/lib/schema_setup/create_schema.py +0 -0
  21. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/build/lib/schema_setup/table_comments_utils.py +0 -0
  22. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/database_model.py +0 -0
  23. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/dataset.py +0 -0
  24. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/dataset_aux_classes.py +0 -0
  25. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/dataset_bag.py +0 -0
  26. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/demo_catalog.py +0 -0
  27. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/deriva_model.py +0 -0
  28. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/execution_environment.py +0 -0
  29. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/feature.py +0 -0
  30. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/history.py +0 -0
  31. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/schema_setup/__init__.py +0 -0
  32. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/schema_setup/alter_annotation.py +0 -0
  33. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/schema_setup/annotations.py +0 -0
  34. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/schema_setup/create_schema.py +0 -0
  35. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/schema_setup/policy.json +0 -0
  36. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/schema_setup/table_comments_utils.py +0 -0
  37. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/test_functions.py +0 -0
  38. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml/upload.py +0 -0
  39. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml.egg-info/SOURCES.txt +0 -0
  40. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  41. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml.egg-info/entry_points.txt +0 -0
  42. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml.egg-info/requires.txt +0 -0
  43. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/src/deriva_ml.egg-info/top_level.txt +0 -0
  44. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/tests/test_basic_tables.py +0 -0
  45. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/tests/test_dataset.py +0 -0
  46. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/tests/test_download.py +0 -0
  47. {deriva_ml-1.6.7 → deriva_ml-1.6.8}/tests/test_features.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: deriva-ml
3
- Version: 1.6.7
3
+ Version: 1.6.8
4
4
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
6
  Requires-Python: >=3.10
@@ -0,0 +1 @@
1
+ __version__ = "1.6.8"
@@ -70,8 +70,11 @@ class Status(StrEnum):
70
70
 
71
71
  """
72
72
 
73
- running = "Running"
73
+ initializing = "Initializing"
74
+ created = "Created"
74
75
  pending = "Pending"
76
+ running = "Running"
77
+ aborted = "Aborted"
75
78
  completed = "Completed"
76
79
  failed = "Failed"
77
80
 
@@ -11,14 +11,17 @@ relationships that follow a specific data model.
11
11
  import getpass
12
12
  import logging
13
13
  from datetime import datetime
14
+ import hashlib
14
15
  from itertools import chain
15
16
  from pathlib import Path
17
+ import requests
16
18
  from typing import Optional, Any, Iterable, TYPE_CHECKING
17
19
  from deriva.core import (
18
20
  ErmrestCatalog,
19
21
  get_credential,
20
22
  urlquote,
21
23
  DEFAULT_SESSION_CONFIG,
24
+ format_exception,
22
25
  )
23
26
  import deriva.core.datapath as datapath
24
27
  from deriva.core.datapath import DataPathException
@@ -27,7 +30,7 @@ from deriva.core.ermrest_model import Key, Table
27
30
  from deriva.core.hatrac_store import HatracStore
28
31
  from pydantic import validate_call, ConfigDict
29
32
 
30
- from .execution_configuration import ExecutionConfiguration
33
+ from .execution_configuration import ExecutionConfiguration, Workflow
31
34
  from .feature import Feature, FeatureRecord
32
35
  from .dataset import Dataset
33
36
  from .deriva_model import DerivaModel
@@ -47,6 +50,7 @@ from .deriva_definitions import (
47
50
  DerivaMLException,
48
51
  ML_SCHEMA,
49
52
  VocabularyTerm,
53
+ MLVocab,
50
54
  )
51
55
 
52
56
  if TYPE_CHECKING:
@@ -122,6 +126,7 @@ class DerivaML(Dataset):
122
126
  self.ml_schema = ml_schema
123
127
  self.version = model_version
124
128
  self.configuration = None
129
+ self._execution: Optional[Execution] = None
125
130
 
126
131
  self.domain_schema = self.model.domain_schema
127
132
  self.project_name = project_name or self.domain_schema
@@ -145,6 +150,10 @@ class DerivaML(Dataset):
145
150
  f"Loading dirty model. Consider commiting and tagging: {self.version}"
146
151
  )
147
152
 
153
+ def __del__(self):
154
+ if self._execution and self._execution.status != Status.completed:
155
+ self._execution.update_status(Status.aborted, f"Execution Aborted")
156
+
148
157
  @staticmethod
149
158
  def _get_session_config():
150
159
  """ """
@@ -187,7 +196,7 @@ class DerivaML(Dataset):
187
196
  return table_path(
188
197
  self.working_dir,
189
198
  schema=self.domain_schema,
190
- table=self.model.namne_to_table(table).name,
199
+ table=self.model.name_to_table(table).name,
191
200
  )
192
201
 
193
202
  def asset_dir(
@@ -688,19 +697,29 @@ class DerivaML(Dataset):
688
697
  for v in pb.schemas[table.schema.name].tables[table.name].entities().fetch()
689
698
  ]
690
699
 
691
- def download_asset(self, asset_url: str, dest_filename: str) -> Path:
700
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
701
+ def download_asset(self, asset_rid: RID, dest_dir: Path) -> Path:
692
702
  """Download an asset from a URL and place it in a local directory.
693
703
 
694
704
  Args:
695
- asset_url: URL of the asset.
696
- dest_filename: Destination filename.
705
+ asset_rid: URL of the asset.
706
+ dest_dir: Destination directory for the asset.
697
707
 
698
708
  Returns:
699
709
  A Path object to the downloaded asset.
700
710
  """
711
+ table = self.resolve_rid(asset_rid).table
712
+ if not self.model.is_asset(table):
713
+ raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")
714
+
715
+ tpath = self.pathBuilder.schemas[table.schema.name].tables[table.name]
716
+ asset_metadata = list(tpath.filter(tpath.RID == asset_rid).entities())[0]
717
+ asset_url = asset_metadata["URL"]
718
+ asset_filename = dest_dir / asset_metadata["Filename"]
719
+
701
720
  hs = HatracStore("https", self.host_name, self.credential)
702
- hs.get_obj(path=asset_url, destfilename=dest_filename)
703
- return Path(dest_filename)
721
+ hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
722
+ return Path(asset_filename)
704
723
 
705
724
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
706
725
  def upload_assets(
@@ -761,6 +780,72 @@ class DerivaML(Dataset):
761
780
  ]
762
781
  )
763
782
 
783
+ def list_workflows(self) -> list[Workflow]:
784
+ workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
785
+ return [
786
+ Workflow(
787
+ name=w["Name"],
788
+ url=w["URL"],
789
+ workflow_type=w["Workflow_Type"],
790
+ version=w["Version"],
791
+ description=w["Description"],
792
+ )
793
+ for w in workflow_path.entities().fetch()
794
+ ]
795
+
796
+ def add_workflow(self, workflow: Workflow) -> RID:
797
+ """Add a workflow to the Workflow table.
798
+
799
+ Args:
800
+ - url(str): URL of the workflow.
801
+ - workflow_type(str): Type of the workflow.
802
+ - version(str): Version of the workflow.
803
+ - description(str): Description of the workflow.
804
+
805
+ Returns:
806
+ - str: Resource Identifier (RID) of the added workflow.
807
+
808
+ """
809
+
810
+ # Check to make sure that the workflow is not already in the table. If it's not, add it.
811
+ def get_checksum(url) -> str:
812
+ """Get the checksum of a file from a URL."""
813
+ try:
814
+ response = requests.get(url)
815
+ response.raise_for_status()
816
+ except Exception:
817
+ raise DerivaMLException(f"Invalid URL: {url}")
818
+ else:
819
+ sha256_hash = hashlib.sha256()
820
+ sha256_hash.update(response.content)
821
+ checksum = "SHA-256: " + sha256_hash.hexdigest()
822
+ return checksum
823
+
824
+ ml_schema_path = self.pathBuilder.schemas[self.ml_schema]
825
+ try:
826
+ url_column = ml_schema_path.Workflow.URL
827
+ workflow_record = list(
828
+ ml_schema_path.Workflow.filter(url_column == workflow.url).entities()
829
+ )[0]
830
+ workflow_rid = workflow_record["RID"]
831
+ except IndexError:
832
+ # Record doesn't exist already
833
+ workflow_record = {
834
+ "URL": workflow.url,
835
+ "Name": workflow.name,
836
+ "Description": workflow.description,
837
+ "Checksum": get_checksum(workflow.url),
838
+ "Version": workflow.version,
839
+ MLVocab.workflow_type: self.lookup_term(
840
+ MLVocab.workflow_type, workflow.workflow_type
841
+ ).name,
842
+ }
843
+ workflow_rid = ml_schema_path.Workflow.insert([workflow_record])[0]["RID"]
844
+ except Exception as e:
845
+ error = format_exception(e)
846
+ raise DerivaMLException(f"Failed to insert workflow. Error: {error}")
847
+ return workflow_rid
848
+
764
849
  # @validate_call
765
850
  def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
766
851
  """Create an execution object
@@ -779,7 +864,13 @@ class DerivaML(Dataset):
779
864
  """
780
865
  from .execution import Execution
781
866
 
782
- return Execution(configuration, self)
867
+ if self._execution:
868
+ DerivaMLException(
869
+ f"Only one execution can be created for a Deriva ML instance."
870
+ )
871
+ else:
872
+ self._execution = Execution(configuration, self)
873
+ return self._execution
783
874
 
784
875
  # @validate_call
785
876
  def restore_execution(self, execution_rid: Optional[RID] = None) -> "Execution":
@@ -1,13 +1,13 @@
1
+ from __future__ import annotations
2
+
1
3
  from collections import defaultdict
2
4
  import csv
3
- import hashlib
4
5
  import json
5
6
  import logging
6
7
  import os
7
8
  import shutil
8
9
  from datetime import datetime
9
10
  from pathlib import Path
10
- import requests
11
11
  from tempfile import NamedTemporaryFile
12
12
  from typing import Iterable, Any, Optional
13
13
  from deriva.core import format_exception
@@ -96,7 +96,8 @@ class Execution:
96
96
  self.configuration = configuration
97
97
  self._ml_object = ml_object
98
98
  self.start_time = None
99
- self.status = Status.pending
99
+ self.status = Status.created
100
+ self.uploaded_assets: list[Path] = []
100
101
 
101
102
  self.dataset_rids: list[RID] = []
102
103
  self.datasets: list[DatasetBag] = []
@@ -104,7 +105,27 @@ class Execution:
104
105
  self._working_dir = self._ml_object.working_dir
105
106
  self._cache_dir = self._ml_object.cache_dir
106
107
 
107
- self.workflow_rid = self._add_workflow()
108
+ self.workflow_rid = self.configuration.workflow
109
+
110
+ if self._ml_object.resolve_rid(configuration.workflow).table.name != "Workflow":
111
+ raise DerivaMLException(
112
+ f"Workflow specified in execution configuration is not a Workflow"
113
+ )
114
+
115
+ for d in self.configuration.datasets:
116
+ if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
117
+ raise DerivaMLException(
118
+ f"Dataset specified in execution configuration is not a dataset"
119
+ )
120
+
121
+ for a in self.configuration.assets:
122
+ if not self._ml_object.model.is_asset(
123
+ self._ml_object.resolve_rid(a).table.name
124
+ ):
125
+ raise DerivaMLException(
126
+ f"Asset specified in execution configuration is not a asset table"
127
+ )
128
+
108
129
  schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
109
130
  if reload:
110
131
  self.execution_rid = reload
@@ -117,47 +138,10 @@ class Execution:
117
138
  }
118
139
  ]
119
140
  )[0]["RID"]
120
- self._initialize_execution(reload)
121
-
122
- def _add_workflow(self) -> RID:
123
- """Add a workflow to the Workflow table.
124
-
125
- Args:
126
- - url(str): URL of the workflow.
127
- - workflow_type(str): Type of the workflow.
128
- - version(str): Version of the workflow.
129
- - description(str): Description of the workflow.
130
141
 
131
- Returns:
132
- - str: Resource Identifier (RID) of the added workflow.
133
-
134
- """
135
- workflow = self.configuration.workflow
136
- # Check to make sure that the workflow is not already in the table. If it's not, add it.
137
- ml_schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
138
- try:
139
- url_column = ml_schema_path.Workflow.URL
140
- workflow_record = list(
141
- ml_schema_path.Workflow.filter(url_column == workflow.url).entities()
142
- )[0]
143
- workflow_rid = workflow_record["RID"]
144
- except IndexError:
145
- # Record doesn't exist already
146
- workflow_record = {
147
- "URL": workflow.url,
148
- "Name": workflow.name,
149
- "Description": workflow.description,
150
- "Checksum": self._get_checksum(workflow.url),
151
- "Version": workflow.version,
152
- MLVocab.workflow_type: self._ml_object.lookup_term(
153
- MLVocab.workflow_type, workflow.workflow_type
154
- ).name,
155
- }
156
- workflow_rid = ml_schema_path.Workflow.insert([workflow_record])[0]["RID"]
157
- except Exception as e:
158
- error = format_exception(e)
159
- raise DerivaMLException(f"Failed to insert workflow. Error: {error}")
160
- return workflow_rid
142
+ # Create a directory for execution rid so we can recover state in case of a crash.
143
+ execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
144
+ self._initialize_execution(reload)
161
145
 
162
146
  def _save_runtime_environment(self):
163
147
 
@@ -186,7 +170,9 @@ class Execution:
186
170
  """
187
171
  # Materialize bdbag
188
172
  for dataset in self.configuration.datasets:
189
- self.update_status(Status.running, f"Materialize bag {dataset.rid}... ")
173
+ self.update_status(
174
+ Status.initializing, f"Materialize bag {dataset.rid}... "
175
+ )
190
176
  self.datasets.append(self.download_dataset_bag(dataset))
191
177
  self.dataset_rids.append(dataset.rid)
192
178
  # Update execution info
@@ -199,13 +185,14 @@ class Execution:
199
185
  ]
200
186
  )
201
187
 
202
- # Download model
188
+ # Download assets....
203
189
  self.update_status(Status.running, "Downloading assets ...")
204
- asset_path = self._asset_dir().as_posix()
205
190
  self.asset_paths = [
206
- self._download_execution_file(file_rid=m, dest_dir=asset_path)
207
- for m in self.configuration.assets
191
+ self._ml_object.download_asset(asset_rid=a, dest_dir=self._asset_dir())
192
+ for a in self.configuration.assets
208
193
  ]
194
+ if self.asset_paths and not reload:
195
+ self._update_execution_asset_table(self.configuration.assets)
209
196
 
210
197
  # Save configuration details for later upload
211
198
  exec_config_path = ExecMetadataVocab.execution_config.value
@@ -217,29 +204,7 @@ class Execution:
217
204
  self._save_runtime_environment()
218
205
 
219
206
  self.start_time = datetime.now()
220
- self.update_status(Status.running, "Initialize status finished.")
221
-
222
- @staticmethod
223
- def _get_checksum(url) -> str:
224
- """Get the checksum of a file from a URL.
225
-
226
- Args:
227
- url:
228
-
229
- Returns:
230
- str: Checksum of the file.
231
-
232
- """
233
- try:
234
- response = requests.get(url)
235
- response.raise_for_status()
236
- except Exception:
237
- raise DerivaMLException(f"Invalid URL: {url}")
238
- else:
239
- sha256_hash = hashlib.sha256()
240
- sha256_hash.update(response.content)
241
- checksum = "SHA-256: " + sha256_hash.hexdigest()
242
- return checksum
207
+ self.update_status(Status.pending, "Initialize status finished.")
243
208
 
244
209
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
245
210
  def download_dataset_bag(self, dataset: DatasetSpec) -> DatasetBag:
@@ -276,8 +241,10 @@ class Execution:
276
241
 
277
242
  def execution_start(self) -> None:
278
243
  """ """
244
+
279
245
  self.start_time = datetime.now()
280
- self.update_status(Status.running, f"Start ML algorithm ...")
246
+ self.uploaded_assets = None
247
+ self.update_status(Status.initializing, f"Start ML algorithm ...")
281
248
 
282
249
  def execution_stop(self) -> None:
283
250
  """Finish the execution and update the duration and status of execution."""
@@ -318,7 +285,15 @@ class Execution:
318
285
  for r in results.values()
319
286
  if r.state == UploadState.success and "Execution_Asset_Type" in r.result
320
287
  ]
288
+ execution_metadata = [
289
+ r.result["RID"]
290
+ for r in results.values()
291
+ if r.state == UploadState.success
292
+ and "Execution_Metadata_Type" in r.result
293
+ ]
321
294
  self._update_execution_asset_table(execution_assets)
295
+ self._update_execution_metadata_table(execution_metadata)
296
+
322
297
  except Exception as e:
323
298
  error = format_exception(e)
324
299
  self.update_status(Status.failed, error)
@@ -413,53 +388,6 @@ class Execution:
413
388
  path.mkdir(parents=True, exist_ok=True)
414
389
  return path
415
390
 
416
- def _download_execution_file(self, file_rid: RID, dest_dir: str = "") -> Path:
417
- """Download execution assets.
418
-
419
- Args:
420
- file_rid(str): Resource Identifier (RID) of the file.
421
- dest_dir(str): Destination directory for the downloaded assets.
422
-
423
- Returns:
424
- - Path: Path to the downloaded asset.
425
-
426
- Raises:
427
- - DerivaMLException: If there is an issue downloading the assets.
428
- """
429
- table = self._ml_object.resolve_rid(file_rid).table
430
- if not self._ml_object.model.is_asset(table):
431
- raise DerivaMLException(f"Table {table} is not an asset table.")
432
-
433
- pb = self._ml_object.pathBuilder
434
- ml_schema_path = pb.schemas[self._ml_object.ml_schema]
435
- tpath = pb.schemas[table.schema.name].tables[table.name]
436
- file_metadata = list(tpath.filter(tpath.RID == file_rid).entities())[0]
437
- file_url = file_metadata["URL"]
438
- file_name = file_metadata["Filename"]
439
- try:
440
- self.update_status(Status.running, f"Downloading {table.name}...")
441
- file_path = self._ml_object.download_asset(
442
- file_url, str(dest_dir) + "/" + file_name
443
- )
444
- except Exception as e:
445
- error = format_exception(e)
446
- self.update_status(Status.failed, error)
447
- raise DerivaMLException(
448
- f"Failed to download the file {file_rid}. Error: {error}"
449
- )
450
-
451
- ass_table = table.name + "_Execution"
452
- ass_table_path = ml_schema_path.tables[ass_table]
453
- exec_file_exec_entities = ass_table_path.filter(
454
- ass_table_path.columns[table.name] == file_rid
455
- ).entities()
456
- exec_list = [e["Execution"] for e in exec_file_exec_entities]
457
- if self.execution_rid not in exec_list:
458
- tpath = pb.schemas[self._ml_object.ml_schema].tables[ass_table]
459
- tpath.insert([{table.name: file_rid, "Execution": self.execution_rid}])
460
- self.update_status(Status.running, f"Successfully download {table.name}...")
461
- return Path(file_path)
462
-
463
391
  def _clean_folder_contents(self, folder_path: Path):
464
392
  """
465
393
 
@@ -477,47 +405,6 @@ class Execution:
477
405
  error = format_exception(e)
478
406
  self.update_status(Status.failed, error)
479
407
 
480
- # def _update_execution_metadata_table(
481
- # self, assets: dict[str, FileUploadState]
482
- # ) -> None:
483
- # """Upload execution metadata at _working_dir/Execution_metadata.
484
- #
485
- # Args:
486
- # assets: dict[str:
487
- # FileUploadState]:
488
- #
489
- # Raises:
490
- # - DerivaMLException: If there is an issue uploading the metadata.
491
- # """
492
- # ml_schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
493
- # a_table = list(
494
- # self._ml_object.model.schemas[self._ml_object.ml_schema]
495
- # .tables["Execution_Metadata"]
496
- # .find_associations()
497
- # )[0].name
498
- #
499
- # def asset_rid(asset) -> str:
500
- # """
501
- #
502
- # Args:
503
- # asset:
504
- #
505
- # Returns:
506
- #
507
- # """
508
- # return (
509
- # asset.state == UploadState.success
510
- # and asset.result
511
- # and asset.result["RID"]
512
- # )
513
- #
514
- # entities = [
515
- # {"Execution_Metadata": rid, "Execution": self.execution_rid}
516
- # for asset in assets.values()
517
- # if (rid := asset_rid(asset))
518
- # ]
519
- # ml_schema_path.tables[a_table].insert(entities)
520
-
521
408
  def _update_feature_table(
522
409
  self,
523
410
  target_table: str,
@@ -568,6 +455,15 @@ class Execution:
568
455
  entities = [map_path(e) for e in csv.DictReader(feature_values)]
569
456
  self._ml_object.domain_path.tables[feature_table].insert(entities)
570
457
 
458
+ def _update_execution_metadata_table(self, assets: list[RID]) -> None:
459
+ """Upload execution metadata at _working_dir/Execution_metadata."""
460
+ ml_schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
461
+ entities = [
462
+ {"Execution_Metadata": metadata_rid, "Execution": self.execution_rid}
463
+ for metadata_rid in assets
464
+ ]
465
+ ml_schema_path.Execution_Metadata_Execution.insert(entities)
466
+
571
467
  def _update_execution_asset_table(self, assets: list[RID]) -> None:
572
468
  """Assets associated with an execution must be linked to an execution entity after they are uploaded into
573
469
  the catalog. This routine takes a list of uploaded assets and makes that association.
@@ -576,17 +472,9 @@ class Execution:
576
472
  assets: list of RIDS for execution assets.:
577
473
  """
578
474
  ml_schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
579
- asset_exec_entities = ml_schema_path.Execution_Asset_Execution.filter(
580
- ml_schema_path.Execution_Asset_Execution.Execution == self.execution_rid
581
- ).entities()
582
- existing_assets = {e["Execution_Asset"] for e in asset_exec_entities}
583
-
584
- # Now got through the list of recently added assets, and add an entry for this asset if it
585
- # doesn't already exist.
586
475
  entities = [
587
476
  {"Execution_Asset": asset_rid, "Execution": self.execution_rid}
588
477
  for asset_rid in assets
589
- if asset_rid not in existing_assets
590
478
  ]
591
479
  ml_schema_path.Execution_Asset_Execution.insert(entities)
592
480
 
@@ -741,13 +629,9 @@ class Execution:
741
629
  self._working_dir, schema=self._ml_object.domain_schema, table=table
742
630
  )
743
631
 
744
- def execute(self) -> "DerivaMLExec":
745
- """Generate a context manager for a DerivaML execution.
746
-
747
- Returns:
748
- A DerivaMLExec object
749
- """
750
- return DerivaMLExec(self)
632
+ def execute(self) -> Execution:
633
+ """Initiate an execution with provided configuration. Can be used in a context manager."""
634
+ return self
751
635
 
752
636
  @validate_call
753
637
  def write_feature_file(self, features: Iterable[FeatureRecord]) -> None:
@@ -801,20 +685,6 @@ class Execution:
801
685
  ]
802
686
  return "\n".join(items)
803
687
 
804
-
805
- class DerivaMLExec:
806
- """Context manager for managing DerivaML execution.
807
-
808
- Provides status updates. For convenience, asset discovery and creation functions from the
809
- Execution object are provided.
810
- """
811
-
812
- def __init__(self, execution: Execution):
813
- self.execution = execution
814
- self.execution_rid = execution.execution_rid
815
- self.start_time = datetime.now()
816
- self.uploaded_assets = None
817
-
818
688
  def __enter__(self):
819
689
  """
820
690
  Method invoked when entering the context.
@@ -823,7 +693,7 @@ class DerivaMLExec:
823
693
  - self: The instance itself.
824
694
 
825
695
  """
826
- self.execution.execution_start()
696
+ self.execution_start()
827
697
  return self
828
698
 
829
699
  def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> bool:
@@ -839,11 +709,11 @@ class DerivaMLExec:
839
709
  bool: True if execution completed successfully, False otherwise.
840
710
  """
841
711
  if not exc_type:
842
- self.execution.update_status(Status.running, "Successfully run Ml.")
843
- self.execution.execution_stop()
712
+ self.update_status(Status.running, "Successfully run Ml.")
713
+ self.execution_stop()
844
714
  return True
845
715
  else:
846
- self.execution.update_status(
716
+ self.update_status(
847
717
  Status.failed,
848
718
  f"Exception type: {exc_type}, Exception value: {exc_value}",
849
719
  )
@@ -851,54 +721,3 @@ class DerivaMLExec:
851
721
  f"Exception type: {exc_type}, Exception value: {exc_value}, Exception traceback: {exc_tb}"
852
722
  )
853
723
  return False
854
-
855
- def execution_asset_path(self, asset_type: str) -> Path:
856
- """Return path to where execution assets of specified type should be placed.
857
-
858
- Args:
859
- asset_type: str:
860
-
861
- Returns:
862
- Path to the directory in which to place asset files.
863
- """
864
- return self.execution.execution_asset_path(asset_type)
865
-
866
- def execution_metadata_path(self, metadata_type: str) -> Path:
867
- """Return path to where execution metadata of specified type should be placed.
868
-
869
- Args:
870
- metadata_type: Term from metadata type vocabulary.
871
-
872
- Returns:
873
- Path to the directory in which to place metadata files.
874
- """
875
- return self.execution.execution_metadata_path(metadata_type)
876
-
877
- def feature_paths(
878
- self, table: Table | str, feature_name: str
879
- ) -> tuple[Path, dict[str, Path]]:
880
- """Return the file path of where to place feature values, and assets for the named feature and table.
881
-
882
- A side effect of calling this routine is that the directories in which to place the feature values and assets
883
- will be created
884
-
885
- Args:
886
- table: The table with which the feature is associated.
887
- feature_name: Name of the feature
888
-
889
- Returns:
890
- A tuple whose first element is the path for the feature values and whose second element is a dictionary
891
- of associated asset table names and corresponding paths.
892
- """
893
- return self.execution.feature_paths(table, feature_name)
894
-
895
- def table_path(self, table: Table | str) -> Path:
896
- """Path in the local file system for tables to be uploaded as part of the execution.
897
-
898
- Args:
899
- table: Table|str:
900
-
901
- Returns:
902
-
903
- """
904
- return self.execution.table_path(table)
@@ -49,7 +49,7 @@ class ExecutionConfiguration(BaseModel):
49
49
 
50
50
  datasets: conlist(DatasetSpec) = []
51
51
  assets: list[RID | str] = [] # List of RIDs to model files.
52
- workflow: Workflow
52
+ workflow: RID
53
53
  description: str = ""
54
54
 
55
55
  model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: deriva-ml
3
- Version: 1.6.7
3
+ Version: 1.6.8
4
4
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
6
  Requires-Python: >=3.10
@@ -0,0 +1,148 @@
1
+ from idlelib.run import manage_socket
2
+
3
+ from derivaml_test import TestDerivaML
4
+ from deriva_ml import (
5
+ MLVocab as vc,
6
+ Workflow,
7
+ ExecutionConfiguration,
8
+ DatasetSpec,
9
+ DerivaML,
10
+ )
11
+
12
+
13
+ class TestExecution(TestDerivaML):
14
+ def test_execution_no_download(self):
15
+ self.ml_instance.add_term(
16
+ vc.workflow_type,
17
+ "Manual Workflow",
18
+ description="Initial setup of Model File",
19
+ )
20
+ self.ml_instance.add_term(
21
+ vc.execution_asset_type,
22
+ "API_Model",
23
+ description="Model for our API workflow",
24
+ )
25
+ self.ml_instance.add_term(
26
+ vc.workflow_type,
27
+ "ML Demo",
28
+ description="A ML Workflow that uses Deriva ML API",
29
+ )
30
+
31
+ api_workflow = self.ml_instance.add_workflow(
32
+ Workflow(
33
+ name="Manual Workflow",
34
+ url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/tests/test_execution.py",
35
+ workflow_type="Manual Workflow",
36
+ description="A manual operation",
37
+ )
38
+ )
39
+
40
+ manual_execution = self.ml_instance.create_execution(
41
+ ExecutionConfiguration(
42
+ description="Sample Execution", workflow=api_workflow
43
+ )
44
+ )
45
+ with manual_execution as e:
46
+ pass
47
+ manual_execution.upload_execution_outputs()
48
+
49
+ def test_execution_download(self):
50
+ self.populate_catalog()
51
+ double_nested, nested, datasets = self.create_nested_dataset()
52
+
53
+ self.ml_instance.add_term(
54
+ vc.execution_asset_type,
55
+ "API_Model",
56
+ description="Model for our API workflow",
57
+ )
58
+ self.ml_instance.add_term(
59
+ vc.workflow_type,
60
+ "ML Demo",
61
+ description="A ML Workflow that uses Deriva ML API",
62
+ )
63
+ api_workflow = self.ml_instance.add_workflow(
64
+ Workflow(
65
+ name="ML Demo",
66
+ url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml",
67
+ workflow_type="ML Demo",
68
+ description="A workflow that uses Deriva ML",
69
+ )
70
+ )
71
+ execution_model = self.create_execution_asset(api_workflow)
72
+
73
+ config = ExecutionConfiguration(
74
+ datasets=[
75
+ DatasetSpec(
76
+ rid=nested[0],
77
+ version=self.ml_instance.dataset_version(nested[0]),
78
+ ),
79
+ DatasetSpec(
80
+ rid=nested[1],
81
+ version=self.ml_instance.dataset_version(nested[1]),
82
+ ),
83
+ ],
84
+ assets=[execution_model],
85
+ description="Sample Execution",
86
+ workflow=api_workflow,
87
+ )
88
+ exec = self.ml_instance.create_execution(config)
89
+ with exec as e:
90
+ print(e.asset_paths)
91
+ print(e.datasets)
92
+ self.assertEqual(1, len(e.asset_paths))
93
+ self.assertEqual(2, len(e.datasets))
94
+ exec.upload_execution_outputs()
95
+ pb = self.ml_instance.pathBuilder.schemas[self.ml_instance.ml_schema]
96
+ execution_asset_execution = pb.Execution_Asset_Execution
97
+ execution_metadata_execution = pb.Execution_Metadata_Execution
98
+ execution_asset = pb.Execution_Asset
99
+ execution_metadata = pb.Execution_Metadata
100
+
101
+ assets_execution = [
102
+ {
103
+ "RID": a["RID"],
104
+ "Execution_Asset": a["Execution_Asset"],
105
+ "Execution": a["Execution"],
106
+ }
107
+ for a in execution_asset_execution.entities().fetch()
108
+ if a["Execution"] == exec.execution_rid
109
+ ]
110
+ metadata_execution = [
111
+ {
112
+ "RID": a["RID"],
113
+ "Execution": a["Execution"],
114
+ "Execution_Metadata": a["Execution_Metadata"],
115
+ }
116
+ for a in execution_metadata_execution.entities().fetch()
117
+ if a["Execution"] == exec.execution_rid
118
+ ]
119
+ execution_assets = [
120
+ {"RID": a["RID"], "Filename": a["Filename"]}
121
+ for a in execution_asset.entities().fetch()
122
+ ]
123
+ execution_metadata = [
124
+ {"RID": a["RID"], "Filename": a["Filename"]}
125
+ for a in execution_metadata.entities().fetch()
126
+ ]
127
+ print(assets_execution)
128
+ print(metadata_execution)
129
+ print(execution_assets)
130
+ print(execution_metadata)
131
+ self.assertEqual(1, len(assets_execution))
132
+ self.assertEqual(2, len(metadata_execution))
133
+
134
+ def create_execution_asset(self, api_workflow):
135
+ manual_execution = self.ml_instance.create_execution(
136
+ ExecutionConfiguration(
137
+ description="Sample Execution", workflow=api_workflow
138
+ )
139
+ )
140
+ model_file = (
141
+ manual_execution.execution_asset_path("API_Model") / "modelfile.txt"
142
+ )
143
+ with open(model_file, "w") as fp:
144
+ fp.write(f"My model")
145
+ # Now upload the file and retrieve the RID of the new asset from the returned results.
146
+ uploaded_assets = manual_execution.upload_execution_outputs()
147
+ self.ml_instance._execution = None
148
+ return uploaded_assets["API_Model/modelfile.txt"].result["RID"]
@@ -71,13 +71,14 @@ class TestUpload(TestDerivaML):
71
71
  description="Model for our API workflow",
72
72
  )
73
73
 
74
- api_workflow = Workflow(
75
- name="Manual Workflow",
76
- url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/tests/test_upload.py",
77
- workflow_type="Manual Workflow",
78
- description="A manual operation",
74
+ api_workflow = self.ml_instance.add_workflow(
75
+ Workflow(
76
+ name="Manual Workflow",
77
+ url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/tests/test_upload.py",
78
+ workflow_type="Manual Workflow",
79
+ description="A manual operation",
80
+ )
79
81
  )
80
-
81
82
  manual_execution = self.ml_instance.create_execution(
82
83
  ExecutionConfiguration(
83
84
  description="Sample Execution", workflow=api_workflow
@@ -95,6 +96,7 @@ class TestUpload(TestDerivaML):
95
96
  uploaded_assets = manual_execution.upload_execution_outputs()
96
97
  path = self.ml_instance.catalog.getPathBuilder().schemas["deriva-ml"]
97
98
  self.assertEqual(1, len(list(path.Execution_Asset.entities().fetch())))
99
+
98
100
  execution_metadata = list(path.Execution_Metadata.entities().fetch())
99
- print([m["Filename"] for m in execution_metadata])
101
+ print([m for m in execution_metadata])
100
102
  self.assertEqual(2, len(execution_metadata))
@@ -1 +0,0 @@
1
- __version__ = "1.6.7"
@@ -1,118 +0,0 @@
1
- from derivaml_test import TestDerivaML
2
- from deriva_ml import MLVocab as vc, Workflow, ExecutionConfiguration, DatasetSpec
3
- from deriva_ml.demo_catalog import (
4
- reset_demo_catalog,
5
- populate_demo_catalog,
6
- create_demo_datasets,
7
- )
8
-
9
-
10
- class TestExecution(TestDerivaML):
11
- def test_execution_no_download(self):
12
- self.ml_instance.add_term(
13
- vc.workflow_type,
14
- "Manual Workflow",
15
- description="Initial setup of Model File",
16
- )
17
- self.ml_instance.add_term(
18
- vc.execution_asset_type,
19
- "API_Model",
20
- description="Model for our API workflow",
21
- )
22
- self.ml_instance.add_term(
23
- vc.workflow_type,
24
- "ML Demo",
25
- description="A ML Workflow that uses Deriva ML API",
26
- )
27
-
28
- api_workflow = Workflow(
29
- name="Manual Workflow",
30
- url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/tests/test_execution.py",
31
- workflow_type="Manual Workflow",
32
- description="A manual operation",
33
- )
34
-
35
- manual_execution = self.ml_instance.create_execution(
36
- ExecutionConfiguration(
37
- description="Sample Execution", workflow=api_workflow
38
- )
39
- )
40
- manual_execution.upload_execution_outputs()
41
-
42
- def test_execution_download(self):
43
- populate_demo_catalog(self.ml_instance, self.domain_schema)
44
- create_demo_datasets(self.ml_instance)
45
- exec_config = execution_test(self.ml_instance)
46
- exec = self.ml_instance.create_execution(exec_config)
47
-
48
-
49
- def execution_test(ml_instance):
50
- training_dataset_rid = [
51
- ds["RID"]
52
- for ds in ml_instance.find_datasets()
53
- if "Training" in ds["Dataset_Type"]
54
- ][0]
55
- testing_dataset_rid = [
56
- ds["RID"]
57
- for ds in ml_instance.find_datasets()
58
- if "Testing" in ds["Dataset_Type"]
59
- ][0]
60
-
61
- nested_dataset_rid = [
62
- ds["RID"]
63
- for ds in ml_instance.find_datasets()
64
- if "Partitioned" in ds["Dataset_Type"]
65
- ][0]
66
-
67
- ml_instance.add_term(
68
- vc.workflow_type, "Manual Workflow", description="Initial setup of Model File"
69
- )
70
- ml_instance.add_term(
71
- vc.execution_asset_type, "API_Model", description="Model for our API workflow"
72
- )
73
- ml_instance.add_term(
74
- vc.workflow_type, "ML Demo", description="A ML Workflow that uses Deriva ML API"
75
- )
76
- api_workflow = Workflow(
77
- name="Manual Workflow",
78
- url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Execution.ipynb",
79
- workflow_type="Manual Workflow",
80
- description="A manual operation",
81
- )
82
-
83
- manual_execution = ml_instance.create_execution(
84
- ExecutionConfiguration(description="Sample Execution", workflow=api_workflow)
85
- )
86
-
87
- # Now lets create model configuration for our program.
88
- model_file = manual_execution.execution_asset_path("API_Model") / "modelfile.txt"
89
- with open(model_file, "w") as fp:
90
- fp.write(f"My model")
91
-
92
- # Now upload the file and retrieve the RID of the new asset from the returned results.
93
- uploaded_assets = manual_execution.upload_execution_outputs()
94
-
95
- training_model_rid = uploaded_assets["API_Model/modelfile.txt"].result["RID"]
96
- api_workflow = Workflow(
97
- name="ML Demo",
98
- url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml",
99
- workflow_type="ML Demo",
100
- description="A workflow that uses Deriva ML",
101
- )
102
-
103
- config = ExecutionConfiguration(
104
- datasets=[
105
- DatasetSpec(
106
- rid=nested_dataset_rid,
107
- version=ml_instance.dataset_version(nested_dataset_rid),
108
- ),
109
- DatasetSpec(
110
- rid=testing_dataset_rid,
111
- version=ml_instance.dataset_version(testing_dataset_rid),
112
- ),
113
- ],
114
- assets=[training_model_rid],
115
- description="Sample Execution",
116
- workflow=api_workflow,
117
- )
118
- return config
File without changes
File without changes
File without changes
File without changes