deriva-ml 1.9.1__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/demo_catalog.py CHANGED
@@ -295,7 +295,9 @@ def create_demo_catalog(
     dataset_table = deriva_ml.dataset_table
     dataset_table.annotations.update(
         Dataset(
-            deriva_ml.model, deriva_ml.cache_dir
+            deriva_ml.model,
+            cache_dir=deriva_ml.cache_dir,
+            working_dir=deriva_ml.working_dir,
         )._generate_dataset_annotations()
     )
     deriva_ml.model.apply()
deriva_ml/deriva_ml_base.py CHANGED
@@ -1136,8 +1136,8 @@ class DerivaML(Dataset):
         return None

     def create_workflow(
-        self, name: str, workflow_type: str, description: str = "", create: bool = True
-    ) -> RID | None:
+        self, name: str, workflow_type: str, description: str = ""
+    ) -> Workflow:
         """Identify current executing program and return a workflow RID for it

         Determine the notebook or script that is currently being executed. Assume that this is
@@ -1149,10 +1149,21 @@ class DerivaML(Dataset):
             name: The name of the workflow.
             workflow_type: The type of the workflow.
             description: The description of the workflow.
-            create: Whether to create a new workflow.
         """
         # Make sure type is correct.
         self.lookup_term(MLVocab.workflow_type, workflow_type)
+
+        try:
+            subprocess.run(
+                "git rev-parse --is-inside-work-tree",
+                capture_output=True,
+                text=True,
+                shell=True,
+                check=True,
+            )
+        except subprocess.CalledProcessError:
+            raise DerivaMLException("Not executing in a Git repository.")
+
         github_url, is_dirty = self._github_url()

         if is_dirty:
@@ -1174,14 +1185,13 @@ class DerivaML(Dataset):
             shell=True,
         ).stdout.strip()

-        workflow = Workflow(
+        return Workflow(
             name=name,
             url=github_url,
             checksum=checksum,
             description=description,
             workflow_type=workflow_type,
         )
-        return self.add_workflow(workflow) if create else None

     def _github_url(self) -> tuple[str, bool]:
         """Return a GitHUB URL for the latest commit of the script from which this routine is called.
@@ -1238,7 +1248,9 @@ class DerivaML(Dataset):
         return url, is_dirty

     # @validate_call
-    def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
+    def create_execution(
+        self, configuration: ExecutionConfiguration, dry_run: bool = False
+    ) -> "Execution":
         """Create an execution object

         Given an execution configuration, initialize the local compute environment to prepare for executing an
@@ -1249,18 +1261,14 @@ class DerivaML(Dataset):

         Args:
             configuration: ExecutionConfiguration:
+            dryrun: Do not create an execution record or upload results.

         Returns:
             An execution object.
         """
         from .execution import Execution

-        if self._execution:
-            DerivaMLException(
-                "Only one execution can be created for a Deriva ML instance."
-            )
-        else:
-            self._execution = Execution(configuration, self)
+        self._execution = Execution(configuration, self, dry_run=dry_run)
         return self._execution

     # @validate_call
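
Note: the deriva_ml_base.py changes above decouple describing a workflow from registering it. create_workflow() now returns a Workflow object instead of inserting one (the create flag is gone, and a check for a Git working tree is added), and create_execution() forwards a new dry_run flag to Execution. A minimal sketch of the new calling pattern, assuming ml is an already-connected DerivaML instance and that a "python_script" term already exists in the workflow_type vocabulary (both assumptions are illustrative, not part of this diff):

    from deriva_ml.execution_configuration import ExecutionConfiguration

    # ml: DerivaML -- connected to a catalog elsewhere (illustrative).
    workflow = ml.create_workflow(
        name="train-model",
        workflow_type="python_script",  # must already be a workflow_type vocabulary term
        description="Nightly training run",
    )  # returns a Workflow object; nothing is written to the catalog here

    config = ExecutionConfiguration(workflow=workflow, description="Training run")
    execution = ml.create_execution(config, dry_run=True)  # dry_run=True skips catalog writes
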
deriva_ml/execution.py CHANGED
@@ -18,6 +18,7 @@ from typing import Iterable, Any, Optional
 from deriva.core import format_exception
 from deriva.core.ermrest_model import Table
 from pydantic import validate_call, ConfigDict
+import sys

 from .deriva_definitions import MLVocab, ExecMetadataVocab
 from .deriva_definitions import (
@@ -30,7 +31,7 @@ from .deriva_definitions import (
 from .deriva_ml_base import DerivaML, FeatureRecord
 from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
 from .dataset_bag import DatasetBag
-from .execution_configuration import ExecutionConfiguration
+from .execution_configuration import ExecutionConfiguration, Workflow
 from .execution_environment import get_execution_environment
 from .upload import (
     execution_metadata_dir,
@@ -96,6 +97,7 @@ class Execution:
         configuration: ExecutionConfiguration,
         ml_object: "DerivaML",
         reload: Optional[RID] = None,
+        dry_run: bool = False,
     ):
         """

@@ -107,23 +109,36 @@ class Execution:
         self.asset_paths: list[Path] = []
         self.configuration = configuration
         self._ml_object = ml_object
+        self._logger = ml_object._logger
         self.start_time = None
         self.stop_time = None
         self.status = Status.created
         self.uploaded_assets: list[Path] = []
+        self.configuration.argv = sys.argv

         self.dataset_rids: list[RID] = []
         self.datasets: list[DatasetBag] = []
+        self.parameters = self.configuration.parameters

         self._working_dir = self._ml_object.working_dir
         self._cache_dir = self._ml_object.cache_dir
+        self._dry_run = dry_run

-        self.workflow_rid = self.configuration.workflow
-
-        if self._ml_object.resolve_rid(configuration.workflow).table.name != "Workflow":
-            raise DerivaMLException(
-                "Workflow specified in execution configuration is not a Workflow"
+        if isinstance(self.configuration.workflow, Workflow):
+            self.workflow_rid = (
+                self._ml_object.add_workflow(self.configuration.workflow)
+                if not self._dry_run
+                else "0000"
             )
+        else:
+            self.workflow_rid = self.configuration.workflow
+            if (
+                self._ml_object.resolve_rid(configuration.workflow).table.name
+                != "Workflow"
+            ):
+                raise DerivaMLException(
+                    "Workflow specified in execution configuration is not a Workflow"
+                )

         for d in self.configuration.datasets:
             if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
@@ -142,6 +157,10 @@ class Execution:
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
         if reload:
             self.execution_rid = reload
+            if self.execution_rid == "0000":
+                self._dry_run = True
+        elif self._dry_run:
+            self.execution_rid = "0000"
         else:
             self.execution_rid = schema_path.Execution.insert(
                 [
@@ -189,7 +208,7 @@ class Execution:
                 self.dataset_rids.append(dataset.rid)
         # Update execution info
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
-        if self.dataset_rids and not reload:
+        if self.dataset_rids and not (reload or self._dry_run):
             schema_path.Dataset_Execution.insert(
                 [
                     {"Dataset": d, "Execution": self.execution_rid}
@@ -203,7 +222,7 @@ class Execution:
             self._ml_object.download_asset(asset_rid=a, dest_dir=self._asset_dir())
             for a in self.configuration.assets
         ]
-        if self.asset_paths and not reload:
+        if self.asset_paths and not (reload or self._dry_run):
             self._update_execution_asset_table(self.configuration.assets)

         # Save configuration details for later upload
@@ -242,6 +261,11 @@ class Execution:
             msg: Additional information about the status
         """
         self.status = status
+        self._logger.info(msg)
+
+        if self._dry_run:
+            return
+
         self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
             [
                 {
@@ -278,7 +302,7 @@ class Execution:

         self.start_time = datetime.now()
         self.uploaded_assets = None
-        self.update_status(Status.initializing, "Start ML algorithm ...")
+        self.update_status(Status.initializing, "Start execution ...")

     def execution_stop(self) -> None:
         """Finish the execution and update the duration and status of execution."""
@@ -288,13 +312,11 @@ class Execution:
         minutes, seconds = divmod(remainder, 60)
         duration = f"{round(hours, 0)}H {round(minutes, 0)}min {round(seconds, 4)}sec"

-        if self._ml_object._is_notebook:
-            self._create_notebook_checkpoint()
-
         self.update_status(Status.completed, "Algorithm execution ended.")
-        self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
-            [{"RID": self.execution_rid, "Duration": duration}]
-        )
+        if not self._dry_run:
+            self._ml_object.pathBuilder.schemas[
+                self._ml_object.ml_schema
+            ].Execution.update([{"RID": self.execution_rid, "Duration": duration}])

     def _upload_execution_dirs(self) -> dict[str, FileUploadState]:
         """Upload execution assets at _working_dir/Execution_asset.
@@ -402,6 +424,8 @@ class Execution:
             Uploaded assets with key as assets' suborder name, values as an
                 ordered dictionary with RID and metadata in the Execution_Asset table.
         """
+        if self._dry_run:
+            return {}
         try:
             uploaded_assets = self._upload_execution_dirs()
             self.update_status(Status.completed, "Successfully end the execution.")
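
Note: the execution.py changes above give Execution a dry-run mode. A dry-run execution takes the sentinel RID "0000", logs status messages through the DerivaML logger, and skips every catalog write (the Execution insert and updates, Dataset_Execution links, asset-table updates, and output uploads, which return an empty dict). A short sketch of the behavior, reusing the illustrative ml and config from the previous example:

    # No Execution record is inserted, so the RID is the sentinel value.
    execution = ml.create_execution(config, dry_run=True)
    print(execution.execution_rid)  # -> "0000"

    # Status changes are logged locally; the catalog Execution row is never updated.
    execution.execution_stop()

    # Reloading with the sentinel RID re-enables dry-run mode:
    # Execution(config, ml, reload="0000")
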
deriva_ml/execution_configuration.py CHANGED
@@ -1,14 +1,11 @@
 from __future__ import annotations

 import json
-from typing import Optional
+from typing import Optional, Any

-from pydantic import (
-    BaseModel,
-    conlist,
-    ConfigDict,
-)
+from pydantic import BaseModel, conlist, ConfigDict, field_validator, Field
 from pathlib import Path
+import sys


 from .dataset_aux_classes import DatasetSpec
@@ -43,17 +40,30 @@ class ExecutionConfiguration(BaseModel):
         datasets: List of dataset specifications which specify the dataset RID, version and if the dataset
             should be materialized.
         assets: List of assets to be downloaded prior to execution. The values must be RIDs in an asset table
+        parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
         workflow: A RID for a workflow instance. Must have a name, URI to the workflow instance, and a type.
         description: A description of the execution. Can use Markdown format.
     """

     datasets: conlist(DatasetSpec) = []
     assets: list[RID | str] = []  # List of RIDs to model files.
-    workflow: RID
+    workflow: RID | Workflow
+    parameters: dict[str, Any] = {}
     description: str = ""
+    argv: conlist(str) = Field(default_factory=lambda: sys.argv)

     model_config = ConfigDict(arbitrary_types_allowed=True)

+    @field_validator("parameters", mode="before")
+    @classmethod
+    def validate_parameters(cls, value: Any) -> Any:
+        """If parameter is a file, assume that it has JSON contents for configuration parameters"""
+        if isinstance(value, str) or isinstance(value, Path):
+            with open(value, "r") as f:
+                return json.load(f)
+        else:
+            return value
+
     @staticmethod
     def load_configuration(path: Path) -> ExecutionConfiguration:
         """Create a ExecutionConfiguration from a JSON configuration file.
deriva_ml-1.10.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deriva-ml
-Version: 1.9.1
+Version: 1.10.1
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
deriva_ml-1.10.1.dist-info/RECORD CHANGED
@@ -3,12 +3,12 @@ deriva_ml/database_model.py,sha256=58iweWRteLeKKjjeNA9_e7TbUb4Av92lxH2zKvZzwA8,1
 deriva_ml/dataset.py,sha256=h7Zkhnhy66GhPg6O1ud-YCx-jFKAabWF-nwuIDsR8SU,60785
 deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
 deriva_ml/dataset_bag.py,sha256=aOJxFA9t5apjE5BNBrk8Pi9R1Cp8AWnnaL-10P8ELrQ,11515
-deriva_ml/demo_catalog.py,sha256=zQAHWSvrVPxMg-vyRUqoC0Jj5RhfGjkBwXW3mBksLhA,10986
+deriva_ml/demo_catalog.py,sha256=1442Lbxmlq45_fgFx0SZPag6dZLimXCk57-TRFee3VA,11064
 deriva_ml/deriva_definitions.py,sha256=jNiInYA2Cb1GE4OOT1CofxBygdLDSOmNsw5Wl6NbZQE,8943
-deriva_ml/deriva_ml_base.py,sha256=nzPzn_iLQIUJDCxTdRgAVEWqS7LbRTZriofWYmdEYe8,46975
+deriva_ml/deriva_ml_base.py,sha256=9LeHUf20MTL3wawUAZz0rRZrxdjo-kki2zRpfv7Rgzg,47141
 deriva_ml/deriva_model.py,sha256=B4gwr3-92IQU-mEZlusgNEnRyulD96esWGS67q9MzHk,12024
-deriva_ml/execution.py,sha256=on8hAtuZr9qFiyxuk_vDCmnRJ9Cv4kFOgHK4HY4CmV8,29585
-deriva_ml/execution_configuration.py,sha256=vsdL31J09dz7CQDd2rYXIjyBPwNlgAWvrTqsXNWi82g,3357
+deriva_ml/execution.py,sha256=15z4S5tElF-pUFIKgPGmxaC1wwh4Via0Mfd1S_ZiZ8c,30404
+deriva_ml/execution_configuration.py,sha256=ZdLHLTUcg5V1id1sVjbp7Nm5bjh42ATG7hOGKaiCSj4,4013
 deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
 deriva_ml/feature.py,sha256=7e8WYPCfJSrGxJh9oUTduYSnB5ekybRhXa_0HIigS_w,5459
 deriva_ml/history.py,sha256=qTDLDs8Ow_6r7mDO0gZm0Fg81SWKOAgtCU5pzZoDRgM,2828
@@ -19,9 +19,9 @@ deriva_ml/schema_setup/annotations.py,sha256=v0gTpmWYxRqsQ-bcnQzsr8WowGv2pi9pZUs
 deriva_ml/schema_setup/create_schema.py,sha256=BRdYeWW5I8HxuATkB1hkKuIw4n-JQu620xod7EQoVSE,10674
 deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
 deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
-deriva_ml-1.9.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deriva_ml-1.9.1.dist-info/METADATA,sha256=zdDl9mmw2-DwvfYyWtq3vKnsB175gRFUsfHvRNLOGLg,941
-deriva_ml-1.9.1.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
-deriva_ml-1.9.1.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
-deriva_ml-1.9.1.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
-deriva_ml-1.9.1.dist-info/RECORD,,
+deriva_ml-1.10.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deriva_ml-1.10.1.dist-info/METADATA,sha256=0kxsip-JxmgtpvQeHUebq0DDUB3sSOsFJeIpJ6Qdaww,942
+deriva_ml-1.10.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+deriva_ml-1.10.1.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
+deriva_ml-1.10.1.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+deriva_ml-1.10.1.dist-info/RECORD,,
deriva_ml-1.10.1.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (77.0.3)
+Generator: setuptools (78.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any