deriva-ml 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -84,7 +84,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         except KeyError:
             raise DerivaMLException(f"Dataset {dataset_rid} not found")
 
-    def __init__(self, minid: DatasetMinid, bag_path: Path):
+    def __init__(self, minid: DatasetMinid, bag_path: Path, dbase_path: Path):
        """Create a new DatabaseModel.

        Args:
@@ -95,8 +95,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         self.bag_path = bag_path
         self.minid = minid
         self.dataset_rid = minid.dataset_rid
-        dir_path = bag_path.parent
-        self.dbase_file = dir_path / f"{minid.version_rid}.db"
+        self.dbase_file = dbase_path / f"{minid.version_rid}.db"
         self.dbase = sqlite3.connect(self.dbase_file)
 
         super().__init__(
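The change above moves the per-dataset SQLite file out of the bag's parent directory and into a caller-supplied `dbase_path`. A minimal sketch of the new constructor call, assuming `minid` is an existing `DatasetMinid` and both paths are placeholder locations:

```python
from pathlib import Path

from deriva_ml.database_model import DatabaseModel

# Placeholder locations; in 1.10.0 the Dataset class passes its working
# directory as dbase_path instead of deriving it from the bag path.
bag_path = Path("/tmp/deriva-ml/cache/dataset-bag")   # downloaded BDBag
dbase_path = Path("/tmp/deriva-ml/working")           # where <version_rid>.db is created

db_model = DatabaseModel(minid, bag_path, dbase_path)  # minid: a DatasetMinid
dataset_bag = db_model.get_dataset()
```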
deriva_ml/dataset.py CHANGED
@@ -67,11 +67,12 @@ class Dataset:
 
     _Logger = logging.getLogger("deriva_ml")
 
-    def __init__(self, model: DerivaModel, cache_dir: Path):
+    def __init__(self, model: DerivaModel, cache_dir: Path, working_dir: Path):
         self._model = model
         self._ml_schema = ML_SCHEMA
         self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
         self._cache_dir = cache_dir
+        self._working_dir = working_dir
         self._logger = logging.getLogger("deriva_ml")
 
     def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
@@ -783,7 +784,6 @@ class Dataset:
         snapshot: Optional[Dataset] = None,
         dataset_nesting_depth: Optional[int] = None,
     ) -> set[tuple[Table, ...]]:
-
         snapshot_catalog = snapshot if snapshot else self
 
         dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
@@ -831,9 +831,7 @@ class Dataset:
         nested_paths = set()
         if dataset_rid:
             for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
-                nested_paths |= self._collect_paths(
-                    c, snapshot=snapshot_catalog
-                )
+                nested_paths |= self._collect_paths(c, snapshot=snapshot_catalog)
         else:
             # Initialize nesting depth if not already provided.
             dataset_nesting_depth = (
@@ -979,7 +977,7 @@ class Dataset:
             if dataset.materialize
             else self._download_dataset_minid(minid)
         )
-        return DatabaseModel(minid, bag_path).get_dataset()
+        return DatabaseModel(minid, bag_path, self._working_dir).get_dataset()
 
     def _version_snapshot(self, dataset: DatasetSpec) -> str:
         """Return a catalog with snapshot for the specified dataset version"""
deriva_ml/demo_catalog.py CHANGED
@@ -295,7 +295,9 @@ def create_demo_catalog(
     dataset_table = deriva_ml.dataset_table
     dataset_table.annotations.update(
         Dataset(
-            deriva_ml.model, deriva_ml.cache_dir
+            deriva_ml.model,
+            cache_dir=deriva_ml.cache_dir,
+            working_dir=deriva_ml.working_dir,
         )._generate_dataset_annotations()
     )
     deriva_ml.model.apply()
deriva_ml/deriva_ml_base.py CHANGED
@@ -163,7 +163,7 @@ class DerivaML(Dataset):
         self.cache_dir.mkdir(parents=True, exist_ok=True)
 
         # Initialize dataset class.
-        super().__init__(self.model, self.cache_dir)
+        super().__init__(self.model, self.cache_dir, self.working_dir)
         self._logger = logging.getLogger("deriva_ml")
         self._logger.setLevel(logging_level)
 
@@ -257,7 +257,7 @@ class DerivaML(Dataset):
     def _get_notebook_path(self) -> Path | None:
         """Return the absolute path of the current notebook."""
 
-        server, session = self._get_notebook_session()
+        server, session = DerivaML._get_notebook_session()
         if server and session:
             self._check_nbstrip_status()
             relative_path = session["notebook"]["path"]
@@ -1136,8 +1136,8 @@ class DerivaML(Dataset):
         return None
 
     def create_workflow(
-        self, name: str, workflow_type: str, description: str = "", create: bool = True
-    ) -> RID | None:
+        self, name: str, workflow_type: str, description: str = ""
+    ) -> Workflow:
         """Identify current executing program and return a workflow RID for it
 
         Determine the notebook or script that is currently being executed. Assume that this is
@@ -1149,10 +1149,21 @@ class DerivaML(Dataset):
             name: The name of the workflow.
             workflow_type: The type of the workflow.
             description: The description of the workflow.
-            create: Whether to create a new workflow.
         """
         # Make sure type is correct.
         self.lookup_term(MLVocab.workflow_type, workflow_type)
+
+        try:
+            subprocess.run(
+                "git rev-parse --is-inside-work-tree",
+                capture_output=True,
+                text=True,
+                shell=True,
+                check=True,
+            )
+        except subprocess.CalledProcessError:
+            raise DerivaMLException("Not executing in a Git repository.")
+
         github_url, is_dirty = self._github_url()
 
         if is_dirty:
@@ -1174,14 +1185,13 @@ class DerivaML(Dataset):
             shell=True,
         ).stdout.strip()
 
-        workflow = Workflow(
+        return Workflow(
             name=name,
             url=github_url,
             checksum=checksum,
             description=description,
             workflow_type=workflow_type,
         )
-        return self.add_workflow(workflow) if create else None
 
     def _github_url(self) -> tuple[str, bool]:
         """Return a GitHUB URL for the latest commit of the script from which this routine is called.
@@ -1238,7 +1248,9 @@ class DerivaML(Dataset):
         return url, is_dirty
 
     # @validate_call
-    def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
+    def create_execution(
+        self, configuration: ExecutionConfiguration, dryrun: bool = False
+    ) -> "Execution":
         """Create an execution object
 
         Given an execution configuration, initialize the local compute environment to prepare for executing an
@@ -1249,6 +1261,7 @@ class DerivaML(Dataset):
 
         Args:
             configuration: ExecutionConfiguration:
+            dryrun: Do not create an execution record or upload results.
 
         Returns:
             An execution object.
@@ -1260,7 +1273,7 @@ class DerivaML(Dataset):
                 "Only one execution can be created for a Deriva ML instance."
             )
         else:
-            self._execution = Execution(configuration, self)
+            self._execution = Execution(configuration, self, dryrun=dryrun)
         return self._execution
 
     # @validate_call
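Together with the `workflow: RID | Workflow` union added to `ExecutionConfiguration` (further down in this diff), the new `dryrun` flag lets a pipeline be exercised without touching the catalog. A hedged sketch, reusing `ml` and `workflow` from the previous example and leaving the dataset and asset lists empty for brevity:

```python
from deriva_ml.execution_configuration import ExecutionConfiguration

config = ExecutionConfiguration(
    datasets=[],        # e.g. a list of DatasetSpec entries to download
    assets=[],          # RIDs of asset files to fetch into the working directory
    workflow=workflow,  # a Workflow object or an existing Workflow RID
    description="Dry run of the training pipeline",
)

# With dryrun=True no Execution record is inserted and nothing is uploaded;
# the execution RID is the placeholder "0000".
execution = ml.create_execution(config, dryrun=True)
```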
deriva_ml/execution.py CHANGED
@@ -30,7 +30,7 @@ from .deriva_definitions import (
 from .deriva_ml_base import DerivaML, FeatureRecord
 from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
 from .dataset_bag import DatasetBag
-from .execution_configuration import ExecutionConfiguration
+from .execution_configuration import ExecutionConfiguration, Workflow
 from .execution_environment import get_execution_environment
 from .upload import (
     execution_metadata_dir,
@@ -96,6 +96,7 @@ class Execution:
         configuration: ExecutionConfiguration,
         ml_object: "DerivaML",
         reload: Optional[RID] = None,
+        dry_run: bool = False,
     ):
         """
 
@@ -107,6 +108,7 @@ class Execution:
         self.asset_paths: list[Path] = []
         self.configuration = configuration
         self._ml_object = ml_object
+        self._logger = ml_object._logger
         self.start_time = None
         self.stop_time = None
         self.status = Status.created
@@ -117,13 +119,23 @@ class Execution:
 
         self._working_dir = self._ml_object.working_dir
         self._cache_dir = self._ml_object.cache_dir
+        self._dry_run = dry_run
 
-        self.workflow_rid = self.configuration.workflow
-
-        if self._ml_object.resolve_rid(configuration.workflow).table.name != "Workflow":
-            raise DerivaMLException(
-                "Workflow specified in execution configuration is not a Workflow"
+        if isinstance(self.configuration.workflow, Workflow):
+            self.workflow_rid = (
+                self._ml_object.add_workflow(self.configuration.workflow)
+                if not self._dry_run
+                else "0000"
             )
+        else:
+            self.workflow_rid = self.configuration.workflow
+            if (
+                self._ml_object.resolve_rid(configuration.workflow).table.name
+                != "Workflow"
+            ):
+                raise DerivaMLException(
+                    "Workflow specified in execution configuration is not a Workflow"
+                )
 
         for d in self.configuration.datasets:
             if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
@@ -142,6 +154,10 @@ class Execution:
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
         if reload:
             self.execution_rid = reload
+            if self.execution_rid == "0000":
+                self._dry_run = True
+        elif self._dry_run:
+            self.execution_rid = "0000"
         else:
             self.execution_rid = schema_path.Execution.insert(
                 [
@@ -189,7 +205,7 @@ class Execution:
             self.dataset_rids.append(dataset.rid)
         # Update execution info
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
-        if self.dataset_rids and not reload:
+        if self.dataset_rids and not (reload or self._dry_run):
             schema_path.Dataset_Execution.insert(
                 [
                     {"Dataset": d, "Execution": self.execution_rid}
@@ -203,7 +219,7 @@ class Execution:
             self._ml_object.download_asset(asset_rid=a, dest_dir=self._asset_dir())
             for a in self.configuration.assets
         ]
-        if self.asset_paths and not reload:
+        if self.asset_paths and not (reload or self._dry_run):
             self._update_execution_asset_table(self.configuration.assets)
 
         # Save configuration details for later upload
@@ -242,6 +258,11 @@ class Execution:
             msg: Additional information about the status
         """
         self.status = status
+        self._logger.info(msg)
+
+        if self._dry_run:
+            return
+
         self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
             [
                 {
@@ -278,7 +299,7 @@ class Execution:
 
         self.start_time = datetime.now()
         self.uploaded_assets = None
-        self.update_status(Status.initializing, "Start ML algorithm ...")
+        self.update_status(Status.initializing, "Start execution ...")
 
     def execution_stop(self) -> None:
         """Finish the execution and update the duration and status of execution."""
@@ -288,13 +309,11 @@ class Execution:
         minutes, seconds = divmod(remainder, 60)
         duration = f"{round(hours, 0)}H {round(minutes, 0)}min {round(seconds, 4)}sec"
 
-        if self._ml_object._is_notebook:
-            self._create_notebook_checkpoint()
-
         self.update_status(Status.completed, "Algorithm execution ended.")
-        self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
-            [{"RID": self.execution_rid, "Duration": duration}]
-        )
+        if not self._dry_run:
+            self._ml_object.pathBuilder.schemas[
+                self._ml_object.ml_schema
+            ].Execution.update([{"RID": self.execution_rid, "Duration": duration}])
 
     def _upload_execution_dirs(self) -> dict[str, FileUploadState]:
         """Upload execution assets at _working_dir/Execution_asset.
@@ -402,6 +421,8 @@ class Execution:
             Uploaded assets with key as assets' suborder name, values as an
             ordered dictionary with RID and metadata in the Execution_Asset table.
         """
+        if self._dry_run:
+            return {}
         try:
             uploaded_assets = self._upload_execution_dirs()
             self.update_status(Status.completed, "Successfully end the execution.")
deriva_ml/execution_configuration.py CHANGED
@@ -49,7 +49,7 @@ class ExecutionConfiguration(BaseModel):
 
     datasets: conlist(DatasetSpec) = []
     assets: list[RID | str] = []  # List of RIDs to model files.
-    workflow: RID
+    workflow: RID | Workflow
     description: str = ""
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deriva-ml
-Version: 1.9.0
+Version: 1.10.0
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
@@ -16,5 +16,14 @@ Requires-Dist: setuptools-scm<=6.0
 Requires-Dist: nbstripout
 Dynamic: license-file
 
-Deriva-ML is a python libary to simplify the process of creating and executing reproducible machine learning workflows
+# DerivaML
+Deriva-ML is a python library to simplify the process of creating and executing reproducible machine learning workflows
 using a deriva catalog.
+
+
+## Installing the GitHub CLI
+
+The script release.sh will create a new release tag in GitHub. This script requires the
+GitHUB CLI be installed.
+
+See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.
@@ -1,14 +1,14 @@
 deriva_ml/__init__.py,sha256=r1Z9N5vtZkAET7emqhpAx2bf_xJUp5wHOc4_DIplsG8,1082
-deriva_ml/database_model.py,sha256=HaJoxKSogc-xLGaZfEviqRAWO9wUy52h7yK8by6FKxM,14838
-deriva_ml/dataset.py,sha256=XIXyTej55WduvEOGitG5SJIfPYrQu36cXjCoCNHNMwQ,60746
+deriva_ml/database_model.py,sha256=58iweWRteLeKKjjeNA9_e7TbUb4Av92lxH2zKvZzwA8,14823
+deriva_ml/dataset.py,sha256=h7Zkhnhy66GhPg6O1ud-YCx-jFKAabWF-nwuIDsR8SU,60785
 deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
 deriva_ml/dataset_bag.py,sha256=aOJxFA9t5apjE5BNBrk8Pi9R1Cp8AWnnaL-10P8ELrQ,11515
-deriva_ml/demo_catalog.py,sha256=zQAHWSvrVPxMg-vyRUqoC0Jj5RhfGjkBwXW3mBksLhA,10986
+deriva_ml/demo_catalog.py,sha256=1442Lbxmlq45_fgFx0SZPag6dZLimXCk57-TRFee3VA,11064
 deriva_ml/deriva_definitions.py,sha256=jNiInYA2Cb1GE4OOT1CofxBygdLDSOmNsw5Wl6NbZQE,8943
-deriva_ml/deriva_ml_base.py,sha256=B0_0R0tgx4o30VM-QSSKIGy2BN5kOBcYKuYGvmPkwMg,46953
+deriva_ml/deriva_ml_base.py,sha256=Yo52Sb_9rujH7ew9aJ_Ys84NZU-Tc3TGV_O--wnaUQA,47307
 deriva_ml/deriva_model.py,sha256=B4gwr3-92IQU-mEZlusgNEnRyulD96esWGS67q9MzHk,12024
-deriva_ml/execution.py,sha256=on8hAtuZr9qFiyxuk_vDCmnRJ9Cv4kFOgHK4HY4CmV8,29585
-deriva_ml/execution_configuration.py,sha256=vsdL31J09dz7CQDd2rYXIjyBPwNlgAWvrTqsXNWi82g,3357
+deriva_ml/execution.py,sha256=nPTQ__QHoBTz0gUu8k4CSEeCD4UvttZfy2oDJr9HxKY,30294
+deriva_ml/execution_configuration.py,sha256=yksebWFjAfrar2955L8_D6vUnQlfuvcyrqjOIrvWW90,3368
 deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
 deriva_ml/feature.py,sha256=7e8WYPCfJSrGxJh9oUTduYSnB5ekybRhXa_0HIigS_w,5459
 deriva_ml/history.py,sha256=qTDLDs8Ow_6r7mDO0gZm0Fg81SWKOAgtCU5pzZoDRgM,2828
@@ -19,9 +19,9 @@ deriva_ml/schema_setup/annotations.py,sha256=v0gTpmWYxRqsQ-bcnQzsr8WowGv2pi9pZUs
 deriva_ml/schema_setup/create_schema.py,sha256=BRdYeWW5I8HxuATkB1hkKuIw4n-JQu620xod7EQoVSE,10674
 deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
 deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
-deriva_ml-1.9.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deriva_ml-1.9.0.dist-info/METADATA,sha256=REDBcboXpGhYbG7bVaICPhZP81cDLoSiCdiY7PX8GrQ,669
-deriva_ml-1.9.0.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
-deriva_ml-1.9.0.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
-deriva_ml-1.9.0.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
-deriva_ml-1.9.0.dist-info/RECORD,,
+deriva_ml-1.10.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deriva_ml-1.10.0.dist-info/METADATA,sha256=EInfvOS4ru5OFfTQvNvFYVytQiuzHOXhiH3zISlNhmQ,942
+deriva_ml-1.10.0.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+deriva_ml-1.10.0.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
+deriva_ml-1.10.0.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+deriva_ml-1.10.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (77.0.3)
+Generator: setuptools (78.0.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 