deriva-ml 1.14.46__py3-none-any.whl → 1.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/__init__.py CHANGED
@@ -1,45 +1,84 @@
- __all__ = [
-     "DerivaML",
-     "DerivaMLException",
-     "DerivaMLInvalidTerm",
-     "DerivaMLTableTypeError",
-     "Execution",
-     "ExecAssetType",
-     "ExecMetadataType",
-     "Workflow",
-     "DatasetBag",
-     "DatasetVersion",
-     "DatasetSpec",
-     "FileSpec",
-     "VersionPart",
-     "RID",
-     "BuiltinTypes",
-     "ColumnDefinition",
-     "MLVocab",
-     "MLAsset",
-     "TableDefinition",
-     "ExecutionConfiguration",
- ]
-
  from importlib.metadata import PackageNotFoundError, version
+ from typing import TYPE_CHECKING

- from deriva_ml.core import (
+ # Safe imports - no circular dependencies
+ from deriva_ml.core.config import DerivaMLConfig
+ from deriva_ml.core.definitions import (
      RID,
      BuiltinTypes,
      ColumnDefinition,
-     DerivaML,
+     DerivaAssetColumns,
+     DerivaSystemColumns,
      ExecAssetType,
      ExecMetadataType,
      FileSpec,
+     FileUploadState,
+     ForeignKeyDefinition,
+     KeyDefinition,
      MLAsset,
      MLVocab,
      TableDefinition,
+     UploadState,
+ )
+ from deriva_ml.core.exceptions import (
+     DerivaMLException,
+     DerivaMLInvalidTerm,
+     DerivaMLTableTypeError,
  )
- from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm, DerivaMLTableTypeError
- from deriva_ml.dataset.aux_classes import DatasetSpec, DatasetVersion, VersionPart
- from deriva_ml.dataset.dataset_bag import DatasetBag
- from deriva_ml.execution.execution import Execution, ExecutionConfiguration
- from deriva_ml.execution.workflow import Workflow
+ from deriva_ml.dataset.aux_classes import DatasetConfig, DatasetConfigList, DatasetSpec, DatasetVersion
+
+ from .execution import Execution, ExecutionConfiguration, Workflow
+
+ # Type-checking only - avoid circular import at runtime
+ if TYPE_CHECKING:
+     from deriva_ml.core.base import DerivaML
+
+
+ # Lazy import function for runtime usage
+ def __getattr__(name):
+     """Lazy import to avoid circular dependencies."""
+     if name == "DerivaML":
+         from deriva_ml.core.base import DerivaML
+
+         return DerivaML
+     elif name == "Execution":
+         from deriva_ml.execution.execution import Execution
+
+         return Execution
+     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+ __all__ = [
+     "DerivaML",  # Lazy-loaded
+     "DerivaMLConfig",
+     "DatasetConfig",
+     "DatasetConfigList",
+     "DatasetSpec",
+     "DatasetVersion",
+     "Execution",
+     "ExecutionConfiguration",
+     "Workflow",
+     # Exceptions
+     "DerivaMLException",
+     "DerivaMLInvalidTerm",
+     "DerivaMLTableTypeError",
+     # Definitions
+     "RID",
+     "BuiltinTypes",
+     "ColumnDefinition",
+     "DerivaSystemColumns",
+     "DerivaAssetColumns",
+     "ExecAssetType",
+     "ExecMetadataType",
+     "FileSpec",
+     "FileUploadState",
+     "ForeignKeyDefinition",
+     "KeyDefinition",
+     "MLAsset",
+     "MLVocab",
+     "TableDefinition",
+     "UploadState",
+ ]

  try:
      __version__ = version("deriva_ml")
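The rewritten __init__.py defers importing DerivaML (and Execution) until first attribute access, using a module-level __getattr__ (PEP 562), so deriva_ml.core.base is no longer imported at package-import time. A minimal sketch of how that resolution behaves from a caller's point of view, assuming the layout shown above:

    # Sketch of the lazy-export behavior introduced above (PEP 562 module __getattr__).
    import deriva_ml

    # This attribute access triggers deriva_ml.__getattr__("DerivaML"),
    # which imports deriva_ml.core.base only at this point.
    DerivaML = deriva_ml.DerivaML

    # Any other unknown name still fails loudly, per the AttributeError fallback.
    try:
        deriva_ml.NoSuchName
    except AttributeError as exc:
        print(exc)

Because the direct import is guarded by TYPE_CHECKING, static type checkers still see DerivaML as an ordinary re-export while the runtime path stays lazy.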
deriva_ml/bump_version.py CHANGED
@@ -105,7 +105,7 @@ def main() -> int:

      # Find latest semver tag with prefix
      tag = latest_semver_tag(prefix)
-
+     print(f"Latest semver tag: {tag}")
      if not tag:
          seed_initial_tag(f"{prefix}{start}")
          print(f"Seeded {prefix}{start}. Done.")
@@ -1,4 +1,5 @@
  from deriva_ml.core.base import DerivaML
+ from deriva_ml.core.config import DerivaMLConfig
  from deriva_ml.core.definitions import (
      RID,
      BuiltinTypes,
@@ -17,12 +18,11 @@ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm, De

  __all__ = [
      "DerivaML",
-
+     "DerivaMLConfig",
      # Exceptions
      "DerivaMLException",
      "DerivaMLInvalidTerm",
      "DerivaMLTableTypeError",
-
      # Definitions
      "RID",
      "BuiltinTypes",
deriva_ml/core/base.py CHANGED
@@ -15,7 +15,6 @@ from __future__ import annotations # noqa: I001

  # Standard library imports
  from collections import defaultdict
- import getpass
  import logging
  from datetime import datetime
  from itertools import chain
@@ -29,12 +28,7 @@ import requests
  from pydantic import ConfigDict, validate_call

  # Deriva imports
- from deriva.core import (
-     DEFAULT_SESSION_CONFIG,
-     format_exception,
-     get_credential,
-     urlquote,
- )
+ from deriva.core import DEFAULT_SESSION_CONFIG, format_exception, get_credential, urlquote, init_logging

  import deriva.core.datapath as datapath
  from deriva.core.datapath import DataPathException, _SchemaWrapper as SchemaWrapper
@@ -55,6 +49,7 @@ from deriva_ml.core.definitions import (
      TableDefinition,
      VocabularyTerm,
  )
+ from deriva_ml.core.config import DerivaMLConfig
  from deriva_ml.core.exceptions import DerivaMLTableTypeError, DerivaMLException
  from deriva_ml.dataset.aux_classes import DatasetSpec
  from deriva_ml.dataset.dataset import Dataset
@@ -116,8 +111,10 @@ class DerivaML(Dataset):
          project_name: str | None = None,
          cache_dir: str | Path | None = None,
          working_dir: str | Path | None = None,
+         hydra_runtime_output_dir: str | Path | None = None,
          ml_schema: str = ML_SCHEMA,
          logging_level=logging.WARNING,
+         deriva_logging_level=logging.WARNING,
          credential=None,
          use_minid: bool = True,
          check_auth: bool = True,
@@ -166,12 +163,10 @@ class DerivaML(Dataset):
          self.model = DerivaModel(self.catalog.getCatalogModel(), domain_schema=domain_schema)

          # Set up working and cache directories
-         default_workdir = self.__class__.__name__ + "_working"
-         self.working_dir = (
-             Path(working_dir) / getpass.getuser() if working_dir else Path.home() / "deriva-ml"
-         ) / default_workdir
-
+         self.working_dir = DerivaMLConfig.compute_workdir(working_dir)
          self.working_dir.mkdir(parents=True, exist_ok=True)
+         self.hydra_runtime_output_dir = hydra_runtime_output_dir
+
          self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
          self.cache_dir.mkdir(parents=True, exist_ok=True)

@@ -182,6 +177,11 @@ class DerivaML(Dataset):
          self._logger = logging.getLogger("deriva_ml")
          self._logger.setLevel(logging_level)

+         # Configure deriva logging level
+         init_logging(deriva_logging_level)
+         logging.getLogger("bagit").setLevel(deriva_logging_level)
+         logging.getLogger("bdbag").setLevel(deriva_logging_level)
+
          # Store instance configuration
          self.host_name = hostname
          self.catalog_id = catalog_id
@@ -0,0 +1,67 @@
+ import logging
+ from pathlib import Path
+ from typing import Any
+
+ from hydra.conf import HydraConf, RunDir
+ from hydra.core.hydra_config import HydraConfig
+ from hydra_zen import store
+ from omegaconf import OmegaConf
+ from pydantic import BaseModel, model_validator
+
+ from deriva_ml.core.definitions import ML_SCHEMA
+
+
+ class DerivaMLConfig(BaseModel):
+     hostname: str
+     catalog_id: str | int = 1
+     domain_schema: str | None = None
+     project_name: str | None = None
+     cache_dir: str | Path | None = None
+     working_dir: str | Path | None = None
+     hydra_runtime_output_dir: str | Path | None = None
+     ml_schema: str = ML_SCHEMA
+     logging_level: Any = logging.WARNING
+     deriva_logging_level: Any = logging.WARNING
+     credential: Any = None
+     use_minid: bool = True
+     check_auth: bool = True
+
+     @model_validator(mode="after")
+     def init_working_dir(self):
+         """
+         Sets up the working directory for the model.
+
+         This method configures the working directory, ensuring that all required
+         file operations are performed in the appropriate location. If the user does not
+         specify a directory, a default directory based on the user's home directory
+         or username will be used.
+
+         This is a repeat of what is in the DerivaML.__init__ bu we put this here so that the working
+         directory is available to hydra.
+
+         Returns:
+             Self: The object instance with the working directory initialized.
+         """
+
+         self.working_dir = DerivaMLConfig.compute_workdir(self.working_dir)
+         self.hydra_runtime_output_dir = Path(HydraConfig.get().runtime.output_dir)
+         return self
+
+     @staticmethod
+     def compute_workdir(working_dir) -> Path:
+         # Create a default working directory if none is provided
+         working_dir = Path(working_dir) if working_dir else Path.home() / "deriva-ml"
+         return working_dir.absolute()
+
+
+ OmegaConf.register_new_resolver("compute_workdir", DerivaMLConfig.compute_workdir, replace=True)
+ store(
+     HydraConf(
+         run=RunDir("${compute_workdir:${deriva_ml.working_dir}}/hydra/${now:%Y-%m-%d_%H-%M-%S}"),
+         output_subdir="hydra-config",
+     ),
+     group="hydra",
+     name="config",
+ )
+
+ store.add_to_hydra_store()
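The new config module both defines a pydantic model for DerivaML's constructor arguments and registers a Hydra run directory that resolves through compute_workdir. The working-directory default can be restated as a small self-contained sketch (the paths in the comments are illustrative only):

    # Restating the working-directory default from DerivaMLConfig.compute_workdir above.
    from pathlib import Path

    def compute_workdir(working_dir=None) -> Path:
        # No argument: fall back to ~/deriva-ml; otherwise make the given path absolute.
        working_dir = Path(working_dir) if working_dir else Path.home() / "deriva-ml"
        return working_dir.absolute()

    print(compute_workdir())             # e.g. /home/alice/deriva-ml
    print(compute_workdir("runs/exp1"))  # <current working dir>/runs/exp1

Note that init_working_dir calls HydraConfig.get(), so DerivaMLConfig is expected to be validated inside a running Hydra application; instantiating it outside one would raise.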
@@ -1,4 +1,17 @@
- from .aux_classes import DatasetSpec
+ from typing import Protocol, runtime_checkable
+
+ from deriva_ml.core.definitions import RID
+
+ from .aux_classes import DatasetConfig, DatasetConfigList, DatasetSpec, DatasetVersion, VersionPart
  from .dataset import Dataset
+ from .dataset_bag import DatasetBag

- __all__ = ["Dataset", "DatasetSpec"]
+ __all__ = [
+     "Dataset",
+     "DatasetSpec",
+     "DatasetConfig",
+     "DatasetConfigList",
+     "DatasetBag",
+     "DatasetVersion",
+     "VersionPart",
+ ]
@@ -5,6 +5,7 @@ THis module defines the DataSet class with is used to manipulate n
  from enum import Enum
  from typing import Any, Optional, SupportsInt

+ from hydra_zen import hydrated_dataclass
  from pydantic import (
      BaseModel,
      ConfigDict,
@@ -182,8 +183,9 @@ class DatasetSpec(BaseModel):
      """

      rid: RID
-     materialize: bool = True
      version: DatasetVersion | conlist(item_type=int, min_length=3, max_length=3) | tuple[int, int, int] | str
+     materialize: bool = True
+     description: str = ""

      model_config = ConfigDict(arbitrary_types_allowed=True)

@@ -208,3 +210,20 @@
      @field_serializer("version")
      def serialize_version(self, version: DatasetVersion) -> dict[str, Any]:
          return version.to_dict()
+
+
+ @hydrated_dataclass(DatasetSpec)
+ class DatasetConfig:
+     rid: str
+     version: str
+     materialize: bool = True
+     description: str = ""
+
+ class DatasetList(BaseModel):
+     datasets: list[DatasetSpec]
+     description: str = ""
+
+ @hydrated_dataclass(DatasetList)
+ class DatasetConfigList:
+     datasets: list[DatasetConfig]
+     description: str = ""
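DatasetConfig and DatasetConfigList are hydra-zen companions to the pydantic models: each is a targeted dataclass pointing at the corresponding class, so Hydra or hydra-zen configs can be turned into DatasetSpec objects. A hedged sketch of that round trip (the RID value "1-ABCD" is a made-up placeholder):

    # Hedged sketch: building a DatasetSpec from the hydra-zen config defined above.
    from hydra_zen import instantiate

    from deriva_ml.dataset.aux_classes import DatasetConfig

    cfg = DatasetConfig(rid="1-ABCD", version="1.0.0", materialize=False)
    spec = instantiate(cfg)  # expected to yield DatasetSpec(rid="1-ABCD", version="1.0.0", materialize=False)
    print(spec)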
@@ -22,10 +22,11 @@ Typical usage example:

  from __future__ import annotations

- # Standard library imports
  import json
  import logging
  from collections import defaultdict
+
+ # Standard library imports
  from graphlib import TopologicalSorter
  from pathlib import Path
  from tempfile import TemporaryDirectory
@@ -1138,7 +1139,7 @@ class Dataset:
          with TemporaryDirectory() as tmp_dir:
              if self._use_minid:
                  # Get bag from S3
-                 archive_path = fetch_single_file(minid.bag_url)
+                 archive_path = fetch_single_file(minid.bag_url, output_path=tmp_dir)
              else:
                  exporter = DerivaExport(host=self._model.catalog.deriva_server.server, output_dir=tmp_dir)
                  archive_path = exporter.retrieve_file(minid.bag_url)
@@ -266,6 +266,22 @@ class DatasetBag:
          sql_cmd = f'SELECT * FROM "{feature_table}"'
          return cast(datapath._ResultSet, [dict(zip(col_names, r)) for r in db.execute(sql_cmd).fetchall()])

+     def list_dataset_element_types(self) -> list[Table]:
+         """
+         Lists the data types of elements contained within a dataset.
+
+         This method analyzes the dataset and identifies the data types for all
+         elements within it. It is useful for understanding the structure and
+         content of the dataset and allows for better manipulation and usage of its
+         data.
+
+         Returns:
+             list[str]: A list of strings where each string represents a data type
+                 of an element found in the dataset.
+
+         """
+         return self.model.list_dataset_element_types()
+
      def list_dataset_children(self, recurse: bool = False) -> list[DatasetBag]:
          """Get nested datasets.

@@ -333,6 +349,105 @@
          # Term not found
          raise DerivaMLInvalidTerm(vocab_table, term_name)

+     def _denormalize(self, include_tables: list[str] | None) -> str:
+         """
+         Generates an SQL statement for denormalizing the dataset based on the tables to include. Processes cycles in
+         graph relationships, ensures proper join order, and generates selected columns for denormalization.
+
+         Args:
+             include_tables (list[str] | None): List of table names to include in the denormalized dataset. If None,
+                 all tables from the dataset will be included.
+
+         Returns:
+             str: SQL query string that represents the process of denormalization.
+         """
+
+         def column_name(col: Column) -> str:
+             return f'"{self.model.normalize_table_name(col.table.name)}"."{col.name}"'
+
+         # Skip over tables that we don't want to include in the denormalized dataset.
+         # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
+         # table.
+
+         join_tables, tables, denormalized_columns, dataset_rids, dataset_element_tables = (
+             self.model._prepare_wide_table(self, self.dataset_rid, include_tables)
+         )
+
+         select_args = [
+             # SQLlite will strip out the table name from the column in the select statement, so we need to add
+             # an explicit alias to the column name.
+             f'"{self.model.normalize_table_name(table_name)}"."{column_name}" AS "{table_name}.{column_name}"'
+             for table_name, column_name in denormalized_columns
+         ]
+
+         # First table in the table list is the table specified in the method call.
+         normalized_join_tables = [self.model.normalize_table_name(t) for t in join_tables]
+         sql_statement = f'SELECT {",".join(select_args)} FROM "{normalized_join_tables[0]}"'
+         for t in normalized_join_tables[1:]:
+             on = tables[t]
+             sql_statement += f' LEFT JOIN "{t}" ON '
+             sql_statement += "OR ".join([f"{column_name(o[0])} = {column_name(o[1])}" for o in on])
+
+         # Select only rows from the datasets you wish to include.
+         dataset_rid_list = ",".join([f'"{self.dataset_rid}"'] + [f'"{b.dataset_rid}"' for b in dataset_rids])
+         sql_statement += f'WHERE "{self.model.normalize_table_name("Dataset")}"."RID" IN ({dataset_rid_list})'
+
+         # Only include rows that have actual values in them.
+         real_row = [f'"{self.model.normalize_table_name(t)}".RID IS NOT NULL ' for t in dataset_element_tables]
+         sql_statement += f" AND ({' OR '.join(real_row)})"
+         return sql_statement
+
+     def denormalize_as_dataframe(self, include_tables: list[str] | None = None) -> pd.DataFrame:
+         """
+         Denormalize the dataset and return the result as a dataframe.
+
+         This routine will examine the domain schema for the dataset, determine which tables to include and denormalize
+         the dataset values into a single wide table. The result is returned as a dataframe.
+
+         The optional argument include_tables can be used to specify a subset of tables to include in the denormalized
+         view. The tables in this argument can appear anywhere in the dataset schema. The method will determine which
+         additional tables are required to complete the denormalization process. If include_tables is not specified,
+         all of the tables in the schema will be included.
+
+         The resulting wide table will include a column for every table needed to complete the denormalization process.
+
+         Args:
+             include_tables: List of table names to include in the denormalized dataset. If None, than the entire schema
+                 is used.
+
+         Returns:
+             Dataframe containing the denormalized dataset.
+         """
+         return pd.read_sql(self._denormalize(include_tables=include_tables), self.database)
+
+     def denormalize_as_dict(self, include_tables: list[str] | None = None) -> Generator[dict[str, Any], None, None]:
+         """
+         Denormalize the dataset and return the result as a set of dictionarys.
+
+         This routine will examine the domain schema for the dataset, determine which tables to include and denormalize
+         the dataset values into a single wide table. The result is returned as a generateor that returns a dictionary
+         for each row in the denormlized wide table.
+
+         The optional argument include_tables can be used to specify a subset of tables to include in the denormalized
+         view. The tables in this argument can appear anywhere in the dataset schema. The method will determine which
+         additional tables are required to complete the denormalization process. If include_tables is not specified,
+         all of the tables in the schema will be included.
+
+         The resulting wide table will include a column for every table needed to complete the denormalization process.
+
+         Args:
+             include_tables: List of table names to include in the denormalized dataset. If None, than the entire schema
+                 is used.
+
+         Returns:
+             A generator that returns a dictionary representation of each row in the denormalized dataset.
+         """
+         with self.database as dbase:
+             cursor = dbase.execute(self._denormalize(include_tables=include_tables))
+             columns = [desc[0] for desc in cursor.description]
+             for row in cursor:
+                 yield dict(zip(columns, row))
+

  # Add annotations after definition to deal with forward reference issues in pydantic
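The new denormalize_as_dataframe and denormalize_as_dict methods flatten a downloaded dataset bag into one wide table whose columns are qualified as "<Table>.<Column>", per the aliasing in _denormalize. A hedged usage sketch, assuming `bag` is a DatasetBag obtained from a downloaded dataset and "Image" is a placeholder table name:

    # Hedged usage sketch; `bag` and the "Image" table name are placeholders.
    df = bag.denormalize_as_dataframe(include_tables=["Image"])
    print(df.columns)  # qualified names such as "Image.RID", per the SELECT aliases above

    # Streaming variant: one dict per row, keyed by the same qualified column names.
    for row in bag.denormalize_as_dict(include_tables=["Image"]):
        print(row)
        break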
 
@@ -412,6 +412,7 @@ def asset_file_path(
          "Description",
      }.union(set(DerivaSystemColumns))
      asset_metadata = {c.name for c in asset_table.columns} - asset_columns
+
      if not (asset_metadata >= set(metadata.keys())):
          raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")

deriva_ml/demo_catalog.py CHANGED
@@ -367,7 +367,7 @@ def create_demo_catalog(
      create_features=False,
      create_datasets=False,
      on_exit_delete=True,
-     logging_level=logging.INFO,
+     logging_level=logging.WARNING,
  ) -> ErmrestCatalog:
      test_catalog = create_ml_catalog(hostname, project_name=project_name)
      if on_exit_delete:
@@ -0,0 +1,25 @@
+ from typing import TYPE_CHECKING
+
+ # Safe imports - no circular dependencies
+ from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+ from deriva_ml.execution.workflow import Workflow
+
+ if TYPE_CHECKING:
+     from deriva_ml.execution.execution import Execution
+
+
+ # Lazy import for runtime
+ def __getattr__(name):
+     """Lazy import to avoid circular dependencies."""
+     if name == "Execution":
+         from deriva_ml.execution.execution import Execution
+
+         return Execution
+     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+ __all__ = [
+     "Execution",  # Lazy-loaded
+     "ExecutionConfiguration",
+     "Workflow",
+ ]
@@ -41,7 +41,6 @@ from deriva_ml.core.base import DerivaML
  from deriva_ml.core.definitions import (
      DRY_RUN_RID,
      RID,
-     ExecAssetType,
      ExecMetadataType,
      FileSpec,
      FileUploadState,
@@ -198,7 +197,6 @@ class Execution:
          workflow_rid (RID): RID of the associated workflow.
          status (Status): Current execution status.
          asset_paths (list[AssetFilePath]): Paths to execution assets.
-         parameters (dict): Execution parameters.
          start_time (datetime | None): When execution started.
          stop_time (datetime | None): When execution completed.

@@ -206,7 +204,6 @@
          >>> config = ExecutionConfiguration(
          ...     workflow="analysis",
          ...     description="Process samples",
-         ...     parameters={"threshold": 0.5}
          ... )
          >>> with ml.create_execution(config) as execution:
          ...     execution.download_dataset_bag(dataset_spec)
@@ -250,7 +247,6 @@

          self.dataset_rids: List[RID] = []
          self.datasets: list[DatasetBag] = []
-         self.parameters = self.configuration.parameters

          self._working_dir = self._ml_object.working_dir
          self._cache_dir = self._ml_object.cache_dir
@@ -292,9 +288,18 @@
              ]
          )[0]["RID"]

-         if isinstance(self.configuration.workflow, Workflow) and self.configuration.workflow.is_notebook:
-             # Put execution_rid into the cell output so we can find it later.
-             display(Markdown(f"Execution RID: {self._ml_object.cite(self.execution_rid)}"))
+         if rid_path := os.environ.get("DERIVA_ML_SAVE_EXECUTION_RID", None):
+             # Put execution_rid into the provided file path so we can find it later.
+             with Path(rid_path).open("w") as f:
+                 json.dump(
+                     {
+                         "hostname": self._ml_object.host_name,
+                         "catalog_id": self._ml_object.catalog_id,
+                         "workflow_rid": self.workflow_rid,
+                         "execution_rid": self.execution_rid,
+                     },
+                     f,
+                 )

          # Create a directory for execution rid so we can recover the state in case of a crash.
          execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
@@ -302,13 +307,28 @@

      def _save_runtime_environment(self):
          runtime_env_path = self.asset_file_path(
-             "Execution_Metadata",
-             f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
-             ExecMetadataType.runtime_env.value,
+             asset_name="Execution_Metadata",
+             file_name=f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
+             asset_types=ExecMetadataType.runtime_env.value,
          )
          with Path(runtime_env_path).open("w") as fp:
              json.dump(get_execution_environment(), fp)

+     def _upload_hydra_config_assets(self):
+         """Upload hydra assets to the catalog."""
+         hydra_runtime_output_dir = self._ml_object.hydra_runtime_output_dir
+         if hydra_runtime_output_dir:
+             timestamp = hydra_runtime_output_dir.parts[-1]
+             for hydra_asset in hydra_runtime_output_dir.rglob("*"):
+                 if hydra_asset.is_dir():
+                     continue
+                 asset = self.asset_file_path(
+                     asset_name=MLAsset.execution_metadata,
+                     file_name=hydra_runtime_output_dir / hydra_asset,
+                     rename_file=f"hydra-{timestamp}-{hydra_asset.name}",
+                     asset_types=ExecMetadataType.execution_config.value,
+                 )
+
      def _initialize_execution(self, reload: RID | None = None) -> None:
          """Initialize the execution by a configuration in the Execution_Metadata table.
          Set up a working directory and download all the assets and data.
@@ -354,9 +374,9 @@
          # Save configuration details for later upload
          if not reload:
              cfile = self.asset_file_path(
-                 MLAsset.execution_metadata,
-                 "configuration.json",
-                 ExecMetadataType.execution_config.value,
+                 asset_name=MLAsset.execution_metadata,
+                 file_name="configuration.json",
+                 asset_types=ExecMetadataType.execution_config.value,
              )
              with Path(cfile).open("w", encoding="utf-8") as config_file:
                  json.dump(self.configuration.model_dump(), config_file)
@@ -364,24 +384,18 @@
              lock_file = Path(self.configuration.workflow.git_root) / "uv.lock"
              if lock_file.exists():
                  _ = self.asset_file_path(
-                     MLAsset.execution_metadata,
-                     lock_file,
-                     ExecMetadataType.execution_config.value,
+                     asset_name=MLAsset.execution_metadata,
+                     file_name=lock_file,
+                     asset_types=ExecMetadataType.execution_config.value,
                  )

-             for parameter_file in self.configuration.parameters:
-                 self.asset_file_path(
-                     MLAsset.execution_asset,
-                     parameter_file,
-                     ExecAssetType.input_file.value,
-                 )
+             self._upload_hydra_config_assets()

          # save runtime env
          self._save_runtime_environment()

          # Now upload the files so we have the info in case the execution fails.
          self.uploaded_assets = self._upload_execution_dirs()
-
          self.start_time = datetime.now()
          self.update_status(Status.pending, "Initialize status finished.")

@@ -856,6 +870,7 @@
          file_name: str | Path,
          asset_types: list[str] | str | None = None,
          copy_file=False,
+         rename_file: str | None = None,
          **kwargs,
      ) -> AssetFilePath:
          """Return a pathlib Path to the directory in which to place files for the specified execution_asset type.
@@ -875,6 +890,8 @@
              asset_name: Type of asset to be uploaded. Must be a term in Asset_Type controlled vocabulary.
              file_name: Name of file to be uploaded.
              asset_types: Type of asset to be uploaded. Defaults to the name of the asset.
+             copy_file: Whether to copy the file rather than creating a symbolic link.
+             rename_file: If provided, the file will be renamed to this name if the file already exists..
              **kwargs: Any additional metadata values that may be part of the asset table.

          Returns:
@@ -893,12 +910,15 @@
          for t in asset_types:
              self._ml_object.lookup_term(MLVocab.asset_type, t)

+         # Determine if we will need to rename an existing file as the asset.
          file_name = Path(file_name)
+         target_name = Path(rename_file) if file_name.exists() and rename_file else file_name
+
          asset_path = asset_file_path(
              prefix=self._working_dir,
              exec_rid=self.execution_rid,
              asset_table=self._model.name_to_table(asset_name),
-             file_name=file_name.name,
+             file_name=target_name.name,
              metadata=kwargs,
          )

@@ -914,12 +934,12 @@

          # Persist the asset types into a file
          with Path(asset_type_path(self._working_dir, self.execution_rid, asset_table)).open("a") as asset_type_file:
-             asset_type_file.write(json.dumps({file_name.name: asset_types}) + "\n")
+             asset_type_file.write(json.dumps({target_name.name: asset_types}) + "\n")

          return AssetFilePath(
              asset_path=asset_path,
              asset_name=asset_name,
-             file_name=file_name.name,
+             file_name=target_name.name,
              asset_metadata=kwargs,
              asset_types=asset_types,
          )
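asset_file_path now accepts rename_file, so an existing file can be registered under a different name in the catalog; _upload_hydra_config_assets uses it to prefix each Hydra output file with the run timestamp. A hedged sketch of a direct call (the paths and the timestamp are placeholders):

    # Hedged sketch; the input path, timestamp, and resulting name are placeholders.
    asset = execution.asset_file_path(
        asset_name=MLAsset.execution_metadata,
        file_name="outputs/2024-01-01_12-00-00/config.yaml",    # existing file on disk
        rename_file="hydra-2024-01-01_12-00-00-config.yaml",    # name recorded for the asset
        asset_types=ExecMetadataType.execution_config.value,
    )
    print(asset.file_name)  # the renamed file name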
@@ -30,7 +30,7 @@ from typing import Any
  from pydantic import BaseModel, ConfigDict, Field, field_validator

  from deriva_ml.core.definitions import RID
- from deriva_ml.dataset.aux_classes import DatasetSpec
+ from deriva_ml.dataset.aux_classes import DatasetList, DatasetSpec
  from deriva_ml.execution.workflow import Workflow


@@ -64,45 +64,21 @@ class ExecutionConfiguration(BaseModel):
          ... )
      """

-     datasets: list[DatasetSpec] = []
+     datasets: list[DatasetSpec] | DatasetList = []
      assets: list[RID] = []
      workflow: RID | Workflow
-     parameters: dict[str, Any] | Path = {}
      description: str = ""
      argv: list[str] = Field(default_factory=lambda: sys.argv)

      model_config = ConfigDict(arbitrary_types_allowed=True)

-     @field_validator("parameters", mode="before")
+     @field_validator("datasets", mode="before")
      @classmethod
-     def validate_parameters(cls, value: Any) -> Any:
-         """Validates and loads execution parameters.
-
-         If value is a file path, loads and parses it as JSON. Otherwise, returns
-         the value as is.
-
-         Args:
-             value: Parameter value to validate, either:
-                 - Dictionary of parameters
-                 - Path to JSON file
-                 - String path to JSON file
-
-         Returns:
-             dict[str, Any]: Validated parameter dictionary.
-
-         Raises:
-             ValueError: If JSON file is invalid or cannot be read.
-             FileNotFoundError: If parameter file doesn't exist.
-
-         Example:
-             >>> config = ExecutionConfiguration(parameters="params.json")
-             >>> print(config.parameters)  # Contents of params.json as dict
-         """
-         if isinstance(value, str) or isinstance(value, Path):
-             with Path(value).open("r") as f:
-                 return json.load(f)
-         else:
-             return value
+     def validate_datasets(cls, value: Any) -> Any:
+         if isinstance(value, DatasetList):
+             config_list: DatasetList = value
+             value = config_list.datasets
+         return value

      @field_validator("workflow", mode="before")
      @classmethod
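ExecutionConfiguration.datasets now accepts either a plain list of DatasetSpec or a DatasetList, and the before-validator unwraps the latter so downstream code always sees a list. A hedged sketch (the workflow RID and dataset RID are placeholders):

    # Hedged sketch; the workflow RID and dataset RID are placeholders.
    from deriva_ml import DatasetSpec, ExecutionConfiguration
    from deriva_ml.dataset.aux_classes import DatasetList

    specs = [DatasetSpec(rid="1-ABCD", version="1.0.0")]

    cfg_a = ExecutionConfiguration(workflow="1-WXYZ", datasets=specs)
    cfg_b = ExecutionConfiguration(workflow="1-WXYZ", datasets=DatasetList(datasets=specs))
    # After validation, both configurations carry the same list of DatasetSpec objects.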
@@ -9,6 +9,7 @@ from __future__ import annotations

  # Standard library imports
  from collections import Counter
+ from graphlib import CycleError, TopologicalSorter
  from typing import Any, Callable, Final, Iterable, NewType, TypeAlias

  from deriva.core.ermrest_catalog import ErmrestCatalog
@@ -21,6 +22,7 @@ from pydantic import ConfigDict, validate_call

  from deriva_ml.core.definitions import (
      ML_SCHEMA,
+     RID,
      DerivaAssetColumns,
      TableDefinition,
  )
@@ -28,6 +30,7 @@ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLTableTypeError

  # Local imports
  from deriva_ml.feature import Feature
+ from deriva_ml.protocols.dataset import DatasetLike

  try:
      from icecream import ic
@@ -287,6 +290,113 @@ class DerivaModel:
          else:
              self.model.apply()

+     def list_dataset_element_types(self) -> list[Table]:
+         """
+         Lists the data types of elements contained within a dataset.
+
+         This method analyzes the dataset and identifies the data types for all
+         elements within it. It is useful for understanding the structure and
+         content of the dataset and allows for better manipulation and usage of its
+         data.
+
+         Returns:
+             list[str]: A list of strings where each string represents a data type
+                 of an element found in the dataset.
+
+         """
+
+         dataset_table = self.name_to_table("Dataset")
+
+         def domain_table(table: Table) -> bool:
+             return table.schema.name == self.domain_schema or table.name == dataset_table.name
+
+         return [t for a in dataset_table.find_associations() if domain_table(t := a.other_fkeys.pop().pk_table)]
+
+     def _prepare_wide_table(self, dataset: DatasetLike, dataset_rid: RID, include_tables: list[str] | None) -> tuple:
+         """
+         Generates details of a wide table from the model
+
+         Args:
+             include_tables (list[str] | None): List of table names to include in the denormalized dataset. If None,
+                 all tables from the dataset will be included.
+
+         Returns:
+             str: SQL query string that represents the process of denormalization.
+         """
+
+         # Skip over tables that we don't want to include in the denormalized dataset.
+         # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
+         # table.
+         include_tables = set(include_tables) if include_tables else set()
+         for t in include_tables:
+             # Check to make sure the table is in the catalog.
+             _ = self.name_to_table(t)
+
+         table_paths = [
+             path
+             for path in self._schema_to_paths()
+             if (not include_tables) or include_tables.intersection({p.name for p in path})
+         ]
+
+         # Get the names of all of the tables that can be dataset elements.
+         dataset_element_tables = {
+             e.name for e in self.list_dataset_element_types() if e.schema.name == self.domain_schema
+         }
+
+         skip_columns = {"RCT", "RMT", "RCB", "RMB"}
+         tables = {}
+         graph = {}
+         for path in table_paths:
+             for left, right in zip(path[0:], path[1:]):
+                 graph.setdefault(left.name, set()).add(right.name)
+
+         # New lets remove any cycles that we may have in the graph.
+         # We will use a topological sort to find the order in which we need to join the tables.
+         # If we find a cycle, we will remove the table from the graph and splice in an additional ON clause.
+         # We will then repeat the process until there are no cycles.
+         graph_has_cycles = True
+         join_tables = []
+         while graph_has_cycles:
+             try:
+                 ts = TopologicalSorter(graph)
+                 join_tables = list(reversed(list(ts.static_order())))
+                 graph_has_cycles = False
+             except CycleError as e:
+                 cycle_nodes = e.args[1]
+                 if len(cycle_nodes) > 3:
+                     raise DerivaMLException(f"Unexpected cycle found when normalizing dataset {cycle_nodes}")
+                 # Remove cycle from graph and splice in additional ON constraint.
+                 graph[cycle_nodes[1]].remove(cycle_nodes[0])
+
+         # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+         if "Dataset_Version" in join_tables:
+             join_tables.remove("Dataset_Version")
+
+         for path in table_paths:
+             for left, right in zip(path[0:], path[1:]):
+                 if right.name == "Dataset_Version":
+                     # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+                     continue
+                 if join_tables.index(right.name) < join_tables.index(left.name):
+                     continue
+                 table_relationship = self._table_relationship(left, right)
+                 tables.setdefault(self.normalize_table_name(right.name), set()).add(
+                     (table_relationship[0], table_relationship[1])
+                 )
+
+         # Get the list of columns that will appear in the final denormalized dataset.
+         denormalized_columns = [
+             (table_name, c.name)
+             for table_name in join_tables
+             if not self.is_association(table_name)  # Don't include association columns in the denormalized view.'
+             for c in self.name_to_table(table_name).columns
+             if c.name not in skip_columns
+         ]
+
+         # List of dataset ids to include in the denormalized view.
+         dataset_rids = dataset.list_dataset_children(recurse=True)
+         return join_tables, tables, denormalized_columns, dataset_rids, dataset_element_tables
+

      def _table_relationship(
          self,
          table1: TableInput,
  [(fk.referenced_columns[0], fk.foreign_key_columns[0]) for fk in table1.referenced_by if fk.table == table2]
303
413
  )
304
414
  if len(relationships) != 1:
305
- raise DerivaMLException(f"Ambiguous linkage between {table1.name} and {table2.name}")
415
+ raise DerivaMLException(
416
+ f"Ambiguous linkage between {table1.name} and {table2.name}: {[(r[0].name, r[1].name) for r in relationships]}"
417
+ )
306
418
  return relationships[0]
307
419
 
308
420
  def _schema_to_paths(
@@ -226,7 +226,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
226
226
  indexes: A tuple whose first element is the column index of the file name and whose second element
227
227
  is the index of the URL in an asset table. Tuple is None if table is not an asset table.
228
228
  o: list:
229
- indexes: Optional[tuple[int: int]]:
229
+ indexes: Optional[tuple[int, int]]:
230
230
 
231
231
  Returns:
232
232
  Tuple of updated column values.
@@ -262,7 +262,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
262
262
  DatasetBag object for the specified dataset.
263
263
  """
264
264
  if dataset_rid and dataset_rid not in self.bag_rids:
265
- DerivaMLException(f"Dataset RID {dataset_rid} is not in model.")
265
+ raise DerivaMLException(f"Dataset RID {dataset_rid} is not in model.")
266
266
  return DatasetBag(self, dataset_rid or self.dataset_rid)
267
267
 
268
268
  def dataset_version(self, dataset_rid: Optional[RID] = None) -> DatasetVersion:
@@ -0,0 +1,19 @@
1
+ """A module defining the DatasetLike protocol for dataset operations.
2
+
3
+ This module contains the definition of the DatasetLike protocol, which
4
+ provides an interface for datasets to implement specific functionality related
5
+ to listing dataset children. It is particularly useful for ensuring type
6
+ compatibility for objects that mimic datasets in their behavior.
7
+
8
+ Classes:
9
+ DatasetLike: A protocol that specifies methods required for dataset-like
10
+ objects.
11
+ """
12
+ from typing import Protocol, runtime_checkable
13
+
14
+ from deriva_ml.core.definitions import RID
15
+
16
+
17
+ @runtime_checkable
18
+ class DatasetLike(Protocol):
19
+ def list_dataset_children(self, dataset_rid: RID, recurse: bool = False) -> list[RID]: ...
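DatasetLike is a runtime_checkable Protocol, so Dataset, DatasetBag, or any object exposing list_dataset_children satisfies the type that _prepare_wide_table expects; the check is purely structural. A minimal, self-contained sketch of that mechanism (the Stub class is hypothetical):

    # Self-contained sketch of the structural check; Stub is a hypothetical class.
    from typing import Protocol, runtime_checkable

    @runtime_checkable
    class DatasetLike(Protocol):
        def list_dataset_children(self, dataset_rid: str, recurse: bool = False) -> list[str]: ...

    class Stub:
        def list_dataset_children(self, dataset_rid: str, recurse: bool = False) -> list[str]:
            return []

    print(isinstance(Stub(), DatasetLike))  # True: only the method's presence is checked at runtime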
deriva_ml/run_notebook.py CHANGED
@@ -3,14 +3,13 @@
  import json
  import os
  import tempfile
- from datetime import datetime
  from pathlib import Path

  import nbformat
  import papermill as pm
- import regex as re
  import yaml
  from deriva.core import BaseCLI
+ from jupyter_client.kernelspec import KernelSpecManager
  from nbconvert import MarkdownExporter

  from deriva_ml import DerivaML, ExecAssetType, Execution, ExecutionConfiguration, MLAsset, Workflow
@@ -44,13 +43,6 @@ class DerivaMLRunNotebookCLI(BaseCLI):
              help="Display logging output from notebook.",
          )

-         self.parser.add_argument(
-             "--catalog",
-             metavar="<1>",
-             default=1,
-             help="Catalog number. Default 1",
-         )
-
          self.parser.add_argument(
              "--parameter",
              "-p",
@@ -61,7 +53,13 @@
              help="Provide a parameter name and value to inject into the notebook.",
          )

-         self.parser.add_argument("--kernel", "-k", nargs=1, help="Name of kernel to run..", default=None)
+         self.parser.add_argument(
+             "--kernel",
+             "-k",
+             type=str,
+             help="Name of kernel to run..",
+             default=self._find_kernel_for_venv(),
+         )

      @staticmethod
      def _coerce_number(val: str):
@@ -100,26 +98,50 @@ class DerivaMLRunNotebookCLI(BaseCLI):
              print(f"Notebook file must be an ipynb file: {notebook_file.name}.")
              exit(1)

-         os.environ["DERIVA_HOST"] = args.host
-         os.environ["DERIVA_CATALOG"] = args.catalog
-
          # Create a workflow instance for this specific version of the script.
          # Return an existing workflow if one is found.
          notebook_parameters = pm.inspect_notebook(notebook_file)
+
          if args.inspect:
              for param, value in notebook_parameters.items():
                  print(f"{param}:{value['inferred_type_name']} (default {value['default']})")
              return
          else:
-             notebook_parameters = (
-                 {k: v["default"] for k, v in notebook_parameters.items()}
-                 | {"host": args.host, "hostname": args.host, "catalog_id": args.catalog, "catalog": args.catalog}
-                 | parameters
-             )
-             print(f"Running notebook {notebook_file.name} with parameters:")
-             for param, value in notebook_parameters.items():
-                 print(f" {param}:{value}")
-             self.run_notebook(notebook_file.resolve(), parameters, kernel=args.kernel[0], log=args.log_output)
+             notebook_parameters = {k: v["default"] for k, v in notebook_parameters.items()} | parameters
+             self.run_notebook(notebook_file.resolve(), parameters, kernel=args.kernel, log=args.log_output)
+
+     @staticmethod
+     def _find_kernel_for_venv() -> str | None:
+         """
+         Return the name and spec of an existing Jupyter kernel corresponding
+         to a given Python virtual environment path.
+
+         Parameters
+         ----------
+         venv_path : str
+             Absolute or relative path to the virtual environment.
+
+         Returns
+         -------
+         dict | None
+             The kernel spec (as a dict) if found, or None if not found.
+         """
+         venv = os.environ.get("VIRTUAL_ENV")
+         if not venv:
+             return None
+         venv_path = Path(venv).resolve()
+         ksm = KernelSpecManager()
+         for name, spec in ksm.get_all_specs().items():
+             kernel_json = spec.get("spec", {})
+             argv = kernel_json.get("argv", [])
+             # check for python executable path inside argv
+             for arg in argv:
+                 try:
+                     if Path(arg).resolve() == venv_path.joinpath("bin", "python").resolve():
+                         return name
+                 except Exception:
+                     continue
+         return None

      def run_notebook(self, notebook_file: Path, parameters, kernel=None, log=False):
          url, checksum = Workflow.get_url_and_checksum(Path(notebook_file))
@@ -127,8 +149,9 @@ class DerivaMLRunNotebookCLI(BaseCLI):
          os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"] = checksum
          os.environ["DERIVA_ML_NOTEBOOK_PATH"] = notebook_file.as_posix()
          with tempfile.TemporaryDirectory() as tmpdirname:
-             print(f"Running notebook {notebook_file.name} with parameters:")
              notebook_output = Path(tmpdirname) / Path(notebook_file).name
+             execution_rid_path = Path(tmpdirname) / "execution_rid.json"
+             os.environ["DERIVA_ML_SAVE_EXECUTION_RID"] = execution_rid_path.as_posix()
              pm.execute_notebook(
                  input_path=notebook_file,
                  output_path=notebook_output,
@@ -137,22 +160,19 @@
                  log_output=log,
              )
              print(f"Notebook output saved to {notebook_output}")
-             catalog_id = execution_rid = None
-             with Path(notebook_output).open("r") as f:
-                 for line in f:
-                     if m := re.search(
-                         r"Execution RID: https://(?P<host>.*)/id/(?P<catalog_id>.*)/(?P<execution_rid>[\w-]+)",
-                         line,
-                     ):
-                         hostname = m["host"]
-                         catalog_id = m["catalog_id"]
-                         execution_rid = m["execution_rid"]
-             if not execution_rid:
+             with execution_rid_path.open("r") as f:
+                 execution_config = json.load(f)
+
+             if not execution_config:
                  print("Execution RID not found.")
                  exit(1)

+             execution_rid = execution_config["execution_rid"]
+             hostname = execution_config["hostname"]
+             catalog_id = execution_config["catalog_id"]
+             workflow_rid = execution_config["workflow_rid"]
              ml_instance = DerivaML(hostname=hostname, catalog_id=catalog_id, working_dir=tmpdirname)
-             workflow_rid = ml_instance.retrieve_rid(execution_rid)["Workflow"]
+             workflow_rid = ml_instance.retrieve_rid(execution_config["execution_rid"])["Workflow"]

              execution = Execution(
                  configuration=ExecutionConfiguration(workflow=workflow_rid),
@@ -183,21 +203,6 @@ class DerivaMLRunNotebookCLI(BaseCLI):
                  file_name=notebook_output_md,
                  asset_types=ExecAssetType.notebook_output,
              )
-             execution.asset_file_path(
-                 asset_name=MLAsset.execution_asset,
-                 file_name=notebook_output_md,
-                 asset_types=ExecAssetType.notebook_output,
-             )
-             print("parameter....")
-
-             parameter_file = execution.asset_file_path(
-                 asset_name=MLAsset.execution_asset,
-                 file_name=f"notebook-parameters-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json",
-                 asset_types=ExecAssetType.input_file.value,
-             )
-
-             with Path(parameter_file).open("w") as f:
-                 json.dump(parameters, f)
              execution.upload_execution_outputs()

          print(ml_instance.cite(execution_rid))
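Instead of scraping the executed notebook for an "Execution RID:" line, the runner now hands the notebook a file path in DERIVA_ML_SAVE_EXECUTION_RID, and the Execution changes above write the identifiers to that file as JSON. A hedged sketch of the handshake (the path and RID values are placeholders):

    # Hedged sketch of the DERIVA_ML_SAVE_EXECUTION_RID handshake; values are placeholders.
    import json
    import os
    from pathlib import Path

    rid_path = Path("/tmp/execution_rid.json")
    os.environ["DERIVA_ML_SAVE_EXECUTION_RID"] = rid_path.as_posix()  # runner side

    # Execution side writes the identifiers (as in the Execution changes above).
    rid_path.write_text(json.dumps({
        "hostname": "example.org",
        "catalog_id": "1",
        "workflow_rid": "1-WXYZ",
        "execution_rid": "1-EXEC",
    }))

    # Runner side reads them back once papermill has finished.
    execution_config = json.loads(rid_path.read_text())
    print(execution_config["execution_rid"])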
@@ -1,4 +1,5 @@
  import argparse
+ import sys

  from deriva.core.ermrest_model import Model, Table
  from deriva.core.utils.core_utils import tag as deriva_tags
@@ -183,10 +184,10 @@ def catalog_annotation(model: DerivaModel) -> None:

  def asset_annotation(asset_table: Table):
      """Generate annotations for an asset table.
-
+
      Args:
          asset_table: The Table object representing the asset table.
-
+
      Returns:
          A dictionary containing the annotations for the asset table.
      """
@@ -316,7 +317,8 @@ def generate_annotation(model: Model, schema: str) -> dict:
      },
      {
          "source": [
-             {"inbound": [schema, "Execution_Metadata_Execution_fkey"]},
+             {"inbound": [schema, "Execution_Metadata_Execution_Execution_fkey"]},
+             {"outbound": [schema, "Execution_Metadata_Execution_Execution_Metadata_fkey"]},
              "RID",
          ],
          "markdown_name": "Execution Metadata",
@@ -453,9 +455,9 @@

  def main():
      """Main entry point for the annotations CLI.
-
+
      Applies annotations to the ML schema based on command line arguments.
-
+
      Returns:
          None. Executes the CLI.
      """
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: deriva-ml
- Version: 1.14.46
+ Version: 1.16.0
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
  Author-email: ISRD <isrd-dev@isi.edu>
  Requires-Python: >=3.10
@@ -21,6 +21,7 @@ Requires-Dist: nbstripout
  Requires-Dist: papermill
  Requires-Dist: pandas-stubs==2.2.3.250527
  Requires-Dist: pyyaml
+ Requires-Dist: hydra_zen
  Dynamic: license-file

  # DerivaML
@@ -0,0 +1,44 @@
+ deriva_ml/__init__.py,sha256=Yt8q0WbLFt7fbRLZe_f0bJWy1Qo6vidQzlYWQoT8U7o,2097
+ deriva_ml/bump_version.py,sha256=eN2G5G_OeiuFxhOdjjwfxD8Rmv6dFvzIm0y_1x4Mif4,4020
+ deriva_ml/demo_catalog.py,sha256=6hlSVGNQ364chisKvSyMy2BBxzhQq1mLPPlW324eca4,14931
+ deriva_ml/feature.py,sha256=6-aphkxdKjWa9oPSGFWxHcwAc_8hmWj-7I4M178YG5Y,8470
+ deriva_ml/install_kernel.py,sha256=b62XY0SLViYO_Zye5r1Pl9qhYZyu_fk4KAO8NS1pxgM,2165
+ deriva_ml/run_notebook.py,sha256=_pds1q3WcfWqhCBqKeznbwSv5n7OND8FkL6JQ2Jkfmc,8093
+ deriva_ml/core/__init__.py,sha256=Ko8GsWc7K_eDFW0-GaNS6gOWYP8cWHWir-ChSQaHntE,856
+ deriva_ml/core/base.py,sha256=xsz1h5QZVE7PCVZiCt7lRV43Dupq9c7elUsbGk3QHJQ,61919
+ deriva_ml/core/config.py,sha256=dF4rOLFmbk1DEkQimqbiH4pC519nRZWpwKItARNMiZ4,2244
+ deriva_ml/core/constants.py,sha256=6wBJ8qMxe-dbCjRGrjUIX-RK0mTWrLDTeUpaVbLFoM8,888
+ deriva_ml/core/definitions.py,sha256=uq_8uYFBVBVHS691Ri2kdQsN37z0GNYTaZskJIb_ocM,1385
+ deriva_ml/core/enums.py,sha256=sSN4B4OynbB-AXwxRszoFr-KWIWIAfhVa06EzAEHwVc,7194
+ deriva_ml/core/ermrest.py,sha256=N0IJ3TE87jElaBChEIo5AFDTr0SIrb6F90yiimRfPr4,10182
+ deriva_ml/core/exceptions.py,sha256=4MZNPOyN-UMaGeY9sqJDVwh_iOmz1ntp4usSyCNqVMg,934
+ deriva_ml/core/filespec.py,sha256=BQAAcRXfXq1lDcsKlokLOOXCBtEZpPgXxrFOIZYAgLg,4229
+ deriva_ml/dataset/__init__.py,sha256=tV3yK9tb8iB9f5P3ml459bP2uPWJhCJcplhmbGVtoMI,411
+ deriva_ml/dataset/aux_classes.py,sha256=K-cVBrZY1j0ZO__FORHRVdVz3O69OgvhO5YkhwJJyxE,7348
+ deriva_ml/dataset/dataset.py,sha256=c6hGsIH9UOn8ayDP7EsYzqgKeZm2Kr7naliPLQxGtSg,64473
+ deriva_ml/dataset/dataset_bag.py,sha256=peFEMU8PfExbzJ0VJGIL3QDIPz0stmUR7daCXptA3f4,20256
+ deriva_ml/dataset/history.py,sha256=FK5AYYz11p4E4FWMVg4r7UPWOD4eobrq3b3xMjWF59g,3197
+ deriva_ml/dataset/upload.py,sha256=Q9bNVv6xTK_IpwFOU_ugq33IWRs0AWyFoF8Rzwi6OVs,16430
+ deriva_ml/execution/__init__.py,sha256=Zs-ZNmwrJJW6suJilzh3vdcPvzI8HIA0Ym0VUwuiQME,668
+ deriva_ml/execution/environment.py,sha256=B7nywqxFTRUWgyu8n7rFoKcVC9on422kjeFG2FPQfvg,9302
+ deriva_ml/execution/execution.py,sha256=X4HBADT_F5ZuER8qBcnNYqRUuMU3BaEV7rMgXEUrLCg,46096
+ deriva_ml/execution/execution_configuration.py,sha256=oWgBueuFO0-PBm9LM08EQeFeY9IXF8tVbd3LyRsTiNw,5437
+ deriva_ml/execution/workflow.py,sha256=7CwPrgs3FKQHiEVus0PpK9w5hVKLKZnCrlu_nT8GFe8,13604
+ deriva_ml/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ deriva_ml/model/catalog.py,sha256=TY6QdlhZX7OL5bhWcGkAFpZNaZye5l_rkb1Cih-bTjs,19180
+ deriva_ml/model/database.py,sha256=KEPJKIlmIxTiF4Th1NgpuuuMBhbfsgsd_k8UHs-hMg4,14843
+ deriva_ml/model/sql_mapper.py,sha256=_0QsJEVSgSPtxrWKSgjfPZCQ1aMVcjR_Tk2OxLhWEvY,1696
+ deriva_ml/protocols/dataset.py,sha256=1TyaT--89Elcs-nCvVyJxUj4cDaLztZOuSOzzj1cBMk,699
+ deriva_ml/schema/__init__.py,sha256=yV-MfzCF3FA4OOz7mZwMM2q6-x1vgOJ057kUvikFF6E,130
+ deriva_ml/schema/annotations.py,sha256=CMcRqYUlyW8iLCYp6sYJsncaRNtp4kFKoxcg-i-t-50,18302
+ deriva_ml/schema/check_schema.py,sha256=6dadLYHPqRex6AYVClmsESI8WhC7-rb-XnGf2G298xw,3609
+ deriva_ml/schema/create_schema.py,sha256=9qK9_8SRQT-DwcEwTGSkhi3j2NaoH5EVgthvV2kO-gg,13042
+ deriva_ml/schema/deriva-ml-reference.json,sha256=AEOMIgwKO3dNMMWHb0lxaXyamvfAEbUPh8qw0aAtsUQ,242460
+ deriva_ml/schema/policy.json,sha256=5ykB8nnZFl-oCHzlAwppCFKJHWJFIkYognUMVEanfY8,1826
+ deriva_ml/schema/table_comments_utils.py,sha256=4flCqnZAaqg_uSZ9I18pNUWAZoLfmMCXbmI5uERY5vM,2007
+ deriva_ml-1.16.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ deriva_ml-1.16.0.dist-info/METADATA,sha256=gN7KnQ1MDdqSSaVJOIKY-lBEwEE8s0bRMoVLrZGYgtA,1214
+ deriva_ml-1.16.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ deriva_ml-1.16.0.dist-info/entry_points.txt,sha256=XsHSbfp7S1cKMjHoPUdFIaFcp9lHXHS6CV1zb_MEXkg,463
+ deriva_ml-1.16.0.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+ deriva_ml-1.16.0.dist-info/RECORD,,
@@ -1,42 +0,0 @@
- deriva_ml/__init__.py,sha256=_aMdxGG4mRTcXodLZLNpXqH8v5uqMbqFUryE9KqNSB8,1158
- deriva_ml/bump_version.py,sha256=KpHmkpEztly2QHYL4dyaIGdEMyP4F0D89rawyh5EDTs,3982
- deriva_ml/demo_catalog.py,sha256=JjPAIac_hKPh5krEhGJydjXquRnivi7kQoR8W4Khp-s,14928
- deriva_ml/feature.py,sha256=6-aphkxdKjWa9oPSGFWxHcwAc_8hmWj-7I4M178YG5Y,8470
- deriva_ml/install_kernel.py,sha256=b62XY0SLViYO_Zye5r1Pl9qhYZyu_fk4KAO8NS1pxgM,2165
- deriva_ml/run_notebook.py,sha256=QRO_CK9Q9qt_n-c0rxGdIRyTHjGOuZxt-wj0WQTnaAM,8171
- deriva_ml/core/__init__.py,sha256=V_i90pc5PB1F4UdOO6DZWzpEFaZDTaPRU-EzKXQ19eI,787
- deriva_ml/core/base.py,sha256=LI_ZLpVJwWx4DW2Wo7luALQauQ3xhBxFYHSKDAfNsag,61649
- deriva_ml/core/constants.py,sha256=6wBJ8qMxe-dbCjRGrjUIX-RK0mTWrLDTeUpaVbLFoM8,888
- deriva_ml/core/definitions.py,sha256=uq_8uYFBVBVHS691Ri2kdQsN37z0GNYTaZskJIb_ocM,1385
- deriva_ml/core/enums.py,sha256=sSN4B4OynbB-AXwxRszoFr-KWIWIAfhVa06EzAEHwVc,7194
- deriva_ml/core/ermrest.py,sha256=N0IJ3TE87jElaBChEIo5AFDTr0SIrb6F90yiimRfPr4,10182
- deriva_ml/core/exceptions.py,sha256=4MZNPOyN-UMaGeY9sqJDVwh_iOmz1ntp4usSyCNqVMg,934
- deriva_ml/core/filespec.py,sha256=BQAAcRXfXq1lDcsKlokLOOXCBtEZpPgXxrFOIZYAgLg,4229
- deriva_ml/dataset/__init__.py,sha256=ukl2laJqa9J2AVqb4zlpIYc-3RaAlfRR33NMIQaoNrQ,104
- deriva_ml/dataset/aux_classes.py,sha256=9mZAln7_rrzaRbKhKA6dJOp3xeD6dHOC9NXOtJKROo4,6933
- deriva_ml/dataset/dataset.py,sha256=AU27ZtzDSpCodtbq9T-8AtqiA-x8r78wQvFBOCgaqsQ,64451
- deriva_ml/dataset/dataset_bag.py,sha256=mPIZRX5aTbVRcJbCFtdkmlnexquF8NE-onbVK_8IxVk,14224
- deriva_ml/dataset/history.py,sha256=FK5AYYz11p4E4FWMVg4r7UPWOD4eobrq3b3xMjWF59g,3197
- deriva_ml/dataset/upload.py,sha256=i_7KLfRSd2-THqZ1aG2OFAFGoyb8dJBCZZ5t1ftrtMQ,16429
- deriva_ml/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deriva_ml/execution/environment.py,sha256=B7nywqxFTRUWgyu8n7rFoKcVC9on422kjeFG2FPQfvg,9302
- deriva_ml/execution/execution.py,sha256=NJjjrxGsedv0zoe-T-LxfO_5UG83KOHaxU3SY5EJ0QQ,44928
- deriva_ml/execution/execution_configuration.py,sha256=Rw4VWkBCZN9yatvSKdTqEWTfu470lpcVKfHFR0uN0jI,6248
- deriva_ml/execution/workflow.py,sha256=7CwPrgs3FKQHiEVus0PpK9w5hVKLKZnCrlu_nT8GFe8,13604
- deriva_ml/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deriva_ml/model/catalog.py,sha256=dzTBcRlqgEVkPY32AUax_iu75RgFiT4Pu5au7rmrv8k,14068
- deriva_ml/model/database.py,sha256=SBkYFf0qwbGmvL0Xtn_n5DCz4roGfrhuYrM8G69Cy9Y,14837
- deriva_ml/model/sql_mapper.py,sha256=_0QsJEVSgSPtxrWKSgjfPZCQ1aMVcjR_Tk2OxLhWEvY,1696
- deriva_ml/schema/__init__.py,sha256=yV-MfzCF3FA4OOz7mZwMM2q6-x1vgOJ057kUvikFF6E,130
- deriva_ml/schema/annotations.py,sha256=TuQ3vWFnK0160fRmtvsCkHx9qAcRa63MSyERB4x5a98,18197
- deriva_ml/schema/check_schema.py,sha256=6dadLYHPqRex6AYVClmsESI8WhC7-rb-XnGf2G298xw,3609
- deriva_ml/schema/create_schema.py,sha256=9qK9_8SRQT-DwcEwTGSkhi3j2NaoH5EVgthvV2kO-gg,13042
- deriva_ml/schema/deriva-ml-reference.json,sha256=AEOMIgwKO3dNMMWHb0lxaXyamvfAEbUPh8qw0aAtsUQ,242460
- deriva_ml/schema/policy.json,sha256=5ykB8nnZFl-oCHzlAwppCFKJHWJFIkYognUMVEanfY8,1826
- deriva_ml/schema/table_comments_utils.py,sha256=4flCqnZAaqg_uSZ9I18pNUWAZoLfmMCXbmI5uERY5vM,2007
- deriva_ml-1.14.46.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- deriva_ml-1.14.46.dist-info/METADATA,sha256=jhm5D5-SqSJD-JVSMyqLcVPpjG3vY6MLJTIZacyt_Fc,1190
- deriva_ml-1.14.46.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- deriva_ml-1.14.46.dist-info/entry_points.txt,sha256=XsHSbfp7S1cKMjHoPUdFIaFcp9lHXHS6CV1zb_MEXkg,463
- deriva_ml-1.14.46.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
- deriva_ml-1.14.46.dist-info/RECORD,,