deriva-ml 1.14.47__py3-none-any.whl → 1.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +69 -30
- deriva_ml/core/__init__.py +2 -2
- deriva_ml/core/base.py +12 -12
- deriva_ml/core/config.py +67 -0
- deriva_ml/dataset/__init__.py +15 -2
- deriva_ml/dataset/aux_classes.py +20 -1
- deriva_ml/dataset/dataset.py +2 -1
- deriva_ml/dataset/dataset_bag.py +115 -0
- deriva_ml/dataset/upload.py +1 -0
- deriva_ml/demo_catalog.py +1 -1
- deriva_ml/execution/__init__.py +25 -0
- deriva_ml/execution/execution.py +46 -26
- deriva_ml/execution/execution_configuration.py +8 -32
- deriva_ml/model/catalog.py +113 -1
- deriva_ml/model/database.py +2 -2
- deriva_ml/protocols/dataset.py +19 -0
- deriva_ml/run_notebook.py +55 -50
- deriva_ml/schema/annotations.py +7 -5
- {deriva_ml-1.14.47.dist-info → deriva_ml-1.16.0.dist-info}/METADATA +2 -1
- deriva_ml-1.16.0.dist-info/RECORD +44 -0
- deriva_ml-1.14.47.dist-info/RECORD +0 -42
- {deriva_ml-1.14.47.dist-info → deriva_ml-1.16.0.dist-info}/WHEEL +0 -0
- {deriva_ml-1.14.47.dist-info → deriva_ml-1.16.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.14.47.dist-info → deriva_ml-1.16.0.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.14.47.dist-info → deriva_ml-1.16.0.dist-info}/top_level.txt +0 -0
deriva_ml/__init__.py CHANGED
@@ -1,45 +1,84 @@
-__all__ = [
-    "DerivaML",
-    "DerivaMLException",
-    "DerivaMLInvalidTerm",
-    "DerivaMLTableTypeError",
-    "Execution",
-    "ExecAssetType",
-    "ExecMetadataType",
-    "Workflow",
-    "DatasetBag",
-    "DatasetVersion",
-    "DatasetSpec",
-    "FileSpec",
-    "VersionPart",
-    "RID",
-    "BuiltinTypes",
-    "ColumnDefinition",
-    "MLVocab",
-    "MLAsset",
-    "TableDefinition",
-    "ExecutionConfiguration",
-]
-
 from importlib.metadata import PackageNotFoundError, version
+from typing import TYPE_CHECKING
 
-
+# Safe imports - no circular dependencies
+from deriva_ml.core.config import DerivaMLConfig
+from deriva_ml.core.definitions import (
     RID,
     BuiltinTypes,
     ColumnDefinition,
-
+    DerivaAssetColumns,
+    DerivaSystemColumns,
     ExecAssetType,
     ExecMetadataType,
     FileSpec,
+    FileUploadState,
+    ForeignKeyDefinition,
+    KeyDefinition,
     MLAsset,
     MLVocab,
     TableDefinition,
+    UploadState,
+)
+from deriva_ml.core.exceptions import (
+    DerivaMLException,
+    DerivaMLInvalidTerm,
+    DerivaMLTableTypeError,
 )
-from deriva_ml.
-
-from
-
-
+from deriva_ml.dataset.aux_classes import DatasetConfig, DatasetConfigList, DatasetSpec, DatasetVersion
+
+from .execution import Execution, ExecutionConfiguration, Workflow
+
+# Type-checking only - avoid circular import at runtime
+if TYPE_CHECKING:
+    from deriva_ml.core.base import DerivaML
+
+
+# Lazy import function for runtime usage
+def __getattr__(name):
+    """Lazy import to avoid circular dependencies."""
+    if name == "DerivaML":
+        from deriva_ml.core.base import DerivaML
+
+        return DerivaML
+    elif name == "Execution":
+        from deriva_ml.execution.execution import Execution
+
+        return Execution
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = [
+    "DerivaML",  # Lazy-loaded
+    "DerivaMLConfig",
+    "DatasetConfig",
+    "DatasetConfigList",
+    "DatasetSpec",
+    "DatasetVersion",
+    "Execution",
+    "ExecutionConfiguration",
+    "Workflow",
+    # Exceptions
+    "DerivaMLException",
+    "DerivaMLInvalidTerm",
+    "DerivaMLTableTypeError",
+    # Definitions
+    "RID",
+    "BuiltinTypes",
+    "ColumnDefinition",
+    "DerivaSystemColumns",
+    "DerivaAssetColumns",
+    "ExecAssetType",
+    "ExecMetadataType",
+    "FileSpec",
+    "FileUploadState",
+    "ForeignKeyDefinition",
+    "KeyDefinition",
+    "MLAsset",
+    "MLVocab",
+    "TableDefinition",
+    "UploadState",
+]
 
 try:
     __version__ = version("deriva_ml")
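Note: the rewritten __init__ above uses the PEP 562 module-level __getattr__ hook to defer imports that would otherwise be circular. A minimal standalone sketch of the pattern (the pkg.heavy module and Heavy class are hypothetical):

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        from pkg.heavy import Heavy  # visible to type checkers only

    def __getattr__(name: str):
        """Called only when `name` is not a regular module attribute (PEP 562)."""
        if name == "Heavy":
            from pkg.heavy import Heavy  # deferred until first access

            return Heavy
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

With this in pkg/__init__.py, accessing pkg.Heavy triggers the nested import on first use, so importing pkg itself never touches pkg.heavy.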
deriva_ml/core/__init__.py CHANGED
@@ -1,4 +1,5 @@
 from deriva_ml.core.base import DerivaML
+from deriva_ml.core.config import DerivaMLConfig
 from deriva_ml.core.definitions import (
     RID,
     BuiltinTypes,
@@ -17,12 +18,11 @@ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm, De
 
 __all__ = [
     "DerivaML",
-
+    "DerivaMLConfig",
     # Exceptions
     "DerivaMLException",
     "DerivaMLInvalidTerm",
     "DerivaMLTableTypeError",
-
     # Definitions
     "RID",
     "BuiltinTypes",
deriva_ml/core/base.py CHANGED
@@ -15,7 +15,6 @@ from __future__ import annotations  # noqa: I001
 
 # Standard library imports
 from collections import defaultdict
-import getpass
 import logging
 from datetime import datetime
 from itertools import chain
@@ -29,12 +28,7 @@ import requests
 from pydantic import ConfigDict, validate_call
 
 # Deriva imports
-from deriva.core import (
-    DEFAULT_SESSION_CONFIG,
-    format_exception,
-    get_credential,
-    urlquote,
-)
+from deriva.core import DEFAULT_SESSION_CONFIG, format_exception, get_credential, urlquote, init_logging
 
 import deriva.core.datapath as datapath
 from deriva.core.datapath import DataPathException, _SchemaWrapper as SchemaWrapper
@@ -55,6 +49,7 @@ from deriva_ml.core.definitions import (
     TableDefinition,
     VocabularyTerm,
 )
+from deriva_ml.core.config import DerivaMLConfig
 from deriva_ml.core.exceptions import DerivaMLTableTypeError, DerivaMLException
 from deriva_ml.dataset.aux_classes import DatasetSpec
 from deriva_ml.dataset.dataset import Dataset
@@ -116,8 +111,10 @@ class DerivaML(Dataset):
         project_name: str | None = None,
         cache_dir: str | Path | None = None,
         working_dir: str | Path | None = None,
+        hydra_runtime_output_dir: str | Path | None = None,
         ml_schema: str = ML_SCHEMA,
         logging_level=logging.WARNING,
+        deriva_logging_level=logging.WARNING,
         credential=None,
         use_minid: bool = True,
         check_auth: bool = True,
@@ -166,12 +163,10 @@ class DerivaML(Dataset):
         self.model = DerivaModel(self.catalog.getCatalogModel(), domain_schema=domain_schema)
 
         # Set up working and cache directories
-
-        self.working_dir = (
-            Path(working_dir) / getpass.getuser() if working_dir else Path.home() / "deriva-ml"
-        ) / default_workdir
-
+        self.working_dir = DerivaMLConfig.compute_workdir(working_dir)
         self.working_dir.mkdir(parents=True, exist_ok=True)
+        self.hydra_runtime_output_dir = hydra_runtime_output_dir
+
         self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
         self.cache_dir.mkdir(parents=True, exist_ok=True)
 
@@ -182,6 +177,11 @@ class DerivaML(Dataset):
         self._logger = logging.getLogger("deriva_ml")
         self._logger.setLevel(logging_level)
 
+        # Configure deriva logging level
+        init_logging(deriva_logging_level)
+        logging.getLogger("bagit").setLevel(deriva_logging_level)
+        logging.getLogger("bdbag").setLevel(deriva_logging_level)
+
         # Store instance configuration
         self.host_name = hostname
         self.catalog_id = catalog_id
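Note: the new deriva_logging_level parameter tunes deriva, bagit, and bdbag chatter separately from deriva_ml's own logger. A stdlib-only sketch of that pattern:

    import logging

    # Quiet third-party loggers without touching your own package's logger.
    logging.getLogger("bagit").setLevel(logging.ERROR)
    logging.getLogger("bdbag").setLevel(logging.ERROR)
    logging.getLogger("deriva_ml").setLevel(logging.INFO)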
deriva_ml/core/config.py ADDED
@@ -0,0 +1,67 @@
+import logging
+from pathlib import Path
+from typing import Any
+
+from hydra.conf import HydraConf, RunDir
+from hydra.core.hydra_config import HydraConfig
+from hydra_zen import store
+from omegaconf import OmegaConf
+from pydantic import BaseModel, model_validator
+
+from deriva_ml.core.definitions import ML_SCHEMA
+
+
+class DerivaMLConfig(BaseModel):
+    hostname: str
+    catalog_id: str | int = 1
+    domain_schema: str | None = None
+    project_name: str | None = None
+    cache_dir: str | Path | None = None
+    working_dir: str | Path | None = None
+    hydra_runtime_output_dir: str | Path | None = None
+    ml_schema: str = ML_SCHEMA
+    logging_level: Any = logging.WARNING
+    deriva_logging_level: Any = logging.WARNING
+    credential: Any = None
+    use_minid: bool = True
+    check_auth: bool = True
+
+    @model_validator(mode="after")
+    def init_working_dir(self):
+        """
+        Sets up the working directory for the model.
+
+        This method configures the working directory, ensuring that all required
+        file operations are performed in the appropriate location. If the user does not
+        specify a directory, a default directory based on the user's home directory
+        or username will be used.
+
+        This repeats what is done in DerivaML.__init__, but we put it here so that the working
+        directory is available to hydra.
+
+        Returns:
+            Self: The object instance with the working directory initialized.
+        """
+        self.working_dir = DerivaMLConfig.compute_workdir(self.working_dir)
+        self.hydra_runtime_output_dir = Path(HydraConfig.get().runtime.output_dir)
+        return self
+
+    @staticmethod
+    def compute_workdir(working_dir) -> Path:
+        # Create a default working directory if none is provided
+        working_dir = Path(working_dir) if working_dir else Path.home() / "deriva-ml"
+        return working_dir.absolute()
+
+
+OmegaConf.register_new_resolver("compute_workdir", DerivaMLConfig.compute_workdir, replace=True)
+store(
+    HydraConf(
+        run=RunDir("${compute_workdir:${deriva_ml.working_dir}}/hydra/${now:%Y-%m-%d_%H-%M-%S}"),
+        output_subdir="hydra-config",
+    ),
+    group="hydra",
+    name="config",
+)
+
+store.add_to_hydra_store()
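Note: config.py registers compute_workdir as an OmegaConf resolver so the Hydra run directory can be interpolated from the configured working directory. A minimal sketch of a custom resolver with the same fallback rule (keys and paths illustrative):

    from pathlib import Path

    from omegaconf import OmegaConf

    def compute_workdir(working_dir=None) -> str:
        # Same default as DerivaMLConfig.compute_workdir: fall back to ~/deriva-ml.
        return str(Path(working_dir).absolute() if working_dir else Path.home() / "deriva-ml")

    OmegaConf.register_new_resolver("compute_workdir", compute_workdir, replace=True)

    cfg = OmegaConf.create({"working_dir": None, "run_dir": "${compute_workdir:${working_dir}}/hydra"})
    print(cfg.run_dir)  # e.g. /home/alice/deriva-ml/hydra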
deriva_ml/dataset/__init__.py CHANGED
@@ -1,4 +1,17 @@
-from
+from typing import Protocol, runtime_checkable
+
+from deriva_ml.core.definitions import RID
+
+from .aux_classes import DatasetConfig, DatasetConfigList, DatasetSpec, DatasetVersion, VersionPart
 from .dataset import Dataset
+from .dataset_bag import DatasetBag
 
-__all__ = [
+__all__ = [
+    "Dataset",
+    "DatasetSpec",
+    "DatasetConfig",
+    "DatasetConfigList",
+    "DatasetBag",
+    "DatasetVersion",
+    "VersionPart",
+]
deriva_ml/dataset/aux_classes.py CHANGED
@@ -5,6 +5,7 @@ THis module defines the DataSet class with is used to manipulate n
 from enum import Enum
 from typing import Any, Optional, SupportsInt
 
+from hydra_zen import hydrated_dataclass
 from pydantic import (
     BaseModel,
     ConfigDict,
@@ -182,8 +183,9 @@ class DatasetSpec(BaseModel):
     """
 
     rid: RID
-    materialize: bool = True
     version: DatasetVersion | conlist(item_type=int, min_length=3, max_length=3) | tuple[int, int, int] | str
+    materialize: bool = True
+    description: str = ""
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -208,3 +210,20 @@
     @field_serializer("version")
     def serialize_version(self, version: DatasetVersion) -> dict[str, Any]:
         return version.to_dict()
+
+
+@hydrated_dataclass(DatasetSpec)
+class DatasetConfig:
+    rid: str
+    version: str
+    materialize: bool = True
+    description: str = ""
+
+class DatasetList(BaseModel):
+    datasets: list[DatasetSpec]
+    description: str = ""
+
+@hydrated_dataclass(DatasetList)
+class DatasetConfigList:
+    datasets: list[DatasetConfig]
+    description: str = ""
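Note: DatasetConfig and DatasetConfigList are built with hydra-zen's hydrated_dataclass, which emits a structured config whose _target_ is the wrapped class. A small sketch of that mechanism under the same assumption, using a hypothetical Target class:

    from dataclasses import dataclass

    from hydra_zen import hydrated_dataclass, instantiate

    @dataclass
    class Target:
        name: str
        retries: int = 3

    # TargetConf is a config dataclass carrying _target_=Target, so
    # instantiate() rebuilds the real object from the config values.
    @hydrated_dataclass(Target)
    class TargetConf:
        name: str = "demo"
        retries: int = 3

    print(instantiate(TargetConf))  # Target(name='demo', retries=3)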
deriva_ml/dataset/dataset.py CHANGED
@@ -22,10 +22,11 @@ Typical usage example:
 
 from __future__ import annotations
 
-# Standard library imports
 import json
 import logging
 from collections import defaultdict
+
+# Standard library imports
 from graphlib import TopologicalSorter
 from pathlib import Path
 from tempfile import TemporaryDirectory
deriva_ml/dataset/dataset_bag.py CHANGED
@@ -266,6 +266,22 @@
         sql_cmd = f'SELECT * FROM "{feature_table}"'
         return cast(datapath._ResultSet, [dict(zip(col_names, r)) for r in db.execute(sql_cmd).fetchall()])
 
+    def list_dataset_element_types(self) -> list[Table]:
+        """
+        Lists the data types of elements contained within a dataset.
+
+        This method analyzes the dataset and identifies the data types for all
+        elements within it. It is useful for understanding the structure and
+        content of the dataset and allows for better manipulation and usage of its
+        data.
+
+        Returns:
+            list[Table]: A list of tables, one for each element type found in
+            the dataset.
+        """
+        return self.model.list_dataset_element_types()
+
     def list_dataset_children(self, recurse: bool = False) -> list[DatasetBag]:
         """Get nested datasets.
 
@@ -333,6 +349,105 @@
             # Term not found
         raise DerivaMLInvalidTerm(vocab_table, term_name)
 
+    def _denormalize(self, include_tables: list[str] | None) -> str:
+        """
+        Generates an SQL statement for denormalizing the dataset based on the tables to include. Processes cycles in
+        graph relationships, ensures proper join order, and generates selected columns for denormalization.
+
+        Args:
+            include_tables (list[str] | None): List of table names to include in the denormalized dataset. If None,
+                all tables from the dataset will be included.
+
+        Returns:
+            str: SQL query string that performs the denormalization.
+        """
+
+        def column_name(col: Column) -> str:
+            return f'"{self.model.normalize_table_name(col.table.name)}"."{col.name}"'
+
+        # Skip over tables that we don't want to include in the denormalized dataset.
+        # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the
+        # denormalized table.
+        join_tables, tables, denormalized_columns, dataset_rids, dataset_element_tables = (
+            self.model._prepare_wide_table(self, self.dataset_rid, include_tables)
+        )
+
+        select_args = [
+            # SQLite will strip out the table name from the column in the select statement, so we need to add
+            # an explicit alias to the column name.
+            f'"{self.model.normalize_table_name(table_name)}"."{column_name}" AS "{table_name}.{column_name}"'
+            for table_name, column_name in denormalized_columns
+        ]
+
+        # First table in the table list is the table specified in the method call.
+        normalized_join_tables = [self.model.normalize_table_name(t) for t in join_tables]
+        sql_statement = f'SELECT {",".join(select_args)} FROM "{normalized_join_tables[0]}"'
+        for t in normalized_join_tables[1:]:
+            on = tables[t]
+            sql_statement += f' LEFT JOIN "{t}" ON '
+            sql_statement += "OR ".join([f"{column_name(o[0])} = {column_name(o[1])}" for o in on])
+
+        # Select only rows from the datasets you wish to include.
+        dataset_rid_list = ",".join([f'"{self.dataset_rid}"'] + [f'"{b.dataset_rid}"' for b in dataset_rids])
+        sql_statement += f' WHERE "{self.model.normalize_table_name("Dataset")}"."RID" IN ({dataset_rid_list})'
+
+        # Only include rows that have actual values in them.
+        real_row = [f'"{self.model.normalize_table_name(t)}".RID IS NOT NULL ' for t in dataset_element_tables]
+        sql_statement += f" AND ({' OR '.join(real_row)})"
+        return sql_statement
+
+    def denormalize_as_dataframe(self, include_tables: list[str] | None = None) -> pd.DataFrame:
+        """
+        Denormalize the dataset and return the result as a dataframe.
+
+        This routine will examine the domain schema for the dataset, determine which tables to include, and denormalize
+        the dataset values into a single wide table. The result is returned as a dataframe.
+
+        The optional argument include_tables can be used to specify a subset of tables to include in the denormalized
+        view. The tables in this argument can appear anywhere in the dataset schema. The method will determine which
+        additional tables are required to complete the denormalization process. If include_tables is not specified,
+        all of the tables in the schema will be included.
+
+        The resulting wide table will include a column for every table needed to complete the denormalization process.
+
+        Args:
+            include_tables: List of table names to include in the denormalized dataset. If None, then the entire
+                schema is used.
+
+        Returns:
+            Dataframe containing the denormalized dataset.
+        """
+        return pd.read_sql(self._denormalize(include_tables=include_tables), self.database)
+
+    def denormalize_as_dict(self, include_tables: list[str] | None = None) -> Generator[dict[str, Any], None, None]:
+        """
+        Denormalize the dataset and return the result as a set of dictionaries.
+
+        This routine will examine the domain schema for the dataset, determine which tables to include, and denormalize
+        the dataset values into a single wide table. The result is returned as a generator that yields a dictionary
+        for each row in the denormalized wide table.
+
+        The optional argument include_tables can be used to specify a subset of tables to include in the denormalized
+        view. The tables in this argument can appear anywhere in the dataset schema. The method will determine which
+        additional tables are required to complete the denormalization process. If include_tables is not specified,
+        all of the tables in the schema will be included.
+
+        The resulting wide table will include a column for every table needed to complete the denormalization process.
+
+        Args:
+            include_tables: List of table names to include in the denormalized dataset. If None, then the entire
+                schema is used.
+
+        Returns:
+            A generator that returns a dictionary representation of each row in the denormalized dataset.
+        """
+        with self.database as dbase:
+            cursor = dbase.execute(self._denormalize(include_tables=include_tables))
+            columns = [desc[0] for desc in cursor.description]
+            for row in cursor:
+                yield dict(zip(columns, row))
+
 
 # Add annotations after definition to deal with forward reference issues in pydantic
 
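Note: the denormalization methods build a single SELECT with LEFT JOINs and alias every output column as "Table.Column". A toy, self-contained version of that query shape against an in-memory SQLite database (schema and data invented for illustration):

    import sqlite3

    import pandas as pd

    db = sqlite3.connect(":memory:")
    db.executescript(
        """
        CREATE TABLE Subject (RID TEXT PRIMARY KEY, Name TEXT);
        CREATE TABLE Image (RID TEXT PRIMARY KEY, Subject TEXT REFERENCES Subject(RID));
        INSERT INTO Subject VALUES ('S1', 'alice');
        INSERT INTO Image VALUES ('I1', 'S1'), ('I2', 'S1');
        """
    )
    # Wide-table query: columns aliased as "Table.Column", joined along the foreign key.
    sql = (
        'SELECT "Subject"."RID" AS "Subject.RID", "Subject"."Name" AS "Subject.Name", '
        '"Image"."RID" AS "Image.RID" '
        'FROM "Subject" LEFT JOIN "Image" ON "Subject"."RID" = "Image"."Subject"'
    )
    print(pd.read_sql(sql, db))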
deriva_ml/dataset/upload.py CHANGED
@@ -412,6 +412,7 @@ def asset_file_path(
         "Description",
     }.union(set(DerivaSystemColumns))
     asset_metadata = {c.name for c in asset_table.columns} - asset_columns
+
     if not (asset_metadata >= set(metadata.keys())):
         raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")
 
deriva_ml/demo_catalog.py CHANGED
@@ -367,7 +367,7 @@ def create_demo_catalog(
     create_features=False,
     create_datasets=False,
     on_exit_delete=True,
-    logging_level=logging.
+    logging_level=logging.WARNING,
 ) -> ErmrestCatalog:
     test_catalog = create_ml_catalog(hostname, project_name=project_name)
     if on_exit_delete:
deriva_ml/execution/__init__.py CHANGED
@@ -0,0 +1,25 @@
+from typing import TYPE_CHECKING
+
+# Safe imports - no circular dependencies
+from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+from deriva_ml.execution.workflow import Workflow
+
+if TYPE_CHECKING:
+    from deriva_ml.execution.execution import Execution
+
+
+# Lazy import for runtime
+def __getattr__(name):
+    """Lazy import to avoid circular dependencies."""
+    if name == "Execution":
+        from deriva_ml.execution.execution import Execution
+
+        return Execution
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = [
+    "Execution",  # Lazy-loaded
+    "ExecutionConfiguration",
+    "Workflow",
+]
deriva_ml/execution/execution.py CHANGED
@@ -41,7 +41,6 @@ from deriva_ml.core.base import DerivaML
 from deriva_ml.core.definitions import (
     DRY_RUN_RID,
     RID,
-    ExecAssetType,
     ExecMetadataType,
     FileSpec,
     FileUploadState,
@@ -198,7 +197,6 @@ class Execution:
         workflow_rid (RID): RID of the associated workflow.
         status (Status): Current execution status.
         asset_paths (list[AssetFilePath]): Paths to execution assets.
-        parameters (dict): Execution parameters.
         start_time (datetime | None): When execution started.
         stop_time (datetime | None): When execution completed.
 
@@ -206,7 +204,6 @@
         >>> config = ExecutionConfiguration(
         ...     workflow="analysis",
         ...     description="Process samples",
-        ...     parameters={"threshold": 0.5}
         ... )
         >>> with ml.create_execution(config) as execution:
         ...     execution.download_dataset_bag(dataset_spec)
@@ -250,7 +247,6 @@
 
         self.dataset_rids: List[RID] = []
         self.datasets: list[DatasetBag] = []
-        self.parameters = self.configuration.parameters
 
         self._working_dir = self._ml_object.working_dir
         self._cache_dir = self._ml_object.cache_dir
@@ -292,9 +288,18 @@
             ]
         )[0]["RID"]
 
-        if
-            # Put execution_rid into the
-
+        if rid_path := os.environ.get("DERIVA_ML_SAVE_EXECUTION_RID", None):
+            # Put execution_rid into the provided file path so we can find it later.
+            with Path(rid_path).open("w") as f:
+                json.dump(
+                    {
+                        "hostname": self._ml_object.host_name,
+                        "catalog_id": self._ml_object.catalog_id,
+                        "workflow_rid": self.workflow_rid,
+                        "execution_rid": self.execution_rid,
+                    },
+                    f,
+                )
 
         # Create a directory for execution rid so we can recover the state in case of a crash.
         execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
@@ -302,13 +307,28 @@
 
     def _save_runtime_environment(self):
         runtime_env_path = self.asset_file_path(
-            "Execution_Metadata",
-            f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
-            ExecMetadataType.runtime_env.value,
+            asset_name="Execution_Metadata",
+            file_name=f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
+            asset_types=ExecMetadataType.runtime_env.value,
         )
         with Path(runtime_env_path).open("w") as fp:
            json.dump(get_execution_environment(), fp)
 
+    def _upload_hydra_config_assets(self):
+        """Upload hydra assets to the catalog."""
+        hydra_runtime_output_dir = self._ml_object.hydra_runtime_output_dir
+        if hydra_runtime_output_dir:
+            timestamp = hydra_runtime_output_dir.parts[-1]
+            for hydra_asset in hydra_runtime_output_dir.rglob("*"):
+                if hydra_asset.is_dir():
+                    continue
+                asset = self.asset_file_path(
+                    asset_name=MLAsset.execution_metadata,
+                    file_name=hydra_runtime_output_dir / hydra_asset,
+                    rename_file=f"hydra-{timestamp}-{hydra_asset.name}",
+                    asset_types=ExecMetadataType.execution_config.value,
+                )
+
     def _initialize_execution(self, reload: RID | None = None) -> None:
         """Initialize the execution by a configuration in the Execution_Metadata table.
         Set up a working directory and download all the assets and data.
@@ -354,9 +374,9 @@
         # Save configuration details for later upload
         if not reload:
             cfile = self.asset_file_path(
-                MLAsset.execution_metadata,
-                "configuration.json",
-                ExecMetadataType.execution_config.value,
+                asset_name=MLAsset.execution_metadata,
+                file_name="configuration.json",
+                asset_types=ExecMetadataType.execution_config.value,
             )
             with Path(cfile).open("w", encoding="utf-8") as config_file:
                 json.dump(self.configuration.model_dump(), config_file)
@@ -364,24 +384,18 @@
         lock_file = Path(self.configuration.workflow.git_root) / "uv.lock"
         if lock_file.exists():
             _ = self.asset_file_path(
-                MLAsset.execution_metadata,
-                lock_file,
-                ExecMetadataType.execution_config.value,
+                asset_name=MLAsset.execution_metadata,
+                file_name=lock_file,
+                asset_types=ExecMetadataType.execution_config.value,
             )
 
-
-        self.asset_file_path(
-            MLAsset.execution_asset,
-            parameter_file,
-            ExecAssetType.input_file.value,
-        )
+        self._upload_hydra_config_assets()
 
         # save runtime env
         self._save_runtime_environment()
 
         # Now upload the files so we have the info in case the execution fails.
         self.uploaded_assets = self._upload_execution_dirs()
-
         self.start_time = datetime.now()
         self.update_status(Status.pending, "Initialize status finished.")
 
@@ -856,6 +870,7 @@
         file_name: str | Path,
         asset_types: list[str] | str | None = None,
         copy_file=False,
+        rename_file: str | None = None,
         **kwargs,
     ) -> AssetFilePath:
         """Return a pathlib Path to the directory in which to place files for the specified execution_asset type.
@@ -875,6 +890,8 @@
             asset_name: Type of asset to be uploaded. Must be a term in Asset_Type controlled vocabulary.
             file_name: Name of file to be uploaded.
             asset_types: Type of asset to be uploaded. Defaults to the name of the asset.
+            copy_file: Whether to copy the file rather than creating a symbolic link.
+            rename_file: If provided, the file will be renamed to this name if the file already exists.
             **kwargs: Any additional metadata values that may be part of the asset table.
 
         Returns:
@@ -893,12 +910,15 @@
         for t in asset_types:
            self._ml_object.lookup_term(MLVocab.asset_type, t)
 
+        # Determine if we will need to rename an existing file as the asset.
         file_name = Path(file_name)
+        target_name = Path(rename_file) if file_name.exists() and rename_file else file_name
+
         asset_path = asset_file_path(
             prefix=self._working_dir,
             exec_rid=self.execution_rid,
             asset_table=self._model.name_to_table(asset_name),
-            file_name=
+            file_name=target_name.name,
             metadata=kwargs,
         )
 
@@ -914,12 +934,12 @@
 
         # Persist the asset types into a file
         with Path(asset_type_path(self._working_dir, self.execution_rid, asset_table)).open("a") as asset_type_file:
-            asset_type_file.write(json.dumps({
+            asset_type_file.write(json.dumps({target_name.name: asset_types}) + "\n")
 
         return AssetFilePath(
             asset_path=asset_path,
             asset_name=asset_name,
-            file_name=
+            file_name=target_name.name,
             asset_metadata=kwargs,
             asset_types=asset_types,
         )
deriva_ml/execution/execution_configuration.py CHANGED
@@ -30,7 +30,7 @@ from typing import Any
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 
 from deriva_ml.core.definitions import RID
-from deriva_ml.dataset.aux_classes import DatasetSpec
+from deriva_ml.dataset.aux_classes import DatasetList, DatasetSpec
 from deriva_ml.execution.workflow import Workflow
 
 
@@ -64,45 +64,21 @@ class ExecutionConfiguration(BaseModel):
     ... )
     """
 
-    datasets: list[DatasetSpec] = []
+    datasets: list[DatasetSpec] | DatasetList = []
     assets: list[RID] = []
     workflow: RID | Workflow
-    parameters: dict[str, Any] | Path = {}
     description: str = ""
     argv: list[str] = Field(default_factory=lambda: sys.argv)
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    @field_validator("
+    @field_validator("datasets", mode="before")
     @classmethod
-    def
-
-
-
-
-        Args:
-            value: Parameter value to validate, either:
-                - Dictionary of parameters
-                - Path to JSON file
-                - String path to JSON file
-
-        Returns:
-            dict[str, Any]: Validated parameter dictionary.
-
-        Raises:
-            ValueError: If JSON file is invalid or cannot be read.
-            FileNotFoundError: If parameter file doesn't exist.
-
-        Example:
-            >>> config = ExecutionConfiguration(parameters="params.json")
-            >>> print(config.parameters)  # Contents of params.json as dict
-        """
-        if isinstance(value, str) or isinstance(value, Path):
-            with Path(value).open("r") as f:
-                return json.load(f)
-        else:
-            return value
+    def validate_datasets(cls, value: Any) -> Any:
+        if isinstance(value, DatasetList):
+            config_list: DatasetList = value
+            value = config_list.datasets
+        return value
 
     @field_validator("workflow", mode="before")
     @classmethod
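Note: the execution RID is now handed off through a file named by the DERIVA_ML_SAVE_EXECUTION_RID environment variable rather than scraped from notebook output. A self-contained sketch of the round trip (hostname and RID values illustrative):

    import json
    import os
    import tempfile
    from pathlib import Path

    with tempfile.TemporaryDirectory() as tmp:
        rid_path = Path(tmp) / "execution_rid.json"
        os.environ["DERIVA_ML_SAVE_EXECUTION_RID"] = rid_path.as_posix()

        # ...what the notebook-side Execution does, per the diff above:
        if save_to := os.environ.get("DERIVA_ML_SAVE_EXECUTION_RID"):
            Path(save_to).write_text(
                json.dumps({"hostname": "demo.derivacloud.org", "catalog_id": "1",
                            "workflow_rid": "2-ABC0", "execution_rid": "2-ABC4"})
            )

        # ...what the runner does after papermill returns:
        info = json.loads(rid_path.read_text())
        print(info["execution_rid"])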
deriva_ml/model/catalog.py CHANGED
@@ -9,6 +9,7 @@ from __future__ import annotations
 
 # Standard library imports
 from collections import Counter
+from graphlib import CycleError, TopologicalSorter
 from typing import Any, Callable, Final, Iterable, NewType, TypeAlias
 
 from deriva.core.ermrest_catalog import ErmrestCatalog
@@ -21,6 +22,7 @@ from pydantic import ConfigDict, validate_call
 
 from deriva_ml.core.definitions import (
     ML_SCHEMA,
+    RID,
     DerivaAssetColumns,
     TableDefinition,
 )
@@ -28,6 +30,7 @@ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLTableTypeError
 
 # Local imports
 from deriva_ml.feature import Feature
+from deriva_ml.protocols.dataset import DatasetLike
 
 try:
     from icecream import ic
@@ -287,6 +290,113 @@
         else:
             self.model.apply()
 
+    def list_dataset_element_types(self) -> list[Table]:
+        """
+        Lists the data types of elements contained within a dataset.
+
+        This method analyzes the dataset and identifies the data types for all
+        elements within it. It is useful for understanding the structure and
+        content of the dataset and allows for better manipulation and usage of its
+        data.
+
+        Returns:
+            list[Table]: A list of tables, one for each element type found in
+            the dataset.
+        """
+        dataset_table = self.name_to_table("Dataset")
+
+        def domain_table(table: Table) -> bool:
+            return table.schema.name == self.domain_schema or table.name == dataset_table.name
+
+        return [t for a in dataset_table.find_associations() if domain_table(t := a.other_fkeys.pop().pk_table)]
+
+    def _prepare_wide_table(self, dataset: DatasetLike, dataset_rid: RID, include_tables: list[str] | None) -> tuple:
+        """
+        Generates the details of a wide table from the model.
+
+        Args:
+            include_tables (list[str] | None): List of table names to include in the denormalized dataset. If None,
+                all tables from the dataset will be included.
+
+        Returns:
+            tuple: The join order, the join conditions for each table, the columns of the wide table, the RIDs of
+                nested datasets, and the names of the dataset element tables.
+        """
+        # Skip over tables that we don't want to include in the denormalized dataset.
+        # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the
+        # denormalized table.
+        include_tables = set(include_tables) if include_tables else set()
+        for t in include_tables:
+            # Check to make sure the table is in the catalog.
+            _ = self.name_to_table(t)
+
+        table_paths = [
+            path
+            for path in self._schema_to_paths()
+            if (not include_tables) or include_tables.intersection({p.name for p in path})
+        ]
+
+        # Get the names of all of the tables that can be dataset elements.
+        dataset_element_tables = {
+            e.name for e in self.list_dataset_element_types() if e.schema.name == self.domain_schema
+        }
+
+        skip_columns = {"RCT", "RMT", "RCB", "RMB"}
+        tables = {}
+        graph = {}
+        for path in table_paths:
+            for left, right in zip(path[0:], path[1:]):
+                graph.setdefault(left.name, set()).add(right.name)
+
+        # Now let's remove any cycles that we may have in the graph.
+        # We will use a topological sort to find the order in which we need to join the tables.
+        # If we find a cycle, we will remove the table from the graph and splice in an additional ON clause.
+        # We will then repeat the process until there are no cycles.
+        graph_has_cycles = True
+        join_tables = []
+        while graph_has_cycles:
+            try:
+                ts = TopologicalSorter(graph)
+                join_tables = list(reversed(list(ts.static_order())))
+                graph_has_cycles = False
+            except CycleError as e:
+                cycle_nodes = e.args[1]
+                if len(cycle_nodes) > 3:
+                    raise DerivaMLException(f"Unexpected cycle found when normalizing dataset {cycle_nodes}")
+                # Remove cycle from graph and splice in additional ON constraint.
+                graph[cycle_nodes[1]].remove(cycle_nodes[0])
+
+        # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+        if "Dataset_Version" in join_tables:
+            join_tables.remove("Dataset_Version")
+
+        for path in table_paths:
+            for left, right in zip(path[0:], path[1:]):
+                if right.name == "Dataset_Version":
+                    # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+                    continue
+                if join_tables.index(right.name) < join_tables.index(left.name):
+                    continue
+                table_relationship = self._table_relationship(left, right)
+                tables.setdefault(self.normalize_table_name(right.name), set()).add(
+                    (table_relationship[0], table_relationship[1])
+                )
+
+        # Get the list of columns that will appear in the final denormalized dataset.
+        denormalized_columns = [
+            (table_name, c.name)
+            for table_name in join_tables
+            if not self.is_association(table_name)  # Don't include association columns in the denormalized view.
+            for c in self.name_to_table(table_name).columns
+            if c.name not in skip_columns
+        ]
+
+        # List of dataset ids to include in the denormalized view.
+        dataset_rids = dataset.list_dataset_children(recurse=True)
+        return join_tables, tables, denormalized_columns, dataset_rids, dataset_element_tables
+
     def _table_relationship(
         self,
         table1: TableInput,
@@ -302,7 +412,9 @@
             [(fk.referenced_columns[0], fk.foreign_key_columns[0]) for fk in table1.referenced_by if fk.table == table2]
         )
         if len(relationships) != 1:
-            raise DerivaMLException(
+            raise DerivaMLException(
+                f"Ambiguous linkage between {table1.name} and {table2.name}: {[(r[0].name, r[1].name) for r in relationships]}"
+            )
         return relationships[0]
 
     def _schema_to_paths(
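Note: _prepare_wide_table breaks join-graph cycles by retrying graphlib's topological sort and deleting one edge of each reported cycle. A toy version of that loop, using only the standard library:

    from graphlib import CycleError, TopologicalSorter

    graph = {"A": {"B"}, "B": {"C"}, "C": {"A"}}  # node -> predecessors; one cycle

    order = None
    while order is None:
        try:
            order = list(reversed(list(TopologicalSorter(graph).static_order())))
        except CycleError as e:
            cycle = e.args[1]  # e.g. ['A', 'C', 'B', 'A']; consecutive pairs are edges
            graph[cycle[1]].discard(cycle[0])  # drop one edge of the reported cycle

    print(order)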
deriva_ml/model/database.py CHANGED
@@ -226,7 +226,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         indexes: A tuple whose first element is the column index of the file name and whose second element
             is the index of the URL in an asset table. Tuple is None if table is not an asset table.
         o: list:
-        indexes: Optional[tuple[int
+        indexes: Optional[tuple[int, int]]:
 
         Returns:
             Tuple of updated column values.
@@ -262,7 +262,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
             DatasetBag object for the specified dataset.
         """
         if dataset_rid and dataset_rid not in self.bag_rids:
-            DerivaMLException(f"Dataset RID {dataset_rid} is not in model.")
+            raise DerivaMLException(f"Dataset RID {dataset_rid} is not in model.")
         return DatasetBag(self, dataset_rid or self.dataset_rid)
 
     def dataset_version(self, dataset_rid: Optional[RID] = None) -> DatasetVersion:
deriva_ml/protocols/dataset.py ADDED
@@ -0,0 +1,19 @@
+"""A module defining the DatasetLike protocol for dataset operations.
+
+This module contains the definition of the DatasetLike protocol, which
+provides an interface for datasets to implement specific functionality related
+to listing dataset children. It is particularly useful for ensuring type
+compatibility for objects that mimic datasets in their behavior.
+
+Classes:
+    DatasetLike: A protocol that specifies methods required for dataset-like
+        objects.
+"""
+from typing import Protocol, runtime_checkable
+
+from deriva_ml.core.definitions import RID
+
+
+@runtime_checkable
+class DatasetLike(Protocol):
+    def list_dataset_children(self, dataset_rid: RID, recurse: bool = False) -> list[RID]: ...
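Note: DatasetLike is a runtime_checkable Protocol, so conformance is checked structurally. A minimal sketch of the mechanism, and its caveat:

    from typing import Protocol, runtime_checkable

    @runtime_checkable
    class Quacker(Protocol):
        def quack(self) -> str: ...

    class Duck:
        def quack(self) -> str:
            return "quack"

    # isinstance() on a runtime_checkable Protocol checks method *presence*
    # only; signatures and return types are not verified at runtime.
    print(isinstance(Duck(), Quacker))  # True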
deriva_ml/run_notebook.py CHANGED
@@ -3,14 +3,13 @@
 import json
 import os
 import tempfile
-from datetime import datetime
 from pathlib import Path
 
 import nbformat
 import papermill as pm
-import regex as re
 import yaml
 from deriva.core import BaseCLI
+from jupyter_client.kernelspec import KernelSpecManager
 from nbconvert import MarkdownExporter
 
 from deriva_ml import DerivaML, ExecAssetType, Execution, ExecutionConfiguration, MLAsset, Workflow
@@ -44,13 +43,6 @@ class DerivaMLRunNotebookCLI(BaseCLI):
             help="Display logging output from notebook.",
         )
 
-        self.parser.add_argument(
-            "--catalog",
-            metavar="<1>",
-            default=1,
-            help="Catalog number. Default 1",
-        )
-
         self.parser.add_argument(
             "--parameter",
             "-p",
@@ -61,7 +53,13 @@ class DerivaMLRunNotebookCLI(BaseCLI):
             help="Provide a parameter name and value to inject into the notebook.",
         )
 
-        self.parser.add_argument(
+        self.parser.add_argument(
+            "--kernel",
+            "-k",
+            type=str,
+            help="Name of kernel to run.",
+            default=self._find_kernel_for_venv(),
+        )
 
     @staticmethod
     def _coerce_number(val: str):
@@ -100,26 +98,50 @@ class DerivaMLRunNotebookCLI(BaseCLI):
             print(f"Notebook file must be an ipynb file: {notebook_file.name}.")
             exit(1)
 
-        os.environ["DERIVA_HOST"] = args.host
-        os.environ["DERIVA_CATALOG"] = args.catalog
-
         # Create a workflow instance for this specific version of the script.
         # Return an existing workflow if one is found.
         notebook_parameters = pm.inspect_notebook(notebook_file)
+
         if args.inspect:
             for param, value in notebook_parameters.items():
                 print(f"{param}:{value['inferred_type_name']} (default {value['default']})")
             return
         else:
-            notebook_parameters = (
-
-
-
-
-
-
-
+            notebook_parameters = {k: v["default"] for k, v in notebook_parameters.items()} | parameters
+            self.run_notebook(notebook_file.resolve(), parameters, kernel=args.kernel, log=args.log_output)
+
+    @staticmethod
+    def _find_kernel_for_venv() -> str | None:
+        """
+        Return the name of an existing Jupyter kernel whose interpreter matches
+        the currently active Python virtual environment (VIRTUAL_ENV).
+
+        Returns
+        -------
+        str | None
+            The kernel name if found, or None if not found.
+        """
+        venv = os.environ.get("VIRTUAL_ENV")
+        if not venv:
+            return None
+        venv_path = Path(venv).resolve()
+        ksm = KernelSpecManager()
+        for name, spec in ksm.get_all_specs().items():
+            kernel_json = spec.get("spec", {})
+            argv = kernel_json.get("argv", [])
+            # check for python executable path inside argv
+            for arg in argv:
+                try:
+                    if Path(arg).resolve() == venv_path.joinpath("bin", "python").resolve():
+                        return name
+                except Exception:
+                    continue
+        return None
 
     def run_notebook(self, notebook_file: Path, parameters, kernel=None, log=False):
         url, checksum = Workflow.get_url_and_checksum(Path(notebook_file))
@@ -127,8 +149,9 @@ class DerivaMLRunNotebookCLI(BaseCLI):
         os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"] = checksum
         os.environ["DERIVA_ML_NOTEBOOK_PATH"] = notebook_file.as_posix()
         with tempfile.TemporaryDirectory() as tmpdirname:
-            print(f"Running notebook {notebook_file.name} with parameters:")
             notebook_output = Path(tmpdirname) / Path(notebook_file).name
+            execution_rid_path = Path(tmpdirname) / "execution_rid.json"
+            os.environ["DERIVA_ML_SAVE_EXECUTION_RID"] = execution_rid_path.as_posix()
             pm.execute_notebook(
                 input_path=notebook_file,
                 output_path=notebook_output,
@@ -137,22 +160,19 @@ class DerivaMLRunNotebookCLI(BaseCLI):
                 log_output=log,
             )
             print(f"Notebook output saved to {notebook_output}")
-
-
-
-
-                r"Execution RID: https://(?P<host>.*)/id/(?P<catalog_id>.*)/(?P<execution_rid>[\w-]+)",
-                line,
-            ):
-                hostname = m["host"]
-                catalog_id = m["catalog_id"]
-                execution_rid = m["execution_rid"]
-            if not execution_rid:
+            with execution_rid_path.open("r") as f:
+                execution_config = json.load(f)
+
+            if not execution_config:
                 print("Execution RID not found.")
                 exit(1)
 
+            execution_rid = execution_config["execution_rid"]
+            hostname = execution_config["hostname"]
+            catalog_id = execution_config["catalog_id"]
+            workflow_rid = execution_config["workflow_rid"]
             ml_instance = DerivaML(hostname=hostname, catalog_id=catalog_id, working_dir=tmpdirname)
-            workflow_rid = ml_instance.retrieve_rid(execution_rid)["Workflow"]
+            workflow_rid = ml_instance.retrieve_rid(execution_config["execution_rid"])["Workflow"]
 
             execution = Execution(
                 configuration=ExecutionConfiguration(workflow=workflow_rid),
@@ -183,21 +203,6 @@ class DerivaMLRunNotebookCLI(BaseCLI):
                 file_name=notebook_output_md,
                 asset_types=ExecAssetType.notebook_output,
             )
-            execution.asset_file_path(
-                asset_name=MLAsset.execution_asset,
-                file_name=notebook_output_md,
-                asset_types=ExecAssetType.notebook_output,
-            )
-            print("parameter....")
-
-            parameter_file = execution.asset_file_path(
-                asset_name=MLAsset.execution_asset,
-                file_name=f"notebook-parameters-{datetime.now().strftime('%Y%m%d-%H%M%S')}.json",
-                asset_types=ExecAssetType.input_file.value,
-            )
-
-            with Path(parameter_file).open("w") as f:
-                json.dump(parameters, f)
             execution.upload_execution_outputs()
 
             print(ml_instance.cite(execution_rid))
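Note: kernel discovery walks jupyter_client's installed kernelspecs and matches the interpreter each spec's argv launches. A sketch of the same lookup (interpreter path illustrative):

    from pathlib import Path

    from jupyter_client.kernelspec import KernelSpecManager

    # get_all_specs() maps kernel names to {"resource_dir": ..., "spec": <kernel.json>};
    # the interpreter the kernel launches appears in spec["argv"].
    target = Path("/home/alice/.venv/bin/python")  # interpreter we want a kernel for

    for name, spec in KernelSpecManager().get_all_specs().items():
        argv = spec.get("spec", {}).get("argv", [])
        if any(Path(arg) == target for arg in argv if arg.startswith("/")):
            print(f"kernel {name!r} runs {target}")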
deriva_ml/schema/annotations.py CHANGED
@@ -1,4 +1,5 @@
 import argparse
+import sys
 
 from deriva.core.ermrest_model import Model, Table
 from deriva.core.utils.core_utils import tag as deriva_tags
@@ -183,10 +184,10 @@ def catalog_annotation(model: DerivaModel) -> None:
 
 def asset_annotation(asset_table: Table):
     """Generate annotations for an asset table.
-
+
     Args:
         asset_table: The Table object representing the asset table.
-
+
     Returns:
         A dictionary containing the annotations for the asset table.
     """
@@ -316,7 +317,8 @@ def generate_annotation(model: Model, schema: str) -> dict:
             },
             {
                 "source": [
-                    {"inbound": [schema, "
+                    {"inbound": [schema, "Execution_Metadata_Execution_Execution_fkey"]},
+                    {"outbound": [schema, "Execution_Metadata_Execution_Execution_Metadata_fkey"]},
                     "RID",
                 ],
                 "markdown_name": "Execution Metadata",
@@ -453,9 +455,9 @@
 
 def main():
     """Main entry point for the annotations CLI.
-
+
     Applies annotations to the ML schema based on command line arguments.
-
+
     Returns:
         None. Executes the CLI.
     """
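Note: the visible-columns entry added above is a two-hop ERMrest source path that reaches Execution_Metadata from Execution through the association table's two foreign keys. A sketch of the entry's shape (schema name illustrative):

    # Rows of Execution follow the association table's inbound key, then its
    # outbound key, ending at the RID of the related Execution_Metadata rows.
    source_entry = {
        "source": [
            {"inbound": ["deriva-ml", "Execution_Metadata_Execution_Execution_fkey"]},
            {"outbound": ["deriva-ml", "Execution_Metadata_Execution_Execution_Metadata_fkey"]},
            "RID",
        ],
        "markdown_name": "Execution Metadata",
    }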
{deriva_ml-1.14.47.dist-info → deriva_ml-1.16.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deriva-ml
-Version: 1.14.47
+Version: 1.16.0
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
@@ -21,6 +21,7 @@
 Requires-Dist: papermill
 Requires-Dist: pandas-stubs==2.2.3.250527
 Requires-Dist: pyyaml
+Requires-Dist: hydra_zen
 Dynamic: license-file
 
 # DerivaML
deriva_ml-1.16.0.dist-info/RECORD ADDED
@@ -0,0 +1,44 @@
+deriva_ml/__init__.py,sha256=Yt8q0WbLFt7fbRLZe_f0bJWy1Qo6vidQzlYWQoT8U7o,2097
+deriva_ml/bump_version.py,sha256=eN2G5G_OeiuFxhOdjjwfxD8Rmv6dFvzIm0y_1x4Mif4,4020
+deriva_ml/demo_catalog.py,sha256=6hlSVGNQ364chisKvSyMy2BBxzhQq1mLPPlW324eca4,14931
+deriva_ml/feature.py,sha256=6-aphkxdKjWa9oPSGFWxHcwAc_8hmWj-7I4M178YG5Y,8470
+deriva_ml/install_kernel.py,sha256=b62XY0SLViYO_Zye5r1Pl9qhYZyu_fk4KAO8NS1pxgM,2165
+deriva_ml/run_notebook.py,sha256=_pds1q3WcfWqhCBqKeznbwSv5n7OND8FkL6JQ2Jkfmc,8093
+deriva_ml/core/__init__.py,sha256=Ko8GsWc7K_eDFW0-GaNS6gOWYP8cWHWir-ChSQaHntE,856
+deriva_ml/core/base.py,sha256=xsz1h5QZVE7PCVZiCt7lRV43Dupq9c7elUsbGk3QHJQ,61919
+deriva_ml/core/config.py,sha256=dF4rOLFmbk1DEkQimqbiH4pC519nRZWpwKItARNMiZ4,2244
+deriva_ml/core/constants.py,sha256=6wBJ8qMxe-dbCjRGrjUIX-RK0mTWrLDTeUpaVbLFoM8,888
+deriva_ml/core/definitions.py,sha256=uq_8uYFBVBVHS691Ri2kdQsN37z0GNYTaZskJIb_ocM,1385
+deriva_ml/core/enums.py,sha256=sSN4B4OynbB-AXwxRszoFr-KWIWIAfhVa06EzAEHwVc,7194
+deriva_ml/core/ermrest.py,sha256=N0IJ3TE87jElaBChEIo5AFDTr0SIrb6F90yiimRfPr4,10182
+deriva_ml/core/exceptions.py,sha256=4MZNPOyN-UMaGeY9sqJDVwh_iOmz1ntp4usSyCNqVMg,934
+deriva_ml/core/filespec.py,sha256=BQAAcRXfXq1lDcsKlokLOOXCBtEZpPgXxrFOIZYAgLg,4229
+deriva_ml/dataset/__init__.py,sha256=tV3yK9tb8iB9f5P3ml459bP2uPWJhCJcplhmbGVtoMI,411
+deriva_ml/dataset/aux_classes.py,sha256=K-cVBrZY1j0ZO__FORHRVdVz3O69OgvhO5YkhwJJyxE,7348
+deriva_ml/dataset/dataset.py,sha256=c6hGsIH9UOn8ayDP7EsYzqgKeZm2Kr7naliPLQxGtSg,64473
+deriva_ml/dataset/dataset_bag.py,sha256=peFEMU8PfExbzJ0VJGIL3QDIPz0stmUR7daCXptA3f4,20256
+deriva_ml/dataset/history.py,sha256=FK5AYYz11p4E4FWMVg4r7UPWOD4eobrq3b3xMjWF59g,3197
+deriva_ml/dataset/upload.py,sha256=Q9bNVv6xTK_IpwFOU_ugq33IWRs0AWyFoF8Rzwi6OVs,16430
+deriva_ml/execution/__init__.py,sha256=Zs-ZNmwrJJW6suJilzh3vdcPvzI8HIA0Ym0VUwuiQME,668
+deriva_ml/execution/environment.py,sha256=B7nywqxFTRUWgyu8n7rFoKcVC9on422kjeFG2FPQfvg,9302
+deriva_ml/execution/execution.py,sha256=X4HBADT_F5ZuER8qBcnNYqRUuMU3BaEV7rMgXEUrLCg,46096
+deriva_ml/execution/execution_configuration.py,sha256=oWgBueuFO0-PBm9LM08EQeFeY9IXF8tVbd3LyRsTiNw,5437
+deriva_ml/execution/workflow.py,sha256=7CwPrgs3FKQHiEVus0PpK9w5hVKLKZnCrlu_nT8GFe8,13604
+deriva_ml/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+deriva_ml/model/catalog.py,sha256=TY6QdlhZX7OL5bhWcGkAFpZNaZye5l_rkb1Cih-bTjs,19180
+deriva_ml/model/database.py,sha256=KEPJKIlmIxTiF4Th1NgpuuuMBhbfsgsd_k8UHs-hMg4,14843
+deriva_ml/model/sql_mapper.py,sha256=_0QsJEVSgSPtxrWKSgjfPZCQ1aMVcjR_Tk2OxLhWEvY,1696
+deriva_ml/protocols/dataset.py,sha256=1TyaT--89Elcs-nCvVyJxUj4cDaLztZOuSOzzj1cBMk,699
+deriva_ml/schema/__init__.py,sha256=yV-MfzCF3FA4OOz7mZwMM2q6-x1vgOJ057kUvikFF6E,130
+deriva_ml/schema/annotations.py,sha256=CMcRqYUlyW8iLCYp6sYJsncaRNtp4kFKoxcg-i-t-50,18302
+deriva_ml/schema/check_schema.py,sha256=6dadLYHPqRex6AYVClmsESI8WhC7-rb-XnGf2G298xw,3609
+deriva_ml/schema/create_schema.py,sha256=9qK9_8SRQT-DwcEwTGSkhi3j2NaoH5EVgthvV2kO-gg,13042
+deriva_ml/schema/deriva-ml-reference.json,sha256=AEOMIgwKO3dNMMWHb0lxaXyamvfAEbUPh8qw0aAtsUQ,242460
+deriva_ml/schema/policy.json,sha256=5ykB8nnZFl-oCHzlAwppCFKJHWJFIkYognUMVEanfY8,1826
+deriva_ml/schema/table_comments_utils.py,sha256=4flCqnZAaqg_uSZ9I18pNUWAZoLfmMCXbmI5uERY5vM,2007
+deriva_ml-1.16.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deriva_ml-1.16.0.dist-info/METADATA,sha256=gN7KnQ1MDdqSSaVJOIKY-lBEwEE8s0bRMoVLrZGYgtA,1214
+deriva_ml-1.16.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+deriva_ml-1.16.0.dist-info/entry_points.txt,sha256=XsHSbfp7S1cKMjHoPUdFIaFcp9lHXHS6CV1zb_MEXkg,463
+deriva_ml-1.16.0.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+deriva_ml-1.16.0.dist-info/RECORD,,
deriva_ml-1.14.47.dist-info/RECORD DELETED
@@ -1,42 +0,0 @@
-deriva_ml/__init__.py,sha256=_aMdxGG4mRTcXodLZLNpXqH8v5uqMbqFUryE9KqNSB8,1158
-deriva_ml/bump_version.py,sha256=eN2G5G_OeiuFxhOdjjwfxD8Rmv6dFvzIm0y_1x4Mif4,4020
-deriva_ml/demo_catalog.py,sha256=JjPAIac_hKPh5krEhGJydjXquRnivi7kQoR8W4Khp-s,14928
-deriva_ml/feature.py,sha256=6-aphkxdKjWa9oPSGFWxHcwAc_8hmWj-7I4M178YG5Y,8470
-deriva_ml/install_kernel.py,sha256=b62XY0SLViYO_Zye5r1Pl9qhYZyu_fk4KAO8NS1pxgM,2165
-deriva_ml/run_notebook.py,sha256=QRO_CK9Q9qt_n-c0rxGdIRyTHjGOuZxt-wj0WQTnaAM,8171
-deriva_ml/core/__init__.py,sha256=V_i90pc5PB1F4UdOO6DZWzpEFaZDTaPRU-EzKXQ19eI,787
-deriva_ml/core/base.py,sha256=LI_ZLpVJwWx4DW2Wo7luALQauQ3xhBxFYHSKDAfNsag,61649
-deriva_ml/core/constants.py,sha256=6wBJ8qMxe-dbCjRGrjUIX-RK0mTWrLDTeUpaVbLFoM8,888
-deriva_ml/core/definitions.py,sha256=uq_8uYFBVBVHS691Ri2kdQsN37z0GNYTaZskJIb_ocM,1385
-deriva_ml/core/enums.py,sha256=sSN4B4OynbB-AXwxRszoFr-KWIWIAfhVa06EzAEHwVc,7194
-deriva_ml/core/ermrest.py,sha256=N0IJ3TE87jElaBChEIo5AFDTr0SIrb6F90yiimRfPr4,10182
-deriva_ml/core/exceptions.py,sha256=4MZNPOyN-UMaGeY9sqJDVwh_iOmz1ntp4usSyCNqVMg,934
-deriva_ml/core/filespec.py,sha256=BQAAcRXfXq1lDcsKlokLOOXCBtEZpPgXxrFOIZYAgLg,4229
-deriva_ml/dataset/__init__.py,sha256=ukl2laJqa9J2AVqb4zlpIYc-3RaAlfRR33NMIQaoNrQ,104
-deriva_ml/dataset/aux_classes.py,sha256=9mZAln7_rrzaRbKhKA6dJOp3xeD6dHOC9NXOtJKROo4,6933
-deriva_ml/dataset/dataset.py,sha256=B9QBFgcW1fCEseBV3FcgckPSrJyixEqeoG80mp__CfI,64472
-deriva_ml/dataset/dataset_bag.py,sha256=mPIZRX5aTbVRcJbCFtdkmlnexquF8NE-onbVK_8IxVk,14224
-deriva_ml/dataset/history.py,sha256=FK5AYYz11p4E4FWMVg4r7UPWOD4eobrq3b3xMjWF59g,3197
-deriva_ml/dataset/upload.py,sha256=i_7KLfRSd2-THqZ1aG2OFAFGoyb8dJBCZZ5t1ftrtMQ,16429
-deriva_ml/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deriva_ml/execution/environment.py,sha256=B7nywqxFTRUWgyu8n7rFoKcVC9on422kjeFG2FPQfvg,9302
-deriva_ml/execution/execution.py,sha256=NJjjrxGsedv0zoe-T-LxfO_5UG83KOHaxU3SY5EJ0QQ,44928
-deriva_ml/execution/execution_configuration.py,sha256=Rw4VWkBCZN9yatvSKdTqEWTfu470lpcVKfHFR0uN0jI,6248
-deriva_ml/execution/workflow.py,sha256=7CwPrgs3FKQHiEVus0PpK9w5hVKLKZnCrlu_nT8GFe8,13604
-deriva_ml/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deriva_ml/model/catalog.py,sha256=dzTBcRlqgEVkPY32AUax_iu75RgFiT4Pu5au7rmrv8k,14068
-deriva_ml/model/database.py,sha256=SBkYFf0qwbGmvL0Xtn_n5DCz4roGfrhuYrM8G69Cy9Y,14837
-deriva_ml/model/sql_mapper.py,sha256=_0QsJEVSgSPtxrWKSgjfPZCQ1aMVcjR_Tk2OxLhWEvY,1696
-deriva_ml/schema/__init__.py,sha256=yV-MfzCF3FA4OOz7mZwMM2q6-x1vgOJ057kUvikFF6E,130
-deriva_ml/schema/annotations.py,sha256=TuQ3vWFnK0160fRmtvsCkHx9qAcRa63MSyERB4x5a98,18197
-deriva_ml/schema/check_schema.py,sha256=6dadLYHPqRex6AYVClmsESI8WhC7-rb-XnGf2G298xw,3609
-deriva_ml/schema/create_schema.py,sha256=9qK9_8SRQT-DwcEwTGSkhi3j2NaoH5EVgthvV2kO-gg,13042
-deriva_ml/schema/deriva-ml-reference.json,sha256=AEOMIgwKO3dNMMWHb0lxaXyamvfAEbUPh8qw0aAtsUQ,242460
-deriva_ml/schema/policy.json,sha256=5ykB8nnZFl-oCHzlAwppCFKJHWJFIkYognUMVEanfY8,1826
-deriva_ml/schema/table_comments_utils.py,sha256=4flCqnZAaqg_uSZ9I18pNUWAZoLfmMCXbmI5uERY5vM,2007
-deriva_ml-1.14.47.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deriva_ml-1.14.47.dist-info/METADATA,sha256=7kRaVpheUZqLTe82Q9KIAovS2gkiBE7KItSk67nQU9U,1190
-deriva_ml-1.14.47.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-deriva_ml-1.14.47.dist-info/entry_points.txt,sha256=XsHSbfp7S1cKMjHoPUdFIaFcp9lHXHS6CV1zb_MEXkg,463
-deriva_ml-1.14.47.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
-deriva_ml-1.14.47.dist-info/RECORD,,
{deriva_ml-1.14.47.dist-info → deriva_ml-1.16.0.dist-info}/WHEEL
File without changes
{deriva_ml-1.14.47.dist-info → deriva_ml-1.16.0.dist-info}/entry_points.txt
File without changes
{deriva_ml-1.14.47.dist-info → deriva_ml-1.16.0.dist-info}/licenses/LICENSE
File without changes
{deriva_ml-1.14.47.dist-info → deriva_ml-1.16.0.dist-info}/top_level.txt
File without changes