deriva-ml 1.17.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/.DS_Store +0 -0
- deriva_ml/__init__.py +79 -0
- deriva_ml/bump_version.py +142 -0
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1527 -0
- deriva_ml/core/config.py +69 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +12 -0
- deriva_ml/dataset/aux_classes.py +225 -0
- deriva_ml/dataset/dataset.py +1519 -0
- deriva_ml/dataset/dataset_bag.py +450 -0
- deriva_ml/dataset/history.py +109 -0
- deriva_ml/dataset/upload.py +439 -0
- deriva_ml/demo_catalog.py +495 -0
- deriva_ml/execution/__init__.py +26 -0
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/execution/execution.py +1180 -0
- deriva_ml/execution/execution_configuration.py +147 -0
- deriva_ml/execution/workflow.py +413 -0
- deriva_ml/feature.py +228 -0
- deriva_ml/install_kernel.py +71 -0
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/model/catalog.py +485 -0
- deriva_ml/model/database.py +719 -0
- deriva_ml/protocols/dataset.py +19 -0
- deriva_ml/run_notebook.py +228 -0
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/schema/annotations.py +473 -0
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/schema/create_schema.py +393 -0
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/policy.json +81 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- deriva_ml/test.py +94 -0
- deriva_ml-1.17.10.dist-info/METADATA +38 -0
- deriva_ml-1.17.10.dist-info/RECORD +45 -0
- deriva_ml-1.17.10.dist-info/WHEEL +5 -0
- deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
- deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
- deriva_ml-1.17.10.dist-info/top_level.txt +1 -0

deriva_ml/core/exceptions.py
@@ -0,0 +1,28 @@
+"""
+Custom exceptions used throughout the DerivaML package.
+"""
+
+
+class DerivaMLException(Exception):
+    """Exception class specific to the DerivaML module.
+
+    Args:
+        msg (str): Optional message for the exception.
+    """
+
+    def __init__(self, msg=""):
+        super().__init__(msg)
+        self._msg = msg
+
+
+class DerivaMLInvalidTerm(DerivaMLException):
+    """Exception class for invalid terms in a DerivaML controlled vocabulary."""
+    def __init__(self, vocabulary, term: str, msg: str = "Term doesn't exist"):
+        """Exception indicating an undefined term in the given vocabulary."""
+        super().__init__(f"Invalid term {term} in vocabulary {vocabulary}: {msg}.")
+
+class DerivaMLTableTypeError(DerivaMLException):
+    """RID for table is not of the correct type."""
+    def __init__(self, table_type, table: str):
+        """Exception indicating that the table is not of the expected type."""
+        super().__init__(f"Table {table} is not of type {table_type}.")

deriva_ml/core/filespec.py
@@ -0,0 +1,116 @@
+"""
+File-related utility functions for DerivaML.
+"""
+
+from __future__ import annotations
+
+import json
+from datetime import date
+from pathlib import Path
+from socket import gethostname
+from typing import Callable, Generator
+from urllib.parse import urlparse
+
+import deriva.core.utils.hash_utils as hash_utils
+from pydantic import BaseModel, Field, conlist, field_validator, validate_call
+
+
+class FileSpec(BaseModel):
+    """An entry in the File table.
+
+    Attributes:
+        url: The URL of the file.
+        description: The description of the file.
+        md5: The MD5 hash of the file.
+        length: The length of the file in bytes.
+        file_types: A list of file types. Each file_type should be a defined term in the MLVocab.file_type vocabulary.
+    """
+
+    url: str = Field(alias="URL", validation_alias="url")
+    md5: str = Field(alias="MD5", validation_alias="md5")
+    length: int = Field(alias="Length", validation_alias="length")
+    description: str | None = Field(default="", alias="Description", validation_alias="description")
+    file_types: conlist(str) | None = []
+
+    @field_validator("url")
+    @classmethod
+    def validate_file_url(cls, url: str) -> str:
+        """Examine the provided URL. If it's a local path, convert it into a tag URL.
+
+        Args:
+            url: The URL to validate and potentially convert
+
+        Returns:
+            The validated/converted URL
+
+        Raises:
+            ValidationError: If the URL is not a file URL
+        """
+        url_parts = urlparse(url)
+        if url_parts.scheme == "tag":
+            # Already a tag URL, so just return it.
+            return url
+        elif (not url_parts.scheme) or url_parts.scheme == "file":
+            # There is no scheme part of the URL, or it is a file URL, so it is a local file path.
+            # Convert to a tag URL.
+            return f"tag://{gethostname()},{date.today()}:file://{url_parts.path}"
+        else:
+            raise ValueError("url is not a file URL")
+
+    @classmethod
+    def create_filespecs(
+        cls, path: Path | str, description: str, file_types: list[str] | Callable[[Path], list[str]] | None = None
+    ) -> Generator[FileSpec, None, None]:
+        """Given a file or directory, generate the sequence of corresponding FileSpecs suitable to create a File table.
+
+        Args:
+            path: Path to the file or directory.
+            description: The description of the file(s).
+            file_types: A list of file types or a function that takes a file path and returns a list of file types.
+
+        Returns:
+            An iterable of FileSpecs for each file in the directory.
+        """
+
+        path = Path(path)
+        file_types = file_types or []
+        file_types_fn = file_types if callable(file_types) else lambda _x: file_types
+
+        def create_spec(file_path: Path) -> FileSpec:
+            hashes = hash_utils.compute_file_hashes(file_path, hashes=frozenset(["md5", "sha256"]))
+            md5 = hashes["md5"][0]
+            type_list = file_types_fn(file_path)
+            return FileSpec(
+                length=file_path.stat().st_size,
+                md5=md5,
+                description=description,
+                url=file_path.as_posix(),
+                file_types=type_list if "File" in type_list else ["File"] + type_list,
+            )
+
+        files = [path] if path.is_file() else [f for f in Path(path).rglob("*") if f.is_file()]
+        return (create_spec(file) for file in files)
+
+    @staticmethod
+    def read_filespec(path: Path | str) -> Generator[FileSpec, None, None]:
+        """Get FileSpecs from a JSON lines file.
+
+        Args:
+            path: Path to the .jsonl file (string or Path).
+
+        Yields:
+            A FileSpec object.
+        """
+        path = Path(path)
+        with path.open("r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                yield FileSpec(**json.loads(line))
+
+
+# Work around pydantic validate_call and forward references.
+_raw = FileSpec.create_filespecs.__func__
+# Wrap it with validate_call, then re-make it a classmethod.
+FileSpec.create_filespecs = classmethod(validate_call(_raw))

deriva_ml/dataset/__init__.py
@@ -0,0 +1,12 @@
+from .aux_classes import DatasetSpec, DatasetSpecConfig, DatasetVersion, VersionPart
+from .dataset import Dataset
+from .dataset_bag import DatasetBag
+
+__all__ = [
+    "Dataset",
+    "DatasetSpec",
+    "DatasetSpecConfig",
+    "DatasetBag",
+    "DatasetVersion",
+    "VersionPart",
+]

deriva_ml/dataset/aux_classes.py
@@ -0,0 +1,225 @@
+"""
+This module defines the auxiliary dataset classes used to manipulate dataset versions, histories, and specifications.
+"""
+
+from enum import Enum
+from typing import Any, Optional, SupportsInt
+
+from hydra_zen import hydrated_dataclass
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    computed_field,
+    conlist,
+    field_serializer,
+    field_validator,
+    model_validator,
+)
+from semver import Version
+
+from deriva_ml.core.definitions import RID
+
+
+class VersionPart(Enum):
+    """Simple enumeration for semantic versioning.
+
+    Attributes:
+        major (str): Major version part
+        minor (str): Minor version part
+        patch (str): Patch version part
+
+    """
+
+    major = "major"
+    minor = "minor"
+    patch = "patch"
+
+
+class DatasetVersion(Version):
+    """Represent the version associated with a dataset using semantic versioning.
+
+    Methods:
+        replace(major, minor, patch): Replace the major, minor, and patch versions
+    """
+
+    def __init__(self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0):
+        """Initialize a DatasetVersion object.
+
+        Args:
+            major: Major version number. Used to indicate schema changes.
+            minor: Minor version number. Used to indicate additional members added, or a change in member values.
+            patch: Patch number of the dataset. Used to indicate minor clean-up and edits.
+        """
+        super().__init__(major, minor, patch)
+
+    def to_dict(self) -> dict[str, Any]:
+        """
+
+        Returns:
+            A dictionary of version information.
+
+        """
+        return {"major": self.major, "minor": self.minor, "patch": self.patch}
+
+    def to_tuple(self) -> tuple[int, int, int]:
+        """
+
+        Returns:
+            A tuple of version information.
+
+        """
+        return self.major, self.minor, self.patch
+
+    @classmethod
+    def parse(cls, version: str, optional_minor_and_patch=False) -> "DatasetVersion":
+        v = Version.parse(version, optional_minor_and_patch=optional_minor_and_patch)
+        return DatasetVersion(v.major, v.minor, v.patch)
+
+    def increment_version(self, component: VersionPart) -> "DatasetVersion":
+        match component:
+            case VersionPart.major:
+                return self.bump_major()
+            case VersionPart.minor:
+                return self.bump_minor()
+            case VersionPart.patch:
+                return self.bump_patch()
+            case _:
+                return self
+
+
+class DatasetHistory(BaseModel):
+    """
+    Class representing a dataset history.
+
+    Attributes:
+        dataset_version (DatasetVersion): A DatasetVersion object which captures the semantic versioning of the dataset.
+        dataset_rid (RID): The RID of the dataset.
+        version_rid (RID): The RID of the version record for the dataset in the Dataset_Version table.
+        minid (str): The URL that represents the handle of the dataset bag. This will be None if a MINID has not
+            been created yet.
+        snapshot (str): Catalog snapshot ID from when the version record was created.
+    """
+
+    dataset_version: DatasetVersion
+    dataset_rid: RID
+    version_rid: RID
+    execution_rid: Optional[RID] = None
+    description: str | None = ""
+    minid: str | None = None
+    snapshot: str | None = None
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    @field_validator("description", mode="after")
+    def _default_description(cls, v) -> str:
+        return v or ""
+
+
+class DatasetMinid(BaseModel):
+    """Represent information about a MINID that refers to a dataset.
+
+    Attributes:
+        dataset_version (DatasetVersion): A DatasetVersion object which captures the semantic versioning of the dataset.
+        metadata (dict): A dictionary containing metadata from the MINID landing page.
+        minid (str): The URL that represents the handle of the MINID associated with the dataset.
+        bag_url (str): The URL to the dataset bag.
+        identifier (str): The identifier of the MINID in compact URI (CURIE) form.
+        landing_page (str): The URL to the landing page of the MINID.
+        version_rid (str): RID of the dataset version.
+        checksum (str): The SHA-256 checksum of the MINID.
+
+    """
+
+    dataset_version: DatasetVersion
+    metadata: dict[str, str | int] = {}
+    minid: str = Field(alias="compact_uri", default=None)
+    bag_url: str = Field(alias="location")
+    identifier: Optional[str] = None
+    landing_page: Optional[str] = None
+    version_rid: RID = Field(alias="RID")
+    checksum: str = Field(alias="checksums", default="")
+
+    @computed_field
+    @property
+    def dataset_rid(self) -> str:
+        rid_parts = self.version_rid.split("@")
+        return rid_parts[0]
+
+    @computed_field
+    @property
+    def dataset_snapshot(self) -> str:
+        return self.version_rid.split("@")[1]
+
+    @model_validator(mode="before")
+    @classmethod
+    def insert_metadata(cls, data: Any) -> Any:
+        if isinstance(data, dict):
+            if "metadata" in data:
+                data = data | data["metadata"]
+        return data
+
+    @field_validator("bag_url", mode="before")
+    @classmethod
+    def convert_location_to_str(cls, value: list[str] | str) -> str:
+        return value[0] if isinstance(value, list) else value
+
+    @field_validator("checksum", mode="before")
+    @classmethod
+    def convert_checksum_to_value(cls, checksums: list[dict]) -> str:
+        checksum_value = ""
+        for checksum in checksums:
+            if checksum.get("function") == "sha256":
+                checksum_value = checksum.get("value")
+                break
+        return checksum_value
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+class DatasetSpec(BaseModel):
+    """Represent a dataset_table entry in an execution configuration's dataset list.
+
+    Attributes:
+        rid (RID): A dataset_table RID.
+        materialize (bool): If False, do not materialize the dataset: only download table data, not assets. Defaults to True.
+        version (DatasetVersion): The version of the dataset. Should follow semantic versioning.
+    """
+
+    rid: RID
+    version: DatasetVersion | conlist(item_type=int, min_length=3, max_length=3) | tuple[int, int, int] | str
+    materialize: bool = True
+    description: str = ""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    @field_validator("version", mode="before")
+    @classmethod
+    def version_field_validator(cls, v: Any) -> Any:
+        if isinstance(v, dict):
+            return DatasetVersion(**v)
+        elif isinstance(v, str):
+            return DatasetVersion.parse(v)
+        elif isinstance(v, (list, tuple)) and len(v) == 3:
+            return DatasetVersion(int(v[0]), int(v[1]), int(v[2]))
+        else:
+            return v
+
+    @model_validator(mode="before")
+    @classmethod
+    def _check_bare_rid(cls, data: Any) -> dict[str, str | bool]:
+        # If given a bare string, assume it is a RID and wrap it in a dict for further validation.
+        return {"rid": data} if isinstance(data, str) else data
+
+    @field_serializer("version")
+    def serialize_version(self, version: DatasetVersion) -> dict[str, Any]:
+        return version.to_dict()
+
+
+# Interface for hydra-zen
+@hydrated_dataclass(DatasetSpec)
+class DatasetSpecConfig:
+    rid: str
+    version: str
+    materialize: bool = True
+    description: str = ""