deriva_ml-1.17.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. deriva_ml/.DS_Store +0 -0
  2. deriva_ml/__init__.py +79 -0
  3. deriva_ml/bump_version.py +142 -0
  4. deriva_ml/core/__init__.py +39 -0
  5. deriva_ml/core/base.py +1527 -0
  6. deriva_ml/core/config.py +69 -0
  7. deriva_ml/core/constants.py +36 -0
  8. deriva_ml/core/definitions.py +74 -0
  9. deriva_ml/core/enums.py +222 -0
  10. deriva_ml/core/ermrest.py +288 -0
  11. deriva_ml/core/exceptions.py +28 -0
  12. deriva_ml/core/filespec.py +116 -0
  13. deriva_ml/dataset/__init__.py +12 -0
  14. deriva_ml/dataset/aux_classes.py +225 -0
  15. deriva_ml/dataset/dataset.py +1519 -0
  16. deriva_ml/dataset/dataset_bag.py +450 -0
  17. deriva_ml/dataset/history.py +109 -0
  18. deriva_ml/dataset/upload.py +439 -0
  19. deriva_ml/demo_catalog.py +495 -0
  20. deriva_ml/execution/__init__.py +26 -0
  21. deriva_ml/execution/environment.py +290 -0
  22. deriva_ml/execution/execution.py +1180 -0
  23. deriva_ml/execution/execution_configuration.py +147 -0
  24. deriva_ml/execution/workflow.py +413 -0
  25. deriva_ml/feature.py +228 -0
  26. deriva_ml/install_kernel.py +71 -0
  27. deriva_ml/model/__init__.py +0 -0
  28. deriva_ml/model/catalog.py +485 -0
  29. deriva_ml/model/database.py +719 -0
  30. deriva_ml/protocols/dataset.py +19 -0
  31. deriva_ml/run_notebook.py +228 -0
  32. deriva_ml/schema/__init__.py +3 -0
  33. deriva_ml/schema/annotations.py +473 -0
  34. deriva_ml/schema/check_schema.py +104 -0
  35. deriva_ml/schema/create_schema.py +393 -0
  36. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  37. deriva_ml/schema/policy.json +81 -0
  38. deriva_ml/schema/table_comments_utils.py +57 -0
  39. deriva_ml/test.py +94 -0
  40. deriva_ml-1.17.10.dist-info/METADATA +38 -0
  41. deriva_ml-1.17.10.dist-info/RECORD +45 -0
  42. deriva_ml-1.17.10.dist-info/WHEEL +5 -0
  43. deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
  44. deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
  45. deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
deriva_ml/core/exceptions.py
@@ -0,0 +1,28 @@
+ """
+ Custom exceptions used throughout the DerivaML package.
+ """
+
+
+ class DerivaMLException(Exception):
+     """Exception class specific to the DerivaML module.
+
+     Args:
+         msg (str): Optional message for the exception.
+     """
+
+     def __init__(self, msg=""):
+         super().__init__(msg)
+         self._msg = msg
+
+
+ class DerivaMLInvalidTerm(DerivaMLException):
+     """Exception class for invalid terms in a DerivaML controlled vocabulary."""
+     def __init__(self, vocabulary, term: str, msg: str = "Term doesn't exist"):
+         """Exception indicating an undefined vocabulary term."""
+         super().__init__(f"Invalid term {term} in vocabulary {vocabulary}: {msg}.")
+
+ class DerivaMLTableTypeError(DerivaMLException):
+     """RID for table is not of correct type."""
+     def __init__(self, table_type, table: str):
+         """Exception indicating a table of an unexpected type."""
+         super().__init__(f"Table {table} is not of type {table_type}.")
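For orientation, a minimal sketch of how these exceptions surface in practice (the vocabulary name "File_Type" and term "csvv" below are made-up values for illustration):

    from deriva_ml.core.exceptions import DerivaMLException, DerivaMLInvalidTerm

    try:
        # Raised when a term is not defined in the named controlled vocabulary.
        raise DerivaMLInvalidTerm("File_Type", "csvv")
    except DerivaMLException as e:
        print(e)  # Invalid term csvv in vocabulary File_Type: Term doesn't exist.

Because both classes derive from DerivaMLException, callers can catch the base class to handle any DerivaML-specific failure uniformly.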
deriva_ml/core/filespec.py
@@ -0,0 +1,116 @@
+ """
+ File-related utility functions for DerivaML.
+ """
+
+ from __future__ import annotations
+
+ import json
+ from datetime import date
+ from pathlib import Path
+ from socket import gethostname
+ from typing import Callable, Generator
+ from urllib.parse import urlparse
+
+ import deriva.core.utils.hash_utils as hash_utils
+ from pydantic import BaseModel, Field, conlist, field_validator, validate_call
+
+
+ class FileSpec(BaseModel):
+     """An entry in the File table.
+
+     Attributes:
+         url: The URL of the file.
+         description: The description of the file.
+         md5: The MD5 hash of the file.
+         length: The length of the file in bytes.
+         file_types: A list of file types. Each file_type should be a defined term in the MLVocab.file_type vocabulary.
+     """
+
+     url: str = Field(alias="URL", validation_alias="url")
+     md5: str = Field(alias="MD5", validation_alias="md5")
+     length: int = Field(alias="Length", validation_alias="length")
+     description: str | None = Field(default="", alias="Description", validation_alias="description")
+     file_types: conlist(str) | None = []
+
+     @field_validator("url")
+     @classmethod
+     def validate_file_url(cls, url: str) -> str:
+         """Examine the provided URL. If it's a local path, convert it into a tag URL.
+
+         Args:
+             url: The URL to validate and potentially convert.
+
+         Returns:
+             The validated/converted URL.
+
+         Raises:
+             ValidationError: If the URL is not a file URL.
+         """
+         url_parts = urlparse(url)
+         if url_parts.scheme == "tag":
+             # Already a tag URL, so just return it.
+             return url
+         elif (not url_parts.scheme) or url_parts.scheme == "file":
+             # There is no scheme part of the URL, or it is a file URL, so it is a local file path.
+             # Convert to a tag URL.
+             return f"tag://{gethostname()},{date.today()}:file://{url_parts.path}"
+         else:
+             raise ValueError("url is not a file URL")
+
+     @classmethod
+     def create_filespecs(
+         cls, path: Path | str, description: str, file_types: list[str] | Callable[[Path], list[str]] | None = None
+     ) -> Generator[FileSpec, None, None]:
+         """Given a file or directory, generate the sequence of corresponding FileSpecs suitable to create a File table.
+
+         Args:
+             path: Path to the file or directory.
+             description: The description of the file(s).
+             file_types: A list of file types or a function that takes a file path and returns a list of file types.
+
+         Returns:
+             An iterable of FileSpecs for each file in the directory.
+         """
+
+         path = Path(path)
+         file_types = file_types or []
+         file_types_fn = file_types if callable(file_types) else lambda _x: file_types
+
+         def create_spec(file_path: Path) -> FileSpec:
+             hashes = hash_utils.compute_file_hashes(file_path, hashes=frozenset(["md5", "sha256"]))
+             md5 = hashes["md5"][0]
+             type_list = file_types_fn(file_path)
+             return FileSpec(
+                 length=file_path.stat().st_size,
+                 md5=md5,
+                 description=description,
+                 url=file_path.as_posix(),
+                 file_types=type_list if "File" in type_list else ["File"] + type_list,
+             )
+
+         files = [path] if path.is_file() else [f for f in path.rglob("*") if f.is_file()]
+         return (create_spec(file) for file in files)
+
+     @staticmethod
+     def read_filespec(path: Path | str) -> Generator[FileSpec, None, None]:
+         """Get FileSpecs from a JSON lines file.
+
+         Args:
+             path: Path to the .jsonl file (string or Path).
+
+         Yields:
+             A FileSpec object.
+         """
+         path = Path(path)
+         with path.open("r", encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if not line:
+                     continue
+                 yield FileSpec(**json.loads(line))
+
+
+ # Hack around pydantic validate_call and forward references.
+ _raw = FileSpec.create_filespecs.__func__
+ # Wrap it with validate_call, then re-make it a classmethod.
+ FileSpec.create_filespecs = classmethod(validate_call(_raw))
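A minimal usage sketch of FileSpec, assuming a local directory ./data that contains files (the directory and output file names are hypothetical). Note that read_filespec expects the lower-case field names that model_dump_json emits by default, not the table aliases (URL, MD5, Length):

    from pathlib import Path

    from deriva_ml.core.filespec import FileSpec

    # One FileSpec per file under ./data; the url validator converts local
    # paths into tag URLs as they are created.
    specs = FileSpec.create_filespecs(Path("data"), description="raw inputs")
    with Path("files.jsonl").open("w", encoding="utf-8") as f:
        for spec in specs:
            f.write(spec.model_dump_json() + "\n")

    # Round-trip the specs back out of the JSON lines file.
    for spec in FileSpec.read_filespec("files.jsonl"):
        print(spec.url, spec.length, spec.file_types)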
deriva_ml/dataset/__init__.py
@@ -0,0 +1,12 @@
+ from .aux_classes import DatasetSpec, DatasetSpecConfig, DatasetVersion, VersionPart
+ from .dataset import Dataset
+ from .dataset_bag import DatasetBag
+
+ __all__ = [
+     "Dataset",
+     "DatasetSpec",
+     "DatasetSpecConfig",
+     "DatasetBag",
+     "DatasetVersion",
+     "VersionPart",
+ ]
deriva_ml/dataset/aux_classes.py
@@ -0,0 +1,225 @@
+ """
+ This module defines the auxiliary classes used to describe dataset versions, history, and specifications.
+ """
+
+ from enum import Enum
+ from typing import Any, Optional, SupportsInt
+
+ from hydra_zen import hydrated_dataclass
+ from pydantic import (
+     BaseModel,
+     ConfigDict,
+     Field,
+     computed_field,
+     conlist,
+     field_serializer,
+     field_validator,
+     model_validator,
+ )
+ from semver import Version
+
+ from deriva_ml.core.definitions import RID
+
+
+ class VersionPart(Enum):
+     """Simple enumeration for semantic versioning.
+
+     Attributes:
+         major (str): Major version part
+         minor (str): Minor version part
+         patch (str): Patch version part
+
+     """
+
+     major = "major"
+     minor = "minor"
+     patch = "patch"
+
+
+ class DatasetVersion(Version):
+     """Represent the version associated with a dataset using semantic versioning.
+
+     Methods:
+         replace(major, minor, patch): Replace the major, minor, and patch versions
+     """
+
+     def __init__(self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0):
+         """Initialize a DatasetVersion object.
+
+         Args:
+             major: Major version number. Used to indicate schema changes.
+             minor: Minor version number. Used to indicate additional members added, or change in member values.
+             patch: Patch number of the dataset. Used to indicate minor clean-up and edits.
+         """
+         super().__init__(major, minor, patch)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert the version to a dictionary.
+
+         Returns:
+             A dictionary with major, minor, and patch entries.
+
+         """
+         return {"major": self.major, "minor": self.minor, "patch": self.patch}
+
+     def to_tuple(self) -> tuple[int, int, int]:
+         """Convert the version to a tuple.
+
+         Returns:
+             A (major, minor, patch) tuple.
+
+         """
+         return self.major, self.minor, self.patch
+
+     @classmethod
+     def parse(cls, version: str, optional_minor_and_patch=False) -> "DatasetVersion":
+         v = Version.parse(version, optional_minor_and_patch)
+         return DatasetVersion(v.major, v.minor, v.patch)
+
+     def increment_version(self, component: VersionPart) -> "DatasetVersion":
+         match component:
+             case VersionPart.major:
+                 return self.bump_major()
+             case VersionPart.minor:
+                 return self.bump_minor()
+             case VersionPart.patch:
+                 return self.bump_patch()
+             case _:
+                 return self
+
+
+ class DatasetHistory(BaseModel):
+     """
+     Class representing a dataset history.
+
+     Attributes:
+         dataset_version (DatasetVersion): A DatasetVersion object which captures the semantic versioning of the dataset.
+         dataset_rid (RID): The RID of the dataset.
+         version_rid (RID): The RID of the version record for the dataset in the Dataset_Version table.
+         minid (str): The URL that represents the handle of the dataset bag. This will be None if a MINID has not
+             been created yet.
+         snapshot (str): Catalog snapshot ID of when the version record was created.
+     """
+
+     dataset_version: DatasetVersion
+     dataset_rid: RID
+     version_rid: RID
+     execution_rid: Optional[RID] = None
+     description: str | None = ""
+     minid: str | None = None
+     snapshot: str | None = None
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+     @field_validator("description", mode="after")
+     def _default_description(cls, v) -> str:
+         return v or ""
+
+
+ class DatasetMinid(BaseModel):
+     """Represent information about a MINID that refers to a dataset.
+
+     Attributes:
+         dataset_version (DatasetVersion): A DatasetVersion object which captures the semantic versioning of the dataset.
+         metadata (dict): A dictionary containing metadata from the MINID landing page.
+         minid (str): The URL that represents the handle of the MINID associated with the dataset.
+         bag_url (str): The URL to the dataset bag.
+         identifier (str): The identifier of the MINID in CURIE form.
+         landing_page (str): The URL to the landing page of the MINID.
+         version_rid (str): RID of the dataset version.
+         checksum (str): The checksum of the MINID in SHA256 form.
+
+     """
+
+     dataset_version: DatasetVersion
+     metadata: dict[str, str | int] = {}
+     minid: Optional[str] = Field(alias="compact_uri", default=None)
+     bag_url: str = Field(alias="location")
+     identifier: Optional[str] = None
+     landing_page: Optional[str] = None
+     version_rid: RID = Field(alias="RID")
+     checksum: str = Field(alias="checksums", default="")
+
+     @computed_field
+     @property
+     def dataset_rid(self) -> str:
+         rid_parts = self.version_rid.split("@")
+         return rid_parts[0]
+
+     @computed_field
+     @property
+     def dataset_snapshot(self) -> str:
+         return self.version_rid.split("@")[1]
+
+     @model_validator(mode="before")
+     @classmethod
+     def insert_metadata(cls, data: Any) -> Any:
+         if isinstance(data, dict):
+             if "metadata" in data:
+                 data = data | data["metadata"]
+         return data
+
+     @field_validator("bag_url", mode="before")
+     @classmethod
+     def convert_location_to_str(cls, value: list[str] | str) -> str:
+         return value[0] if isinstance(value, list) else value
+
+     @field_validator("checksum", mode="before")
+     @classmethod
+     def convert_checksum_to_value(cls, checksums: list[dict]) -> str:
+         checksum_value = ""
+         for checksum in checksums:
+             if checksum.get("function") == "sha256":
+                 checksum_value = checksum.get("value")
+                 break
+         return checksum_value
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+ class DatasetSpec(BaseModel):
+     """Represent a dataset_table in an execution configuration dataset_table list.
+
+     Attributes:
+         rid (RID): A dataset_table RID.
+         materialize (bool): If False, do not materialize datasets; only download table data, no assets. Defaults to True.
+         version (DatasetVersion): The version of the dataset. Should follow semantic versioning.
+     """
+
+     rid: RID
+     version: DatasetVersion | conlist(item_type=int, min_length=3, max_length=3) | tuple[int, int, int] | str
+     materialize: bool = True
+     description: str = ""
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+     @field_validator("version", mode="before")
+     @classmethod
+     def version_field_validator(cls, v: Any) -> Any:
+         if isinstance(v, dict):
+             return DatasetVersion(**v)
+         elif isinstance(v, str):
+             return DatasetVersion.parse(v)
+         elif (isinstance(v, list) or isinstance(v, tuple)) and len(v) == 3:
+             return DatasetVersion(int(v[0]), int(v[1]), int(v[2]))
+         else:
+             return v
+
+     @model_validator(mode="before")
+     @classmethod
+     def _check_bare_rid(cls, data: Any) -> dict[str, str | bool]:
+         # If given a bare string, assume it's a RID and put it into a dict for further validation.
+         return {"rid": data} if isinstance(data, str) else data
+
+     @field_serializer("version")
+     def serialize_version(self, version: DatasetVersion) -> dict[str, Any]:
+         return version.to_dict()
+
+
+ # Interface for hydra-zen.
+ @hydrated_dataclass(DatasetSpec)
+ class DatasetSpecConfig:
+     rid: str
+     version: str
+     materialize: bool = True
+     description: str = ""
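A minimal sketch of how the version and spec classes compose (the RID value "1-ABCD" is made up for illustration; a DatasetSpec normally references a real dataset record in the catalog):

    from deriva_ml.dataset import DatasetSpec, DatasetVersion, VersionPart

    v = DatasetVersion.parse("1.2.3")
    print(v.increment_version(VersionPart.minor))  # 1.3.0

    # DatasetSpec accepts the version as a string, a 3-item list/tuple, or a
    # dict, and the field serializer writes it back out as a dict.
    spec = DatasetSpec(rid="1-ABCD", version=(1, 2, 3))
    print(spec.model_dump()["version"])  # {'major': 1, 'minor': 2, 'patch': 3}

The before-mode model validator also lets a bare RID string stand in for a full spec, so DatasetSpec.model_validate("1-ABCD") would coerce the string into {"rid": "1-ABCD"} before field validation runs.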