deriva-ml 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +406 -428
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +51 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +212 -224
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/METADATA +5 -4
- deriva_ml-1.14.27.dist-info/RECORD +40 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -391
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.14.0.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/WHEEL +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File-related utility functions for DerivaML.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from datetime import date
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from socket import gethostname
|
|
11
|
+
from typing import Callable, Generator
|
|
12
|
+
from urllib.parse import urlparse
|
|
13
|
+
|
|
14
|
+
import deriva.core.utils.hash_utils as hash_utils
|
|
15
|
+
from pydantic import BaseModel, Field, conlist, field_validator, validate_call
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FileSpec(BaseModel):
    """An entry into the File table.

    Attributes:
        url: The URL of the file. Local paths are converted into tag URLs by the validator.
        description: The description of the file.
        md5: The MD5 hash of the file contents.
        length: The length of the file in bytes.
        file_types: A list of file types. Each file_type should be a defined term in the
            MLVocab.file_type vocabulary.
    """

    url: str = Field(alias="URL", validation_alias="url")
    md5: str = Field(alias="MD5", validation_alias="md5")
    length: int = Field(alias="Length", validation_alias="length")
    description: str | None = Field(default="", alias="Description", validation_alias="description")
    file_types: conlist(str) | None = []

    @field_validator("url")
    @classmethod
    def validate_file_url(cls, url: str) -> str:
        """Examine the provided URL. If it's a local path, convert it into a tag URL.

        Args:
            url: The URL to validate and potentially convert.

        Returns:
            The validated/converted URL.

        Raises:
            ValueError: If the URL is neither a tag URL, a file URL, nor a bare path.
        """
        url_parts = urlparse(url)
        if url_parts.scheme == "tag":
            # Already a tag URL, so just return it.
            return url
        elif (not url_parts.scheme) or url_parts.scheme == "file":
            # No scheme, or a file URL: treat as a local file path and convert to a
            # tag URL stamped with the current host name and date.
            return f"tag://{gethostname()},{date.today()}:file://{url_parts.path}"
        else:
            raise ValueError("url is not a file URL")

    @classmethod
    def create_filespecs(
        cls, path: Path | str, description: str, file_types: list[str] | Callable[[Path], list[str]] | None = None
    ) -> Generator[FileSpec, None, None]:
        """Given a file or directory, generate the sequence of corresponding FileSpecs suitable to create a File table.

        Args:
            path: Path to the file or directory.
            description: The description of the file(s).
            file_types: A list of file types, or a function that takes a file path and
                returns a list of file types.

        Returns:
            A generator of FileSpecs, one for each file under `path`.
        """
        path = Path(path)
        file_types = file_types or []
        # Normalize to a callable so static lists and per-file callbacks share one code path.
        file_types_fn = file_types if callable(file_types) else lambda _x: file_types

        def create_spec(file_path: Path) -> FileSpec:
            # Build the FileSpec for a single concrete file.
            hashes = hash_utils.compute_file_hashes(file_path, hashes=frozenset(["md5", "sha256"]))
            md5 = hashes["md5"][0]
            type_list = file_types_fn(file_path)
            return FileSpec(
                # BUG FIX: stat the individual file, not the (possibly directory) `path`
                # captured from the enclosing scope.
                length=file_path.stat().st_size,
                md5=md5,
                description=description,
                url=file_path.as_posix(),
                # Ensure the generic "File" type is always present exactly once.
                file_types=type_list if "File" in type_list else ["File"] + type_list,
            )

        files = [path] if path.is_file() else [f for f in path.rglob("*") if f.is_file()]
        return (create_spec(file) for file in files)

    @staticmethod
    def read_filespec(path: Path | str) -> Generator[FileSpec, None, None]:
        """Get FileSpecs from a JSON lines file.

        Args:
            path: Path to the .jsonl file (string or Path).

        Yields:
            A FileSpec object for each non-blank line in the file.
        """
        path = Path(path)
        with path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Skip blank lines rather than failing JSON decoding.
                    continue
                yield FileSpec(**json.loads(line))
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# Workaround: pydantic's @validate_call cannot be applied directly inside the class
# body because the method's signature forward-references FileSpec itself, so the
# wrapping is deferred until after the class is fully defined.
_raw = FileSpec.create_filespecs.__func__
# Wrap the underlying plain function with validate_call, then re-bind it to the
# class as a classmethod so the public API is unchanged.
FileSpec.create_filespecs = classmethod(validate_call(_raw))
|
|
@@ -2,21 +2,22 @@
|
|
|
2
2
|
This module defines the DataSet class which is used to manipulate n
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
from .deriva_definitions import RID
|
|
6
|
-
|
|
7
5
|
from enum import Enum
|
|
6
|
+
from typing import Any, Optional, SupportsInt
|
|
7
|
+
|
|
8
8
|
from pydantic import (
|
|
9
9
|
BaseModel,
|
|
10
10
|
ConfigDict,
|
|
11
|
-
field_validator,
|
|
12
11
|
Field,
|
|
13
12
|
computed_field,
|
|
14
|
-
|
|
13
|
+
conlist,
|
|
15
14
|
field_serializer,
|
|
15
|
+
field_validator,
|
|
16
|
+
model_validator,
|
|
16
17
|
)
|
|
17
|
-
|
|
18
18
|
from semver import Version
|
|
19
|
-
|
|
19
|
+
|
|
20
|
+
from deriva_ml.core.definitions import RID
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
class VersionPart(Enum):
|
|
@@ -41,9 +42,7 @@ class DatasetVersion(Version):
|
|
|
41
42
|
replace(major, minor, patch): Replace the major and minor versions
|
|
42
43
|
"""
|
|
43
44
|
|
|
44
|
-
def __init__(
|
|
45
|
-
self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0
|
|
46
|
-
):
|
|
45
|
+
def __init__(self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0):
|
|
47
46
|
"""Initialize a DatasetVersion object.
|
|
48
47
|
|
|
49
48
|
Args:
|
|
@@ -139,7 +138,8 @@ class DatasetMinid(BaseModel):
|
|
|
139
138
|
@computed_field
|
|
140
139
|
@property
|
|
141
140
|
def dataset_rid(self) -> str:
|
|
142
|
-
|
|
141
|
+
rid_parts = self.version_rid.split("@")
|
|
142
|
+
return rid_parts[0]
|
|
143
143
|
|
|
144
144
|
@computed_field
|
|
145
145
|
@property
|
|
@@ -177,13 +177,13 @@ class DatasetSpec(BaseModel):
|
|
|
177
177
|
|
|
178
178
|
Attributes:
|
|
179
179
|
rid (RID): A dataset_table RID
|
|
180
|
-
materialize (bool): If False
|
|
180
|
+
materialize (bool): If False do not materialize datasets, only download table data, no assets. Defaults to True
|
|
181
181
|
version (DatasetVersion): The version of the dataset. Should follow semantic versioning.
|
|
182
182
|
"""
|
|
183
183
|
|
|
184
184
|
rid: RID
|
|
185
185
|
materialize: bool = True
|
|
186
|
-
version: DatasetVersion
|
|
186
|
+
version: DatasetVersion | conlist(item_type=int, min_length=3, max_length=3) | tuple[int, int, int] | str
|
|
187
187
|
|
|
188
188
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
189
189
|
|
|
@@ -192,6 +192,10 @@ class DatasetSpec(BaseModel):
|
|
|
192
192
|
def version_field_validator(cls, v: Any) -> Any:
|
|
193
193
|
if isinstance(v, dict):
|
|
194
194
|
return DatasetVersion(**v)
|
|
195
|
+
elif isinstance(v, str):
|
|
196
|
+
return DatasetVersion.parse(v)
|
|
197
|
+
elif (isinstance(v, list) or isinstance(v, tuple)) and len(v) == 3:
|
|
198
|
+
return DatasetVersion(int(v[0]), int(v[1]), int(v[2]))
|
|
195
199
|
else:
|
|
196
200
|
return v
|
|
197
201
|
|