deriva-ml 1.17.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/.DS_Store +0 -0
- deriva_ml/__init__.py +79 -0
- deriva_ml/bump_version.py +142 -0
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1527 -0
- deriva_ml/core/config.py +69 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +12 -0
- deriva_ml/dataset/aux_classes.py +225 -0
- deriva_ml/dataset/dataset.py +1519 -0
- deriva_ml/dataset/dataset_bag.py +450 -0
- deriva_ml/dataset/history.py +109 -0
- deriva_ml/dataset/upload.py +439 -0
- deriva_ml/demo_catalog.py +495 -0
- deriva_ml/execution/__init__.py +26 -0
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/execution/execution.py +1180 -0
- deriva_ml/execution/execution_configuration.py +147 -0
- deriva_ml/execution/workflow.py +413 -0
- deriva_ml/feature.py +228 -0
- deriva_ml/install_kernel.py +71 -0
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/model/catalog.py +485 -0
- deriva_ml/model/database.py +719 -0
- deriva_ml/protocols/dataset.py +19 -0
- deriva_ml/run_notebook.py +228 -0
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/schema/annotations.py +473 -0
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/schema/create_schema.py +393 -0
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/policy.json +81 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- deriva_ml/test.py +94 -0
- deriva_ml-1.17.10.dist-info/METADATA +38 -0
- deriva_ml-1.17.10.dist-info/RECORD +45 -0
- deriva_ml-1.17.10.dist-info/WHEEL +5 -0
- deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
- deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
- deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
deriva_ml/feature.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""Feature implementation for deriva-ml.
|
|
2
|
+
|
|
3
|
+
This module provides classes for defining and managing features in deriva-ml. Features represent measurable
|
|
4
|
+
properties or characteristics that can be associated with records in a table. The module includes:
|
|
5
|
+
|
|
6
|
+
- Feature: Main class for defining and managing features
|
|
7
|
+
- FeatureRecord: Base class for feature records using pydantic models
|
|
8
|
+
|
|
9
|
+
Typical usage example:
|
|
10
|
+
>>> feature = Feature(association_result, model)
|
|
11
|
+
>>> FeatureClass = feature.feature_record_class()
|
|
12
|
+
>>> record = FeatureClass(value="high", confidence=0.95)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from types import UnionType
|
|
17
|
+
from typing import TYPE_CHECKING, ClassVar, Optional, Type
|
|
18
|
+
|
|
19
|
+
from deriva.core.ermrest_model import Column, FindAssociationResult
|
|
20
|
+
from pydantic import BaseModel, create_model
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from model.catalog import DerivaModel
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class FeatureRecord(BaseModel):
    """Base class for dynamically generated feature record models.

    Concrete record classes are produced by ``Feature.feature_record_class()``, which
    subclasses this model, adds one field per feature column and binds the ``feature``
    class variable to the ``Feature`` that created it.

    Attributes:
        Execution (Optional[str]): RID of the execution that created this feature record.
        Feature_Name (str): Name of the feature this record belongs to.
        feature (ClassVar[Optional[Feature]]): Reference to the Feature object that created this record.
            It is ``None`` on this base class, so the column accessors below only work on
            generated subclasses (on the base class they raise ``AttributeError``).

    Example:
        >>> class GeneFeature(FeatureRecord):
        ...     value: str
        ...     confidence: float
        >>> record = GeneFeature(
        ...     Feature_Name="expression",
        ...     value="high",
        ...     confidence=0.95
        ... )
    """

    # Pydantic v2 configuration. This replaces the deprecated nested ``class Config`` with
    # the same two settings: arbitrary (non-pydantic) field types are accepted and unknown
    # fields are rejected. A plain dict is used so no additional import is required.
    model_config = {"arbitrary_types_allowed": True, "extra": "forbid"}

    # model_dump of this feature should be compatible with feature table columns.
    Execution: Optional[str] = None
    Feature_Name: str
    feature: ClassVar[Optional["Feature"]] = None

    @classmethod
    def feature_columns(cls) -> set[Column]:
        """Returns all columns specific to this feature.

        Returns:
            set[Column]: The ``feature_columns`` set of the bound ``Feature``.
        """
        return cls.feature.feature_columns

    @classmethod
    def asset_columns(cls) -> set[Column]:
        """Returns columns that reference asset tables.

        Returns:
            set[Column]: The ``asset_columns`` set of the bound ``Feature``.
        """
        return cls.feature.asset_columns

    @classmethod
    def term_columns(cls) -> set[Column]:
        """Returns columns that reference vocabulary terms.

        Returns:
            set[Column]: The ``term_columns`` set of the bound ``Feature``.
        """
        return cls.feature.term_columns

    @classmethod
    def value_columns(cls) -> set[Column]:
        """Returns columns that contain direct values.

        Returns:
            set[Column]: The ``value_columns`` set of the bound ``Feature`` (feature columns
            that are neither asset nor term references).
        """
        return cls.feature.value_columns
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class Feature:
|
|
95
|
+
"""Manages feature definitions and their relationships in the catalog.
|
|
96
|
+
|
|
97
|
+
A Feature represents a measurable property or characteristic that can be associated with records in a table.
|
|
98
|
+
Features can include asset references, controlled vocabulary terms, and custom metadata fields.
|
|
99
|
+
|
|
100
|
+
Attributes:
|
|
101
|
+
feature_table: Table containing the feature implementation.
|
|
102
|
+
target_table: Table that the feature is associated with.
|
|
103
|
+
feature_name: Name of the feature (from Feature_Name column default).
|
|
104
|
+
feature_columns: Set of columns specific to this feature.
|
|
105
|
+
asset_columns: Set of columns referencing asset tables.
|
|
106
|
+
term_columns: Set of columns referencing vocabulary tables.
|
|
107
|
+
value_columns: Set of columns containing direct values.
|
|
108
|
+
|
|
109
|
+
Example:
|
|
110
|
+
>>> feature = Feature(association_result, model)
|
|
111
|
+
>>> print(f"Feature {feature.feature_name} on {feature.target_table.name}")
|
|
112
|
+
>>> print("Asset columns:", [c.name for c in feature.asset_columns])
|
|
113
|
+
"""
|
|
114
|
+
|
|
115
|
+
def __init__(self, atable: FindAssociationResult, model: "DerivaModel") -> None:
|
|
116
|
+
self.feature_table = atable.table
|
|
117
|
+
self.target_table = atable.self_fkey.pk_table
|
|
118
|
+
self.feature_name = atable.table.columns["Feature_Name"].default
|
|
119
|
+
self._model = model
|
|
120
|
+
|
|
121
|
+
skip_columns = {
|
|
122
|
+
"RID",
|
|
123
|
+
"RMB",
|
|
124
|
+
"RCB",
|
|
125
|
+
"RCT",
|
|
126
|
+
"RMT",
|
|
127
|
+
"Feature_Name",
|
|
128
|
+
self.target_table.name,
|
|
129
|
+
"Execution",
|
|
130
|
+
}
|
|
131
|
+
self.feature_columns = {c for c in self.feature_table.columns if c.name not in skip_columns}
|
|
132
|
+
|
|
133
|
+
assoc_fkeys = {atable.self_fkey} | atable.other_fkeys
|
|
134
|
+
|
|
135
|
+
# Determine the role of each column in the feature outside the FK columns.
|
|
136
|
+
self.asset_columns = {
|
|
137
|
+
fk.foreign_key_columns[0]
|
|
138
|
+
for fk in self.feature_table.foreign_keys
|
|
139
|
+
if fk not in assoc_fkeys and self._model.is_asset(fk.pk_table)
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
self.term_columns = {
|
|
143
|
+
fk.foreign_key_columns[0]
|
|
144
|
+
for fk in self.feature_table.foreign_keys
|
|
145
|
+
if fk not in assoc_fkeys and self._model.is_vocabulary(fk.pk_table)
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
self.value_columns = self.feature_columns - (self.asset_columns | self.term_columns)
|
|
149
|
+
|
|
150
|
+
def feature_record_class(self) -> type[FeatureRecord]:
|
|
151
|
+
"""Create a pydantic model for entries into the specified feature table
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
A Feature class that can be used to create instances of the feature.
|
|
155
|
+
"""
|
|
156
|
+
|
|
157
|
+
def map_type(c: Column) -> UnionType | Type[str] | Type[int] | Type[float]:
|
|
158
|
+
"""Maps a Deriva column type to a Python/pydantic type.
|
|
159
|
+
|
|
160
|
+
Converts ERMrest column types to appropriate Python types for use in pydantic models.
|
|
161
|
+
Special handling is provided for asset columns which can accept either strings or Path objects.
|
|
162
|
+
|
|
163
|
+
Args:
|
|
164
|
+
c: ERMrest column to map to a Python type.
|
|
165
|
+
|
|
166
|
+
Returns:
|
|
167
|
+
UnionType | Type[str] | Type[int] | Type[float]: Appropriate Python type for the column:
|
|
168
|
+
- str | Path for asset columns
|
|
169
|
+
- str for text columns
|
|
170
|
+
- int for integer columns
|
|
171
|
+
- float for floating point columns
|
|
172
|
+
- str for all other types
|
|
173
|
+
|
|
174
|
+
Example:
|
|
175
|
+
>>> col = Column(name="score", type="float4")
|
|
176
|
+
>>> typ = map_type(col) # Returns float
|
|
177
|
+
"""
|
|
178
|
+
if c.name in {c.name for c in self.asset_columns}:
|
|
179
|
+
return str | Path
|
|
180
|
+
|
|
181
|
+
match c.type.typename:
|
|
182
|
+
case "text":
|
|
183
|
+
return str
|
|
184
|
+
case "int2" | "int4" | "int8":
|
|
185
|
+
return int
|
|
186
|
+
case "float4" | "float8":
|
|
187
|
+
return float
|
|
188
|
+
case _:
|
|
189
|
+
return str
|
|
190
|
+
|
|
191
|
+
featureclass_name = f"{self.target_table.name}Feature{self.feature_name}"
|
|
192
|
+
|
|
193
|
+
# Create feature class. To do this, we must determine the python type for each column and also if the
|
|
194
|
+
# column is optional or not based on its nullability.
|
|
195
|
+
feature_columns = {
|
|
196
|
+
c.name: (
|
|
197
|
+
Optional[map_type(c)] if c.nullok else map_type(c),
|
|
198
|
+
c.default or None,
|
|
199
|
+
)
|
|
200
|
+
for c in self.feature_columns
|
|
201
|
+
} | {
|
|
202
|
+
"Feature_Name": (
|
|
203
|
+
str,
|
|
204
|
+
self.feature_name,
|
|
205
|
+
), # Set default value for Feature_Name
|
|
206
|
+
self.target_table.name: (str, ...),
|
|
207
|
+
}
|
|
208
|
+
docstring = (
|
|
209
|
+
f"Class to capture fields in a feature {self.feature_name} on table {self.target_table}. "
|
|
210
|
+
"Feature columns include:\n"
|
|
211
|
+
)
|
|
212
|
+
docstring += "\n".join([f" {c.name}" for c in self.feature_columns])
|
|
213
|
+
|
|
214
|
+
model = create_model(
|
|
215
|
+
featureclass_name,
|
|
216
|
+
__base__=FeatureRecord,
|
|
217
|
+
__doc__=docstring,
|
|
218
|
+
**feature_columns,
|
|
219
|
+
)
|
|
220
|
+
model.feature = self # Set value of class variable within the feature class definition.
|
|
221
|
+
|
|
222
|
+
return model
|
|
223
|
+
|
|
224
|
+
def __repr__(self) -> str:
|
|
225
|
+
return (
|
|
226
|
+
f"Feature(target_table={self.target_table.name}, feature_name={self.feature_name}, "
|
|
227
|
+
f"feature_table={self.feature_table.name})"
|
|
228
|
+
)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import sys
|
|
3
|
+
from argparse import ArgumentParser
|
|
4
|
+
from importlib import metadata
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from ipykernel.kernelspec import install as install_kernel
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _dist_name_for_this_package() -> str:
|
|
11
|
+
"""
|
|
12
|
+
Try to resolve the distribution name that provides this package.
|
|
13
|
+
Works in editable installs and wheels.
|
|
14
|
+
"""
|
|
15
|
+
# Top-level package name of this module (your_pkg)
|
|
16
|
+
top_pkg = __name__.split(".")[0]
|
|
17
|
+
|
|
18
|
+
# Map top-level packages -> distributions
|
|
19
|
+
pkg_to_dists = metadata.packages_distributions()
|
|
20
|
+
dists = pkg_to_dists.get(top_pkg) or []
|
|
21
|
+
|
|
22
|
+
# Fall back to project name in METADATA when mapping isn't available
|
|
23
|
+
dist_name = dists[0] if dists else metadata.metadata(top_pkg).get("Name", top_pkg)
|
|
24
|
+
return dist_name
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _normalize_kernel_name(name: str) -> str:
    """Turn an arbitrary label into a simple Jupyter kernel directory name.

    Surrounding whitespace is removed and the text is lower-cased; every run of characters
    other than ``a-z``, ``0-9``, ``.``, ``_`` and ``-`` is then collapsed into a single ``-``.

    Args:
        name: Raw label, e.g. a virtual environment prompt.

    Returns:
        str: The normalized name (empty if ``name`` was empty or only whitespace).
    """
    lowered = name.strip().lower()
    return re.sub(r"[^a-z0-9._-]+", "-", lowered)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _name_for_this_venv() -> str:
    """Return a human-readable name for the active Python environment.

    The name is taken from the ``prompt`` entry of ``<sys.prefix>/pyvenv.cfg``. Quotes
    around the value are removed, because some tools write the prompt as a quoted string
    (``prompt = 'name'``) while others write it bare.

    If there is no ``pyvenv.cfg`` (not a venv) or it has no usable ``prompt`` entry, the
    name of the environment directory is returned, so callers never receive an empty name.

    Returns:
        str: The environment prompt, or the basename of ``sys.prefix`` as a fallback.
    """
    env_dir = Path(sys.prefix)
    try:
        config_text = (env_dir / "pyvenv.cfg").read_text(encoding="utf-8")
    except FileNotFoundError:
        return env_dir.name

    # Anchor to the start of a line so that "prompt=" appearing inside another entry
    # (e.g. a recorded command line) is not mistaken for the prompt setting. Only spaces
    # and tabs are skipped around "=" so an empty value cannot swallow the following line.
    m = re.search(r"^[ \t]*prompt[ \t]*=[ \t]*(?P<prompt>.*)$", config_text, re.MULTILINE)
    prompt = m["prompt"].strip() if m else ""

    # Drop one pair of matching surrounding quotes.
    if len(prompt) >= 2 and prompt[0] == prompt[-1] and prompt[0] in "'\"":
        prompt = prompt[1:-1]

    return prompt or env_dir.name
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def main(argv: list[str] | None = None) -> None:
    """Install a Jupyter kernel named after the active virtual environment.

    The kernel name and display name are derived from the environment prompt. By default
    the kernel spec is written to the per-user kernel directory; with ``--install-local``
    it is written under the active environment's prefix (``sys.prefix``) instead.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) parses ``sys.argv``.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--install-local",
        action="store_true",
        help="Install the kernel under the active environment (sys.prefix) instead of the per-user kernel directory.",
    )
    args = parser.parse_args(argv)

    dist_name = _name_for_this_venv()  # e.g., "deriva-model-template"
    kernel_name = _normalize_kernel_name(dist_name)  # e.g., "deriva-model-template"
    display_name = f"Python ({dist_name})"

    # "user" and "prefix" are mutually exclusive install targets, so pass exactly one:
    # the environment prefix (e.g., .venv/share/jupyter/kernels/..) or the user directory.
    if args.install_local:
        target_arg = {"prefix": sys.prefix}
    else:
        target_arg = {"user": True}

    destination = install_kernel(
        kernel_name=kernel_name,
        display_name=display_name,
        **target_arg,
    )

    # Report where the spec actually went; fall back to a description if the installer
    # did not return a path.
    if not destination:
        destination = sys.prefix if args.install_local else "the user kernel directory"
    print(f"Installed Jupyter kernel '{kernel_name}' with display name '{display_name}' under {destination!s}")


if __name__ == "__main__":
    main()
|
|
File without changes
|