deriva-ml 1.14.0__py3-none-any.whl → 1.14.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +405 -428
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +51 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
- deriva_ml-1.14.26.dist-info/RECORD +40 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -391
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.14.0.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
deriva_ml/feature.py
CHANGED
|
@@ -1,27 +1,48 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
1
|
+
"""Feature implementation for deriva-ml.
|
|
2
|
+
|
|
3
|
+
This module provides classes for defining and managing features in deriva-ml. Features represent measurable
|
|
4
|
+
properties or characteristics that can be associated with records in a table. The module includes:
|
|
5
|
+
|
|
6
|
+
- Feature: Main class for defining and managing features
|
|
7
|
+
- FeatureRecord: Base class for feature records using pydantic models
|
|
8
|
+
|
|
9
|
+
Typical usage example:
|
|
10
|
+
>>> feature = Feature(association_result, model)
|
|
11
|
+
>>> FeatureClass = feature.feature_record_class()
|
|
12
|
+
>>> record = FeatureClass(value="high", confidence=0.95)
|
|
3
13
|
"""
|
|
4
14
|
|
|
5
|
-
from deriva.core.ermrest_model import FindAssociationResult, Column
|
|
6
15
|
from pathlib import Path
|
|
7
|
-
from pydantic import BaseModel, create_model
|
|
8
|
-
from typing import Optional, Type, ClassVar, TYPE_CHECKING
|
|
9
16
|
from types import UnionType
|
|
17
|
+
from typing import TYPE_CHECKING, ClassVar, Optional, Type
|
|
18
|
+
|
|
19
|
+
from deriva.core.ermrest_model import Column, FindAssociationResult
|
|
20
|
+
from pydantic import BaseModel, create_model
|
|
10
21
|
|
|
11
22
|
if TYPE_CHECKING:
|
|
12
|
-
from .
|
|
23
|
+
from model.catalog import DerivaModel
|
|
13
24
|
|
|
14
25
|
|
|
15
26
|
class FeatureRecord(BaseModel):
|
|
16
|
-
"""Base class for
|
|
17
|
-
describe all the columns of a feature.
|
|
27
|
+
"""Base class for dynamically generated feature record models.
|
|
18
28
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
Feature_Name (str):
|
|
22
|
-
feature:
|
|
23
|
-
Returns:
|
|
29
|
+
This class serves as the base for pydantic models that represent feature records. Each feature record
|
|
30
|
+
contains the values and metadata associated with a feature instance.
|
|
24
31
|
|
|
32
|
+
Attributes:
|
|
33
|
+
Execution (Optional[str]): RID of the execution that created this feature record.
|
|
34
|
+
Feature_Name (str): Name of the feature this record belongs to.
|
|
35
|
+
feature (ClassVar[Optional[Feature]]): Reference to the Feature object that created this record.
|
|
36
|
+
|
|
37
|
+
Example:
|
|
38
|
+
>>> class GeneFeature(FeatureRecord):
|
|
39
|
+
... value: str
|
|
40
|
+
... confidence: float
|
|
41
|
+
>>> record = GeneFeature(
|
|
42
|
+
... Feature_Name="expression",
|
|
43
|
+
... value="high",
|
|
44
|
+
... confidence=0.95
|
|
45
|
+
... )
|
|
25
46
|
"""
|
|
26
47
|
|
|
27
48
|
# model_dump of this feature should be compatible with feature table columns.
|
|
@@ -34,53 +55,61 @@ class FeatureRecord(BaseModel):
|
|
|
34
55
|
|
|
35
56
|
@classmethod
|
|
36
57
|
def feature_columns(cls) -> set[Column]:
|
|
37
|
-
"""
|
|
58
|
+
"""Returns all columns specific to this feature.
|
|
38
59
|
|
|
39
60
|
Returns:
|
|
40
|
-
|
|
41
|
-
|
|
61
|
+
set[Column]: Set of feature-specific columns, excluding system and relationship columns.
|
|
42
62
|
"""
|
|
43
63
|
return cls.feature.feature_columns
|
|
44
64
|
|
|
45
65
|
@classmethod
|
|
46
66
|
def asset_columns(cls) -> set[Column]:
|
|
47
|
-
"""
|
|
48
|
-
|
|
49
|
-
Args:
|
|
67
|
+
"""Returns columns that reference asset tables.
|
|
50
68
|
|
|
51
69
|
Returns:
|
|
52
|
-
|
|
53
|
-
|
|
70
|
+
set[Column]: Set of columns that contain references to asset tables.
|
|
54
71
|
"""
|
|
55
72
|
return cls.feature.asset_columns
|
|
56
73
|
|
|
57
74
|
@classmethod
|
|
58
75
|
def term_columns(cls) -> set[Column]:
|
|
59
|
-
"""
|
|
60
|
-
|
|
61
|
-
Args:
|
|
76
|
+
"""Returns columns that reference vocabulary terms.
|
|
62
77
|
|
|
63
78
|
Returns:
|
|
64
|
-
|
|
65
|
-
|
|
79
|
+
set[Column]: Set of columns that contain references to controlled vocabulary terms.
|
|
66
80
|
"""
|
|
67
81
|
return cls.feature.term_columns
|
|
68
82
|
|
|
69
83
|
@classmethod
|
|
70
84
|
def value_columns(cls) -> set[Column]:
|
|
71
|
-
"""
|
|
72
|
-
|
|
73
|
-
Args:
|
|
85
|
+
"""Returns columns that contain direct values.
|
|
74
86
|
|
|
75
87
|
Returns:
|
|
76
|
-
|
|
77
|
-
|
|
88
|
+
set[Column]: Set of columns containing direct values (not references to assets or terms).
|
|
78
89
|
"""
|
|
79
90
|
return cls.feature.value_columns
|
|
80
91
|
|
|
81
92
|
|
|
82
93
|
class Feature:
|
|
83
|
-
"""
|
|
94
|
+
"""Manages feature definitions and their relationships in the catalog.
|
|
95
|
+
|
|
96
|
+
A Feature represents a measurable property or characteristic that can be associated with records in a table.
|
|
97
|
+
Features can include asset references, controlled vocabulary terms, and custom metadata fields.
|
|
98
|
+
|
|
99
|
+
Attributes:
|
|
100
|
+
feature_table: Table containing the feature implementation.
|
|
101
|
+
target_table: Table that the feature is associated with.
|
|
102
|
+
feature_name: Name of the feature (from Feature_Name column default).
|
|
103
|
+
feature_columns: Set of columns specific to this feature.
|
|
104
|
+
asset_columns: Set of columns referencing asset tables.
|
|
105
|
+
term_columns: Set of columns referencing vocabulary tables.
|
|
106
|
+
value_columns: Set of columns containing direct values.
|
|
107
|
+
|
|
108
|
+
Example:
|
|
109
|
+
>>> feature = Feature(association_result, model)
|
|
110
|
+
>>> print(f"Feature {feature.feature_name} on {feature.target_table.name}")
|
|
111
|
+
>>> print("Asset columns:", [c.name for c in feature.asset_columns])
|
|
112
|
+
"""
|
|
84
113
|
|
|
85
114
|
def __init__(self, atable: FindAssociationResult, model: "DerivaModel") -> None:
|
|
86
115
|
self.feature_table = atable.table
|
|
@@ -98,9 +127,7 @@ class Feature:
|
|
|
98
127
|
self.target_table.name,
|
|
99
128
|
"Execution",
|
|
100
129
|
}
|
|
101
|
-
self.feature_columns = {
|
|
102
|
-
c for c in self.feature_table.columns if c.name not in skip_columns
|
|
103
|
-
}
|
|
130
|
+
self.feature_columns = {c for c in self.feature_table.columns if c.name not in skip_columns}
|
|
104
131
|
|
|
105
132
|
assoc_fkeys = {atable.self_fkey} | atable.other_fkeys
|
|
106
133
|
|
|
@@ -117,9 +144,7 @@ class Feature:
|
|
|
117
144
|
if fk not in assoc_fkeys and self._model.is_vocabulary(fk.pk_table)
|
|
118
145
|
}
|
|
119
146
|
|
|
120
|
-
self.value_columns = self.feature_columns - (
|
|
121
|
-
self.asset_columns | self.term_columns
|
|
122
|
-
)
|
|
147
|
+
self.value_columns = self.feature_columns - (self.asset_columns | self.term_columns)
|
|
123
148
|
|
|
124
149
|
def feature_record_class(self) -> type[FeatureRecord]:
|
|
125
150
|
"""Create a pydantic model for entries into the specified feature table
|
|
@@ -129,14 +154,25 @@ class Feature:
|
|
|
129
154
|
"""
|
|
130
155
|
|
|
131
156
|
def map_type(c: Column) -> UnionType | Type[str] | Type[int] | Type[float]:
|
|
132
|
-
"""
|
|
157
|
+
"""Maps a Deriva column type to a Python/pydantic type.
|
|
158
|
+
|
|
159
|
+
Converts ERMrest column types to appropriate Python types for use in pydantic models.
|
|
160
|
+
Special handling is provided for asset columns which can accept either strings or Path objects.
|
|
133
161
|
|
|
134
162
|
Args:
|
|
135
|
-
c: column to
|
|
136
|
-
c: Column:
|
|
163
|
+
c: ERMrest column to map to a Python type.
|
|
137
164
|
|
|
138
165
|
Returns:
|
|
139
|
-
|
|
166
|
+
UnionType | Type[str] | Type[int] | Type[float]: Appropriate Python type for the column:
|
|
167
|
+
- str | Path for asset columns
|
|
168
|
+
- str for text columns
|
|
169
|
+
- int for integer columns
|
|
170
|
+
- float for floating point columns
|
|
171
|
+
- str for all other types
|
|
172
|
+
|
|
173
|
+
Example:
|
|
174
|
+
>>> col = Column(name="score", type="float4")
|
|
175
|
+
>>> typ = map_type(col) # Returns float
|
|
140
176
|
"""
|
|
141
177
|
if c.name in {c.name for c in self.asset_columns}:
|
|
142
178
|
return str | Path
|
|
@@ -168,7 +204,10 @@ class Feature:
|
|
|
168
204
|
), # Set default value for Feature_Name
|
|
169
205
|
self.target_table.name: (str, ...),
|
|
170
206
|
}
|
|
171
|
-
docstring =
|
|
207
|
+
docstring = (
|
|
208
|
+
f"Class to capture fields in a feature {self.feature_name} on table {self.target_table}. "
|
|
209
|
+
"Feature columns include:\n"
|
|
210
|
+
)
|
|
172
211
|
docstring += "\n".join([f" {c.name}" for c in self.feature_columns])
|
|
173
212
|
|
|
174
213
|
model = create_model(
|
|
@@ -177,9 +216,7 @@ class Feature:
|
|
|
177
216
|
__doc__=docstring,
|
|
178
217
|
**feature_columns,
|
|
179
218
|
)
|
|
180
|
-
model.feature =
|
|
181
|
-
self # Set value of class variable within the feature class definition.
|
|
182
|
-
)
|
|
219
|
+
model.feature = self # Set value of class variable within the feature class definition.
|
|
183
220
|
|
|
184
221
|
return model
|
|
185
222
|
|
|
File without changes
|
|
@@ -1,27 +1,55 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
3
|
-
the primary interface to the Deriva based catalogs. The module also implements the Feature and Vocabulary functions
|
|
4
|
-
in the DerivaML.
|
|
5
|
-
|
|
6
|
-
DerivaML and its associated classes all depend on a catalog that implements a `deriva-ml` schema with tables and
|
|
7
|
-
relationships that follow a specific data model.
|
|
2
|
+
Model management for Deriva ML catalogs.
|
|
8
3
|
|
|
4
|
+
This module provides the DerivaModel class which augments the standard Deriva model class with
|
|
5
|
+
ML-specific functionality. It handles schema management, feature definitions, and asset tracking.
|
|
9
6
|
"""
|
|
10
7
|
|
|
11
|
-
from
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
# Standard library imports
|
|
11
|
+
from collections import Counter
|
|
12
|
+
from typing import Any, Callable, Final, Iterable, NewType, TypeAlias
|
|
13
|
+
|
|
12
14
|
from deriva.core.ermrest_catalog import ErmrestCatalog
|
|
13
|
-
from .feature import Feature
|
|
14
15
|
|
|
15
|
-
|
|
16
|
-
|
|
16
|
+
# Deriva imports
|
|
17
|
+
from deriva.core.ermrest_model import Column, FindAssociationResult, Model, Schema, Table
|
|
18
|
+
|
|
19
|
+
# Third-party imports
|
|
20
|
+
from pydantic import ConfigDict, validate_call
|
|
21
|
+
|
|
22
|
+
from deriva_ml.core.definitions import (
|
|
17
23
|
ML_SCHEMA,
|
|
18
|
-
|
|
24
|
+
DerivaAssetColumns,
|
|
19
25
|
TableDefinition,
|
|
20
26
|
)
|
|
27
|
+
from deriva_ml.core.exceptions import DerivaMLException, DerivaMLTableTypeError
|
|
21
28
|
|
|
22
|
-
|
|
23
|
-
from
|
|
24
|
-
|
|
29
|
+
# Local imports
|
|
30
|
+
from deriva_ml.feature import Feature
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
from icecream import ic
|
|
34
|
+
except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
35
|
+
ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Define common types:
|
|
39
|
+
TableInput: TypeAlias = str | Table
|
|
40
|
+
SchemaDict: TypeAlias = dict[str, Schema]
|
|
41
|
+
FeatureList: TypeAlias = Iterable[Feature]
|
|
42
|
+
SchemaName = NewType("SchemaName", str)
|
|
43
|
+
ColumnSet: TypeAlias = set[Column]
|
|
44
|
+
AssociationResult: TypeAlias = FindAssociationResult
|
|
45
|
+
TableSet: TypeAlias = set[Table]
|
|
46
|
+
PathList: TypeAlias = list[list[Table]]
|
|
47
|
+
|
|
48
|
+
# Define constants:
|
|
49
|
+
VOCAB_COLUMNS: Final[set[str]] = {"NAME", "URI", "SYNONYMS", "DESCRIPTION", "ID"}
|
|
50
|
+
ASSET_COLUMNS: Final[set[str]] = {"Filename", "URL", "Length", "MD5", "Description"}
|
|
51
|
+
|
|
52
|
+
FilterPredicate = Callable[[Table], bool]
|
|
25
53
|
|
|
26
54
|
|
|
27
55
|
class DerivaModel:
|
|
@@ -30,9 +58,8 @@ class DerivaModel:
|
|
|
30
58
|
This class provides a number of DerivaML specific methods that augment the interface in the deriva model class.
|
|
31
59
|
|
|
32
60
|
Attributes:
|
|
33
|
-
domain_schema: Schema name for domain
|
|
61
|
+
domain_schema: Schema name for domain-specific tables and relationships.
|
|
34
62
|
model: ERMRest model for the catalog.
|
|
35
|
-
schemas: ERMRest model for the catalog.
|
|
36
63
|
catalog: ERMRest catalog for the model
|
|
37
64
|
hostname: Hostname of the ERMRest server, or "localhost" when the catalog is not an ErmrestCatalog.
|
|
38
65
|
ml_schema: The ML schema for the catalog.
|
|
@@ -41,7 +68,10 @@ class DerivaModel:
|
|
|
41
68
|
"""
|
|
42
69
|
|
|
43
70
|
def __init__(
|
|
44
|
-
self,
|
|
71
|
+
self,
|
|
72
|
+
model: Model,
|
|
73
|
+
ml_schema: str = ML_SCHEMA,
|
|
74
|
+
domain_schema: str | None = None,
|
|
45
75
|
):
|
|
46
76
|
"""Create and initialize a DerivaML instance.
|
|
47
77
|
|
|
@@ -49,64 +79,66 @@ class DerivaModel:
|
|
|
49
79
|
This class is intended to be used as a base class on which domain-specific interfaces are built.
|
|
50
80
|
|
|
51
81
|
Args:
|
|
82
|
+
model: The ERMRest model for the catalog.
|
|
83
|
+
ml_schema: The ML schema name.
|
|
84
|
+
domain_schema: The domain schema name.
|
|
52
85
|
"""
|
|
53
86
|
self.model = model
|
|
54
87
|
self.configuration = None
|
|
55
88
|
self.catalog: ErmrestCatalog = self.model.catalog
|
|
56
|
-
self.hostname = (
|
|
57
|
-
self.catalog.deriva_server.server
|
|
58
|
-
if isinstance(self.catalog, ErmrestCatalog)
|
|
59
|
-
else "localhost"
|
|
60
|
-
)
|
|
61
|
-
self.schemas = self.model.schemas
|
|
89
|
+
self.hostname = self.catalog.deriva_server.server if isinstance(self.catalog, ErmrestCatalog) else "localhost"
|
|
62
90
|
|
|
63
91
|
self.ml_schema = ml_schema
|
|
64
|
-
builtin_schemas =
|
|
65
|
-
|
|
66
|
-
self.domain_schema = (
|
|
67
|
-
domain_schema
|
|
68
|
-
or [
|
|
69
|
-
s for s in self.model.schemas.keys() if s not in builtin_schemas
|
|
70
|
-
].pop()
|
|
71
|
-
)
|
|
72
|
-
except IndexError:
|
|
73
|
-
# No domain schema defined.
|
|
92
|
+
builtin_schemas = ("public", self.ml_schema, "www", "WWW")
|
|
93
|
+
if domain_schema:
|
|
74
94
|
self.domain_schema = domain_schema
|
|
95
|
+
else:
|
|
96
|
+
if len(user_schemas := {k for k in self.model.schemas.keys()} - set(builtin_schemas)) == 1:
|
|
97
|
+
self.domain_schema = user_schemas.pop()
|
|
98
|
+
else:
|
|
99
|
+
raise DerivaMLException(f"Ambiguous domain schema: {user_schemas}")
|
|
100
|
+
|
|
101
|
+
def refresh_model(self) -> None:
|
|
102
|
+
self.model = self.catalog.getCatalogModel()
|
|
103
|
+
|
|
104
|
+
@property
|
|
105
|
+
def schemas(self) -> dict[str, Schema]:
|
|
106
|
+
return self.model.schemas
|
|
75
107
|
|
|
76
108
|
@property
|
|
77
109
|
def chaise_config(self) -> dict[str, Any]:
|
|
78
110
|
"""Return the chaise configuration."""
|
|
79
111
|
return self.model.chaise_config
|
|
80
112
|
|
|
81
|
-
def __getattr__(self, name):
|
|
113
|
+
def __getattr__(self, name: str) -> Any:
|
|
82
114
|
# Called only if `name` is not found on this DerivaModel instance. Delegate attributes to the underlying model.
|
|
83
115
|
return getattr(self.model, name)
|
|
84
116
|
|
|
85
|
-
def name_to_table(self, table:
|
|
117
|
+
def name_to_table(self, table: TableInput) -> Table:
|
|
86
118
|
"""Return the table object corresponding to the given table name.
|
|
87
119
|
|
|
88
120
|
If the table name appears in more than one schema, return the first one you find.
|
|
89
121
|
|
|
90
122
|
Args:
|
|
91
123
|
table: An ERMRest table object or a string that is the name of the table.
|
|
92
|
-
table: str | Table:
|
|
93
124
|
|
|
94
125
|
Returns:
|
|
95
126
|
Table object.
|
|
96
127
|
"""
|
|
97
128
|
if isinstance(table, Table):
|
|
98
129
|
return table
|
|
99
|
-
|
|
130
|
+
if table in (s := self.model.schemas[self.domain_schema].tables):
|
|
131
|
+
return s[table]
|
|
132
|
+
for s in [self.model.schemas[sname] for sname in [self.domain_schema, self.ml_schema, "WWW"]]:
|
|
100
133
|
if table in s.tables.keys():
|
|
101
134
|
return s.tables[table]
|
|
102
135
|
raise DerivaMLException(f"The table {table} doesn't exist.")
|
|
103
136
|
|
|
104
|
-
def is_vocabulary(self, table_name:
|
|
137
|
+
def is_vocabulary(self, table_name: TableInput) -> bool:
|
|
105
138
|
"""Check if a given table is a controlled vocabulary table.
|
|
106
139
|
|
|
107
140
|
Args:
|
|
108
141
|
table_name: An ERMRest table object or the name of the table.
|
|
109
|
-
table_name: str | Table:
|
|
110
142
|
|
|
111
143
|
Returns:
|
|
112
144
|
Table object if the table is a controlled vocabulary, False otherwise.
|
|
@@ -126,7 +158,7 @@ class DerivaModel:
|
|
|
126
158
|
pure: bool = True,
|
|
127
159
|
min_arity: int = 2,
|
|
128
160
|
max_arity: int = 2,
|
|
129
|
-
) -> bool | set | int:
|
|
161
|
+
) -> bool | set[str] | int:
|
|
130
162
|
"""Check the specified table to see if it is an association table.
|
|
131
163
|
|
|
132
164
|
Args:
|
|
@@ -140,12 +172,10 @@ class DerivaModel:
|
|
|
140
172
|
|
|
141
173
|
"""
|
|
142
174
|
table = self.name_to_table(table_name)
|
|
143
|
-
return table.is_association(
|
|
144
|
-
unqualified=unqualified, pure=pure, min_arity=min_arity, max_arity=max_arity
|
|
145
|
-
)
|
|
175
|
+
return table.is_association(unqualified=unqualified, pure=pure, min_arity=min_arity, max_arity=max_arity)
|
|
146
176
|
|
|
147
|
-
def find_association(self, table1: Table | str, table2: Table | str) -> Table:
|
|
148
|
-
"""Given two tables, return an association table that connects the two
|
|
177
|
+
def find_association(self, table1: Table | str, table2: Table | str) -> tuple[Table, Column, Column]:
|
|
178
|
+
"""Given two tables, return an association table that connects the two and the two columns used to link them..
|
|
149
179
|
|
|
150
180
|
Raises:
|
|
151
181
|
DerivaMLException if there is either no association table or more than one association table.
|
|
@@ -154,22 +184,21 @@ class DerivaModel:
|
|
|
154
184
|
table2 = self.name_to_table(table2)
|
|
155
185
|
|
|
156
186
|
tables = [
|
|
157
|
-
a.table
|
|
187
|
+
(a.table, a.self_fkey.columns[0].name, other_key.columns[0].name)
|
|
158
188
|
for a in table1.find_associations(pure=False)
|
|
159
|
-
if a.other_fkeys.pop().pk_table == table2
|
|
189
|
+
if len(a.other_fkeys) == 1 and (other_key := a.other_fkeys.pop()).pk_table == table2
|
|
160
190
|
]
|
|
191
|
+
|
|
161
192
|
if len(tables) == 1:
|
|
162
193
|
return tables[0]
|
|
163
194
|
elif len(tables) == 0:
|
|
164
|
-
raise DerivaMLException(
|
|
165
|
-
f"No association tables found between {table1.name} and {table2.name}."
|
|
166
|
-
)
|
|
195
|
+
raise DerivaMLException(f"No association tables found between {table1.name} and {table2.name}.")
|
|
167
196
|
else:
|
|
168
197
|
raise DerivaMLException(
|
|
169
198
|
f"There are {len(tables)} association tables between {table1.name} and {table2.name}."
|
|
170
199
|
)
|
|
171
200
|
|
|
172
|
-
def is_asset(self, table_name:
|
|
201
|
+
def is_asset(self, table_name: TableInput) -> bool:
|
|
173
202
|
"""True if the specified table is an asset table.
|
|
174
203
|
|
|
175
204
|
Args:
|
|
@@ -185,24 +214,14 @@ class DerivaModel:
|
|
|
185
214
|
|
|
186
215
|
def find_assets(self, with_metadata: bool = False) -> list[Table]:
|
|
187
216
|
"""Return the list of asset tables in the current model"""
|
|
188
|
-
return [
|
|
189
|
-
t
|
|
190
|
-
for s in self.model.schemas.values()
|
|
191
|
-
for t in s.tables.values()
|
|
192
|
-
if self.is_asset(t)
|
|
193
|
-
]
|
|
217
|
+
return [t for s in self.model.schemas.values() for t in s.tables.values() if self.is_asset(t)]
|
|
194
218
|
|
|
195
219
|
def find_vocabularies(self) -> list[Table]:
|
|
196
220
|
"""Return a list of all the controlled vocabulary tables in the domain schema."""
|
|
197
|
-
return [
|
|
198
|
-
t
|
|
199
|
-
for s in self.model.schemas.values()
|
|
200
|
-
for t in s.tables.values()
|
|
201
|
-
if self.is_vocabulary(t)
|
|
202
|
-
]
|
|
221
|
+
return [t for s in self.model.schemas.values() for t in s.tables.values() if self.is_vocabulary(t)]
|
|
203
222
|
|
|
204
223
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
205
|
-
def find_features(self, table:
|
|
224
|
+
def find_features(self, table: TableInput) -> Iterable[Feature]:
|
|
206
225
|
"""List the names of the features in the specified table.
|
|
207
226
|
|
|
208
227
|
Args:
|
|
@@ -215,15 +234,13 @@ class DerivaModel:
|
|
|
215
234
|
table = self.name_to_table(table)
|
|
216
235
|
|
|
217
236
|
def is_feature(a: FindAssociationResult) -> bool:
|
|
218
|
-
"""
|
|
237
|
+
"""Check if association represents a feature.
|
|
219
238
|
|
|
220
239
|
Args:
|
|
221
|
-
|
|
222
|
-
|
|
240
|
+
a: Association result to check
|
|
223
241
|
Returns:
|
|
224
|
-
|
|
242
|
+
bool: True if association represents a feature
|
|
225
243
|
"""
|
|
226
|
-
# return {'Feature_Name', 'Execution'}.issubset({c.name for c in a.table.columns})
|
|
227
244
|
return {
|
|
228
245
|
"Feature_Name",
|
|
229
246
|
"Execution",
|
|
@@ -231,12 +248,10 @@ class DerivaModel:
|
|
|
231
248
|
}.issubset({c.name for c in a.table.columns})
|
|
232
249
|
|
|
233
250
|
return [
|
|
234
|
-
Feature(a, self)
|
|
235
|
-
for a in table.find_associations(min_arity=3, max_arity=3, pure=False)
|
|
236
|
-
if is_feature(a)
|
|
251
|
+
Feature(a, self) for a in table.find_associations(min_arity=3, max_arity=3, pure=False) if is_feature(a)
|
|
237
252
|
]
|
|
238
253
|
|
|
239
|
-
def lookup_feature(self, table:
|
|
254
|
+
def lookup_feature(self, table: TableInput, feature_name: str) -> Feature:
|
|
240
255
|
"""Lookup the named feature associated with the provided table.
|
|
241
256
|
|
|
242
257
|
Args:
|
|
@@ -252,31 +267,20 @@ class DerivaModel:
|
|
|
252
267
|
"""
|
|
253
268
|
table = self.name_to_table(table)
|
|
254
269
|
try:
|
|
255
|
-
return [
|
|
256
|
-
f for f in self.find_features(table) if f.feature_name == feature_name
|
|
257
|
-
][0]
|
|
270
|
+
return [f for f in self.find_features(table) if f.feature_name == feature_name][0]
|
|
258
271
|
except IndexError:
|
|
259
|
-
raise DerivaMLException(
|
|
260
|
-
f"Feature {table.name}:{feature_name} doesn't exist."
|
|
261
|
-
)
|
|
272
|
+
raise DerivaMLException(f"Feature {table.name}:{feature_name} doesn't exist.")
|
|
262
273
|
|
|
263
274
|
def asset_metadata(self, table: str | Table) -> set[str]:
|
|
264
275
|
"""Return the metadata columns for an asset table."""
|
|
265
276
|
|
|
266
277
|
table = self.name_to_table(table)
|
|
267
|
-
asset_columns = {
|
|
268
|
-
"Filename",
|
|
269
|
-
"URL",
|
|
270
|
-
"Length",
|
|
271
|
-
"MD5",
|
|
272
|
-
"Description",
|
|
273
|
-
}.union(set(DerivaSystemColumns))
|
|
274
278
|
|
|
275
279
|
if not self.is_asset(table):
|
|
276
|
-
raise
|
|
277
|
-
return {c.name for c in table.columns} -
|
|
280
|
+
raise DerivaMLTableTypeError("asset table", table.name)
|
|
281
|
+
return {c.name for c in table.columns} - DerivaAssetColumns
|
|
278
282
|
|
|
279
|
-
def apply(self):
|
|
283
|
+
def apply(self) -> None:
|
|
280
284
|
"""Call ERMRestModel.apply"""
|
|
281
285
|
if self.catalog == "file-system":
|
|
282
286
|
raise DerivaMLException("Cannot apply() to non-catalog model.")
|
|
@@ -284,45 +288,38 @@ class DerivaModel:
|
|
|
284
288
|
self.model.apply()
|
|
285
289
|
|
|
286
290
|
def _table_relationship(
|
|
287
|
-
self,
|
|
291
|
+
self,
|
|
292
|
+
table1: TableInput,
|
|
293
|
+
table2: TableInput,
|
|
288
294
|
) -> tuple[Column, Column]:
|
|
289
295
|
"""Return columns used to relate two tables."""
|
|
290
296
|
table1 = self.name_to_table(table1)
|
|
291
297
|
table2 = self.name_to_table(table2)
|
|
292
298
|
relationships = [
|
|
293
|
-
(fk.foreign_key_columns[0], fk.referenced_columns[0])
|
|
294
|
-
for fk in table1.foreign_keys
|
|
295
|
-
if fk.pk_table == table2
|
|
299
|
+
(fk.foreign_key_columns[0], fk.referenced_columns[0]) for fk in table1.foreign_keys if fk.pk_table == table2
|
|
296
300
|
]
|
|
297
301
|
relationships.extend(
|
|
298
|
-
[
|
|
299
|
-
(fk.referenced_columns[0], fk.foreign_key_columns[0])
|
|
300
|
-
for fk in table1.referenced_by
|
|
301
|
-
if fk.table == table2
|
|
302
|
-
]
|
|
302
|
+
[(fk.referenced_columns[0], fk.foreign_key_columns[0]) for fk in table1.referenced_by if fk.table == table2]
|
|
303
303
|
)
|
|
304
304
|
if len(relationships) != 1:
|
|
305
|
-
raise DerivaMLException(
|
|
306
|
-
f"Ambiguous linkage between {table1.name} and {table2.name}"
|
|
307
|
-
)
|
|
305
|
+
raise DerivaMLException(f"Ambiguous linkage between {table1.name} and {table2.name}")
|
|
308
306
|
return relationships[0]
|
|
309
307
|
|
|
310
308
|
def _schema_to_paths(
|
|
311
309
|
self,
|
|
312
|
-
root: Table = None,
|
|
313
|
-
path:
|
|
310
|
+
root: Table | None = None,
|
|
311
|
+
path: list[Table] | None = None,
|
|
314
312
|
) -> list[list[Table]]:
|
|
315
|
-
"""
|
|
316
|
-
|
|
317
|
-
Walk a schema graph and return a list all the paths through the graph.
|
|
313
|
+
"""Return a list of paths through the schema graph.
|
|
318
314
|
|
|
319
315
|
Args:
|
|
320
|
-
|
|
316
|
+
root: The root table to start from.
|
|
317
|
+
path: The current path being built.
|
|
321
318
|
|
|
322
319
|
Returns:
|
|
323
|
-
|
|
324
|
-
|
|
320
|
+
A list of paths through the schema graph.
|
|
325
321
|
"""
|
|
322
|
+
path = path or []
|
|
326
323
|
|
|
327
324
|
root = root or self.model.schemas[self.ml_schema].tables["Dataset"]
|
|
328
325
|
path = path.copy() if path else []
|
|
@@ -332,21 +329,11 @@ class DerivaModel:
|
|
|
332
329
|
|
|
333
330
|
def find_arcs(table: Table) -> set[Table]:
|
|
334
331
|
"""Given a path through the model, return the FKs that link the tables"""
|
|
335
|
-
arc_list = [fk.pk_table for fk in table.foreign_keys] + [
|
|
336
|
-
|
|
337
|
-
]
|
|
338
|
-
arc_list = [
|
|
339
|
-
t
|
|
340
|
-
for t in arc_list
|
|
341
|
-
if t.schema.name in {self.domain_schema, self.ml_schema}
|
|
342
|
-
]
|
|
332
|
+
arc_list = [fk.pk_table for fk in table.foreign_keys] + [fk.table for fk in table.referenced_by]
|
|
333
|
+
arc_list = [t for t in arc_list if t.schema.name in {self.domain_schema, self.ml_schema}]
|
|
343
334
|
domain_tables = [t for t in arc_list if t.schema.name == self.domain_schema]
|
|
344
|
-
if multiple_columns := [
|
|
345
|
-
|
|
346
|
-
]:
|
|
347
|
-
raise DerivaMLException(
|
|
348
|
-
f"Ambiguous relationship in {table.name} {multiple_columns}"
|
|
349
|
-
)
|
|
335
|
+
if multiple_columns := [c for c, cnt in Counter(domain_tables).items() if cnt > 1]:
|
|
336
|
+
raise DerivaMLException(f"Ambiguous relationship in {table.name} {multiple_columns}")
|
|
350
337
|
return set(arc_list)
|
|
351
338
|
|
|
352
339
|
def is_nested_dataset_loopback(n1: Table, n2: Table) -> bool:
|
|
@@ -354,9 +341,7 @@ class DerivaModel:
|
|
|
354
341
|
# If we have node_name <- node_name_dataset-> Dataset then we are looping
|
|
355
342
|
# back around to a new dataset element
|
|
356
343
|
dataset_table = self.model.schemas[self.ml_schema].tables["Dataset"]
|
|
357
|
-
assoc_table = [
|
|
358
|
-
a for a in dataset_table.find_associations() if a.table == n2
|
|
359
|
-
]
|
|
344
|
+
assoc_table = [a for a in dataset_table.find_associations() if a.table == n2]
|
|
360
345
|
return len(assoc_table) == 1 and n1 != dataset_table
|
|
361
346
|
|
|
362
347
|
# Don't follow vocabulary terms back to their use.
|
|
@@ -372,9 +357,7 @@ class DerivaModel:
|
|
|
372
357
|
if is_nested_dataset_loopback(root, child):
|
|
373
358
|
continue
|
|
374
359
|
if child in path:
|
|
375
|
-
raise DerivaMLException(
|
|
376
|
-
f"Cycle in schema path: {child.name} path:{[p.name for p in path]}"
|
|
377
|
-
)
|
|
360
|
+
raise DerivaMLException(f"Cycle in schema path: {child.name} path:{[p.name for p in path]}")
|
|
378
361
|
|
|
379
362
|
paths.extend(self._schema_to_paths(child, path))
|
|
380
363
|
return paths
|
|
@@ -382,6 +365,4 @@ class DerivaModel:
|
|
|
382
365
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
383
366
|
def create_table(self, table_def: TableDefinition) -> Table:
|
|
384
367
|
"""Create a new table from TableDefinition."""
|
|
385
|
-
return self.model.schemas[self.domain_schema].create_table(
|
|
386
|
-
table_def.model_dump()
|
|
387
|
-
)
|
|
368
|
+
return self.model.schemas[self.domain_schema].create_table(table_def.model_dump())
|