deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +69 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +31 -0
- deriva_ml/catalog/clone.py +1939 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +845 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +126 -110
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +543 -242
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +223 -34
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
- deriva_ml-1.17.12.dist-info/RECORD +77 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.10.dist-info/RECORD +0 -45
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
deriva_ml/model/catalog.py
CHANGED
|
@@ -12,10 +12,17 @@ from collections import Counter, defaultdict
|
|
|
12
12
|
from graphlib import CycleError, TopologicalSorter
|
|
13
13
|
from typing import Any, Callable, Final, Iterable, NewType, TypeAlias
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
15
|
+
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
16
|
+
import importlib
|
|
17
|
+
_ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")
|
|
18
|
+
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
|
|
19
|
+
|
|
20
|
+
ErmrestCatalog = _ermrest_catalog.ErmrestCatalog
|
|
21
|
+
Column = _ermrest_model.Column
|
|
22
|
+
FindAssociationResult = _ermrest_model.FindAssociationResult
|
|
23
|
+
Model = _ermrest_model.Model
|
|
24
|
+
Schema = _ermrest_model.Schema
|
|
25
|
+
Table = _ermrest_model.Table
|
|
19
26
|
|
|
20
27
|
# Third-party imports
|
|
21
28
|
from pydantic import ConfigDict, validate_call
|
|
@@ -23,14 +30,16 @@ from pydantic import ConfigDict, validate_call
|
|
|
23
30
|
from deriva_ml.core.definitions import (
|
|
24
31
|
ML_SCHEMA,
|
|
25
32
|
RID,
|
|
33
|
+
SYSTEM_SCHEMAS,
|
|
26
34
|
DerivaAssetColumns,
|
|
27
35
|
TableDefinition,
|
|
36
|
+
get_domain_schemas,
|
|
37
|
+
is_system_schema,
|
|
28
38
|
)
|
|
29
39
|
from deriva_ml.core.exceptions import DerivaMLException, DerivaMLTableTypeError
|
|
30
40
|
|
|
31
41
|
# Local imports
|
|
32
42
|
from deriva_ml.feature import Feature
|
|
33
|
-
from deriva_ml.protocols.dataset import DatasetLike
|
|
34
43
|
|
|
35
44
|
try:
|
|
36
45
|
from icecream import ic
|
|
@@ -61,12 +70,12 @@ class DerivaModel:
|
|
|
61
70
|
This class provides a number of DerivaML specific methods that augment the interface in the deriva model class.
|
|
62
71
|
|
|
63
72
|
Attributes:
|
|
64
|
-
domain_schema: Schema name for domain-specific tables and relationships.
|
|
65
73
|
model: ERMRest model for the catalog.
|
|
66
|
-
catalog: ERMRest catalog for the model
|
|
67
|
-
hostname:
|
|
68
|
-
ml_schema: The ML schema for the catalog.
|
|
69
|
-
|
|
74
|
+
catalog: ERMRest catalog for the model.
|
|
75
|
+
hostname: Hostname of the ERMRest server.
|
|
76
|
+
ml_schema: The ML schema name for the catalog.
|
|
77
|
+
domain_schemas: Frozenset of all domain schema names in the catalog.
|
|
78
|
+
default_schema: The default schema for table creation operations.
|
|
70
79
|
|
|
71
80
|
"""
|
|
72
81
|
|
|
@@ -74,17 +83,22 @@ class DerivaModel:
|
|
|
74
83
|
self,
|
|
75
84
|
model: Model,
|
|
76
85
|
ml_schema: str = ML_SCHEMA,
|
|
77
|
-
|
|
86
|
+
domain_schemas: set[str] | None = None,
|
|
87
|
+
default_schema: str | None = None,
|
|
78
88
|
):
|
|
79
|
-
"""Create and initialize a
|
|
89
|
+
"""Create and initialize a DerivaModel instance.
|
|
80
90
|
|
|
81
|
-
This method will connect to a catalog
|
|
91
|
+
This method will connect to a catalog and initialize schema configuration.
|
|
82
92
|
This class is intended to be used as a base class on which domain-specific interfaces are built.
|
|
83
93
|
|
|
84
94
|
Args:
|
|
85
95
|
model: The ERMRest model for the catalog.
|
|
86
96
|
ml_schema: The ML schema name.
|
|
87
|
-
|
|
97
|
+
domain_schemas: Optional explicit set of domain schema names. If None,
|
|
98
|
+
auto-detects all non-system schemas.
|
|
99
|
+
default_schema: The default schema for table creation operations. If None
|
|
100
|
+
and there is exactly one domain schema, that schema is used as default.
|
|
101
|
+
If there are multiple domain schemas, default_schema must be specified.
|
|
88
102
|
"""
|
|
89
103
|
self.model = model
|
|
90
104
|
self.configuration = None
|
|
@@ -92,27 +106,182 @@ class DerivaModel:
|
|
|
92
106
|
self.hostname = self.catalog.deriva_server.server if isinstance(self.catalog, ErmrestCatalog) else "localhost"
|
|
93
107
|
|
|
94
108
|
self.ml_schema = ml_schema
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
109
|
+
self._system_schemas = frozenset(SYSTEM_SCHEMAS | {ml_schema})
|
|
110
|
+
|
|
111
|
+
# Determine domain schemas
|
|
112
|
+
if domain_schemas is not None:
|
|
113
|
+
self.domain_schemas = frozenset(domain_schemas)
|
|
98
114
|
else:
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
115
|
+
# Auto-detect all domain schemas
|
|
116
|
+
self.domain_schemas = get_domain_schemas(self.model.schemas.keys(), ml_schema)
|
|
117
|
+
|
|
118
|
+
# Determine default schema for table creation
|
|
119
|
+
if default_schema is not None:
|
|
120
|
+
if default_schema not in self.domain_schemas:
|
|
121
|
+
raise DerivaMLException(
|
|
122
|
+
f"default_schema '{default_schema}' is not in domain_schemas: {self.domain_schemas}"
|
|
123
|
+
)
|
|
124
|
+
self.default_schema = default_schema
|
|
125
|
+
elif len(self.domain_schemas) == 1:
|
|
126
|
+
# Single domain schema - use it as default
|
|
127
|
+
self.default_schema = next(iter(self.domain_schemas))
|
|
128
|
+
elif len(self.domain_schemas) == 0:
|
|
129
|
+
# No domain schemas - default_schema will be None
|
|
130
|
+
self.default_schema = None
|
|
131
|
+
else:
|
|
132
|
+
# Multiple domain schemas, no explicit default
|
|
133
|
+
self.default_schema = None
|
|
134
|
+
|
|
135
|
+
def is_system_schema(self, schema_name: str) -> bool:
|
|
136
|
+
"""Check if a schema is a system or ML schema.
|
|
137
|
+
|
|
138
|
+
Args:
|
|
139
|
+
schema_name: Name of the schema to check.
|
|
140
|
+
|
|
141
|
+
Returns:
|
|
142
|
+
True if the schema is a system or ML schema.
|
|
143
|
+
"""
|
|
144
|
+
return is_system_schema(schema_name, self.ml_schema)
|
|
145
|
+
|
|
146
|
+
def is_domain_schema(self, schema_name: str) -> bool:
|
|
147
|
+
"""Check if a schema is a domain schema.
|
|
148
|
+
|
|
149
|
+
Args:
|
|
150
|
+
schema_name: Name of the schema to check.
|
|
151
|
+
|
|
152
|
+
Returns:
|
|
153
|
+
True if the schema is a domain schema.
|
|
154
|
+
"""
|
|
155
|
+
return schema_name in self.domain_schemas
|
|
156
|
+
|
|
157
|
+
def _require_default_schema(self) -> str:
|
|
158
|
+
"""Get default schema, raising an error if not set.
|
|
159
|
+
|
|
160
|
+
Returns:
|
|
161
|
+
The default schema name.
|
|
162
|
+
|
|
163
|
+
Raises:
|
|
164
|
+
DerivaMLException: If default_schema is not set.
|
|
165
|
+
"""
|
|
166
|
+
if self.default_schema is None:
|
|
167
|
+
raise DerivaMLException(
|
|
168
|
+
f"No default_schema set. With multiple domain schemas {self.domain_schemas}, "
|
|
169
|
+
"you must either specify a default_schema when creating DerivaML or "
|
|
170
|
+
"pass an explicit schema parameter to this method."
|
|
171
|
+
)
|
|
172
|
+
return self.default_schema
|
|
103
173
|
|
|
104
174
|
def refresh_model(self) -> None:
|
|
105
175
|
self.model = self.catalog.getCatalogModel()
|
|
106
176
|
|
|
107
|
-
@property
|
|
108
|
-
def schemas(self) -> dict[str, Schema]:
|
|
109
|
-
return self.model.schemas
|
|
110
|
-
|
|
111
177
|
@property
|
|
112
178
|
def chaise_config(self) -> dict[str, Any]:
|
|
113
179
|
"""Return the chaise configuration."""
|
|
114
180
|
return self.model.chaise_config
|
|
115
181
|
|
|
182
|
+
def get_schema_description(self, include_system_columns: bool = False) -> dict[str, Any]:
|
|
183
|
+
"""Return a JSON-serializable description of the catalog schema structure.
|
|
184
|
+
|
|
185
|
+
Provides a structured representation of the domain and ML schemas including
|
|
186
|
+
tables, columns, foreign keys, and relationships. Useful for understanding
|
|
187
|
+
the data model structure programmatically.
|
|
188
|
+
|
|
189
|
+
Args:
|
|
190
|
+
include_system_columns: If True, include RID, RCT, RMT, RCB, RMB columns.
|
|
191
|
+
Default False to reduce output size.
|
|
192
|
+
|
|
193
|
+
Returns:
|
|
194
|
+
Dictionary with schema structure:
|
|
195
|
+
{
|
|
196
|
+
"domain_schemas": ["schema_name1", "schema_name2"],
|
|
197
|
+
"default_schema": "schema_name1",
|
|
198
|
+
"ml_schema": "deriva-ml",
|
|
199
|
+
"schemas": {
|
|
200
|
+
"schema_name": {
|
|
201
|
+
"tables": {
|
|
202
|
+
"TableName": {
|
|
203
|
+
"comment": "description",
|
|
204
|
+
"is_vocabulary": bool,
|
|
205
|
+
"is_asset": bool,
|
|
206
|
+
"is_association": bool,
|
|
207
|
+
"columns": [...],
|
|
208
|
+
"foreign_keys": [...],
|
|
209
|
+
"features": [...]
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
"""
|
|
216
|
+
system_columns = {"RID", "RCT", "RMT", "RCB", "RMB"}
|
|
217
|
+
result = {
|
|
218
|
+
"domain_schemas": sorted(self.domain_schemas),
|
|
219
|
+
"default_schema": self.default_schema,
|
|
220
|
+
"ml_schema": self.ml_schema,
|
|
221
|
+
"schemas": {},
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
# Include all domain schemas and the ML schema
|
|
225
|
+
for schema_name in [*self.domain_schemas, self.ml_schema]:
|
|
226
|
+
schema = self.model.schemas.get(schema_name)
|
|
227
|
+
if not schema:
|
|
228
|
+
continue
|
|
229
|
+
|
|
230
|
+
schema_info = {"tables": {}}
|
|
231
|
+
|
|
232
|
+
for table_name, table in schema.tables.items():
|
|
233
|
+
# Get columns
|
|
234
|
+
columns = []
|
|
235
|
+
for col in table.columns:
|
|
236
|
+
if not include_system_columns and col.name in system_columns:
|
|
237
|
+
continue
|
|
238
|
+
columns.append({
|
|
239
|
+
"name": col.name,
|
|
240
|
+
"type": str(col.type.typename),
|
|
241
|
+
"nullok": col.nullok,
|
|
242
|
+
"comment": col.comment or "",
|
|
243
|
+
})
|
|
244
|
+
|
|
245
|
+
# Get foreign keys
|
|
246
|
+
foreign_keys = []
|
|
247
|
+
for fk in table.foreign_keys:
|
|
248
|
+
fk_cols = [c.name for c in fk.foreign_key_columns]
|
|
249
|
+
ref_cols = [c.name for c in fk.referenced_columns]
|
|
250
|
+
foreign_keys.append({
|
|
251
|
+
"columns": fk_cols,
|
|
252
|
+
"referenced_table": f"{fk.pk_table.schema.name}.{fk.pk_table.name}",
|
|
253
|
+
"referenced_columns": ref_cols,
|
|
254
|
+
})
|
|
255
|
+
|
|
256
|
+
# Get features if this is a domain table
|
|
257
|
+
features = []
|
|
258
|
+
if self.is_domain_schema(schema_name):
|
|
259
|
+
try:
|
|
260
|
+
for f in self.find_features(table):
|
|
261
|
+
features.append({
|
|
262
|
+
"name": f.feature_name,
|
|
263
|
+
"feature_table": f.feature_table.name,
|
|
264
|
+
})
|
|
265
|
+
except Exception:
|
|
266
|
+
pass # Table may not support features
|
|
267
|
+
|
|
268
|
+
table_info = {
|
|
269
|
+
"comment": table.comment or "",
|
|
270
|
+
"is_vocabulary": self.is_vocabulary(table),
|
|
271
|
+
"is_asset": self.is_asset(table),
|
|
272
|
+
"is_association": bool(self.is_association(table)),
|
|
273
|
+
"columns": columns,
|
|
274
|
+
"foreign_keys": foreign_keys,
|
|
275
|
+
}
|
|
276
|
+
if features:
|
|
277
|
+
table_info["features"] = features
|
|
278
|
+
|
|
279
|
+
schema_info["tables"][table_name] = table_info
|
|
280
|
+
|
|
281
|
+
result["schemas"][schema_name] = schema_info
|
|
282
|
+
|
|
283
|
+
return result
|
|
284
|
+
|
|
116
285
|
def __getattr__(self, name: str) -> Any:
|
|
117
286
|
# Called only if `name` is not found on DerivaModel itself. Delegate the attribute lookup to the wrapped model.
|
|
118
287
|
return getattr(self.model, name)
|
|
@@ -120,20 +289,28 @@ class DerivaModel:
|
|
|
120
289
|
def name_to_table(self, table: TableInput) -> Table:
|
|
121
290
|
"""Return the table object corresponding to the given table name.
|
|
122
291
|
|
|
123
|
-
|
|
292
|
+
Searches domain schemas first (in sorted order), then ML schema, then WWW.
|
|
293
|
+
If the table name appears in more than one schema, returns the first match.
|
|
124
294
|
|
|
125
295
|
Args:
|
|
126
296
|
table: An ERMRest table object or a string that is the name of the table.
|
|
127
297
|
|
|
128
298
|
Returns:
|
|
129
299
|
Table object.
|
|
300
|
+
|
|
301
|
+
Raises:
|
|
302
|
+
DerivaMLException: If the table doesn't exist in any searchable schema.
|
|
130
303
|
"""
|
|
131
304
|
if isinstance(table, Table):
|
|
132
305
|
return table
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
306
|
+
|
|
307
|
+
# Search domain schemas (sorted for deterministic order), then ML schema, then WWW
|
|
308
|
+
search_order = [*sorted(self.domain_schemas), self.ml_schema, "WWW"]
|
|
309
|
+
for sname in search_order:
|
|
310
|
+
if sname not in self.model.schemas:
|
|
311
|
+
continue
|
|
312
|
+
s = self.model.schemas[sname]
|
|
313
|
+
if table in s.tables:
|
|
137
314
|
return s.tables[table]
|
|
138
315
|
raise DerivaMLException(f"The table {table} doesn't exist.")
|
|
139
316
|
|
|
@@ -220,21 +397,28 @@ class DerivaModel:
|
|
|
220
397
|
return [t for s in self.model.schemas.values() for t in s.tables.values() if self.is_asset(t)]
|
|
221
398
|
|
|
222
399
|
def find_vocabularies(self) -> list[Table]:
|
|
223
|
-
"""Return a list of all
|
|
224
|
-
|
|
400
|
+
"""Return a list of all controlled vocabulary tables in domain and ML schemas."""
|
|
401
|
+
tables = []
|
|
402
|
+
for schema_name in [*self.domain_schemas, self.ml_schema]:
|
|
403
|
+
schema = self.model.schemas.get(schema_name)
|
|
404
|
+
if schema:
|
|
405
|
+
tables.extend(t for t in schema.tables.values() if self.is_vocabulary(t))
|
|
406
|
+
return tables
|
|
225
407
|
|
|
226
408
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
227
|
-
def find_features(self, table: TableInput) -> Iterable[Feature]:
|
|
228
|
-
"""List
|
|
409
|
+
def find_features(self, table: TableInput | None = None) -> Iterable[Feature]:
|
|
410
|
+
"""List features in the catalog.
|
|
411
|
+
|
|
412
|
+
If a table is specified, returns only features for that table.
|
|
413
|
+
If no table is specified, returns all features across all tables in the catalog.
|
|
229
414
|
|
|
230
415
|
Args:
|
|
231
|
-
table:
|
|
232
|
-
|
|
416
|
+
table: Optional table to find features for. If None, returns all features
|
|
417
|
+
in the catalog.
|
|
233
418
|
|
|
234
419
|
Returns:
|
|
235
|
-
An iterable of
|
|
420
|
+
An iterable of Feature instances describing the features.
|
|
236
421
|
"""
|
|
237
|
-
table = self.name_to_table(table)
|
|
238
422
|
|
|
239
423
|
def is_feature(a: FindAssociationResult) -> bool:
|
|
240
424
|
"""Check if association represents a feature.
|
|
@@ -250,9 +434,24 @@ class DerivaModel:
|
|
|
250
434
|
a.self_fkey.foreign_key_columns[0].name,
|
|
251
435
|
}.issubset({c.name for c in a.table.columns})
|
|
252
436
|
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
437
|
+
def find_table_features(t: Table) -> list[Feature]:
|
|
438
|
+
"""Find all features for a single table."""
|
|
439
|
+
return [
|
|
440
|
+
Feature(a, self) for a in t.find_associations(min_arity=3, max_arity=3, pure=False) if is_feature(a)
|
|
441
|
+
]
|
|
442
|
+
|
|
443
|
+
if table is not None:
|
|
444
|
+
# Find features for a specific table
|
|
445
|
+
return find_table_features(self.name_to_table(table))
|
|
446
|
+
else:
|
|
447
|
+
# Find all features across all domain and ML schema tables
|
|
448
|
+
features: list[Feature] = []
|
|
449
|
+
for schema_name in [*self.domain_schemas, self.ml_schema]:
|
|
450
|
+
schema = self.model.schemas.get(schema_name)
|
|
451
|
+
if schema:
|
|
452
|
+
for t in schema.tables.values():
|
|
453
|
+
features.extend(find_table_features(t))
|
|
454
|
+
return features
|
|
256
455
|
|
|
257
456
|
def lookup_feature(self, table: TableInput, feature_name: str) -> Feature:
|
|
258
457
|
"""Lookup the named feature associated with the provided table.
|
|
@@ -290,6 +489,20 @@ class DerivaModel:
|
|
|
290
489
|
else:
|
|
291
490
|
self.model.apply()
|
|
292
491
|
|
|
492
|
+
def is_dataset_rid(self, rid: RID, deleted: bool = False) -> bool:
|
|
493
|
+
"""Check if a given RID is a dataset RID."""
|
|
494
|
+
try:
|
|
495
|
+
rid_info = self.model.catalog.resolve_rid(rid, self.model)
|
|
496
|
+
except KeyError as _e:
|
|
497
|
+
raise DerivaMLException(f"Invalid RID {rid}")
|
|
498
|
+
if rid_info.table.name != "Dataset":
|
|
499
|
+
return False
|
|
500
|
+
elif deleted:
|
|
501
|
+
# Got a dataset RID. Now check to see if it's deleted or not.
|
|
502
|
+
return True
|
|
503
|
+
else:
|
|
504
|
+
return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]
|
|
505
|
+
|
|
293
506
|
def list_dataset_element_types(self) -> list[Table]:
|
|
294
507
|
"""
|
|
295
508
|
Lists the data types of elements contained within a dataset.
|
|
@@ -307,15 +520,14 @@ class DerivaModel:
|
|
|
307
520
|
|
|
308
521
|
dataset_table = self.name_to_table("Dataset")
|
|
309
522
|
|
|
310
|
-
def
|
|
311
|
-
return table.schema.name
|
|
523
|
+
def is_domain_or_dataset_table(table: Table) -> bool:
|
|
524
|
+
return self.is_domain_schema(table.schema.name) or table.name == dataset_table.name
|
|
312
525
|
|
|
313
|
-
return [t for a in dataset_table.find_associations() if
|
|
526
|
+
return [t for a in dataset_table.find_associations() if is_domain_or_dataset_table(t := a.other_fkeys.pop().pk_table)]
|
|
314
527
|
|
|
315
|
-
def _prepare_wide_table(
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
include_tables: list[str]) -> tuple[dict[str, Any], list[tuple]]:
|
|
528
|
+
def _prepare_wide_table(
|
|
529
|
+
self, dataset, dataset_rid: RID, include_tables: list[str]
|
|
530
|
+
) -> tuple[dict[str, Any], list[tuple]]:
|
|
319
531
|
"""
|
|
320
532
|
Generates details of a wide table from the model
|
|
321
533
|
|
|
@@ -344,11 +556,6 @@ class DerivaModel:
|
|
|
344
556
|
for p in table_paths:
|
|
345
557
|
paths_by_element[p[2].name].append(p)
|
|
346
558
|
|
|
347
|
-
# Get the names of all of the tables that can be dataset elements.
|
|
348
|
-
dataset_element_tables = {
|
|
349
|
-
e.name for e in self.list_dataset_element_types() if e.schema.name == self.domain_schema
|
|
350
|
-
}
|
|
351
|
-
|
|
352
559
|
skip_columns = {"RCT", "RMT", "RCB", "RMB"}
|
|
353
560
|
element_tables = {}
|
|
354
561
|
for element_table, paths in paths_by_element.items():
|
|
@@ -446,9 +653,11 @@ class DerivaModel:
|
|
|
446
653
|
|
|
447
654
|
def find_arcs(table: Table) -> set[Table]:
|
|
448
655
|
"""Given a table, return the set of tables linked to it by foreign keys (in either direction)."""
|
|
656
|
+
# Valid schemas for traversal: all domain schemas + ML schema
|
|
657
|
+
valid_schemas = self.domain_schemas | {self.ml_schema}
|
|
449
658
|
arc_list = [fk.pk_table for fk in table.foreign_keys] + [fk.table for fk in table.referenced_by]
|
|
450
|
-
arc_list = [t for t in arc_list if t.schema.name in
|
|
451
|
-
domain_tables = [t for t in arc_list if t.schema.name
|
|
659
|
+
arc_list = [t for t in arc_list if t.schema.name in valid_schemas]
|
|
660
|
+
domain_tables = [t for t in arc_list if self.is_domain_schema(t.schema.name)]
|
|
452
661
|
if multiple_columns := [c for c, cnt in Counter(domain_tables).items() if cnt > 1]:
|
|
453
662
|
raise DerivaMLException(f"Ambiguous relationship in {table.name} {multiple_columns}")
|
|
454
663
|
return set(arc_list)
|
|
@@ -466,7 +675,8 @@ class DerivaModel:
|
|
|
466
675
|
return paths
|
|
467
676
|
|
|
468
677
|
for child in find_arcs(root):
|
|
469
|
-
if child.name in {"Dataset_Execution", "Dataset_Dataset", "Execution"}:
|
|
678
|
+
# if child.name in {"Dataset_Execution", "Dataset_Dataset", "Execution"}:
|
|
679
|
+
if child.name in {"Dataset_Dataset", "Execution"}:
|
|
470
680
|
continue
|
|
471
681
|
if child == parent:
|
|
472
682
|
# Don't loop back via referenced_by
|
|
@@ -479,7 +689,23 @@ class DerivaModel:
|
|
|
479
689
|
paths.extend(self._schema_to_paths(child, path))
|
|
480
690
|
return paths
|
|
481
691
|
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
692
|
+
def create_table(self, table_def: TableDefinition, schema: str | None = None) -> Table:
|
|
693
|
+
"""Create a new table from TableDefinition.
|
|
694
|
+
|
|
695
|
+
Args:
|
|
696
|
+
table_def: Table definition (dataclass or dict).
|
|
697
|
+
schema: Schema to create the table in. If None, uses default_schema.
|
|
698
|
+
|
|
699
|
+
Returns:
|
|
700
|
+
The newly created Table.
|
|
701
|
+
|
|
702
|
+
Raises:
|
|
703
|
+
DerivaMLException: If no schema specified and default_schema is not set.
|
|
704
|
+
|
|
705
|
+
Note: @validate_call removed because TableDefinition is now a dataclass from
|
|
706
|
+
deriva.core.typed and Pydantic validation doesn't work well with dataclass fields.
|
|
707
|
+
"""
|
|
708
|
+
schema = schema or self._require_default_schema()
|
|
709
|
+
# Handle both TableDefinition (dataclass with to_dict) and plain dicts
|
|
710
|
+
table_dict = table_def.to_dict() if hasattr(table_def, 'to_dict') else table_def
|
|
711
|
+
return self.model.schemas[schema].create_table(table_dict)
|