deriva-ml 1.17.9__py3-none-any.whl → 1.17.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +43 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +21 -0
- deriva_ml/catalog/clone.py +1199 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +817 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +186 -105
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +545 -244
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +224 -35
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -5
- deriva_ml-1.17.11.dist-info/RECORD +77 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +2 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.9.dist-info/RECORD +0 -45
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
deriva_ml/interfaces.py
ADDED
@@ -0,0 +1,862 @@
"""Protocol definitions for DerivaML dataset, asset, and catalog operations.

This module defines the protocols (interfaces) used throughout DerivaML for
type checking and polymorphic access to datasets, assets, and catalogs.

Protocol Hierarchies
--------------------

**Dataset Protocols:**
    DatasetLike: Read-only operations for both live datasets and downloaded bags.
    WritableDataset: Write operations only available on live catalog datasets.

**Asset Protocols:**
    AssetLike: Read-only operations for asset access.
    WritableAsset: Write operations for asset modification.

**Catalog Protocols:**
    DerivaMLCatalogReader: Read-only catalog operations (lookups, queries).
    DerivaMLCatalog: Full catalog operations including write operations.

The separation allows code to express its requirements precisely:
- Code that only reads data can accept DatasetLike, AssetLike, or DerivaMLCatalogReader
- Code that modifies data requires WritableDataset, WritableAsset, or DerivaMLCatalog

API Naming Conventions
----------------------

The DerivaML API follows consistent naming conventions:

- ``lookup_*``: Single item retrieval by identifier. Returns one item or raises exception.
  Examples: lookup_dataset(), lookup_asset(), lookup_term()

- ``find_*``: Search/discovery operations. Returns Iterable of matching items.
  Examples: find_datasets(), find_assets(), find_features()

- ``list_*``: List all items of a type, often with context (e.g., members of a dataset).
  Examples: list_assets(), list_vocabulary_terms(), list_dataset_members()

- ``get_*``: Data retrieval with transformation (e.g., to DataFrame).
  Examples: get_table_as_dataframe(), get_metadata()

- ``create_*``: Create new entities in the catalog.
  Examples: create_dataset(), create_execution(), create_feature()

- ``add_*``: Add items to existing entities or create vocabulary terms.
  Examples: add_term(), add_dataset_members(), add_asset_type()

- ``delete_*`` / ``remove_*``: Remove items from entities.
  Examples: delete_dataset_members(), remove_asset_type()

Implementation Notes
--------------------
- Dataset: Live catalog access via deriva-py/datapath (implements both protocols)
- DatasetBag: Downloaded bag access via SQLAlchemy/SQLite (read-only only)
- Asset: Live catalog access for file-based records (implements WritableAsset)
- DerivaML: Full catalog operations (implements DerivaMLCatalog)
- DerivaMLDatabase: Bag-backed catalog (implements DerivaMLCatalogReader only)
"""

from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING, Any, Generator, Iterable, Protocol, Self, runtime_checkable

import pandas as pd

# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
import importlib
_deriva_core = importlib.import_module("deriva.core")
_datapath = importlib.import_module("deriva.core.datapath")
_ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")

ErmrestSnapshot = _deriva_core.ErmrestSnapshot
SchemaWrapper = _datapath._SchemaWrapper
ErmrestCatalog = _ermrest_catalog.ErmrestCatalog
ResolveRidResult = _ermrest_catalog.ResolveRidResult
Table = _ermrest_model.Table

from deriva_ml.core.definitions import RID, VocabularyTerm
from deriva_ml.core.mixins.rid_resolution import BatchRidResult
from deriva_ml.feature import Feature, FeatureRecord
from deriva_ml.model.catalog import DerivaModel

if TYPE_CHECKING:
    from deriva_ml.core.enums import Status
    from deriva_ml.dataset.aux_classes import DatasetHistory, DatasetSpec, DatasetVersion
    from deriva_ml.dataset.dataset import Dataset
    from deriva_ml.execution.execution_record import ExecutionRecord
    from deriva_ml.execution.workflow import Workflow


@runtime_checkable
class DatasetLike(Protocol):
    """Protocol defining read-only interface for dataset access.

    This protocol is implemented by both Dataset (live catalog) and DatasetBag
    (downloaded bag). It defines the common read interface for accessing dataset
    metadata, members, and relationships.

    The protocol defines the minimal interface that both implementations support.
    Dataset extends this with optional `version` parameters on some methods to
    support querying historical versions. DatasetBag doesn't need version parameters
    since bags are immutable snapshots of a specific version.

    Note on `_visited` parameters: Both implementations use `_visited` internally
    for recursion guards, but this is not part of the protocol as it's an
    implementation detail.

    Attributes:
        dataset_rid: Resource Identifier for the dataset.
        execution_rid: Optional execution RID associated with the dataset.
        description: Description of the dataset.
        dataset_types: Type(s) of the dataset from Dataset_Type vocabulary.
        current_version: Current semantic version of the dataset.
    """

    dataset_rid: RID
    execution_rid: RID | None
    description: str
    dataset_types: list[str]

    @property
    def current_version(self) -> DatasetVersion:
        """Get the current version of the dataset."""
        ...

    def dataset_history(self) -> list[DatasetHistory]:
        """Get the version history of the dataset."""
        ...

    def list_dataset_children(
        self,
        recurse: bool = False,
        _visited: set[RID] | None = None,
        version: Any = None,
        **kwargs: Any,
    ) -> list[Self]:
        """Get nested child datasets.

        Args:
            recurse: Whether to recursively include children of children.
            _visited: Internal parameter to track visited datasets and prevent infinite recursion.
            version: Dataset version to list children from (Dataset only, ignored by DatasetBag).
            **kwargs: Additional implementation-specific arguments.

        Returns:
            List of child datasets (Dataset or DatasetBag depending on implementation).

        Note:
            Both Dataset and DatasetBag have `recurse` as the first parameter.
            Dataset uses the `version` parameter to query historical versions.
        """
        ...

    def list_dataset_parents(
        self,
        recurse: bool = False,
        _visited: set[RID] | None = None,
        version: Any = None,
        **kwargs: Any,
    ) -> list[Self]:
        """Get parent datasets that contain this dataset.

        Args:
            recurse: Whether to recursively include parents of parents.
            _visited: Internal parameter to track visited datasets and prevent infinite recursion.
            version: Dataset version to list parents from (Dataset only, ignored by DatasetBag).
            **kwargs: Additional implementation-specific arguments.

        Returns:
            List of parent datasets (Dataset or DatasetBag depending on implementation).

        Note:
            Both Dataset and DatasetBag have `recurse` as the first parameter.
            Dataset uses the `version` parameter to query historical versions.
        """
        ...

    def list_dataset_members(
        self,
        recurse: bool = False,
        limit: int | None = None,
        _visited: set[RID] | None = None,
        version: Any = None,
        **kwargs: Any,
    ) -> dict[str, list[dict[str, Any]]]:
        """List members of the dataset.

        Args:
            recurse: Whether to include members of nested datasets.
            limit: Maximum number of members per type. None for no limit.
            _visited: Internal parameter to track visited datasets and prevent infinite recursion.
            version: Dataset version to list members from (Dataset only, ignored by DatasetBag).
            **kwargs: Additional implementation-specific arguments.

        Returns:
            Dictionary mapping member types to lists of member records.

        Note:
            Both Dataset and DatasetBag have `recurse` as the first parameter.
            Dataset uses the `version` parameter to query historical versions.
        """
        ...

    def list_dataset_element_types(self) -> Iterable[Table]:
        """List the types of elements that can be contained in this dataset.

        Returns:
            Iterable of Table objects representing element types.
        """
        ...

    def find_features(self, table: str | Table) -> Iterable[Feature]:
        """Find features associated with a table.

        Args:
            table: Table to find features for.

        Returns:
            Iterable of Feature objects.
        """
        ...

    def denormalize_as_dataframe(
        self,
        include_tables: list[str],
        version: Any = None,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """Denormalize the dataset into a single wide table (DataFrame).

        Denormalization transforms normalized relational data into a single "wide table"
        (also called a "flat table" or "denormalized table") by joining related tables
        together. This produces a DataFrame where each row contains all related information
        from multiple source tables, with columns from each table combined side-by-side.

        Wide tables are the standard input format for most machine learning frameworks,
        which expect all features for a single observation to be in one row. This method
        bridges the gap between normalized database schemas and ML-ready tabular data.

        **How it works:**

        Tables are joined based on their foreign key relationships. For example, if
        Image has a foreign key to Subject, and Diagnosis has a foreign key to Image,
        then denormalizing ["Subject", "Image", "Diagnosis"] produces rows where each
        image appears with its subject's metadata and any associated diagnoses.

        The result uses outer join semantics - if a table has no FK relationship to
        others, its rows are included with NULL values for unrelated columns.

        **Column naming:**

        To avoid column name collisions (e.g., multiple tables having "RID" or "Name"),
        all column names are prefixed with their source table name.

        Args:
            include_tables: List of table names to include in the output.
                Tables are joined based on their foreign key relationships.
                Order doesn't matter - the join order is determined automatically.
            version: Dataset version to query (Dataset only, ignored by DatasetBag).
            **kwargs: Additional implementation-specific arguments.

        Returns:
            pd.DataFrame: Wide table with columns from all included tables.
                Column names are prefixed with the source table name.

        Note:
            Column naming conventions differ between implementations:

            - Dataset (catalog): Uses underscore separator (e.g., "Image_Filename")
            - DatasetBag (bag): Uses dot separator (e.g., "Image.Filename")

        Example:
            Suppose you have a dataset with Images linked to Subjects, and each
            Image has a Diagnosis label::

                # Normalized schema:
                # Subject: RID, Name, Age
                # Image: RID, Filename, Subject (FK)
                # Diagnosis: RID, Image (FK), Label

                # Denormalize into a wide table for ML
                df = dataset.denormalize_as_dataframe(["Subject", "Image", "Diagnosis"])

                # Result has columns like:
                # Subject_RID, Subject_Name, Subject_Age,
                # Image_RID, Image_Filename, Image_Subject,
                # Diagnosis_RID, Diagnosis_Image, Diagnosis_Label

                # Each row represents one Image with its Subject info and Diagnosis
                # Ready for use with sklearn, pandas, or other ML tools

        See Also:
            denormalize_as_dict: Generator version for memory-efficient processing.
        """
        ...

    def denormalize_as_dict(
        self,
        include_tables: list[str],
        version: Any = None,
        **kwargs: Any,
    ) -> Generator[dict[str, Any], None, None]:
        """Denormalize the dataset and yield rows as dictionaries.

        This is a memory-efficient alternative to denormalize_as_dataframe() that
        yields one row at a time as a dictionary instead of loading all data into
        a DataFrame. Use this when processing large datasets that may not fit in
        memory, or when you want to process rows incrementally.

        Like denormalize_as_dataframe(), this produces a "wide table" representation
        where each yielded dictionary contains all columns from the joined tables.
        See denormalize_as_dataframe() for detailed explanation of how denormalization
        works.

        Args:
            include_tables: List of table names to include in the output.
                Tables are joined based on their foreign key relationships.
            version: Dataset version to query (Dataset only, ignored by DatasetBag).
            **kwargs: Additional implementation-specific arguments.

        Yields:
            dict[str, Any]: Dictionary representing one row of the wide table.
                Keys are column names prefixed by table name.

        Note:
            Column naming conventions differ between implementations:

            - Dataset (catalog): Uses underscore separator (e.g., "Image_Filename")
            - DatasetBag (bag): Uses dot separator (e.g., "Image.Filename")

        Example:
            Process a large dataset without loading everything into memory::

                # Stream through rows one at a time
                for row in dataset.denormalize_as_dict(["Image", "Diagnosis"]):
                    image_path = row["Image_Filename"]
                    label = row["Diagnosis_Label"]
                    # Process each image-label pair...

                # Or convert to list if you need random access
                rows = list(dataset.denormalize_as_dict(["Image", "Diagnosis"]))

        See Also:
            denormalize_as_dataframe: Returns all data as a pandas DataFrame.
        """
        ...


@runtime_checkable
class WritableDataset(DatasetLike, Protocol):
    """Protocol defining write operations for datasets.

    This protocol extends DatasetLike with write operations that are only
    available on live catalog datasets. Downloaded bags (DatasetBag) are
    immutable snapshots and do not implement these methods.

    Use this protocol when you need to express that code requires the ability
    to modify a dataset, not just read from it.

    Example:
        >>> def add_samples(dataset: WritableDataset, sample_rids: list[str]):
        ...     dataset.add_dataset_members(sample_rids)
        ...     dataset.increment_dataset_version(VersionPart.minor)
    """

    def add_dataset_members(self, members: list[RID]) -> None:
        """Add members to the dataset.

        Args:
            members: List of RIDs to add to the dataset.
        """
        ...

    def delete_dataset_members(
        self,
        members: list[RID],
        description: str = "",
        execution_rid: RID | None = None,
    ) -> None:
        """Remove members from the dataset.

        Args:
            members: List of RIDs to remove from the dataset.
            description: Optional description of the removal operation.
            execution_rid: Optional RID of execution associated with this operation.
        """
        ...

    def increment_dataset_version(
        self,
        component: Any,
        description: str | None = "",
        execution_rid: RID | None = None,
    ) -> DatasetVersion:
        """Increment the dataset version.

        Args:
            component: Which version component to increment (major, minor, patch).
            description: Optional description of the changes in this version.
            execution_rid: Optional execution RID to associate with this version.

        Returns:
            The new version after incrementing.
        """
        ...

    def download_dataset_bag(
        self,
        version: DatasetVersion | str | None = None,
        use_minid: bool = False,
    ) -> Any:
        """Download the dataset as a BDBag.

        Args:
            version: Optional version to download. Defaults to current version.
            use_minid: If True, upload the bag to S3 and create a MINID.
                Requires s3_bucket to be configured on the catalog. Defaults to False.

        Returns:
            DatasetBag containing the downloaded data.

        Raises:
            DerivaMLException: If use_minid=True but s3_bucket is not configured.
        """
        ...


@runtime_checkable
class AssetLike(Protocol):
    """Protocol defining read-only interface for asset access.

    This protocol defines the common read interface for accessing asset
    metadata, types, and provenance. It parallels DatasetLike but for
    individual file-based records rather than data collections.

    Attributes:
        asset_rid: Resource Identifier for the asset.
        asset_table: Name of the asset table containing this asset.
        filename: Original filename of the asset.
        url: URL to access the asset file.
        length: Size of the asset file in bytes.
        md5: MD5 checksum of the asset file.
        asset_types: Type(s) of the asset from Asset_Type vocabulary.
        description: Description of the asset.
        execution_rid: Optional execution RID that created the asset.
    """

    asset_rid: RID
    asset_table: str
    filename: str
    url: str
    length: int
    md5: str
    asset_types: list[str]
    description: str
    execution_rid: RID | None

    def list_executions(self, asset_role: str | None = None) -> list[dict[str, Any]]:
        """List all executions associated with this asset.

        Args:
            asset_role: Optional filter for asset role ('Input' or 'Output').

        Returns:
            List of records with Execution RID and Asset_Role.
        """
        ...

    def find_features(self) -> Iterable[Feature]:
        """Find features defined on this asset's table.

        Returns:
            Iterable of Feature objects.
        """
        ...

    def list_feature_values(self, feature_name: str) -> list[FeatureRecord]:
        """Get feature values for this specific asset.

        Args:
            feature_name: Name of the feature to query.

        Returns:
            List of FeatureRecord instances. Each record has:
            - Execution: RID of the execution that created this feature value
            - Feature_Name: Name of the feature
            - All feature-specific columns as typed attributes
            - model_dump() method to convert back to a dictionary
        """
        ...

    def get_metadata(self) -> dict[str, Any]:
        """Get all metadata for this asset from the catalog.

        Returns:
            Dictionary of all columns/values for this asset record.
        """
        ...

    def get_chaise_url(self) -> str:
        """Get the Chaise URL for viewing this asset in the web interface.

        Returns:
            URL to view this asset in Chaise.
        """
        ...


@runtime_checkable
class WritableAsset(AssetLike, Protocol):
    """Protocol defining write operations for assets.

    This protocol extends AssetLike with write operations that are only
    available on live catalog assets. Downloaded assets are immutable
    and do not implement these methods.
    """

    def add_asset_type(self, type_name: str) -> None:
        """Add an asset type to this asset.

        Args:
            type_name: Name of the asset type vocabulary term.
        """
        ...

    def remove_asset_type(self, type_name: str) -> None:
        """Remove an asset type from this asset.

        Args:
            type_name: Name of the asset type vocabulary term.
        """
        ...


@runtime_checkable
class DerivaMLCatalogReader(Protocol):
    """Protocol for read-only catalog operations.

    This protocol defines the minimal interface for reading from a catalog,
    implemented by both DerivaML (live catalog) and DerivaMLDatabase (downloaded bags).

    Use this protocol when code only needs to read data and should work with
    both live catalogs and downloaded bags.

    Attributes:
        ml_schema: Name of the ML schema (typically 'deriva-ml').
        domain_schema: Name of the domain-specific schema.
        model: The catalog model containing schema information.
        cache_dir: Directory for caching downloaded data.
        working_dir: Directory for working files.

    Example:
        >>> def analyze_dataset(catalog: DerivaMLCatalogReader, dataset_rid: str):
        ...     dataset = catalog.lookup_dataset(dataset_rid)
        ...     members = dataset.list_dataset_members()
        ...     return len(members)
    """

    ml_schema: str
    domain_schemas: frozenset[str]
    default_schema: str | None
    model: DerivaModel
    cache_dir: Path
    working_dir: Path

    def lookup_dataset(self, dataset: RID | DatasetSpec, deleted: bool = False) -> DatasetLike:
        """Look up a dataset by RID or specification.

        Args:
            dataset: RID or DatasetSpec identifying the dataset.
            deleted: Whether to include deleted datasets.

        Returns:
            The dataset (Dataset for live catalogs, DatasetBag for bags).
        """
        ...

    def find_datasets(self, deleted: bool = False) -> Iterable[DatasetLike]:
        """Find all datasets in the catalog.

        Args:
            deleted: Whether to include deleted datasets.

        Returns:
            Iterable of all datasets.
        """
        ...

    def lookup_term(self, table: str | Table, term_name: str) -> VocabularyTerm:
        """Look up a vocabulary term.

        Args:
            table: Vocabulary table name or Table object.
            term_name: Name of the term to look up.

        Returns:
            The vocabulary term.
        """
        ...

    def get_table_as_dataframe(self, table: str) -> pd.DataFrame:
        """Get table contents as a pandas DataFrame.

        Args:
            table: Name of the table to retrieve.

        Returns:
            DataFrame containing table contents.
        """
        ...

    def get_table_as_dict(self, table: str) -> Iterable[dict[str, Any]]:
        """Get table contents as dictionaries.

        Args:
            table: Name of the table to retrieve.

        Returns:
            Iterable of dictionaries for each row.
        """
        ...

    def list_dataset_element_types(self) -> Iterable[Table]:
        """List the types of elements that can be contained in datasets.

        Returns:
            Iterable of Table objects representing element types.
        """
        ...

    def find_features(self, table: str | Table) -> Iterable[Feature]:
        """Find features associated with a table.

        Args:
            table: Table to find features for.

        Returns:
            Iterable of Feature objects.
        """
        ...

    def lookup_workflow(self, rid: RID) -> "Workflow":
        """Look up a workflow by its Resource Identifier (RID).

        Retrieves a workflow from the catalog by its RID. The returned Workflow
        is bound to the catalog, allowing its description to be updated (on
        writable catalogs).

        Args:
            rid: Resource Identifier of the workflow to look up.

        Returns:
            Workflow: The workflow object bound to this catalog.

        Raises:
            DerivaMLException: If the RID does not correspond to a workflow.

        Example:
            >>> workflow = catalog.lookup_workflow("2-ABC1")
            >>> print(f"{workflow.name}: {workflow.description}")
        """
        ...

    def find_workflows(self) -> Iterable["Workflow"]:
        """Find all workflows in the catalog.

        Returns all workflow definitions, each bound to the catalog for
        potential modification.

        Returns:
            Iterable of Workflow objects.

        Example:
            >>> for workflow in catalog.find_workflows():
            ...     print(f"{workflow.name}: {workflow.description}")
        """
        ...

    def lookup_workflow_by_url(self, url_or_checksum: str) -> "Workflow":
        """Look up a workflow by URL or checksum.

        Searches for a workflow matching the given GitHub URL or Git object
        hash (checksum) and returns a bound Workflow object.

        Args:
            url_or_checksum: GitHub URL with commit hash, or Git object hash.

        Returns:
            Workflow: The workflow object bound to this catalog.

        Raises:
            DerivaMLException: If no matching workflow is found.

        Example:
            >>> url = "https://github.com/org/repo/blob/abc123/workflow.py"
            >>> workflow = catalog.lookup_workflow_by_url(url)
            >>> print(f"{workflow.name}: {workflow.description}")
        """
        ...

    def lookup_execution(self, execution_rid: RID) -> "ExecutionRecord":
        """Look up an execution by RID.

        Returns an ExecutionRecord for querying and modifying execution metadata.

        Args:
            execution_rid: Resource Identifier of the execution.

        Returns:
            ExecutionRecord: The execution record bound to this catalog.

        Raises:
            DerivaMLException: If the RID doesn't refer to an Execution.

        Example:
            >>> record = catalog.lookup_execution("2-ABC1")
            >>> print(f"{record.status}: {record.description}")
        """
        ...

    def find_executions(
        self,
        workflow: "Workflow | RID | None" = None,
        workflow_type: str | None = None,
        status: "Status | None" = None,
    ) -> Iterable["ExecutionRecord"]:
        """List all executions in the catalog.

        Args:
            workflow: Optional Workflow object or RID to filter by.
            workflow_type: Optional workflow type name to filter by.
            status: Optional status to filter by.

        Returns:
            Iterable of ExecutionRecord objects.

        Example:
            >>> for record in catalog.find_executions():
            ...     print(f"{record.execution_rid}: {record.status}")
            >>> # Filter by workflow type
            >>> for record in catalog.find_executions(workflow_type="python_script"):
            ...     print(f"{record.execution_rid}")
        """
        ...


@runtime_checkable
class DerivaMLCatalog(DerivaMLCatalogReader, Protocol):
    """Protocol for full catalog operations including writes.

    This protocol extends DerivaMLCatalogReader with write operations and
    is implemented by DerivaML (for live catalogs). It provides methods for:
    - Schema and table access
    - Dataset creation and modification
    - Vocabulary term management
    - Catalog snapshots and path building

    Use this protocol when code needs to modify the catalog. For read-only
    operations, prefer DerivaMLCatalogReader.

    Attributes:
        catalog: The underlying ERMrest catalog connection.
        catalog_id: Catalog identifier or name.
        s3_bucket: S3 bucket URL for dataset storage, or None if not configured.
        use_minid: Whether MINID service is enabled for this catalog.

    Example:
        >>> def process_data(catalog: DerivaMLCatalog):
        ...     datasets = list(catalog.find_datasets())
        ...     for ds in datasets:
        ...         print(ds.description)
        ...     return datasets
    """

    catalog: ErmrestCatalog | ErmrestSnapshot
    catalog_id: str | int
    s3_bucket: str | None
    use_minid: bool

    def pathBuilder(self) -> SchemaWrapper:
        """Get a path builder for constructing catalog queries.

        Returns:
            SchemaWrapper for building datapath queries.
        """
        ...

    def catalog_snapshot(self, version_snapshot: str) -> Self:
        """Create a view of the catalog at a specific snapshot time.

        Args:
            version_snapshot: Snapshot timestamp string.

        Returns:
            A new catalog instance bound to the snapshot.
        """
        ...

    def resolve_rid(self, rid: RID) -> ResolveRidResult:
        """Resolve a RID to its catalog location.

        Args:
            rid: Resource Identifier to resolve.

        Returns:
            Information about the RID's location in the catalog.
        """
        ...

    def resolve_rids(
        self,
        rids: set[RID] | list[RID],
        candidate_tables: list[Table] | None = None,
    ) -> dict[RID, BatchRidResult]:
        """Batch resolve multiple RIDs efficiently.

        Resolves multiple RIDs in batched queries, significantly faster than
        calling resolve_rid() for each RID individually.

        Args:
            rids: Set or list of RIDs to resolve.
            candidate_tables: Optional list of Table objects to search in.
                If not provided, searches all tables in domain and ML schemas.

        Returns:
            Mapping from each resolved RID to its BatchRidResult.
        """
        ...

    def lookup_dataset(self, dataset: RID | DatasetSpec, deleted: bool = False) -> "Dataset":
        """Look up a dataset by RID or specification.

        Args:
            dataset: RID or DatasetSpec identifying the dataset.
            deleted: Whether to include deleted datasets.

        Returns:
            The dataset.
        """
        ...

    def find_datasets(self, deleted: bool = False) -> Iterable["Dataset"]:
        """Find all datasets in the catalog.

        Args:
            deleted: Whether to include deleted datasets.

        Returns:
            Iterable of all datasets.
        """
        ...

    @property
    def _dataset_table(self) -> Table:
        """Get the Dataset table from the model.

        Returns:
            The Dataset table object.
        """
        ...
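
Editor's note, illustrative only (not part of the wheel contents): the sketch below shows how downstream code might target the new read-only protocols from deriva_ml/interfaces.py, so the same function works against a live catalog (DerivaML) or a bag-backed DerivaMLDatabase. The function name and the "Image"/"Diagnosis" table names are hypothetical; only the protocol names and methods come from the module above.

# Minimal sketch, assuming the protocols shipped in deriva_ml/interfaces.py.
import pandas as pd

from deriva_ml.interfaces import DatasetLike, DerivaMLCatalogReader


def summarize_dataset(catalog: DerivaMLCatalogReader, dataset_rid: str) -> pd.DataFrame:
    """Read-only analysis that accepts either a live catalog or a downloaded bag."""
    dataset: DatasetLike = catalog.lookup_dataset(dataset_rid)

    # Only DatasetLike methods are used, so no WritableDataset/DerivaMLCatalog needed.
    members = dataset.list_dataset_members(recurse=True)
    print({member_type: len(rows) for member_type, rows in members.items()})

    # Wide-table view for ML tooling; note the column-prefix difference called out
    # in the docstrings ("Image_Filename" on a live catalog vs. "Image.Filename" in a bag).
    return dataset.denormalize_as_dataframe(["Image", "Diagnosis"])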