deriva-ml 1.17.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/.DS_Store +0 -0
- deriva_ml/__init__.py +79 -0
- deriva_ml/bump_version.py +142 -0
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1527 -0
- deriva_ml/core/config.py +69 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +12 -0
- deriva_ml/dataset/aux_classes.py +225 -0
- deriva_ml/dataset/dataset.py +1519 -0
- deriva_ml/dataset/dataset_bag.py +450 -0
- deriva_ml/dataset/history.py +109 -0
- deriva_ml/dataset/upload.py +439 -0
- deriva_ml/demo_catalog.py +495 -0
- deriva_ml/execution/__init__.py +26 -0
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/execution/execution.py +1180 -0
- deriva_ml/execution/execution_configuration.py +147 -0
- deriva_ml/execution/workflow.py +413 -0
- deriva_ml/feature.py +228 -0
- deriva_ml/install_kernel.py +71 -0
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/model/catalog.py +485 -0
- deriva_ml/model/database.py +719 -0
- deriva_ml/protocols/dataset.py +19 -0
- deriva_ml/run_notebook.py +228 -0
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/schema/annotations.py +473 -0
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/schema/create_schema.py +393 -0
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/policy.json +81 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- deriva_ml/test.py +94 -0
- deriva_ml-1.17.10.dist-info/METADATA +38 -0
- deriva_ml-1.17.10.dist-info/RECORD +45 -0
- deriva_ml-1.17.10.dist-info/WHEEL +5 -0
- deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
- deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
- deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
deriva_ml/core/config.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import getpass
|
|
2
|
+
import logging
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from hydra.conf import HydraConf, RunDir
|
|
7
|
+
from hydra.core.hydra_config import HydraConfig
|
|
8
|
+
from hydra_zen import store
|
|
9
|
+
from omegaconf import OmegaConf
|
|
10
|
+
from pydantic import BaseModel, model_validator
|
|
11
|
+
|
|
12
|
+
from deriva_ml.core.definitions import ML_SCHEMA
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DerivaMLConfig(BaseModel):
    """Configuration model for DerivaML.

    Attributes:
        hostname (str): Host name of the Deriva server. Required.
        catalog_id (str | int): Catalog identifier. Defaults to 1.
        domain_schema (str | None): Name of the domain schema, if any.
        project_name (str | None): Project name, if any.
        cache_dir (str | Path | None): Cache directory, if any.
        working_dir (str | Path | None): Requested working directory. After
            validation this holds the absolute path returned by
            ``compute_workdir``.
        hydra_runtime_output_dir (str | Path | None): Hydra's runtime output
            directory. Filled in during validation when Hydra is initialized.
        ml_schema (str): Name of the ML schema. Defaults to ``ML_SCHEMA``.
        logging_level (Any): Defaults to ``logging.WARNING``.
        deriva_logging_level (Any): Defaults to ``logging.WARNING``.
        credential (Any): Defaults to None.
        use_minid (bool): Defaults to True.
        check_auth (bool): Defaults to True.
    """

    hostname: str
    catalog_id: str | int = 1
    domain_schema: str | None = None
    project_name: str | None = None
    cache_dir: str | Path | None = None
    working_dir: str | Path | None = None
    hydra_runtime_output_dir: str | Path | None = None
    ml_schema: str = ML_SCHEMA
    logging_level: Any = logging.WARNING
    deriva_logging_level: Any = logging.WARNING
    credential: Any = None
    use_minid: bool = True
    check_auth: bool = True

    @model_validator(mode="after")
    def init_working_dir(self):
        """
        Sets up the working directory for the model.

        This method configures the working directory, ensuring that all required
        file operations are performed in the appropriate location. If the user does not
        specify a directory, a default directory based on the user's home directory
        is used.

        This is a repeat of what is in the DerivaML.__init__, but we put this here so that the
        working directory is available to hydra.

        Returns:
            Self: The object instance with the working directory initialized.
        """
        self.working_dir = DerivaMLConfig.compute_workdir(self.working_dir)
        # HydraConfig.get() raises ValueError when no Hydra application is running.
        # Only consult it when Hydra is initialized, so the model can also be
        # constructed outside of a Hydra run; otherwise keep the supplied value.
        if HydraConfig.initialized():
            self.hydra_runtime_output_dir = Path(HydraConfig.get().runtime.output_dir)
        return self

    @staticmethod
    def compute_workdir(working_dir: str | Path | None) -> Path:
        """Return the absolute DerivaML working directory for ``working_dir``.

        Args:
            working_dir: Base directory chosen by the caller, or a falsy value
                (None or an empty string) to fall back to the user's home directory.

        Returns:
            ``<working_dir>/<username>/deriva-ml`` when a directory is provided,
            otherwise ``<home>/deriva-ml``, as an absolute path.
        """
        # Create a default working directory if none is provided. If a working directory is provided, we add the
        # user name to it to ensure that multiple users do not overwrite each other's work.
        base_dir = Path(working_dir) / getpass.getuser() if working_dir else Path.home()
        return (base_dir / "deriva-ml").absolute()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Make compute_workdir available to OmegaConf interpolations as ${compute_workdir:...}.
OmegaConf.register_new_resolver(
    "compute_workdir",
    DerivaMLConfig.compute_workdir,
    replace=True,
)

# Hydra run directory: <computed working dir>/hydra/<timestamp>.
_hydra_run_dir = RunDir("${compute_workdir:${deriva_ml.working_dir}}/hydra/${now:%Y-%m-%d_%H-%M-%S}")
_hydra_conf = HydraConf(run=_hydra_run_dir, output_subdir="hydra-config")

# Register the Hydra configuration under group "hydra" with name "config".
store(_hydra_conf, group="hydra", name="config")

store.add_to_hydra_store()
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Constants used throughout the DerivaML package.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import NewType, TypeAlias
|
|
8
|
+
|
|
9
|
+
from pydantic import constr
|
|
10
|
+
|
|
11
|
+
# Schema name
ML_SCHEMA = "deriva-ml"

# Special RID for dry runs
DRY_RUN_RID = "0000"

# Regular expression parts for RIDs. The same token pattern is used for both the
# RID itself and the optional "@snapshot" suffix.
_rid_token = r"(?:[A-Z\d]{1,4}|[A-Z\d]{1,4}(?:-[A-Z\d]{4})+)"
rid_part = rf"(?P<rid>{_rid_token})"
snapshot_part = rf"(?:@(?P<snapshot>{_rid_token}))?"
rid_regex = f"^{rid_part}{snapshot_part}$"

# RID type definitions: constrained strings that must match rid_regex
BaseRIDString = constr(pattern=rid_regex)
RIDType: TypeAlias = constr(pattern=rid_regex)
RID = NewType("RID", BaseRIDString)

# System columns in Deriva
DerivaSystemColumns = ["RID", "RCT", "RMT", "RCB", "RMB"]

# Asset columns: the asset-specific column names plus the system columns
DerivaAssetColumns = {
    "Filename",
    "URL",
    "Length",
    "MD5",
    "Description",
    *DerivaSystemColumns,
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared definitions that are used in different DerivaML modules.
|
|
3
|
+
This module re-exports all symbols from the core submodules for backwards compatibility.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
# Re-export constants
|
|
9
|
+
from deriva_ml.core.constants import (
|
|
10
|
+
DRY_RUN_RID,
|
|
11
|
+
ML_SCHEMA,
|
|
12
|
+
RID,
|
|
13
|
+
DerivaAssetColumns,
|
|
14
|
+
DerivaSystemColumns,
|
|
15
|
+
rid_part,
|
|
16
|
+
rid_regex,
|
|
17
|
+
snapshot_part,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Re-export enums
|
|
21
|
+
from deriva_ml.core.enums import (
|
|
22
|
+
BaseStrEnum,
|
|
23
|
+
BuiltinTypes,
|
|
24
|
+
ExecAssetType,
|
|
25
|
+
ExecMetadataType,
|
|
26
|
+
MLAsset,
|
|
27
|
+
MLTable,
|
|
28
|
+
MLVocab,
|
|
29
|
+
Status,
|
|
30
|
+
UploadState,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Re-export models
|
|
34
|
+
from deriva_ml.core.ermrest import (
|
|
35
|
+
ColumnDefinition,
|
|
36
|
+
FileUploadState,
|
|
37
|
+
ForeignKeyDefinition,
|
|
38
|
+
KeyDefinition,
|
|
39
|
+
TableDefinition,
|
|
40
|
+
VocabularyTerm,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
# Re-export file specification model
|
|
44
|
+
from deriva_ml.core.filespec import FileSpec
|
|
45
|
+
|
|
46
|
+
# Names re-exported by this module, grouped by the submodule they come from.
__all__ = [
    # Constants
    "ML_SCHEMA", "DRY_RUN_RID", "rid_part", "snapshot_part", "rid_regex",
    "DerivaSystemColumns", "DerivaAssetColumns", "RID",
    # Enums
    "BaseStrEnum", "UploadState", "Status", "BuiltinTypes", "MLVocab",
    "MLTable", "MLAsset", "ExecMetadataType", "ExecAssetType",
    # Models
    "FileUploadState", "FileSpec", "VocabularyTerm", "ColumnDefinition",
    "KeyDefinition", "ForeignKeyDefinition", "TableDefinition",
]
|
deriva_ml/core/enums.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""Enumeration classes for DerivaML.
|
|
2
|
+
|
|
3
|
+
This module provides enumeration classes used throughout DerivaML for representing states, statuses,
|
|
4
|
+
types, and vocabularies. Each enum class represents a specific set of constants used in the system.
|
|
5
|
+
|
|
6
|
+
Classes:
|
|
7
|
+
BaseStrEnum: Base class for string-based enums.
|
|
8
|
+
UploadState: States for file upload operations.
|
|
9
|
+
Status: Execution status values.
|
|
10
|
+
BuiltinTypes: ERMrest built-in data types.
|
|
11
|
+
MLVocab: Controlled vocabulary types.
|
|
12
|
+
MLAsset: Asset type identifiers.
MLTable: Table name identifiers.
|
|
13
|
+
ExecMetadataType: Execution metadata type identifiers.
|
|
14
|
+
ExecAssetType: Execution asset type identifiers.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from enum import Enum
|
|
18
|
+
|
|
19
|
+
from deriva.core.ermrest_model import builtin_types
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class BaseStrEnum(str, Enum):
    """Common base for enumerations whose members are strings.

    Members of a subclass are instances of both ``str`` and ``Enum``, so they can be
    used wherever a plain string is expected while still being enumerated constants.

    Example:
        >>> class MyEnum(BaseStrEnum):
        ...     VALUE = "value"
        >>> isinstance(MyEnum.VALUE, str)
        True
        >>> isinstance(MyEnum.VALUE, Enum)
        True
    """
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class UploadState(Enum):
    """States of a file upload operation.

    Each member carries an integer value identifying one state of an upload.

    Attributes:
        success (int): Upload completed successfully (0).
        failed (int): Upload failed (1).
        pending (int): Upload is queued (2).
        running (int): Upload is in progress (3).
        paused (int): Upload is temporarily paused (4).
        aborted (int): Upload was aborted (5).
        cancelled (int): Upload was cancelled (6).
        timeout (int): Upload timed out (7).
    """

    # Terminal outcomes
    success = 0
    failed = 1

    # In-flight states
    pending = 2
    running = 3
    paused = 4

    # Interrupted states
    aborted = 5
    cancelled = 6
    timeout = 7
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class Status(BaseStrEnum):
    """Status values for an execution.

    Each member is a string naming one state in an execution's lifecycle.

    Attributes:
        initializing (str): Initial setup is in progress ("Initializing").
        created (str): Execution record has been created ("Created").
        pending (str): Execution is queued ("Pending").
        running (str): Execution is in progress ("Running").
        aborted (str): Execution was manually stopped ("Aborted").
        completed (str): Execution finished successfully ("Completed").
        failed (str): Execution encountered an error ("Failed").
    """

    # States before and during the run
    initializing = "Initializing"
    created = "Created"
    pending = "Pending"
    running = "Running"

    # Final states
    aborted = "Aborted"
    completed = "Completed"
    failed = "Failed"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class BuiltinTypes(Enum):
    """ERMrest built-in data types.

    Maps ERMrest's built-in data types to their type names. These types are used for defining
    column types in tables and for type validation.

    Every member's value is the ``typename`` of the matching entry in
    ``deriva.core.ermrest_model.builtin_types``, so a member can be looked up from a
    type name (``BuiltinTypes("jsonb")``) and mapped back with
    ``builtin_types[member.value]``.

    Attributes:
        text (str): Text/string type.
        int2 (str): 16-bit integer.
        jsonb (str): Binary JSON.
        float8 (str): 64-bit float.
        timestamp (str): Timestamp without timezone.
        int8 (str): 64-bit integer.
        boolean (str): Boolean type.
        json (str): JSON type.
        float4 (str): 32-bit float.
        int4 (str): 32-bit integer.
        timestamptz (str): Timestamp with timezone.
        date (str): Date type.
        ermrest_rid (str): Resource identifier.
        ermrest_rcb (str): Record created by.
        ermrest_rmb (str): Record modified by.
        ermrest_rct (str): Record creation time.
        ermrest_rmt (str): Record modification time.
        markdown (str): Markdown text.
        longtext (str): Long text.
        ermrest_curie (str): Compact URI.
        ermrest_uri (str): URI type.
        color_rgb_hex (str): RGB color in hex.
        serial2 (str): 16-bit auto-incrementing.
        serial4 (str): 32-bit auto-incrementing.
        serial8 (str): 64-bit auto-incrementing.
    """

    text = builtin_types.text.typename
    int2 = builtin_types.int2.typename
    # jsonb must map to its own typename. Mapping it to builtin_types.json would give
    # jsonb and json the same value, turning json into an Enum alias of jsonb and
    # making BuiltinTypes("jsonb") raise ValueError.
    jsonb = builtin_types.jsonb.typename
    float8 = builtin_types.float8.typename
    timestamp = builtin_types.timestamp.typename
    int8 = builtin_types.int8.typename
    boolean = builtin_types.boolean.typename
    json = builtin_types.json.typename
    float4 = builtin_types.float4.typename
    int4 = builtin_types.int4.typename
    timestamptz = builtin_types.timestamptz.typename
    date = builtin_types.date.typename
    ermrest_rid = builtin_types.ermrest_rid.typename
    ermrest_rcb = builtin_types.ermrest_rcb.typename
    ermrest_rmb = builtin_types.ermrest_rmb.typename
    ermrest_rct = builtin_types.ermrest_rct.typename
    ermrest_rmt = builtin_types.ermrest_rmt.typename
    markdown = builtin_types.markdown.typename
    longtext = builtin_types.longtext.typename
    ermrest_curie = builtin_types.ermrest_curie.typename
    ermrest_uri = builtin_types.ermrest_uri.typename
    color_rgb_hex = builtin_types.color_rgb_hex.typename
    serial2 = builtin_types.serial2.typename
    serial4 = builtin_types.serial4.typename
    serial8 = builtin_types.serial8.typename
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class MLVocab(BaseStrEnum):
    """Identifiers of controlled vocabulary types.

    Each member's value is the name of a controlled vocabulary table used in DerivaML
    for classifying entities and attributes.

    Attributes:
        dataset_type (str): Dataset classification vocabulary ("Dataset_Type").
        workflow_type (str): Workflow classification vocabulary ("Workflow_Type").
        asset_type (str): Asset classification vocabulary ("Asset_Type").
        asset_role (str): Asset role classification vocabulary ("Asset_Role").
        feature_name (str): Feature name vocabulary ("Feature_Name").
    """

    # Entity classification vocabularies
    dataset_type = "Dataset_Type"
    workflow_type = "Workflow_Type"

    # Asset vocabularies
    asset_type = "Asset_Type"
    asset_role = "Asset_Role"

    # Feature vocabulary
    feature_name = "Feature_Name"
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class MLAsset(BaseStrEnum):
    """Identifiers of asset types.

    Names the kinds of assets that can be associated with executions.

    Attributes:
        execution_metadata (str): Metadata about an execution ("Execution_Metadata").
        execution_asset (str): Asset produced by an execution ("Execution_Asset").
    """

    execution_metadata = "Execution_Metadata"
    execution_asset = "Execution_Asset"
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
class MLTable(BaseStrEnum):
    """Identifiers of tables.

    Each member's value is a table name string.

    Attributes:
        dataset (str): "Dataset".
        workflow (str): "Workflow".
        file (str): "File".
        asset (str): "Asset".
        execution (str): "Execution".
        dataset_version (str): "Dataset_Version".
        execution_metadata (str): "Execution_Metadata".
        execution_asset (str): "Execution_Asset".
    """

    dataset = "Dataset"
    workflow = "Workflow"
    file = "File"
    asset = "Asset"
    execution = "Execution"
    dataset_version = "Dataset_Version"
    execution_metadata = "Execution_Metadata"
    execution_asset = "Execution_Asset"
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
class ExecMetadataType(BaseStrEnum):
    """Identifiers of execution metadata types.

    Names the kinds of metadata that can be associated with an execution.

    Attributes:
        execution_config (str): Execution configuration data ("Execution_Config").
        runtime_env (str): Runtime environment information ("Runtime_Env").
    """

    execution_config = "Execution_Config"
    runtime_env = "Runtime_Env"
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
class ExecAssetType(BaseStrEnum):
    """Identifiers of execution asset types.

    Names the kinds of assets that can be produced or used during an execution.

    Attributes:
        input_file (str): Input file used by the execution ("Input_File").
        output_file (str): Output file produced by the execution ("Output_File").
        notebook_output (str): Jupyter notebook output from the execution ("Notebook_Output").
        model_file (str): Model file ("Model_File").
    """

    # Files consumed and produced by an execution
    input_file = "Input_File"
    output_file = "Output_File"

    # Other artifacts
    notebook_output = "Notebook_Output"
    model_file = "Model_File"
|
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
"""ERMrest data models for DerivaML.
|
|
2
|
+
|
|
3
|
+
This module provides Pydantic models that represent ERMrest catalog structures. These models are used
|
|
4
|
+
throughout DerivaML for defining and manipulating catalog elements like tables, columns, and keys.
|
|
5
|
+
|
|
6
|
+
Classes:
|
|
7
|
+
FileUploadState: Tracks the state of file uploads.
|
|
8
|
+
VocabularyTerm: Represents terms in controlled vocabularies.
|
|
9
|
+
ColumnDefinition: Defines columns in tables.
|
|
10
|
+
KeyDefinition: Defines primary and unique keys.
|
|
11
|
+
ForeignKeyDefinition: Defines foreign key relationships.
|
|
12
|
+
TableDefinition: Defines complete table structures.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import warnings
|
|
18
|
+
from typing import Any, Iterable
|
|
19
|
+
|
|
20
|
+
import deriva.core.ermrest_model as em
|
|
21
|
+
from deriva.core.ermrest_model import builtin_types
|
|
22
|
+
from pydantic import (
|
|
23
|
+
BaseModel,
|
|
24
|
+
Field,
|
|
25
|
+
computed_field,
|
|
26
|
+
field_validator,
|
|
27
|
+
model_serializer,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
from .constants import RID
|
|
31
|
+
from .enums import BuiltinTypes, UploadState
|
|
32
|
+
|
|
33
|
+
# Pydantic warnings suppression: ignore warnings raised from pydantic modules whose
# message starts with one of these patterns. Filters are installed in the listed order.
for _ignored_message in (
    'Field name "schema"',
    "fields may not start with an underscore",
):
    warnings.filterwarnings(
        "ignore",
        message=_ignored_message,
        category=Warning,
        module="pydantic",
    )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class FileUploadState(BaseModel):
    """Tracks the state and result of a file upload operation.

    Attributes:
        state (UploadState): Current state of the upload (success, failed, etc.).
        status (str): Detailed status message.
        result (Any): Upload result data, if any.
        rid (RID | None): Value of ``result["RID"]``, or None when there is no result.
    """

    state: UploadState
    status: str
    result: Any

    @computed_field
    @property
    def rid(self) -> RID | None:
        """Return the RID stored in the upload result.

        Returns:
            ``self.result["RID"]`` when a result is present, otherwise None.
        """
        # Return None explicitly for any empty result (None, {}, ...) rather than
        # leaking the falsy result object itself, which would violate RID | None.
        if not self.result:
            return None
        return self.result["RID"]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class VocabularyTerm(BaseModel):
    """Represents a term in a controlled vocabulary.

    A vocabulary term is a standardized entry in a controlled vocabulary table. Each term has
    a primary name, optional synonyms, and identifiers for cross-referencing.

    Fields are populated through their capitalized aliases (``Name``, ``Synonyms``, ...);
    any additional keys in the input are ignored.

    Attributes:
        name (str): Primary name of the term.
        synonyms (list[str] | None): Alternative names for the term.
        id (str): CURIE (Compact URI) identifier.
        uri (str): Full URI for the term.
        description (str): Explanation of the term's meaning.
        rid (str): Resource identifier in the catalog.

    Example:
        >>> term = VocabularyTerm(
        ...     Name="epithelial",
        ...     Synonyms=["epithelium"],
        ...     ID="tissue:0001",
        ...     URI="http://example.org/tissue/0001",
        ...     Description="Epithelial tissue type",
        ...     RID="1-abc123"
        ... )
    """

    # Pydantic v2 configuration. This replaces the class-based ``Config``, which is
    # deprecated in v2 and emits a deprecation warning. Unknown input keys are ignored.
    model_config = {"extra": "ignore"}

    name: str = Field(alias="Name")
    synonyms: list[str] | None = Field(alias="Synonyms")
    id: str = Field(alias="ID")
    uri: str = Field(alias="URI")
    description: str = Field(alias="Description")
    rid: str = Field(alias="RID")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class ColumnDefinition(BaseModel):
    """Pydantic description of a column in an ERMrest table.

    Holds a column's name, type, constraints, and metadata. Serializing the model
    produces the result of deriva_py's ``Column.define`` for those values.

    Attributes:
        name (str): Name of the column.
        type (BuiltinTypes): ERMrest data type for the column.
        nullok (bool): Whether NULL values are allowed. Defaults to True.
        default (Any): Default value for the column.
        comment (str | None): Description of the column's purpose.
        acls (dict): Access control lists.
        acl_bindings (dict): Dynamic access control bindings.
        annotations (dict): Additional metadata annotations.

    Example:
        >>> col = ColumnDefinition(
        ...     name="score",
        ...     type=BuiltinTypes.float4,
        ...     nullok=False,
        ...     comment="Confidence score between 0 and 1"
        ... )
    """

    name: str
    type: BuiltinTypes
    nullok: bool = True
    default: Any = None
    comment: str | None = None
    acls: dict = Field(default_factory=dict)
    acl_bindings: dict = Field(default_factory=dict)
    annotations: dict = Field(default_factory=dict)

    @field_validator("type", mode="before")
    @classmethod
    def extract_type_name(cls, value: Any) -> Any:
        """Convert a ``{"typename": ...}`` dict into a BuiltinTypes member.

        Any non-dict value is returned unchanged for normal validation.
        """
        if not isinstance(value, dict):
            return value
        return BuiltinTypes(value["typename"])

    @model_serializer()
    def serialize_column_definition(self):
        """Serialize the column by delegating to ``em.Column.define``."""
        # Map the enum member back to the deriva builtin type object via its type name.
        ermrest_type = builtin_types[self.type.value]
        return em.Column.define(
            self.name,
            ermrest_type,
            nullok=self.nullok,
            default=self.default,
            comment=self.comment,
            acls=self.acls,
            acl_bindings=self.acl_bindings,
            annotations=self.annotations,
        )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class KeyDefinition(BaseModel):
    """Pydantic description of a key constraint in an ERMrest table.

    Used for primary keys and unique constraints. Serializing the model produces the
    result of deriva_py's ``Key.define`` for the stored values.

    Attributes:
        colnames (Iterable[str]): Names of columns that form the key.
        constraint_names (Iterable[str]): Names for the key constraints.
        comment (str | None): Description of the key's purpose.
        annotations (dict): Additional metadata annotations.

    Example:
        >>> key = KeyDefinition(
        ...     colnames=["id", "version"],
        ...     constraint_names=["unique_id_version"],
        ...     comment="Unique identifier with version"
        ... )
    """

    colnames: Iterable[str]
    constraint_names: Iterable[str]
    comment: str | None = None
    annotations: dict = Field(default_factory=dict)

    @model_serializer()
    def serialize_key_definition(self):
        """Serialize the key by delegating to ``em.Key.define``."""
        define_kwargs = {
            "colnames": self.colnames,
            "constraint_names": self.constraint_names,
            "comment": self.comment,
            "annotations": self.annotations,
        }
        return em.Key.define(**define_kwargs)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class ForeignKeyDefinition(BaseModel):
    """Defines a foreign key relationship between tables.

    Provides a Pydantic model for defining foreign key constraints with referential actions
    and metadata. Maps to deriva_py's ForeignKey.define functionality.

    Attributes:
        colnames (Iterable[str]): Names of columns in the referencing table.
        pk_sname (str): Schema name of the referenced table.
        pk_tname (str): Name of the referenced table.
        pk_colnames (Iterable[str]): Names of columns in the referenced table.
        constraint_names (Iterable[str]): Names for the foreign key constraints.
            Defaults to an empty list.
        on_update (str): Action on update of referenced row. Defaults to "NO ACTION".
        on_delete (str): Action on delete of referenced row. Defaults to "NO ACTION".
        comment (str | None): Description of the relationship.
        acls (dict): Access control lists.
        acl_bindings (dict): Dynamic access control bindings.
        annotations (dict): Additional metadata annotations.

    Example:
        >>> fk = ForeignKeyDefinition(
        ...     colnames=["dataset_id"],
        ...     pk_sname="core",
        ...     pk_tname="dataset",
        ...     pk_colnames=["id"],
        ...     on_delete="CASCADE"
        ... )
    """

    colnames: Iterable[str]
    pk_sname: str
    pk_tname: str
    pk_colnames: Iterable[str]
    constraint_names: Iterable[str] = Field(default_factory=list)
    on_update: str = "NO ACTION"
    on_delete: str = "NO ACTION"
    comment: str | None = None
    acls: dict[str, Any] = Field(default_factory=dict)
    acl_bindings: dict[str, Any] = Field(default_factory=dict)
    annotations: dict[str, Any] = Field(default_factory=dict)

    @model_serializer()
    def serialize_fk_definition(self):
        """Serialize the foreign key by delegating to ``em.ForeignKey.define``."""
        return em.ForeignKey.define(
            fk_colnames=self.colnames,
            pk_sname=self.pk_sname,
            pk_tname=self.pk_tname,
            pk_colnames=self.pk_colnames,
            on_update=self.on_update,
            on_delete=self.on_delete,
            # Forward the declared constraint names, which were previously dropped.
            # They are materialized as a list so the default stays the empty list
            # that ForeignKey.define itself defaults to.
            constraint_names=list(self.constraint_names),
            comment=self.comment,
            acls=self.acls,
            acl_bindings=self.acl_bindings,
            annotations=self.annotations,
        )
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class TableDefinition(BaseModel):
    """Pydantic description of a complete ERMrest table.

    Bundles a table's columns, keys, and foreign keys with its metadata. Serializing
    the model produces the result of deriva_py's ``Table.define``, with each nested
    definition serialized through its own ``model_dump()``.

    Attributes:
        name (str): Name of the table.
        column_defs (Iterable[ColumnDefinition]): Column definitions.
        key_defs (Iterable[KeyDefinition]): Key constraint definitions.
        fkey_defs (Iterable[ForeignKeyDefinition]): Foreign key relationship definitions.
        comment (str | None): Description of the table's purpose.
        acls (dict): Access control lists.
        acl_bindings (dict): Dynamic access control bindings.
        annotations (dict): Additional metadata annotations.

    Example:
        >>> table = TableDefinition(
        ...     name="experiment",
        ...     column_defs=[
        ...         ColumnDefinition(name="id", type=BuiltinTypes.text),
        ...         ColumnDefinition(name="date", type=BuiltinTypes.date)
        ...     ],
        ...     comment="Experimental data records"
        ... )
    """

    name: str
    column_defs: Iterable[ColumnDefinition]
    key_defs: Iterable[KeyDefinition] = Field(default_factory=list)
    fkey_defs: Iterable[ForeignKeyDefinition] = Field(default_factory=list)
    comment: str | None = None
    acls: dict = Field(default_factory=dict)
    acl_bindings: dict = Field(default_factory=dict)
    annotations: dict = Field(default_factory=dict)

    @model_serializer()
    def serialize_table_definition(self):
        """Serialize the table by delegating to ``em.Table.define``."""
        # Serialize the nested definitions first, in column / key / foreign-key order.
        serialized_columns = [column.model_dump() for column in self.column_defs]
        serialized_keys = [key.model_dump() for key in self.key_defs]
        serialized_fkeys = [fkey.model_dump() for fkey in self.fkey_defs]
        return em.Table.define(
            tname=self.name,
            column_defs=serialized_columns,
            key_defs=serialized_keys,
            fkey_defs=serialized_fkeys,
            comment=self.comment,
            acls=self.acls,
            acl_bindings=self.acl_bindings,
            annotations=self.annotations,
        )
|