deriva-ml 1.17.9__py3-none-any.whl → 1.17.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +43 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +21 -0
- deriva_ml/catalog/clone.py +1199 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +817 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +186 -105
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +545 -244
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +224 -35
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -5
- deriva_ml-1.17.11.dist-info/RECORD +77 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +2 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.9.dist-info/RECORD +0 -45
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
deriva_ml/core/config.py
CHANGED
|
@@ -1,9 +1,62 @@
|
|
|
1
|
+
"""Configuration management for DerivaML.
|
|
2
|
+
|
|
3
|
+
This module provides the DerivaMLConfig class for managing DerivaML instance
|
|
4
|
+
configuration. It integrates with hydra-zen for configuration management and supports
|
|
5
|
+
both programmatic and structured configuration.
|
|
6
|
+
|
|
7
|
+
The configuration handles:
|
|
8
|
+
- Server connection settings (hostname, catalog_id, credentials)
|
|
9
|
+
- Schema configuration (domain_schemas, default_schema, ml_schema)
|
|
10
|
+
- Directory paths (working_dir, cache_dir)
|
|
11
|
+
- Logging levels for both DerivaML and underlying Deriva libraries
|
|
12
|
+
- Feature toggles (use_minid, check_auth)
|
|
13
|
+
|
|
14
|
+
Integration with hydra-zen:
|
|
15
|
+
The module registers a custom resolver for computing working directories
|
|
16
|
+
and configures Hydra's output directory structure for reproducible runs.
|
|
17
|
+
Use hydra-zen's `builds()` and `store` to create composable configurations.
|
|
18
|
+
|
|
19
|
+
Example:
|
|
20
|
+
Programmatic configuration:
|
|
21
|
+
>>> config = DerivaMLConfig(
|
|
22
|
+
... hostname='deriva.example.org',
|
|
23
|
+
... catalog_id='my_catalog',
|
|
24
|
+
... working_dir='/path/to/work'
|
|
25
|
+
... )
|
|
26
|
+
>>> ml = DerivaML.instantiate(config)
|
|
27
|
+
|
|
28
|
+
With hydra-zen:
|
|
29
|
+
>>> from hydra_zen import builds, instantiate, store, zen
|
|
30
|
+
>>> from deriva_ml import DerivaML
|
|
31
|
+
>>> from deriva_ml.core.config import DerivaMLConfig
|
|
32
|
+
>>>
|
|
33
|
+
>>> # Create a structured config for DerivaML
|
|
34
|
+
>>> DerivaMLConf = builds(DerivaMLConfig, populate_full_signature=True)
|
|
35
|
+
>>>
|
|
36
|
+
>>> # Store configurations for different environments
|
|
37
|
+
>>> store(DerivaMLConf(
|
|
38
|
+
... hostname='dev.example.org',
|
|
39
|
+
... catalog_id='1',
|
|
40
|
+
... ), name='dev')
|
|
41
|
+
>>>
|
|
42
|
+
>>> store(DerivaMLConf(
|
|
43
|
+
... hostname='prod.example.org',
|
|
44
|
+
... catalog_id='52',
|
|
45
|
+
... ), name='prod')
|
|
46
|
+
>>>
|
|
47
|
+
>>> # Use with Hydra's @hydra.main or zen() wrapper
|
|
48
|
+
>>> @zen(DerivaMLConf)
|
|
49
|
+
... def my_task(cfg: DerivaMLConfig):
|
|
50
|
+
... ml = DerivaML.instantiate(cfg)
|
|
51
|
+
... # ... do work with ml instance
|
|
52
|
+
"""
|
|
53
|
+
|
|
1
54
|
import getpass
|
|
2
55
|
import logging
|
|
3
56
|
from pathlib import Path
|
|
4
57
|
from typing import Any
|
|
5
58
|
|
|
6
|
-
from hydra.conf import HydraConf, RunDir
|
|
59
|
+
from hydra.conf import HydraConf, RunDir, SweepDir
|
|
7
60
|
from hydra.core.hydra_config import HydraConfig
|
|
8
61
|
from hydra_zen import store
|
|
9
62
|
from omegaconf import OmegaConf
|
|
@@ -13,9 +66,52 @@ from deriva_ml.core.definitions import ML_SCHEMA
|
|
|
13
66
|
|
|
14
67
|
|
|
15
68
|
class DerivaMLConfig(BaseModel):
|
|
69
|
+
"""Configuration model for DerivaML instances.
|
|
70
|
+
|
|
71
|
+
This Pydantic model defines all configurable parameters for a DerivaML instance.
|
|
72
|
+
It can be used directly or via Hydra configuration files.
|
|
73
|
+
|
|
74
|
+
Attributes:
|
|
75
|
+
hostname: Hostname of the Deriva server (e.g., 'deriva.example.org').
|
|
76
|
+
catalog_id: Catalog identifier, either numeric ID or catalog name.
|
|
77
|
+
domain_schemas: Optional set of domain schema names. If None, auto-detects all
|
|
78
|
+
non-system schemas. Use this when working with catalogs that have multiple
|
|
79
|
+
user-defined schemas.
|
|
80
|
+
default_schema: The default schema for table creation operations. If None and
|
|
81
|
+
there is exactly one domain schema, that schema is used. If there are multiple
|
|
82
|
+
domain schemas, this must be specified for table creation to work without
|
|
83
|
+
explicit schema parameters.
|
|
84
|
+
project_name: Project name for organizing outputs. Defaults to default_schema.
|
|
85
|
+
cache_dir: Directory for caching downloaded datasets. Defaults to working_dir/cache.
|
|
86
|
+
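working_dir: Base directory for computation data. If given, the effective path is <working_dir>/<username>/deriva-ml/<catalog_id>; if None, defaults to ~/.deriva-ml/<catalog_id>.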
|
|
87
|
+
hydra_runtime_output_dir: Hydra's runtime output directory (set automatically).
|
|
88
|
+
ml_schema: Schema name for ML tables. Defaults to 'deriva-ml'.
|
|
89
|
+
logging_level: Logging level for DerivaML. Defaults to WARNING.
|
|
90
|
+
deriva_logging_level: Logging level for Deriva libraries. Defaults to WARNING.
|
|
91
|
+
credential: Authentication credentials. If None, retrieved automatically.
|
|
92
|
+
s3_bucket: S3 bucket URL for dataset bag storage (e.g., 's3://my-bucket').
|
|
93
|
+
If provided, enables MINID creation and S3 upload for dataset exports.
|
|
94
|
+
If None, MINID functionality is disabled regardless of use_minid setting.
|
|
95
|
+
use_minid: Whether to use MINID service for dataset bags. Only effective when
|
|
96
|
+
s3_bucket is configured. Defaults to True when s3_bucket is set, False otherwise.
|
|
97
|
+
check_auth: Whether to verify authentication on connection. Defaults to True.
|
|
98
|
+
clean_execution_dir: Whether to automatically clean execution working directories
|
|
99
|
+
after successful upload. Defaults to True. Set to False to retain local copies
|
|
100
|
+
of execution outputs for debugging or manual inspection.
|
|
101
|
+
|
|
102
|
+
Example:
|
|
103
|
+
>>> config = DerivaMLConfig(
|
|
104
|
+
... hostname='deriva.example.org',
|
|
105
|
+
... catalog_id=1,
|
|
106
|
+
... default_schema='my_domain',
|
|
107
|
+
... logging_level=logging.INFO
|
|
108
|
+
... )
|
|
109
|
+
"""
|
|
110
|
+
|
|
16
111
|
hostname: str
|
|
17
112
|
catalog_id: str | int = 1
|
|
18
|
-
|
|
113
|
+
domain_schemas: set[str] | None = None
|
|
114
|
+
default_schema: str | None = None
|
|
19
115
|
project_name: str | None = None
|
|
20
116
|
cache_dir: str | Path | None = None
|
|
21
117
|
working_dir: str | Path | None = None
|
|
@@ -24,46 +120,98 @@ class DerivaMLConfig(BaseModel):
|
|
|
24
120
|
logging_level: Any = logging.WARNING
|
|
25
121
|
deriva_logging_level: Any = logging.WARNING
|
|
26
122
|
credential: Any = None
|
|
27
|
-
|
|
123
|
+
s3_bucket: str | None = None
|
|
124
|
+
use_minid: bool | None = None # None means "auto" - True if s3_bucket is set
|
|
28
125
|
check_auth: bool = True
|
|
126
|
+
clean_execution_dir: bool = True
|
|
29
127
|
|
|
30
128
|
@model_validator(mode="after")
|
|
31
|
-
def init_working_dir(self):
|
|
32
|
-
"""
|
|
33
|
-
|
|
129
|
+
def init_working_dir(self) -> "DerivaMLConfig":
|
|
130
|
+
"""Initialize working directory and resolve use_minid after model validation.
|
|
131
|
+
|
|
132
|
+
Sets up the working directory path, computing a default if not specified.
|
|
133
|
+
Also captures Hydra's runtime output directory for logging and outputs.
|
|
34
134
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
or username will be used.
|
|
135
|
+
Resolves the use_minid flag based on s3_bucket configuration:
|
|
136
|
+
- If use_minid is explicitly set, use that value, except that True is reset to False when s3_bucket is not configured
|
|
137
|
+
- If use_minid is None (auto), set it to True if s3_bucket is configured, False otherwise
|
|
39
138
|
|
|
40
|
-
This
|
|
41
|
-
directory is available
|
|
139
|
+
This validator runs after all field validation and ensures the working
|
|
140
|
+
directory is available for Hydra configuration resolution.
|
|
42
141
|
|
|
43
142
|
Returns:
|
|
44
|
-
Self: The
|
|
143
|
+
Self: The configuration instance with initialized paths.
|
|
45
144
|
"""
|
|
46
|
-
|
|
47
|
-
self.working_dir = DerivaMLConfig.compute_workdir(self.working_dir)
|
|
145
|
+
self.working_dir = DerivaMLConfig.compute_workdir(self.working_dir, self.catalog_id)
|
|
48
146
|
self.hydra_runtime_output_dir = Path(HydraConfig.get().runtime.output_dir)
|
|
147
|
+
|
|
148
|
+
# Resolve use_minid based on s3_bucket configuration
|
|
149
|
+
if self.use_minid is None:
|
|
150
|
+
# Auto mode: enable MINID if s3_bucket is configured
|
|
151
|
+
self.use_minid = self.s3_bucket is not None
|
|
152
|
+
elif self.use_minid and self.s3_bucket is None:
|
|
153
|
+
# User requested MINID but no S3 bucket configured - disable MINID
|
|
154
|
+
self.use_minid = False
|
|
155
|
+
|
|
49
156
|
return self
|
|
50
157
|
|
|
51
158
|
@staticmethod
|
|
52
|
-
def compute_workdir(working_dir) -> Path:
|
|
53
|
-
|
|
54
|
-
# user name to it to ensure that multiple users do not overwrite each other's work.'
|
|
55
|
-
working_dir = (Path(working_dir) / getpass.getuser() if working_dir else Path.home()) / "deriva-ml"
|
|
56
|
-
return working_dir.absolute()
|
|
159
|
+
def compute_workdir(working_dir: str | Path | None, catalog_id: str | int | None = None) -> Path:
|
|
160
|
+
"""Compute the effective working directory path.
|
|
57
161
|
|
|
162
|
+
Creates a standardized working directory path. If a base directory is provided,
|
|
163
|
+
appends the current username to prevent conflicts between users. If no directory
|
|
164
|
+
is provided, uses ~/.deriva-ml. The catalog_id is appended to
|
|
165
|
+
separate data from different catalogs.
|
|
166
|
+
|
|
167
|
+
Args:
|
|
168
|
+
working_dir: Base working directory path, or None for default.
|
|
169
|
+
catalog_id: Catalog identifier to include in the path. If None, no
|
|
170
|
+
catalog subdirectory is created.
|
|
171
|
+
|
|
172
|
+
Returns:
|
|
173
|
+
Path: Absolute path to the working directory.
|
|
58
174
|
|
|
175
|
+
Example:
|
|
176
|
+
>>> DerivaMLConfig.compute_workdir('/shared/data', '52')
|
|
177
|
+
PosixPath('/shared/data/username/deriva-ml/52')
|
|
178
|
+
>>> DerivaMLConfig.compute_workdir(None, 1)
|
|
179
|
+
PosixPath('/home/username/.deriva-ml/1')
|
|
180
|
+
"""
|
|
181
|
+
# Append username and deriva-ml to provided path, or use ~/.deriva-ml as base
|
|
182
|
+
if working_dir:
|
|
183
|
+
base_dir = Path(working_dir) / getpass.getuser() / "deriva-ml"
|
|
184
|
+
else:
|
|
185
|
+
base_dir = Path.home() / ".deriva-ml"
|
|
186
|
+
# Append catalog_id if provided
|
|
187
|
+
if catalog_id is not None:
|
|
188
|
+
base_dir = base_dir / str(catalog_id)
|
|
189
|
+
return base_dir.absolute()
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# =============================================================================
|
|
193
|
+
# Hydra Integration
|
|
194
|
+
# =============================================================================
|
|
195
|
+
|
|
196
|
+
# Register custom resolver for computing working directories in Hydra configs
|
|
197
|
+
# This allows ${compute_workdir:${working_dir},${catalog_id}} syntax in YAML configuration files
|
|
59
198
|
OmegaConf.register_new_resolver("compute_workdir", DerivaMLConfig.compute_workdir, replace=True)
|
|
199
|
+
|
|
200
|
+
# Configure Hydra's output directory structure for reproducible runs
|
|
201
|
+
# Outputs are organized by timestamp under the computed working directory
|
|
202
|
+
# For multirun/sweep, outputs go to a sweep subdirectory with job number subfolders
|
|
60
203
|
store(
|
|
61
204
|
HydraConf(
|
|
62
|
-
run=RunDir("${compute_workdir:${deriva_ml.working_dir}}/hydra/${now:%Y-%m-%d_%H-%M-%S}"),
|
|
205
|
+
run=RunDir("${compute_workdir:${deriva_ml.working_dir},${deriva_ml.catalog_id}}/hydra/${now:%Y-%m-%d_%H-%M-%S}"),
|
|
206
|
+
sweep=SweepDir(
|
|
207
|
+
dir="${compute_workdir:${deriva_ml.working_dir},${deriva_ml.catalog_id}}/hydra-sweep/${now:%Y-%m-%d_%H-%M-%S}",
|
|
208
|
+
subdir="${hydra.job.num}",
|
|
209
|
+
),
|
|
63
210
|
output_subdir="hydra-config",
|
|
64
211
|
),
|
|
65
212
|
group="hydra",
|
|
66
213
|
name="config",
|
|
67
214
|
)
|
|
68
215
|
|
|
216
|
+
# Add the configuration to Hydra's store for discovery
|
|
69
217
|
store.add_to_hydra_store()
|
deriva_ml/core/constants.py
CHANGED
|
@@ -1,36 +1,137 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
1
|
+
"""Constants used throughout the DerivaML package.
|
|
2
|
+
|
|
3
|
+
This module defines fundamental constants, type aliases, and regular expressions
|
|
4
|
+
used for validating and working with Deriva catalog structures.
|
|
5
|
+
|
|
6
|
+
Constants:
|
|
7
|
+
ML_SCHEMA: Default schema name for ML-related tables ('deriva-ml').
|
|
8
|
+
DRY_RUN_RID: Special RID used for dry-run operations without database changes.
|
|
9
|
+
|
|
10
|
+
Type Aliases:
|
|
11
|
+
RID: Annotated string type for Resource Identifiers with validation.
|
|
12
|
+
|
|
13
|
+
Regular Expressions:
|
|
14
|
+
rid_part: Pattern for matching the RID portion of an identifier.
|
|
15
|
+
snapshot_part: Pattern for matching an optional '@<snapshot>' suffix (snapshot identifier in RID format).
|
|
16
|
+
rid_regex: Complete pattern for validating RID strings.
|
|
17
|
+
|
|
18
|
+
Column Sets:
|
|
19
|
+
DerivaSystemColumns: Standard Deriva system columns present in all tables.
|
|
20
|
+
DerivaAssetColumns: Columns specific to asset tables (files, etc.).
|
|
21
|
+
|
|
22
|
+
Example:
|
|
23
|
+
>>> from deriva_ml.core.constants import RID, ML_SCHEMA
|
|
24
|
+
>>> def process_entity(rid: RID) -> None:
|
|
25
|
+
... # RID is validated by Pydantic
|
|
26
|
+
... pass
|
|
3
27
|
"""
|
|
4
28
|
|
|
5
29
|
from __future__ import annotations
|
|
6
30
|
|
|
7
|
-
from typing import
|
|
31
|
+
from typing import Annotated
|
|
8
32
|
|
|
9
|
-
from pydantic import
|
|
33
|
+
from pydantic import StringConstraints
|
|
10
34
|
|
|
11
|
-
#
|
|
35
|
+
# =============================================================================
|
|
36
|
+
# Schema Constants
|
|
37
|
+
# =============================================================================
|
|
38
|
+
|
|
39
|
+
# Default schema name for ML-related tables in the catalog
|
|
12
40
|
ML_SCHEMA = "deriva-ml"
|
|
13
41
|
|
|
14
|
-
# Special RID for dry
|
|
42
|
+
# Special RID value used for dry-run operations that don't modify the database
|
|
15
43
|
DRY_RUN_RID = "0000"
|
|
16
44
|
|
|
17
|
-
#
|
|
45
|
+
# System schemas that are part of Deriva infrastructure (not user domain schemas)
|
|
46
|
+
# These are excluded when auto-detecting domain schemas
|
|
47
|
+
SYSTEM_SCHEMAS: frozenset[str] = frozenset({"public", "www", "WWW"})
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def is_system_schema(schema_name: str, ml_schema: str = ML_SCHEMA) -> bool:
|
|
51
|
+
"""Check if a schema is a system or ML schema (not a domain schema).
|
|
52
|
+
|
|
53
|
+
System schemas are Deriva infrastructure schemas (public, www, WWW) and the
|
|
54
|
+
ML schema (deriva-ml by default). Domain schemas are user-defined schemas
|
|
55
|
+
containing business logic tables.
|
|
56
|
+
|
|
57
|
+
Args:
|
|
58
|
+
schema_name: Name of the schema to check.
|
|
59
|
+
ml_schema: Name of the ML schema (default: 'deriva-ml').
|
|
60
|
+
|
|
61
|
+
Returns:
|
|
62
|
+
True if the schema is a system or ML schema, False if it's a domain schema.
|
|
63
|
+
|
|
64
|
+
Example:
|
|
65
|
+
>>> is_system_schema("public")
|
|
66
|
+
True
|
|
67
|
+
>>> is_system_schema("deriva-ml")
|
|
68
|
+
True
|
|
69
|
+
>>> is_system_schema("my_project")
|
|
70
|
+
False
|
|
71
|
+
"""
|
|
72
|
+
return schema_name.lower() in {s.lower() for s in SYSTEM_SCHEMAS} or schema_name == ml_schema
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_domain_schemas(all_schemas: set[str] | list[str], ml_schema: str = ML_SCHEMA) -> frozenset[str]:
|
|
76
|
+
"""Return all domain schemas from a collection of schema names.
|
|
77
|
+
|
|
78
|
+
Filters out system schemas (public, www, WWW) and the ML schema to return
|
|
79
|
+
only user-defined domain schemas.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
all_schemas: Collection of schema names to filter.
|
|
83
|
+
ml_schema: Name of the ML schema to exclude (default: 'deriva-ml').
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
Frozen set of domain schema names.
|
|
87
|
+
|
|
88
|
+
Example:
|
|
89
|
+
>>> get_domain_schemas(["public", "deriva-ml", "my_project", "www"])
|
|
90
|
+
frozenset({'my_project'})
|
|
91
|
+
"""
|
|
92
|
+
return frozenset(s for s in all_schemas if not is_system_schema(s, ml_schema))
|
|
93
|
+
|
|
94
|
+
# =============================================================================
|
|
95
|
+
# RID Regular Expression Components
|
|
96
|
+
# =============================================================================
|
|
97
|
+
|
|
98
|
+
# Pattern for the RID portion: 1-4 uppercase alphanumeric chars (A-Z, 0-9), optionally followed by
|
|
99
|
+
# hyphen-separated groups of exactly 4 alphanumeric chars (e.g., "1ABC" or "1ABC-DEF2-3GHI")
|
|
18
100
|
rid_part = r"(?P<rid>(?:[A-Z\d]{1,4}|[A-Z\d]{1,4}(?:-[A-Z\d]{4})+))"
|
|
101
|
+
|
|
102
|
+
# Pattern for optional snapshot identifier suffix (e.g., "@2XY-ABCD-1234")
|
|
103
|
+
# Uses the same format as RID for the snapshot identifier
|
|
19
104
|
snapshot_part = r"(?:@(?P<snapshot>(?:[A-Z\d]{1,4}|[A-Z\d]{1,4}(?:-[A-Z\d]{4})+)))?"
|
|
105
|
+
|
|
106
|
+
# Complete regex for validating RID strings with optional snapshot
|
|
20
107
|
rid_regex = f"^{rid_part}{snapshot_part}$"
|
|
21
108
|
|
|
22
|
-
#
|
|
23
|
-
|
|
24
|
-
#
|
|
25
|
-
RIDType: TypeAlias = constr(pattern=rid_regex)
|
|
26
|
-
RID = NewType("RID", BaseRIDString)
|
|
109
|
+
# =============================================================================
|
|
110
|
+
# Type Aliases
|
|
111
|
+
# =============================================================================
|
|
27
112
|
|
|
28
|
-
#
|
|
113
|
+
# RID type with Pydantic validation - ensures strings match the RID format
|
|
114
|
+
# Used throughout the codebase for type hints and runtime validation
|
|
115
|
+
RID = Annotated[str, StringConstraints(pattern=rid_regex)]
|
|
116
|
+
|
|
117
|
+
# =============================================================================
|
|
118
|
+
# Column Definitions
|
|
119
|
+
# =============================================================================
|
|
120
|
+
|
|
121
|
+
# Standard Deriva system columns present in every table:
|
|
122
|
+
# - RID: Resource Identifier (unique key)
|
|
123
|
+
# - RCT: Record Creation Time
|
|
124
|
+
# - RMT: Record Modification Time
|
|
125
|
+
# - RCB: Record Created By (user ID)
|
|
126
|
+
# - RMB: Record Modified By (user ID)
|
|
29
127
|
DerivaSystemColumns = ["RID", "RCT", "RMT", "RCB", "RMB"]
|
|
128
|
+
|
|
129
|
+
# Columns specific to asset tables (files, images, etc.)
|
|
130
|
+
# Includes system columns plus asset-specific metadata
|
|
30
131
|
DerivaAssetColumns = {
|
|
31
|
-
"Filename",
|
|
32
|
-
"URL",
|
|
33
|
-
"Length",
|
|
34
|
-
"MD5",
|
|
35
|
-
"Description",
|
|
36
|
-
}.union(set(DerivaSystemColumns))
|
|
132
|
+
"Filename", # Original filename
|
|
133
|
+
"URL", # Hatrac storage URL
|
|
134
|
+
"Length", # File size in bytes
|
|
135
|
+
"MD5", # MD5 checksum for integrity verification
|
|
136
|
+
"Description", # Optional description of the asset
|
|
137
|
+
}.union(set(DerivaSystemColumns))
|
deriva_ml/core/definitions.py
CHANGED
|
@@ -1,23 +1,52 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
3
|
-
This module
|
|
1
|
+
"""Shared definitions for DerivaML modules.
|
|
2
|
+
|
|
3
|
+
This module serves as the central location for type definitions, constants, enums,
|
|
4
|
+
and data models used throughout DerivaML. It re-exports symbols from specialized
|
|
5
|
+
submodules for convenience and backwards compatibility.
|
|
6
|
+
|
|
7
|
+
The module consolidates:
|
|
8
|
+
- Constants: Schema names, RID patterns, column definitions
|
|
9
|
+
- Enums: Status codes, upload states, built-in types, vocabulary identifiers
|
|
10
|
+
- Models: Dataclass-based models for ERMrest structures (tables, columns, keys)
|
|
11
|
+
- Utilities: FileSpec for file metadata handling
|
|
12
|
+
|
|
13
|
+
Core definition classes (ColumnDef, KeyDef, ForeignKeyDef, TableDef) are provided by
|
|
14
|
+
`deriva.core.typed` and re-exported here. Legacy aliases (ColumnDefinition, etc.)
|
|
15
|
+
are maintained for backwards compatibility.
|
|
16
|
+
|
|
17
|
+
This is the recommended import location for most DerivaML type definitions:
|
|
18
|
+
>>> from deriva_ml.core.definitions import RID, MLVocab, TableDef
|
|
19
|
+
|
|
20
|
+
For more specialized imports, you can import directly from submodules:
|
|
21
|
+
>>> from deriva_ml.core.constants import ML_SCHEMA
|
|
22
|
+
>>> from deriva_ml.core.enums import Status
|
|
23
|
+
>>> from deriva.core.typed import ColumnDef
|
|
4
24
|
"""
|
|
5
25
|
|
|
6
26
|
from __future__ import annotations
|
|
7
27
|
|
|
8
|
-
#
|
|
28
|
+
# =============================================================================
|
|
29
|
+
# Re-exported Constants
|
|
30
|
+
# =============================================================================
|
|
31
|
+
# From constants.py: Schema names, RID patterns, and column definitions
|
|
9
32
|
from deriva_ml.core.constants import (
|
|
10
33
|
DRY_RUN_RID,
|
|
11
34
|
ML_SCHEMA,
|
|
12
35
|
RID,
|
|
36
|
+
SYSTEM_SCHEMAS,
|
|
13
37
|
DerivaAssetColumns,
|
|
14
38
|
DerivaSystemColumns,
|
|
39
|
+
get_domain_schemas,
|
|
40
|
+
is_system_schema,
|
|
15
41
|
rid_part,
|
|
16
42
|
rid_regex,
|
|
17
43
|
snapshot_part,
|
|
18
44
|
)
|
|
19
45
|
|
|
20
|
-
#
|
|
46
|
+
# =============================================================================
|
|
47
|
+
# Re-exported Enums
|
|
48
|
+
# =============================================================================
|
|
49
|
+
# From enums.py: Status codes, type identifiers, and vocabulary names
|
|
21
50
|
from deriva_ml.core.enums import (
|
|
22
51
|
BaseStrEnum,
|
|
23
52
|
BuiltinTypes,
|
|
@@ -29,46 +58,127 @@ from deriva_ml.core.enums import (
|
|
|
29
58
|
Status,
|
|
30
59
|
UploadState,
|
|
31
60
|
)
|
|
61
|
+
# Also export BuiltinType directly (BuiltinTypes is the backwards-compatible alias)
|
|
62
|
+
from deriva.core.typed import BuiltinType
|
|
32
63
|
|
|
33
|
-
#
|
|
64
|
+
# =============================================================================
|
|
65
|
+
# Re-exported ERMrest Models
|
|
66
|
+
# =============================================================================
|
|
67
|
+
# From ermrest.py: Dataclass-based models for catalog structure definitions
|
|
68
|
+
# New typed classes from deriva.core.typed
|
|
34
69
|
from deriva_ml.core.ermrest import (
|
|
70
|
+
# New dataclass-based definitions from deriva.core.typed
|
|
71
|
+
ColumnDef,
|
|
72
|
+
KeyDef,
|
|
73
|
+
ForeignKeyDef,
|
|
74
|
+
TableDef,
|
|
75
|
+
VocabularyTableDef,
|
|
76
|
+
AssetTableDef,
|
|
77
|
+
AssociationTableDef,
|
|
78
|
+
SchemaDef,
|
|
79
|
+
# Legacy aliases for backwards compatibility
|
|
35
80
|
ColumnDefinition,
|
|
36
|
-
FileUploadState,
|
|
37
|
-
ForeignKeyDefinition,
|
|
38
81
|
KeyDefinition,
|
|
82
|
+
ForeignKeyDefinition,
|
|
39
83
|
TableDefinition,
|
|
84
|
+
# DerivaML-specific classes
|
|
85
|
+
FileUploadState,
|
|
86
|
+
UploadCallback,
|
|
87
|
+
UploadProgress,
|
|
40
88
|
VocabularyTerm,
|
|
89
|
+
VocabularyTermHandle,
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
# =============================================================================
|
|
93
|
+
# Re-exported Exceptions
|
|
94
|
+
# =============================================================================
|
|
95
|
+
# From exceptions.py: Exception hierarchy for DerivaML errors
|
|
96
|
+
from deriva_ml.core.exceptions import (
|
|
97
|
+
DerivaMLAuthenticationError,
|
|
98
|
+
DerivaMLConfigurationError,
|
|
99
|
+
DerivaMLCycleError,
|
|
100
|
+
DerivaMLDataError,
|
|
101
|
+
DerivaMLDatasetNotFound,
|
|
102
|
+
DerivaMLException,
|
|
103
|
+
DerivaMLExecutionError,
|
|
104
|
+
DerivaMLInvalidTerm,
|
|
105
|
+
DerivaMLNotFoundError,
|
|
106
|
+
DerivaMLReadOnlyError,
|
|
107
|
+
DerivaMLSchemaError,
|
|
108
|
+
DerivaMLTableNotFound,
|
|
109
|
+
DerivaMLTableTypeError,
|
|
110
|
+
DerivaMLUploadError,
|
|
111
|
+
DerivaMLValidationError,
|
|
112
|
+
DerivaMLWorkflowError,
|
|
41
113
|
)
|
|
42
114
|
|
|
43
|
-
#
|
|
115
|
+
# =============================================================================
|
|
116
|
+
# Re-exported Utilities
|
|
117
|
+
# =============================================================================
|
|
118
|
+
# From filespec.py: File metadata and specification handling
|
|
44
119
|
from deriva_ml.core.filespec import FileSpec
|
|
45
120
|
|
|
46
121
|
__all__ = [
|
|
47
122
|
# Constants
|
|
48
123
|
"ML_SCHEMA",
|
|
49
124
|
"DRY_RUN_RID",
|
|
125
|
+
"SYSTEM_SCHEMAS",
|
|
50
126
|
"rid_part",
|
|
51
127
|
"snapshot_part",
|
|
52
128
|
"rid_regex",
|
|
53
129
|
"DerivaSystemColumns",
|
|
54
130
|
"DerivaAssetColumns",
|
|
55
131
|
"RID",
|
|
132
|
+
# Schema classification helpers
|
|
133
|
+
"is_system_schema",
|
|
134
|
+
"get_domain_schemas",
|
|
56
135
|
# Enums
|
|
57
136
|
"BaseStrEnum",
|
|
58
137
|
"UploadState",
|
|
59
138
|
"Status",
|
|
139
|
+
"BuiltinType",
|
|
60
140
|
"BuiltinTypes",
|
|
61
141
|
"MLVocab",
|
|
62
142
|
"MLTable",
|
|
63
143
|
"MLAsset",
|
|
64
144
|
"ExecMetadataType",
|
|
65
145
|
"ExecAssetType",
|
|
66
|
-
#
|
|
67
|
-
"
|
|
68
|
-
"
|
|
69
|
-
"
|
|
146
|
+
# Typed definitions from deriva.core.typed
|
|
147
|
+
"ColumnDef",
|
|
148
|
+
"KeyDef",
|
|
149
|
+
"ForeignKeyDef",
|
|
150
|
+
"TableDef",
|
|
151
|
+
"VocabularyTableDef",
|
|
152
|
+
"AssetTableDef",
|
|
153
|
+
"AssociationTableDef",
|
|
154
|
+
"SchemaDef",
|
|
155
|
+
# Legacy aliases for backwards compatibility
|
|
70
156
|
"ColumnDefinition",
|
|
71
157
|
"KeyDefinition",
|
|
72
158
|
"ForeignKeyDefinition",
|
|
73
159
|
"TableDefinition",
|
|
160
|
+
# DerivaML-specific models
|
|
161
|
+
"FileUploadState",
|
|
162
|
+
"FileSpec",
|
|
163
|
+
"VocabularyTerm",
|
|
164
|
+
"VocabularyTermHandle",
|
|
165
|
+
"UploadProgress",
|
|
166
|
+
"UploadCallback",
|
|
167
|
+
# Exceptions
|
|
168
|
+
"DerivaMLException",
|
|
169
|
+
"DerivaMLConfigurationError",
|
|
170
|
+
"DerivaMLSchemaError",
|
|
171
|
+
"DerivaMLAuthenticationError",
|
|
172
|
+
"DerivaMLDataError",
|
|
173
|
+
"DerivaMLNotFoundError",
|
|
174
|
+
"DerivaMLDatasetNotFound",
|
|
175
|
+
"DerivaMLTableNotFound",
|
|
176
|
+
"DerivaMLInvalidTerm",
|
|
177
|
+
"DerivaMLTableTypeError",
|
|
178
|
+
"DerivaMLValidationError",
|
|
179
|
+
"DerivaMLCycleError",
|
|
180
|
+
"DerivaMLExecutionError",
|
|
181
|
+
"DerivaMLWorkflowError",
|
|
182
|
+
"DerivaMLUploadError",
|
|
183
|
+
"DerivaMLReadOnlyError",
|
|
74
184
|
]
|