deriva-ml 1.17.9__py3-none-any.whl → 1.17.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +43 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +21 -0
- deriva_ml/catalog/clone.py +1199 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +817 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +186 -105
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +545 -244
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +224 -35
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -5
- deriva_ml-1.17.11.dist-info/RECORD +77 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +2 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.9.dist-info/RECORD +0 -45
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Path builder mixin for DerivaML.
|
|
2
|
+
|
|
3
|
+
This module provides the PathBuilderMixin class which handles
|
|
4
|
+
catalog path building and table access utilities.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Iterable
|
|
11
|
+
|
|
12
|
+
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
13
|
+
import importlib
|
|
14
|
+
datapath = importlib.import_module("deriva.core.datapath")
|
|
15
|
+
_ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")
|
|
16
|
+
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
|
|
17
|
+
|
|
18
|
+
SchemaWrapper = datapath._SchemaWrapper
|
|
19
|
+
ErmrestCatalog = _ermrest_catalog.ErmrestCatalog
|
|
20
|
+
ErmrestSnapshot = _ermrest_catalog.ErmrestSnapshot
|
|
21
|
+
Table = _ermrest_model.Table
|
|
22
|
+
|
|
23
|
+
import pandas as pd
|
|
24
|
+
|
|
25
|
+
from deriva_ml.dataset.upload import table_path as _table_path
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from deriva_ml.model.catalog import DerivaModel
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class PathBuilderMixin:
|
|
32
|
+
"""Mixin providing path building and table access utilities.
|
|
33
|
+
|
|
34
|
+
This mixin requires the host class to have:
|
|
35
|
+
- catalog: ErmrestCatalog or ErmrestSnapshot instance
|
|
36
|
+
- domain_schemas: frozenset[str] - names of the domain schemas (plus default_schema: str | None)
|
|
37
|
+
- model: DerivaModel instance
|
|
38
|
+
- working_dir: Path - working directory path
|
|
39
|
+
|
|
40
|
+
Methods:
|
|
41
|
+
pathBuilder: Get catalog path builder for queries
|
|
42
|
+
domain_path: Get path builder for a domain schema (method; defaults to the default schema)
|
|
43
|
+
table_path: Get local filesystem path for table CSV files
|
|
44
|
+
get_table_as_dataframe: Get table contents as pandas DataFrame
|
|
45
|
+
get_table_as_dict: Get table contents as dictionaries
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
# Type hints for IDE support - actual attributes from host class
|
|
49
|
+
catalog: ErmrestCatalog | ErmrestSnapshot
|
|
50
|
+
domain_schemas: frozenset[str]
|
|
51
|
+
default_schema: str | None
|
|
52
|
+
model: "DerivaModel"
|
|
53
|
+
working_dir: Path
|
|
54
|
+
|
|
55
|
+
def pathBuilder(self) -> datapath._CatalogWrapper:
    """Return the catalog path builder used to construct queries.

    This is a method, not a property, so it must be called. The result is
    whatever ``self.catalog.getPathBuilder()`` produces; callers index its
    ``schemas`` mapping to reach schema- and table-level paths.

    Returns:
        datapath._CatalogWrapper: Catalog-level path builder obtained from
            ``self.catalog``.

    Example:
        >>> path = ml.pathBuilder().schemas['my_schema'].tables['my_table']
        >>> results = path.entities().fetch()
    """
    return self.catalog.getPathBuilder()
|
|
69
|
+
|
|
70
|
+
def domain_path(self, schema: str | None = None) -> SchemaWrapper:
    """Return the path builder scoped to a single domain schema.

    Args:
        schema: Name of the schema to look up. When omitted (or otherwise
            falsy), the name returned by
            ``self.model._require_default_schema()`` is used instead.

    Returns:
        SchemaWrapper: The entry for the chosen schema in the ``schemas``
            mapping of the catalog path builder.

    Raises:
        DerivaMLException: Presumably raised by ``_require_default_schema()``
            when no schema is given and no default is configured
            (NOTE(review): confirm against DerivaModel).

    Example:
        >>> domain = ml.domain_path()  # Uses default schema
        >>> results = domain.my_table.entities().fetch()
        >>> # Or with explicit schema:
        >>> domain = ml.domain_path("my_schema")
    """
    schema = schema or self.model._require_default_schema()
    return self.pathBuilder().schemas[schema]
|
|
92
|
+
|
|
93
|
+
def table_path(self, table: str | Table, schema: str | None = None) -> Path:
    """Return the local filesystem path for a table's CSV file.

    The table is first resolved through ``self.model.name_to_table``; the
    location itself is computed by the upload module's ``table_path`` helper
    relative to ``self.working_dir``.

    Args:
        table: Name of the table, or a Table object, to get the path for.
        schema: Schema name to use in the path. When omitted (or otherwise
            falsy), the name of the schema owning the resolved table is used.

    Returns:
        Path: Filesystem path where the CSV file should be placed.

    Example:
        >>> path = ml.table_path("experiment_results")
        >>> df.to_csv(path)  # Save data for upload
    """
    resolved = self.model.name_to_table(table)
    # An explicitly supplied schema wins; otherwise fall back to the schema
    # that owns the resolved table.
    target_schema = schema if schema else resolved.schema.name
    return _table_path(self.working_dir, schema=target_schema, table=resolved.name)
|
|
118
|
+
|
|
119
|
+
def get_table_as_dataframe(self, table: str) -> pd.DataFrame:
    """Load every row of a catalog table into a pandas DataFrame.

    Args:
        table: Name of the table to retrieve.

    Returns:
        DataFrame built from the row dictionaries yielded by
        ``get_table_as_dict`` for ``table``.
    """
    # Drain the row iterator up front so pandas receives a concrete list.
    rows = [*self.get_table_as_dict(table)]
    return pd.DataFrame(rows)
|
|
131
|
+
|
|
132
|
+
def get_table_as_dict(self, table: str) -> Iterable[dict[str, Any]]:
    """Yield the rows of a catalog table as dictionaries.

    This is a generator: nothing is resolved or fetched until iteration
    begins.

    Args:
        table: Name of the table to retrieve.

    Returns:
        Iterable yielding one dictionary per fetched row.
    """
    resolved = self.model.name_to_table(table)
    schema_paths = self.pathBuilder().schemas[resolved.schema.name]
    entity_set = schema_paths.tables[resolved.name].entities()
    yield from entity_set.fetch()
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""RID resolution mixin for DerivaML.
|
|
2
|
+
|
|
3
|
+
This module provides the RidResolutionMixin class which handles
|
|
4
|
+
Resource Identifier (RID) resolution and retrieval operations.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import TYPE_CHECKING, Any
|
|
11
|
+
|
|
12
|
+
# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
|
|
13
|
+
import importlib
|
|
14
|
+
_datapath = importlib.import_module("deriva.core.datapath")
|
|
15
|
+
_ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")
|
|
16
|
+
_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
|
|
17
|
+
|
|
18
|
+
AnyQuantifier = _datapath.Any
|
|
19
|
+
ErmrestCatalog = _ermrest_catalog.ErmrestCatalog
|
|
20
|
+
ErmrestSnapshot = _ermrest_catalog.ErmrestSnapshot
|
|
21
|
+
ResolveRidResult = _ermrest_catalog.ResolveRidResult
|
|
22
|
+
Table = _ermrest_model.Table
|
|
23
|
+
|
|
24
|
+
from deriva_ml.core.definitions import RID
|
|
25
|
+
from deriva_ml.core.exceptions import DerivaMLException
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from deriva_ml.model.catalog import DerivaModel
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class BatchRidResult:
    """Location of one RID found by batch resolution.

    Attributes:
        rid: The resolved RID.
        table: The Table object in which the RID was found.
        table_name: Name of that table.
        schema_name: Name of the schema that owns the table.
    """

    # Field order is part of the positional constructor signature; keep it.
    rid: RID
    table: Table
    table_name: str
    schema_name: str
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class RidResolutionMixin:
|
|
49
|
+
"""Mixin providing RID resolution and retrieval operations.
|
|
50
|
+
|
|
51
|
+
This mixin requires the host class to have:
|
|
52
|
+
- catalog: ErmrestCatalog or ErmrestSnapshot instance
|
|
53
|
+
- model: DerivaModel instance (with .model attribute for ermrest model)
|
|
54
|
+
- pathBuilder(): method returning catalog path builder
|
|
55
|
+
|
|
56
|
+
Methods:
|
|
57
|
+
resolve_rid: Resolve a RID to its catalog location
|
|
58
|
+
resolve_rids: Batch resolve multiple RIDs efficiently
|
|
59
|
+
retrieve_rid: Retrieve the complete record for a RID
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
# Type hints for IDE support - actual attributes from host class
|
|
63
|
+
catalog: ErmrestCatalog | ErmrestSnapshot
|
|
64
|
+
model: "DerivaModel"
|
|
65
|
+
pathBuilder: Any # Callable returning path builder
|
|
66
|
+
|
|
67
|
+
def resolve_rid(self, rid: RID) -> ResolveRidResult:
    """Resolve a RID to its location in the catalog.

    Delegates to ``self.catalog.resolve_rid`` using the ERMrest model held in
    ``self.model.model``.

    Args:
        rid: Resource Identifier to resolve.

    Returns:
        ResolveRidResult: The value returned by the catalog's ``resolve_rid``.
            Its ``datapath`` attribute can be used to fetch the entity.

    Raises:
        DerivaMLException: If the catalog lookup raises ``KeyError``. The
            original ``KeyError`` is attached as ``__cause__``.

    Examples:
        >>> result = ml.resolve_rid("1-abc123")
        >>> data = result.datapath.entities().fetch()
    """
    try:
        return self.catalog.resolve_rid(rid, self.model.model)
    except KeyError as err:
        # Chain explicitly so the underlying lookup failure stays visible in
        # tracebacks instead of being reported as an unrelated error.
        raise DerivaMLException(f"Invalid RID {rid}") from err
|
|
96
|
+
|
|
97
|
+
def retrieve_rid(self, rid: RID) -> dict[str, Any]:
    """Fetch the complete record identified by a RID.

    Args:
        rid: Resource Identifier of the record to retrieve.

    Returns:
        dict[str, Any]: The first entity fetched from the datapath of the
            resolved RID.

    Raises:
        DerivaMLException: Propagated from ``resolve_rid`` when the RID cannot
            be resolved.

    Example:
        >>> record = ml.retrieve_rid("1-abc123")
        >>> print(record["RID"])
    """
    location = self.resolve_rid(rid)
    matches = location.datapath.entities().fetch()
    # The resolved datapath targets this one RID, so the first entity is the record.
    return matches[0]
|
|
117
|
+
|
|
118
|
+
def resolve_rids(
    self,
    rids: set[RID] | list[RID],
    candidate_tables: list[Table] | None = None,
) -> dict[RID, BatchRidResult]:
    """Batch resolve multiple RIDs.

    Issues one RID-filtered query per candidate table instead of one lookup
    per RID. Tables are visited in order and each RID is attributed to the
    first table in which it is found; the search stops as soon as every RID
    has been located.

    Args:
        rids: Set or list of RIDs to resolve. The argument is not mutated.
        candidate_tables: Optional list of Table objects to search in.
            If not provided, every table of the model's domain schemas and
            ML schema is searched.

    Returns:
        dict[RID, BatchRidResult]: Mapping from each resolved RID to its
            BatchRidResult. Empty when ``rids`` is empty.

    Raises:
        DerivaMLException: If any RID is not found in the searched tables.
            When a table query failed along the way, the most recent failure
            is attached as ``__cause__``.

    Example:
        >>> results = ml.resolve_rids(["1-ABC", "2-DEF", "3-GHI"])
        >>> for rid, info in results.items():
        ...     print(f"{rid} is in table {info.table_name}")
    """
    import logging  # local import keeps this method self-contained

    # Private working copy: de-duplicates the input and protects the caller's
    # collection from the removals below.
    remaining_rids = set(rids)
    if not remaining_rids:
        return {}

    results: dict[RID, BatchRidResult] = {}

    # Determine which tables to search
    if candidate_tables is None:
        candidate_tables = []
        for schema_name in [*self.model.domain_schemas, self.model.ml_schema]:
            schema = self.model.model.schemas.get(schema_name)
            if schema:
                candidate_tables.extend(schema.tables.values())

    pb = self.pathBuilder()
    logger = logging.getLogger(__name__)
    last_error: Exception | None = None

    for table in candidate_tables:
        if not remaining_rids:
            break

        schema_name = table.schema.name
        table_name = table.name
        rid_query = pb.schemas[schema_name].tables[table_name]

        try:
            # RID = any(rid1, rid2, ...) is the IN-style filter; only the RID
            # column is requested to keep the response small.
            found_entities = list(
                rid_query.filter(rid_query.RID == AnyQuantifier(*remaining_rids))
                .attributes(rid_query.RID)
                .fetch()
            )
        except Exception as err:
            # Best effort: a table that cannot serve this query is skipped,
            # but the failure is recorded rather than discarded so that an
            # outage is not misreported as plain "Invalid RIDs".
            last_error = err
            logger.debug(
                "Skipping %s.%s during batch RID resolution",
                schema_name,
                table_name,
                exc_info=True,
            )
            continue

        for entity in found_entities:
            rid = entity["RID"]
            if rid in remaining_rids:
                results[rid] = BatchRidResult(
                    rid=rid,
                    table=table,
                    table_name=table_name,
                    schema_name=schema_name,
                )
                remaining_rids.remove(rid)

    if remaining_rids:
        # `from None` (no recorded failure) leaves the error without a cause.
        raise DerivaMLException(f"Invalid RIDs: {remaining_rids}") from last_error

    return results
|