deriva-ml 1.17.14__py3-none-any.whl → 1.17.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +2 -2
- deriva_ml/asset/asset.py +0 -4
- deriva_ml/catalog/__init__.py +6 -0
- deriva_ml/catalog/clone.py +1591 -38
- deriva_ml/catalog/localize.py +66 -29
- deriva_ml/core/base.py +12 -9
- deriva_ml/core/definitions.py +13 -12
- deriva_ml/core/ermrest.py +11 -12
- deriva_ml/core/mixins/annotation.py +2 -2
- deriva_ml/core/mixins/asset.py +3 -3
- deriva_ml/core/mixins/dataset.py +3 -3
- deriva_ml/core/mixins/execution.py +1 -0
- deriva_ml/core/mixins/feature.py +2 -2
- deriva_ml/core/mixins/file.py +2 -2
- deriva_ml/core/mixins/path_builder.py +2 -2
- deriva_ml/core/mixins/rid_resolution.py +2 -2
- deriva_ml/core/mixins/vocabulary.py +2 -2
- deriva_ml/core/mixins/workflow.py +3 -3
- deriva_ml/dataset/catalog_graph.py +3 -4
- deriva_ml/dataset/dataset.py +5 -3
- deriva_ml/dataset/dataset_bag.py +0 -2
- deriva_ml/dataset/upload.py +2 -2
- deriva_ml/demo_catalog.py +0 -1
- deriva_ml/execution/__init__.py +8 -8
- deriva_ml/execution/base_config.py +2 -2
- deriva_ml/execution/execution.py +5 -3
- deriva_ml/execution/execution_record.py +0 -1
- deriva_ml/execution/model_protocol.py +1 -1
- deriva_ml/execution/multirun_config.py +0 -1
- deriva_ml/execution/runner.py +3 -3
- deriva_ml/experiment/experiment.py +3 -3
- deriva_ml/feature.py +2 -2
- deriva_ml/interfaces.py +2 -2
- deriva_ml/model/__init__.py +45 -24
- deriva_ml/model/annotations.py +0 -1
- deriva_ml/model/catalog.py +3 -2
- deriva_ml/model/data_loader.py +330 -0
- deriva_ml/model/data_sources.py +439 -0
- deriva_ml/model/database.py +216 -32
- deriva_ml/model/fk_orderer.py +379 -0
- deriva_ml/model/handles.py +1 -1
- deriva_ml/model/schema_builder.py +816 -0
- deriva_ml/run_model.py +3 -3
- deriva_ml/schema/annotations.py +2 -1
- deriva_ml/schema/create_schema.py +1 -1
- deriva_ml/schema/validation.py +1 -1
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/METADATA +1 -1
- deriva_ml-1.17.16.dist-info/RECORD +81 -0
- deriva_ml-1.17.14.dist-info/RECORD +0 -77
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/WHEEL +0 -0
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.14.dist-info → deriva_ml-1.17.16.dist-info}/top_level.txt +0 -0
deriva_ml/model/database.py
CHANGED
|
@@ -1,52 +1,68 @@
|
|
|
1
1
|
"""DerivaML-specific database model for downloaded BDBags.
|
|
2
2
|
|
|
3
|
-
This module provides the DatabaseModel class which
|
|
4
|
-
from
|
|
3
|
+
This module provides the DatabaseModel class which creates a SQLite database
|
|
4
|
+
from a BDBag and provides DerivaML-specific functionality:
|
|
5
5
|
|
|
6
6
|
- Dataset version tracking
|
|
7
7
|
- Dataset RID resolution
|
|
8
8
|
- Integration with DerivaModel for schema analysis
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
The implementation uses a two-phase pattern:
|
|
11
|
+
1. Phase 1 (SchemaBuilder): Create SQLAlchemy ORM from schema.json
|
|
12
|
+
2. Phase 2 (DataLoader): Load data from CSV files
|
|
13
|
+
|
|
14
|
+
For the low-level components, see:
|
|
15
|
+
- schema_builder.py: SchemaBuilder, SchemaORM
|
|
16
|
+
- data_sources.py: DataSource, BagDataSource, CatalogDataSource
|
|
17
|
+
- data_loader.py: DataLoader
|
|
18
|
+
- fk_orderer.py: ForeignKeyOrderer
|
|
11
19
|
"""
|
|
12
20
|
|
|
13
21
|
from __future__ import annotations
|
|
14
22
|
|
|
15
23
|
import logging
|
|
16
24
|
from pathlib import Path
|
|
17
|
-
from typing import Any, Generator,
|
|
25
|
+
from typing import Any, Generator, Type
|
|
18
26
|
|
|
19
|
-
from sqlalchemy import select
|
|
20
|
-
from sqlalchemy.orm import Session
|
|
21
|
-
|
|
22
|
-
from deriva.core.bag_database import BagDatabase
|
|
23
27
|
from deriva.core.ermrest_model import Model
|
|
24
28
|
from deriva.core.ermrest_model import Table as DerivaTable
|
|
29
|
+
from sqlalchemy import Table as SQLTable
|
|
30
|
+
from sqlalchemy import select
|
|
31
|
+
from sqlalchemy.orm import Session
|
|
25
32
|
|
|
26
33
|
from deriva_ml.core.definitions import ML_SCHEMA, RID, get_domain_schemas
|
|
27
34
|
from deriva_ml.core.exceptions import DerivaMLException
|
|
28
35
|
from deriva_ml.dataset.aux_classes import DatasetMinid, DatasetVersion
|
|
29
36
|
from deriva_ml.model.catalog import DerivaModel
|
|
37
|
+
from deriva_ml.model.data_loader import DataLoader
|
|
38
|
+
from deriva_ml.model.data_sources import BagDataSource
|
|
39
|
+
from deriva_ml.model.schema_builder import SchemaBuilder, SchemaORM
|
|
30
40
|
|
|
41
|
+
logger = logging.getLogger(__name__)
|
|
31
42
|
|
|
32
|
-
class DatabaseModel(BagDatabase, DerivaModel):
|
|
33
|
-
"""DerivaML database model for downloaded BDBags.
|
|
34
43
|
|
|
35
|
-
|
|
36
|
-
|
|
44
|
+
class DatabaseModel(DerivaModel):
|
|
45
|
+
"""DerivaML database model for downloaded BDBags.
|
|
37
46
|
|
|
38
|
-
|
|
39
|
-
-
|
|
40
|
-
-
|
|
47
|
+
This class creates a SQLite database from a BDBag and provides:
|
|
48
|
+
- SQLAlchemy ORM access (engine, metadata, Base)
|
|
49
|
+
- DerivaModel schema methods (find_features, is_asset, etc.)
|
|
41
50
|
- Dataset version tracking (bag_rids, dataset_version)
|
|
42
51
|
- Dataset RID validation (rid_lookup)
|
|
43
52
|
|
|
53
|
+
The implementation uses a two-phase pattern:
|
|
54
|
+
1. SchemaBuilder creates SQLAlchemy ORM from schema.json
|
|
55
|
+
2. DataLoader fills the database from CSV files
|
|
56
|
+
|
|
44
57
|
Attributes:
|
|
45
58
|
bag_path: Path to the BDBag directory.
|
|
46
59
|
minid: DatasetMinid for the downloaded bag.
|
|
47
60
|
dataset_rid: Primary dataset RID in this bag.
|
|
48
61
|
bag_rids: Dictionary mapping all dataset RIDs to their versions.
|
|
49
62
|
dataset_table: The Dataset table from the ERMrest model.
|
|
63
|
+
engine: SQLAlchemy engine for database access.
|
|
64
|
+
metadata: SQLAlchemy MetaData with table definitions.
|
|
65
|
+
Base: SQLAlchemy automap base for ORM classes.
|
|
50
66
|
|
|
51
67
|
Example:
|
|
52
68
|
>>> db = DatabaseModel(minid, bag_path, working_dir)
|
|
@@ -66,33 +82,47 @@ class DatabaseModel(BagDatabase, DerivaModel):
|
|
|
66
82
|
self._logger = logging.getLogger("deriva_ml")
|
|
67
83
|
self.minid = minid
|
|
68
84
|
self.dataset_rid = minid.dataset_rid
|
|
85
|
+
self.bag_path = bag_path
|
|
69
86
|
|
|
70
|
-
# Load the model
|
|
87
|
+
# Load the model from schema.json
|
|
71
88
|
schema_file = bag_path / "data/schema.json"
|
|
72
|
-
|
|
89
|
+
model = Model.fromfile("file-system", schema_file)
|
|
73
90
|
|
|
74
|
-
# Determine
|
|
91
|
+
# Determine schemas using schema classification
|
|
75
92
|
ml_schema = ML_SCHEMA
|
|
76
|
-
domain_schemas = get_domain_schemas(
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
93
|
+
domain_schemas = get_domain_schemas(model.schemas.keys(), ml_schema)
|
|
94
|
+
schemas = [*domain_schemas, ml_schema]
|
|
95
|
+
|
|
96
|
+
# Extract bag checksum for unique database path
|
|
97
|
+
bag_cache_dir = bag_path.parent.name
|
|
98
|
+
self.database_dir = dbase_path / bag_cache_dir
|
|
99
|
+
self.database_dir.mkdir(parents=True, exist_ok=True)
|
|
100
|
+
|
|
101
|
+
# Phase 1: Build ORM structure
|
|
102
|
+
builder = SchemaBuilder(
|
|
103
|
+
model=model,
|
|
104
|
+
schemas=schemas,
|
|
105
|
+
database_path=self.database_dir,
|
|
84
106
|
)
|
|
107
|
+
self._orm: SchemaORM = builder.build()
|
|
108
|
+
|
|
109
|
+
# Phase 2: Load data from bag CSVs
|
|
110
|
+
source = BagDataSource(bag_path, model=model)
|
|
111
|
+
loader = DataLoader(self._orm, source)
|
|
112
|
+
load_counts = loader.load_tables()
|
|
113
|
+
|
|
114
|
+
total_rows = sum(load_counts.values())
|
|
115
|
+
self._logger.debug(f"Loaded {total_rows} rows from bag")
|
|
85
116
|
|
|
86
117
|
# Initialize DerivaModel (provides schema analysis methods)
|
|
87
|
-
# Note: We pass self.model which was set by BagDatabase
|
|
88
118
|
DerivaModel.__init__(
|
|
89
119
|
self,
|
|
90
|
-
model=
|
|
120
|
+
model=model,
|
|
91
121
|
ml_schema=ml_schema,
|
|
92
122
|
domain_schemas=domain_schemas,
|
|
93
123
|
)
|
|
94
124
|
|
|
95
|
-
self.dataset_table =
|
|
125
|
+
self.dataset_table = model.schemas[ml_schema].tables["Dataset"]
|
|
96
126
|
|
|
97
127
|
# Build dataset RID -> version mapping from Dataset_Version table
|
|
98
128
|
self._build_bag_rids()
|
|
@@ -103,6 +133,34 @@ class DatabaseModel(BagDatabase, DerivaModel):
|
|
|
103
133
|
self.database_dir,
|
|
104
134
|
)
|
|
105
135
|
|
|
136
|
+
# =========================================================================
|
|
137
|
+
# Property delegates to SchemaORM
|
|
138
|
+
# =========================================================================
|
|
139
|
+
|
|
140
|
+
@property
|
|
141
|
+
def engine(self):
|
|
142
|
+
"""SQLAlchemy engine for database access."""
|
|
143
|
+
return self._orm.engine
|
|
144
|
+
|
|
145
|
+
@property
|
|
146
|
+
def metadata(self):
|
|
147
|
+
"""SQLAlchemy MetaData with table definitions."""
|
|
148
|
+
return self._orm.metadata
|
|
149
|
+
|
|
150
|
+
@property
|
|
151
|
+
def Base(self):
|
|
152
|
+
"""SQLAlchemy automap base for ORM classes."""
|
|
153
|
+
return self._orm.Base
|
|
154
|
+
|
|
155
|
+
@property
|
|
156
|
+
def schemas(self) -> list[str]:
|
|
157
|
+
"""List of schema names in the database."""
|
|
158
|
+
return self._orm.schemas
|
|
159
|
+
|
|
160
|
+
# =========================================================================
|
|
161
|
+
# Dataset version tracking
|
|
162
|
+
# =========================================================================
|
|
163
|
+
|
|
106
164
|
def _build_bag_rids(self) -> None:
|
|
107
165
|
"""Build mapping of dataset RIDs to their versions in this bag."""
|
|
108
166
|
self.bag_rids: dict[RID, DatasetVersion] = {}
|
|
@@ -121,7 +179,7 @@ class DatabaseModel(BagDatabase, DerivaModel):
|
|
|
121
179
|
if rid not in self.bag_rids or version > self.bag_rids[rid]:
|
|
122
180
|
self.bag_rids[rid] = version
|
|
123
181
|
|
|
124
|
-
def dataset_version(self, dataset_rid:
|
|
182
|
+
def dataset_version(self, dataset_rid: RID | None = None) -> DatasetVersion:
|
|
125
183
|
"""Get the version of a dataset in this bag.
|
|
126
184
|
|
|
127
185
|
Args:
|
|
@@ -154,6 +212,107 @@ class DatabaseModel(BagDatabase, DerivaModel):
|
|
|
154
212
|
return self.bag_rids[dataset_rid]
|
|
155
213
|
raise DerivaMLException(f"Dataset {dataset_rid} not found in this bag")
|
|
156
214
|
|
|
215
|
+
# =========================================================================
|
|
216
|
+
# Table/ORM access methods - delegate to SchemaORM
|
|
217
|
+
# =========================================================================
|
|
218
|
+
|
|
219
|
+
def list_tables(self) -> list[str]:
|
|
220
|
+
"""List all tables in the database.
|
|
221
|
+
|
|
222
|
+
Returns:
|
|
223
|
+
List of fully-qualified table names (schema.table), sorted.
|
|
224
|
+
"""
|
|
225
|
+
return self._orm.list_tables()
|
|
226
|
+
|
|
227
|
+
def find_table(self, table_name: str) -> SQLTable:
|
|
228
|
+
"""Find a table by name.
|
|
229
|
+
|
|
230
|
+
Args:
|
|
231
|
+
table_name: Table name, with or without schema prefix.
|
|
232
|
+
|
|
233
|
+
Returns:
|
|
234
|
+
SQLAlchemy Table object.
|
|
235
|
+
|
|
236
|
+
Raises:
|
|
237
|
+
KeyError: If table not found.
|
|
238
|
+
"""
|
|
239
|
+
return self._orm.find_table(table_name)
|
|
240
|
+
|
|
241
|
+
def get_table_contents(self, table: str) -> Generator[dict[str, Any], None, None]:
|
|
242
|
+
"""Retrieve all rows from a table as dictionaries.
|
|
243
|
+
|
|
244
|
+
Args:
|
|
245
|
+
table: Table name (with or without schema prefix).
|
|
246
|
+
|
|
247
|
+
Yields:
|
|
248
|
+
Dictionary for each row with column names as keys.
|
|
249
|
+
"""
|
|
250
|
+
yield from self._orm.get_table_contents(table)
|
|
251
|
+
|
|
252
|
+
def get_orm_class_by_name(self, table_name: str) -> Any | None:
|
|
253
|
+
"""Get the ORM class for a table by name.
|
|
254
|
+
|
|
255
|
+
Args:
|
|
256
|
+
table_name: Table name, with or without schema prefix.
|
|
257
|
+
|
|
258
|
+
Returns:
|
|
259
|
+
SQLAlchemy ORM class for the table.
|
|
260
|
+
|
|
261
|
+
Raises:
|
|
262
|
+
KeyError: If table not found.
|
|
263
|
+
"""
|
|
264
|
+
return self._orm.get_orm_class(table_name)
|
|
265
|
+
|
|
266
|
+
def get_orm_class_for_table(self, table: SQLTable | DerivaTable | str) -> Any | None:
|
|
267
|
+
"""Get the ORM class for a table.
|
|
268
|
+
|
|
269
|
+
Args:
|
|
270
|
+
table: SQLAlchemy Table, Deriva Table, or table name.
|
|
271
|
+
|
|
272
|
+
Returns:
|
|
273
|
+
SQLAlchemy ORM class, or None if not found.
|
|
274
|
+
"""
|
|
275
|
+
return self._orm.get_orm_class_for_table(table)
|
|
276
|
+
|
|
277
|
+
@staticmethod
|
|
278
|
+
def is_association_table(
|
|
279
|
+
table_class,
|
|
280
|
+
min_arity: int = 2,
|
|
281
|
+
max_arity: int = 2,
|
|
282
|
+
unqualified: bool = True,
|
|
283
|
+
pure: bool = True,
|
|
284
|
+
no_overlap: bool = True,
|
|
285
|
+
return_fkeys: bool = False,
|
|
286
|
+
):
|
|
287
|
+
"""Check if an ORM class represents an association table.
|
|
288
|
+
|
|
289
|
+
Delegates to SchemaORM.is_association_table.
|
|
290
|
+
"""
|
|
291
|
+
return SchemaORM.is_association_table(
|
|
292
|
+
table_class, min_arity, max_arity, unqualified, pure, no_overlap, return_fkeys
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
def get_association_class(
|
|
296
|
+
self,
|
|
297
|
+
left_cls: Type[Any],
|
|
298
|
+
right_cls: Type[Any],
|
|
299
|
+
) -> tuple[Any, Any, Any] | None:
|
|
300
|
+
"""Find an association class connecting two ORM classes.
|
|
301
|
+
|
|
302
|
+
Args:
|
|
303
|
+
left_cls: First ORM class.
|
|
304
|
+
right_cls: Second ORM class.
|
|
305
|
+
|
|
306
|
+
Returns:
|
|
307
|
+
Tuple of (association_class, left_relationship, right_relationship),
|
|
308
|
+
or None if no association found.
|
|
309
|
+
"""
|
|
310
|
+
return self._orm.get_association_class(left_cls, right_cls)
|
|
311
|
+
|
|
312
|
+
# =========================================================================
|
|
313
|
+
# Compatibility methods
|
|
314
|
+
# =========================================================================
|
|
315
|
+
|
|
157
316
|
def _get_table_contents(self, table: str) -> Generator[dict[str, Any], None, None]:
|
|
158
317
|
"""Retrieve table contents as dictionaries.
|
|
159
318
|
|
|
@@ -195,14 +354,26 @@ class DatabaseModel(BagDatabase, DerivaModel):
|
|
|
195
354
|
result = session.execute(cmd).mappings().first()
|
|
196
355
|
return dict(result) if result else None
|
|
197
356
|
|
|
198
|
-
# Compatibility aliases for methods that have different names in BagDatabase
|
|
199
357
|
def get_orm_association_class(self, left_cls, right_cls, **kwargs):
|
|
200
358
|
"""Find association class between two ORM classes.
|
|
201
359
|
|
|
202
|
-
Wrapper around
|
|
360
|
+
Wrapper around get_association_class for compatibility.
|
|
203
361
|
"""
|
|
204
362
|
return self.get_association_class(left_cls, right_cls)
|
|
205
363
|
|
|
364
|
+
# =========================================================================
|
|
365
|
+
# Resource management
|
|
366
|
+
# =========================================================================
|
|
367
|
+
|
|
368
|
+
def dispose(self) -> None:
|
|
369
|
+
"""Dispose of SQLAlchemy resources.
|
|
370
|
+
|
|
371
|
+
Call this when done with the database to properly clean up connections.
|
|
372
|
+
After calling dispose(), the instance should not be used further.
|
|
373
|
+
"""
|
|
374
|
+
if hasattr(self, "_orm") and self._orm is not None:
|
|
375
|
+
self._orm.dispose()
|
|
376
|
+
|
|
206
377
|
def delete_database(self) -> None:
|
|
207
378
|
"""Delete the database files.
|
|
208
379
|
|
|
@@ -212,3 +383,16 @@ class DatabaseModel(BagDatabase, DerivaModel):
|
|
|
212
383
|
self.dispose()
|
|
213
384
|
# Note: We don't actually delete files here to avoid data loss.
|
|
214
385
|
# The caller should handle file deletion if needed.
|
|
386
|
+
|
|
387
|
+
def __del__(self) -> None:
|
|
388
|
+
"""Cleanup resources when garbage collected."""
|
|
389
|
+
self.dispose()
|
|
390
|
+
|
|
391
|
+
def __enter__(self) -> "DatabaseModel":
|
|
392
|
+
"""Context manager entry."""
|
|
393
|
+
return self
|
|
394
|
+
|
|
395
|
+
def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
|
|
396
|
+
"""Context manager exit - dispose resources."""
|
|
397
|
+
self.dispose()
|
|
398
|
+
return False
|