deriva_ml-1.17.15-py3-none-any.whl → deriva_ml-1.17.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. deriva_ml/__init__.py +2 -2
  2. deriva_ml/asset/asset.py +0 -4
  3. deriva_ml/catalog/__init__.py +6 -0
  4. deriva_ml/catalog/clone.py +1513 -22
  5. deriva_ml/catalog/localize.py +66 -29
  6. deriva_ml/core/base.py +12 -9
  7. deriva_ml/core/definitions.py +13 -12
  8. deriva_ml/core/ermrest.py +11 -12
  9. deriva_ml/core/mixins/annotation.py +2 -2
  10. deriva_ml/core/mixins/asset.py +3 -3
  11. deriva_ml/core/mixins/dataset.py +3 -3
  12. deriva_ml/core/mixins/execution.py +1 -0
  13. deriva_ml/core/mixins/feature.py +2 -2
  14. deriva_ml/core/mixins/file.py +2 -2
  15. deriva_ml/core/mixins/path_builder.py +2 -2
  16. deriva_ml/core/mixins/rid_resolution.py +2 -2
  17. deriva_ml/core/mixins/vocabulary.py +2 -2
  18. deriva_ml/core/mixins/workflow.py +3 -3
  19. deriva_ml/dataset/catalog_graph.py +3 -4
  20. deriva_ml/dataset/dataset.py +5 -3
  21. deriva_ml/dataset/dataset_bag.py +0 -2
  22. deriva_ml/dataset/upload.py +2 -2
  23. deriva_ml/demo_catalog.py +0 -1
  24. deriva_ml/execution/__init__.py +8 -8
  25. deriva_ml/execution/base_config.py +2 -2
  26. deriva_ml/execution/execution.py +5 -3
  27. deriva_ml/execution/execution_record.py +0 -1
  28. deriva_ml/execution/model_protocol.py +1 -1
  29. deriva_ml/execution/multirun_config.py +0 -1
  30. deriva_ml/execution/runner.py +3 -3
  31. deriva_ml/experiment/experiment.py +3 -3
  32. deriva_ml/feature.py +2 -2
  33. deriva_ml/interfaces.py +2 -2
  34. deriva_ml/model/__init__.py +45 -24
  35. deriva_ml/model/annotations.py +0 -1
  36. deriva_ml/model/catalog.py +3 -2
  37. deriva_ml/model/data_loader.py +330 -0
  38. deriva_ml/model/data_sources.py +439 -0
  39. deriva_ml/model/database.py +216 -32
  40. deriva_ml/model/fk_orderer.py +379 -0
  41. deriva_ml/model/handles.py +1 -1
  42. deriva_ml/model/schema_builder.py +816 -0
  43. deriva_ml/run_model.py +3 -3
  44. deriva_ml/schema/annotations.py +2 -1
  45. deriva_ml/schema/create_schema.py +1 -1
  46. deriva_ml/schema/validation.py +1 -1
  47. {deriva_ml-1.17.15.dist-info → deriva_ml-1.17.16.dist-info}/METADATA +1 -1
  48. deriva_ml-1.17.16.dist-info/RECORD +81 -0
  49. deriva_ml-1.17.15.dist-info/RECORD +0 -77
  50. {deriva_ml-1.17.15.dist-info → deriva_ml-1.17.16.dist-info}/WHEEL +0 -0
  51. {deriva_ml-1.17.15.dist-info → deriva_ml-1.17.16.dist-info}/entry_points.txt +0 -0
  52. {deriva_ml-1.17.15.dist-info → deriva_ml-1.17.16.dist-info}/licenses/LICENSE +0 -0
  53. {deriva_ml-1.17.15.dist-info → deriva_ml-1.17.16.dist-info}/top_level.txt +0 -0
@@ -1,52 +1,68 @@
1
1
  """DerivaML-specific database model for downloaded BDBags.
2
2
 
3
- This module provides the DatabaseModel class which extends the generic BagDatabase
4
- from deriva-py with DerivaML-specific functionality:
3
+ This module provides the DatabaseModel class which creates a SQLite database
4
+ from a BDBag and provides DerivaML-specific functionality:
5
5
 
6
6
  - Dataset version tracking
7
7
  - Dataset RID resolution
8
8
  - Integration with DerivaModel for schema analysis
9
9
 
10
- For schema-independent BDBag operations, see deriva.core.bag_database.BagDatabase.
10
+ The implementation uses a two-phase pattern:
11
+ 1. Phase 1 (SchemaBuilder): Create SQLAlchemy ORM from schema.json
12
+ 2. Phase 2 (DataLoader): Load data from CSV files
13
+
14
+ For the low-level components, see:
15
+ - schema_builder.py: SchemaBuilder, SchemaORM
16
+ - data_sources.py: DataSource, BagDataSource, CatalogDataSource
17
+ - data_loader.py: DataLoader
18
+ - fk_orderer.py: ForeignKeyOrderer
11
19
  """
12
20
 
13
21
  from __future__ import annotations
14
22
 
15
23
  import logging
16
24
  from pathlib import Path
17
- from typing import Any, Generator, Optional
25
+ from typing import Any, Generator, Type
18
26
 
19
- from sqlalchemy import select
20
- from sqlalchemy.orm import Session
21
-
22
- from deriva.core.bag_database import BagDatabase
23
27
  from deriva.core.ermrest_model import Model
24
28
  from deriva.core.ermrest_model import Table as DerivaTable
29
+ from sqlalchemy import Table as SQLTable
30
+ from sqlalchemy import select
31
+ from sqlalchemy.orm import Session
25
32
 
26
33
  from deriva_ml.core.definitions import ML_SCHEMA, RID, get_domain_schemas
27
34
  from deriva_ml.core.exceptions import DerivaMLException
28
35
  from deriva_ml.dataset.aux_classes import DatasetMinid, DatasetVersion
29
36
  from deriva_ml.model.catalog import DerivaModel
37
+ from deriva_ml.model.data_loader import DataLoader
38
+ from deriva_ml.model.data_sources import BagDataSource
39
+ from deriva_ml.model.schema_builder import SchemaBuilder, SchemaORM
30
40
 
41
+ logger = logging.getLogger(__name__)
31
42
 
32
- class DatabaseModel(BagDatabase, DerivaModel):
33
- """DerivaML database model for downloaded BDBags.
34
43
 
35
- This class combines the generic BagDatabase functionality with DerivaML-specific
36
- features like dataset versioning and the DerivaModel schema utilities.
44
+ class DatabaseModel(DerivaModel):
45
+ """DerivaML database model for downloaded BDBags.
37
46
 
38
- It reads a BDBag and creates a SQLite database, then provides:
39
- - All BagDatabase query methods (list_tables, get_table_contents, etc.)
40
- - All DerivaModel schema methods (find_features, is_asset, etc.)
47
+ This class creates a SQLite database from a BDBag and provides:
48
+ - SQLAlchemy ORM access (engine, metadata, Base)
49
+ - DerivaModel schema methods (find_features, is_asset, etc.)
41
50
  - Dataset version tracking (bag_rids, dataset_version)
42
51
  - Dataset RID validation (rid_lookup)
43
52
 
53
+ The implementation uses a two-phase pattern:
54
+ 1. SchemaBuilder creates SQLAlchemy ORM from schema.json
55
+ 2. DataLoader fills the database from CSV files
56
+
44
57
  Attributes:
45
58
  bag_path: Path to the BDBag directory.
46
59
  minid: DatasetMinid for the downloaded bag.
47
60
  dataset_rid: Primary dataset RID in this bag.
48
61
  bag_rids: Dictionary mapping all dataset RIDs to their versions.
49
62
  dataset_table: The Dataset table from the ERMrest model.
63
+ engine: SQLAlchemy engine for database access.
64
+ metadata: SQLAlchemy MetaData with table definitions.
65
+ Base: SQLAlchemy automap base for ORM classes.
50
66
 
51
67
  Example:
52
68
  >>> db = DatabaseModel(minid, bag_path, working_dir)
@@ -66,33 +82,47 @@ class DatabaseModel(BagDatabase, DerivaModel):
66
82
  self._logger = logging.getLogger("deriva_ml")
67
83
  self.minid = minid
68
84
  self.dataset_rid = minid.dataset_rid
85
+ self.bag_path = bag_path
69
86
 
70
- # Load the model first to determine schema names
87
+ # Load the model from schema.json
71
88
  schema_file = bag_path / "data/schema.json"
72
- temp_model = Model.fromfile("file-system", schema_file)
89
+ model = Model.fromfile("file-system", schema_file)
73
90
 
74
- # Determine domain schemas using schema classification
91
+ # Determine schemas using schema classification
75
92
  ml_schema = ML_SCHEMA
76
- domain_schemas = get_domain_schemas(temp_model.schemas.keys(), ml_schema)
77
-
78
- # Initialize BagDatabase (creates SQLite DB)
79
- BagDatabase.__init__(
80
- self,
81
- bag_path=bag_path,
82
- database_dir=dbase_path,
83
- schemas=[*domain_schemas, ml_schema],
93
+ domain_schemas = get_domain_schemas(model.schemas.keys(), ml_schema)
94
+ schemas = [*domain_schemas, ml_schema]
95
+
96
+ # Extract bag checksum for unique database path
97
+ bag_cache_dir = bag_path.parent.name
98
+ self.database_dir = dbase_path / bag_cache_dir
99
+ self.database_dir.mkdir(parents=True, exist_ok=True)
100
+
101
+ # Phase 1: Build ORM structure
102
+ builder = SchemaBuilder(
103
+ model=model,
104
+ schemas=schemas,
105
+ database_path=self.database_dir,
84
106
  )
107
+ self._orm: SchemaORM = builder.build()
108
+
109
+ # Phase 2: Load data from bag CSVs
110
+ source = BagDataSource(bag_path, model=model)
111
+ loader = DataLoader(self._orm, source)
112
+ load_counts = loader.load_tables()
113
+
114
+ total_rows = sum(load_counts.values())
115
+ self._logger.debug(f"Loaded {total_rows} rows from bag")
85
116
 
86
117
  # Initialize DerivaModel (provides schema analysis methods)
87
- # Note: We pass self.model which was set by BagDatabase
88
118
  DerivaModel.__init__(
89
119
  self,
90
- model=self.model,
120
+ model=model,
91
121
  ml_schema=ml_schema,
92
122
  domain_schemas=domain_schemas,
93
123
  )
94
124
 
95
- self.dataset_table = self.model.schemas[self.ml_schema].tables["Dataset"]
125
+ self.dataset_table = model.schemas[ml_schema].tables["Dataset"]
96
126
 
97
127
  # Build dataset RID -> version mapping from Dataset_Version table
98
128
  self._build_bag_rids()
@@ -103,6 +133,34 @@ class DatabaseModel(BagDatabase, DerivaModel):
103
133
  self.database_dir,
104
134
  )
105
135
 
136
+ # =========================================================================
137
+ # Property delegates to SchemaORM
138
+ # =========================================================================
139
+
140
+ @property
141
+ def engine(self):
142
+ """SQLAlchemy engine for database access."""
143
+ return self._orm.engine
144
+
145
+ @property
146
+ def metadata(self):
147
+ """SQLAlchemy MetaData with table definitions."""
148
+ return self._orm.metadata
149
+
150
+ @property
151
+ def Base(self):
152
+ """SQLAlchemy automap base for ORM classes."""
153
+ return self._orm.Base
154
+
155
+ @property
156
+ def schemas(self) -> list[str]:
157
+ """List of schema names in the database."""
158
+ return self._orm.schemas
159
+
160
+ # =========================================================================
161
+ # Dataset version tracking
162
+ # =========================================================================
163
+
106
164
  def _build_bag_rids(self) -> None:
107
165
  """Build mapping of dataset RIDs to their versions in this bag."""
108
166
  self.bag_rids: dict[RID, DatasetVersion] = {}
@@ -121,7 +179,7 @@ class DatabaseModel(BagDatabase, DerivaModel):
121
179
  if rid not in self.bag_rids or version > self.bag_rids[rid]:
122
180
  self.bag_rids[rid] = version
123
181
 
124
- def dataset_version(self, dataset_rid: Optional[RID] = None) -> DatasetVersion:
182
+ def dataset_version(self, dataset_rid: RID | None = None) -> DatasetVersion:
125
183
  """Get the version of a dataset in this bag.
126
184
 
127
185
  Args:
@@ -154,6 +212,107 @@ class DatabaseModel(BagDatabase, DerivaModel):
154
212
  return self.bag_rids[dataset_rid]
155
213
  raise DerivaMLException(f"Dataset {dataset_rid} not found in this bag")
156
214
 
215
+ # =========================================================================
216
+ # Table/ORM access methods - delegate to SchemaORM
217
+ # =========================================================================
218
+
219
+ def list_tables(self) -> list[str]:
220
+ """List all tables in the database.
221
+
222
+ Returns:
223
+ List of fully-qualified table names (schema.table), sorted.
224
+ """
225
+ return self._orm.list_tables()
226
+
227
+ def find_table(self, table_name: str) -> SQLTable:
228
+ """Find a table by name.
229
+
230
+ Args:
231
+ table_name: Table name, with or without schema prefix.
232
+
233
+ Returns:
234
+ SQLAlchemy Table object.
235
+
236
+ Raises:
237
+ KeyError: If table not found.
238
+ """
239
+ return self._orm.find_table(table_name)
240
+
241
+ def get_table_contents(self, table: str) -> Generator[dict[str, Any], None, None]:
242
+ """Retrieve all rows from a table as dictionaries.
243
+
244
+ Args:
245
+ table: Table name (with or without schema prefix).
246
+
247
+ Yields:
248
+ Dictionary for each row with column names as keys.
249
+ """
250
+ yield from self._orm.get_table_contents(table)
251
+
252
+ def get_orm_class_by_name(self, table_name: str) -> Any | None:
253
+ """Get the ORM class for a table by name.
254
+
255
+ Args:
256
+ table_name: Table name, with or without schema prefix.
257
+
258
+ Returns:
259
+ SQLAlchemy ORM class for the table.
260
+
261
+ Raises:
262
+ KeyError: If table not found.
263
+ """
264
+ return self._orm.get_orm_class(table_name)
265
+
266
+ def get_orm_class_for_table(self, table: SQLTable | DerivaTable | str) -> Any | None:
267
+ """Get the ORM class for a table.
268
+
269
+ Args:
270
+ table: SQLAlchemy Table, Deriva Table, or table name.
271
+
272
+ Returns:
273
+ SQLAlchemy ORM class, or None if not found.
274
+ """
275
+ return self._orm.get_orm_class_for_table(table)
276
+
277
+ @staticmethod
278
+ def is_association_table(
279
+ table_class,
280
+ min_arity: int = 2,
281
+ max_arity: int = 2,
282
+ unqualified: bool = True,
283
+ pure: bool = True,
284
+ no_overlap: bool = True,
285
+ return_fkeys: bool = False,
286
+ ):
287
+ """Check if an ORM class represents an association table.
288
+
289
+ Delegates to SchemaORM.is_association_table.
290
+ """
291
+ return SchemaORM.is_association_table(
292
+ table_class, min_arity, max_arity, unqualified, pure, no_overlap, return_fkeys
293
+ )
294
+
295
+ def get_association_class(
296
+ self,
297
+ left_cls: Type[Any],
298
+ right_cls: Type[Any],
299
+ ) -> tuple[Any, Any, Any] | None:
300
+ """Find an association class connecting two ORM classes.
301
+
302
+ Args:
303
+ left_cls: First ORM class.
304
+ right_cls: Second ORM class.
305
+
306
+ Returns:
307
+ Tuple of (association_class, left_relationship, right_relationship),
308
+ or None if no association found.
309
+ """
310
+ return self._orm.get_association_class(left_cls, right_cls)
311
+
312
+ # =========================================================================
313
+ # Compatibility methods
314
+ # =========================================================================
315
+
157
316
  def _get_table_contents(self, table: str) -> Generator[dict[str, Any], None, None]:
158
317
  """Retrieve table contents as dictionaries.
159
318
 
@@ -195,14 +354,26 @@ class DatabaseModel(BagDatabase, DerivaModel):
195
354
  result = session.execute(cmd).mappings().first()
196
355
  return dict(result) if result else None
197
356
 
198
- # Compatibility aliases for methods that have different names in BagDatabase
199
357
  def get_orm_association_class(self, left_cls, right_cls, **kwargs):
200
358
  """Find association class between two ORM classes.
201
359
 
202
- Wrapper around BagDatabase.get_association_class for compatibility.
360
+ Wrapper around get_association_class for compatibility.
203
361
  """
204
362
  return self.get_association_class(left_cls, right_cls)
205
363
 
364
+ # =========================================================================
365
+ # Resource management
366
+ # =========================================================================
367
+
368
+ def dispose(self) -> None:
369
+ """Dispose of SQLAlchemy resources.
370
+
371
+ Call this when done with the database to properly clean up connections.
372
+ After calling dispose(), the instance should not be used further.
373
+ """
374
+ if hasattr(self, "_orm") and self._orm is not None:
375
+ self._orm.dispose()
376
+
206
377
  def delete_database(self) -> None:
207
378
  """Delete the database files.
208
379
 
@@ -212,3 +383,16 @@ class DatabaseModel(BagDatabase, DerivaModel):
212
383
  self.dispose()
213
384
  # Note: We don't actually delete files here to avoid data loss.
214
385
  # The caller should handle file deletion if needed.
386
+
387
+ def __del__(self) -> None:
388
+ """Cleanup resources when garbage collected."""
389
+ self.dispose()
390
+
391
+ def __enter__(self) -> "DatabaseModel":
392
+ """Context manager entry."""
393
+ return self
394
+
395
+ def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
396
+ """Context manager exit - dispose resources."""
397
+ self.dispose()
398
+ return False