deriva-ml 1.17.9__py3-none-any.whl → 1.17.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. deriva_ml/__init__.py +43 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +21 -0
  7. deriva_ml/catalog/clone.py +1199 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +817 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +186 -105
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +545 -244
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +224 -35
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -5
  67. deriva_ml-1.17.11.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +2 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.9.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,400 @@
1
+ """Vocabulary management mixin for DerivaML.
2
+
3
+ This module provides the VocabularyMixin class which handles vocabulary
4
+ term operations including adding, looking up, and listing terms in
5
+ controlled vocabulary tables.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import TYPE_CHECKING, Any, Callable
11
+
12
+ # Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
13
+ import importlib
14
+ _datapath = importlib.import_module("deriva.core.datapath")
15
+ _ermrest_model = importlib.import_module("deriva.core.ermrest_model")
16
+ DataPathException = _datapath.DataPathException
17
+ Table = _ermrest_model.Table
18
+
19
+ from pydantic import ConfigDict, validate_call
20
+
21
+ from deriva_ml.core.definitions import MLVocab, VocabularyTerm, VocabularyTermHandle
22
+ from deriva_ml.core.exceptions import (
23
+ DerivaMLException,
24
+ DerivaMLInvalidTerm,
25
+ DerivaMLTableTypeError,
26
+ )
27
+
28
+ if TYPE_CHECKING:
29
+ from deriva_ml.model.catalog import DerivaModel
30
+
31
+
32
+ # Type alias for the vocabulary cache structure
33
+ # Maps (schema_name, table_name) -> {term_name -> VocabularyTermHandle, synonym -> VocabularyTermHandle}
34
+ VocabCache = dict[tuple[str, str], dict[str, VocabularyTermHandle]]
35
+
36
+
37
+ class VocabularyMixin:
38
+ """Mixin providing vocabulary/term management operations.
39
+
40
+ This mixin requires the host class to have:
41
+ - model: DerivaModel instance
42
+ - pathBuilder(): method returning catalog path builder
43
+
44
+ Methods:
45
+ add_term: Add a new term to a vocabulary table
46
+ lookup_term: Find a term by name or synonym
47
+ list_vocabulary_terms: List all terms in a vocabulary table
48
+ clear_vocabulary_cache: Clear the vocabulary term cache
49
+ """
50
+
51
+ # Type hints for IDE support - actual attributes/methods from host class
52
+ model: "DerivaModel"
53
+ pathBuilder: Callable[[], Any]
54
+
55
+ # Vocabulary term cache: maps (schema, table) -> {name_or_synonym -> VocabularyTermHandle}
56
+ _vocab_cache: VocabCache
57
+
58
def _get_vocab_cache(self) -> VocabCache:
    """Return this instance's vocabulary cache, creating it on first access.

    The cache is stored on the instance as ``_vocab_cache``. When that
    attribute is missing, an empty dictionary is assigned to it and returned.

    Returns:
        The cache dictionary, keyed by ``(schema_name, table_name)``.
    """
    try:
        return self._vocab_cache
    except AttributeError:
        # First access on this instance: start with an empty cache.
        self._vocab_cache = {}
        return self._vocab_cache
63
+
64
def clear_vocabulary_cache(self, table: str | Table | None = None) -> None:
    """Drop cached vocabulary terms.

    Args:
        table: Vocabulary table (name or Table object) whose cached terms
            should be dropped. When None, the entire cache is emptied.
    """
    vocab_cache = self._get_vocab_cache()
    if table is None:
        vocab_cache.clear()
        return

    # Cache entries are keyed by (schema name, table name); a table that was
    # never cached is silently ignored.
    resolved = self.model.name_to_table(table)
    vocab_cache.pop((resolved.schema.name, resolved.name), None)
78
+
79
def _populate_vocab_cache(self, schema_name: str, table_name: str) -> dict[str, VocabularyTermHandle]:
    """Fetch every term of a vocabulary table and (re)build its cache entry.

    All rows of the table are fetched and wrapped in VocabularyTermHandle
    objects. Each handle is indexed under its primary name and under each of
    its synonyms. The resulting dictionary replaces any previous cache entry
    for ``(schema_name, table_name)``.

    Args:
        schema_name: Schema containing the vocabulary table.
        table_name: Vocabulary table name.

    Returns:
        Dictionary mapping term names and synonyms to VocabularyTermHandle objects.
    """
    cache = self._get_vocab_cache()

    # Fetch all terms from the server
    vocab_path = self.pathBuilder().schemas[schema_name].tables[table_name]
    terms = [
        VocabularyTermHandle(ml=self, table=table_name, **row)
        for row in vocab_path.entities().fetch()
    ]

    term_lookup: dict[str, VocabularyTermHandle] = {}
    for term in terms:
        for synonym in term.synonyms or ():
            term_lookup[synonym] = term
    # Primary names are written last so that a synonym of one term can never
    # shadow the primary name of another term in the lookup.
    for term in terms:
        term_lookup[term.name] = term

    cache[(schema_name, table_name)] = term_lookup
    return term_lookup
103
+
104
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def add_term(
    self,
    table: str | Table,
    term_name: str,
    description: str,
    synonyms: list[str] | None = None,
    exists_ok: bool = True,
) -> VocabularyTermHandle:
    """Adds a term to a vocabulary table.

    Creates a new standardized term with description and optional synonyms in a vocabulary table.
    Can either create a new term or return an existing one if it already exists.

    When the insert is rejected and ``exists_ok`` is True, the existing term is looked up and
    returned as-is; the ``description`` and ``synonyms`` passed here are not applied to it.

    Args:
        table: Vocabulary table to add term to (name or Table object).
        term_name: Primary name of the term (must be unique within vocabulary).
        description: Explanation of term's meaning and usage.
        synonyms: Alternative names for the term.
        exists_ok: If True, return the existing term if found. If False, raise error.

    Returns:
        VocabularyTermHandle: Object representing the created or existing term, with
        methods to modify it in the catalog.

    Raises:
        DerivaMLTableTypeError: If the table is not a vocabulary table.
        DerivaMLInvalidTerm: If the insert is rejected and exists_ok=False, or if the insert is
            rejected, exists_ok=True and the term cannot be found afterwards.

    Examples:
        Add a new tissue type:
            >>> term = ml.add_term(
            ...     table="tissue_types",
            ...     term_name="epithelial",
            ...     description="Epithelial tissue type",
            ...     synonyms=["epithelium"]
            ... )
            >>> # Modify the term
            >>> term.description = "Updated description"
            >>> term.synonyms = ("epithelium", "epithelial_tissue")

        Attempt to add an existing term:
            >>> term = ml.add_term("tissue_types", "epithelial", "...", exists_ok=True)
    """
    # Initialize an empty synonyms list if None
    synonyms = synonyms or []

    # Get table reference and validate if it is a vocabulary table
    vocab_table = self.model.name_to_table(table)
    pb = self.pathBuilder()
    if not self.model.is_vocabulary(vocab_table):
        raise DerivaMLTableTypeError("vocabulary", vocab_table.name)

    # Get schema and table names for path building
    schema_name = vocab_table.schema.name
    table_name = vocab_table.name

    # Only the insert is guarded: it is the one call whose failure is
    # interpreted as "the term already exists".
    try:
        term_data = pb.schemas[schema_name].tables[table_name].insert(
            [
                {
                    "Name": term_name,
                    "Description": description,
                    "Synonyms": synonyms,
                }
            ],
            defaults={"ID", "URI"},
        )[0]
    except DataPathException as err:
        # NOTE(review): every DataPathException is treated as a duplicate-term
        # conflict; the original error is chained so other causes stay visible.
        if not exists_ok:
            raise DerivaMLInvalidTerm(vocab_table.name, term_name, msg="term already exists") from err
        return self.lookup_term(vocab_table, term_name)

    term_handle = VocabularyTermHandle(ml=self, table=table_name, **term_data)
    # Invalidate cache for this vocabulary since we added a new term
    self.clear_vocabulary_cache(vocab_table)
    return term_handle
181
+
182
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def lookup_term(self, table: str | Table, term_name: str) -> VocabularyTermHandle:
    """Finds a term in a vocabulary table.

    Searches for a term in the specified vocabulary table, matching either the primary name
    or any of its synonyms. Results are cached for performance - subsequent lookups in the
    same vocabulary table are served from cache.

    On a cache miss (vocabulary not cached yet, or the name is absent from the cached
    entry, e.g. because the term was added after the cache was built) the vocabulary is
    re-fetched once from the server and the lookup is retried against the fresh data.

    Args:
        table: Vocabulary table to search in (name or Table object).
        term_name: Name or synonym of the term to find.

    Returns:
        VocabularyTermHandle: The matching vocabulary term, with methods to modify it.

    Raises:
        DerivaMLException: If the table is not a vocabulary table.
        DerivaMLInvalidTerm: If no term in the table has this name or synonym.

    Examples:
        Look up by primary name:
            >>> term = ml.lookup_term("tissue_types", "epithelial")
            >>> print(term.description)

        Look up by synonym:
            >>> term = ml.lookup_term("tissue_types", "epithelium")

        Modify the term:
            >>> term = ml.lookup_term("tissue_types", "epithelial")
            >>> term.description = "Updated description"
            >>> term.synonyms = ("epithelium", "epithelial_tissue")
    """
    # Get and validate vocabulary table reference
    vocab_table = self.model.name_to_table(table)
    if not self.model.is_vocabulary(vocab_table):
        raise DerivaMLException(f"The table {table} is not a controlled vocabulary")

    # Get schema and table names
    schema_name, table_name = vocab_table.schema.name, vocab_table.name

    # Fast path: the vocabulary is cached and knows this name or synonym.
    cached_terms = self._get_vocab_cache().get((schema_name, table_name))
    if cached_terms is not None and term_name in cached_terms:
        return cached_terms[term_name]

    # Miss: rebuild the cache entry from the server. A separate by-name query
    # beforehand would be redundant, because the full fetch already covers
    # both primary names and synonyms.
    term_lookup = self._populate_vocab_cache(schema_name, table_name)
    if term_name in term_lookup:
        return term_lookup[term_name]

    # Term not found
    raise DerivaMLInvalidTerm(table_name, term_name)
256
+
257
def _server_lookup_term(
    self, schema_name: str, table_name: str, term_name: str
) -> VocabularyTermHandle | None:
    """Query the server for a single term whose Name equals ``term_name``.

    The filter is applied to the ``Name`` column only; synonyms are not
    consulted by this query.

    Args:
        schema_name: Schema containing the vocabulary table.
        table_name: Vocabulary table name.
        term_name: Primary name of the term to find.

    Returns:
        A VocabularyTermHandle built from the first matching row, or None
        when the query returns no rows.
    """
    vocab_path = self.pathBuilder().schemas[schema_name].tables[table_name]

    # Restrict the fetch to rows whose Name column matches exactly.
    matches = list(vocab_path.filter(vocab_path.Name == term_name).entities().fetch())
    if not matches:
        return None
    return VocabularyTermHandle(ml=self, table=table_name, **matches[0])
281
+
282
def list_vocabulary_terms(self, table: str | Table) -> list[VocabularyTerm]:
    """Return every term stored in a vocabulary table.

    All rows of the table are fetched from the catalog and converted to
    VocabularyTerm objects.

    Args:
        table: Vocabulary table to list terms from (name or Table object).
            An MLVocab member is also accepted; its ``value`` is used as the table name.

    Returns:
        list[VocabularyTerm]: One VocabularyTerm per row of the table.

    Raises:
        DerivaMLException: If the table is not a vocabulary table.

    Examples:
        >>> terms = ml.list_vocabulary_terms("tissue_types")
        >>> for term in terms:
        ...     print(f"{term.name}: {term.description}")
        ...     if term.synonyms:
        ...         print(f"  Synonyms: {', '.join(term.synonyms)}")
    """
    pb = self.pathBuilder()

    # Unwrap MLVocab members to the table name they carry before resolving.
    table_ref = table.value if isinstance(table, MLVocab) else table
    table = self.model.name_to_table(table_ref)

    if not self.model.is_vocabulary(table):
        raise DerivaMLException(f"The table {table} is not a controlled vocabulary")

    rows = pb.schemas[table.schema.name].tables[table.name].entities().fetch()
    return [VocabularyTerm(**row) for row in rows]
313
+
314
def _update_term_synonyms(self, table: str | Table, term_name: str, synonyms: list[str]) -> None:
    """Internal: Replace the synonyms of a vocabulary term in the catalog.

    Called by VocabularyTermHandle.synonyms setter.

    Args:
        table: Vocabulary table containing the term.
        term_name: Name of the term to update (resolved through lookup_term).
        synonyms: New list of synonyms (replaces all existing).
    """
    # Resolving the term first yields the RID that identifies the row to update.
    existing = self.lookup_term(table, term_name)

    resolved = self.model.name_to_table(table)
    vocab_path = self.pathBuilder().schemas[resolved.schema.name].tables[resolved.name]
    row_patch = {"RID": existing.rid, "Synonyms": synonyms}
    vocab_path.update([row_patch])

    # Cached handles for this table no longer reflect the catalog.
    self.clear_vocabulary_cache(table)
335
+
336
def _update_term_description(self, table: str | Table, term_name: str, description: str) -> None:
    """Internal: Replace the description of a vocabulary term in the catalog.

    Called by VocabularyTermHandle.description setter.

    Args:
        table: Vocabulary table containing the term.
        term_name: Name of the term to update (resolved through lookup_term).
        description: New description for the term.
    """
    # Resolving the term first yields the RID that identifies the row to update.
    existing = self.lookup_term(table, term_name)

    resolved = self.model.name_to_table(table)
    vocab_path = self.pathBuilder().schemas[resolved.schema.name].tables[resolved.name]
    row_patch = {"RID": existing.rid, "Description": description}
    vocab_path.update([row_patch])

    # Cached handles for this table no longer reflect the catalog.
    self.clear_vocabulary_cache(table)
357
+
358
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def delete_term(self, table: str | Table, term_name: str) -> None:
    """Delete a term from a vocabulary table.

    The term is resolved with lookup_term, then every association table
    reported by the vocabulary table's ``find_associations()`` is queried for
    rows whose column named after the vocabulary table equals the term's
    primary name. If any such row exists the deletion is refused; otherwise
    the term's row is deleted by RID and the cache for the table is cleared.

    NOTE(review): only association tables are inspected here; whether other
    kinds of references to the term exist is not checked by this method.

    Args:
        table: Vocabulary table containing the term (name or Table object).
        term_name: Name of the term to delete.

    Raises:
        DerivaMLInvalidTerm: If the term doesn't exist in the vocabulary.
        DerivaMLException: If the term is currently in use by other records.

    Example:
        >>> ml.delete_term("Dataset_Type", "Obsolete_Type")
    """
    # lookup_term validates both the table and the term before anything is touched.
    term = self.lookup_term(table, term_name)
    vocab_table = self.model.name_to_table(table)

    linked_tables = list(vocab_table.find_associations())
    pb = self.pathBuilder()

    for assoc in linked_tables:
        assoc_path = pb.schemas[assoc.schema.name].tables[assoc.name]
        # The referencing column is looked up under the vocabulary table's name.
        referencing_column = getattr(assoc_path, vocab_table.name)
        referencing_rows = list(assoc_path.filter(referencing_column == term.name).entities().fetch())
        count = len(referencing_rows)
        if count > 0:
            raise DerivaMLException(
                f"Cannot delete term '{term_name}' from {vocab_table.name}: "
                f"it is referenced by {count} record(s) in {assoc.name}"
            )

    # Nothing references the term, so its row can be removed by RID.
    vocab_path = pb.schemas[vocab_table.schema.name].tables[vocab_table.name]
    vocab_path.filter(vocab_path.RID == term.rid).delete()

    # Cached handles for this table no longer reflect the catalog.
    self.clear_vocabulary_cache(table)