deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. deriva_ml/__init__.py +69 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +31 -0
  7. deriva_ml/catalog/clone.py +1939 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +845 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +126 -110
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +543 -242
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +223 -34
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
  67. deriva_ml-1.17.12.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.10.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,389 @@
1
+ """Centralized validation configuration for DerivaML.
2
+
3
+ This module provides shared Pydantic configuration, custom validators,
4
+ and RID validation utilities used throughout DerivaML.
5
+
6
+ The module provides:
7
+ - VALIDATION_CONFIG: Shared ConfigDict for Pydantic models and @validate_call
8
+ - DERIVA_ML_CONFIG: Alias for VALIDATION_CONFIG (for use in model definitions)
9
+ - Custom Pydantic types for common patterns (RID validation, etc.)
10
+ - validate_rids(): Validate that RIDs exist in the catalog
11
+ - ValidationResult: Result container for validation operations
12
+
13
+ Example (Pydantic config):
14
+ >>> from deriva_ml.core.validation import VALIDATION_CONFIG
15
+ >>> from pydantic import validate_call
16
+ >>>
17
+ >>> @validate_call(config=VALIDATION_CONFIG)
18
+ ... def process_table(table: Table) -> None:
19
+ ... pass
20
+
21
+ Example (RID validation):
22
+ >>> from deriva_ml.core.validation import validate_rids
23
+ >>>
24
+ >>> result = validate_rids(
25
+ ... ml,
26
+ ... dataset_rids=["1-ABC", "2-DEF"],
27
+ ... asset_rids=["3-GHI"],
28
+ ... )
29
+ >>> if not result.is_valid:
30
+ ... for error in result.errors:
31
+ ... print(f"ERROR: {error}")
32
+ """
33
+
34
+ from __future__ import annotations
35
+
36
+ from pydantic import ConfigDict
37
+
38
+ # =============================================================================
39
+ # Shared Pydantic Configuration
40
+ # =============================================================================
41
+
42
# Baseline Pydantic settings used by DerivaML models and ``validate_call``
# decorators.  Permitting arbitrary types lets deriva objects (Table, Column,
# etc.) take part in Pydantic validation without explicit type adapters.
VALIDATION_CONFIG = ConfigDict(
    arbitrary_types_allowed=True,
    validate_default=True,  # validate default values during model creation
    use_enum_values=True,  # use enum values instead of enum members for serialization
)

# Second name for the same object, kept for backwards compatibility and for
# readability inside model definitions.
DERIVA_ML_CONFIG = VALIDATION_CONFIG

# Same settings as the baseline, plus rejection of unexpected fields.
STRICT_VALIDATION_CONFIG = ConfigDict(
    **VALIDATION_CONFIG,
    extra="forbid",  # raise an error if extra fields are provided
)

__all__ = [
    "VALIDATION_CONFIG",
    "DERIVA_ML_CONFIG",
    "STRICT_VALIDATION_CONFIG",
    "ValidationResult",
    "validate_rids",
    "validate_vocabulary_terms",
]
72
+
73
+
74
+ # =============================================================================
75
+ # RID Validation
76
+ # =============================================================================
77
+
78
+ from dataclasses import dataclass, field
79
+ from typing import TYPE_CHECKING, Any
80
+
81
+ if TYPE_CHECKING:
82
+ from deriva_ml.core.base import DerivaML
83
+
84
+
85
@dataclass
class ValidationResult:
    """Result of a validation run: a pass/fail flag plus collected messages.

    Both ``str()`` and ``repr()`` return a multi-line, human-readable summary
    (status line, then any errors, then any warnings), which makes results easy
    to inspect in interactive sessions.

    Attributes:
        is_valid: True if all validations passed, False otherwise. Forced to
            False when the instance is created with a non-empty ``errors`` list.
        errors: List of error messages for failed validations.
        warnings: List of warning messages for potential issues.
        validated_rids: Dictionary mapping RID to its resolved table info.

    Example:
        >>> result = validate_rids(ml, dataset_rids=["1-ABC"])
        >>> print(result)
        ✓ Validation passed
         Validated 1 RID(s)

        >>> result = validate_rids(ml, dataset_rids=["INVALID"])
        >>> print(result)
        ✗ Validation failed with 1 error(s)
        <BLANKLINE>
        Errors:
         • Dataset RID 'INVALID' does not exist in catalog
    """

    is_valid: bool = True
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)
    validated_rids: dict[str, dict[str, Any]] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Keep ``is_valid`` consistent with errors supplied at construction."""
        # A result that already carries errors can never count as valid, even
        # if the caller left ``is_valid`` at its default of True.
        if self.errors:
            self.is_valid = False

    def add_error(self, message: str) -> None:
        """Add an error message and mark result as invalid."""
        self.errors.append(message)
        self.is_valid = False

    def add_warning(self, message: str) -> None:
        """Add a warning message (does not affect ``is_valid``)."""
        self.warnings.append(message)

    def merge(self, other: "ValidationResult") -> "ValidationResult":
        """Merge another validation result into this one.

        Errors and warnings of ``other`` are appended, its validated RIDs
        overwrite entries with the same key, and this result becomes invalid
        if ``other`` is invalid.

        Args:
            other: The result to fold into this one; it is not modified.

        Returns:
            This instance, to allow chaining.
        """
        if not other.is_valid:
            self.is_valid = False
        self.errors.extend(other.errors)
        self.warnings.extend(other.warnings)
        self.validated_rids.update(other.validated_rids)
        return self

    def __repr__(self) -> str:
        """Return a formatted, multi-line summary of the validation result."""
        if self.is_valid:
            lines = ["✓ Validation passed"]
            if self.validated_rids:
                lines.append(f" Validated {len(self.validated_rids)} RID(s)")
        else:
            lines = [f"✗ Validation failed with {len(self.errors)} error(s)"]

        if self.errors:
            lines += ["", "Errors:"]
            lines.extend(f" • {error}" for error in self.errors)

        if self.warnings:
            lines += ["", "Warnings:"]
            lines.extend(f" ⚠ {warning}" for warning in self.warnings)

        return "\n".join(lines)

    def __str__(self) -> str:
        """Return the same formatted summary as ``repr()`` for print()."""
        return self.__repr__()
164
+
165
+
166
def validate_rids(
    ml: "DerivaML",
    dataset_rids: list[str] | None = None,
    asset_rids: list[str] | None = None,
    dataset_versions: dict[str, str] | None = None,
    workflow_rids: list[str] | None = None,
    execution_rids: list[str] | None = None,
    warn_missing_descriptions: bool = True,
) -> ValidationResult:
    """Validate that RIDs exist in the catalog and sit in the expected tables.

    All RIDs are resolved with one batch call to ``ml.resolve_rids``. If that
    call reports invalid RIDs, each RID is resolved on its own so that only
    the RIDs that really fail are reported and the remaining ones still go
    through the version, table and description checks.

    Args:
        ml: Connected DerivaML instance.
        dataset_rids: List of dataset RIDs to validate. Must resolve to the
            ``Dataset`` table.
        asset_rids: List of asset RIDs to validate (existence only).
        dataset_versions: Dictionary mapping dataset RID to required version
            string. Only consulted when ``dataset_rids`` is non-empty; entries
            whose RID did not resolve are skipped.
        workflow_rids: List of workflow RIDs to validate. Must resolve to the
            ``Workflow`` table.
        execution_rids: List of execution RIDs to validate. Must resolve to
            the ``Execution`` table.
        warn_missing_descriptions: If True (default), warn when a dataset has
            an empty description.

    Returns:
        ValidationResult with is_valid flag, error/warning messages, and
        resolved RID information (``rid``, ``table``, ``schema`` and, for
        version-checked datasets, ``version`` / ``current_version``).

    Example:
        >>> result = validate_rids(
        ...     ml,
        ...     dataset_rids=["1-ABC", "2-DEF"],
        ...     dataset_versions={"1-ABC": "0.4.0"},
        ...     asset_rids=["3-GHI"],
        ... )
        >>> if not result.is_valid:
        ...     for error in result.errors:
        ...         print(f"ERROR: {error}")
    """
    from deriva_ml.core.exceptions import DerivaMLException

    result = ValidationResult()

    # (category label, requested RIDs, table the RIDs must belong to or None).
    # dict.fromkeys drops duplicates while keeping the caller's order, so each
    # problem is reported once.
    datasets_requested = list(dict.fromkeys(dataset_rids or []))
    groups = (
        ("dataset", datasets_requested, "Dataset"),
        ("asset", list(dict.fromkeys(asset_rids or [])), None),
        ("workflow", list(dict.fromkeys(workflow_rids or [])), "Workflow"),
        ("execution", list(dict.fromkeys(execution_rids or [])), "Execution"),
    )

    # Maps RID to category for error messages; a RID listed under several
    # categories keeps the last one, in the order of ``groups``.
    rid_categories: dict[str, str] = {}
    for category, rids, _ in groups:
        for rid in rids:
            rid_categories[rid] = category
    all_rids: set[str] = set(rid_categories)

    if not all_rids:
        return result  # Nothing to validate

    def record(resolved) -> None:
        """Store table/schema info for every RID in a resolve_rids() result."""
        for resolved_rid, info in resolved.items():
            result.validated_rids[resolved_rid] = {
                "rid": resolved_rid,
                "table": info.table_name,
                "schema": info.schema_name,
            }

    # Batch resolve all RIDs
    try:
        record(ml.resolve_rids(all_rids))
    except DerivaMLException as e:
        if "Invalid RIDs:" in str(e):
            # The batch call fails as a whole and records nothing, so resolve
            # one RID at a time to find out which ones are actually invalid.
            # Sorted so the error order does not depend on set iteration.
            for rid in sorted(all_rids):
                try:
                    record(ml.resolve_rids({rid}))
                except DerivaMLException as single_error:
                    if "Invalid RIDs:" in str(single_error):
                        category = rid_categories.get(rid, "unknown")
                        result.add_error(f"{category.title()} RID '{rid}' does not exist in catalog")
                    else:
                        result.add_error(f"RID validation failed: {single_error}")
        else:
            result.add_error(f"RID validation failed: {e}")

    # Dataset objects fetched during the version check, reused for the
    # description check to avoid a second catalog lookup.
    datasets: dict[str, Any] = {}
    # Datasets whose version check already produced an error.
    version_failures: set[str] = set()

    # Validate dataset versions if specified
    if dataset_versions and dataset_rids:
        for rid, required_version in dataset_versions.items():
            if rid not in result.validated_rids:
                continue  # Did not resolve (or was never requested)

            try:
                dataset = datasets[rid] = ml.lookup_dataset(rid)
                current_version = str(dataset.current_version) if dataset.current_version else None

                if current_version is None:
                    result.add_warning(
                        f"Dataset '{rid}' has no version information. "
                        f"Required version: {required_version}"
                    )
                elif current_version == required_version:
                    result.validated_rids[rid]["version"] = required_version
                else:
                    # Check if the required version exists in history
                    try:
                        available = [str(h.dataset_version) for h in dataset.list_versions()]
                    except Exception:
                        # Can't check history, just warn
                        result.add_warning(
                            f"Dataset '{rid}' current version ({current_version}) differs from "
                            f"required version ({required_version}). Could not verify version history."
                        )
                    else:
                        if required_version in available:
                            # Version exists but is not current - this is OK
                            result.validated_rids[rid]["version"] = required_version
                            result.validated_rids[rid]["current_version"] = current_version
                        else:
                            result.add_error(
                                f"Dataset '{rid}' does not have version '{required_version}'. "
                                f"Current version: {current_version}. "
                                f"Available versions: {available}"
                            )
            except Exception as e:
                version_failures.add(rid)
                result.add_error(f"Failed to validate dataset '{rid}' version: {e}")

    # Validate that dataset / workflow / execution RIDs are in their tables.
    for category, rids, expected_table in groups:
        if expected_table is None:
            continue  # Assets may live in any table
        for rid in rids:
            info = result.validated_rids.get(rid)
            if info is not None and info.get("table") != expected_table:
                result.add_error(
                    f"RID '{rid}' specified as {category} but found in table "
                    f"'{info.get('schema')}.{info.get('table')}'"
                )

    # Check for missing descriptions
    if warn_missing_descriptions:
        for rid in datasets_requested:
            info = result.validated_rids.get(rid)
            if info is None or info.get("table") != "Dataset":
                continue
            try:
                dataset = datasets[rid] if rid in datasets else ml.lookup_dataset(rid)
                description = dataset.description
                missing = not description or description.strip() == ""
            except Exception as e:
                # Best effort: stay quiet only if this dataset's failure was
                # already reported by the version check above.
                if rid not in version_failures:
                    result.add_warning(f"Could not check description of dataset '{rid}': {e}")
                continue
            if missing:
                result.add_warning(f"Dataset '{rid}' has no description")

    return result
346
+
347
+
348
def validate_vocabulary_terms(
    ml: "DerivaML",
    vocabulary_name: str,
    terms: list[str],
) -> ValidationResult:
    """Check that every requested term is present in a vocabulary.

    The vocabulary is fetched once through ``ml.list_terms``; each requested
    term is then matched against the ``name`` of the returned entries. Terms
    that are found are recorded under the key ``"<vocabulary>:<term>"``;
    terms that are not found produce an error listing the available names.
    Any exception raised along the way is turned into a single error entry
    instead of propagating.

    Args:
        ml: Connected DerivaML instance.
        vocabulary_name: Name of the vocabulary table.
        terms: List of term names to validate.

    Returns:
        ValidationResult with validation status and details.

    Example:
        >>> result = validate_vocabulary_terms(ml, "Dataset_Type", ["Training", "Testing"])
        >>> if not result.is_valid:
        ...     for error in result.errors:
        ...         print(f"  - {error}")
    """
    outcome = ValidationResult()

    try:
        vocabulary_entries = ml.list_terms(vocabulary_name)
        known_names = {entry.name for entry in vocabulary_entries}

        for candidate in terms:
            if candidate in known_names:
                outcome.validated_rids[f"{vocabulary_name}:{candidate}"] = {
                    "vocabulary": vocabulary_name,
                    "term": candidate,
                }
                continue

            outcome.add_error(
                f"Term '{candidate}' not found in vocabulary '{vocabulary_name}'. "
                f"Available terms: {sorted(known_names)}"
            )
    except Exception as exc:
        outcome.add_error(f"Failed to validate vocabulary '{vocabulary_name}': {exc}")

    return outcome
@@ -1,6 +1,6 @@
1
1
  from .aux_classes import DatasetSpec, DatasetSpecConfig, DatasetVersion, VersionPart
2
2
  from .dataset import Dataset
3
- from .dataset_bag import DatasetBag
3
+ from .dataset_bag import DatasetBag, FeatureValueRecord
4
4
 
5
5
  __all__ = [
6
6
  "Dataset",
@@ -8,5 +8,6 @@ __all__ = [
8
8
  "DatasetSpecConfig",
9
9
  "DatasetBag",
10
10
  "DatasetVersion",
11
+ "FeatureValueRecord",
11
12
  "VersionPart",
12
13
  ]
@@ -3,6 +3,7 @@ THis module defines the DataSet class with is used to manipulate n
3
3
  """
4
4
 
5
5
  from enum import Enum
6
+ from pprint import pformat
6
7
  from typing import Any, Optional, SupportsInt
7
8
 
8
9
  from hydra_zen import hydrated_dataclass
@@ -20,6 +21,16 @@ from semver import Version
20
21
 
21
22
  from deriva_ml.core.definitions import RID
22
23
 
24
+ try:
25
+ from icecream import ic
26
+
27
+ ic.configureOutput(
28
+ includeContext=True,
29
+ argToStringFunction=lambda x: pformat(x.model_dump() if hasattr(x, "model_dump") else x, width=80, depth=10),
30
+ )
31
+ except ImportError: # Graceful fallback if IceCream isn't installed.
32
+ ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
33
+
23
34
 
24
35
  class VersionPart(Enum):
25
36
  """Simple enumeration for semantic versioning.
@@ -43,7 +54,7 @@ class DatasetVersion(Version):
43
54
  replace(major, minor, patch): Replace the major and minor versions
44
55
  """
45
56
 
46
- def __init__(self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0):
57
+ def __init__(self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0) -> None:
47
58
  """Initialize a DatasetVersion object.
48
59
 
49
60
  Args:
@@ -72,7 +83,7 @@ class DatasetVersion(Version):
72
83
  return self.major, self.minor, self.patch
73
84
 
74
85
  @classmethod
75
- def parse(cls, version: str, optional_minor_an_path=False) -> "DatasetVersion":
86
+ def parse(cls, version: str, optional_minor_an_path: bool = False) -> "DatasetVersion":
76
87
  v = Version.parse(version)
77
88
  return DatasetVersion(v.major, v.minor, v.patch)
78
89
 
@@ -111,8 +122,13 @@ class DatasetHistory(BaseModel):
111
122
 
112
123
  model_config = ConfigDict(arbitrary_types_allowed=True)
113
124
 
125
+ @field_validator("execution_rid", mode="before")
126
+ @classmethod
127
+ def _default_execution_rid(cls, v: str | None) -> str | None:
128
+ return None if v == "" else v
129
+
114
130
  @field_validator("description", mode="after")
115
- def _default_description(cls, v) -> str:
131
+ def _default_description(cls, v: str | None) -> str:
116
132
  return v or ""
117
133
 
118
134
 
@@ -153,7 +169,7 @@ class DatasetMinid(BaseModel):
153
169
 
154
170
  @model_validator(mode="before")
155
171
  @classmethod
156
- def insert_metadata(cls, data: Any) -> Any:
172
+ def insert_metadata(cls, data: dict) -> dict:
157
173
  if isinstance(data, dict):
158
174
  if "metadata" in data:
159
175
  data = data | data["metadata"]