deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +69 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +31 -0
- deriva_ml/catalog/clone.py +1939 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +845 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +126 -110
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +543 -242
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +223 -34
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
- deriva_ml-1.17.12.dist-info/RECORD +77 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.10.dist-info/RECORD +0 -45
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,601 @@
|
|
|
1
|
+
"""Schema validation for DerivaML catalogs.
|
|
2
|
+
|
|
3
|
+
This module provides functionality to validate that a catalog's ML schema matches
|
|
4
|
+
the expected structure created by create_schema.py. It can check for:
|
|
5
|
+
- Required tables and their columns
|
|
6
|
+
- Required vocabulary tables and their initial terms
|
|
7
|
+
- Foreign key relationships
|
|
8
|
+
- Extra tables/columns (in strict mode)
|
|
9
|
+
|
|
10
|
+
Usage:
|
|
11
|
+
from deriva_ml import DerivaML
|
|
12
|
+
|
|
13
|
+
ml = DerivaML('localhost', 'my_catalog')
|
|
14
|
+
report = ml.validate_schema(strict=False)
|
|
15
|
+
if not report.is_valid:
|
|
16
|
+
print(report.to_text())
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import json
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from enum import Enum
|
|
24
|
+
from typing import TYPE_CHECKING, Any
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from deriva_ml.core.base import DerivaML
|
|
28
|
+
|
|
29
|
+
from deriva_ml.core.definitions import ML_SCHEMA, MLTable, MLVocab
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ValidationSeverity(str, Enum):
    """Severity levels assigned to schema validation issues.

    Mixes in ``str`` so members compare equal to their plain string values
    and serialize directly to JSON without conversion.
    """

    # Schema is invalid and will cause runtime failures.
    ERROR = "error"
    # Schema may still work, but something is off.
    WARNING = "warning"
    # Purely informational (e.g., extra items found in non-strict mode).
    INFO = "info"
|
|
39
|
+
|
|
40
|
+
@dataclass
class ValidationIssue:
    """A single problem discovered while inspecting the catalog schema."""

    severity: ValidationSeverity  # how serious the issue is
    category: str  # e.g., "table", "column", "vocabulary", "foreign_key"
    message: str  # human-readable description
    table: str | None = None  # affected table, if any
    column: str | None = None  # affected column, if any
    expected: Any = None  # what the schema should contain
    actual: Any = None  # what was actually found

    def __str__(self) -> str:
        # The "[Table.Column] " location prefix is rendered only when a
        # table is known; a column with no table is deliberately omitted.
        if self.table:
            col_part = f".{self.column}" if self.column else ""
            prefix = f"[{self.table}{col_part}] "
        else:
            prefix = ""
        return f"{self.severity.value.upper()}: {prefix}{self.message}"
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class SchemaValidationReport:
    """Complete validation report for a DerivaML catalog schema.

    Attributes:
        schema_name: Name of the ML schema that was validated.
        strict_mode: Whether extra tables/columns were treated as errors.
        issues: All issues found, in the order they were recorded.
    """

    schema_name: str
    strict_mode: bool
    issues: list[ValidationIssue] = field(default_factory=list)

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------

    @property
    def is_valid(self) -> bool:
        """Returns True if no errors were found."""
        return not any(i.severity == ValidationSeverity.ERROR for i in self.issues)

    def _by_severity(self, severity: ValidationSeverity) -> list[ValidationIssue]:
        """Return all recorded issues at the given severity level."""
        return [i for i in self.issues if i.severity == severity]

    @property
    def errors(self) -> list[ValidationIssue]:
        """Returns only error-level issues."""
        return self._by_severity(ValidationSeverity.ERROR)

    @property
    def warnings(self) -> list[ValidationIssue]:
        """Returns only warning-level issues."""
        return self._by_severity(ValidationSeverity.WARNING)

    @property
    def info(self) -> list[ValidationIssue]:
        """Returns only info-level issues."""
        return self._by_severity(ValidationSeverity.INFO)

    # ------------------------------------------------------------------
    # Recording
    # ------------------------------------------------------------------

    def _add(self, severity: ValidationSeverity, category: str, message: str, **kwargs) -> None:
        """Record a new issue at the given severity (shared by add_* helpers)."""
        self.issues.append(ValidationIssue(
            severity=severity,
            category=category,
            message=message,
            **kwargs
        ))

    def add_error(self, category: str, message: str, **kwargs) -> None:
        """Add an error-level issue."""
        self._add(ValidationSeverity.ERROR, category, message, **kwargs)

    def add_warning(self, category: str, message: str, **kwargs) -> None:
        """Add a warning-level issue."""
        self._add(ValidationSeverity.WARNING, category, message, **kwargs)

    def add_info(self, category: str, message: str, **kwargs) -> None:
        """Add an info-level issue."""
        self._add(ValidationSeverity.INFO, category, message, **kwargs)

    # ------------------------------------------------------------------
    # Rendering
    # ------------------------------------------------------------------

    def to_text(self) -> str:
        """Generate a human-readable text report."""
        lines = [
            f"Schema Validation Report for '{self.schema_name}'",
            f"Mode: {'Strict' if self.strict_mode else 'Non-strict'}",
            f"Status: {'VALID' if self.is_valid else 'INVALID'}",
            "",
        ]

        if not self.issues:
            lines.append("No issues found.")
            return "\n".join(lines)

        # Summary counts first, then issues grouped by category.
        lines.append(f"Summary: {len(self.errors)} errors, {len(self.warnings)} warnings, {len(self.info)} info")
        lines.append("")

        by_category: dict[str, list[ValidationIssue]] = {}
        for issue in self.issues:
            by_category.setdefault(issue.category, []).append(issue)

        for category, category_issues in sorted(by_category.items()):
            lines.append(f"## {category.replace('_', ' ').title()}")
            for issue in category_issues:
                lines.append(f"  - {issue}")
                # Only show expected/actual detail lines when present.
                if issue.expected is not None:
                    lines.append(f"      Expected: {issue.expected}")
                if issue.actual is not None:
                    lines.append(f"      Actual: {issue.actual}")
            lines.append("")

        return "\n".join(lines)

    def to_dict(self) -> dict[str, Any]:
        """Convert report to dictionary format for JSON serialization.

        Returns:
            Dictionary with schema validation results suitable for JSON
            encoding: top-level ``schema_name``, ``strict_mode``,
            ``is_valid``, a ``summary`` with per-severity counts, and an
            ``issues`` list with one entry per recorded issue.
        """
        return {
            "schema_name": self.schema_name,
            "strict_mode": self.strict_mode,
            "is_valid": self.is_valid,
            "summary": {
                "errors": len(self.errors),
                "warnings": len(self.warnings),
                "info": len(self.info),
            },
            "issues": [
                {
                    "severity": i.severity.value,
                    "category": i.category,
                    "message": i.message,
                    "table": i.table,
                    "column": i.column,
                    "expected": i.expected,
                    "actual": i.actual,
                }
                for i in self.issues
            ],
        }

    def to_json(self, indent: int | None = 2) -> str:
        """Convert report to JSON string format.

        Args:
            indent: Number of spaces for indentation, or None for compact output.

        Returns:
            JSON string representation of the validation report.
        """
        return json.dumps(self.to_dict(), indent=indent)
|
+
|
|
221
|
+
|
|
222
|
+
# =============================================================================
# Expected Schema Structure
# =============================================================================
# These module-level constants describe the schema that create_schema.py is
# expected to produce; SchemaValidator compares the live catalog against them.

# Expected columns for each table in the ML schema
# Format: {table_name: {column_name: (type_name, nullok)}}
# System columns (RID, RCT, ...) are intentionally excluded; see SYSTEM_COLUMNS.
EXPECTED_TABLE_COLUMNS: dict[str, dict[str, tuple[str, bool]]] = {
    MLTable.dataset: {
        "Description": ("markdown", True),
        "Deleted": ("boolean", True),
        "Version": ("text", True),  # FK column
    },
    MLTable.dataset_version: {
        "Version": ("text", True),
        "Description": ("markdown", True),
        "Dataset": ("text", True),
        "Execution": ("text", True),
        "Minid": ("text", True),
        "Snapshot": ("text", True),
    },
    MLTable.workflow: {
        "Name": ("text", True),
        "Description": ("markdown", True),
        "URL": ("ermrest_uri", True),
        "Checksum": ("text", True),
        "Version": ("text", True),
        "Workflow_Type": ("text", True),  # FK column
    },
    MLTable.execution: {
        "Workflow": ("text", True),
        "Description": ("markdown", True),
        "Duration": ("text", True),
        "Status": ("text", True),
        "Status_Detail": ("text", True),
    },
    MLTable.execution_metadata: {
        # Asset table columns
        "URL": ("text", False),
        "Filename": ("text", True),
        "Description": ("markdown", True),
        "Length": ("int8", False),
        "MD5": ("text", False),
    },
    MLTable.execution_asset: {
        # Asset table columns
        "URL": ("text", False),
        "Filename": ("text", True),
        "Description": ("markdown", True),
        "Length": ("int8", False),
        "MD5": ("text", False),
    },
    MLTable.file: {
        # Asset table columns
        "URL": ("text", False),
        "Filename": ("text", True),
        "Description": ("markdown", True),
        "Length": ("int8", False),
        "MD5": ("text", False),
    },
}

# Expected vocabulary tables
EXPECTED_VOCABULARY_TABLES: list[str] = [
    MLVocab.dataset_type,
    MLVocab.workflow_type,
    MLVocab.asset_type,
    MLVocab.asset_role,
    MLVocab.feature_name,
]

# Expected vocabulary columns (all vocab tables have these)
EXPECTED_VOCABULARY_COLUMNS: dict[str, tuple[str, bool]] = {
    "Name": ("text", False),
    "Description": ("markdown", False),
    "Synonyms": ("text[]", True),
    "ID": ("ermrest_curie", False),
    "URI": ("ermrest_uri", False),
}

# Expected initial terms for each vocabulary table
# Format: {vocab_table: [term_names]}
# Only tables listed here have their terms checked; others are skipped.
EXPECTED_VOCABULARY_TERMS: dict[str, list[str]] = {
    MLVocab.asset_type: [
        "Execution_Config",
        "Runtime_Env",
        "Hydra_Config",
        "Deriva_Config",
        "Execution_Metadata",
        "Execution_Asset",
        "File",
        "Input_File",
        "Output_File",
        "Model_File",
        "Notebook_Output",
    ],
    MLVocab.asset_role: [
        "Input",
        "Output",
    ],
    MLVocab.dataset_type: [
        "File",
    ],
}

# Expected association tables
EXPECTED_ASSOCIATION_TABLES: list[str] = [
    "Dataset_Dataset_Type",
    "Dataset_Dataset",  # Nested datasets
    "Dataset_Execution",
    "Dataset_File",
    "Execution_Execution",  # Nested executions
    "Execution_Metadata_Asset_Type",
    "Execution_Metadata_Execution",
    "Execution_Asset_Asset_Type",
    "Execution_Asset_Execution",
    "File_Asset_Type",
    "File_Execution",
]

# System columns present in all tables (never reported as "extra" in strict mode)
SYSTEM_COLUMNS = {"RID", "RCT", "RMT", "RCB", "RMB"}
|
+
|
|
344
|
+
|
|
345
|
+
class SchemaValidator:
    """Checks a DerivaML catalog's ML schema against the structure that
    create_schema.py is expected to produce."""

    def __init__(self, ml: "DerivaML"):
        """Initialize the validator.

        Args:
            ml: DerivaML instance connected to the catalog to validate.
        """
        self.ml = ml
        self.model = ml.model
        self.ml_schema_name = ml.ml_schema

    def validate(self, strict: bool = False) -> SchemaValidationReport:
        """Validate the ML schema structure.

        Args:
            strict: If True, report extra tables/columns as errors.
                If False, report them as info only.

        Returns:
            SchemaValidationReport with all validation results.
        """
        report = SchemaValidationReport(
            schema_name=self.ml_schema_name,
            strict_mode=strict,
        )

        # Bail out early when the schema itself is absent; nothing else
        # can be checked in that case.
        if self.ml_schema_name not in self.model.model.schemas:
            report.add_error(
                "schema",
                f"ML schema '{self.ml_schema_name}' does not exist",
            )
            return report

        ml_schema = self.model.model.schemas[self.ml_schema_name]

        # Run each check in turn; all of them accumulate into `report`.
        self._validate_core_tables(ml_schema, report, strict)
        self._validate_vocabulary_tables(ml_schema, report, strict)
        self._validate_association_tables(ml_schema, report, strict)
        self._validate_vocabulary_terms(report)

        # Extra tables are only flagged when strict checking is requested.
        if strict:
            self._check_extra_tables(ml_schema, report)

        return report

    def _validate_core_tables(
        self,
        ml_schema,
        report: SchemaValidationReport,
        strict: bool,
    ) -> None:
        """Validate that all core tables exist with required columns."""
        for name, required_columns in EXPECTED_TABLE_COLUMNS.items():
            if name not in ml_schema.tables:
                report.add_error(
                    "table",
                    f"Missing required table '{name}'",
                    table=name,
                )
                continue

            self._validate_table_columns(
                ml_schema.tables[name], name, required_columns, report, strict
            )

    def _validate_vocabulary_tables(
        self,
        ml_schema,
        report: SchemaValidationReport,
        strict: bool,
    ) -> None:
        """Validate that all vocabulary tables exist with required columns."""
        for name in EXPECTED_VOCABULARY_TABLES:
            if name not in ml_schema.tables:
                report.add_error(
                    "vocabulary_table",
                    f"Missing required vocabulary table '{name}'",
                    table=name,
                )
                continue

            # All vocabulary tables share the same required column set.
            self._validate_table_columns(
                ml_schema.tables[name], name, EXPECTED_VOCABULARY_COLUMNS, report, strict
            )

    def _validate_association_tables(
        self,
        ml_schema,
        report: SchemaValidationReport,
        strict: bool,
    ) -> None:
        """Validate that all association tables exist (existence only)."""
        for name in EXPECTED_ASSOCIATION_TABLES:
            if name not in ml_schema.tables:
                report.add_error(
                    "association_table",
                    f"Missing required association table '{name}'",
                    table=name,
                )

    def _validate_table_columns(
        self,
        table,
        table_name: str,
        expected_columns: dict[str, tuple[str, bool]],
        report: SchemaValidationReport,
        strict: bool,
    ) -> None:
        """Validate columns of a specific table."""
        present = {c.name: c for c in table.columns}

        # Check that every required column exists and looks right.
        for col_name, (want_type, want_nullok) in expected_columns.items():
            column = present.get(col_name)
            if column is None:
                report.add_error(
                    "column",
                    f"Missing required column '{col_name}'",
                    table=table_name,
                    column=col_name,
                )
                continue

            # Type mismatches are warnings only, since domain types vary.
            found_type = column.type.typename
            if not self._types_compatible(want_type, found_type):
                report.add_warning(
                    "column_type",
                    f"Column '{col_name}' has unexpected type",
                    table=table_name,
                    column=col_name,
                    expected=want_type,
                    actual=found_type,
                )

            # Nullability differences are informational, never fatal.
            if column.nullok != want_nullok:
                report.add_info(
                    "column_nullok",
                    f"Column '{col_name}' has different nullok setting",
                    table=table_name,
                    column=col_name,
                    expected=want_nullok,
                    actual=column.nullok,
                )

        # In strict mode, any column outside the expected + system set is an error.
        if strict:
            allowed = set(expected_columns.keys()) | SYSTEM_COLUMNS
            for col_name in present:
                if col_name not in allowed:
                    report.add_error(
                        "extra_column",
                        f"Unexpected column '{col_name}' (strict mode)",
                        table=table_name,
                        column=col_name,
                    )

    def _types_compatible(self, expected: str, actual: str) -> bool:
        """Check if two type names are compatible.

        Allows for domain type variations (e.g., markdown is text-based).
        """
        if expected == actual:
            return True

        # Families of interchangeable domain types: text-based names and
        # timestamp variants are each treated as mutually compatible.
        families = (
            {"text", "markdown", "longtext", "ermrest_curie", "ermrest_uri"},
            {"timestamp", "timestamptz", "ermrest_rct", "ermrest_rmt"},
        )
        return any(expected in family and actual in family for family in families)

    def _validate_vocabulary_terms(self, report: SchemaValidationReport) -> None:
        """Validate that required vocabulary terms exist."""
        for vocab_table, required_terms in EXPECTED_VOCABULARY_TERMS.items():
            try:
                present = {t.name for t in self.ml.list_vocabulary_terms(vocab_table)}
            except Exception as e:
                # Best effort: surface the lookup failure in the report
                # rather than aborting validation.
                report.add_error(
                    "vocabulary_term",
                    f"Could not validate terms: {e}",
                    table=vocab_table,
                )
                continue

            for term in required_terms:
                if term not in present:
                    report.add_error(
                        "vocabulary_term",
                        f"Missing required term '{term}'",
                        table=vocab_table,
                        expected=term,
                    )

    def _check_extra_tables(
        self,
        ml_schema,
        report: SchemaValidationReport,
    ) -> None:
        """Check for extra tables not in the expected schema."""
        known = (
            set(EXPECTED_TABLE_COLUMNS.keys())
            | set(EXPECTED_VOCABULARY_TABLES)
            | set(EXPECTED_ASSOCIATION_TABLES)
        )

        for name in ml_schema.tables:
            if name not in known:
                report.add_error(
                    "extra_table",
                    f"Unexpected table '{name}' (strict mode)",
                    table=name,
                )
|
+
|
|
579
|
+
|
|
580
|
+
def validate_ml_schema(ml: "DerivaML", strict: bool = False) -> SchemaValidationReport:
    """Validate the ML schema of a DerivaML catalog.

    Convenience wrapper that builds a SchemaValidator and runs it once.

    Args:
        ml: DerivaML instance connected to the catalog to validate.
        strict: If True, report extra tables/columns as errors.
            If False, report them as info only.

    Returns:
        SchemaValidationReport with all validation results.

    Example:
        >>> from deriva_ml import DerivaML
        >>> ml = DerivaML('localhost', 'my_catalog')
        >>> report = validate_ml_schema(ml, strict=False)
        >>> if not report.is_valid:
        ...     print(report.to_text())
    """
    return SchemaValidator(ml).validate(strict=strict)
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deriva-ml
|
|
3
|
-
Version: 1.17.
|
|
3
|
+
Version: 1.17.12
|
|
4
4
|
Summary: Utilities to simplify use of Deriva and Pandas to create reproducible ML pipelines
|
|
5
5
|
Author-email: ISRD <isrd-dev@isi.edu>
|
|
6
|
-
Requires-Python: >=3.
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
7
|
Description-Content-Type: text/markdown
|
|
8
8
|
License-File: LICENSE
|
|
9
9
|
Requires-Dist: bump-my-version
|
|
@@ -14,9 +14,9 @@ Requires-Dist: nbconvert
|
|
|
14
14
|
Requires-Dist: pandas
|
|
15
15
|
Requires-Dist: pydantic>=2.11
|
|
16
16
|
Requires-Dist: papermill
|
|
17
|
-
Requires-Dist: pandas-stubs
|
|
17
|
+
Requires-Dist: pandas-stubs
|
|
18
18
|
Requires-Dist: pyyaml
|
|
19
|
-
Requires-Dist: regex
|
|
19
|
+
Requires-Dist: regex
|
|
20
20
|
Requires-Dist: semver>3.0.0
|
|
21
21
|
Requires-Dist: setuptools>=80
|
|
22
22
|
Requires-Dist: setuptools-scm>=8.0
|