deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (74)
  1. deriva_ml/__init__.py +69 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +31 -0
  7. deriva_ml/catalog/clone.py +1939 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +845 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +126 -110
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +543 -242
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +223 -34
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
  67. deriva_ml-1.17.12.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.10.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
deriva_ml/schema/validation.py (new file)
@@ -0,0 +1,601 @@
+ """Schema validation for DerivaML catalogs.
+
+ This module provides functionality to validate that a catalog's ML schema matches
+ the expected structure created by create_schema.py. It can check for:
+ - Required tables and their columns
+ - Required vocabulary tables and their initial terms
+ - Foreign key relationships
+ - Extra tables/columns (in strict mode)
+
+ Usage:
+     from deriva_ml import DerivaML
+
+     ml = DerivaML('localhost', 'my_catalog')
+     report = ml.validate_schema(strict=False)
+     if not report.is_valid:
+         print(report.to_text())
+ """
+
+ from __future__ import annotations
+
+ import json
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from typing import TYPE_CHECKING, Any
+
+ if TYPE_CHECKING:
+     from deriva_ml.core.base import DerivaML
+
+ from deriva_ml.core.definitions import ML_SCHEMA, MLTable, MLVocab
+
+
+ class ValidationSeverity(str, Enum):
+     """Severity levels for validation issues."""
+
+     ERROR = "error"  # Schema is invalid, will cause failures
+     WARNING = "warning"  # Schema may work but has issues
+     INFO = "info"  # Informational (e.g., extra items in non-strict mode)
+
+
+ @dataclass
+ class ValidationIssue:
+     """A single validation issue found during schema inspection."""
+
+     severity: ValidationSeverity
+     category: str  # e.g., "table", "column", "vocabulary", "foreign_key"
+     message: str
+     table: str | None = None
+     column: str | None = None
+     expected: Any = None
+     actual: Any = None
+
+     def __str__(self) -> str:
+         location = ""
+         if self.table:
+             location = f"[{self.table}"
+             if self.column:
+                 location += f".{self.column}"
+             location += "] "
+         return f"{self.severity.value.upper()}: {location}{self.message}"
+
+
+ @dataclass
+ class SchemaValidationReport:
+     """Complete validation report for a DerivaML catalog schema."""
+
+     schema_name: str
+     strict_mode: bool
+     issues: list[ValidationIssue] = field(default_factory=list)
+
+     @property
+     def is_valid(self) -> bool:
+         """Returns True if no errors were found."""
+         return not any(i.severity == ValidationSeverity.ERROR for i in self.issues)
+
+     @property
+     def errors(self) -> list[ValidationIssue]:
+         """Returns only error-level issues."""
+         return [i for i in self.issues if i.severity == ValidationSeverity.ERROR]
+
+     @property
+     def warnings(self) -> list[ValidationIssue]:
+         """Returns only warning-level issues."""
+         return [i for i in self.issues if i.severity == ValidationSeverity.WARNING]
+
+     @property
+     def info(self) -> list[ValidationIssue]:
+         """Returns only info-level issues."""
+         return [i for i in self.issues if i.severity == ValidationSeverity.INFO]
+
+     def add_error(self, category: str, message: str, **kwargs) -> None:
+         """Add an error-level issue."""
+         self.issues.append(ValidationIssue(
+             severity=ValidationSeverity.ERROR,
+             category=category,
+             message=message,
+             **kwargs
+         ))
+
+     def add_warning(self, category: str, message: str, **kwargs) -> None:
+         """Add a warning-level issue."""
+         self.issues.append(ValidationIssue(
+             severity=ValidationSeverity.WARNING,
+             category=category,
+             message=message,
+             **kwargs
+         ))
+
+     def add_info(self, category: str, message: str, **kwargs) -> None:
+         """Add an info-level issue."""
+         self.issues.append(ValidationIssue(
+             severity=ValidationSeverity.INFO,
+             category=category,
+             message=message,
+             **kwargs
+         ))
+
+     def to_text(self) -> str:
+         """Generate a human-readable text report."""
+         lines = [
+             f"Schema Validation Report for '{self.schema_name}'",
+             f"Mode: {'Strict' if self.strict_mode else 'Non-strict'}",
+             f"Status: {'VALID' if self.is_valid else 'INVALID'}",
+             "",
+         ]
+
+         if not self.issues:
+             lines.append("No issues found.")
+             return "\n".join(lines)
+
+         # Summary
+         lines.append(f"Summary: {len(self.errors)} errors, {len(self.warnings)} warnings, {len(self.info)} info")
+         lines.append("")
+
+         # Group by category
+         by_category: dict[str, list[ValidationIssue]] = {}
+         for issue in self.issues:
+             by_category.setdefault(issue.category, []).append(issue)
+
+         for category, category_issues in sorted(by_category.items()):
+             lines.append(f"## {category.replace('_', ' ').title()}")
+             for issue in category_issues:
+                 lines.append(f" - {issue}")
+                 if issue.expected is not None or issue.actual is not None:
+                     if issue.expected is not None:
+                         lines.append(f" Expected: {issue.expected}")
+                     if issue.actual is not None:
+                         lines.append(f" Actual: {issue.actual}")
+             lines.append("")
+
+         return "\n".join(lines)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert report to dictionary format for JSON serialization.
+
+         Returns:
+             Dictionary with schema validation results suitable for JSON encoding.
+             Structure:
+                 {
+                     "schema_name": str,
+                     "strict_mode": bool,
+                     "is_valid": bool,
+                     "summary": {"errors": int, "warnings": int, "info": int},
+                     "issues": [
+                         {
+                             "severity": "error"|"warning"|"info",
+                             "category": str,
+                             "message": str,
+                             "table": str|null,
+                             "column": str|null,
+                             "expected": any|null,
+                             "actual": any|null
+                         },
+                         ...
+                     ]
+                 }
+         """
+         return {
+             "schema_name": self.schema_name,
+             "strict_mode": self.strict_mode,
+             "is_valid": self.is_valid,
+             "summary": {
+                 "errors": len(self.errors),
+                 "warnings": len(self.warnings),
+                 "info": len(self.info),
+             },
+             "issues": [
+                 {
+                     "severity": i.severity.value,
+                     "category": i.category,
+                     "message": i.message,
+                     "table": i.table,
+                     "column": i.column,
+                     "expected": i.expected,
+                     "actual": i.actual,
+                 }
+                 for i in self.issues
+             ],
+         }
+
+     def to_json(self, indent: int | None = 2) -> str:
+         """Convert report to JSON string format.
+
+         Args:
+             indent: Number of spaces for indentation, or None for compact output.
+
+         Returns:
+             JSON string representation of the validation report.
+
+         Example:
+             >>> report = ml.validate_schema()
+             >>> print(report.to_json())
+             {
+                 "schema_name": "deriva-ml",
+                 "strict_mode": false,
+                 "is_valid": true,
+                 ...
+             }
+         """
+         return json.dumps(self.to_dict(), indent=indent)
+
+
+ # =============================================================================
+ # Expected Schema Structure
+ # =============================================================================
+
+ # Expected columns for each table in the ML schema
+ # Format: {table_name: {column_name: (type_name, nullok)}}
+ EXPECTED_TABLE_COLUMNS: dict[str, dict[str, tuple[str, bool]]] = {
+     MLTable.dataset: {
+         "Description": ("markdown", True),
+         "Deleted": ("boolean", True),
+         "Version": ("text", True),  # FK column
+     },
+     MLTable.dataset_version: {
+         "Version": ("text", True),
+         "Description": ("markdown", True),
+         "Dataset": ("text", True),
+         "Execution": ("text", True),
+         "Minid": ("text", True),
+         "Snapshot": ("text", True),
+     },
+     MLTable.workflow: {
+         "Name": ("text", True),
+         "Description": ("markdown", True),
+         "URL": ("ermrest_uri", True),
+         "Checksum": ("text", True),
+         "Version": ("text", True),
+         "Workflow_Type": ("text", True),  # FK column
+     },
+     MLTable.execution: {
+         "Workflow": ("text", True),
+         "Description": ("markdown", True),
+         "Duration": ("text", True),
+         "Status": ("text", True),
+         "Status_Detail": ("text", True),
+     },
+     MLTable.execution_metadata: {
+         # Asset table columns
+         "URL": ("text", False),
+         "Filename": ("text", True),
+         "Description": ("markdown", True),
+         "Length": ("int8", False),
+         "MD5": ("text", False),
+     },
+     MLTable.execution_asset: {
+         # Asset table columns
+         "URL": ("text", False),
+         "Filename": ("text", True),
+         "Description": ("markdown", True),
+         "Length": ("int8", False),
+         "MD5": ("text", False),
+     },
+     MLTable.file: {
+         # Asset table columns
+         "URL": ("text", False),
+         "Filename": ("text", True),
+         "Description": ("markdown", True),
+         "Length": ("int8", False),
+         "MD5": ("text", False),
+     },
+ }
+
+ # Expected vocabulary tables
+ EXPECTED_VOCABULARY_TABLES: list[str] = [
+     MLVocab.dataset_type,
+     MLVocab.workflow_type,
+     MLVocab.asset_type,
+     MLVocab.asset_role,
+     MLVocab.feature_name,
+ ]
+
+ # Expected vocabulary columns (all vocab tables have these)
+ EXPECTED_VOCABULARY_COLUMNS: dict[str, tuple[str, bool]] = {
+     "Name": ("text", False),
+     "Description": ("markdown", False),
+     "Synonyms": ("text[]", True),
+     "ID": ("ermrest_curie", False),
+     "URI": ("ermrest_uri", False),
+ }
+
+ # Expected initial terms for each vocabulary table
+ # Format: {vocab_table: [term_names]}
+ EXPECTED_VOCABULARY_TERMS: dict[str, list[str]] = {
+     MLVocab.asset_type: [
+         "Execution_Config",
+         "Runtime_Env",
+         "Hydra_Config",
+         "Deriva_Config",
+         "Execution_Metadata",
+         "Execution_Asset",
+         "File",
+         "Input_File",
+         "Output_File",
+         "Model_File",
+         "Notebook_Output",
+     ],
+     MLVocab.asset_role: [
+         "Input",
+         "Output",
+     ],
+     MLVocab.dataset_type: [
+         "File",
+     ],
+ }
+
+ # Expected association tables
+ EXPECTED_ASSOCIATION_TABLES: list[str] = [
+     "Dataset_Dataset_Type",
+     "Dataset_Dataset",  # Nested datasets
+     "Dataset_Execution",
+     "Dataset_File",
+     "Execution_Execution",  # Nested executions
+     "Execution_Metadata_Asset_Type",
+     "Execution_Metadata_Execution",
+     "Execution_Asset_Asset_Type",
+     "Execution_Asset_Execution",
+     "File_Asset_Type",
+     "File_Execution",
+ ]
+
+ # System columns present in all tables
+ SYSTEM_COLUMNS = {"RID", "RCT", "RMT", "RCB", "RMB"}
+
+
+ class SchemaValidator:
+     """Validates a DerivaML catalog schema against expected structure."""
+
+     def __init__(self, ml: "DerivaML"):
+         """Initialize the validator.
+
+         Args:
+             ml: DerivaML instance connected to the catalog to validate.
+         """
+         self.ml = ml
+         self.model = ml.model
+         self.ml_schema_name = ml.ml_schema
+
+     def validate(self, strict: bool = False) -> SchemaValidationReport:
+         """Validate the ML schema structure.
+
+         Args:
+             strict: If True, report extra tables/columns as errors.
+                 If False, report them as info only.
+
+         Returns:
+             SchemaValidationReport with all validation results.
+         """
+         report = SchemaValidationReport(
+             schema_name=self.ml_schema_name,
+             strict_mode=strict,
+         )
+
+         # Check that ML schema exists
+         if self.ml_schema_name not in self.model.model.schemas:
+             report.add_error(
+                 "schema",
+                 f"ML schema '{self.ml_schema_name}' does not exist",
+             )
+             return report
+
+         schema = self.model.model.schemas[self.ml_schema_name]
+
+         # Validate core tables
+         self._validate_core_tables(schema, report, strict)
+
+         # Validate vocabulary tables
+         self._validate_vocabulary_tables(schema, report, strict)
+
+         # Validate association tables
+         self._validate_association_tables(schema, report, strict)
+
+         # Validate vocabulary terms
+         self._validate_vocabulary_terms(report)
+
+         # Check for extra tables (in strict mode)
+         if strict:
+             self._check_extra_tables(schema, report)
+
+         return report
+
+     def _validate_core_tables(
+         self,
+         schema,
+         report: SchemaValidationReport,
+         strict: bool,
+     ) -> None:
+         """Validate that all core tables exist with required columns."""
+         for table_name, expected_columns in EXPECTED_TABLE_COLUMNS.items():
+             if table_name not in schema.tables:
+                 report.add_error(
+                     "table",
+                     f"Missing required table '{table_name}'",
+                     table=table_name,
+                 )
+                 continue
+
+             table = schema.tables[table_name]
+             self._validate_table_columns(
+                 table, table_name, expected_columns, report, strict
+             )
+
+     def _validate_vocabulary_tables(
+         self,
+         schema,
+         report: SchemaValidationReport,
+         strict: bool,
+     ) -> None:
+         """Validate that all vocabulary tables exist with required columns."""
+         for table_name in EXPECTED_VOCABULARY_TABLES:
+             if table_name not in schema.tables:
+                 report.add_error(
+                     "vocabulary_table",
+                     f"Missing required vocabulary table '{table_name}'",
+                     table=table_name,
+                 )
+                 continue
+
+             table = schema.tables[table_name]
+             self._validate_table_columns(
+                 table, table_name, EXPECTED_VOCABULARY_COLUMNS, report, strict
+             )
+
+     def _validate_association_tables(
+         self,
+         schema,
+         report: SchemaValidationReport,
+         strict: bool,
+     ) -> None:
+         """Validate that all association tables exist."""
+         for table_name in EXPECTED_ASSOCIATION_TABLES:
+             if table_name not in schema.tables:
+                 report.add_error(
+                     "association_table",
+                     f"Missing required association table '{table_name}'",
+                     table=table_name,
+                 )
+
+     def _validate_table_columns(
+         self,
+         table,
+         table_name: str,
+         expected_columns: dict[str, tuple[str, bool]],
+         report: SchemaValidationReport,
+         strict: bool,
+     ) -> None:
+         """Validate columns of a specific table."""
+         actual_columns = {col.name: col for col in table.columns}
+
+         # Check for missing columns
+         for col_name, (expected_type, expected_nullok) in expected_columns.items():
+             if col_name not in actual_columns:
+                 report.add_error(
+                     "column",
+                     f"Missing required column '{col_name}'",
+                     table=table_name,
+                     column=col_name,
+                 )
+                 continue
+
+             col = actual_columns[col_name]
+             actual_type = col.type.typename
+
+             # Check type (allow some flexibility for domain types)
+             if not self._types_compatible(expected_type, actual_type):
+                 report.add_warning(
+                     "column_type",
+                     f"Column '{col_name}' has unexpected type",
+                     table=table_name,
+                     column=col_name,
+                     expected=expected_type,
+                     actual=actual_type,
+                 )
+
+             # Check nullok (only warn, don't error)
+             if col.nullok != expected_nullok:
+                 report.add_info(
+                     "column_nullok",
+                     f"Column '{col_name}' has different nullok setting",
+                     table=table_name,
+                     column=col_name,
+                     expected=expected_nullok,
+                     actual=col.nullok,
+                 )
+
+         # Check for extra columns in strict mode
+         if strict:
+             expected_col_names = set(expected_columns.keys()) | SYSTEM_COLUMNS
+             for col_name in actual_columns:
+                 if col_name not in expected_col_names:
+                     report.add_error(
+                         "extra_column",
+                         f"Unexpected column '{col_name}' (strict mode)",
+                         table=table_name,
+                         column=col_name,
+                     )
+
+     def _types_compatible(self, expected: str, actual: str) -> bool:
+         """Check if two type names are compatible.
+
+         Allows for domain type variations (e.g., markdown is text-based).
+         """
+         if expected == actual:
+             return True
+
+         # Handle domain types that are based on text
+         text_based = {"text", "markdown", "longtext", "ermrest_curie", "ermrest_uri"}
+         if expected in text_based and actual in text_based:
+             return True
+
+         # Handle timestamp variations
+         timestamp_types = {"timestamp", "timestamptz", "ermrest_rct", "ermrest_rmt"}
+         if expected in timestamp_types and actual in timestamp_types:
+             return True
+
+         return False
+
+     def _validate_vocabulary_terms(self, report: SchemaValidationReport) -> None:
+         """Validate that required vocabulary terms exist."""
+         for vocab_table, expected_terms in EXPECTED_VOCABULARY_TERMS.items():
+             try:
+                 actual_terms = self.ml.list_vocabulary_terms(vocab_table)
+                 actual_term_names = {term.name for term in actual_terms}
+
+                 for term_name in expected_terms:
+                     if term_name not in actual_term_names:
+                         report.add_error(
+                             "vocabulary_term",
+                             f"Missing required term '{term_name}'",
+                             table=vocab_table,
+                             expected=term_name,
+                         )
+             except Exception as e:
+                 report.add_error(
+                     "vocabulary_term",
+                     f"Could not validate terms: {e}",
+                     table=vocab_table,
+                 )
+
+     def _check_extra_tables(
+         self,
+         schema,
+         report: SchemaValidationReport,
+     ) -> None:
+         """Check for extra tables not in the expected schema."""
+         expected_tables = (
+             set(EXPECTED_TABLE_COLUMNS.keys())
+             | set(EXPECTED_VOCABULARY_TABLES)
+             | set(EXPECTED_ASSOCIATION_TABLES)
+         )
+
+         for table_name in schema.tables:
+             if table_name not in expected_tables:
+                 report.add_error(
+                     "extra_table",
+                     f"Unexpected table '{table_name}' (strict mode)",
+                     table=table_name,
+                 )
+
+
+ def validate_ml_schema(ml: "DerivaML", strict: bool = False) -> SchemaValidationReport:
+     """Validate the ML schema of a DerivaML catalog.
+
+     This is a convenience function that creates a validator and runs validation.
+
+     Args:
+         ml: DerivaML instance connected to the catalog to validate.
+         strict: If True, report extra tables/columns as errors.
+             If False, report them as info only.
+
+     Returns:
+         SchemaValidationReport with all validation results.
+
+     Example:
+         >>> from deriva_ml import DerivaML
+         >>> ml = DerivaML('localhost', 'my_catalog')
+         >>> report = validate_ml_schema(ml, strict=False)
+         >>> if not report.is_valid:
+         ...     print(report.to_text())
+     """
+     validator = SchemaValidator(ml)
+     return validator.validate(strict=strict)
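
The new module's docstrings describe its intended entry points. A minimal usage sketch follows; the host name, catalog id, and output file name are placeholders, and ml.validate_schema is assumed to be exposed on DerivaML as the module docstring indicates:

    from deriva_ml import DerivaML
    from deriva_ml.schema.validation import validate_ml_schema

    # Connect to the catalog to be checked (placeholder host and catalog id).
    ml = DerivaML("localhost", "my_catalog")

    # Convenience function added in this release...
    report = validate_ml_schema(ml, strict=False)
    # ...or, per the module docstring, the equivalent method on DerivaML:
    # report = ml.validate_schema(strict=False)

    if not report.is_valid:
        # Human-readable summary, grouped by issue category.
        print(report.to_text())

    # Machine-readable form, e.g. for CI logs (illustrative file name).
    with open("schema_validation.json", "w") as f:
        f.write(report.to_json(indent=2))
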
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA
@@ -1,9 +1,9 @@
  Metadata-Version: 2.4
  Name: deriva-ml
- Version: 1.17.10
+ Version: 1.17.12
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
  Author-email: ISRD <isrd-dev@isi.edu>
- Requires-Python: >=3.10
+ Requires-Python: >=3.12
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: bump-my-version
@@ -14,9 +14,9 @@ Requires-Dist: nbconvert
  Requires-Dist: pandas
  Requires-Dist: pydantic>=2.11
  Requires-Dist: papermill
- Requires-Dist: pandas-stubs==2.2.3.250527
+ Requires-Dist: pandas-stubs
  Requires-Dist: pyyaml
- Requires-Dist: regex~=2024.7.24
+ Requires-Dist: regex
  Requires-Dist: semver>3.0.0
  Requires-Dist: setuptools>=80
  Requires-Dist: setuptools-scm>=8.0
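
The METADATA hunks above raise the supported Python floor from 3.10 to 3.12 and relax the exact pins on pandas-stubs and regex. A hypothetical pre-flight check (illustrative only, not part of the package) for environments still running an older interpreter:

    import sys
    from importlib.metadata import PackageNotFoundError, version

    # deriva-ml 1.17.12 declares Requires-Python >=3.12 in its METADATA.
    if sys.version_info < (3, 12):
        raise RuntimeError(
            f"deriva-ml >= 1.17.12 requires Python 3.12+, found {sys.version.split()[0]}"
        )

    try:
        print("deriva-ml", version("deriva-ml"))
    except PackageNotFoundError:
        print("deriva-ml is not installed in this environment")
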