deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. deriva_ml/__init__.py +69 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +31 -0
  7. deriva_ml/catalog/clone.py +1939 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +845 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +126 -110
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +543 -242
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +223 -34
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
  67. deriva_ml-1.17.12.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.10.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
deriva_ml/interfaces.py (new file)
@@ -0,0 +1,862 @@
+ """Protocol definitions for DerivaML dataset, asset, and catalog operations.
+
+ This module defines the protocols (interfaces) used throughout DerivaML for
+ type checking and polymorphic access to datasets, assets, and catalogs.
+
+ Protocol Hierarchies
+ --------------------
+
+ **Dataset Protocols:**
+     DatasetLike: Read-only operations for both live datasets and downloaded bags.
+     WritableDataset: Write operations only available on live catalog datasets.
+
+ **Asset Protocols:**
+     AssetLike: Read-only operations for asset access.
+     WritableAsset: Write operations for asset modification.
+
+ **Catalog Protocols:**
+     DerivaMLCatalogReader: Read-only catalog operations (lookups, queries).
+     DerivaMLCatalog: Full catalog operations including write operations.
+
+ The separation allows code to express its requirements precisely:
+ - Code that only reads data can accept DatasetLike, AssetLike, or DerivaMLCatalogReader
+ - Code that modifies data requires WritableDataset, WritableAsset, or DerivaMLCatalog
+
+ API Naming Conventions
+ ----------------------
+
+ The DerivaML API follows consistent naming conventions:
+
+ - ``lookup_*``: Single item retrieval by identifier. Returns one item or raises an exception.
+   Examples: lookup_dataset(), lookup_asset(), lookup_term()
+
+ - ``find_*``: Search/discovery operations. Returns an Iterable of matching items.
+   Examples: find_datasets(), find_assets(), find_features()
+
+ - ``list_*``: List all items of a type, often with context (e.g., members of a dataset).
+   Examples: list_assets(), list_vocabulary_terms(), list_dataset_members()
+
+ - ``get_*``: Data retrieval with transformation (e.g., to DataFrame).
+   Examples: get_table_as_dataframe(), get_metadata()
+
+ - ``create_*``: Create new entities in the catalog.
+   Examples: create_dataset(), create_execution(), create_feature()
+
+ - ``add_*``: Add items to existing entities or create vocabulary terms.
+   Examples: add_term(), add_dataset_members(), add_asset_type()
+
+ - ``delete_*`` / ``remove_*``: Remove items from entities.
+   Examples: delete_dataset_members(), remove_asset_type()
+
+ Implementation Notes
+ --------------------
+ - Dataset: Live catalog access via deriva-py/datapath (implements both protocols)
+ - DatasetBag: Downloaded bag access via SQLAlchemy/SQLite (read-only)
+ - Asset: Live catalog access for file-based records (implements WritableAsset)
+ - DerivaML: Full catalog operations (implements DerivaMLCatalog)
+ - DerivaMLDatabase: Bag-backed catalog (implements DerivaMLCatalogReader only)
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Generator, Iterable, Protocol, Self, runtime_checkable
+
+ import pandas as pd
+
+ # Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
+ import importlib
+ _deriva_core = importlib.import_module("deriva.core")
+ _datapath = importlib.import_module("deriva.core.datapath")
+ _ermrest_catalog = importlib.import_module("deriva.core.ermrest_catalog")
+ _ermrest_model = importlib.import_module("deriva.core.ermrest_model")
+
+ ErmrestSnapshot = _deriva_core.ErmrestSnapshot
+ SchemaWrapper = _datapath._SchemaWrapper
+ ErmrestCatalog = _ermrest_catalog.ErmrestCatalog
+ ResolveRidResult = _ermrest_catalog.ResolveRidResult
+ Table = _ermrest_model.Table
+
+ from deriva_ml.core.definitions import RID, VocabularyTerm
+ from deriva_ml.core.mixins.rid_resolution import BatchRidResult
+ from deriva_ml.feature import Feature, FeatureRecord
+ from deriva_ml.model.catalog import DerivaModel
+
+ if TYPE_CHECKING:
+     from deriva_ml.core.enums import Status
+     from deriva_ml.dataset.aux_classes import DatasetHistory, DatasetSpec, DatasetVersion
+     from deriva_ml.dataset.dataset import Dataset
+     from deriva_ml.execution.execution_record import ExecutionRecord
+     from deriva_ml.execution.workflow import Workflow
+
+
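To make the naming conventions above concrete, here is a minimal sketch, assuming a connected object that satisfies the DerivaMLCatalogReader protocol defined later in this file (the `catalog` variable, the RID value, and the "Subject" table name are illustrative placeholders):

    # lookup_* returns exactly one item (or raises); find_* returns an iterable of matches.
    dataset = catalog.lookup_dataset("2-ABC1")              # single item by identifier
    all_datasets = list(catalog.find_datasets())            # search / discovery
    members = dataset.list_dataset_members()                # list_* enumerates items in context
    subjects = catalog.get_table_as_dataframe("Subject")    # get_* retrieves with transformation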
93
+ @runtime_checkable
+ class DatasetLike(Protocol):
+     """Protocol defining read-only interface for dataset access.
+
+     This protocol is implemented by both Dataset (live catalog) and DatasetBag
+     (downloaded bag). It defines the common read interface for accessing dataset
+     metadata, members, and relationships.
+
+     The protocol defines the minimal interface that both implementations support.
+     Dataset extends this with optional `version` parameters on some methods to
+     support querying historical versions. DatasetBag doesn't need version parameters
+     since bags are immutable snapshots of a specific version.
+
+     Note on `_visited` parameters: Both implementations use `_visited` internally
+     for recursion guards, but this is not part of the protocol as it's an
+     implementation detail.
+
+     Attributes:
+         dataset_rid: Resource Identifier for the dataset.
+         execution_rid: Optional execution RID associated with the dataset.
+         description: Description of the dataset.
+         dataset_types: Type(s) of the dataset from Dataset_Type vocabulary.
+         current_version: Current semantic version of the dataset.
+     """
+
+     dataset_rid: RID
+     execution_rid: RID | None
+     description: str
+     dataset_types: list[str]
+
+     @property
+     def current_version(self) -> DatasetVersion:
+         """Get the current version of the dataset."""
+         ...
+
+     def dataset_history(self) -> list[DatasetHistory]:
+         """Get the version history of the dataset."""
+         ...
+
+     def list_dataset_children(
+         self,
+         recurse: bool = False,
+         _visited: set[RID] | None = None,
+         version: Any = None,
+         **kwargs: Any,
+     ) -> list[Self]:
+         """Get nested child datasets.
+
+         Args:
+             recurse: Whether to recursively include children of children.
+             _visited: Internal parameter to track visited datasets and prevent infinite recursion.
+             version: Dataset version to list children from (Dataset only, ignored by DatasetBag).
+             **kwargs: Additional implementation-specific arguments.
+
+         Returns:
+             List of child datasets (Dataset or DatasetBag depending on implementation).
+
+         Note:
+             Both Dataset and DatasetBag have `recurse` as the first parameter.
+             Dataset uses the `version` parameter to query historical versions.
+         """
+         ...
+
+     def list_dataset_parents(
+         self,
+         recurse: bool = False,
+         _visited: set[RID] | None = None,
+         version: Any = None,
+         **kwargs: Any,
+     ) -> list[Self]:
+         """Get parent datasets that contain this dataset.
+
+         Args:
+             recurse: Whether to recursively include parents of parents.
+             _visited: Internal parameter to track visited datasets and prevent infinite recursion.
+             version: Dataset version to list parents from (Dataset only, ignored by DatasetBag).
+             **kwargs: Additional implementation-specific arguments.
+
+         Returns:
+             List of parent datasets (Dataset or DatasetBag depending on implementation).
+
+         Note:
+             Both Dataset and DatasetBag have `recurse` as the first parameter.
+             Dataset uses the `version` parameter to query historical versions.
+         """
+         ...
+
+     def list_dataset_members(
+         self,
+         recurse: bool = False,
+         limit: int | None = None,
+         _visited: set[RID] | None = None,
+         version: Any = None,
+         **kwargs: Any,
+     ) -> dict[str, list[dict[str, Any]]]:
+         """List members of the dataset.
+
+         Args:
+             recurse: Whether to include members of nested datasets.
+             limit: Maximum number of members per type. None for no limit.
+             _visited: Internal parameter to track visited datasets and prevent infinite recursion.
+             version: Dataset version to list members from (Dataset only, ignored by DatasetBag).
+             **kwargs: Additional implementation-specific arguments.
+
+         Returns:
+             Dictionary mapping member types to lists of member records.
+
+         Note:
+             Both Dataset and DatasetBag have `recurse` as the first parameter.
+             Dataset uses the `version` parameter to query historical versions.
+         """
+         ...
+
+     def list_dataset_element_types(self) -> Iterable[Table]:
+         """List the types of elements that can be contained in this dataset.
+
+         Returns:
+             Iterable of Table objects representing element types.
+         """
+         ...
+
+     def find_features(self, table: str | Table) -> Iterable[Feature]:
+         """Find features associated with a table.
+
+         Args:
+             table: Table to find features for.
+
+         Returns:
+             Iterable of Feature objects.
+         """
+         ...
+
225
+     def denormalize_as_dataframe(
+         self,
+         include_tables: list[str],
+         version: Any = None,
+         **kwargs: Any,
+     ) -> pd.DataFrame:
+         """Denormalize the dataset into a single wide table (DataFrame).
+
+         Denormalization transforms normalized relational data into a single "wide table"
+         (also called a "flat table" or "denormalized table") by joining related tables
+         together. This produces a DataFrame where each row contains all related information
+         from multiple source tables, with columns from each table combined side-by-side.
+
+         Wide tables are the standard input format for most machine learning frameworks,
+         which expect all features for a single observation to be in one row. This method
+         bridges the gap between normalized database schemas and ML-ready tabular data.
+
+         **How it works:**
+
+         Tables are joined based on their foreign key relationships. For example, if
+         Image has a foreign key to Subject, and Diagnosis has a foreign key to Image,
+         then denormalizing ["Subject", "Image", "Diagnosis"] produces rows where each
+         image appears with its subject's metadata and any associated diagnoses.
+
+         The result uses outer join semantics - if a table has no FK relationship to
+         others, its rows are included with NULL values for unrelated columns.
+
+         **Column naming:**
+
+         To avoid column name collisions (e.g., multiple tables having "RID" or "Name"),
+         all column names are prefixed with their source table name.
+
+         Args:
+             include_tables: List of table names to include in the output.
+                 Tables are joined based on their foreign key relationships.
+                 Order doesn't matter - the join order is determined automatically.
+             version: Dataset version to query (Dataset only, ignored by DatasetBag).
+             **kwargs: Additional implementation-specific arguments.
+
+         Returns:
+             pd.DataFrame: Wide table with columns from all included tables.
+                 Column names are prefixed with the source table name.
+
+         Note:
+             Column naming conventions differ between implementations:
+
+             - Dataset (catalog): Uses underscore separator (e.g., "Image_Filename")
+             - DatasetBag (bag): Uses dot separator (e.g., "Image.Filename")
+
+         Example:
+             Suppose you have a dataset with Images linked to Subjects, and each
+             Image has a Diagnosis label::
+
+                 # Normalized schema:
+                 # Subject: RID, Name, Age
+                 # Image: RID, Filename, Subject (FK)
+                 # Diagnosis: RID, Image (FK), Label
+
+                 # Denormalize into a wide table for ML
+                 df = dataset.denormalize_as_dataframe(["Subject", "Image", "Diagnosis"])
+
+                 # Result has columns like:
+                 # Subject_RID, Subject_Name, Subject_Age,
+                 # Image_RID, Image_Filename, Image_Subject,
+                 # Diagnosis_RID, Diagnosis_Image, Diagnosis_Label
+
+                 # Each row represents one Image with its Subject info and Diagnosis
+                 # Ready for use with sklearn, pandas, or other ML tools
+
+         See Also:
+             denormalize_as_dict: Generator version for memory-efficient processing.
+         """
+         ...
+
+     def denormalize_as_dict(
+         self,
+         include_tables: list[str],
+         version: Any = None,
+         **kwargs: Any,
+     ) -> Generator[dict[str, Any], None, None]:
+         """Denormalize the dataset and yield rows as dictionaries.
+
+         This is a memory-efficient alternative to denormalize_as_dataframe() that
+         yields one row at a time as a dictionary instead of loading all data into
+         a DataFrame. Use this when processing large datasets that may not fit in
+         memory, or when you want to process rows incrementally.
+
+         Like denormalize_as_dataframe(), this produces a "wide table" representation
+         where each yielded dictionary contains all columns from the joined tables.
+         See denormalize_as_dataframe() for detailed explanation of how denormalization
+         works.
+
+         Args:
+             include_tables: List of table names to include in the output.
+                 Tables are joined based on their foreign key relationships.
+             version: Dataset version to query (Dataset only, ignored by DatasetBag).
+             **kwargs: Additional implementation-specific arguments.
+
+         Yields:
+             dict[str, Any]: Dictionary representing one row of the wide table.
+                 Keys are column names prefixed by table name.
+
+         Note:
+             Column naming conventions differ between implementations:
+
+             - Dataset (catalog): Uses underscore separator (e.g., "Image_Filename")
+             - DatasetBag (bag): Uses dot separator (e.g., "Image.Filename")
+
+         Example:
+             Process a large dataset without loading everything into memory::
+
+                 # Stream through rows one at a time
+                 for row in dataset.denormalize_as_dict(["Image", "Diagnosis"]):
+                     image_path = row["Image_Filename"]
+                     label = row["Diagnosis_Label"]
+                     # Process each image-label pair...
+
+                 # Or convert to list if you need random access
+                 rows = list(dataset.denormalize_as_dict(["Image", "Diagnosis"]))
+
+         See Also:
+             denormalize_as_dataframe: Returns all data as a pandas DataFrame.
+         """
+         ...
+
+
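Because both Dataset and DatasetBag satisfy DatasetLike, read-only helpers can be written once against the protocol and used with either implementation. A minimal sketch (the function name is illustrative; only protocol members are used):

    def summarize_members(ds: DatasetLike) -> dict[str, int]:
        # Count member records per element type; works on a live Dataset or a downloaded DatasetBag.
        members = ds.list_dataset_members()
        return {member_type: len(records) for member_type, records in members.items()}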
351
+ @runtime_checkable
+ class WritableDataset(DatasetLike, Protocol):
+     """Protocol defining write operations for datasets.
+
+     This protocol extends DatasetLike with write operations that are only
+     available on live catalog datasets. Downloaded bags (DatasetBag) are
+     immutable snapshots and do not implement these methods.
+
+     Use this protocol when you need to express that code requires the ability
+     to modify a dataset, not just read from it.
+
+     Example:
+         >>> def add_samples(dataset: WritableDataset, sample_rids: list[str]):
+         ...     dataset.add_dataset_members(sample_rids)
+         ...     dataset.increment_dataset_version(VersionPart.minor)
+     """
+
+     def add_dataset_members(self, members: list[RID]) -> None:
+         """Add members to the dataset.
+
+         Args:
+             members: List of RIDs to add to the dataset.
+         """
+         ...
+
+     def delete_dataset_members(
+         self,
+         members: list[RID],
+         description: str = "",
+         execution_rid: RID | None = None,
+     ) -> None:
+         """Remove members from the dataset.
+
+         Args:
+             members: List of RIDs to remove from the dataset.
+             description: Optional description of the removal operation.
+             execution_rid: Optional RID of execution associated with this operation.
+         """
+         ...
+
+     def increment_dataset_version(
+         self,
+         component: Any,
+         description: str | None = "",
+         execution_rid: RID | None = None,
+     ) -> DatasetVersion:
+         """Increment the dataset version.
+
+         Args:
+             component: Which version component to increment (major, minor, patch).
+             description: Optional description of the changes in this version.
+             execution_rid: Optional execution RID to associate with this version.
+
+         Returns:
+             The new version after incrementing.
+         """
+         ...
+
+     def download_dataset_bag(
+         self,
+         version: DatasetVersion | str | None = None,
+         use_minid: bool = False,
+     ) -> Any:
+         """Download the dataset as a BDBag.
+
+         Args:
+             version: Optional version to download. Defaults to current version.
+             use_minid: If True, upload the bag to S3 and create a MINID.
+                 Requires s3_bucket to be configured on the catalog. Defaults to False.
+
+         Returns:
+             DatasetBag containing the downloaded data.
+
+         Raises:
+             DerivaMLException: If use_minid=True but s3_bucket is not configured.
+         """
+         ...
+
+
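Since these protocols are decorated with @runtime_checkable, code handed a DatasetLike can guard write operations with an isinstance() check; note that runtime protocol checks only verify that the expected members exist, not their signatures. A sketch under that assumption (the function and variable names are illustrative):

    def append_members(ds: DatasetLike, new_members: list[RID]) -> None:
        # Downloaded bags are immutable snapshots, so writes require a live catalog dataset.
        if not isinstance(ds, WritableDataset):
            raise TypeError(f"Dataset {ds.dataset_rid} is read-only; writes need a live catalog Dataset")
        ds.add_dataset_members(new_members)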
430
+ @runtime_checkable
+ class AssetLike(Protocol):
+     """Protocol defining read-only interface for asset access.
+
+     This protocol defines the common read interface for accessing asset
+     metadata, types, and provenance. It parallels DatasetLike but for
+     individual file-based records rather than data collections.
+
+     Attributes:
+         asset_rid: Resource Identifier for the asset.
+         asset_table: Name of the asset table containing this asset.
+         filename: Original filename of the asset.
+         url: URL to access the asset file.
+         length: Size of the asset file in bytes.
+         md5: MD5 checksum of the asset file.
+         asset_types: Type(s) of the asset from Asset_Type vocabulary.
+         description: Description of the asset.
+         execution_rid: Optional execution RID that created the asset.
+     """
+
+     asset_rid: RID
+     asset_table: str
+     filename: str
+     url: str
+     length: int
+     md5: str
+     asset_types: list[str]
+     description: str
+     execution_rid: RID | None
+
+     def list_executions(self, asset_role: str | None = None) -> list[dict[str, Any]]:
+         """List all executions associated with this asset.
+
+         Args:
+             asset_role: Optional filter for asset role ('Input' or 'Output').
+
+         Returns:
+             List of records with Execution RID and Asset_Role.
+         """
+         ...
+
+     def find_features(self) -> Iterable[Feature]:
+         """Find features defined on this asset's table.
+
+         Returns:
+             Iterable of Feature objects.
+         """
+         ...
+
+     def list_feature_values(self, feature_name: str) -> list[FeatureRecord]:
+         """Get feature values for this specific asset.
+
+         Args:
+             feature_name: Name of the feature to query.
+
+         Returns:
+             List of FeatureRecord instances. Each record has:
+             - Execution: RID of the execution that created this feature value
+             - Feature_Name: Name of the feature
+             - All feature-specific columns as typed attributes
+             - model_dump() method to convert back to a dictionary
+         """
+         ...
+
+     def get_metadata(self) -> dict[str, Any]:
+         """Get all metadata for this asset from the catalog.
+
+         Returns:
+             Dictionary of all columns/values for this asset record.
+         """
+         ...
+
+     def get_chaise_url(self) -> str:
+         """Get the Chaise URL for viewing this asset in the web interface.
+
+         Returns:
+             URL to view this asset in Chaise.
+         """
+         ...
+
+
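A small sketch of read-only asset access through AssetLike (the helper name and report format are illustrative; only protocol members documented above are used):

    def describe_asset(asset: AssetLike) -> str:
        # Combine file metadata with provenance: executions that produced this asset.
        producers = asset.list_executions(asset_role="Output")
        return (
            f"{asset.filename}: {asset.length} bytes, md5={asset.md5}, "
            f"types={asset.asset_types}, produced by {len(producers)} execution(s)"
        )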
511
+ @runtime_checkable
+ class WritableAsset(AssetLike, Protocol):
+     """Protocol defining write operations for assets.
+
+     This protocol extends AssetLike with write operations that are only
+     available on live catalog assets. Downloaded assets are immutable
+     and do not implement these methods.
+     """
+
+     def add_asset_type(self, type_name: str) -> None:
+         """Add an asset type to this asset.
+
+         Args:
+             type_name: Name of the asset type vocabulary term.
+         """
+         ...
+
+     def remove_asset_type(self, type_name: str) -> None:
+         """Remove an asset type from this asset.
+
+         Args:
+             type_name: Name of the asset type vocabulary term.
+         """
+         ...
+
+
537
+ @runtime_checkable
+ class DerivaMLCatalogReader(Protocol):
+     """Protocol for read-only catalog operations.
+
+     This protocol defines the minimal interface for reading from a catalog,
+     implemented by both DerivaML (live catalog) and DerivaMLDatabase (downloaded bags).
+
+     Use this protocol when code only needs to read data and should work with
+     both live catalogs and downloaded bags.
+
+     Attributes:
+         ml_schema: Name of the ML schema (typically 'deriva-ml').
+         domain_schemas: Names of the domain-specific schemas.
+         model: The catalog model containing schema information.
+         cache_dir: Directory for caching downloaded data.
+         working_dir: Directory for working files.
+
+     Example:
+         >>> def analyze_dataset(catalog: DerivaMLCatalogReader, dataset_rid: str):
+         ...     dataset = catalog.lookup_dataset(dataset_rid)
+         ...     members = dataset.list_dataset_members()
+         ...     return len(members)
+     """
+
+     ml_schema: str
+     domain_schemas: frozenset[str]
+     default_schema: str | None
+     model: DerivaModel
+     cache_dir: Path
+     working_dir: Path
+
568
+     def lookup_dataset(self, dataset: RID | DatasetSpec, deleted: bool = False) -> DatasetLike:
+         """Look up a dataset by RID or specification.
+
+         Args:
+             dataset: RID or DatasetSpec identifying the dataset.
+             deleted: Whether to include deleted datasets.
+
+         Returns:
+             The dataset (Dataset for live catalogs, DatasetBag for bags).
+         """
+         ...
+
+     def find_datasets(self, deleted: bool = False) -> Iterable[DatasetLike]:
+         """Find all datasets in the catalog.
+
+         Args:
+             deleted: Whether to include deleted datasets.
+
+         Returns:
+             Iterable of all datasets.
+         """
+         ...
+
+     def lookup_term(self, table: str | Table, term_name: str) -> VocabularyTerm:
+         """Look up a vocabulary term.
+
+         Args:
+             table: Vocabulary table name or Table object.
+             term_name: Name of the term to look up.
+
+         Returns:
+             The vocabulary term.
+         """
+         ...
+
+     def get_table_as_dataframe(self, table: str) -> pd.DataFrame:
+         """Get table contents as a pandas DataFrame.
+
+         Args:
+             table: Name of the table to retrieve.
+
+         Returns:
+             DataFrame containing table contents.
+         """
+         ...
+
+     def get_table_as_dict(self, table: str) -> Iterable[dict[str, Any]]:
+         """Get table contents as dictionaries.
+
+         Args:
+             table: Name of the table to retrieve.
+
+         Returns:
+             Iterable of dictionaries for each row.
+         """
+         ...
+
+     def list_dataset_element_types(self) -> Iterable[Table]:
+         """List the types of elements that can be contained in datasets.
+
+         Returns:
+             Iterable of Table objects representing element types.
+         """
+         ...
+
+     def find_features(self, table: str | Table) -> Iterable[Feature]:
+         """Find features associated with a table.
+
+         Args:
+             table: Table to find features for.
+
+         Returns:
+             Iterable of Feature objects.
+         """
+         ...
+
+     def lookup_workflow(self, rid: RID) -> "Workflow":
+         """Look up a workflow by its Resource Identifier (RID).
+
+         Retrieves a workflow from the catalog by its RID. The returned Workflow
+         is bound to the catalog, allowing its description to be updated (on
+         writable catalogs).
+
+         Args:
+             rid: Resource Identifier of the workflow to look up.
+
+         Returns:
+             Workflow: The workflow object bound to this catalog.
+
+         Raises:
+             DerivaMLException: If the RID does not correspond to a workflow.
+
+         Example:
+             >>> workflow = catalog.lookup_workflow("2-ABC1")
+             >>> print(f"{workflow.name}: {workflow.description}")
+         """
+         ...
+
+     def find_workflows(self) -> Iterable["Workflow"]:
+         """Find all workflows in the catalog.
+
+         Returns all workflow definitions, each bound to the catalog for
+         potential modification.
+
+         Returns:
+             Iterable of Workflow objects.
+
+         Example:
+             >>> for workflow in catalog.find_workflows():
+             ...     print(f"{workflow.name}: {workflow.description}")
+         """
+         ...
+
+     def lookup_workflow_by_url(self, url_or_checksum: str) -> "Workflow":
+         """Look up a workflow by URL or checksum.
+
+         Searches for a workflow matching the given GitHub URL or Git object
+         hash (checksum) and returns a bound Workflow object.
+
+         Args:
+             url_or_checksum: GitHub URL with commit hash, or Git object hash.
+
+         Returns:
+             Workflow: The workflow object bound to this catalog.
+
+         Raises:
+             DerivaMLException: If no matching workflow is found.
+
+         Example:
+             >>> url = "https://github.com/org/repo/blob/abc123/workflow.py"
+             >>> workflow = catalog.lookup_workflow_by_url(url)
+             >>> print(f"{workflow.name}: {workflow.description}")
+         """
+         ...
+
+     def lookup_execution(self, execution_rid: RID) -> "ExecutionRecord":
+         """Look up an execution by RID.
+
+         Returns an ExecutionRecord for querying and modifying execution metadata.
+
+         Args:
+             execution_rid: Resource Identifier of the execution.
+
+         Returns:
+             ExecutionRecord: The execution record bound to this catalog.
+
+         Raises:
+             DerivaMLException: If the RID doesn't refer to an Execution.
+
+         Example:
+             >>> record = catalog.lookup_execution("2-ABC1")
+             >>> print(f"{record.status}: {record.description}")
+         """
+         ...
+
+     def find_executions(
+         self,
+         workflow: "Workflow | RID | None" = None,
+         workflow_type: str | None = None,
+         status: "Status | None" = None,
+     ) -> Iterable["ExecutionRecord"]:
+         """List all executions in the catalog.
+
+         Args:
+             workflow: Optional Workflow object or RID to filter by.
+             workflow_type: Optional workflow type name to filter by.
+             status: Optional status to filter by.
+
+         Returns:
+             Iterable of ExecutionRecord objects.
+
+         Example:
+             >>> for record in catalog.find_executions():
+             ...     print(f"{record.execution_rid}: {record.status}")
+             >>> # Filter by workflow type
+             >>> for record in catalog.find_executions(workflow_type="python_script"):
+             ...     print(f"{record.execution_rid}")
+         """
+         ...
+
+
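Code written against DerivaMLCatalogReader runs unchanged on a live DerivaML connection or on a bag-backed DerivaMLDatabase. A hedged sketch (the report function is illustrative and uses only methods and attributes documented above):

    def execution_summary(catalog: DerivaMLCatalogReader) -> list[str]:
        # Works for both live catalogs and downloaded bags.
        return [f"{record.execution_rid}: {record.status}" for record in catalog.find_executions()]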
750
+ class DerivaMLCatalog(DerivaMLCatalogReader, Protocol):
751
+ """Protocol for full catalog operations including writes.
752
+
753
+ This protocol extends DerivaMLCatalogReader with write operations and
754
+ is implemented by DerivaML (for live catalogs). It provides methods for:
755
+ - Schema and table access
756
+ - Dataset creation and modification
757
+ - Vocabulary term management
758
+ - Catalog snapshots and path building
759
+
760
+ Use this protocol when code needs to modify the catalog. For read-only
761
+ operations, prefer DerivaMLCatalogReader.
762
+
763
+ Attributes:
764
+ catalog: The underlying ERMrest catalog connection.
765
+ catalog_id: Catalog identifier or name.
766
+ s3_bucket: S3 bucket URL for dataset storage, or None if not configured.
767
+ use_minid: Whether MINID service is enabled for this catalog.
768
+
769
+ Example:
770
+ >>> def process_data(catalog: DerivaMLCatalog):
771
+ ... datasets = list(catalog.find_datasets())
772
+ ... for ds in datasets:
773
+ ... print(ds.description)
774
+ ... return datasets
775
+ """
776
+
777
+ catalog: ErmrestCatalog | ErmrestSnapshot
778
+ catalog_id: str | int
779
+ s3_bucket: str | None
780
+ use_minid: bool
781
+
782
+ def pathBuilder(self) -> SchemaWrapper:
783
+ """Get a path builder for constructing catalog queries.
784
+
785
+ Returns:
786
+ SchemaWrapper for building datapath queries.
787
+ """
788
+ ...
789
+
790
+ def catalog_snapshot(self, version_snapshot: str) -> Self:
791
+ """Create a view of the catalog at a specific snapshot time.
792
+
793
+ Args:
794
+ version_snapshot: Snapshot timestamp string.
795
+
796
+ Returns:
797
+ A new catalog instance bound to the snapshot.
798
+ """
799
+ ...
800
+
801
+ def resolve_rid(self, rid: RID) -> ResolveRidResult:
802
+ """Resolve a RID to its catalog location.
803
+
804
+ Args:
805
+ rid: Resource Identifier to resolve.
806
+
807
+ Returns:
808
+ Information about the RID's location in the catalog.
809
+ """
810
+ ...
811
+
812
+ def resolve_rids(
813
+ self,
814
+ rids: set[RID] | list[RID],
815
+ candidate_tables: list[Table] | None = None,
816
+ ) -> dict[RID, BatchRidResult]:
817
+ """Batch resolve multiple RIDs efficiently.
818
+
819
+ Resolves multiple RIDs in batched queries, significantly faster than
820
+ calling resolve_rid() for each RID individually.
821
+
822
+ Args:
823
+ rids: Set or list of RIDs to resolve.
824
+ candidate_tables: Optional list of Table objects to search in.
825
+ If not provided, searches all tables in domain and ML schemas.
826
+
827
+ Returns:
828
+ Mapping from each resolved RID to its BatchRidResult.
829
+ """
830
+ ...
831
+
832
+ def lookup_dataset(self, dataset: RID | DatasetSpec, deleted: bool = False) -> "Dataset":
833
+ """Look up a dataset by RID or specification.
834
+
835
+ Args:
836
+ dataset: RID or DatasetSpec identifying the dataset.
837
+ deleted: Whether to include deleted datasets.
838
+
839
+ Returns:
840
+ The dataset.
841
+ """
842
+ ...
843
+
844
+ def find_datasets(self, deleted: bool = False) -> Iterable["Dataset"]:
845
+ """Find all datasets in the catalog.
846
+
847
+ Args:
848
+ deleted: Whether to include deleted datasets.
849
+
850
+ Returns:
851
+ Iterable of all datasets.
852
+ """
853
+ ...
854
+
855
+ @property
856
+ def _dataset_table(self) -> Table:
857
+ """Get the Dataset table from the model.
858
+
859
+ Returns:
860
+ The Dataset table object.
861
+ """
862
+ ...
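When many RIDs need resolving, the batch resolver above is the intended pattern. A short sketch assuming a live catalog object that implements DerivaMLCatalog (the RID values are placeholders):

    rids = {"2-ABC1", "2-ABC2", "2-ABC3"}
    # One set of batched queries instead of a round trip per RID.
    resolved = catalog.resolve_rids(rids)
    for rid, result in resolved.items():
        print(rid, result)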