deriva-ml 1.17.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. deriva_ml/.DS_Store +0 -0
  2. deriva_ml/__init__.py +79 -0
  3. deriva_ml/bump_version.py +142 -0
  4. deriva_ml/core/__init__.py +39 -0
  5. deriva_ml/core/base.py +1527 -0
  6. deriva_ml/core/config.py +69 -0
  7. deriva_ml/core/constants.py +36 -0
  8. deriva_ml/core/definitions.py +74 -0
  9. deriva_ml/core/enums.py +222 -0
  10. deriva_ml/core/ermrest.py +288 -0
  11. deriva_ml/core/exceptions.py +28 -0
  12. deriva_ml/core/filespec.py +116 -0
  13. deriva_ml/dataset/__init__.py +12 -0
  14. deriva_ml/dataset/aux_classes.py +225 -0
  15. deriva_ml/dataset/dataset.py +1519 -0
  16. deriva_ml/dataset/dataset_bag.py +450 -0
  17. deriva_ml/dataset/history.py +109 -0
  18. deriva_ml/dataset/upload.py +439 -0
  19. deriva_ml/demo_catalog.py +495 -0
  20. deriva_ml/execution/__init__.py +26 -0
  21. deriva_ml/execution/environment.py +290 -0
  22. deriva_ml/execution/execution.py +1180 -0
  23. deriva_ml/execution/execution_configuration.py +147 -0
  24. deriva_ml/execution/workflow.py +413 -0
  25. deriva_ml/feature.py +228 -0
  26. deriva_ml/install_kernel.py +71 -0
  27. deriva_ml/model/__init__.py +0 -0
  28. deriva_ml/model/catalog.py +485 -0
  29. deriva_ml/model/database.py +719 -0
  30. deriva_ml/protocols/dataset.py +19 -0
  31. deriva_ml/run_notebook.py +228 -0
  32. deriva_ml/schema/__init__.py +3 -0
  33. deriva_ml/schema/annotations.py +473 -0
  34. deriva_ml/schema/check_schema.py +104 -0
  35. deriva_ml/schema/create_schema.py +393 -0
  36. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  37. deriva_ml/schema/policy.json +81 -0
  38. deriva_ml/schema/table_comments_utils.py +57 -0
  39. deriva_ml/test.py +94 -0
  40. deriva_ml-1.17.10.dist-info/METADATA +38 -0
  41. deriva_ml-1.17.10.dist-info/RECORD +45 -0
  42. deriva_ml-1.17.10.dist-info/WHEEL +5 -0
  43. deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
  44. deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
  45. deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
@@ -0,0 +1,495 @@
1
+ from __future__ import annotations
2
+
3
+ import atexit
4
+ import itertools
5
+ import logging
6
+ import os
7
+ import string
8
+ from collections.abc import Iterator, Sequence
9
+ from datetime import datetime
10
+ from numbers import Integral
11
+ from pathlib import Path
12
+ from random import choice, randint, random
13
+ from tempfile import TemporaryDirectory
14
+
15
+ from deriva.core import BaseCLI, ErmrestCatalog
16
+ from deriva.core.ermrest_model import Column, Schema, Table, builtin_types
17
+ from pydantic import BaseModel, ConfigDict
18
+ from requests.exceptions import HTTPError
19
+
20
+ from deriva_ml import DerivaML, MLVocab
21
+ from deriva_ml.core.definitions import RID, BuiltinTypes, ColumnDefinition
22
+ from deriva_ml.dataset.aux_classes import DatasetVersion
23
+ from deriva_ml.execution.execution import Execution, Workflow
24
+ from deriva_ml.execution.execution_configuration import ExecutionConfiguration
25
+ from deriva_ml.schema import (
26
+ create_ml_catalog,
27
+ )
28
+ from deriva_ml.schema.annotations import catalog_annotation
29
+
30
+ try:
31
+ from icecream import ic
32
+
33
+ ic.configureOutput(includeContext=True)
34
+ except ImportError: # Graceful fallback if IceCream isn't installed.
35
+ ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
36
+
37
+
38
+ TEST_DATASET_SIZE = 12
39
+
40
+
41
def populate_demo_catalog(ml_instance: DerivaML) -> None:
    """Populate the demo catalog with test subjects and image assets.

    Inserts ``TEST_DATASET_SIZE`` subjects into the domain schema's Subject
    table, then runs a small workflow execution that writes one text file per
    subject into the "Image" asset table and uploads the results.

    Args:
        ml_instance: Connected DerivaML instance whose catalog is populated.
    """
    # Insert test subjects via the datapath API.
    # NOTE(review): the original comment here said "Delete any vocabularies
    # and features", but the code below only inserts rows — nothing is deleted.
    domain_schema = ml_instance.pathBuilder.schemas[ml_instance.domain_schema]
    subject = domain_schema.tables["Subject"]
    ss = subject.insert([{"Name": f"Thing{t + 1}"} for t in range(TEST_DATASET_SIZE)])

    # Register the workflow type before referencing it in the Workflow record.
    ml_instance.add_term(
        MLVocab.workflow_type,
        "Demo Catalog Creation",
        description="A workflow demonstrating how to create a demo catalog.",
    )
    workflow = Workflow(
        name="Demo Catalog",
        workflow_type="Demo Catalog Creation",
        url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/src/deriva_ml/demo_catalog.py",
        version="1.0.0",
        checksum="27",
        git_root=Path(),
    )
    execution = ml_instance.create_execution(ExecutionConfiguration(workflow=workflow))

    # Create one small text "image" asset per inserted subject.
    with execution.execute() as e:
        for s in ss:
            image_file = e.asset_file_path(
                "Image",
                f"test_{s['RID']}.txt",
                Subject=s["RID"],
                Acquisition_Time=datetime.now(),
                Acquisition_Date=datetime.now().date(),
            )
            with image_file.open("w") as f:
                f.write(f"Hello there {random()}\n")
    execution.upload_execution_outputs()
74
+
75
+
76
class DatasetDescription(BaseModel):
    """Specification of — and result record for — a possibly nested dataset."""

    types: list[str]  # Dataset_Type vocabulary terms applied to the dataset.
    description: str  # Human-readable description of the dataset.
    # Per member table name: either the number of existing elements to add,
    # or a list of nested DatasetDescription specs to create as child datasets.
    members: dict[str, int | list[DatasetDescription]]
    member_rids: dict[str, list[RID]] = {}  # RIDs of the members that were added.
    version: DatasetVersion = DatasetVersion(1, 0, 0)  # The initial version.
    rid: RID = None  # RID of the dataset after creation (None until created).

    # RID / DatasetVersion are not pydantic-native types.
    model_config = ConfigDict(arbitrary_types_allowed=True)
87
+
88
+
89
def create_datasets(
    client: Execution,
    spec: DatasetDescription,
    member_rids: dict[str, Iterator[RID]],
) -> DatasetDescription:
    """Create a dataset per `spec`, then add child members.

    Members are attached either by slicing off pre-generated RIDs (when the
    spec gives a count) or by recursing on nested DatasetDescription specs.

    Args:
        client: Execution used to create datasets and attach members.
        spec: Description of the dataset (and nested datasets) to create.
        member_rids: Per member-type iterators of available RIDs; each call
            consumes exactly as many RIDs as the spec requests.

    Returns:
        A DatasetDescription mirroring `spec` with `rid` and `member_rids`
        filled in for the created dataset tree.

    Raises:
        ValueError: If an iterator in `member_rids` runs out of RIDs.
        TypeError: If a `spec.members` value is neither an int nor a list.
    """
    dataset_rid = client.create_dataset(
        dataset_types=spec.types,
        description=spec.description,
        version=spec.version,
    )

    result_spec = DatasetDescription(
        description=spec.description,
        members={},
        types=spec.types,
        rid=dataset_rid,
        version=spec.version,
    )
    dataset_rids = {}
    for member_type, value in spec.members.items():
        if isinstance(value, Sequence) and not isinstance(value, (str, bytes)):
            # Nested datasets: recurse, recording both the child spec and RID.
            nested_specs: list[DatasetDescription] = list(value)
            rids: list[RID] = []
            for child_spec in nested_specs:
                child_ds = create_datasets(client, child_spec, member_rids)
                result_spec.members.setdefault(member_type, []).append(child_ds)
                rids.append(child_ds.rid)
        elif isinstance(value, Integral):
            count = int(value)
            # take exactly `count` RIDs (or an empty list if count <= 0)
            rids = list(itertools.islice(member_rids[member_type], count))
            # Raise instead of assert: `assert` is stripped under `python -O`,
            # and an exhausted RID iterator is a genuine runtime error.
            if len(rids) != count:
                raise ValueError(f"Expected {count} RIDs, got {len(rids)}")
            result_spec.members[member_type] = count
        else:
            raise TypeError(
                f"Expected spec.members['{member_type}'] to be either an int or a list, got {type(value).__name__!r}"
            )

        # attach and record
        if rids:
            dataset_rids[member_type] = rids
            result_spec.member_rids.setdefault(member_type, []).extend(rids)
    client.add_dataset_members(dataset_rid, dataset_rids, description="Added by create_datasets")

    return result_spec
138
+
139
+
140
def dataset_spec() -> DatasetDescription:
    """Build the canonical double-nested dataset specification used by the demo.

    Returns:
        A DatasetDescription whose members are two nested "Testing" datasets,
        each of which in turn contains two leaf datasets of 2 subjects and
        2 images of its own.
    """
    # Leaf-level dataset: just two subjects, no type terms.
    leaf = DatasetDescription(
        description="A dataset",
        members={"Subject": 2},
        types=[],
    )

    # Two structurally identical mid-level datasets (training and testing),
    # each nesting the leaf twice and adding two images.
    mid_level = [
        DatasetDescription(
            description="A dataset that is nested",
            members={"Dataset": [leaf, leaf], "Image": 2},
            types=["Testing"],
        )
        for _ in range(2)
    ]

    # Top-level dataset wrapping both mid-level datasets.
    return DatasetDescription(
        description="A dataset that is double nested",
        members={"Dataset": mid_level},
        types=["Complete"],
    )
165
+
166
+
167
def create_demo_datasets(ml_instance: DerivaML) -> DatasetDescription:
    """Create datasets from a populated catalog.

    Registers Subject and Image as dataset element types, defines the
    Dataset_Type vocabulary terms, then builds the nested dataset tree
    described by dataset_spec() inside a workflow execution.

    Args:
        ml_instance: Connected DerivaML instance whose catalog already
            contains Subject and Image rows (see populate_demo_catalog).

    Returns:
        The DatasetDescription describing the created (nested) datasets.
    """
    ml_instance.add_dataset_element_type("Subject")
    ml_instance.add_dataset_element_type("Image")

    _type_rid = ml_instance.add_term("Dataset_Type", "Complete", synonyms=["Whole"], description="A test")
    _training_rid = ml_instance.add_term("Dataset_Type", "Training", synonyms=["Train"], description="A training set")
    _testing_rid = ml_instance.add_term("Dataset_Type", "Testing", description="A testing set")

    # Collect the RIDs of all existing subjects and images to use as members.
    table_path = ml_instance.catalog.getPathBuilder().schemas[ml_instance.domain_schema].tables["Subject"]
    subject_rids = [i["RID"] for i in table_path.entities().fetch()]
    table_path = ml_instance.catalog.getPathBuilder().schemas[ml_instance.domain_schema].tables["Image"]
    image_rids = [i["RID"] for i in table_path.entities().fetch()]

    ml_instance.add_term(
        MLVocab.workflow_type,
        "Create Dataset Workflow",
        description="A Workflow that creates a new dataset.",
    )
    dataset_workflow = ml_instance.create_workflow(name="API Workflow", workflow_type="Create Dataset Workflow")

    dataset_execution = ml_instance.create_execution(
        ExecutionConfiguration(workflow=dataset_workflow, description="Create Dataset")
    )

    with dataset_execution.execute() as exe:
        spec = dataset_spec()
        # Fresh iterators let create_datasets slice off member RIDs without reuse.
        dataset = create_datasets(exe, spec, {"Subject": iter(subject_rids), "Image": iter(image_rids)})
    return dataset
196
+
197
+
198
def create_demo_features(ml_instance: DerivaML) -> None:
    """Create demo vocabularies and features, then populate feature values.

    Defines the SubjectHealth and ImageQuality vocabularies and a BoundingBox
    asset table; creates three features (Subject/Health, Image/BoundingBox,
    Image/Quality); then records randomly generated feature values through a
    workflow execution and uploads the results.

    Args:
        ml_instance: Connected DerivaML instance whose catalog already has
            Subject and Image rows.
    """
    ml_instance.create_vocabulary("SubjectHealth", "A vocab")
    ml_instance.add_term(
        "SubjectHealth",
        "Sick",
        description="The subject self reports that they are sick",
    )
    ml_instance.add_term(
        "SubjectHealth",
        "Well",
        description="The subject self reports that they feel well",
    )
    ml_instance.create_vocabulary("ImageQuality", "Controlled vocabulary for image quality")
    ml_instance.add_term("ImageQuality", "Good", description="The image is good")
    ml_instance.add_term("ImageQuality", "Bad", description="The image is bad")
    box_asset = ml_instance.create_asset("BoundingBox", comment="A file that contains a cropped version of a image")

    # Feature definitions; Health carries an optional int2 "Scale" metadata column.
    ml_instance.create_feature(
        "Subject",
        "Health",
        terms=["SubjectHealth"],
        metadata=[ColumnDefinition(name="Scale", type=BuiltinTypes.int2, nullok=True)],
        optional=["Scale"],
    )
    ml_instance.create_feature("Image", "BoundingBox", assets=[box_asset])
    ml_instance.create_feature("Image", "Quality", terms=["ImageQuality"])

    # Record classes used to construct the feature-value rows below.
    ImageQualityFeature = ml_instance.feature_record_class("Image", "Quality")
    ImageBoundingboxFeature = ml_instance.feature_record_class("Image", "BoundingBox")
    SubjectWellnessFeature = ml_instance.feature_record_class("Subject", "Health")

    # Get the workflow for this notebook

    ml_instance.add_term(
        MLVocab.workflow_type,
        "Feature Notebook Workflow",
        description="A Workflow that uses Deriva ML API",
    )
    ml_instance.add_term(MLVocab.asset_type, "API_Model", description="Model for our Notebook workflow")
    notebook_workflow = ml_instance.create_workflow(name="API Workflow", workflow_type="Feature Notebook Workflow")

    feature_execution = ml_instance.create_execution(
        ExecutionConfiguration(workflow=notebook_workflow, description="Our Sample Workflow instance")
    )

    subject_rids = [i["RID"] for i in ml_instance.domain_path.tables["Subject"].entities().fetch()]
    image_rids = [i["RID"] for i in ml_instance.domain_path.tables["Image"].entities().fetch()]
    # NOTE(review): this list is built but never passed to add_features below;
    # only the later `subject_feature_list` (which omits Execution=...) is used.
    # Looks like dead code — confirm intent before removing.
    _subject_feature_list = [
        SubjectWellnessFeature(
            Subject=subject_rid,
            Execution=feature_execution.execution_rid,
            SubjectHealth=choice(["Well", "Sick"]),
            Scale=randint(1, 10),
        )
        for subject_rid in subject_rids
    ]

    # Create a new set of images. For fun, lets wrap this in an execution so we get status updates
    bounding_box_files = []
    for i in range(10):
        bounding_box_file = feature_execution.asset_file_path("BoundingBox", f"box{i}.txt")
        with bounding_box_file.open("w") as fp:
            fp.write(f"Hi there {i}")
        bounding_box_files.append(bounding_box_file)

    # Cycle the 10 box files across however many images exist.
    image_bounding_box_feature_list = [
        ImageBoundingboxFeature(
            Image=image_rid,
            BoundingBox=asset_name,  # presumably the local asset file path — confirm
        )
        for image_rid, asset_name in zip(image_rids, itertools.cycle(bounding_box_files))
    ]

    image_quality_feature_list = [
        ImageQualityFeature(
            Image=image_rid,
            ImageQuality=choice(["Good", "Bad"]),
        )
        for image_rid in image_rids
    ]

    subject_feature_list = [
        SubjectWellnessFeature(
            Subject=subject_rid,
            SubjectHealth=choice(["Well", "Sick"]),
            Scale=randint(1, 10),
        )
        for subject_rid in subject_rids
    ]

    # Record all feature values within the execution, then upload.
    with feature_execution.execute() as execution:
        execution.add_features(image_bounding_box_feature_list)
        execution.add_features(image_quality_feature_list)
        execution.add_features(subject_feature_list)

    feature_execution.upload_execution_outputs()
294
+
295
+
296
def create_demo_files(ml_instance: DerivaML):
    """Create demo files for testing purposes.

    Creates a `test_dir` tree (with subdirectories `d1` and `d2`) under the
    instance's working directory and writes five small files with random
    contents and a random `.txt`/`.jpeg` extension into each of the three
    directories, then registers the "File Test Workflow" term.

    Args:
        ml_instance: The DerivaML instance to create files for.

    Returns:
        None. Creates files in the working directory.
    """

    def random_string(length: int) -> str:
        """Generate a random string of specified length.

        Args:
            length: The length of the string to generate.

        Returns:
            A random string of the specified length.
        """
        # BUG FIX: the module does `from random import choice, randint, random`,
        # so `random` is a function here and `random.choice(...)` raised
        # AttributeError. Use the imported `choice` directly.
        return "".join(choice(string.ascii_letters) for _ in range(length))

    test_dir = ml_instance.working_dir / "test_dir"
    test_dir.mkdir(parents=True, exist_ok=True)
    d1 = test_dir / "d1"
    d1.mkdir(parents=True, exist_ok=True)
    d2 = test_dir / "d2"
    d2.mkdir(parents=True, exist_ok=True)

    # Create some demo files
    for d in [test_dir, d1, d2]:
        for i in range(5):
            # Same fix as above: `choice`, not `random.choice`.
            fname = Path(d) / f"file{i}.{choice(['txt', 'jpeg'])}"
            with fname.open("w") as f:
                f.write(random_string(10))
    ml_instance.add_term(MLVocab.workflow_type, "File Test Workflow", description="Test workflow")
331
+
332
+
333
def create_domain_schema(catalog: ErmrestCatalog, sname: str) -> None:
    """Create a fresh domain schema containing Subject and Image tables.

    Assumes that the ml-schema has already been created. Any existing schema
    with the same name is dropped (with cascade) first.

    Args:
        catalog: Catalog in which to (re)create the domain schema.
        sname: Name of the domain schema to create.

    Returns:
        None.
    """
    model = catalog.getCatalogModel()
    # Fail fast (KeyError) if the deriva-ml schema has not been created yet.
    _ = model.schemas["deriva-ml"]

    try:
        model.schemas[sname].drop(cascade=True)
    except KeyError:
        pass  # Schema not present in the local model; nothing to drop.
    except HTTPError as e:
        print(e)
        # Server-side "does not exist" is also fine; anything else re-raises.
        if f"Schema {sname} does not exist" in str(e):
            pass
        else:
            raise e

    domain_schema = model.create_schema(Schema.define(sname, annotations={"name_style": {"underline_space": True}}))
    subject_table = domain_schema.create_table(
        Table.define("Subject", column_defs=[Column.define("Name", builtin_types.text)])
    )
    # Use a throwaway DerivaML instance (temp working dir) to create the Image
    # asset table referencing Subject, then apply the catalog annotations.
    with TemporaryDirectory() as tmpdir:
        ml_instance = DerivaML(hostname=catalog.deriva_server.server, catalog_id=catalog.catalog_id, working_dir=tmpdir)
        ml_instance.create_asset(
            "Image",
            column_defs=[
                Column.define("Acquisition_Time", builtin_types.timestamp),
                Column.define("Acquisition_Date", builtin_types.date),
            ],
            referenced_tables=[subject_table],
        )
        catalog_annotation(ml_instance.model)
368
+
369
+
370
def destroy_demo_catalog(catalog):
    """Permanently delete the demo catalog.

    Args:
        catalog: The ErmrestCatalog instance to destroy.

    Returns:
        None. The catalog is removed from the server.
    """
    # `really=True` is the deriva confirmation flag for irreversible deletion.
    catalog.delete_ermrest_catalog(really=True)
380
+
381
+
382
def create_demo_catalog(
    hostname,
    domain_schema="demo-schema",
    project_name="ml-test",
    populate=True,
    create_features=False,
    create_datasets=False,
    on_exit_delete=True,
    logging_level=logging.WARNING,
) -> ErmrestCatalog:
    """Create a new demo catalog on `hostname` and optionally populate it.

    Args:
        hostname: Server on which to create the catalog.
        domain_schema: Name of the domain schema to create.
        project_name: Project name recorded in the ML schema.
        populate: Insert demo subjects/images when True.
        create_features: Also create demo features (forces population).
        create_datasets: Also create demo datasets (forces population).
        on_exit_delete: Register an atexit hook that destroys the catalog.
        logging_level: Logging level passed to DerivaML.

    Returns:
        The newly created ErmrestCatalog.

    Raises:
        Exception: Whatever the population steps raise; the catalog is
            deleted before the exception is re-raised.
    """
    test_catalog = create_ml_catalog(hostname, project_name=project_name)
    if on_exit_delete:
        atexit.register(destroy_demo_catalog, test_catalog)

    # BUG FIX: the original chdir'd into the temporary directory and never
    # returned, leaving the process inside a directory that is deleted when
    # the TemporaryDirectory context exits. Save and restore the cwd.
    original_cwd = os.getcwd()
    try:
        with TemporaryDirectory() as tmpdir:
            os.chdir(tmpdir)  # Do this so we don't get confused if running from a GitHub repo.
            try:
                create_domain_schema(test_catalog, domain_schema)

                ml_instance = DerivaML(
                    hostname,
                    catalog_id=test_catalog.catalog_id,
                    domain_schema=domain_schema,
                    working_dir=tmpdir,
                    logging_level=logging_level,
                )
                # NOTE: the `create_datasets` parameter shadows the module-level
                # function of the same name; the demo helpers are called below.
                if populate or create_features or create_datasets:
                    populate_demo_catalog(ml_instance)
                    if create_features:
                        create_demo_features(ml_instance)
                    if create_datasets:
                        create_demo_datasets(ml_instance)
            finally:
                os.chdir(original_cwd)  # Restore before tmpdir is removed.
    except Exception:
        # on failure, delete catalog and re-raise exception
        test_catalog.delete_ermrest_catalog(really=True)
        raise
    return test_catalog
420
+
421
+
422
class DemoML(DerivaML):
    """DerivaML subclass preconfigured for a demo catalog (project "ml-test")."""

    def __init__(
        self,
        hostname,
        catalog_id,
        cache_dir: str | None = None,
        working_dir: str | None = None,
        use_minid=True,
    ):
        """Connect to an existing demo catalog.

        Args:
            hostname: Host on which the demo catalog lives.
            catalog_id: Identifier of the demo catalog.
            cache_dir: Optional directory for cached downloads.
            working_dir: Optional working directory for execution files.
            use_minid: Forwarded to DerivaML — presumably controls MINID use
                for dataset bags; confirm against the DerivaML constructor.
        """
        super().__init__(
            hostname=hostname,
            catalog_id=catalog_id,
            project_name="ml-test",
            cache_dir=cache_dir,
            working_dir=working_dir,
            use_minid=use_minid,
        )
439
+
440
+
441
class DerivaMLDemoCatalogCLI(BaseCLI):
    """Command-line tool that parses arguments and creates a demo catalog."""

    def __init__(self, description, epilog, **kwargs):
        """Initialize the CLI and register its arguments.

        Args:
            description: Program description shown in --help.
            epilog: Text displayed after the argument help.
            **kwargs: Additional options forwarded to BaseCLI.
        """
        BaseCLI.__init__(self, description, epilog, **kwargs)
        # Optional domain schema name for the demo catalog (default: demo-schema).
        self.parser.add_argument(
            "--domain_schema",
            type=str,
            default="demo-schema",
            help="Name of the domain schema to create/use for the demo catalog (default: demo-schema).",
        )

    @staticmethod
    def _coerce_number(val: str):
        """Try to convert a string to int, then float; otherwise return str.

        Args:
            val: The string to convert.

        Returns:
            An int, a float, or the original string.
        """
        # NOTE(review): not referenced anywhere visible in this file —
        # confirm it is used elsewhere before removing.
        try:
            return int(val)
        except ValueError:
            try:
                return float(val)
            except ValueError:
                return val

    def main(self) -> ErmrestCatalog:
        """Parse arguments and set up execution environment.

        Returns:
            The demo ErmrestCatalog created from the command-line arguments.

        Raises:
            ValueError: If no host was specified on the command line.
        """
        args = self.parse_cli()
        if not args.host:
            raise ValueError("Host must be specified.")
        demo_catalog = create_demo_catalog(args.host, args.domain_schema)
        return demo_catalog
474
+
475
+
476
def main() -> None:
    """Main entry point for the demo catalog CLI.

    Creates and runs a DerivaMLDemoCatalogCLI instance, then prints the URI
    of the catalog it created.

    Returns:
        None. Executes the CLI.
    """
    cli = DerivaMLDemoCatalogCLI(description="Create a Deriva ML Sample Catalog", epilog="")
    catalog = cli.main()
    # NOTE(review): reads the private `_server_uri` attribute of the catalog
    # object — check whether a public accessor exists.
    print("Created catalog: {}".format(catalog._server_uri))
487
+
488
+
489
if __name__ == "__main__":
    # Script entry point: print a friendly message instead of a traceback.
    try:
        main()
    except Exception as e:
        print("Error creating catalog:")
        print(e)
        # `raise SystemExit` instead of the site-provided `exit()` helper,
        # which is not guaranteed to exist (e.g. under `python -S` or in
        # frozen applications).
        raise SystemExit(1)
@@ -0,0 +1,26 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ # Safe imports - no circular dependencies
4
+ from deriva_ml.execution.execution_configuration import AssetRIDConfig, ExecutionConfiguration
5
+ from deriva_ml.execution.workflow import Workflow
6
+
7
+ if TYPE_CHECKING:
8
+ from deriva_ml.execution.execution import Execution
9
+
10
+
11
+ # Lazy import for runtime
12
def __getattr__(name):
    """Module-level lazy attribute hook (PEP 562).

    Defers importing ``Execution`` until first access so that importing this
    package does not trigger the circular dependency with
    ``deriva_ml.execution.execution``.
    """
    # Guard clause: anything other than "Execution" is an ordinary miss.
    if name != "Execution":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    from deriva_ml.execution.execution import Execution

    return Execution
19
+
20
+
21
+ __all__ = [
22
+ "Execution", # Lazy-loaded
23
+ "ExecutionConfiguration",
24
+ "Workflow",
25
+ "AssetRIDConfig",
26
+ ]