deriva-ml 1.10.1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
-"""Ths module contains the definition of the DatabaseModel class. The role of this class is to provide an nterface between the BDBag representation
+"""Ths module contains the definition of the DatabaseModel class. The role of this class is to provide an interface between the BDBag representation
 of a dataset and a sqllite database in which the contents of the bag are stored.
 """
 
@@ -51,7 +51,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
     appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
     into DatabaseModels, is kept in the class variable `_rid_map`.
 
-    Because you can load diffent versions of a dataset simultaniously, the dataset RID and version number are tracked, and a new
+    Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked, and a new
     sqllite instance is created for every new dataset version present.
 
     Attributes:
@@ -290,6 +290,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         return DatasetBag(self, dataset_rid or self.dataset_rid)
 
     def dataset_version(self, dataset_rid: Optional[RID] = None) -> DatasetVersion:
+        """Return the version of the specified dataset."""
         if dataset_rid and dataset_rid not in self.bag_rids:
             DerivaMLException(f"Dataset RID {dataset_rid} is not in model.")
         return self.bag_rids[dataset_rid]
deriva_ml/dataset.py CHANGED
@@ -232,12 +232,10 @@ class Dataset:
         """Increment the version of the specified dataset_table.
 
         Args:
-            dataset_rid: RID to a dataset_table
-            component: Which version of the dataset_table to increment.
-            dataset_rid: RID of the dataset whose version is to be incremented.
-            component: Major, Minor or Patch
-            description: Description of the version update of the dataset_table.
-            execution_rid: Which execution is performing increment.
+            dataset_rid: RID of the dataset whose version is to be incremented.
+            component: Which version of the dataset_table to increment. Major, Minor or Patch
+            description: Description of the version update of the dataset_table.
+            execution_rid: Which execution is performing increment.
 
         Returns:
             new semantic version of the dataset_table as a 3-tuple
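The Major/Minor/Patch components follow semantic-versioning conventions. A minimal sketch of what such a bump does, assuming a plain (major, minor, patch) 3-tuple with an illustrative helper rather than deriva-ml's actual DatasetVersion class:

    # Illustration only: semantic-version bump over a (major, minor, patch) 3-tuple.
    def bump(version: tuple[int, int, int], component: str) -> tuple[int, int, int]:
        major, minor, patch = version
        if component == "Major":
            return (major + 1, 0, 0)      # breaking change: reset minor and patch
        if component == "Minor":
            return (major, minor + 1, 0)  # additive change: reset patch
        if component == "Patch":
            return (major, minor, patch + 1)
        raise ValueError(f"Unknown version component: {component}")

    assert bump((1, 0, 0), "Minor") == (1, 1, 0)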
@@ -275,9 +273,6 @@ class Dataset:
             description: Description of the dataset_table.
             execution_rid: Execution under which the dataset_table will be created.
             version: Version of the dataset_table.
-            type: str | list[str]:
-            description: str:
-
 
         Returns:
             New dataset_table RID.
@@ -349,7 +344,6 @@ class Dataset:
         Args:
             dataset_rid: RID of the dataset_table to delete.
             recurse: If True, delete the dataset_table along with any nested datasets. (Default value = False)
-            dataset_rid: RID:
         """
         # Get association table entries for this dataset_table
         # Delete association table entries
@@ -397,7 +391,7 @@ class Dataset:
             filtered_path = dataset_path
         else:
             filtered_path = dataset_path.filter(
-                (dataset_path.Deleted == False) | (dataset_path.Deleted == None)
+                (dataset_path.Deleted == False) | (dataset_path.Deleted == None)  # noqa: E712
             )
 
         # Get a list of all the dataset_type values associated with this dataset_table.
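The added `# noqa: E712` is needed because flake8 flags `== False` / `== None` comparisons, yet datapath-style query builders overload `__eq__` to construct a filter predicate, so the `is` comparisons flake8 would suggest could not build a filter at all. A generic illustration of the pattern (a made-up `Col` class, not deriva's actual datapath objects):

    # Expression-builder APIs return a predicate object from __eq__, not a bool,
    # which is why "== False" is intentional here.
    class Col:
        def __init__(self, name: str):
            self.name = name

        def __eq__(self, other):  # returns a predicate object, not a bool
            return f"{self.name} = {other!r}"

    print(Col("Deleted") == False)  # noqa: E712  -> prints: Deleted = False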
@@ -439,8 +433,7 @@ class Dataset:
         routine makes it possible to add objects from the specified table to a dataset_table.
 
         Args:
-            element: Name or the table or table object that is to be added to the dataset_table.
-            element: str | Table:
+            element: Name of the table or table object that is to be added to the dataset_table.
 
         Returns:
             The table object that was added to the dataset_table.
@@ -464,7 +457,6 @@ class Dataset:
 
         Args:
             dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
-            dataset_rid: RID:
             recurse: (Default value = False)
             limit: If provided, the maximum number of members to return for each element type.
@@ -677,7 +669,6 @@ class Dataset:
 
         Args:
             dataset_rid: return: RID of the parent dataset_table.
-            dataset_rid: RID:
 
         Returns:
             RID of the parent dataset_table.
@@ -805,7 +796,7 @@ class Dataset:
         dataset_elements = [
             snapshot_catalog._model.name_to_table(e)
             for e, m in snapshot_catalog.list_dataset_members(
-                dataset_rid=dataset_rid, limit=1
+                dataset_rid=dataset_rid,  # limit=1 Limit seems to make things run slow.
             ).items()
             if m
         ]
deriva_ml/dataset_bag.py CHANGED
@@ -168,7 +168,7 @@ class DatasetBag:
             yield dict(zip(col_names, row))
 
     @validate_call
-    def list_dataset_members(self, recurse: bool = False) -> dict[str, list[tuple]]:
+    def list_dataset_members(self, recurse: bool = False) -> dict[str, dict[str, list]]:
         """Return a list of entities associated with a specific _dataset_table.
 
         Args:
@@ -206,12 +206,19 @@
             )
 
             with self.database as db:
+                col_names = [
+                    c[1]
+                    for c in db.execute(f'PRAGMA table_info("{sql_target}")').fetchall()
+                ]
+                select_cols = ",".join([f'"{sql_target}".{c}' for c in col_names])
                 sql_cmd = (
-                    f'SELECT * FROM "{sql_member}" '
+                    f'SELECT {select_cols} FROM "{sql_member}" '
                     f'JOIN "{sql_target}" ON "{sql_member}".{member_link[0]} = "{sql_target}".{member_link[1]} '
                     f'WHERE "{self.dataset_rid}" = "{sql_member}".Dataset;'
                 )
-                target_entities = db.execute(sql_cmd).fetchall()
+                target_entities = [
+                    dict(zip(col_names, e)) for e in db.execute(sql_cmd).fetchall()
+                ]
                 members[target_table.name].extend(target_entities)
 
         target_entities = []  # path.entities().fetch()
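The rewritten query enumerates the target table's columns with `PRAGMA table_info` and selects exactly those columns, so each joined row can be zipped into a dict keyed by column name; a `SELECT *` over the join would also pull in the member table's columns. A self-contained sketch of the same technique against an in-memory SQLite database (table and column names here are illustrative, not deriva-ml's schema):

    import sqlite3

    # PRAGMA table_info returns rows of (cid, name, type, notnull, dflt_value, pk),
    # so c[1] is the column name.
    db = sqlite3.connect(":memory:")
    db.execute("CREATE TABLE Image (RID TEXT, Filename TEXT)")
    db.execute("INSERT INTO Image VALUES ('1-abc', 'scan.png')")

    col_names = [c[1] for c in db.execute('PRAGMA table_info("Image")').fetchall()]
    select_cols = ",".join(f'"Image".{c}' for c in col_names)
    rows = [
        dict(zip(col_names, r))
        for r in db.execute(f'SELECT {select_cols} FROM "Image"').fetchall()
    ]
    print(rows)  # [{'RID': '1-abc', 'Filename': 'scan.png'}]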
deriva_ml/demo_catalog.py CHANGED
@@ -2,9 +2,7 @@ import atexit
 from importlib.metadata import version
 from importlib.resources import files
 import logging
-from random import random, randint
-import tempfile
-from tempfile import TemporaryDirectory
+from random import randint, random
 from typing import Optional
 import itertools
 
@@ -12,7 +10,6 @@ from deriva.config.acl_config import AclConfig
 from deriva.core import DerivaServer
 from deriva.core import ErmrestCatalog, get_credential
 from deriva.core.datapath import DataPathException
-from deriva.core.ermrest_model import Model
 from deriva.core.ermrest_model import builtin_types, Schema, Table, Column
 from requests import HTTPError
 
@@ -35,48 +32,51 @@ TEST_DATASET_SIZE = 4
 def reset_demo_catalog(deriva_ml: DerivaML, sname: str):
     model = deriva_ml.model
     for trial in range(3):
-        for t in [
-            v
-            for v in model.schemas[sname].tables.values()
-            if v.name not in {"Subject", "Image"}
-        ]:
+        for t in [v for v in model.schemas[sname].tables.values()]:
             try:
                 t.drop()
             except HTTPError:
                 pass
-
+    model.schemas[sname].drop()
     # Empty out remaining tables.
     pb = deriva_ml.pathBuilder
     retry = True
     while retry:
-        retry = False
-        for s in [sname, "deriva-ml"]:
-            for t in pb.schemas[s].tables.values():
-                for e in t.entities().fetch():
-                    try:
-                        t.filter(t.RID == e["RID"]).delete()
-                    except DataPathException:  # FK constraint.
-                        retry = True
-
+        for t in pb.schemas["deriva-ml"].tables.values():
+            for e in t.entities().fetch():
+                try:
+                    t.filter(t.RID == e["RID"]).delete()
+                except DataPathException:  # FK constraint.
+                    retry = True
     initialize_ml_schema(model, "deriva-ml")
+    create_domain_schema(deriva_ml, sname)
 
 
 def populate_demo_catalog(deriva_ml: DerivaML, sname: str) -> None:
     # Delete any vocabularies and features.
-    reset_demo_catalog(deriva_ml, sname)
     domain_schema = deriva_ml.catalog.getPathBuilder().schemas[sname]
     subject = domain_schema.tables["Subject"]
     ss = subject.insert([{"Name": f"Thing{t + 1}"} for t in range(TEST_DATASET_SIZE)])
-
-    with TemporaryDirectory() as tmpdir:
-        image_dir = deriva_ml.asset_dir("Image", prefix=tmpdir)
+    deriva_ml.add_term(
+        MLVocab.workflow_type,
+        "Demo Catalog Creation",
+        description="A workflow demonstrating how to create a demo catalog.",
+    )
+    execution = deriva_ml.create_execution(
+        ExecutionConfiguration(
+            workflow=deriva_ml.create_workflow(
+                name="Demo Catalog", workflow_type="Demo Catalog Creation"
+            )
+        )
+    )
+    with execution.execute() as e:
         for s in ss:
-            image_file = image_dir.create_file(
-                f"test_{s['RID']}.txt", {"Subject": s["RID"]}
+            image_file = e.asset_file_path(
+                "Image", f"test_{s['RID']}.txt", Subject=s["RID"]
             )
             with open(image_file, "w") as f:
                 f.write(f"Hello there {random()}\n")
-        deriva_ml.upload_assets(image_dir)
+    execution.upload_execution_outputs()
 
 
 def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
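The rewritten functions above share the new pattern introduced in this release: assets are created through an execution rather than via `asset_dir()`/`upload_assets()` on a temporary directory. Condensed from the new lines of this hunk (assumes a connected DerivaML instance and an existing Subject RID; the import path is an assumption):

    from deriva_ml import DerivaML, ExecutionConfiguration  # import path assumed

    # Names below (create_workflow, create_execution, asset_file_path,
    # upload_execution_outputs) all appear in the new code of this hunk.
    execution = deriva_ml.create_execution(
        ExecutionConfiguration(
            workflow=deriva_ml.create_workflow(
                name="Demo Catalog", workflow_type="Demo Catalog Creation"
            )
        )
    )
    with execution.execute() as e:
        image_file = e.asset_file_path("Image", "example.txt", Subject=subject_rid)
        with open(image_file, "w") as f:
            f.write("asset content\n")
    execution.upload_execution_outputs()  # register the new files with the catalog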
@@ -84,6 +84,13 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     ml_instance.add_dataset_element_type("Image")
 
     type_rid = ml_instance.add_term("Dataset_Type", "TestSet", description="A test")
+    training_rid = ml_instance.add_term(
+        "Dataset_Type", "Training", description="A traing set"
+    )
+    testing_rid = ml_instance.add_term(
+        "Dataset_Type", "Testing", description="A testing set"
+    )
+
     table_path = (
         ml_instance.catalog.getPathBuilder()
         .schemas[ml_instance.domain_schema]
@@ -94,7 +101,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     dataset_rids = []
     for r in subject_rids[0:4]:
         d = ml_instance.create_dataset(
-            type_rid.name,
+            type=[type_rid.name, "Testing"],
             description=f"Dataset {r}",
             version=DatasetVersion(1, 0, 0),
         )
@@ -104,7 +111,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:
     nested_datasets = []
     for i in range(0, 4, 2):
         nested_dataset = ml_instance.create_dataset(
-            type_rid.name,
+            type=[type_rid.name, "Training"],
             description=f"Nested Dataset {i}",
             version=DatasetVersion(1, 0, 0),
         )
@@ -132,13 +139,11 @@ def create_demo_features(ml_instance):
         "Well",
         description="The subject self reports that they feel well",
     )
-
     ml_instance.create_vocabulary(
         "ImageQuality", "Controlled vocabulary for image quality"
     )
     ml_instance.add_term("ImageQuality", "Good", description="The image is good")
     ml_instance.add_term("ImageQuality", "Bad", description="The image is bad")
-
     box_asset = ml_instance.create_asset(
         "BoundingBox", comment="A file that contains a cropped version of a image"
     )
@@ -150,7 +155,6 @@ def create_demo_features(ml_instance):
         metadata=[ColumnDefinition(name="Scale", type=BuiltinTypes.int2, nullok=True)],
         optional=["Scale"],
     )
-
     ml_instance.create_feature("Image", "BoundingBox", assets=[box_asset])
     ml_instance.create_feature("Image", "Quality", terms=["ImageQuality"])
 
@@ -158,78 +162,88 @@ def create_demo_features(ml_instance):
     ImageBoundingboxFeature = ml_instance.feature_record_class("Image", "BoundingBox")
     SubjectWellnessFeature = ml_instance.feature_record_class("Subject", "Health")
 
+    # Get the workflow for this notebook
+
     ml_instance.add_term(
         MLVocab.workflow_type,
-        "API Workflow",
+        "Feature Notebook Workflow",
         description="A Workflow that uses Deriva ML API",
     )
     ml_instance.add_term(
-        MLVocab.execution_asset_type,
-        "API_Model",
-        description="Model for our API workflow",
+        MLVocab.asset_type, "API_Model", description="Model for our Notebook workflow"
     )
-
-    api_workflow = ml_instance.create_workflow(
-        name="API Workflow",
-        workflow_type="API Workflow",
+    notebook_workflow = ml_instance.create_workflow(
+        name="API Workflow", workflow_type="Feature Notebook Workflow"
     )
 
-    api_execution = ml_instance.create_execution(
+    feature_execution = ml_instance.create_execution(
         ExecutionConfiguration(
-            workflow=api_workflow, description="Our Sample Workflow instance"
+            workflow=notebook_workflow, description="Our Sample Workflow instance"
         )
     )
 
-    with tempfile.TemporaryDirectory() as temp_dir:
-        assetdir = ml_instance.asset_dir("BoundingBox", prefix=temp_dir)
-        for i in range(10):
-            with open(assetdir.path / f"box{i}.txt", "w") as fp:
-                fp.write(f"Hi there {i}")
-        bounding_box_assets = ml_instance.upload_assets(assetdir)
-    bounding_box_rids = [a.result["RID"] for a in bounding_box_assets.values()]
-
-    # Get the IDs of al of the things that we are going to want to attach features to.
     subject_rids = [
         i["RID"] for i in ml_instance.domain_path.tables["Subject"].entities().fetch()
     ]
     image_rids = [
         i["RID"] for i in ml_instance.domain_path.tables["Image"].entities().fetch()
     ]
-
     subject_feature_list = [
         SubjectWellnessFeature(
             Subject=subject_rid,
-            Execution=api_execution.execution_rid,
+            Execution=feature_execution.execution_rid,
             SubjectHealth=["Well", "Sick"][randint(0, 1)],
             Scale=randint(1, 10),
         )
         for subject_rid in subject_rids
     ]
 
+    # Create a new set of images. For fun, lets wrap this in an execution so we get status updates
+    bounding_box_files = []
+    for i in range(10):
+        bounding_box_file = feature_execution.asset_file_path(
+            "BoundingBox", f"box{i}.txt"
+        )
+        with open(bounding_box_file, "w") as fp:
+            fp.write(f"Hi there {i}")
+        bounding_box_files.append(bounding_box_file)
+
+    image_bounding_box_feature_list = [
+        ImageBoundingboxFeature(
+            Image=image_rid,
+            BoundingBox=asset_name,
+        )
+        for image_rid, asset_name in zip(
+            image_rids, itertools.cycle(bounding_box_files)
+        )
+    ]
+
     image_quality_feature_list = [
         ImageQualityFeature(
             Image=image_rid,
-            Execution=api_execution.execution_rid,
             ImageQuality=["Good", "Bad"][randint(0, 1)],
         )
         for image_rid in image_rids
     ]
 
-    image_bounding_box_feature_list = [
-        ImageBoundingboxFeature(
-            Image=image_rid,
-            Execution=api_execution.execution_rid,
-            BoundingBox=asset_rid,
+    subject_feature_list = [
+        SubjectWellnessFeature(
+            Subject=subject_rid,
+            SubjectHealth=["Well", "Sick"][randint(0, 1)],
+            Scale=randint(1, 10),
        )
-        for image_rid, asset_rid in zip(image_rids, itertools.cycle(bounding_box_rids))
+        for subject_rid in subject_rids
     ]
 
-    ml_instance.add_features(subject_feature_list)
-    ml_instance.add_features(image_quality_feature_list)
-    ml_instance.add_features(image_bounding_box_feature_list)
+    with feature_execution.execute() as execution:
+        feature_execution.add_features(image_bounding_box_feature_list)
+        feature_execution.add_features(image_quality_feature_list)
+        feature_execution.add_features(subject_feature_list)
+
+    feature_execution.upload_execution_outputs()
 
 
-def create_domain_schema(model: Model, sname: str) -> None:
+def create_domain_schema(ml_instance: DerivaML, sname: str) -> None:
     """
     Create a domain schema. Assumes that the ml-schema has already been created.
     :param model:
@@ -238,28 +252,19 @@ def create_domain_schema(model: Model, sname: str) -> None:
     """
 
     # Make sure that we have a ml schema
-    _ = model.schemas["deriva-ml"]
+    _ = ml_instance.model.schemas["deriva-ml"]
 
-    if model.schemas.get(sname):
+    if ml_instance.model.schemas.get(sname):
         # Clean out any old junk....
-        model.schemas[sname].drop()
+        ml_instance.model.schemas[sname].drop()
 
-    domain_schema = model.create_schema(
+    domain_schema = ml_instance.model.model.create_schema(
         Schema.define(sname, annotations={"name_style": {"underline_space": True}})
     )
     subject_table = domain_schema.create_table(
         Table.define("Subject", column_defs=[Column.define("Name", builtin_types.text)])
     )
-
-    image_table = domain_schema.create_table(
-        Table.define_asset(
-            sname=sname,
-            tname="Image",
-            hatrac_template="/hatrac/image_asset/{{MD5}}.{{Filename}}",
-            column_defs=[Column.define("Name", builtin_types.text)],
-        )
-    )
-    image_table.create_reference(subject_table)
+    ml_instance.create_asset("Image", referenced_tables=[subject_table])
 
 
 def destroy_demo_catalog(catalog):
@@ -284,13 +289,14 @@ def create_demo_catalog(
 
     try:
         create_ml_schema(model, project_name=project_name)
-        create_domain_schema(model, domain_schema)
         deriva_ml = DerivaML(
             hostname=hostname,
             catalog_id=test_catalog.catalog_id,
             project_name=project_name,
+            domain_schema=domain_schema,
             logging_level=logging.WARN,
         )
+        create_domain_schema(deriva_ml, domain_schema)
         working_dir = deriva_ml.working_dir
         dataset_table = deriva_ml.dataset_table
         dataset_table.annotations.update(
@@ -186,9 +186,9 @@ class MLVocab(StrEnum):
 
     dataset_type = "Dataset_Type"
     workflow_type = "Workflow_Type"
-    execution_asset_type = "Execution_Asset_Type"
-    execution_metadata_type = "Execution_Metadata_Type"
     file_type = "File_Type"
+    asset_type = "Asset_Type"
+    asset_role = "Asset_Role"
 
 
 class ExecMetadataVocab(StrEnum):
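Because MLVocab is a StrEnum, the renamed members still compare and format as their plain string values, so call sites such as `add_term(MLVocab.asset_type, ...)` pass the vocabulary table name unchanged. A minimal sketch of that behavior (standard-library `enum.StrEnum`, Python 3.11+; the class shape mirrors the diff):

    from enum import StrEnum

    class MLVocab(StrEnum):
        asset_type = "Asset_Type"
        asset_role = "Asset_Role"

    assert MLVocab.asset_type == "Asset_Type"       # members compare as plain strings
    assert f"{MLVocab.asset_role}" == "Asset_Role"  # and format as their values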