deriva-ml 1.7.0__tar.gz → 1.8.1__tar.gz

This diff compares the contents of two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
Files changed (46)
  1. {deriva_ml-1.7.0/src/deriva_ml.egg-info → deriva_ml-1.8.1}/PKG-INFO +1 -1
  2. deriva_ml-1.8.1/src/deriva_ml/VERSION.py +1 -0
  3. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/__init__.py +2 -0
  4. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/dataset.py +19 -25
  5. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/dataset_aux_classes.py +8 -0
  6. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/dataset_bag.py +2 -2
  7. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/demo_catalog.py +2 -2
  8. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/deriva_definitions.py +42 -1
  9. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/deriva_ml_base.py +102 -25
  10. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/execution.py +6 -6
  11. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/execution_configuration.py +2 -2
  12. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/execution_environment.py +2 -2
  13. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/schema_setup/create_schema.py +33 -7
  14. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/test_functions.py +5 -9
  15. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/upload.py +0 -1
  16. {deriva_ml-1.7.0 → deriva_ml-1.8.1/src/deriva_ml.egg-info}/PKG-INFO +1 -1
  17. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/tests/test_basic_tables.py +1 -1
  18. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/tests/test_dataset.py +16 -5
  19. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/tests/test_execution.py +2 -5
  20. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/tests/test_upload.py +2 -2
  21. deriva_ml-1.7.0/src/deriva_ml/VERSION.py +0 -1
  22. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/LICENSE +0 -0
  23. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/README.md +0 -0
  24. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/pyproject.toml +0 -0
  25. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/setup.cfg +0 -0
  26. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/build/lib/schema_setup/__init__.py +0 -0
  27. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -0
  28. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -0
  29. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/build/lib/schema_setup/create_schema.py +0 -0
  30. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/build/lib/schema_setup/table_comments_utils.py +0 -0
  31. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/database_model.py +0 -0
  32. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/deriva_model.py +0 -0
  33. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/feature.py +0 -0
  34. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/history.py +0 -0
  35. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/schema_setup/__init__.py +0 -0
  36. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/schema_setup/alter_annotation.py +0 -0
  37. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/schema_setup/annotations.py +0 -0
  38. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/schema_setup/policy.json +0 -0
  39. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/schema_setup/table_comments_utils.py +0 -0
  40. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml.egg-info/SOURCES.txt +0 -0
  41. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  42. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml.egg-info/entry_points.txt +0 -0
  43. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml.egg-info/requires.txt +0 -0
  44. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml.egg-info/top_level.txt +0 -0
  45. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/tests/test_download.py +0 -0
  46. {deriva_ml-1.7.0 → deriva_ml-1.8.1}/tests/test_features.py +0 -0
deriva_ml-1.8.1/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: deriva-ml
- Version: 1.7.0
+ Version: 1.8.1
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
  Author-email: ISRD <isrd-dev@isi.edu>
  Requires-Python: >=3.10
deriva_ml-1.8.1/src/deriva_ml/VERSION.py
@@ -0,0 +1 @@
+ __version__ = "1.8.1"
deriva_ml-1.8.1/src/deriva_ml/__init__.py
@@ -2,6 +2,7 @@ __all__ = [
  "DerivaML",
  "DerivaMLException",
  "FileUploadState",
+ "FileSpec",
  "ExecutionConfiguration",
  "Workflow",
  "DatasetBag",
@@ -26,6 +27,7 @@ from .deriva_definitions import (
  BuiltinTypes,
  UploadState,
  FileUploadState,
+ FileSpec,
  RID,
  DerivaMLException,
  MLVocab,
deriva_ml-1.8.1/src/deriva_ml/dataset.py
@@ -75,9 +75,10 @@ class Dataset:
  rid_info = self._model.catalog.resolve_rid(dataset_rid, self._model.model)
  except KeyError as _e:
  raise DerivaMLException(f"Invalid RID {dataset_rid}")
-
- # Got a dataset rid. Now check to see if its deleted or not.
- if deleted:
+ if rid_info.table != self.dataset_table:
+ return False
+ elif deleted:
+ # Got a dataset rid. Now check to see if its deleted or not.
  return True
  else:
  return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]
@@ -293,7 +294,7 @@ class Dataset:
  pb = self._model.catalog.getPathBuilder()
  for ds_type in ds_types:
  if not check_dataset_type(ds_type):
- raise DerivaMLException(f"Dataset type must be a vocabulary term.")
+ raise DerivaMLException("Dataset type must be a vocabulary term.")
  dataset_table_path = pb.schemas[self.dataset_table.schema.name].tables[
  self.dataset_table.name
  ]
@@ -444,7 +445,7 @@ class Dataset:
  self._model.model.apply()
  return table

- @validate_call
+ # @validate_call
  def list_dataset_members(
  self, dataset_rid: RID, recurse: bool = False
  ) -> dict[str, list[dict[str, Any]]]:
@@ -469,34 +470,27 @@ class Dataset:
  pb = self._model.catalog.getPathBuilder()
  for assoc_table in self.dataset_table.find_associations():
  other_fkey = assoc_table.other_fkeys.pop()
- self_fkey = assoc_table.self_fkey
  target_table = other_fkey.pk_table
  member_table = assoc_table.table

+ # Look at domain tables and nested datasets.
  if (
  target_table.schema.name != self._model.domain_schema
  and target_table != self.dataset_table
  ):
- # Look at domain tables and nested datasets.
  continue
- if target_table == self.dataset_table:
- # find_assoc gives us the keys in the wrong position, so swap.
- self_fkey, other_fkey = other_fkey, self_fkey
+ member_column = (
+ "Nested_Dataset"
+ if target_table == self.dataset_table
+ else other_fkey.foreign_key_columns[0].name
+ )

  target_path = pb.schemas[target_table.schema.name].tables[target_table.name]
  member_path = pb.schemas[member_table.schema.name].tables[member_table.name]
- # Get the names of the columns that we are going to need for linking
- member_link = tuple(
- c.name for c in next(iter(other_fkey.column_map.items()))
- )
- path = pb.schemas[member_table.schema.name].tables[member_table.name].path
- path.filter(member_path.Dataset == dataset_rid)
- path.link(
+
+ path = member_path.filter(member_path.Dataset == dataset_rid).link(
  target_path,
- on=(
- member_path.columns[member_link[0]]
- == target_path.columns[member_link[1]]
- ),
+ on=(member_path.columns[member_column] == target_path.columns["RID"]),
  )
  target_entities = list(path.entities().fetch())
  members[target_table.name].extend(target_entities)
@@ -747,9 +741,9 @@ class Dataset:
  p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
  for table in path[1:]:
  if table == dataset_dataset:
- p.append(f"(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
+ p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
  elif table == self.dataset_table:
- p.append(f"(Nested_Dataset)=(deriva-ml:Dataset:RID)")
+ p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
  elif table.name == "Dataset_Version":
  p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
  else:
@@ -898,7 +892,7 @@ class Dataset:
  config_file=spec_file,
  output_dir=tmp_dir,
  defer_download=True,
- timeout=(10, 300),
+ timeout=(10, 610),
  envars={"Dataset_RID": dataset.rid},
  )
  minid_page_url = exporter.export()[0]  # Get the MINID launch page
@@ -1111,7 +1105,7 @@ class Dataset:
  return [
  {
  "processor": "json",
- "processor_params": {"query_path": f"/schema", "output_path": "schema"},
+ "processor_params": {"query_path": "/schema", "output_path": "schema"},
  }
  ] + self._dataset_specification(writer)
deriva_ml-1.8.1/src/deriva_ml/dataset_aux_classes.py
@@ -187,6 +187,14 @@ class DatasetSpec(BaseModel):

  model_config = ConfigDict(arbitrary_types_allowed=True)

+ @field_validator("version", mode="before")
+ @classmethod
+ def version_field_validator(cls, v: Any) -> Any:
+ if isinstance(v, dict):
+ return DatasetVersion(**v)
+ else:
+ return v
+
  @model_validator(mode="before")
  @classmethod
  def _check_bare_rid(cls, data: Any) -> dict[str, str | bool]:
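Note: the new before-mode validator lets a version arrive as a plain dict (for example, parsed from a JSON execution configuration) and still come out as a DatasetVersion. A self-contained sketch of the same pattern, with simplified stand-ins rather than the library's actual class bodies; the ConfigDict(arbitrary_types_allowed=True) in the hunk suggests DatasetVersion is not itself a pydantic model, which is why pydantic cannot coerce the dict on its own:

    from typing import Any
    from pydantic import BaseModel, ConfigDict, field_validator

    class DatasetVersion:
        """Stand-in for the library's version type (not a pydantic model)."""
        def __init__(self, major: int, minor: int, patch: int):
            self.major, self.minor, self.patch = major, minor, patch

    class DatasetSpec(BaseModel):
        model_config = ConfigDict(arbitrary_types_allowed=True)
        rid: str
        version: DatasetVersion

        @field_validator("version", mode="before")
        @classmethod
        def version_field_validator(cls, v: Any) -> Any:
            # Convert a raw dict before the isinstance check pydantic applies
            # to arbitrary (non-pydantic) annotated types.
            return DatasetVersion(**v) if isinstance(v, dict) else v

    spec = DatasetSpec(rid="1-abc", version={"major": 1, "minor": 0, "patch": 0})
    print(type(spec.version).__name__)  # DatasetVersion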
deriva_ml-1.8.1/src/deriva_ml/dataset_bag.py
@@ -79,7 +79,7 @@ class DatasetBag:
  with self.database as dbase:
  select_args = ",".join(
  [
- f'"{table_name}".{c[1]}'
+ f'"{table_name}"."{c[1]}"'
  for c in dbase.execute(
  f'PRAGMA table_info("{table_name}")'
  ).fetchall()
@@ -104,7 +104,7 @@ class DatasetBag:
  )

  def column_name(col: Column) -> str:
- return f'"{self.model.normalize_table_name(col.table.name)}".{col.name}'
+ return f'"{self.model.normalize_table_name(col.table.name)}"."{col.name}"'

  for ts, on in paths:
  tables = " JOIN ".join(ts)
deriva_ml-1.8.1/src/deriva_ml/demo_catalog.py
@@ -26,7 +26,6 @@ from deriva_ml import (
  RID,
  )

- from deriva_ml.execution import Execution
  from deriva_ml.schema_setup.create_schema import initialize_ml_schema, create_ml_schema
  from deriva_ml.dataset import Dataset

@@ -114,7 +113,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RI

  double_nested_dataset = ml_instance.create_dataset(
  type_rid.name,
- description=f"Double nested dataset",
+ description="Double nested dataset",
  version=DatasetVersion(1, 0, 0),
  )
  ml_instance.add_dataset_members(double_nested_dataset, nested_datasets)
@@ -295,6 +294,7 @@ def create_demo_catalog(
  project_name=project_name,
  logging_level=logging.WARN,
  )
+ working_dir = deriva_ml.working_dir
  dataset_table = deriva_ml.dataset_table
  dataset_table.annotations.update(
  Dataset(
deriva_ml-1.8.1/src/deriva_ml/deriva_definitions.py
@@ -3,12 +3,22 @@ Shared definitions that are used in different DerivaML modules.
  """

  import warnings
+ from datetime import date
  from enum import Enum
  from typing import Any, Iterable, Optional, Annotated

  import deriva.core.ermrest_model as em
+ from urllib.parse import urlparse, urljoin
  from deriva.core.ermrest_model import builtin_types
- from pydantic import BaseModel, model_serializer, Field, computed_field, field_validator
+ from pydantic import (
+ BaseModel,
+ model_serializer,
+ Field,
+ computed_field,
+ field_validator,
+ ValidationError,
+ )
+ from socket import gethostname

  ML_SCHEMA = "deriva-ml"

@@ -109,6 +119,36 @@ class BuiltinTypes(Enum):
  serial8 = builtin_types.serial8.typename


+ class FileSpec(BaseModel):
+ """An entry into the File table
+
+ Attributes:
+ url: The File url to the url.
+ description: The description of the file.
+ """
+
+ url: str
+ description: Optional[str] = ""
+ md5: str
+ length: int
+
+ @field_validator("url")
+ @classmethod
+ def validate_file_url(cls, v):
+ url_parts = urlparse(v)
+ if url_parts.scheme == "tag":
+ return v
+ elif not url_parts.scheme:
+ print(v)
+ return f'tag://{gethostname()},{date.today()}:file://{v}'
+ else:
+ raise ValidationError("url is not a file URL")
+
+ @model_serializer()
+ def serialize_filespec(self):
+ return {'URL': self.url, 'Description': self.description, 'MD5': self.md5, 'Length': self.length}
+
+

  class VocabularyTerm(BaseModel):
  """An entry in a vocabulary table.
@@ -144,6 +184,7 @@ class MLVocab(StrEnum):
  workflow_type = "Workflow_Type"
  execution_asset_type = "Execution_Asset_Type"
  execution_metadata_type = "Execution_Metadata_Type"
+ file_type = "File_Type"


  class ExecMetadataVocab(StrEnum):
deriva_ml-1.8.1/src/deriva_ml/deriva_ml_base.py
@@ -8,6 +8,8 @@ relationships that follow a specific data model.

  """

+ from __future__ import annotations
+
  import getpass
  import logging
  from datetime import datetime
@@ -51,6 +53,7 @@ from .deriva_definitions import (
  ML_SCHEMA,
  VocabularyTerm,
  MLVocab,
+ FileSpec,
  )

  if TYPE_CHECKING:
@@ -112,10 +115,12 @@ class DerivaML(Dataset):
  if working_dir
  else Path.home() / "deriva-ml"
  ) / default_workdir
+
  self.working_dir.mkdir(parents=True, exist_ok=True)
  self.cache_dir = (
  Path(cache_dir) if cache_dir else Path.home() / "deriva-ml" / "cache"
  )
+
  self.cache_dir.mkdir(parents=True, exist_ok=True)

  # Initialize dataset class.
@@ -151,11 +156,11 @@ class DerivaML(Dataset):
  )

  def __del__(self):
- if self._execution and self._execution.status != Status.completed:
- try:
- self._execution.update_status(Status.aborted, f"Execution Aborted")
- except requests.HTTPError as e:
- pass
+ try:
+ if self._execution and self._execution.status != Status.completed:
+ self._execution.update_status(Status.aborted, "Execution Aborted")
+ except (AttributeError, requests.HTTPError):
+ pass

  @staticmethod
  def _get_session_config():
@@ -450,9 +455,9 @@ class DerivaML(Dataset):

  # Make sure that the provided assets or terms are actually assets or terms.
  if not all(map(self.model.is_asset, assets)):
- raise DerivaMLException(f"Invalid create_feature asset table.")
+ raise DerivaMLException("Invalid create_feature asset table.")
  if not all(map(self.model.is_vocabulary, terms)):
- raise DerivaMLException(f"Invalid create_feature asset table.")
+ raise DerivaMLException("Invalid create_feature asset table.")

  # Get references to the necessary tables and make sure that the
  # provided feature name exists.
@@ -785,7 +790,77 @@ class DerivaML(Dataset):
  ]
  )

- def list_files(self) -> list[dict[str, Any]]:
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+ def add_files(
+ self,
+ files: Iterable[FileSpec],
+ file_types: str | list[str],
+ execution_rid: Optional[RID] = None,
+ ) -> Iterable[RID]:
+ """Add a new file to the File table in the catalog.
+
+ Args:
+ file_types: One or more file types. Must be a term from the File_Type controlled vocabulary.
+ files: A sequence of file specifications that describe the files to add.
+ execution_rid: Resource Identifier (RID) of the execution to associate with the file.
+
+ Returns:
+ Iterable of the RIDs of the files that were added.
+ """
+ defined_types = self.list_vocabulary_terms(MLVocab.file_type)
+ if execution_rid and self.resolve_rid(execution_rid).table.name != 'Execution':
+ raise DerivaMLException(f'RID {execution_rid} is not for an execution table.')
+
+ def check_file_type(dtype: str) -> bool:
+ for term in defined_types:
+ if dtype == term.name or (term.synonyms and file_type in term.synonyms):
+ return True
+ return False
+
+ # Create the entry for the new dataset_table and get its RID.
+ file_types = [file_types] if isinstance(file_types, str) else file_types
+ pb = self._model.catalog.getPathBuilder()
+ for file_type in file_types:
+ if not check_file_type(file_type):
+ raise DerivaMLException("File type must be a vocabulary term.")
+ file_table_path = pb.schemas[self.ml_schema].tables["File"]
+ file_rids = [
+ e["RID"] for e in file_table_path.insert([f.model_dump() for f in files])
+ ]
+
+ # Get the name of the association table between file_table and file_type.
+ atable = next(
+ self._model.schemas[self._ml_schema]
+ .tables[MLVocab.file_type]
+ .find_associations()
+ ).name
+ pb.schemas[self._ml_schema].tables[atable].insert(
+ [
+ {"File_Type": file_type, "File": file_rid}
+ for file_rid in file_rids
+ for file_type in file_types
+ ]
+ )
+
+ if execution_rid:
+ # Get the name of the association table between file_table and execution.
+ exec_table = next(
+ self._model.schemas[self._ml_schema]
+ .tables["Execution"]
+ .find_associations()
+ ).name
+ pb.schemas[self._ml_schema].tables[exec_table].insert(
+ [
+ {"File": file_rid, "Execution": execution_rid}
+ for file_rid in file_rids
+ ]
+ )
+
+ return file_rids
+
+ def list_files(
+ self, file_types: Optional[list[str]] = None
+ ) -> list[dict[str, Any]]:
  """Return the contents of the file table. Denormalized file types into the file record."""
  atable = next(
  self._model.schemas[self._ml_schema]
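Note: a hypothetical usage sketch of the new add_files API. It assumes an existing connected DerivaML instance named ml_instance; the term name, path, and checksum are invented, and the File_Type vocabulary term is registered first via add_term, as the tests in this release do for other vocabularies:

    from deriva_ml import FileSpec, MLVocab

    # Register the controlled-vocabulary term the files will be tagged with.
    ml_instance.add_term(MLVocab.file_type, "CSV", description="Comma-separated data file")

    file_rids = ml_instance.add_files(
        files=[FileSpec(url="/data/train.csv", md5="d41d8cd98f00b204e9800998ecf8427e", length=20480)],
        file_types="CSV",
    )
    print(list(file_rids))  # RIDs of the newly inserted File records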
@@ -795,26 +870,28 @@ class DerivaML(Dataset):
  ml_path = self.pathBuilder.schemas[self._ml_schema]
  atable_path = ml_path.tables[atable]
  file_path = ml_path.File
+ type_path = ml_path.File_File_Type
+
  # Get a list of all the dataset_type values associated with this dataset_table.
  files = []
- for file in file_path.entities().fetch():
- file_types = (
- atable_path.filter(file_path.Dataset == file["RID"])
- .attributes(atable_path.Dataset_Type)
- .fetch()
- )
- files.append(
- file
- | {
- MLVocab.dataset_type: [
- ds[MLVocab.dataset_type] for ft in file_types
- ]
- }
- )
- return files
+ path = file_path.link(type_path)
+ path = path.attributes(
+ path.File.RID,
+ path.File.URL,
+ path.File.MD5,
+ path.File.Length,
+ path.File.Description,
+ path.File_File_Type.File_Type,
+ )
+ file_map = {}
+ for f in path.fetch():
+ file_map.setdefault(f['RID'], f | {'File_Types': []})['File_Types'].append(f['File_Type'])
+
+ # Now get rid of the File_Type key and return the result
+ return [ (f, f.pop('File_Type'))[0] for f in file_map.values()]

  def list_workflows(self) -> list[Workflow]:
- """Return a list of all of the workflows in the catalog."""
+ """Return a list of all the workflows in the catalog."""
  workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
  return [
  Workflow(
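Note: the rewritten list_files replaces a per-file query loop with one linked fetch plus in-memory grouping. The grouping idiom in isolation, on illustrative rows rather than real catalog output:

    rows = [
        {"RID": "1-abc", "URL": "tag://...", "File_Type": "CSV"},
        {"RID": "1-abc", "URL": "tag://...", "File_Type": "Training"},
        {"RID": "1-def", "URL": "tag://...", "File_Type": "CSV"},
    ]
    file_map = {}
    for f in rows:
        # One output record per RID; each join row contributes its File_Type.
        file_map.setdefault(f["RID"], f | {"File_Types": []})["File_Types"].append(f["File_Type"])
    print([fm["File_Types"] for fm in file_map.values()])  # [['CSV', 'Training'], ['CSV']]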
@@ -898,7 +975,7 @@ class DerivaML(Dataset):

  if self._execution:
  DerivaMLException(
- f"Only one execution can be created for a Deriva ML instance."
+ "Only one execution can be created for a Deriva ML instance."
  )
  else:
  self._execution = Execution(configuration, self)
deriva_ml-1.8.1/src/deriva_ml/execution.py
@@ -113,13 +113,13 @@ class Execution:

  if self._ml_object.resolve_rid(configuration.workflow).table.name != "Workflow":
  raise DerivaMLException(
- f"Workflow specified in execution configuration is not a Workflow"
+ "Workflow specified in execution configuration is not a Workflow"
  )

  for d in self.configuration.datasets:
  if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
  raise DerivaMLException(
- f"Dataset specified in execution configuration is not a dataset"
+ "Dataset specified in execution configuration is not a dataset"
  )

  for a in self.configuration.assets:
@@ -127,7 +127,7 @@ class Execution:
  self._ml_object.resolve_rid(a).table.name
  ):
  raise DerivaMLException(
- f"Asset specified in execution configuration is not a asset table"
+ "Asset specified in execution configuration is not a asset table"
  )

  schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
@@ -248,7 +248,7 @@ class Execution:

  self.start_time = datetime.now()
  self.uploaded_assets = None
- self.update_status(Status.initializing, f"Start ML algorithm ...")
+ self.update_status(Status.initializing, "Start ML algorithm ...")

  def execution_stop(self) -> None:
  """Finish the execution and update the duration and status of execution."""
@@ -303,7 +303,7 @@ class Execution:
  self.update_status(Status.failed, error)
  raise DerivaMLException(f"Fail to upload execution_assets. Error: {error}")

- self.update_status(Status.running, f"Updating features...")
+ self.update_status(Status.running, "Updating features...")

  feature_assets = defaultdict(dict)

@@ -350,7 +350,7 @@ class Execution:
  ],
  )

- self.update_status(Status.running, f"Upload assets complete")
+ self.update_status(Status.running, "Upload assets complete")
  return results

  def upload_execution_outputs(
deriva_ml-1.8.1/src/deriva_ml/execution_configuration.py
@@ -1,12 +1,12 @@
  from __future__ import annotations

  import json
- from typing import Optional
+ from typing import Optional, Any

  from pydantic import (
  BaseModel,
  conlist,
- ConfigDict,
+ ConfigDict, field_validator,
  )
  from pathlib import Path

deriva_ml-1.8.1/src/deriva_ml/execution_environment.py
@@ -47,7 +47,7 @@ def get_platform_info():
  for attr in attributes:
  try:
  platform_info[attr] = getattr(platform, attr)()
- except Exception as exc:
+ except Exception:
  # Not all attributes are available on all platforms.
  continue
  return platform_info
@@ -67,7 +67,7 @@ def get_os_info():
  ]:
  try:
  values[func] = getattr(os, "get" + func)()
- except (OSError, AttributeError) as exc:
+ except (OSError, AttributeError):
  pass
  values["umask"] = oct(get_umask())
  values["name"] = os.name
deriva_ml-1.8.1/src/deriva_ml/schema_setup/create_schema.py
@@ -1,7 +1,7 @@
  import argparse
  import sys

- from deriva.core import DerivaServer, ErmrestCatalog, get_credential
+ from deriva.core import DerivaServer, get_credential
  from deriva.core.ermrest_model import Model
  from deriva.core.ermrest_model import (
  builtin_types,
@@ -32,7 +32,7 @@ def define_table_workflow(workflow_annotation: dict):
  )


- def define_table_dataset(sname, dataset_annotation: dict = None):
+ def define_table_dataset(dataset_annotation: dict = None):
  return Table.define(
  tname="Dataset",
  column_defs=[
@@ -43,7 +43,7 @@ def define_table_dataset(sname, dataset_annotation: dict = None):
  )


- def define_table_dataset_version(sname: str, dataset_version_annotation: dict = None):
+ def define_table_dataset_version(sname: str):
  return Table.define(
  tname="Dataset_Version",
  column_defs=[
@@ -100,6 +100,14 @@ def define_asset_execution_asset(sname: str, execution_asset_annotation: dict):
  return table_def


+ def define_table_file(sname):
+ """Define files table structure"""
+ return Table.define_asset(
+ sname=sname,
+ tname="File",
+ )
+
+
  def create_www_schema(model: Model):
  """
  Set up a new schema and tables to hold web-page like content. The tables include a page table, and an asset
@@ -142,15 +150,12 @@ def create_www_schema(model: Model):
  },
  )
  )
-
  return www_schema


  def create_ml_schema(
  model: Model, schema_name: str = "deriva-ml", project_name: str = None
  ):
- ml_catalog: ErmrestCatalog = model.catalog
-
  if model.schemas.get(schema_name):
  model.schemas[schema_name].drop(cascade=True)
  # get annotations
@@ -195,7 +200,7 @@ def create_ml_schema(
  )

  dataset_table = schema.create_table(
- define_table_dataset(schema_name, annotations["dataset_annotation"])
+ define_table_dataset(annotations["dataset_annotation"])
  )
  dataset_type = schema.create_table(
  Table.define_vocabulary(MLVocab.dataset_type, f"{project_name}:{{RID}}")
@@ -263,6 +268,27 @@ def create_ml_schema(
  )
  )

+ # File table
+ file_table = schema.create_table(define_table_file(schema_name))
+ file_type = schema.create_table(
+ Table.define_vocabulary(MLVocab.file_type, f"{project_name}:{{RID}}")
+ )
+ schema.create_table(
+ Table.define_association(
+ associates=[
+ ("File", file_table),
+ (MLVocab.file_type, file_type),
+ ]
+ )
+ )
+ schema.create_table(
+ Table.define_association(
+ [
+ ("File", file_table),
+ ("Execution", execution_table),
+ ]
+ )
+ )
  create_www_schema(model)
  initialize_ml_schema(model, schema_name)

deriva_ml-1.8.1/src/deriva_ml/test_functions.py
@@ -4,16 +4,13 @@ catalog_id = "eye-ai"
  # source_dataset = '2-7K8W'
  source_dataset = "3R6"
  create_catalog = False
- import logging
- from deriva_ml.demo_catalog import create_demo_catalog, DemoML, populate_demo_catalog
+ from deriva_ml.demo_catalog import create_demo_catalog, DemoML
  from deriva_ml import (
  Workflow,
  ExecutionConfiguration,
  MLVocab as vc,
  DerivaML,
  DatasetSpec,
- DatasetVersion,
- RID,
  )

@@ -23,8 +20,7 @@ def setup_demo_ml():
  host, "test-schema", create_features=True, create_datasets=True
  )
  ml_instance = DemoML(host, test_catalog.catalog_id)
- config = execution_test(ml_instance)
- return ml_instance, config
+ return ml_instance


  def setup_dev():
@@ -100,12 +96,12 @@ def execution_test(ml_instance):
  vc.workflow_type, "ML Demo", description="A ML Workflow that uses Deriva ML API"
  )

- api_workflow = Workflow(
+ api_workflow = ml_instance.add_workflow(Workflow(
  name="Manual Workflow",
  url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Execution.ipynb",
  workflow_type="Manual Workflow",
  description="A manual operation",
- )
+ ))

  manual_execution = ml_instance.create_execution(
  ExecutionConfiguration(description="Sample Execution", workflow=api_workflow)
@@ -114,7 +110,7 @@ def execution_test(ml_instance):
  # Now lets create model configuration for our program.
  model_file = manual_execution.execution_asset_path("API_Model") / "modelfile.txt"
  with open(model_file, "w") as fp:
- fp.write(f"My model")
+ fp.write("My model")

  # Now upload the file and retrieve the RID of the new asset from the returned results.
  uploaded_assets = manual_execution.upload_execution_outputs()
deriva_ml-1.8.1/src/deriva_ml/upload.py
@@ -48,7 +48,6 @@ from deriva.core.ermrest_model import Table
  from deriva.core.hatrac_store import HatracStore
  from deriva.core.utils import hash_utils, mime_utils
  from deriva.transfer.upload.deriva_upload import GenericUploader
- import logging
  from pydantic import validate_call, ConfigDict

  from deriva_ml.deriva_definitions import (
deriva_ml-1.8.1/src/deriva_ml.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: deriva-ml
- Version: 1.7.0
+ Version: 1.8.1
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
  Author-email: ISRD <isrd-dev@isi.edu>
  Requires-Python: >=3.10
deriva_ml-1.8.1/tests/test_basic_tables.py
@@ -36,7 +36,7 @@ class TestVocabulary(TestDerivaML):
  self.assertEqual(term.name, self.ml_instance.lookup_term("CV2", "T1").name)

  # Check for redundant terms.
- with self.assertRaises(DerivaMLException) as context:
+ with self.assertRaises(DerivaMLException):
  self.ml_instance.add_term(
  "CV2", "T1", description="A vocab", exists_ok=False
  )
deriva_ml-1.8.1/tests/test_dataset.py
@@ -121,11 +121,24 @@ class TestDataset(TestDerivaML):
  print(f"datasets {datasets}")
  import pprint

+ print("double_nested_dataset")
  pprint.pprint(
- self.ml_instance.list_dataset_members(dataset_rid=double_nested_dataset)[
- "Dataset"
+ self.ml_instance.list_dataset_members(dataset_rid=double_nested_dataset)
+ )
+
+ print("nested_dataset")
+ pprint.pprint(
+ [
+ self.ml_instance.list_dataset_members(dataset_rid=ds)
+ for ds in nested_datasets
  ]
  )
+
+ print("dataset")
+ pprint.pprint(
+ [self.ml_instance.list_dataset_members(dataset_rid=ds) for ds in datasets]
+ )
+
  print(
  "double nested children",
  self.ml_instance.list_dataset_children(dataset_rid=double_nested_dataset),
@@ -194,9 +207,7 @@ class TestDataset(TestDerivaML):
  "Manual Workflow",
  description="Initial setup of Model File",
  )
- type_rid = self.ml_instance.add_term(
- "Dataset_Type", "TestSet", description="A test"
- )
+ self.ml_instance.add_term("Dataset_Type", "TestSet", description="A test")

  api_workflow = self.ml_instance.add_workflow(
  Workflow(
deriva_ml-1.8.1/tests/test_execution.py
@@ -1,12 +1,9 @@
- from idlelib.run import manage_socket
-
  from derivaml_test import TestDerivaML
  from deriva_ml import (
  MLVocab as vc,
  Workflow,
  ExecutionConfiguration,
  DatasetSpec,
- DerivaML,
  )

@@ -42,7 +39,7 @@ class TestExecution(TestDerivaML):
  description="Sample Execution", workflow=api_workflow
  )
  )
- with manual_execution as e:
+ with manual_execution:
  pass
  manual_execution.upload_execution_outputs()

@@ -141,7 +138,7 @@ class TestExecution(TestDerivaML):
  manual_execution.execution_asset_path("API_Model") / "modelfile.txt"
  )
  with open(model_file, "w") as fp:
- fp.write(f"My model")
+ fp.write("My model")
  # Now upload the file and retrieve the RID of the new asset from the returned results.
  uploaded_assets = manual_execution.upload_execution_outputs()
  self.ml_instance._execution = None
deriva_ml-1.8.1/tests/test_upload.py
@@ -90,10 +90,10 @@ class TestUpload(TestDerivaML):
  manual_execution.execution_asset_path("API_Model") / "modelfile.txt"
  )
  with open(model_file, "w") as fp:
- fp.write(f"My model")
+ fp.write("My model")

  # Now upload the file and retrieve the RID of the new asset from the returned results.
- uploaded_assets = manual_execution.upload_execution_outputs()
+ manual_execution.upload_execution_outputs()
  path = self.ml_instance.catalog.getPathBuilder().schemas["deriva-ml"]
  self.assertEqual(1, len(list(path.Execution_Asset.entities().fetch())))

deriva_ml-1.7.0/src/deriva_ml/VERSION.py
@@ -1 +0,0 @@
- __version__ = "1.7.0"