deriva-ml 1.10.1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,7 +31,6 @@ from deriva.core.datapath import DataPathException
31
31
  from deriva.core.deriva_server import DerivaServer
32
32
  from deriva.core.ermrest_catalog import ResolveRidResult
33
33
  from deriva.core.ermrest_model import Key, Table
34
- from deriva.core.hatrac_store import HatracStore
35
34
  from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
36
35
  from pydantic import validate_call, ConfigDict
37
36
  from requests import RequestException
@@ -42,24 +41,17 @@ from .dataset import Dataset
42
41
  from .dataset_aux_classes import DatasetSpec
43
42
  from .dataset_bag import DatasetBag
44
43
  from .deriva_model import DerivaModel
45
- from .upload import (
46
- table_path,
47
- execution_rids,
48
- execution_metadata_dir,
49
- upload_directory,
50
- UploadAssetDirectory,
51
- )
44
+ from .upload import table_path, execution_rids, asset_file_path
52
45
  from .deriva_definitions import ColumnDefinition
53
- from .deriva_definitions import ExecMetadataVocab
54
46
  from .deriva_definitions import (
55
47
  RID,
56
48
  Status,
57
- FileUploadState,
58
49
  DerivaMLException,
59
50
  ML_SCHEMA,
60
51
  VocabularyTerm,
61
52
  MLVocab,
62
53
  FileSpec,
54
+ TableDefinition,
63
55
  )
64
56
 
65
57
  try:
@@ -273,10 +265,13 @@ class DerivaML(Dataset):
273
265
  is_notebook = True
274
266
  else:
275
267
  stack = inspect.stack()
268
+ # Get the caller's filename, which is two up the stack from here.
276
269
  if len(stack) > 1:
277
- filename = Path(
278
- stack[2].filename
279
- ) # Get the caller's filename, which is two up the stack from here.
270
+ filename = Path(stack[2].filename)
271
+ if not filename.exists():
272
+ # Being called from a command line interpreter, so the caller has no source file on disk.
273
+ filename = "REPL"
274
+ # Get the caller's filename, which is two up the stack from here.
280
275
  else:
281
276
  raise DerivaMLException(
282
277
  "Looking for caller failed"
@@ -334,7 +329,6 @@ class DerivaML(Dataset):
334
329
  """Return a local file path in which to place a CSV to add values to a table on upload.
335
330
 
336
331
  Args:
337
- table: return:
338
332
  table: str | Table:
339
333
 
340
334
  Returns:
@@ -346,30 +340,6 @@ class DerivaML(Dataset):
346
340
  table=self.model.name_to_table(table).name,
347
341
  )
348
342
 
349
- def asset_dir(
350
- self, table: str | Table, prefix: Optional[str | Path] = None
351
- ) -> UploadAssetDirectory:
352
- """Return a local file path in which to place a files for an asset table. T
353
-
354
- Args:
355
- table: Location of where to place files. Defaults to execution_assets_path.
356
- prefix: Root path to asset directory.
357
-
358
- Returns:
359
- Path to the directory in which asset files should be placed.
360
- """
361
- table = self.model.name_to_table(table)
362
- if not self.model.is_asset(table):
363
- raise DerivaMLException(f"The table {table} is not an asset table.")
364
-
365
- prefix = Path(prefix) if prefix else self.working_dir
366
- return UploadAssetDirectory(
367
- model=self.model,
368
- prefix=prefix,
369
- schema=table.schema.name,
370
- table=table.name,
371
- )
372
-
373
343
  def download_dir(self, cached: bool = False) -> Path:
374
344
  """Location where downloaded files are placed.
375
345
 
@@ -532,10 +502,17 @@ class DerivaML(Dataset):
532
502
  )
533
503
  )
534
504
 
505
+ def create_table(self, table: TableDefinition) -> Table:
506
+ """Create a table from a table definition."""
507
+ return self.model.schemas[self.domain_schema].create_table(table.model_dump())
508
+
509
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
535
510
  def create_asset(
536
511
  self,
537
512
  asset_name: str,
538
513
  column_defs: Optional[Iterable[ColumnDefinition]] = None,
514
+ fkey_defs: Optional[Iterable[ColumnDefinition]] = None,
515
+ referenced_tables: Optional[Iterable[Table]] = None,
539
516
  comment: str = "",
540
517
  schema: Optional[str] = None,
541
518
  ) -> Table:
@@ -544,6 +521,8 @@ class DerivaML(Dataset):
544
521
  Args:
545
522
  asset_name: Name of the asset table.
546
523
  column_defs: Iterable of ColumnDefinition objects to provide additional metadata for asset.
524
+ fkey_defs: Iterable of ForeignKeyDefinition objects to provide additional metadata for asset.
525
+ referenced_tables: Iterable of Table objects to which the asset table should have foreign-key references.
547
526
  comment: Description of the asset table. (Default value = '')
548
527
  schema: Schema in which to create the asset table. Defaults to domain_schema.
549
528
  asset_name: str:
@@ -553,17 +532,82 @@ class DerivaML(Dataset):
553
532
  Table object for the asset table.
554
533
  """
555
534
  column_defs = column_defs or []
535
+ fkey_defs = fkey_defs or []
536
+ referenced_tables = referenced_tables or []
556
537
  schema = schema or self.domain_schema
538
+
539
+ self.add_term(
540
+ MLVocab.asset_type, asset_name, description=f"A {asset_name} asset"
541
+ )
557
542
  asset_table = self.model.schemas[schema].create_table(
558
543
  Table.define_asset(
559
544
  schema,
560
545
  asset_name,
561
546
  column_defs=[c.model_dump() for c in column_defs],
547
+ fkey_defs=[fk.model_dump() for fk in fkey_defs],
562
548
  comment=comment,
563
549
  )
564
550
  )
551
+
552
+ self.model.schemas[self.domain_schema].create_table(
553
+ Table.define_association(
554
+ [
555
+ (asset_table.name, asset_table),
556
+ ("Asset_Type", self.model.name_to_table("Asset_Type")),
557
+ ]
558
+ )
559
+ )
560
+ for t in referenced_tables:
561
+ asset_table.create_reference(self.model.name_to_table(t))
562
+ # Create a table to track execution that creates the asset
563
+ atable = self.model.schemas[self.domain_schema].create_table(
564
+ Table.define_association(
565
+ [
566
+ (asset_name, asset_table),
567
+ (
568
+ "Execution",
569
+ self.model.schemas[self.ml_schema].tables["Execution"],
570
+ ),
571
+ ]
572
+ )
573
+ )
574
+ atable.create_reference(self.model.name_to_table("Asset_Role"))
565
575
  return asset_table
566
576
 
577
+ # @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
578
+ def list_assets(self, asset_table: Table | str):
579
+ """Return the contents of an asset table"""
580
+
581
+ if not self.model.is_asset(asset_table):
582
+ raise DerivaMLException(f"Table {asset_table.name} is not an asset")
583
+ asset_table = self.model.name_to_table(asset_table)
584
+ pb = self._model.catalog.getPathBuilder()
585
+ asset_path = pb.schemas[asset_table.schema.name].tables[asset_table.name]
586
+
587
+ asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
588
+ type_path = pb.schemas[asset_type_table.schema.name].tables[
589
+ asset_type_table.name
590
+ ]
591
+
592
+ # For each asset in this asset table, get a list of all the asset_type values associated with it.
593
+ assets = []
594
+ for asset in asset_path.entities().fetch():
595
+ asset_types = (
596
+ type_path.filter(type_path.columns[asset_table.name] == asset["RID"])
597
+ .attributes(type_path.Asset_Type)
598
+ .fetch()
599
+ )
600
+ assets.append(
601
+ asset
602
+ | {
603
+ MLVocab.asset_type.value: [
604
+ asset_type[MLVocab.asset_type.value]
605
+ for asset_type in asset_types
606
+ ]
607
+ }
608
+ )
609
+ return assets
610
+
567
611
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
568
612
  def create_feature(
569
613
  self,
@@ -717,24 +761,6 @@ class DerivaML(Dataset):
717
761
  """
718
762
  return self.model.find_features(table)
719
763
 
720
- @validate_call
721
- def add_features(self, features: Iterable[FeatureRecord]) -> int:
722
- """Add a set of new feature values to the catalog.
723
-
724
- Args:
725
- features: Iterable[FeatureRecord]:
726
-
727
- Returns:
728
- Number of attributes added
729
- """
730
- features = list(features)
731
- feature_table = features[0].feature.feature_table
732
- feature_path = self.pathBuilder.schemas[feature_table.schema.name].tables[
733
- feature_table.name
734
- ]
735
- entries = feature_path.insert(f.model_dump() for f in features)
736
- return len(entries)
737
-
738
764
  # noinspection PyProtectedMember
739
765
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
740
766
  def list_feature_values(
@@ -838,7 +864,8 @@ class DerivaML(Dataset):
838
864
  raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
839
865
  schema_name, table_name = vocab_table.schema.name, vocab_table.name
840
866
  schema_path = self.catalog.getPathBuilder().schemas[schema_name]
841
- for term in schema_path.tables[table_name].entities():
867
+
868
+ for term in schema_path.tables[table_name].entities().fetch():
842
869
  if term_name == term["Name"] or (
843
870
  term["Synonyms"] and term_name in term["Synonyms"]
844
871
  ):
@@ -891,65 +918,6 @@ class DerivaML(Dataset):
891
918
  snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
892
919
  )
893
920
 
894
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
895
- def download_asset(self, asset_rid: RID, dest_dir: Path) -> Path:
896
- """Download an asset from a URL and place it in a local directory.
897
-
898
- Args:
899
- asset_rid: URL of the asset.
900
- dest_dir: Destination directory for the asset.
901
-
902
- Returns:
903
- A Path object to the downloaded asset.
904
- """
905
- table = self.resolve_rid(asset_rid).table
906
- if not self.model.is_asset(table):
907
- raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")
908
-
909
- tpath = self.pathBuilder.schemas[table.schema.name].tables[table.name]
910
- asset_metadata = list(tpath.filter(tpath.RID == asset_rid).entities())[0]
911
- asset_url = asset_metadata["URL"]
912
- asset_filename = dest_dir / asset_metadata["Filename"]
913
-
914
- hs = HatracStore("https", self.host_name, self.credential)
915
- hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
916
- return Path(asset_filename)
917
-
918
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
919
- def upload_assets(
920
- self,
921
- assets_dir: str | Path | UploadAssetDirectory,
922
- ) -> dict[Any, FileUploadState] | None:
923
- """Upload assets from a directory.
924
-
925
- This routine assumes that the current upload specification includes a configuration for the specified directory.
926
- Every asset in the specified directory is uploaded
927
-
928
- Args:
929
- assets_dir: Directory containing the assets to upload.
930
-
931
- Returns:
932
- Results of the upload operation.
933
-
934
- Raises:
935
- DerivaMLException: If there is an issue uploading the assets.
936
- """
937
-
938
- def path_to_asset(path: str) -> str:
939
- """Pull the asset name out of a path to that asset in the filesystem"""
940
- components = path.split("/")
941
- return components[
942
- components.index("asset") + 2
943
- ] # Look for asset in the path to find the name
944
-
945
- if isinstance(assets_dir, UploadAssetDirectory):
946
- assets_dir = assets_dir.path
947
-
948
- if not self.model.is_asset(Path(assets_dir).name):
949
- raise DerivaMLException("Directory does not have name of an asset table.")
950
- results = upload_directory(self.model, assets_dir)
951
- return {path_to_asset(p): r for p, r in results.items()}
952
-
953
921
  def _update_status(
954
922
  self, new_status: Status, status_detail: str, execution_rid: RID
955
923
  ):
@@ -1177,13 +1145,17 @@ class DerivaML(Dataset):
1177
1145
  if self._is_notebook
1178
1146
  else f"git hash-object {self.executable_path}"
1179
1147
  )
1180
- checksum = subprocess.run(
1181
- cmd,
1182
- capture_output=True,
1183
- text=True,
1184
- check=True,
1185
- shell=True,
1186
- ).stdout.strip()
1148
+ checksum = (
1149
+ subprocess.run(
1150
+ cmd,
1151
+ capture_output=True,
1152
+ text=True,
1153
+ check=False,
1154
+ shell=True,
1155
+ ).stdout.strip()
1156
+ if self.executable_path != "REPL"
1157
+ else "1"
1158
+ )
1187
1159
 
1188
1160
  return Workflow(
1189
1161
  name=name,
@@ -1205,7 +1177,9 @@ class DerivaML(Dataset):
1205
1177
 
1206
1178
  """
1207
1179
 
1208
- # Get repo URL from local github repo.
1180
+ # Get repo URL from local GitHub repo.
1181
+ if self.executable_path == "REPL":
1182
+ return "REPL", True
1209
1183
  try:
1210
1184
  result = subprocess.run(
1211
1185
  ["git", "remote", "get-url", "origin"],
@@ -1261,7 +1235,7 @@ class DerivaML(Dataset):
1261
1235
 
1262
1236
  Args:
1263
1237
  configuration: ExecutionConfiguration:
1264
- dryrun: Do not create an execution record or upload results.
1238
+ dry_run: Do not create an execution record or upload results.
1265
1239
 
1266
1240
  Returns:
1267
1241
  An execution object.
@@ -1274,6 +1248,7 @@ class DerivaML(Dataset):
1274
1248
  # @validate_call
1275
1249
  def restore_execution(self, execution_rid: Optional[RID] = None) -> "Execution":
1276
1250
  """Return an Execution object for a previously started execution with the specified RID."""
1251
+
1277
1252
  from .execution import Execution
1278
1253
 
1279
1254
  # Find path to execution
@@ -1283,13 +1258,11 @@ class DerivaML(Dataset):
1283
1258
  raise DerivaMLException(f"Multiple execution RIDs were found {e_rids}.")
1284
1259
 
1285
1260
  execution_rid = e_rids[0]
1286
- cfile = (
1287
- execution_metadata_dir(
1288
- self.working_dir,
1289
- exec_rid=execution_rid,
1290
- metadata_type=ExecMetadataVocab.execution_config.value,
1291
- )
1292
- / "configuration.json"
1261
+ cfile = asset_file_path(
1262
+ prefix=self.working_dir,
1263
+ exec_rid=execution_rid,
1264
+ file_name="configuration.json",
1265
+ asset_table=self.model.name_to_table("Execution_Metadata"),
1293
1266
  )
1294
1267
  configuration = ExecutionConfiguration.load_configuration(cfile)
1295
1268
  return Execution(configuration, self, reload=execution_rid)
deriva_ml/deriva_model.py CHANGED
@@ -27,6 +27,8 @@ from typing import Iterable, Optional
27
27
  class DerivaModel:
28
28
  """Augmented interface to deriva model class.
29
29
 
30
+ This class provides a number of DerivaML specific methods that augment the interface in the deriva model class.
31
+
30
32
  Attributes:
31
33
  domain_schema: Schema name for domain specific tables and relationships.
32
34
  model: ERMRest model for the catalog.
@@ -71,6 +73,10 @@ class DerivaModel:
71
73
  # No domain schema defined.
72
74
  self.domain_schema = domain_schema
73
75
 
76
+ def __getattr__(self, name):
77
+ # Called only if `name` is not found on DerivaModel itself. Delegate the attribute lookup to the wrapped model object.
78
+ return getattr(self.model, name)
79
+
74
80
  def name_to_table(self, table: str | Table) -> Table:
75
81
  """Return the table object corresponding to the given table name.
76
82
 
@@ -126,6 +132,31 @@ class DerivaModel:
126
132
  table = self.name_to_table(table_name)
127
133
  return table.is_association(unqualified=unqualified, pure=pure)
128
134
 
135
+ def find_association(self, table1: Table | str, table2: Table | str) -> Table:
136
+ """Given two tables, return an association table that connects the two.
137
+
138
+ Raises:
139
+ DerivaMLException: If there is no association table, or more than one association table, connecting the two tables.
140
+ """
141
+ table1 = self.name_to_table(table1)
142
+ table2 = self.name_to_table(table2)
143
+
144
+ tables = [
145
+ a.table
146
+ for a in table1.find_associations(pure=False)
147
+ if a.other_fkeys.pop().pk_table == table2
148
+ ]
149
+ if len(tables) == 1:
150
+ return tables[0]
151
+ elif len(tables) == 0:
152
+ raise DerivaMLException(
153
+ f"No association tables found between {table1.name} and {table2.name}."
154
+ )
155
+ else:
156
+ raise DerivaMLException(
157
+ f"There are {len(tables)} association tables between {table1.name} and {table2.name}."
158
+ )
159
+
129
160
  def is_asset(self, table_name: str | Table) -> bool:
130
161
  """True if the specified table is an asset table.
131
162