deriva-ml 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,7 +31,6 @@ from deriva.core.datapath import DataPathException
31
31
  from deriva.core.deriva_server import DerivaServer
32
32
  from deriva.core.ermrest_catalog import ResolveRidResult
33
33
  from deriva.core.ermrest_model import Key, Table
34
- from deriva.core.hatrac_store import HatracStore
35
34
  from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
36
35
  from pydantic import validate_call, ConfigDict
37
36
  from requests import RequestException
@@ -42,24 +41,17 @@ from .dataset import Dataset
42
41
  from .dataset_aux_classes import DatasetSpec
43
42
  from .dataset_bag import DatasetBag
44
43
  from .deriva_model import DerivaModel
45
- from .upload import (
46
- table_path,
47
- execution_rids,
48
- execution_metadata_dir,
49
- upload_directory,
50
- UploadAssetDirectory,
51
- )
44
+ from .upload import table_path, execution_rids, asset_file_path
52
45
  from .deriva_definitions import ColumnDefinition
53
- from .deriva_definitions import ExecMetadataVocab
54
46
  from .deriva_definitions import (
55
47
  RID,
56
48
  Status,
57
- FileUploadState,
58
49
  DerivaMLException,
59
50
  ML_SCHEMA,
60
51
  VocabularyTerm,
61
52
  MLVocab,
62
53
  FileSpec,
54
+ TableDefinition,
63
55
  )
64
56
 
65
57
  try:
@@ -346,30 +338,6 @@ class DerivaML(Dataset):
346
338
  table=self.model.name_to_table(table).name,
347
339
  )
348
340
 
349
- def asset_dir(
350
- self, table: str | Table, prefix: Optional[str | Path] = None
351
- ) -> UploadAssetDirectory:
352
- """Return a local file path in which to place a files for an asset table. T
353
-
354
- Args:
355
- table: Location of where to place files. Defaults to execution_assets_path.
356
- prefix: Root path to asset directory.
357
-
358
- Returns:
359
- Path to the directory in which asset files should be placed.
360
- """
361
- table = self.model.name_to_table(table)
362
- if not self.model.is_asset(table):
363
- raise DerivaMLException(f"The table {table} is not an asset table.")
364
-
365
- prefix = Path(prefix) if prefix else self.working_dir
366
- return UploadAssetDirectory(
367
- model=self.model,
368
- prefix=prefix,
369
- schema=table.schema.name,
370
- table=table.name,
371
- )
372
-
373
341
  def download_dir(self, cached: bool = False) -> Path:
374
342
  """Location where downloaded files are placed.
375
343
 
@@ -532,10 +500,17 @@ class DerivaML(Dataset):
532
500
  )
533
501
  )
534
502
 
503
def create_table(self, table: TableDefinition) -> Table:
    """Create a new table in the domain schema from a table definition.

    Args:
        table: Definition of the table to create. It is serialized with
            ``model_dump()`` before being passed to the schema.

    Returns:
        The result of the domain schema's ``create_table`` call.
    """
    domain = self.model.schemas[self.domain_schema]
    return domain.create_table(table.model_dump())
506
+
507
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
535
508
  def create_asset(
536
509
  self,
537
510
  asset_name: str,
538
511
  column_defs: Optional[Iterable[ColumnDefinition]] = None,
512
+ fkey_defs: Optional[Iterable[ColumnDefinition]] = None,
513
+ referenced_tables: Optional[Iterable[Table]] = None,
539
514
  comment: str = "",
540
515
  schema: Optional[str] = None,
541
516
  ) -> Table:
@@ -544,6 +519,8 @@ class DerivaML(Dataset):
544
519
  Args:
545
520
  asset_name: Name of the asset table.
546
521
  column_defs: Iterable of ColumnDefinition objects to provide additional metadata for asset.
522
+ fkey_defs: Iterable of ForeignKeyDefinition objects to provide additional metadata for asset.
523
+ referenced_tables: Iterable of Table objects to which the asset should provide foreign-key references.
547
524
  comment: Description of the asset table. (Default value = '')
548
525
  schema: Schema in which to create the asset table. Defaults to domain_schema.
549
526
  asset_name: str:
@@ -553,17 +530,82 @@ class DerivaML(Dataset):
553
530
  Table object for the asset table.
554
531
  """
555
532
  column_defs = column_defs or []
533
+ fkey_defs = fkey_defs or []
534
+ referenced_tables = referenced_tables or []
556
535
  schema = schema or self.domain_schema
536
+
537
+ self.add_term(
538
+ MLVocab.asset_type, asset_name, description=f"A {asset_name} asset"
539
+ )
557
540
  asset_table = self.model.schemas[schema].create_table(
558
541
  Table.define_asset(
559
542
  schema,
560
543
  asset_name,
561
544
  column_defs=[c.model_dump() for c in column_defs],
545
+ fkey_defs=[fk.model_dump() for fk in fkey_defs],
562
546
  comment=comment,
563
547
  )
564
548
  )
549
+
550
+ self.model.schemas[self.domain_schema].create_table(
551
+ Table.define_association(
552
+ [
553
+ (asset_table.name, asset_table),
554
+ ("Asset_Type", self.model.name_to_table("Asset_Type")),
555
+ ]
556
+ )
557
+ )
558
+ for t in referenced_tables:
559
+ asset_table.create_reference(self.model.name_to_table(t))
560
+ # Create a table to track execution that creates the asset
561
+ atable = self.model.schemas[self.domain_schema].create_table(
562
+ Table.define_association(
563
+ [
564
+ (asset_name, asset_table),
565
+ (
566
+ "Execution",
567
+ self.model.schemas[self.ml_schema].tables["Execution"],
568
+ ),
569
+ ]
570
+ )
571
+ )
572
+ atable.create_reference(self.model.name_to_table("Asset_Role"))
565
573
  return asset_table
566
574
 
575
+ # @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
576
def list_assets(self, asset_table: Table | str) -> list[dict]:
    """Return the contents of an asset table.

    Every row of the asset table is returned as a dictionary, extended with one
    extra entry keyed by ``MLVocab.asset_type.value`` that holds the list of
    asset-type values associated with that row.

    Args:
        asset_table: Table object, or name of the table, whose assets are listed.

    Returns:
        A list with one dictionary per row of the asset table.

    Raises:
        DerivaMLException: If the specified table is not an asset table.
    """
    # Resolve the argument to a Table first, so that the error message below can
    # use ``.name`` even when the caller passed the table name as a string.
    asset_table = self.model.name_to_table(asset_table)
    if not self.model.is_asset(asset_table):
        raise DerivaMLException(f"Table {asset_table.name} is not an asset")

    pb = self._model.catalog.getPathBuilder()
    asset_path = pb.schemas[asset_table.schema.name].tables[asset_table.name]

    # The association table linking this asset table to the asset-type vocabulary.
    asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
    type_path = pb.schemas[asset_type_table.schema.name].tables[
        asset_type_table.name
    ]

    # Collect the asset_type values associated with each row of the asset table.
    type_key = MLVocab.asset_type.value
    assets = []
    for asset in asset_path.entities().fetch():
        asset_types = (
            type_path.filter(type_path.columns[asset_table.name] == asset["RID"])
            .attributes(type_path.Asset_Type)
            .fetch()
        )
        assets.append(
            asset | {type_key: [asset_type[type_key] for asset_type in asset_types]}
        )
    return assets
608
+
567
609
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
568
610
  def create_feature(
569
611
  self,
@@ -717,24 +759,6 @@ class DerivaML(Dataset):
717
759
  """
718
760
  return self.model.find_features(table)
719
761
 
720
- @validate_call
721
- def add_features(self, features: Iterable[FeatureRecord]) -> int:
722
- """Add a set of new feature values to the catalog.
723
-
724
- Args:
725
- features: Iterable[FeatureRecord]:
726
-
727
- Returns:
728
- Number of attributes added
729
- """
730
- features = list(features)
731
- feature_table = features[0].feature.feature_table
732
- feature_path = self.pathBuilder.schemas[feature_table.schema.name].tables[
733
- feature_table.name
734
- ]
735
- entries = feature_path.insert(f.model_dump() for f in features)
736
- return len(entries)
737
-
738
762
  # noinspection PyProtectedMember
739
763
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
740
764
  def list_feature_values(
@@ -838,7 +862,8 @@ class DerivaML(Dataset):
838
862
  raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
839
863
  schema_name, table_name = vocab_table.schema.name, vocab_table.name
840
864
  schema_path = self.catalog.getPathBuilder().schemas[schema_name]
841
- for term in schema_path.tables[table_name].entities():
865
+
866
+ for term in schema_path.tables[table_name].entities().fetch():
842
867
  if term_name == term["Name"] or (
843
868
  term["Synonyms"] and term_name in term["Synonyms"]
844
869
  ):
@@ -891,65 +916,6 @@ class DerivaML(Dataset):
891
916
  snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
892
917
  )
893
918
 
894
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
895
- def download_asset(self, asset_rid: RID, dest_dir: Path) -> Path:
896
- """Download an asset from a URL and place it in a local directory.
897
-
898
- Args:
899
- asset_rid: URL of the asset.
900
- dest_dir: Destination directory for the asset.
901
-
902
- Returns:
903
- A Path object to the downloaded asset.
904
- """
905
- table = self.resolve_rid(asset_rid).table
906
- if not self.model.is_asset(table):
907
- raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")
908
-
909
- tpath = self.pathBuilder.schemas[table.schema.name].tables[table.name]
910
- asset_metadata = list(tpath.filter(tpath.RID == asset_rid).entities())[0]
911
- asset_url = asset_metadata["URL"]
912
- asset_filename = dest_dir / asset_metadata["Filename"]
913
-
914
- hs = HatracStore("https", self.host_name, self.credential)
915
- hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
916
- return Path(asset_filename)
917
-
918
- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
919
- def upload_assets(
920
- self,
921
- assets_dir: str | Path | UploadAssetDirectory,
922
- ) -> dict[Any, FileUploadState] | None:
923
- """Upload assets from a directory.
924
-
925
- This routine assumes that the current upload specification includes a configuration for the specified directory.
926
- Every asset in the specified directory is uploaded
927
-
928
- Args:
929
- assets_dir: Directory containing the assets to upload.
930
-
931
- Returns:
932
- Results of the upload operation.
933
-
934
- Raises:
935
- DerivaMLException: If there is an issue uploading the assets.
936
- """
937
-
938
- def path_to_asset(path: str) -> str:
939
- """Pull the asset name out of a path to that asset in the filesystem"""
940
- components = path.split("/")
941
- return components[
942
- components.index("asset") + 2
943
- ] # Look for asset in the path to find the name
944
-
945
- if isinstance(assets_dir, UploadAssetDirectory):
946
- assets_dir = assets_dir.path
947
-
948
- if not self.model.is_asset(Path(assets_dir).name):
949
- raise DerivaMLException("Directory does not have name of an asset table.")
950
- results = upload_directory(self.model, assets_dir)
951
- return {path_to_asset(p): r for p, r in results.items()}
952
-
953
919
  def _update_status(
954
920
  self, new_status: Status, status_detail: str, execution_rid: RID
955
921
  ):
@@ -1205,7 +1171,7 @@ class DerivaML(Dataset):
1205
1171
 
1206
1172
  """
1207
1173
 
1208
- # Get repo URL from local github repo.
1174
+ # Get repo URL from local GitHub repo.
1209
1175
  try:
1210
1176
  result = subprocess.run(
1211
1177
  ["git", "remote", "get-url", "origin"],
@@ -1249,7 +1215,7 @@ class DerivaML(Dataset):
1249
1215
 
1250
1216
  # @validate_call
1251
1217
  def create_execution(
1252
- self, configuration: ExecutionConfiguration, dryrun: bool = False
1218
+ self, configuration: ExecutionConfiguration, dry_run: bool = False
1253
1219
  ) -> "Execution":
1254
1220
  """Create an execution object
1255
1221
 
@@ -1261,19 +1227,14 @@ class DerivaML(Dataset):
1261
1227
 
1262
1228
  Args:
1263
1229
  configuration: ExecutionConfiguration:
1264
- dryrun: Do not create an execution record or upload results.
1230
+ dry_run: Do not create an execution record or upload results.
1265
1231
 
1266
1232
  Returns:
1267
1233
  An execution object.
1268
1234
  """
1269
1235
  from .execution import Execution
1270
1236
 
1271
- if self._execution:
1272
- DerivaMLException(
1273
- "Only one execution can be created for a Deriva ML instance."
1274
- )
1275
- else:
1276
- self._execution = Execution(configuration, self, dryrun=dryrun)
1237
+ self._execution = Execution(configuration, self, dry_run=dry_run)
1277
1238
  return self._execution
1278
1239
 
1279
1240
  # @validate_call
@@ -1288,13 +1249,11 @@ class DerivaML(Dataset):
1288
1249
  raise DerivaMLException(f"Multiple execution RIDs were found {e_rids}.")
1289
1250
 
1290
1251
  execution_rid = e_rids[0]
1291
- cfile = (
1292
- execution_metadata_dir(
1293
- self.working_dir,
1294
- exec_rid=execution_rid,
1295
- metadata_type=ExecMetadataVocab.execution_config.value,
1296
- )
1297
- / "configuration.json"
1252
+ cfile = asset_file_path(
1253
+ prefix=self.working_dir,
1254
+ exec_rid=execution_rid,
1255
+ file_name="configuration.json",
1256
+ asset_table=self.model.name_to_table("Execution_Metadata"),
1298
1257
  )
1299
1258
  configuration = ExecutionConfiguration.load_configuration(cfile)
1300
1259
  return Execution(configuration, self, reload=execution_rid)
deriva_ml/deriva_model.py CHANGED
@@ -126,6 +126,31 @@ class DerivaModel:
126
126
  table = self.name_to_table(table_name)
127
127
  return table.is_association(unqualified=unqualified, pure=pure)
128
128
 
129
def find_association(self, table1: Table | str, table2: Table | str) -> Table:
    """Given two tables, return the association table that connects them.

    Args:
        table1: Table object or table name whose associations are searched.
        table2: Table object or table name that the association must also reference.

    Returns:
        The single association table of ``table1`` that has a foreign key
        referencing ``table2``.

    Raises:
        DerivaMLException: If there is no such association table, or if there is
            more than one.
    """
    table1 = self.name_to_table(table1)
    table2 = self.name_to_table(table2)

    # Inspect the foreign keys without popping them, so the association objects
    # produced by find_associations() are left unmodified.
    tables = [
        a.table
        for a in table1.find_associations(pure=False)
        if any(fk.pk_table == table2 for fk in a.other_fkeys)
    ]
    if len(tables) == 1:
        return tables[0]
    if not tables:
        raise DerivaMLException(
            f"No association tables found between {table1.name} and {table2.name}."
        )
    raise DerivaMLException(
        f"There are {len(tables)} association tables between {table1.name} and {table2.name}."
    )
153
+
129
154
  def is_asset(self, table_name: str | Table) -> bool:
130
155
  """True if the specified table is an asset table.
131
156