deriva-ml 1.10.1__py3-none-any.whl → 1.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/database_model.py +3 -2
- deriva_ml/dataset.py +7 -16
- deriva_ml/dataset_bag.py +10 -3
- deriva_ml/demo_catalog.py +84 -78
- deriva_ml/deriva_definitions.py +2 -2
- deriva_ml/deriva_ml_base.py +105 -132
- deriva_ml/deriva_model.py +31 -0
- deriva_ml/execution.py +422 -315
- deriva_ml/execution_configuration.py +4 -0
- deriva_ml/feature.py +1 -2
- deriva_ml/schema_setup/create_schema.py +223 -183
- deriva_ml/upload.py +99 -236
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.12.0.dist-info}/METADATA +3 -1
- deriva_ml-1.12.0.dist-info/RECORD +27 -0
- deriva_ml-1.10.1.dist-info/RECORD +0 -27
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.12.0.dist-info}/WHEEL +0 -0
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.12.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.12.0.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.10.1.dist-info → deriva_ml-1.12.0.dist-info}/top_level.txt +0 -0
deriva_ml/deriva_ml_base.py
CHANGED
|
@@ -31,7 +31,6 @@ from deriva.core.datapath import DataPathException
|
|
|
31
31
|
from deriva.core.deriva_server import DerivaServer
|
|
32
32
|
from deriva.core.ermrest_catalog import ResolveRidResult
|
|
33
33
|
from deriva.core.ermrest_model import Key, Table
|
|
34
|
-
from deriva.core.hatrac_store import HatracStore
|
|
35
34
|
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
|
|
36
35
|
from pydantic import validate_call, ConfigDict
|
|
37
36
|
from requests import RequestException
|
|
@@ -42,24 +41,17 @@ from .dataset import Dataset
|
|
|
42
41
|
from .dataset_aux_classes import DatasetSpec
|
|
43
42
|
from .dataset_bag import DatasetBag
|
|
44
43
|
from .deriva_model import DerivaModel
|
|
45
|
-
from .upload import
|
|
46
|
-
table_path,
|
|
47
|
-
execution_rids,
|
|
48
|
-
execution_metadata_dir,
|
|
49
|
-
upload_directory,
|
|
50
|
-
UploadAssetDirectory,
|
|
51
|
-
)
|
|
44
|
+
from .upload import table_path, execution_rids, asset_file_path
|
|
52
45
|
from .deriva_definitions import ColumnDefinition
|
|
53
|
-
from .deriva_definitions import ExecMetadataVocab
|
|
54
46
|
from .deriva_definitions import (
|
|
55
47
|
RID,
|
|
56
48
|
Status,
|
|
57
|
-
FileUploadState,
|
|
58
49
|
DerivaMLException,
|
|
59
50
|
ML_SCHEMA,
|
|
60
51
|
VocabularyTerm,
|
|
61
52
|
MLVocab,
|
|
62
53
|
FileSpec,
|
|
54
|
+
TableDefinition,
|
|
63
55
|
)
|
|
64
56
|
|
|
65
57
|
try:
|
|
@@ -273,10 +265,13 @@ class DerivaML(Dataset):
|
|
|
273
265
|
is_notebook = True
|
|
274
266
|
else:
|
|
275
267
|
stack = inspect.stack()
|
|
268
|
+
# Get the caller's filename, which is two up the stack from here.
|
|
276
269
|
if len(stack) > 1:
|
|
277
|
-
filename = Path(
|
|
278
|
-
|
|
279
|
-
|
|
270
|
+
filename = Path(stack[2].filename)
|
|
271
|
+
if not filename.exists():
|
|
272
|
+
# Being called from the command-line interpreter.
|
|
273
|
+
filename = "REPL"
|
|
274
|
+
# Get the caller's filename, which is two up the stack from here.
|
|
280
275
|
else:
|
|
281
276
|
raise DerivaMLException(
|
|
282
277
|
"Looking for caller failed"
|
|
@@ -334,7 +329,6 @@ class DerivaML(Dataset):
|
|
|
334
329
|
"""Return a local file path in which to place a CSV to add values to a table on upload.
|
|
335
330
|
|
|
336
331
|
Args:
|
|
337
|
-
table: return:
|
|
338
332
|
table: str | Table:
|
|
339
333
|
|
|
340
334
|
Returns:
|
|
@@ -346,30 +340,6 @@ class DerivaML(Dataset):
|
|
|
346
340
|
table=self.model.name_to_table(table).name,
|
|
347
341
|
)
|
|
348
342
|
|
|
349
|
-
def asset_dir(
|
|
350
|
-
self, table: str | Table, prefix: Optional[str | Path] = None
|
|
351
|
-
) -> UploadAssetDirectory:
|
|
352
|
-
"""Return a local file path in which to place a files for an asset table. T
|
|
353
|
-
|
|
354
|
-
Args:
|
|
355
|
-
table: Location of where to place files. Defaults to execution_assets_path.
|
|
356
|
-
prefix: Root path to asset directory.
|
|
357
|
-
|
|
358
|
-
Returns:
|
|
359
|
-
Path to the directory in which asset files should be placed.
|
|
360
|
-
"""
|
|
361
|
-
table = self.model.name_to_table(table)
|
|
362
|
-
if not self.model.is_asset(table):
|
|
363
|
-
raise DerivaMLException(f"The table {table} is not an asset table.")
|
|
364
|
-
|
|
365
|
-
prefix = Path(prefix) if prefix else self.working_dir
|
|
366
|
-
return UploadAssetDirectory(
|
|
367
|
-
model=self.model,
|
|
368
|
-
prefix=prefix,
|
|
369
|
-
schema=table.schema.name,
|
|
370
|
-
table=table.name,
|
|
371
|
-
)
|
|
372
|
-
|
|
373
343
|
def download_dir(self, cached: bool = False) -> Path:
|
|
374
344
|
"""Location where downloaded files are placed.
|
|
375
345
|
|
|
@@ -532,10 +502,17 @@ class DerivaML(Dataset):
|
|
|
532
502
|
)
|
|
533
503
|
)
|
|
534
504
|
|
|
505
|
+
def create_table(self, table: TableDefinition) -> Table:
|
|
506
|
+
"""Create a table from a table definition."""
|
|
507
|
+
return self.model.schemas[self.domain_schema].create_table(table.model_dump())
|
|
508
|
+
|
|
509
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
535
510
|
def create_asset(
|
|
536
511
|
self,
|
|
537
512
|
asset_name: str,
|
|
538
513
|
column_defs: Optional[Iterable[ColumnDefinition]] = None,
|
|
514
|
+
fkey_defs: Optional[Iterable[ColumnDefinition]] = None,
|
|
515
|
+
referenced_tables: Optional[Iterable[Table]] = None,
|
|
539
516
|
comment: str = "",
|
|
540
517
|
schema: Optional[str] = None,
|
|
541
518
|
) -> Table:
|
|
@@ -544,6 +521,8 @@ class DerivaML(Dataset):
|
|
|
544
521
|
Args:
|
|
545
522
|
asset_name: Name of the asset table.
|
|
546
523
|
column_defs: Iterable of ColumnDefinition objects to provide additional metadata for asset.
|
|
524
|
+
fkey_defs: Iterable of ForeignKeyDefinition objects defining additional foreign keys for the asset table.
|
|
525
|
+
referenced_tables: Iterable of Table objects to which the asset table should have foreign-key references.
|
|
547
526
|
comment: Description of the asset table. (Default value = '')
|
|
548
527
|
schema: Schema in which to create the asset table. Defaults to domain_schema.
|
|
549
528
|
asset_name: str:
|
|
@@ -553,17 +532,82 @@ class DerivaML(Dataset):
|
|
|
553
532
|
Table object for the asset table.
|
|
554
533
|
"""
|
|
555
534
|
column_defs = column_defs or []
|
|
535
|
+
fkey_defs = fkey_defs or []
|
|
536
|
+
referenced_tables = referenced_tables or []
|
|
556
537
|
schema = schema or self.domain_schema
|
|
538
|
+
|
|
539
|
+
self.add_term(
|
|
540
|
+
MLVocab.asset_type, asset_name, description=f"A {asset_name} asset"
|
|
541
|
+
)
|
|
557
542
|
asset_table = self.model.schemas[schema].create_table(
|
|
558
543
|
Table.define_asset(
|
|
559
544
|
schema,
|
|
560
545
|
asset_name,
|
|
561
546
|
column_defs=[c.model_dump() for c in column_defs],
|
|
547
|
+
fkey_defs=[fk.model_dump() for fk in fkey_defs],
|
|
562
548
|
comment=comment,
|
|
563
549
|
)
|
|
564
550
|
)
|
|
551
|
+
|
|
552
|
+
self.model.schemas[self.domain_schema].create_table(
|
|
553
|
+
Table.define_association(
|
|
554
|
+
[
|
|
555
|
+
(asset_table.name, asset_table),
|
|
556
|
+
("Asset_Type", self.model.name_to_table("Asset_Type")),
|
|
557
|
+
]
|
|
558
|
+
)
|
|
559
|
+
)
|
|
560
|
+
for t in referenced_tables:
|
|
561
|
+
asset_table.create_reference(self.model.name_to_table(t))
|
|
562
|
+
# Create a table to track execution that creates the asset
|
|
563
|
+
atable = self.model.schemas[self.domain_schema].create_table(
|
|
564
|
+
Table.define_association(
|
|
565
|
+
[
|
|
566
|
+
(asset_name, asset_table),
|
|
567
|
+
(
|
|
568
|
+
"Execution",
|
|
569
|
+
self.model.schemas[self.ml_schema].tables["Execution"],
|
|
570
|
+
),
|
|
571
|
+
]
|
|
572
|
+
)
|
|
573
|
+
)
|
|
574
|
+
atable.create_reference(self.model.name_to_table("Asset_Role"))
|
|
565
575
|
return asset_table
|
|
566
576
|
|
|
577
|
+
# @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
578
|
+
def list_assets(self, asset_table: Table | str):
|
|
579
|
+
"""Return the contents of an asset table"""
|
|
580
|
+
|
|
581
|
+
if not self.model.is_asset(asset_table):
|
|
582
|
+
raise DerivaMLException(f"Table {asset_table.name} is not an asset")
|
|
583
|
+
asset_table = self.model.name_to_table(asset_table)
|
|
584
|
+
pb = self._model.catalog.getPathBuilder()
|
|
585
|
+
asset_path = pb.schemas[asset_table.schema.name].tables[asset_table.name]
|
|
586
|
+
|
|
587
|
+
asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
|
|
588
|
+
type_path = pb.schemas[asset_type_table.schema.name].tables[
|
|
589
|
+
asset_type_table.name
|
|
590
|
+
]
|
|
591
|
+
|
|
592
|
+
# For each asset in the asset table, get the list of asset_type values associated with it.
|
|
593
|
+
assets = []
|
|
594
|
+
for asset in asset_path.entities().fetch():
|
|
595
|
+
asset_types = (
|
|
596
|
+
type_path.filter(type_path.columns[asset_table.name] == asset["RID"])
|
|
597
|
+
.attributes(type_path.Asset_Type)
|
|
598
|
+
.fetch()
|
|
599
|
+
)
|
|
600
|
+
assets.append(
|
|
601
|
+
asset
|
|
602
|
+
| {
|
|
603
|
+
MLVocab.asset_type.value: [
|
|
604
|
+
asset_type[MLVocab.asset_type.value]
|
|
605
|
+
for asset_type in asset_types
|
|
606
|
+
]
|
|
607
|
+
}
|
|
608
|
+
)
|
|
609
|
+
return assets
|
|
610
|
+
|
|
567
611
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
568
612
|
def create_feature(
|
|
569
613
|
self,
|
|
@@ -717,24 +761,6 @@ class DerivaML(Dataset):
|
|
|
717
761
|
"""
|
|
718
762
|
return self.model.find_features(table)
|
|
719
763
|
|
|
720
|
-
@validate_call
|
|
721
|
-
def add_features(self, features: Iterable[FeatureRecord]) -> int:
|
|
722
|
-
"""Add a set of new feature values to the catalog.
|
|
723
|
-
|
|
724
|
-
Args:
|
|
725
|
-
features: Iterable[FeatureRecord]:
|
|
726
|
-
|
|
727
|
-
Returns:
|
|
728
|
-
Number of attributes added
|
|
729
|
-
"""
|
|
730
|
-
features = list(features)
|
|
731
|
-
feature_table = features[0].feature.feature_table
|
|
732
|
-
feature_path = self.pathBuilder.schemas[feature_table.schema.name].tables[
|
|
733
|
-
feature_table.name
|
|
734
|
-
]
|
|
735
|
-
entries = feature_path.insert(f.model_dump() for f in features)
|
|
736
|
-
return len(entries)
|
|
737
|
-
|
|
738
764
|
# noinspection PyProtectedMember
|
|
739
765
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
740
766
|
def list_feature_values(
|
|
@@ -838,7 +864,8 @@ class DerivaML(Dataset):
|
|
|
838
864
|
raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
|
|
839
865
|
schema_name, table_name = vocab_table.schema.name, vocab_table.name
|
|
840
866
|
schema_path = self.catalog.getPathBuilder().schemas[schema_name]
|
|
841
|
-
|
|
867
|
+
|
|
868
|
+
for term in schema_path.tables[table_name].entities().fetch():
|
|
842
869
|
if term_name == term["Name"] or (
|
|
843
870
|
term["Synonyms"] and term_name in term["Synonyms"]
|
|
844
871
|
):
|
|
@@ -891,65 +918,6 @@ class DerivaML(Dataset):
|
|
|
891
918
|
snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
|
|
892
919
|
)
|
|
893
920
|
|
|
894
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
895
|
-
def download_asset(self, asset_rid: RID, dest_dir: Path) -> Path:
|
|
896
|
-
"""Download an asset from a URL and place it in a local directory.
|
|
897
|
-
|
|
898
|
-
Args:
|
|
899
|
-
asset_rid: URL of the asset.
|
|
900
|
-
dest_dir: Destination directory for the asset.
|
|
901
|
-
|
|
902
|
-
Returns:
|
|
903
|
-
A Path object to the downloaded asset.
|
|
904
|
-
"""
|
|
905
|
-
table = self.resolve_rid(asset_rid).table
|
|
906
|
-
if not self.model.is_asset(table):
|
|
907
|
-
raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")
|
|
908
|
-
|
|
909
|
-
tpath = self.pathBuilder.schemas[table.schema.name].tables[table.name]
|
|
910
|
-
asset_metadata = list(tpath.filter(tpath.RID == asset_rid).entities())[0]
|
|
911
|
-
asset_url = asset_metadata["URL"]
|
|
912
|
-
asset_filename = dest_dir / asset_metadata["Filename"]
|
|
913
|
-
|
|
914
|
-
hs = HatracStore("https", self.host_name, self.credential)
|
|
915
|
-
hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
|
|
916
|
-
return Path(asset_filename)
|
|
917
|
-
|
|
918
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
919
|
-
def upload_assets(
|
|
920
|
-
self,
|
|
921
|
-
assets_dir: str | Path | UploadAssetDirectory,
|
|
922
|
-
) -> dict[Any, FileUploadState] | None:
|
|
923
|
-
"""Upload assets from a directory.
|
|
924
|
-
|
|
925
|
-
This routine assumes that the current upload specification includes a configuration for the specified directory.
|
|
926
|
-
Every asset in the specified directory is uploaded
|
|
927
|
-
|
|
928
|
-
Args:
|
|
929
|
-
assets_dir: Directory containing the assets to upload.
|
|
930
|
-
|
|
931
|
-
Returns:
|
|
932
|
-
Results of the upload operation.
|
|
933
|
-
|
|
934
|
-
Raises:
|
|
935
|
-
DerivaMLException: If there is an issue uploading the assets.
|
|
936
|
-
"""
|
|
937
|
-
|
|
938
|
-
def path_to_asset(path: str) -> str:
|
|
939
|
-
"""Pull the asset name out of a path to that asset in the filesystem"""
|
|
940
|
-
components = path.split("/")
|
|
941
|
-
return components[
|
|
942
|
-
components.index("asset") + 2
|
|
943
|
-
] # Look for asset in the path to find the name
|
|
944
|
-
|
|
945
|
-
if isinstance(assets_dir, UploadAssetDirectory):
|
|
946
|
-
assets_dir = assets_dir.path
|
|
947
|
-
|
|
948
|
-
if not self.model.is_asset(Path(assets_dir).name):
|
|
949
|
-
raise DerivaMLException("Directory does not have name of an asset table.")
|
|
950
|
-
results = upload_directory(self.model, assets_dir)
|
|
951
|
-
return {path_to_asset(p): r for p, r in results.items()}
|
|
952
|
-
|
|
953
921
|
def _update_status(
|
|
954
922
|
self, new_status: Status, status_detail: str, execution_rid: RID
|
|
955
923
|
):
|
|
@@ -1177,13 +1145,17 @@ class DerivaML(Dataset):
|
|
|
1177
1145
|
if self._is_notebook
|
|
1178
1146
|
else f"git hash-object {self.executable_path}"
|
|
1179
1147
|
)
|
|
1180
|
-
checksum =
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1148
|
+
checksum = (
|
|
1149
|
+
subprocess.run(
|
|
1150
|
+
cmd,
|
|
1151
|
+
capture_output=True,
|
|
1152
|
+
text=True,
|
|
1153
|
+
check=False,
|
|
1154
|
+
shell=True,
|
|
1155
|
+
).stdout.strip()
|
|
1156
|
+
if self.executable_path != "REPL"
|
|
1157
|
+
else "1"
|
|
1158
|
+
)
|
|
1187
1159
|
|
|
1188
1160
|
return Workflow(
|
|
1189
1161
|
name=name,
|
|
@@ -1205,7 +1177,9 @@ class DerivaML(Dataset):
|
|
|
1205
1177
|
|
|
1206
1178
|
"""
|
|
1207
1179
|
|
|
1208
|
-
# Get repo URL from local
|
|
1180
|
+
# Get repo URL from the local git repository.
|
|
1181
|
+
if self.executable_path == "REPL":
|
|
1182
|
+
return "REPL", True
|
|
1209
1183
|
try:
|
|
1210
1184
|
result = subprocess.run(
|
|
1211
1185
|
["git", "remote", "get-url", "origin"],
|
|
@@ -1261,7 +1235,7 @@ class DerivaML(Dataset):
|
|
|
1261
1235
|
|
|
1262
1236
|
Args:
|
|
1263
1237
|
configuration: ExecutionConfiguration:
|
|
1264
|
-
|
|
1238
|
+
dry_run: Do not create an execution record or upload results.
|
|
1265
1239
|
|
|
1266
1240
|
Returns:
|
|
1267
1241
|
An execution object.
|
|
@@ -1274,6 +1248,7 @@ class DerivaML(Dataset):
|
|
|
1274
1248
|
# @validate_call
|
|
1275
1249
|
def restore_execution(self, execution_rid: Optional[RID] = None) -> "Execution":
|
|
1276
1250
|
"""Return an Execution object for a previously started execution with the specified RID."""
|
|
1251
|
+
|
|
1277
1252
|
from .execution import Execution
|
|
1278
1253
|
|
|
1279
1254
|
# Find path to execution
|
|
@@ -1283,13 +1258,11 @@ class DerivaML(Dataset):
|
|
|
1283
1258
|
raise DerivaMLException(f"Multiple execution RIDs were found {e_rids}.")
|
|
1284
1259
|
|
|
1285
1260
|
execution_rid = e_rids[0]
|
|
1286
|
-
cfile = (
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
)
|
|
1292
|
-
/ "configuration.json"
|
|
1261
|
+
cfile = asset_file_path(
|
|
1262
|
+
prefix=self.working_dir,
|
|
1263
|
+
exec_rid=execution_rid,
|
|
1264
|
+
file_name="configuration.json",
|
|
1265
|
+
asset_table=self.model.name_to_table("Execution_Metadata"),
|
|
1293
1266
|
)
|
|
1294
1267
|
configuration = ExecutionConfiguration.load_configuration(cfile)
|
|
1295
1268
|
return Execution(configuration, self, reload=execution_rid)
|
deriva_ml/deriva_model.py
CHANGED
|
@@ -27,6 +27,8 @@ from typing import Iterable, Optional
|
|
|
27
27
|
class DerivaModel:
|
|
28
28
|
"""Augmented interface to deriva model class.
|
|
29
29
|
|
|
30
|
+
This class provides a number of DerivaML-specific methods that augment the interface of the deriva model class.
|
|
31
|
+
|
|
30
32
|
Attributes:
|
|
31
33
|
domain_schema: Schema name for domain specific tables and relationships.
|
|
32
34
|
model: ERMRest model for the catalog.
|
|
@@ -71,6 +73,10 @@ class DerivaModel:
|
|
|
71
73
|
# No domain schema defined.
|
|
72
74
|
self.domain_schema = domain_schema
|
|
73
75
|
|
|
76
|
+
def __getattr__(self, name):
|
|
77
|
+
# Called only if `name` is not found on DerivaModel. Delegate the attribute lookup to the wrapped model object.
|
|
78
|
+
return getattr(self.model, name)
|
|
79
|
+
|
|
74
80
|
def name_to_table(self, table: str | Table) -> Table:
|
|
75
81
|
"""Return the table object corresponding to the given table name.
|
|
76
82
|
|
|
@@ -126,6 +132,31 @@ class DerivaModel:
|
|
|
126
132
|
table = self.name_to_table(table_name)
|
|
127
133
|
return table.is_association(unqualified=unqualified, pure=pure)
|
|
128
134
|
|
|
135
|
+
def find_association(self, table1: Table | str, table2: Table | str) -> Table:
|
|
136
|
+
"""Given two tables, return an association table that connects the two.
|
|
137
|
+
|
|
138
|
+
Raises:
|
|
139
|
+
DerivaMLException: If there is no association table between the two tables, or more than one.
|
|
140
|
+
"""
|
|
141
|
+
table1 = self.name_to_table(table1)
|
|
142
|
+
table2 = self.name_to_table(table2)
|
|
143
|
+
|
|
144
|
+
tables = [
|
|
145
|
+
a.table
|
|
146
|
+
for a in table1.find_associations(pure=False)
|
|
147
|
+
if a.other_fkeys.pop().pk_table == table2
|
|
148
|
+
]
|
|
149
|
+
if len(tables) == 1:
|
|
150
|
+
return tables[0]
|
|
151
|
+
elif len(tables) == 0:
|
|
152
|
+
raise DerivaMLException(
|
|
153
|
+
f"No association tables found between {table1.name} and {table2.name}."
|
|
154
|
+
)
|
|
155
|
+
else:
|
|
156
|
+
raise DerivaMLException(
|
|
157
|
+
f"There are {len(tables)} association tables between {table1.name} and {table2.name}."
|
|
158
|
+
)
|
|
159
|
+
|
|
129
160
|
def is_asset(self, table_name: str | Table) -> bool:
|
|
130
161
|
"""True if the specified table is an asset table.
|
|
131
162
|
|