deriva-ml 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/dataset.py +1 -1
- deriva_ml/dataset_bag.py +10 -3
- deriva_ml/demo_catalog.py +84 -78
- deriva_ml/deriva_definitions.py +2 -2
- deriva_ml/deriva_ml_base.py +87 -128
- deriva_ml/deriva_model.py +25 -0
- deriva_ml/execution.py +389 -309
- deriva_ml/execution_configuration.py +16 -6
- deriva_ml/feature.py +1 -2
- deriva_ml/schema_setup/create_schema.py +223 -183
- deriva_ml/upload.py +95 -232
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/METADATA +2 -1
- deriva_ml-1.11.0.dist-info/RECORD +27 -0
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/WHEEL +1 -1
- deriva_ml-1.10.0.dist-info/RECORD +0 -27
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.10.0.dist-info → deriva_ml-1.11.0.dist-info}/top_level.txt +0 -0
deriva_ml/deriva_ml_base.py
CHANGED
|
@@ -31,7 +31,6 @@ from deriva.core.datapath import DataPathException
|
|
|
31
31
|
from deriva.core.deriva_server import DerivaServer
|
|
32
32
|
from deriva.core.ermrest_catalog import ResolveRidResult
|
|
33
33
|
from deriva.core.ermrest_model import Key, Table
|
|
34
|
-
from deriva.core.hatrac_store import HatracStore
|
|
35
34
|
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
|
|
36
35
|
from pydantic import validate_call, ConfigDict
|
|
37
36
|
from requests import RequestException
|
|
@@ -42,24 +41,17 @@ from .dataset import Dataset
|
|
|
42
41
|
from .dataset_aux_classes import DatasetSpec
|
|
43
42
|
from .dataset_bag import DatasetBag
|
|
44
43
|
from .deriva_model import DerivaModel
|
|
45
|
-
from .upload import
|
|
46
|
-
table_path,
|
|
47
|
-
execution_rids,
|
|
48
|
-
execution_metadata_dir,
|
|
49
|
-
upload_directory,
|
|
50
|
-
UploadAssetDirectory,
|
|
51
|
-
)
|
|
44
|
+
from .upload import table_path, execution_rids, asset_file_path
|
|
52
45
|
from .deriva_definitions import ColumnDefinition
|
|
53
|
-
from .deriva_definitions import ExecMetadataVocab
|
|
54
46
|
from .deriva_definitions import (
|
|
55
47
|
RID,
|
|
56
48
|
Status,
|
|
57
|
-
FileUploadState,
|
|
58
49
|
DerivaMLException,
|
|
59
50
|
ML_SCHEMA,
|
|
60
51
|
VocabularyTerm,
|
|
61
52
|
MLVocab,
|
|
62
53
|
FileSpec,
|
|
54
|
+
TableDefinition,
|
|
63
55
|
)
|
|
64
56
|
|
|
65
57
|
try:
|
|
@@ -346,30 +338,6 @@ class DerivaML(Dataset):
|
|
|
346
338
|
table=self.model.name_to_table(table).name,
|
|
347
339
|
)
|
|
348
340
|
|
|
349
|
-
def asset_dir(
|
|
350
|
-
self, table: str | Table, prefix: Optional[str | Path] = None
|
|
351
|
-
) -> UploadAssetDirectory:
|
|
352
|
-
"""Return a local file path in which to place a files for an asset table. T
|
|
353
|
-
|
|
354
|
-
Args:
|
|
355
|
-
table: Location of where to place files. Defaults to execution_assets_path.
|
|
356
|
-
prefix: Root path to asset directory.
|
|
357
|
-
|
|
358
|
-
Returns:
|
|
359
|
-
Path to the directory in which asset files should be placed.
|
|
360
|
-
"""
|
|
361
|
-
table = self.model.name_to_table(table)
|
|
362
|
-
if not self.model.is_asset(table):
|
|
363
|
-
raise DerivaMLException(f"The table {table} is not an asset table.")
|
|
364
|
-
|
|
365
|
-
prefix = Path(prefix) if prefix else self.working_dir
|
|
366
|
-
return UploadAssetDirectory(
|
|
367
|
-
model=self.model,
|
|
368
|
-
prefix=prefix,
|
|
369
|
-
schema=table.schema.name,
|
|
370
|
-
table=table.name,
|
|
371
|
-
)
|
|
372
|
-
|
|
373
341
|
def download_dir(self, cached: bool = False) -> Path:
|
|
374
342
|
"""Location where downloaded files are placed.
|
|
375
343
|
|
|
@@ -532,10 +500,17 @@ class DerivaML(Dataset):
|
|
|
532
500
|
)
|
|
533
501
|
)
|
|
534
502
|
|
|
503
|
+
def create_table(self, table: TableDefinition) -> Table:
|
|
504
|
+
"""Create a table from a table definition."""
|
|
505
|
+
return self.model.schemas[self.domain_schema].create_table(table.model_dump())
|
|
506
|
+
|
|
507
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
535
508
|
def create_asset(
|
|
536
509
|
self,
|
|
537
510
|
asset_name: str,
|
|
538
511
|
column_defs: Optional[Iterable[ColumnDefinition]] = None,
|
|
512
|
+
fkey_defs: Optional[Iterable[ColumnDefinition]] = None,
|
|
513
|
+
referenced_tables: Optional[Iterable[Table]] = None,
|
|
539
514
|
comment: str = "",
|
|
540
515
|
schema: Optional[str] = None,
|
|
541
516
|
) -> Table:
|
|
@@ -544,6 +519,8 @@ class DerivaML(Dataset):
|
|
|
544
519
|
Args:
|
|
545
520
|
asset_name: Name of the asset table.
|
|
546
521
|
column_defs: Iterable of ColumnDefinition objects to provide additional metadata for asset.
|
|
522
|
+
fkey_defs: Iterable of ForeignKeyDefinition objects to provide additional metadata for asset.
|
|
523
|
+
referenced_tables: Iterable of Table objects to which the asset table should hold foreign-key references.
|
|
547
524
|
comment: Description of the asset table. (Default value = '')
|
|
548
525
|
schema: Schema in which to create the asset table. Defaults to domain_schema.
|
|
549
526
|
asset_name: str:
|
|
@@ -553,17 +530,82 @@ class DerivaML(Dataset):
|
|
|
553
530
|
Table object for the asset table.
|
|
554
531
|
"""
|
|
555
532
|
column_defs = column_defs or []
|
|
533
|
+
fkey_defs = fkey_defs or []
|
|
534
|
+
referenced_tables = referenced_tables or []
|
|
556
535
|
schema = schema or self.domain_schema
|
|
536
|
+
|
|
537
|
+
self.add_term(
|
|
538
|
+
MLVocab.asset_type, asset_name, description=f"A {asset_name} asset"
|
|
539
|
+
)
|
|
557
540
|
asset_table = self.model.schemas[schema].create_table(
|
|
558
541
|
Table.define_asset(
|
|
559
542
|
schema,
|
|
560
543
|
asset_name,
|
|
561
544
|
column_defs=[c.model_dump() for c in column_defs],
|
|
545
|
+
fkey_defs=[fk.model_dump() for fk in fkey_defs],
|
|
562
546
|
comment=comment,
|
|
563
547
|
)
|
|
564
548
|
)
|
|
549
|
+
|
|
550
|
+
self.model.schemas[self.domain_schema].create_table(
|
|
551
|
+
Table.define_association(
|
|
552
|
+
[
|
|
553
|
+
(asset_table.name, asset_table),
|
|
554
|
+
("Asset_Type", self.model.name_to_table("Asset_Type")),
|
|
555
|
+
]
|
|
556
|
+
)
|
|
557
|
+
)
|
|
558
|
+
for t in referenced_tables:
|
|
559
|
+
asset_table.create_reference(self.model.name_to_table(t))
|
|
560
|
+
# Create a table to track execution that creates the asset
|
|
561
|
+
atable = self.model.schemas[self.domain_schema].create_table(
|
|
562
|
+
Table.define_association(
|
|
563
|
+
[
|
|
564
|
+
(asset_name, asset_table),
|
|
565
|
+
(
|
|
566
|
+
"Execution",
|
|
567
|
+
self.model.schemas[self.ml_schema].tables["Execution"],
|
|
568
|
+
),
|
|
569
|
+
]
|
|
570
|
+
)
|
|
571
|
+
)
|
|
572
|
+
atable.create_reference(self.model.name_to_table("Asset_Role"))
|
|
565
573
|
return asset_table
|
|
566
574
|
|
|
575
|
+
# @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
576
|
+
def list_assets(self, asset_table: Table | str):
|
|
577
|
+
"""Return the contents of an asset table"""
|
|
578
|
+
|
|
579
|
+
if not self.model.is_asset(asset_table):
|
|
580
|
+
raise DerivaMLException(f"Table {asset_table.name} is not an asset")
|
|
581
|
+
asset_table = self.model.name_to_table(asset_table)
|
|
582
|
+
pb = self._model.catalog.getPathBuilder()
|
|
583
|
+
asset_path = pb.schemas[asset_table.schema.name].tables[asset_table.name]
|
|
584
|
+
|
|
585
|
+
asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
|
|
586
|
+
type_path = pb.schemas[asset_type_table.schema.name].tables[
|
|
587
|
+
asset_type_table.name
|
|
588
|
+
]
|
|
589
|
+
|
|
590
|
+
# For each asset in the table, collect the list of asset_type values associated with it.
|
|
591
|
+
assets = []
|
|
592
|
+
for asset in asset_path.entities().fetch():
|
|
593
|
+
asset_types = (
|
|
594
|
+
type_path.filter(type_path.columns[asset_table.name] == asset["RID"])
|
|
595
|
+
.attributes(type_path.Asset_Type)
|
|
596
|
+
.fetch()
|
|
597
|
+
)
|
|
598
|
+
assets.append(
|
|
599
|
+
asset
|
|
600
|
+
| {
|
|
601
|
+
MLVocab.asset_type.value: [
|
|
602
|
+
asset_type[MLVocab.asset_type.value]
|
|
603
|
+
for asset_type in asset_types
|
|
604
|
+
]
|
|
605
|
+
}
|
|
606
|
+
)
|
|
607
|
+
return assets
|
|
608
|
+
|
|
567
609
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
568
610
|
def create_feature(
|
|
569
611
|
self,
|
|
@@ -717,24 +759,6 @@ class DerivaML(Dataset):
|
|
|
717
759
|
"""
|
|
718
760
|
return self.model.find_features(table)
|
|
719
761
|
|
|
720
|
-
@validate_call
|
|
721
|
-
def add_features(self, features: Iterable[FeatureRecord]) -> int:
|
|
722
|
-
"""Add a set of new feature values to the catalog.
|
|
723
|
-
|
|
724
|
-
Args:
|
|
725
|
-
features: Iterable[FeatureRecord]:
|
|
726
|
-
|
|
727
|
-
Returns:
|
|
728
|
-
Number of attributes added
|
|
729
|
-
"""
|
|
730
|
-
features = list(features)
|
|
731
|
-
feature_table = features[0].feature.feature_table
|
|
732
|
-
feature_path = self.pathBuilder.schemas[feature_table.schema.name].tables[
|
|
733
|
-
feature_table.name
|
|
734
|
-
]
|
|
735
|
-
entries = feature_path.insert(f.model_dump() for f in features)
|
|
736
|
-
return len(entries)
|
|
737
|
-
|
|
738
762
|
# noinspection PyProtectedMember
|
|
739
763
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
740
764
|
def list_feature_values(
|
|
@@ -838,7 +862,8 @@ class DerivaML(Dataset):
|
|
|
838
862
|
raise DerivaMLException(f"The table {table} is not a controlled vocabulary")
|
|
839
863
|
schema_name, table_name = vocab_table.schema.name, vocab_table.name
|
|
840
864
|
schema_path = self.catalog.getPathBuilder().schemas[schema_name]
|
|
841
|
-
|
|
865
|
+
|
|
866
|
+
for term in schema_path.tables[table_name].entities().fetch():
|
|
842
867
|
if term_name == term["Name"] or (
|
|
843
868
|
term["Synonyms"] and term_name in term["Synonyms"]
|
|
844
869
|
):
|
|
@@ -891,65 +916,6 @@ class DerivaML(Dataset):
|
|
|
891
916
|
snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
|
|
892
917
|
)
|
|
893
918
|
|
|
894
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
895
|
-
def download_asset(self, asset_rid: RID, dest_dir: Path) -> Path:
|
|
896
|
-
"""Download an asset from a URL and place it in a local directory.
|
|
897
|
-
|
|
898
|
-
Args:
|
|
899
|
-
asset_rid: URL of the asset.
|
|
900
|
-
dest_dir: Destination directory for the asset.
|
|
901
|
-
|
|
902
|
-
Returns:
|
|
903
|
-
A Path object to the downloaded asset.
|
|
904
|
-
"""
|
|
905
|
-
table = self.resolve_rid(asset_rid).table
|
|
906
|
-
if not self.model.is_asset(table):
|
|
907
|
-
raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")
|
|
908
|
-
|
|
909
|
-
tpath = self.pathBuilder.schemas[table.schema.name].tables[table.name]
|
|
910
|
-
asset_metadata = list(tpath.filter(tpath.RID == asset_rid).entities())[0]
|
|
911
|
-
asset_url = asset_metadata["URL"]
|
|
912
|
-
asset_filename = dest_dir / asset_metadata["Filename"]
|
|
913
|
-
|
|
914
|
-
hs = HatracStore("https", self.host_name, self.credential)
|
|
915
|
-
hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
|
|
916
|
-
return Path(asset_filename)
|
|
917
|
-
|
|
918
|
-
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
919
|
-
def upload_assets(
|
|
920
|
-
self,
|
|
921
|
-
assets_dir: str | Path | UploadAssetDirectory,
|
|
922
|
-
) -> dict[Any, FileUploadState] | None:
|
|
923
|
-
"""Upload assets from a directory.
|
|
924
|
-
|
|
925
|
-
This routine assumes that the current upload specification includes a configuration for the specified directory.
|
|
926
|
-
Every asset in the specified directory is uploaded
|
|
927
|
-
|
|
928
|
-
Args:
|
|
929
|
-
assets_dir: Directory containing the assets to upload.
|
|
930
|
-
|
|
931
|
-
Returns:
|
|
932
|
-
Results of the upload operation.
|
|
933
|
-
|
|
934
|
-
Raises:
|
|
935
|
-
DerivaMLException: If there is an issue uploading the assets.
|
|
936
|
-
"""
|
|
937
|
-
|
|
938
|
-
def path_to_asset(path: str) -> str:
|
|
939
|
-
"""Pull the asset name out of a path to that asset in the filesystem"""
|
|
940
|
-
components = path.split("/")
|
|
941
|
-
return components[
|
|
942
|
-
components.index("asset") + 2
|
|
943
|
-
] # Look for asset in the path to find the name
|
|
944
|
-
|
|
945
|
-
if isinstance(assets_dir, UploadAssetDirectory):
|
|
946
|
-
assets_dir = assets_dir.path
|
|
947
|
-
|
|
948
|
-
if not self.model.is_asset(Path(assets_dir).name):
|
|
949
|
-
raise DerivaMLException("Directory does not have name of an asset table.")
|
|
950
|
-
results = upload_directory(self.model, assets_dir)
|
|
951
|
-
return {path_to_asset(p): r for p, r in results.items()}
|
|
952
|
-
|
|
953
919
|
def _update_status(
|
|
954
920
|
self, new_status: Status, status_detail: str, execution_rid: RID
|
|
955
921
|
):
|
|
@@ -1205,7 +1171,7 @@ class DerivaML(Dataset):
|
|
|
1205
1171
|
|
|
1206
1172
|
"""
|
|
1207
1173
|
|
|
1208
|
-
# Get repo URL from local
|
|
1174
|
+
# Get the repo URL from the local git repository.
|
|
1209
1175
|
try:
|
|
1210
1176
|
result = subprocess.run(
|
|
1211
1177
|
["git", "remote", "get-url", "origin"],
|
|
@@ -1249,7 +1215,7 @@ class DerivaML(Dataset):
|
|
|
1249
1215
|
|
|
1250
1216
|
# @validate_call
|
|
1251
1217
|
def create_execution(
|
|
1252
|
-
self, configuration: ExecutionConfiguration,
|
|
1218
|
+
self, configuration: ExecutionConfiguration, dry_run: bool = False
|
|
1253
1219
|
) -> "Execution":
|
|
1254
1220
|
"""Create an execution object
|
|
1255
1221
|
|
|
@@ -1261,19 +1227,14 @@ class DerivaML(Dataset):
|
|
|
1261
1227
|
|
|
1262
1228
|
Args:
|
|
1263
1229
|
configuration: ExecutionConfiguration:
|
|
1264
|
-
|
|
1230
|
+
dry_run: Do not create an execution record or upload results.
|
|
1265
1231
|
|
|
1266
1232
|
Returns:
|
|
1267
1233
|
An execution object.
|
|
1268
1234
|
"""
|
|
1269
1235
|
from .execution import Execution
|
|
1270
1236
|
|
|
1271
|
-
|
|
1272
|
-
DerivaMLException(
|
|
1273
|
-
"Only one execution can be created for a Deriva ML instance."
|
|
1274
|
-
)
|
|
1275
|
-
else:
|
|
1276
|
-
self._execution = Execution(configuration, self, dryrun=dryrun)
|
|
1237
|
+
self._execution = Execution(configuration, self, dry_run=dry_run)
|
|
1277
1238
|
return self._execution
|
|
1278
1239
|
|
|
1279
1240
|
# @validate_call
|
|
@@ -1288,13 +1249,11 @@ class DerivaML(Dataset):
|
|
|
1288
1249
|
raise DerivaMLException(f"Multiple execution RIDs were found {e_rids}.")
|
|
1289
1250
|
|
|
1290
1251
|
execution_rid = e_rids[0]
|
|
1291
|
-
cfile = (
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
)
|
|
1297
|
-
/ "configuration.json"
|
|
1252
|
+
cfile = asset_file_path(
|
|
1253
|
+
prefix=self.working_dir,
|
|
1254
|
+
exec_rid=execution_rid,
|
|
1255
|
+
file_name="configuration.json",
|
|
1256
|
+
asset_table=self.model.name_to_table("Execution_Metadata"),
|
|
1298
1257
|
)
|
|
1299
1258
|
configuration = ExecutionConfiguration.load_configuration(cfile)
|
|
1300
1259
|
return Execution(configuration, self, reload=execution_rid)
|
deriva_ml/deriva_model.py
CHANGED
|
@@ -126,6 +126,31 @@ class DerivaModel:
|
|
|
126
126
|
table = self.name_to_table(table_name)
|
|
127
127
|
return table.is_association(unqualified=unqualified, pure=pure)
|
|
128
128
|
|
|
129
|
+
def find_association(self, table1: Table | str, table2: Table | str) -> Table:
|
|
130
|
+
"""Given two tables, return an association table that connects the two.
|
|
131
|
+
|
|
132
|
+
Raises:
|
|
133
|
+
DerivaMLException: If there is no association table connecting the two tables, or if there is more than one.
|
|
134
|
+
"""
|
|
135
|
+
table1 = self.name_to_table(table1)
|
|
136
|
+
table2 = self.name_to_table(table2)
|
|
137
|
+
|
|
138
|
+
tables = [
|
|
139
|
+
a.table
|
|
140
|
+
for a in table1.find_associations(pure=False)
|
|
141
|
+
if (t := a.other_fkeys.pop().pk_table) == table2
|
|
142
|
+
]
|
|
143
|
+
if len(tables) == 1:
|
|
144
|
+
return tables[0]
|
|
145
|
+
elif len(tables) == 0:
|
|
146
|
+
raise DerivaMLException(
|
|
147
|
+
f"No association tables found between {table1.name} and {table2.name}."
|
|
148
|
+
)
|
|
149
|
+
else:
|
|
150
|
+
raise DerivaMLException(
|
|
151
|
+
f"There are {len(tables)} association tables between {table1.name} and {table2.name}."
|
|
152
|
+
)
|
|
153
|
+
|
|
129
154
|
def is_asset(self, table_name: str | Table) -> bool:
|
|
130
155
|
"""True if the specified table is an asset table.
|
|
131
156
|
|