deriva-ml 1.11.0__tar.gz → 1.12.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deriva_ml-1.11.0/src/deriva_ml.egg-info → deriva_ml-1.12.0}/PKG-INFO +2 -1
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/README.md +1 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/Notebooks/DerivaML Execution.ipynb +2 -2
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/user-guide/datasets.md +0 -2
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/database_model.py +3 -2
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/dataset.py +6 -15
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/dataset_bag.py +1 -1
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/deriva_ml_base.py +20 -11
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/deriva_model.py +8 -2
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/execution.py +43 -13
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/execution_configuration.py +4 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/upload.py +5 -5
- {deriva_ml-1.11.0 → deriva_ml-1.12.0/src/deriva_ml.egg-info}/PKG-INFO +2 -1
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/.github/workflows/publish-docs.yml +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/.gitignore +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/LICENSE +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/.DS_Store +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/Notebooks/DerivaML Create Notes.ipynb +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/Notebooks/DerivaML Dataset.ipynb +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/Notebooks/DerivaML Features.ipynb +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/Notebooks/DerivaML Vocabulary.ipynb +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/assets/ERD.png +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/assets/Launcher.png +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/assets/copy_minid.png +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/assets/deriva-logo.png +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/assets/deriva-ml.pdf +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/assets/sharing-at-home.pdf +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/code-docs/dataset.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/code-docs/dataset_aux_classes.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/code-docs/dataset_bag.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/code-docs/deriva_ml_base.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/code-docs/deriva_model.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/code-docs/execution.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/code-docs/execution_configuration.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/code-docs/feature.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/code-docs/upload.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/deriva_ml_structure.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/index.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/release-notes.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/user-guide/execution-configuration.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/user-guide/identifiers.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/user-guide/install.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/user-guide/ml_workflow_instruction.md +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/mkdocs.yml +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/pyproject.toml +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/release.sh +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/setup.cfg +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/__init__.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/dataset_aux_classes.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/demo_catalog.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/deriva_definitions.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/execution_environment.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/feature.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/history.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/schema_setup/__init__.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/schema_setup/annotations.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/schema_setup/create_schema.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/schema_setup/policy.json +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/schema_setup/table_comments_utils.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/test_functions.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml.egg-info/SOURCES.txt +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml.egg-info/entry_points.txt +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml.egg-info/requires.txt +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml.egg-info/top_level.txt +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/tests/__init__.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/tests/derivaml_test.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/tests/runner.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/tests/test_basic_tables.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/tests/test_dataset.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/tests/test_download.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/tests/test_execution.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/tests/test_features.py +0 -0
- {deriva_ml-1.11.0 → deriva_ml-1.12.0}/tests/test_upload.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deriva-ml
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.12.0
|
|
4
4
|
Summary: Utilities to simplify use of Deriva and Pandas to create reproducible ML pipelines
|
|
5
5
|
Author-email: ISRD <isrd-dev@isi.edu>
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -28,3 +28,4 @@ The script release.sh will create a new release tag in GitHub. This script requ
|
|
|
28
28
|
GitHub CLI be installed.
|
|
29
29
|
|
|
30
30
|
See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.
|
|
31
|
+
|
|
@@ -721,12 +721,12 @@
|
|
|
721
721
|
" pass\n",
|
|
722
722
|
"\n",
|
|
723
723
|
" # Write a new model\n",
|
|
724
|
-
" model_file = manual_execution.asset_path('API_Model'
|
|
724
|
+
" model_file = manual_execution.asset_path('API_Model', 'modelfile.txt')\n",
|
|
725
725
|
" with open(model_file, 'w') as f:\n",
|
|
726
726
|
" f.write(\"Hello there a new model;\\n\")\n",
|
|
727
727
|
"\n",
|
|
728
728
|
" # Create some new feature values.\n",
|
|
729
|
-
" bb_csv_path, bb_asset_paths = ml_execution.
|
|
729
|
+
" bb_csv_path, bb_asset_paths = ml_execution.asset_path('Image', 'BoundingBox')\n",
|
|
730
730
|
" bounding_box_files = [bb_asset_paths['BoundingBox'] / f\"box{i}.txt\" for i in range(10)]\n",
|
|
731
731
|
" for i in range(10):\n",
|
|
732
732
|
" bounding_box_files.append(fn := bb_asset_paths['BoundingBox'] / f\"box{i}.txt\")\n",
|
|
@@ -17,7 +17,6 @@ Dataset types are assigned from a controlled vocabulary called `MLVocab.dataset_
|
|
|
17
17
|
as you need:
|
|
18
18
|
```
|
|
19
19
|
from deriva_ml import MLVocab
|
|
20
|
-
...
|
|
21
20
|
ml_instance.add_term(MLVocab.dataset_type, "DemoSet", description="A test dataset_table")
|
|
22
21
|
```
|
|
23
22
|
When you create a dataset, you can provide as many dataset types as required to streamline organizing and discovering
|
|
@@ -30,7 +29,6 @@ Its important to know how a dataset was created, so the most common way to creat
|
|
|
30
29
|
# Now lets create model configuration for our program.
|
|
31
30
|
api_workflow = Workflow(
|
|
32
31
|
name="API Workflow",
|
|
33
|
-
url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Dataset.ipynb",
|
|
34
32
|
workflow_type="Create Dataset Notebook"
|
|
35
33
|
)
|
|
36
34
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Ths module contains the definition of the DatabaseModel class. The role of this class is to provide an
|
|
1
|
+
"""Ths module contains the definition of the DatabaseModel class. The role of this class is to provide an interface between the BDBag representation
|
|
2
2
|
of a dataset and a sqllite database in which the contents of the bag are stored.
|
|
3
3
|
"""
|
|
4
4
|
|
|
@@ -51,7 +51,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
|
|
|
51
51
|
appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
|
|
52
52
|
into DatabaseModels, is kept in the class variable `_rid_map`.
|
|
53
53
|
|
|
54
|
-
Because you can load
|
|
54
|
+
Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked, and a new
|
|
55
55
|
sqllite instance is created for every new dataset version present.
|
|
56
56
|
|
|
57
57
|
Attributes:
|
|
@@ -290,6 +290,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
|
|
|
290
290
|
return DatasetBag(self, dataset_rid or self.dataset_rid)
|
|
291
291
|
|
|
292
292
|
def dataset_version(self, dataset_rid: Optional[RID] = None) -> DatasetVersion:
|
|
293
|
+
"""Return the version of the specified dataset."""
|
|
293
294
|
if dataset_rid and dataset_rid not in self.bag_rids:
|
|
294
295
|
DerivaMLException(f"Dataset RID {dataset_rid} is not in model.")
|
|
295
296
|
return self.bag_rids[dataset_rid]
|
|
@@ -232,12 +232,10 @@ class Dataset:
|
|
|
232
232
|
"""Increment the version of the specified dataset_table.
|
|
233
233
|
|
|
234
234
|
Args:
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
description: Description of the version update of the dataset_table.
|
|
240
|
-
execution_rid: Which execution is performing increment.
|
|
235
|
+
dataset_rid: RID of the dataset whose version is to be incremented.
|
|
236
|
+
component: Which version of the dataset_table to increment. Major, Minor or Patch
|
|
237
|
+
description: Description of the version update of the dataset_table.
|
|
238
|
+
execution_rid: Which execution is performing increment.
|
|
241
239
|
|
|
242
240
|
Returns:
|
|
243
241
|
new semantic version of the dataset_table as a 3-tuple
|
|
@@ -275,9 +273,6 @@ class Dataset:
|
|
|
275
273
|
description: Description of the dataset_table.
|
|
276
274
|
execution_rid: Execution under which the dataset_table will be created.
|
|
277
275
|
version: Version of the dataset_table.
|
|
278
|
-
type: str | list[str]:
|
|
279
|
-
description: str:
|
|
280
|
-
|
|
281
276
|
|
|
282
277
|
Returns:
|
|
283
278
|
New dataset_table RID.
|
|
@@ -349,7 +344,6 @@ class Dataset:
|
|
|
349
344
|
Args:
|
|
350
345
|
dataset_rid: RID of the dataset_table to delete.
|
|
351
346
|
recurse: If True, delete the dataset_table along with any nested datasets. (Default value = False)
|
|
352
|
-
dataset_rid: RID:
|
|
353
347
|
"""
|
|
354
348
|
# Get association table entries for this dataset_table
|
|
355
349
|
# Delete association table entries
|
|
@@ -397,7 +391,7 @@ class Dataset:
|
|
|
397
391
|
filtered_path = dataset_path
|
|
398
392
|
else:
|
|
399
393
|
filtered_path = dataset_path.filter(
|
|
400
|
-
(dataset_path.Deleted == False) | (dataset_path.Deleted == None)
|
|
394
|
+
(dataset_path.Deleted == False) | (dataset_path.Deleted == None) # noqa: E712
|
|
401
395
|
)
|
|
402
396
|
|
|
403
397
|
# Get a list of all the dataset_type values associated with this dataset_table.
|
|
@@ -439,8 +433,7 @@ class Dataset:
|
|
|
439
433
|
routine makes it possible to add objects from the specified table to a dataset_table.
|
|
440
434
|
|
|
441
435
|
Args:
|
|
442
|
-
element: Name
|
|
443
|
-
element: str | Table:
|
|
436
|
+
element: Name of the table or table object that is to be added to the dataset_table.
|
|
444
437
|
|
|
445
438
|
Returns:
|
|
446
439
|
The table object that was added to the dataset_table.
|
|
@@ -464,7 +457,6 @@ class Dataset:
|
|
|
464
457
|
|
|
465
458
|
Args:
|
|
466
459
|
dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
|
|
467
|
-
dataset_rid: RID:
|
|
468
460
|
recurse: (Default value = False)
|
|
469
461
|
limit: If provided, the maximum number of members to return for each element type.
|
|
470
462
|
|
|
@@ -677,7 +669,6 @@ class Dataset:
|
|
|
677
669
|
|
|
678
670
|
Args:
|
|
679
671
|
dataset_rid: return: RID of the parent dataset_table.
|
|
680
|
-
dataset_rid: RID:
|
|
681
672
|
|
|
682
673
|
Returns:
|
|
683
674
|
RID of the parent dataset_table.
|
|
@@ -168,7 +168,7 @@ class DatasetBag:
|
|
|
168
168
|
yield dict(zip(col_names, row))
|
|
169
169
|
|
|
170
170
|
@validate_call
|
|
171
|
-
def list_dataset_members(self, recurse: bool = False) -> dict[str, dict[str,
|
|
171
|
+
def list_dataset_members(self, recurse: bool = False) -> dict[str, dict[str, list]]:
|
|
172
172
|
"""Return a list of entities associated with a specific _dataset_table.
|
|
173
173
|
|
|
174
174
|
Args:
|
|
@@ -265,10 +265,13 @@ class DerivaML(Dataset):
|
|
|
265
265
|
is_notebook = True
|
|
266
266
|
else:
|
|
267
267
|
stack = inspect.stack()
|
|
268
|
+
# Get the caller's filename, which is two up the stack from here.
|
|
268
269
|
if len(stack) > 1:
|
|
269
|
-
filename = Path(
|
|
270
|
-
|
|
271
|
-
|
|
270
|
+
filename = Path(stack[2].filename)
|
|
271
|
+
if not filename.exists():
|
|
272
|
+
# Being called from the command line interpreter.
|
|
273
|
+
filename = "REPL"
|
|
274
|
+
# Get the caller's filename, which is two up the stack from here.
|
|
272
275
|
else:
|
|
273
276
|
raise DerivaMLException(
|
|
274
277
|
"Looking for caller failed"
|
|
@@ -326,7 +329,6 @@ class DerivaML(Dataset):
|
|
|
326
329
|
"""Return a local file path in which to place a CSV to add values to a table on upload.
|
|
327
330
|
|
|
328
331
|
Args:
|
|
329
|
-
table: return:
|
|
330
332
|
table: str | Table:
|
|
331
333
|
|
|
332
334
|
Returns:
|
|
@@ -1143,13 +1145,17 @@ class DerivaML(Dataset):
|
|
|
1143
1145
|
if self._is_notebook
|
|
1144
1146
|
else f"git hash-object {self.executable_path}"
|
|
1145
1147
|
)
|
|
1146
|
-
checksum =
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1148
|
+
checksum = (
|
|
1149
|
+
subprocess.run(
|
|
1150
|
+
cmd,
|
|
1151
|
+
capture_output=True,
|
|
1152
|
+
text=True,
|
|
1153
|
+
check=False,
|
|
1154
|
+
shell=True,
|
|
1155
|
+
).stdout.strip()
|
|
1156
|
+
if self.executable_path != "REPL"
|
|
1157
|
+
else "1"
|
|
1158
|
+
)
|
|
1153
1159
|
|
|
1154
1160
|
return Workflow(
|
|
1155
1161
|
name=name,
|
|
@@ -1172,6 +1178,8 @@ class DerivaML(Dataset):
|
|
|
1172
1178
|
"""
|
|
1173
1179
|
|
|
1174
1180
|
# Get repo URL from local gitHub repo.
|
|
1181
|
+
if self.executable_path == "REPL":
|
|
1182
|
+
return "REPL", True
|
|
1175
1183
|
try:
|
|
1176
1184
|
result = subprocess.run(
|
|
1177
1185
|
["git", "remote", "get-url", "origin"],
|
|
@@ -1240,6 +1248,7 @@ class DerivaML(Dataset):
|
|
|
1240
1248
|
# @validate_call
|
|
1241
1249
|
def restore_execution(self, execution_rid: Optional[RID] = None) -> "Execution":
|
|
1242
1250
|
"""Return an Execution object for a previously started execution with the specified RID."""
|
|
1251
|
+
|
|
1243
1252
|
from .execution import Execution
|
|
1244
1253
|
|
|
1245
1254
|
# Find path to execution
|
|
@@ -27,6 +27,8 @@ from typing import Iterable, Optional
|
|
|
27
27
|
class DerivaModel:
|
|
28
28
|
"""Augmented interface to deriva model class.
|
|
29
29
|
|
|
30
|
+
This class provides a number of DerivaML specific methods that augment the interface in the deriva model class.
|
|
31
|
+
|
|
30
32
|
Attributes:
|
|
31
33
|
domain_schema: Schema name for domain specific tables and relationships.
|
|
32
34
|
model: ERMRest model for the catalog.
|
|
@@ -71,6 +73,10 @@ class DerivaModel:
|
|
|
71
73
|
# No domain schema defined.
|
|
72
74
|
self.domain_schema = domain_schema
|
|
73
75
|
|
|
76
|
+
def __getattr__(self, name):
|
|
77
|
+
# Called only if `name` is not found in Manager. Delegate attributes to model class.
|
|
78
|
+
return getattr(self.model, name)
|
|
79
|
+
|
|
74
80
|
def name_to_table(self, table: str | Table) -> Table:
|
|
75
81
|
"""Return the table object corresponding to the given table name.
|
|
76
82
|
|
|
@@ -129,7 +135,7 @@ class DerivaModel:
|
|
|
129
135
|
def find_association(self, table1: Table | str, table2: Table | str) -> Table:
|
|
130
136
|
"""Given two tables, return an association table that connects the two.
|
|
131
137
|
|
|
132
|
-
Raises
|
|
138
|
+
Raises:
|
|
133
139
|
DerivaML exception if there is either not an association table or more than one association table.
|
|
134
140
|
"""
|
|
135
141
|
table1 = self.name_to_table(table1)
|
|
@@ -138,7 +144,7 @@ class DerivaModel:
|
|
|
138
144
|
tables = [
|
|
139
145
|
a.table
|
|
140
146
|
for a in table1.find_associations(pure=False)
|
|
141
|
-
if
|
|
147
|
+
if a.other_fkeys.pop().pk_table == table2
|
|
142
148
|
]
|
|
143
149
|
if len(tables) == 1:
|
|
144
150
|
return tables[0]
|
|
@@ -66,7 +66,6 @@ class AssetFilePath(type(Path())):
|
|
|
66
66
|
asset_rid: The RID of the asset if it has been uploaded into an asset table
|
|
67
67
|
"""
|
|
68
68
|
|
|
69
|
-
|
|
70
69
|
def __new__(
|
|
71
70
|
cls,
|
|
72
71
|
asset_path,
|
|
@@ -76,6 +75,17 @@ class AssetFilePath(type(Path())):
|
|
|
76
75
|
asset_types: list[str] | str,
|
|
77
76
|
asset_rid: Optional[RID] = None,
|
|
78
77
|
):
|
|
78
|
+
"""
|
|
79
|
+
Create a new Path object that has additional information related to the use of this path as an asset.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
asset_path: Local path to the location of the asset.
|
|
83
|
+
asset_name: The name of the asset in the catalog (e.g. the asset table name).
|
|
84
|
+
file_name: Name of the local file that contains the contents of the asset.
|
|
85
|
+
asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
|
|
86
|
+
asset_types: A list of terms from the Asset_Type controlled vocabulary.
|
|
87
|
+
asset_rid: The RID of the asset if it has been uploaded into an asset table
|
|
88
|
+
"""
|
|
79
89
|
obj = super().__new__(cls, asset_path)
|
|
80
90
|
obj.asset_types = (
|
|
81
91
|
asset_types if isinstance(asset_types, list) else [asset_types]
|
|
@@ -133,7 +143,7 @@ class Execution:
|
|
|
133
143
|
ml_object: The DerivaML instance that created the execution.
|
|
134
144
|
reload: RID of previously initialized execution object.
|
|
135
145
|
"""
|
|
136
|
-
self.asset_paths: list[
|
|
146
|
+
self.asset_paths: list[AssetFilePath] = []
|
|
137
147
|
self.configuration = configuration
|
|
138
148
|
self._ml_object = ml_object
|
|
139
149
|
self._model = ml_object.model
|
|
@@ -141,7 +151,7 @@ class Execution:
|
|
|
141
151
|
self.start_time = None
|
|
142
152
|
self.stop_time = None
|
|
143
153
|
self.status = Status.created
|
|
144
|
-
self.uploaded_assets: list[
|
|
154
|
+
self.uploaded_assets: Optional[dict[str, list[AssetFilePath]]] = None
|
|
145
155
|
self.configuration.argv = sys.argv
|
|
146
156
|
|
|
147
157
|
self.dataset_rids: list[RID] = []
|
|
@@ -152,6 +162,7 @@ class Execution:
|
|
|
152
162
|
self._cache_dir = self._ml_object.cache_dir
|
|
153
163
|
self._dry_run = dry_run
|
|
154
164
|
|
|
165
|
+
# Make sure we have a good workflow.
|
|
155
166
|
if isinstance(self.configuration.workflow, Workflow):
|
|
156
167
|
self.workflow_rid = (
|
|
157
168
|
self._ml_object.add_workflow(self.configuration.workflow)
|
|
@@ -168,6 +179,7 @@ class Execution:
|
|
|
168
179
|
"Workflow specified in execution configuration is not a Workflow"
|
|
169
180
|
)
|
|
170
181
|
|
|
182
|
+
# Validate the datasets and assets to be valid.
|
|
171
183
|
for d in self.configuration.datasets:
|
|
172
184
|
if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
|
|
173
185
|
raise DerivaMLException(
|
|
@@ -265,7 +277,7 @@ class Execution:
|
|
|
265
277
|
file_name="configuration.json",
|
|
266
278
|
asset_types=ExecMetadataVocab.execution_config.value,
|
|
267
279
|
)
|
|
268
|
-
with open(cfile, "w", encoding="utf-8") as config_file:
|
|
280
|
+
with open(cfile.as_posix(), "w", encoding="utf-8") as config_file:
|
|
269
281
|
json.dump(self.configuration.model_dump(), config_file)
|
|
270
282
|
|
|
271
283
|
# save runtime env
|
|
@@ -387,7 +399,7 @@ class Execution:
|
|
|
387
399
|
try:
|
|
388
400
|
self.update_status(Status.running, "Uploading execution files...")
|
|
389
401
|
results = upload_directory(self._model, self._asset_root)
|
|
390
|
-
except
|
|
402
|
+
except RuntimeError as e:
|
|
391
403
|
error = format_exception(e)
|
|
392
404
|
self.update_status(Status.failed, error)
|
|
393
405
|
raise DerivaMLException(f"Fail to upload execution_assets. Error: {error}")
|
|
@@ -519,7 +531,7 @@ class Execution:
|
|
|
519
531
|
|
|
520
532
|
def upload_execution_outputs(
|
|
521
533
|
self, clean_folder: bool = True
|
|
522
|
-
) -> dict[str, AssetFilePath]:
|
|
534
|
+
) -> dict[str, list[AssetFilePath]]:
|
|
523
535
|
"""Upload all the assets and metadata associated with the current execution.
|
|
524
536
|
|
|
525
537
|
This will include any new assets, features, or table values.
|
|
@@ -535,11 +547,11 @@ class Execution:
|
|
|
535
547
|
if self._dry_run:
|
|
536
548
|
return {}
|
|
537
549
|
try:
|
|
538
|
-
uploaded_assets = self._upload_execution_dirs()
|
|
550
|
+
self.uploaded_assets = self._upload_execution_dirs()
|
|
539
551
|
self.update_status(Status.completed, "Successfully end the execution.")
|
|
540
552
|
if clean_folder:
|
|
541
553
|
self._clean_folder_contents(self._execution_root)
|
|
542
|
-
return uploaded_assets
|
|
554
|
+
return self.uploaded_assets
|
|
543
555
|
except Exception as e:
|
|
544
556
|
error = format_exception(e)
|
|
545
557
|
self.update_status(Status.failed, error)
|
|
@@ -688,16 +700,26 @@ class Execution:
|
|
|
688
700
|
asset_name: str,
|
|
689
701
|
file_name: str,
|
|
690
702
|
asset_types: Optional[list[str] | str] = None,
|
|
703
|
+
copy_file=False,
|
|
691
704
|
**kwargs,
|
|
692
705
|
) -> AssetFilePath:
|
|
693
706
|
"""Return a pathlib Path to the directory in which to place files for the specified execution_asset type.
|
|
694
707
|
|
|
695
|
-
|
|
708
|
+
Given the name of an asset table, and a file name, register the file for upload, and return a path to that
|
|
709
|
+
file in the upload directory. In addition to the filename, additional asset metadata and file asset types may
|
|
710
|
+
be specified.
|
|
711
|
+
|
|
712
|
+
This routine has three modes, depending on if file_name refers to an existing file. If it doesn't, a path
|
|
713
|
+
to a new file with the specified name is returned. The caller can then open that file for writing.
|
|
714
|
+
|
|
715
|
+
If the provided filename refers to an existing file and the copy_file argument is False (the default), then the
|
|
716
|
+
returned path contains a symbolic link to that file. If the copy_file argument is True then the contents of
|
|
717
|
+
file_name are copied into the target directory.
|
|
696
718
|
|
|
697
719
|
Args:
|
|
698
720
|
asset_name: Type of asset to be uploaded. Must be a term in Asset_Type controlled vocabulary.
|
|
699
|
-
asset_types: Type of asset to be uploaded. Defaults to name of the asset.
|
|
700
721
|
file_name: Name of file to be uploaded.
|
|
722
|
+
asset_types: Type of asset to be uploaded. Defaults to name of the asset.
|
|
701
723
|
**kwargs: Any additional metadata values that may be part of the asset table.
|
|
702
724
|
|
|
703
725
|
Returns:
|
|
@@ -716,26 +738,33 @@ class Execution:
|
|
|
716
738
|
for t in asset_types:
|
|
717
739
|
self._ml_object.lookup_term(MLVocab.asset_type, t)
|
|
718
740
|
|
|
741
|
+
file_name = Path(file_name)
|
|
719
742
|
asset_path = asset_file_path(
|
|
720
743
|
self._working_dir,
|
|
721
744
|
self.execution_rid,
|
|
722
745
|
self._model.name_to_table(asset_name),
|
|
723
|
-
file_name,
|
|
746
|
+
file_name.name,
|
|
724
747
|
metadata=kwargs,
|
|
725
748
|
)
|
|
726
749
|
|
|
750
|
+
if file_name.exists():
|
|
751
|
+
if copy_file:
|
|
752
|
+
asset_path.write_bytes(file_name.read_bytes())
|
|
753
|
+
else:
|
|
754
|
+
asset_path.symlink_to(file_name)
|
|
755
|
+
|
|
727
756
|
# Persist the asset types into a file
|
|
728
757
|
with open(
|
|
729
758
|
asset_type_path(self._working_dir, self.execution_rid, asset_table),
|
|
730
759
|
"a",
|
|
731
760
|
encoding="utf-8",
|
|
732
761
|
) as f:
|
|
733
|
-
f.write(json.dumps({file_name: asset_types}) + "\n")
|
|
762
|
+
f.write(json.dumps({file_name.name: asset_types}) + "\n")
|
|
734
763
|
|
|
735
764
|
return AssetFilePath(
|
|
736
765
|
asset_path=asset_path,
|
|
737
766
|
asset_name=asset_name,
|
|
738
|
-
file_name=file_name,
|
|
767
|
+
file_name=file_name.name,
|
|
739
768
|
asset_metadata=kwargs,
|
|
740
769
|
asset_types=asset_types,
|
|
741
770
|
)
|
|
@@ -760,6 +789,7 @@ class Execution:
|
|
|
760
789
|
|
|
761
790
|
def execute(self) -> Execution:
|
|
762
791
|
"""Initiate an execution with provided configuration. Can be used in a context manager."""
|
|
792
|
+
self.execution_start()
|
|
763
793
|
return self
|
|
764
794
|
|
|
765
795
|
@validate_call
|
|
@@ -216,10 +216,10 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
|
|
|
216
216
|
"checksum_types": ["sha256", "md5"],
|
|
217
217
|
"hatrac_options": {"versioned_urls": True},
|
|
218
218
|
"hatrac_templates": {
|
|
219
|
-
"hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}",
|
|
219
|
+
"hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}.{{file_ext}}",
|
|
220
220
|
"content-disposition": "filename*=UTF-8''{file_name}.{file_ext}",
|
|
221
221
|
},
|
|
222
|
-
"record_query_template": "/entity/{target_table}/MD5={{md5}}&Filename={
|
|
222
|
+
"record_query_template": "/entity/{target_table}/MD5={{md5}}&Filename={file_name}.{file_ext}",
|
|
223
223
|
}
|
|
224
224
|
|
|
225
225
|
|
|
@@ -252,10 +252,10 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
|
|
|
252
252
|
"checksum_types": ["sha256", "md5"],
|
|
253
253
|
"hatrac_options": {"versioned_urls": True},
|
|
254
254
|
"hatrac_templates": {
|
|
255
|
-
"hatrac_uri": "/hatrac/{asset_table}/{md5}.{file_name}",
|
|
255
|
+
"hatrac_uri": "/hatrac/{asset_table}/{md5}.{file_name}.{file_ext}",
|
|
256
256
|
"content-disposition": "filename*=UTF-8''{file_name}.{file_ext}",
|
|
257
257
|
},
|
|
258
|
-
"record_query_template": "/entity/{target_table}/MD5={
|
|
258
|
+
"record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}.{file_ext}",
|
|
259
259
|
},
|
|
260
260
|
# {
|
|
261
261
|
# Upload the records into a table
|
|
@@ -448,7 +448,7 @@ def asset_type_path(prefix: Path | str, exec_rid: RID, asset_table: Table) -> Pa
|
|
|
448
448
|
asset_table: Table in which to place assets.
|
|
449
449
|
|
|
450
450
|
Returns:
|
|
451
|
-
Path to the file in which to place asset_type values for the named asset
|
|
451
|
+
Path to the file in which to place asset_type values for the named asset.
|
|
452
452
|
"""
|
|
453
453
|
path = (
|
|
454
454
|
execution_root(prefix, exec_rid=exec_rid)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deriva-ml
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.12.0
|
|
4
4
|
Summary: Utilities to simplify use of Deriva and Pandas to create reproducible ML pipelines
|
|
5
5
|
Author-email: ISRD <isrd-dev@isi.edu>
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -28,3 +28,4 @@ The script release.sh will create a new release tag in GitHub. This script requ
|
|
|
28
28
|
GitHub CLI be installed.
|
|
29
29
|
|
|
30
30
|
See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.
|
|
31
|
+
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|