deriva-ml 1.8.11__tar.gz → 1.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deriva_ml-1.8.11/src/deriva_ml.egg-info → deriva_ml-1.9.0}/PKG-INFO +1 -1
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/Notebooks/DerivaML Execution.ipynb +3 -11
- deriva_ml-1.9.0/docs/user-guide/execution-configuration.md +26 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/release.sh +2 -2
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/database_model.py +27 -4
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/dataset.py +14 -9
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/dataset_bag.py +1 -1
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/demo_catalog.py +9 -8
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/deriva_definitions.py +8 -3
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/deriva_ml_base.py +60 -21
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/deriva_model.py +2 -2
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/execution.py +5 -4
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/execution_configuration.py +20 -23
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/schema_setup/annotations.py +1 -1
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/schema_setup/create_schema.py +3 -2
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/upload.py +1 -1
- {deriva_ml-1.8.11 → deriva_ml-1.9.0/src/deriva_ml.egg-info}/PKG-INFO +1 -1
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml.egg-info/SOURCES.txt +0 -6
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/derivaml_test.py +1 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/test_dataset.py +8 -38
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/test_execution.py +9 -15
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/test_upload.py +5 -7
- deriva_ml-1.8.11/docs/user-guide/execution-configuration.md +0 -14
- deriva_ml-1.8.11/src/deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -36
- deriva_ml-1.8.11/src/deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -255
- deriva_ml-1.8.11/src/deriva_ml/build/lib/schema_setup/create_schema.py +0 -165
- deriva_ml-1.8.11/src/deriva_ml/schema_setup/alter_annotation.py +0 -55
- deriva_ml-1.8.11/src/deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml-1.8.11/tests/__init__.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/.github/workflows/publish-docs.yml +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/.gitignore +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/LICENSE +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/README.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/.DS_Store +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/Notebooks/DerivaML Create Notes.ipynb +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/Notebooks/DerivaML Dataset.ipynb +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/Notebooks/DerivaML Features.ipynb +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/Notebooks/DerivaML Vocabulary.ipynb +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/assets/ERD.png +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/assets/Launcher.png +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/assets/copy_minid.png +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/assets/deriva-logo.png +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/assets/deriva-ml.pdf +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/assets/sharing-at-home.pdf +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/dataset.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/dataset_aux_classes.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/dataset_bag.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/deriva_ml_base.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/deriva_model.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/execution.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/execution_configuration.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/feature.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/upload.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/deriva_ml_structure.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/index.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/release-notes.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/user-guide/datasets.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/user-guide/identifiers.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/user-guide/install.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/user-guide/ml_workflow_instruction.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/mkdocs.yml +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/pyproject.toml +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/setup.cfg +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/__init__.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/dataset_aux_classes.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/execution_environment.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/feature.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/history.py +0 -0
- {deriva_ml-1.8.11/src/deriva_ml/build/lib → deriva_ml-1.9.0/src/deriva_ml}/schema_setup/__init__.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/schema_setup/policy.json +0 -0
- {deriva_ml-1.8.11/src/deriva_ml/build/lib → deriva_ml-1.9.0/src/deriva_ml}/schema_setup/table_comments_utils.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/test_functions.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml.egg-info/entry_points.txt +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml.egg-info/requires.txt +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml.egg-info/top_level.txt +0 -0
- {deriva_ml-1.8.11/src/deriva_ml/schema_setup → deriva_ml-1.9.0/tests}/__init__.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/runner.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/test_basic_tables.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/test_download.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/test_features.py +0 -0
|
@@ -28,7 +28,7 @@
|
|
|
28
28
|
"source": [
|
|
29
29
|
"import builtins\n",
|
|
30
30
|
"from deriva.core.utils.globus_auth_utils import GlobusNativeLogin\n",
|
|
31
|
-
"from deriva_ml import ExecutionConfiguration,
|
|
31
|
+
"from deriva_ml import ExecutionConfiguration, MLVocab, DerivaSystemColumns\n",
|
|
32
32
|
"from deriva_ml.demo_catalog import create_demo_catalog, DemoML\n",
|
|
33
33
|
"from IPython.display import display, Markdown, JSON\n",
|
|
34
34
|
"import itertools\n",
|
|
@@ -166,12 +166,11 @@
|
|
|
166
166
|
"metadata": {},
|
|
167
167
|
"cell_type": "code",
|
|
168
168
|
"source": [
|
|
169
|
-
"ml_instance.add_term(MLVocab.workflow_type, \"Manual Workflow\", description=\"
|
|
169
|
+
"ml_instance.add_term(MLVocab.workflow_type, \"Manual Workflow\", description=\"Initial setup of Model File\")\n",
|
|
170
170
|
"ml_instance.add_term(MLVocab.execution_asset_type, \"API_Model\", description=\"Model for our API workflow\")\n",
|
|
171
171
|
"\n",
|
|
172
|
-
"api_workflow =
|
|
172
|
+
"api_workflow = ml_instance.create_workflow(\n",
|
|
173
173
|
" name=\"Manual Workflow\",\n",
|
|
174
|
-
" url='https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Execution.ipynb',\n",
|
|
175
174
|
" workflow_type=\"Manual Workflow\",\n",
|
|
176
175
|
" description=\"A manual operation\"\n",
|
|
177
176
|
")\n",
|
|
@@ -207,13 +206,6 @@
|
|
|
207
206
|
"source": [
|
|
208
207
|
"ml_instance.add_term(MLVocab.workflow_type, \"ML Demo\", description=\"A ML Workflow that uses Deriva ML API\")\n",
|
|
209
208
|
"\n",
|
|
210
|
-
"api_workflow = Workflow(\n",
|
|
211
|
-
" name=\"ML Demo\",\n",
|
|
212
|
-
" url=\"https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml\",\n",
|
|
213
|
-
" workflow_type=\"ML Demo\",\n",
|
|
214
|
-
" description=\"A workflow that uses Deriva ML\"\n",
|
|
215
|
-
")\n",
|
|
216
|
-
"\n",
|
|
217
209
|
"config = ExecutionConfiguration(\n",
|
|
218
210
|
" datasets=[training_dataset_rid, {'rid':testing_dataset_rid, 'materialize':False}],\n",
|
|
219
211
|
" assets = [training_model_rid],\n",
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Configuring an execution
|
|
2
|
+
|
|
3
|
+
One of the essential functions of DerivaML is to help keep track of how ML model results are created so that they can be shared and reproduced.
|
|
4
|
+
Every execution in DerivaML is represented by an Execution object, which keeps track of all of the parameters associated with an execution and
|
|
5
|
+
provides a number of functions that enable a program to help keep track of the configuration and results of a model execution.
|
|
6
|
+
|
|
7
|
+
The first step in creating a DerivaML execution is to create an `ExecutionConfiguration`.
|
|
8
|
+
The `ExecutionConfiguration` class is used to specify the inputs that are to be used by an Execution.
|
|
9
|
+
These inputs include
|
|
10
|
+
* A list of datasets that are used
|
|
11
|
+
* A list of other files (assets) that are to be used. This can include existing models, or any other information that the execution might need.
|
|
12
|
+
* The actual code that is being executed.
|
|
13
|
+
|
|
14
|
+
[`ExecutionConfiguration`][deriva_ml.execution_configuration.ExecutionConfiguration] is a Pydantic model.
|
|
15
|
+
As part of initializing an execution, the assets and datasets in the configuration object are downloaded and cached.
|
|
16
|
+
The datasets are provided as a list of `DatasetSpec` objects, which have the form:
|
|
17
|
+
```DatasetSpec(dataset_rid:RID, version:DatasetVersion, materialize:bool)```
|
|
18
|
+
|
|
19
|
+
It will be common to want to use just the latest version of the dataset, in which case you would use:
|
|
20
|
+
```python
|
|
21
|
+
deriva_ml = DerivaML(...)
|
|
22
|
+
dataset_rid = ...
|
|
23
|
+
datasets = [DatasetSpec(dataset_rid, version=deriva_ml.dataset_version(dataset_rid))]
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
If a dataset is large, downloading from the catalog might take a significant amount of time.
|
|
@@ -13,7 +13,7 @@ echo "Bumping version: $VERSION_TYPE"
|
|
|
13
13
|
|
|
14
14
|
# Bump the version using bump-my-version.
|
|
15
15
|
# This command should update version files, commit the changes, and create a Git tag.
|
|
16
|
-
bump-my-version bump $VERSION_TYPE --verbose
|
|
16
|
+
bump-my-version bump "$VERSION_TYPE" --verbose
|
|
17
17
|
|
|
18
18
|
# Push commits and tags to the remote repository.
|
|
19
19
|
echo "Pushing changes to remote repository..."
|
|
@@ -32,6 +32,6 @@ python -m build
|
|
|
32
32
|
NEW_TAG=$(git describe --tags --abbrev=0)
|
|
33
33
|
echo "New version tag: $NEW_TAG"
|
|
34
34
|
|
|
35
|
-
twine upload dist/*${NEW_TAG}
|
|
35
|
+
twine upload "dist/*${NEW_TAG/v/}"
|
|
36
36
|
|
|
37
37
|
echo "Release process complete!"
|
|
@@ -1,12 +1,15 @@
|
|
|
1
|
-
"""Ths module
|
|
1
|
+
"""Ths module contains the definition of the DatabaseModel class. The role of this class is to provide an nterface between the BDBag representation
|
|
2
2
|
of a dataset and a sqllite database in which the contents of the bag are stored.
|
|
3
3
|
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
4
7
|
import logging
|
|
5
8
|
import sqlite3
|
|
6
9
|
|
|
7
10
|
from csv import reader
|
|
8
11
|
from pathlib import Path
|
|
9
|
-
from typing import Any, Optional
|
|
12
|
+
from typing import Any, Optional, Generator
|
|
10
13
|
from urllib.parse import urlparse
|
|
11
14
|
|
|
12
15
|
from deriva.core.ermrest_model import Model
|
|
@@ -20,7 +23,7 @@ from .dataset_bag import DatasetBag
|
|
|
20
23
|
class DatabaseModelMeta(type):
|
|
21
24
|
"""Use metaclass to ensure that there is onl one instance per path"""
|
|
22
25
|
|
|
23
|
-
_paths_loaded: dict[Path
|
|
26
|
+
_paths_loaded: dict[Path, "DatabaseModel"] = {}
|
|
24
27
|
|
|
25
28
|
def __call__(cls, *args, **kwargs):
|
|
26
29
|
logger = logging.getLogger("deriva_ml")
|
|
@@ -47,7 +50,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
|
|
|
47
50
|
Because of nested datasets, it's possible that more than one dataset rid is in a bag, or that a dataset rid might
|
|
48
51
|
appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
|
|
49
52
|
into DatabaseModels, is kept in the class variable `_rid_map`.
|
|
50
|
-
|
|
53
|
+
|
|
51
54
|
Because you can load diffent versions of a dataset simultaniously, the dataset RID and version number are tracked, and a new
|
|
52
55
|
sqllite instance is created for every new dataset version present.
|
|
53
56
|
|
|
@@ -315,6 +318,26 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
|
|
|
315
318
|
)
|
|
316
319
|
return datasets
|
|
317
320
|
|
|
321
|
+
def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
|
|
322
|
+
"""Retrieve the contents of the specified table as a dictionary.
|
|
323
|
+
|
|
324
|
+
Args:
|
|
325
|
+
table: Table to retrieve data from. f schema is not provided as part of the table name,
|
|
326
|
+
the method will attempt to locate the schema for the table.
|
|
327
|
+
|
|
328
|
+
Returns:
|
|
329
|
+
A generator producing dictionaries containing the contents of the specified table as name/value pairs.
|
|
330
|
+
"""
|
|
331
|
+
table_name = self.normalize_table_name(table)
|
|
332
|
+
with self.dbase as dbase:
|
|
333
|
+
col_names = [
|
|
334
|
+
c[1]
|
|
335
|
+
for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()
|
|
336
|
+
]
|
|
337
|
+
result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
|
|
338
|
+
while row := result.fetchone():
|
|
339
|
+
yield dict(zip(col_names, row))
|
|
340
|
+
|
|
318
341
|
def normalize_table_name(self, table: str) -> str:
|
|
319
342
|
"""Attempt to insert the schema into a table name if it's not provided.
|
|
320
343
|
|
|
@@ -92,7 +92,7 @@ class Dataset:
|
|
|
92
92
|
dataset_list: list[DatasetSpec],
|
|
93
93
|
description: Optional[str] = "",
|
|
94
94
|
execution_rid: Optional[RID] = None,
|
|
95
|
-
) ->
|
|
95
|
+
) -> list[dict[str, Any]]:
|
|
96
96
|
schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
|
|
97
97
|
|
|
98
98
|
# Construct version records for insert
|
|
@@ -245,7 +245,7 @@ class Dataset:
|
|
|
245
245
|
DerivaMLException: if provided RID is not to a dataset_table.
|
|
246
246
|
"""
|
|
247
247
|
|
|
248
|
-
# Find all
|
|
248
|
+
# Find all the datasets that are reachable from this dataset and determine their new version numbers.
|
|
249
249
|
related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
|
|
250
250
|
version_update_list = [
|
|
251
251
|
DatasetSpec(
|
|
@@ -254,7 +254,7 @@ class Dataset:
|
|
|
254
254
|
)
|
|
255
255
|
for ds_rid in related_datasets
|
|
256
256
|
]
|
|
257
|
-
|
|
257
|
+
self._insert_dataset_versions(
|
|
258
258
|
version_update_list, description=description, execution_rid=execution_rid
|
|
259
259
|
)
|
|
260
260
|
return [d.version for d in version_update_list if d.rid == dataset_rid][0]
|
|
@@ -751,9 +751,10 @@ class Dataset:
|
|
|
751
751
|
]
|
|
752
752
|
|
|
753
753
|
def _table_paths(
|
|
754
|
-
self,
|
|
754
|
+
self,
|
|
755
|
+
dataset: Optional[DatasetSpec] = None,
|
|
756
|
+
snapshot_catalog: Optional[DerivaML] = None,
|
|
755
757
|
) -> Iterator[tuple[str, str, Table]]:
|
|
756
|
-
|
|
757
758
|
paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
|
|
758
759
|
|
|
759
760
|
def source_path(path: tuple[Table, ...]):
|
|
@@ -779,17 +780,20 @@ class Dataset:
|
|
|
779
780
|
def _collect_paths(
|
|
780
781
|
self,
|
|
781
782
|
dataset_rid: Optional[RID] = None,
|
|
782
|
-
|
|
783
|
+
snapshot: Optional[Dataset] = None,
|
|
783
784
|
dataset_nesting_depth: Optional[int] = None,
|
|
784
785
|
) -> set[tuple[Table, ...]]:
|
|
785
786
|
|
|
786
|
-
snapshot_catalog =
|
|
787
|
+
snapshot_catalog = snapshot if snapshot else self
|
|
788
|
+
|
|
787
789
|
dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
|
|
788
790
|
"Dataset"
|
|
789
791
|
]
|
|
790
792
|
dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
|
|
791
793
|
"Dataset_Dataset"
|
|
792
794
|
]
|
|
795
|
+
|
|
796
|
+
# Figure out what types of elements the dataset contains.
|
|
793
797
|
dataset_associations = [
|
|
794
798
|
a
|
|
795
799
|
for a in self.dataset_table.find_associations()
|
|
@@ -812,7 +816,8 @@ class Dataset:
|
|
|
812
816
|
]
|
|
813
817
|
else:
|
|
814
818
|
included_associations = dataset_associations
|
|
815
|
-
|
|
819
|
+
|
|
820
|
+
# Get the paths through the schema and filter out all the dataset paths not used by this dataset.
|
|
816
821
|
paths = {
|
|
817
822
|
tuple(p)
|
|
818
823
|
for p in snapshot_catalog._model._schema_to_paths()
|
|
@@ -827,7 +832,7 @@ class Dataset:
|
|
|
827
832
|
if dataset_rid:
|
|
828
833
|
for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
|
|
829
834
|
nested_paths |= self._collect_paths(
|
|
830
|
-
c,
|
|
835
|
+
c, snapshot=snapshot_catalog
|
|
831
836
|
)
|
|
832
837
|
else:
|
|
833
838
|
# Initialize nesting depth if not already provided.
|
|
@@ -109,7 +109,7 @@ class DatasetBag:
|
|
|
109
109
|
for ts, on in paths:
|
|
110
110
|
tables = " JOIN ".join(ts)
|
|
111
111
|
on_expression = " and ".join(
|
|
112
|
-
[f"{column_name(
|
|
112
|
+
[f"{column_name(left)}={column_name(right)}" for left, right in on]
|
|
113
113
|
)
|
|
114
114
|
sql.append(
|
|
115
115
|
f"SELECT {select_args} FROM {tables} ON {on_expression} WHERE {dataset_table_name}.RID IN ({datasets})"
|
|
@@ -5,6 +5,7 @@ import logging
|
|
|
5
5
|
from random import random, randint
|
|
6
6
|
import tempfile
|
|
7
7
|
from tempfile import TemporaryDirectory
|
|
8
|
+
from typing import Optional
|
|
8
9
|
import itertools
|
|
9
10
|
|
|
10
11
|
from deriva.config.acl_config import AclConfig
|
|
@@ -18,7 +19,6 @@ from requests import HTTPError
|
|
|
18
19
|
from deriva_ml import (
|
|
19
20
|
DerivaML,
|
|
20
21
|
ExecutionConfiguration,
|
|
21
|
-
Workflow,
|
|
22
22
|
MLVocab,
|
|
23
23
|
BuiltinTypes,
|
|
24
24
|
ColumnDefinition,
|
|
@@ -169,12 +169,9 @@ def create_demo_features(ml_instance):
|
|
|
169
169
|
description="Model for our API workflow",
|
|
170
170
|
)
|
|
171
171
|
|
|
172
|
-
api_workflow = ml_instance.
|
|
173
|
-
Workflow
|
|
174
|
-
|
|
175
|
-
url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml",
|
|
176
|
-
workflow_type="API Workflow",
|
|
177
|
-
)
|
|
172
|
+
api_workflow = ml_instance.create_workflow(
|
|
173
|
+
name="API Workflow",
|
|
174
|
+
workflow_type="API Workflow",
|
|
178
175
|
)
|
|
179
176
|
|
|
180
177
|
api_execution = ml_instance.create_execution(
|
|
@@ -322,7 +319,11 @@ def create_demo_catalog(
|
|
|
322
319
|
|
|
323
320
|
class DemoML(DerivaML):
|
|
324
321
|
def __init__(
|
|
325
|
-
self,
|
|
322
|
+
self,
|
|
323
|
+
hostname,
|
|
324
|
+
catalog_id,
|
|
325
|
+
cache_dir: Optional[str] = None,
|
|
326
|
+
working_dir: Optional[str] = None,
|
|
326
327
|
):
|
|
327
328
|
super().__init__(
|
|
328
329
|
hostname=hostname,
|
|
@@ -8,7 +8,7 @@ from enum import Enum
|
|
|
8
8
|
from typing import Any, Iterable, Optional, Annotated
|
|
9
9
|
|
|
10
10
|
import deriva.core.ermrest_model as em
|
|
11
|
-
from urllib.parse import urlparse
|
|
11
|
+
from urllib.parse import urlparse
|
|
12
12
|
from deriva.core.ermrest_model import builtin_types
|
|
13
13
|
from pydantic import (
|
|
14
14
|
BaseModel,
|
|
@@ -139,13 +139,18 @@ class FileSpec(BaseModel):
|
|
|
139
139
|
if url_parts.scheme == "tag":
|
|
140
140
|
return v
|
|
141
141
|
elif not url_parts.scheme:
|
|
142
|
-
return f
|
|
142
|
+
return f"tag://{gethostname()},{date.today()}:file://{v}"
|
|
143
143
|
else:
|
|
144
144
|
raise ValidationError("url is not a file URL")
|
|
145
145
|
|
|
146
146
|
@model_serializer()
|
|
147
147
|
def serialize_filespec(self):
|
|
148
|
-
return {
|
|
148
|
+
return {
|
|
149
|
+
"URL": self.url,
|
|
150
|
+
"Description": self.description,
|
|
151
|
+
"MD5": self.md5,
|
|
152
|
+
"Length": self.length,
|
|
153
|
+
}
|
|
149
154
|
|
|
150
155
|
|
|
151
156
|
class VocabularyTerm(BaseModel):
|
|
@@ -32,6 +32,7 @@ from deriva.core.deriva_server import DerivaServer
|
|
|
32
32
|
from deriva.core.ermrest_catalog import ResolveRidResult
|
|
33
33
|
from deriva.core.ermrest_model import Key, Table
|
|
34
34
|
from deriva.core.hatrac_store import HatracStore
|
|
35
|
+
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
|
|
35
36
|
from pydantic import validate_call, ConfigDict
|
|
36
37
|
from requests import RequestException
|
|
37
38
|
|
|
@@ -70,17 +71,29 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
|
70
71
|
try:
|
|
71
72
|
from IPython import get_ipython
|
|
72
73
|
except ImportError: # Graceful fallback if IPython isn't installed.
|
|
73
|
-
|
|
74
|
+
|
|
75
|
+
def get_ipython():
|
|
76
|
+
"""Dummy routine in case you are not running in IPython."""
|
|
77
|
+
return None
|
|
78
|
+
|
|
74
79
|
|
|
75
80
|
try:
|
|
76
81
|
from jupyter_server.serverapp import list_running_servers
|
|
77
82
|
except ImportError:
|
|
78
|
-
|
|
83
|
+
|
|
84
|
+
def list_running_servers():
|
|
85
|
+
"""Dummy routine in case you are not running in Jupyter."""
|
|
86
|
+
return []
|
|
87
|
+
|
|
79
88
|
|
|
80
89
|
try:
|
|
81
90
|
from ipykernel import get_connection_file
|
|
82
91
|
except ImportError:
|
|
83
|
-
|
|
92
|
+
|
|
93
|
+
def get_connection_file():
|
|
94
|
+
"""Dummy routine in case you are not running in Jupyter."""
|
|
95
|
+
return ""
|
|
96
|
+
|
|
84
97
|
|
|
85
98
|
if TYPE_CHECKING:
|
|
86
99
|
from .execution import Execution
|
|
@@ -102,8 +115,8 @@ class DerivaML(Dataset):
|
|
|
102
115
|
self,
|
|
103
116
|
hostname: str,
|
|
104
117
|
catalog_id: str | int,
|
|
105
|
-
domain_schema: str = None,
|
|
106
|
-
project_name: str = None,
|
|
118
|
+
domain_schema: Optional[str] = None,
|
|
119
|
+
project_name: Optional[str] = None,
|
|
107
120
|
cache_dir: Optional[str] = None,
|
|
108
121
|
working_dir: Optional[str] = None,
|
|
109
122
|
model_version: str = "1",
|
|
@@ -205,9 +218,8 @@ class DerivaML(Dataset):
|
|
|
205
218
|
except subprocess.CalledProcessError:
|
|
206
219
|
self._logger.error("nbstripout is not found.")
|
|
207
220
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
|
|
221
|
+
@staticmethod
|
|
222
|
+
def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
|
|
211
223
|
"""Return the absolute path of the current notebook."""
|
|
212
224
|
# Get the kernel's connection file and extract the kernel ID
|
|
213
225
|
try:
|
|
@@ -267,7 +279,7 @@ class DerivaML(Dataset):
|
|
|
267
279
|
) # Get the caller's filename, which is two up the stack from here.
|
|
268
280
|
else:
|
|
269
281
|
raise DerivaMLException(
|
|
270
|
-
|
|
282
|
+
"Looking for caller failed"
|
|
271
283
|
) # Stack is too shallow
|
|
272
284
|
return filename, is_notebook
|
|
273
285
|
|
|
@@ -335,7 +347,7 @@ class DerivaML(Dataset):
|
|
|
335
347
|
)
|
|
336
348
|
|
|
337
349
|
def asset_dir(
|
|
338
|
-
self, table: str | Table, prefix: str | Path = None
|
|
350
|
+
self, table: str | Table, prefix: Optional[str | Path] = None
|
|
339
351
|
) -> UploadAssetDirectory:
|
|
340
352
|
"""Return a local file path in which to place a files for an asset table. T
|
|
341
353
|
|
|
@@ -369,6 +381,29 @@ class DerivaML(Dataset):
|
|
|
369
381
|
"""
|
|
370
382
|
return self.cache_dir if cached else self.working_dir
|
|
371
383
|
|
|
384
|
+
@staticmethod
|
|
385
|
+
def globus_login(host: str) -> None:
|
|
386
|
+
"""Log into the specified host using Globus.
|
|
387
|
+
|
|
388
|
+
Args:
|
|
389
|
+
host:
|
|
390
|
+
|
|
391
|
+
Returns:
|
|
392
|
+
|
|
393
|
+
"""
|
|
394
|
+
gnl = GlobusNativeLogin(host=host)
|
|
395
|
+
if gnl.is_logged_in([host]):
|
|
396
|
+
print("You are already logged in.")
|
|
397
|
+
else:
|
|
398
|
+
gnl.login(
|
|
399
|
+
[host],
|
|
400
|
+
no_local_server=True,
|
|
401
|
+
no_browser=True,
|
|
402
|
+
refresh_tokens=True,
|
|
403
|
+
update_bdbag_keychain=True,
|
|
404
|
+
)
|
|
405
|
+
print("Login Successful")
|
|
406
|
+
|
|
372
407
|
def chaise_url(self, table: RID | Table) -> str:
|
|
373
408
|
"""Return a Chaise URL to the specified table.
|
|
374
409
|
|
|
@@ -379,15 +414,15 @@ class DerivaML(Dataset):
|
|
|
379
414
|
Returns:
|
|
380
415
|
URL to the table in Chaise format.
|
|
381
416
|
"""
|
|
417
|
+
table_obj = self.model.name_to_table(table)
|
|
382
418
|
try:
|
|
383
|
-
table = self.model.name_to_table(table)
|
|
384
419
|
uri = self.catalog.get_server_uri().replace(
|
|
385
420
|
"ermrest/catalog/", "chaise/recordset/#"
|
|
386
421
|
)
|
|
387
422
|
except DerivaMLException:
|
|
388
423
|
# Perhaps we have a RID....
|
|
389
424
|
uri = self.cite(table)
|
|
390
|
-
return f"{uri}/{urlquote(
|
|
425
|
+
return f"{uri}/{urlquote(table_obj.schema.name)}:{urlquote(table_obj.name)}"
|
|
391
426
|
|
|
392
427
|
def cite(self, entity: dict | str) -> str:
|
|
393
428
|
"""Return a citation URL for the provided entity.
|
|
@@ -401,7 +436,9 @@ class DerivaML(Dataset):
|
|
|
401
436
|
Raises:
|
|
402
437
|
DerivaMLException: if provided RID does not exist.
|
|
403
438
|
"""
|
|
404
|
-
if entity.startswith(
|
|
439
|
+
if isinstance(entity, str) and entity.startswith(
|
|
440
|
+
f"https://{self.host_name}/id/{self.catalog_id}/"
|
|
441
|
+
):
|
|
405
442
|
# Already got a citation...
|
|
406
443
|
return entity
|
|
407
444
|
try:
|
|
@@ -498,9 +535,9 @@ class DerivaML(Dataset):
|
|
|
498
535
|
def create_asset(
|
|
499
536
|
self,
|
|
500
537
|
asset_name: str,
|
|
501
|
-
column_defs: Iterable[ColumnDefinition] = None,
|
|
538
|
+
column_defs: Optional[Iterable[ColumnDefinition]] = None,
|
|
502
539
|
comment: str = "",
|
|
503
|
-
schema: str = None,
|
|
540
|
+
schema: Optional[str] = None,
|
|
504
541
|
) -> Table:
|
|
505
542
|
"""Create an asset table with the given asset name.
|
|
506
543
|
|
|
@@ -532,9 +569,9 @@ class DerivaML(Dataset):
|
|
|
532
569
|
self,
|
|
533
570
|
target_table: Table | str,
|
|
534
571
|
feature_name: str,
|
|
535
|
-
terms: list[Table | str] = None,
|
|
536
|
-
assets: list[Table | str] = None,
|
|
537
|
-
metadata: Iterable[ColumnDefinition | Table | Key | str] = None,
|
|
572
|
+
terms: Optional[list[Table | str]] = None,
|
|
573
|
+
assets: Optional[list[Table | str]] = None,
|
|
574
|
+
metadata: Optional[Iterable[ColumnDefinition | Table | Key | str]] = None,
|
|
538
575
|
optional: Optional[list[str]] = None,
|
|
539
576
|
comment: str = "",
|
|
540
577
|
) -> type[FeatureRecord]:
|
|
@@ -899,6 +936,7 @@ class DerivaML(Dataset):
|
|
|
899
936
|
"""
|
|
900
937
|
|
|
901
938
|
def path_to_asset(path: str) -> str:
|
|
939
|
+
"""Pull the asset name out of a path to that asset in the filesystem"""
|
|
902
940
|
components = path.split("/")
|
|
903
941
|
return components[
|
|
904
942
|
components.index("asset") + 2
|
|
@@ -963,6 +1001,7 @@ class DerivaML(Dataset):
|
|
|
963
1001
|
)
|
|
964
1002
|
|
|
965
1003
|
def check_file_type(dtype: str) -> bool:
|
|
1004
|
+
"""Make sure that the specified string is either the name or synonym for a file type term."""
|
|
966
1005
|
for term in defined_types:
|
|
967
1006
|
if dtype == term.name or (term.synonyms and file_type in term.synonyms):
|
|
968
1007
|
return True
|
|
@@ -1098,7 +1137,7 @@ class DerivaML(Dataset):
|
|
|
1098
1137
|
|
|
1099
1138
|
def create_workflow(
|
|
1100
1139
|
self, name: str, workflow_type: str, description: str = "", create: bool = True
|
|
1101
|
-
) -> RID:
|
|
1140
|
+
) -> RID | None:
|
|
1102
1141
|
"""Identify current executing program and return a workflow RID for it
|
|
1103
1142
|
|
|
1104
1143
|
Determine the notebook or script that is currently being executed. Assume that this is
|
|
@@ -1166,7 +1205,7 @@ class DerivaML(Dataset):
|
|
|
1166
1205
|
)
|
|
1167
1206
|
github_url = result.stdout.strip().removesuffix(".git")
|
|
1168
1207
|
except subprocess.CalledProcessError:
|
|
1169
|
-
raise DerivaMLException(
|
|
1208
|
+
raise DerivaMLException("No GIT remote found")
|
|
1170
1209
|
|
|
1171
1210
|
# Find the root directory for the repository
|
|
1172
1211
|
repo_root = self._get_git_root()
|
|
@@ -1188,7 +1227,7 @@ class DerivaML(Dataset):
|
|
|
1188
1227
|
|
|
1189
1228
|
"""Get SHA-1 hash of latest commit of the file in the repository"""
|
|
1190
1229
|
result = subprocess.run(
|
|
1191
|
-
["git", "log", "-n", "1", "--pretty=format:%H
|
|
1230
|
+
["git", "log", "-n", "1", "--pretty=format:%H--", self.executable_path],
|
|
1192
1231
|
cwd=self.executable_path.parent,
|
|
1193
1232
|
capture_output=True,
|
|
1194
1233
|
text=True,
|
|
@@ -21,7 +21,7 @@ from .deriva_definitions import (
|
|
|
21
21
|
|
|
22
22
|
from collections import Counter
|
|
23
23
|
from pydantic import validate_call, ConfigDict
|
|
24
|
-
from typing import Iterable
|
|
24
|
+
from typing import Iterable, Optional
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class DerivaModel:
|
|
@@ -267,7 +267,7 @@ class DerivaModel:
|
|
|
267
267
|
def _schema_to_paths(
|
|
268
268
|
self,
|
|
269
269
|
root: Table = None,
|
|
270
|
-
path: list[Table] = None,
|
|
270
|
+
path: Optional[list[Table]] = None,
|
|
271
271
|
) -> list[list[Table]]:
|
|
272
272
|
"""Recursively walk over the domain schema graph and extend the current path.
|
|
273
273
|
|
|
@@ -54,7 +54,9 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
|
54
54
|
try:
|
|
55
55
|
from jupyter_server.serverapp import list_running_servers
|
|
56
56
|
except ImportError:
|
|
57
|
-
|
|
57
|
+
|
|
58
|
+
def list_running_servers():
|
|
59
|
+
return []
|
|
58
60
|
|
|
59
61
|
|
|
60
62
|
class Execution:
|
|
@@ -155,7 +157,6 @@ class Execution:
|
|
|
155
157
|
self._initialize_execution(reload)
|
|
156
158
|
|
|
157
159
|
def _save_runtime_environment(self):
|
|
158
|
-
|
|
159
160
|
runtime_env_path = ExecMetadataVocab.runtime_env.value
|
|
160
161
|
runtime_env_dir = self.execution_metadata_path(runtime_env_path)
|
|
161
162
|
with NamedTemporaryFile(
|
|
@@ -267,7 +268,7 @@ class Execution:
|
|
|
267
268
|
# Execution metadata cannot be in a directory, so map path into filename.
|
|
268
269
|
checkpoint_path = (
|
|
269
270
|
self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
|
|
270
|
-
/ f"{notebook_name.replace('/','_')}.checkpoint"
|
|
271
|
+
/ f"{notebook_name.replace('/', '_')}.checkpoint"
|
|
271
272
|
)
|
|
272
273
|
with open(checkpoint_path, "w", encoding="utf-8") as f:
|
|
273
274
|
json.dump(notebook_content, f)
|
|
@@ -359,7 +360,7 @@ class Execution:
|
|
|
359
360
|
if m := is_feature_asset_dir(p):
|
|
360
361
|
try:
|
|
361
362
|
self.update_status(
|
|
362
|
-
Status.running, f
|
|
363
|
+
Status.running, f"Uploading feature {m['feature_name']}..."
|
|
363
364
|
)
|
|
364
365
|
feature_assets[m["target_table"], m["feature_name"]] = (
|
|
365
366
|
self._ml_object.upload_assets(p)
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
-
from typing import Optional
|
|
4
|
+
from typing import Optional
|
|
5
5
|
|
|
6
6
|
from pydantic import (
|
|
7
7
|
BaseModel,
|
|
8
8
|
conlist,
|
|
9
|
-
ConfigDict,
|
|
9
|
+
ConfigDict,
|
|
10
10
|
)
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
|
|
@@ -36,7 +36,6 @@ class Workflow(BaseModel):
|
|
|
36
36
|
checksum: Optional[str]
|
|
37
37
|
|
|
38
38
|
|
|
39
|
-
|
|
40
39
|
class ExecutionConfiguration(BaseModel):
|
|
41
40
|
"""Define the parameters that are used to configure a specific execution.
|
|
42
41
|
|
|
@@ -69,23 +68,21 @@ class ExecutionConfiguration(BaseModel):
|
|
|
69
68
|
config = json.load(fd)
|
|
70
69
|
return ExecutionConfiguration.model_validate(config)
|
|
71
70
|
|
|
72
|
-
def download_execution_configuration(
|
|
73
|
-
|
|
74
|
-
) -> ExecutionConfiguration:
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
# hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
|
|
91
|
-
# return ExecutionConfiguration.load_configuration(Path(dest_file.name))
|
|
71
|
+
# def download_execution_configuration(
|
|
72
|
+
# self, configuration_rid: RID
|
|
73
|
+
# ) -> ExecutionConfiguration:
|
|
74
|
+
# """Create an ExecutionConfiguration object from a catalog RID that points to a JSON representation of that
|
|
75
|
+
# configuration in hatrac
|
|
76
|
+
#
|
|
77
|
+
# Args:
|
|
78
|
+
# configuration_rid: RID that should be to an asset table that refers to an execution configuration
|
|
79
|
+
#
|
|
80
|
+
# Returns:
|
|
81
|
+
# A ExecutionConfiguration object for configured by the parameters in the configuration file.
|
|
82
|
+
# """
|
|
83
|
+
# AssertionError("Not Implemented")
|
|
84
|
+
# configuration = self.retrieve_rid(configuration_rid)
|
|
85
|
+
# with NamedTemporaryFile("w+", delete=False, suffix=".json") as dest_file:
|
|
86
|
+
# hs = HatracStore("https", self.host_name, self.credential)
|
|
87
|
+
# hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
|
|
88
|
+
# return ExecutionConfiguration.load_configuration(Path(dest_file.name))
|
|
@@ -240,7 +240,7 @@ def main():
|
|
|
240
240
|
parser.add_argument("--catalog_id", type=str, required=True)
|
|
241
241
|
parser.add_argument("--schema_name", type=str, required=True)
|
|
242
242
|
args = parser.parse_args()
|
|
243
|
-
generate_annotation(args.catalog_id
|
|
243
|
+
generate_annotation(args.catalog_id)
|
|
244
244
|
|
|
245
245
|
|
|
246
246
|
if __name__ == "__main__":
|