deriva-ml 1.8.11__tar.gz → 1.9.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deriva_ml-1.8.11/src/deriva_ml.egg-info → deriva_ml-1.9.1}/PKG-INFO +11 -2
- deriva_ml-1.9.1/README.md +11 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/Notebooks/DerivaML Execution.ipynb +3 -11
- deriva_ml-1.9.1/docs/user-guide/execution-configuration.md +26 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/release.sh +3 -2
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/database_model.py +29 -7
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/dataset.py +16 -13
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/dataset_bag.py +1 -1
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/demo_catalog.py +9 -8
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/deriva_definitions.py +8 -3
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/deriva_ml_base.py +62 -23
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/deriva_model.py +2 -2
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/execution.py +5 -4
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/execution_configuration.py +20 -23
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/schema_setup/annotations.py +1 -1
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/schema_setup/create_schema.py +3 -2
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/upload.py +1 -1
- {deriva_ml-1.8.11 → deriva_ml-1.9.1/src/deriva_ml.egg-info}/PKG-INFO +11 -2
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml.egg-info/SOURCES.txt +0 -6
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/derivaml_test.py +1 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/test_dataset.py +8 -38
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/test_execution.py +9 -15
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/test_upload.py +5 -7
- deriva_ml-1.8.11/README.md +0 -2
- deriva_ml-1.8.11/docs/user-guide/execution-configuration.md +0 -14
- deriva_ml-1.8.11/src/deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -36
- deriva_ml-1.8.11/src/deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -255
- deriva_ml-1.8.11/src/deriva_ml/build/lib/schema_setup/create_schema.py +0 -165
- deriva_ml-1.8.11/src/deriva_ml/schema_setup/alter_annotation.py +0 -55
- deriva_ml-1.8.11/src/deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml-1.8.11/tests/__init__.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/.github/workflows/publish-docs.yml +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/.gitignore +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/LICENSE +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/.DS_Store +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/Notebooks/DerivaML Create Notes.ipynb +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/Notebooks/DerivaML Dataset.ipynb +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/Notebooks/DerivaML Features.ipynb +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/Notebooks/DerivaML Vocabulary.ipynb +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/assets/ERD.png +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/assets/Launcher.png +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/assets/copy_minid.png +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/assets/deriva-logo.png +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/assets/deriva-ml.pdf +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/assets/sharing-at-home.pdf +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/dataset.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/dataset_aux_classes.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/dataset_bag.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/deriva_ml_base.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/deriva_model.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/execution.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/execution_configuration.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/feature.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/upload.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/deriva_ml_structure.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/index.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/release-notes.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/user-guide/datasets.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/user-guide/identifiers.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/user-guide/install.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/user-guide/ml_workflow_instruction.md +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/mkdocs.yml +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/pyproject.toml +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/setup.cfg +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/__init__.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/dataset_aux_classes.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/execution_environment.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/feature.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/history.py +0 -0
- {deriva_ml-1.8.11/src/deriva_ml/build/lib → deriva_ml-1.9.1/src/deriva_ml}/schema_setup/__init__.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/schema_setup/policy.json +0 -0
- {deriva_ml-1.8.11/src/deriva_ml/build/lib → deriva_ml-1.9.1/src/deriva_ml}/schema_setup/table_comments_utils.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/test_functions.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml.egg-info/entry_points.txt +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml.egg-info/requires.txt +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml.egg-info/top_level.txt +0 -0
- {deriva_ml-1.8.11/src/deriva_ml/schema_setup → deriva_ml-1.9.1/tests}/__init__.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/runner.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/test_basic_tables.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/test_download.py +0 -0
- {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/test_features.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deriva-ml
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.9.1
|
|
4
4
|
Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
|
|
5
5
|
Author-email: ISRD <isrd-dev@isi.edu>
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -16,5 +16,14 @@ Requires-Dist: setuptools-scm<=6.0
|
|
|
16
16
|
Requires-Dist: nbstripout
|
|
17
17
|
Dynamic: license-file
|
|
18
18
|
|
|
19
|
-
|
|
19
|
+
# DerivaML
|
|
20
|
+
Deriva-ML is a python library to simplify the process of creating and executing reproducible machine learning workflows
|
|
20
21
|
using a deriva catalog.
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
## Installing the GitHub CLI
|
|
25
|
+
|
|
26
|
+
The script release.sh will create a new release tag in GitHub. This script requires the
|
|
27
|
+
GitHub CLI be installed.
|
|
28
|
+
|
|
29
|
+
See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# DerivaML
|
|
2
|
+
Deriva-ML is a python library to simplify the process of creating and executing reproducible machine learning workflows
|
|
3
|
+
using a deriva catalog.
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
## Installing the GitHub CLI
|
|
7
|
+
|
|
8
|
+
The script release.sh will create a new release tag in GitHub. This script requires the
|
|
9
|
+
GitHub CLI be installed.
|
|
10
|
+
|
|
11
|
+
See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.
|
|
@@ -28,7 +28,7 @@
|
|
|
28
28
|
"source": [
|
|
29
29
|
"import builtins\n",
|
|
30
30
|
"from deriva.core.utils.globus_auth_utils import GlobusNativeLogin\n",
|
|
31
|
-
"from deriva_ml import ExecutionConfiguration,
|
|
31
|
+
"from deriva_ml import ExecutionConfiguration, MLVocab, DerivaSystemColumns\n",
|
|
32
32
|
"from deriva_ml.demo_catalog import create_demo_catalog, DemoML\n",
|
|
33
33
|
"from IPython.display import display, Markdown, JSON\n",
|
|
34
34
|
"import itertools\n",
|
|
@@ -166,12 +166,11 @@
|
|
|
166
166
|
"metadata": {},
|
|
167
167
|
"cell_type": "code",
|
|
168
168
|
"source": [
|
|
169
|
-
"ml_instance.add_term(MLVocab.workflow_type, \"Manual Workflow\", description=\"
|
|
169
|
+
"ml_instance.add_term(MLVocab.workflow_type, \"Manual Workflow\", description=\"Initial setup of Model File\")\n",
|
|
170
170
|
"ml_instance.add_term(MLVocab.execution_asset_type, \"API_Model\", description=\"Model for our API workflow\")\n",
|
|
171
171
|
"\n",
|
|
172
|
-
"api_workflow =
|
|
172
|
+
"api_workflow = ml_instance.create_workflow(\n",
|
|
173
173
|
" name=\"Manual Workflow\",\n",
|
|
174
|
-
" url='https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Execution.ipynb',\n",
|
|
175
174
|
" workflow_type=\"Manual Workflow\",\n",
|
|
176
175
|
" description=\"A manual operation\"\n",
|
|
177
176
|
")\n",
|
|
@@ -207,13 +206,6 @@
|
|
|
207
206
|
"source": [
|
|
208
207
|
"ml_instance.add_term(MLVocab.workflow_type, \"ML Demo\", description=\"A ML Workflow that uses Deriva ML API\")\n",
|
|
209
208
|
"\n",
|
|
210
|
-
"api_workflow = Workflow(\n",
|
|
211
|
-
" name=\"ML Demo\",\n",
|
|
212
|
-
" url=\"https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml\",\n",
|
|
213
|
-
" workflow_type=\"ML Demo\",\n",
|
|
214
|
-
" description=\"A workflow that uses Deriva ML\"\n",
|
|
215
|
-
")\n",
|
|
216
|
-
"\n",
|
|
217
209
|
"config = ExecutionConfiguration(\n",
|
|
218
210
|
" datasets=[training_dataset_rid, {'rid':testing_dataset_rid, 'materialize':False}],\n",
|
|
219
211
|
" assets = [training_model_rid],\n",
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Configuring an execution
|
|
2
|
+
|
|
3
|
+
One of the essential functions of DerivaML is to help keep track of how ML model results are created so that they can be shared and reproduced.
|
|
4
|
+
Every execution in DerivaML is represented by an Execution object, which keeps track of all of the parameters associated with an execution and
|
|
5
|
+
provides a number of functions that enable a program to help keep track of the configuration and results of a model execution.
|
|
6
|
+
|
|
7
|
+
The first step in creating a DerivaML execution is to create an `ExecutionConfiguration`.
|
|
8
|
+
The `ExecutionConfiguration` class is used to specify the inputs that are to be used by an Execution.
|
|
9
|
+
These inputs include
|
|
10
|
+
* A list of datasets that are used
|
|
11
|
+
* A list of other files (assets) that are to be used. This can include existing models, or any other information that the execution might need.
|
|
12
|
+
* The actual code that is being executed.
|
|
13
|
+
|
|
14
|
+
[`ExecutionConfiguration`][deriva_ml.execution_configuration.ExecutionConfiguration] is a Pydantic dataclass.
|
|
15
|
+
As part of initializing an execution, the assets and datasets in the configuration object are downloaded and cached.
|
|
16
|
+
The datasets are provided as a list of `DatasetSpec` objects, which have the form:
|
|
17
|
+
```DatasetSpec(dataset_rid:RID, version:DatasetVersion, materialize:bool)```
|
|
18
|
+
|
|
19
|
+
It will be common to just want to use the latest version of the dataset, in which case you would use:
|
|
20
|
+
```
|
|
21
|
+
deriva_ml = DerivaML(...)
|
|
22
|
+
dataset_rid = ...
|
|
23
|
+
datasets = [DatasetSpec(dataset_rid, version=deriva_ml.dataset_version(dataset_rid))]
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
If a dataset is large, downloading from the catalog might take a significant amount of time.
|
|
@@ -9,11 +9,12 @@ fi
|
|
|
9
9
|
|
|
10
10
|
# Default version bump is patch unless specified (patch, minor, or major)
|
|
11
11
|
VERSION_TYPE=${1:-patch}
|
|
12
|
+
|
|
12
13
|
echo "Bumping version: $VERSION_TYPE"
|
|
13
14
|
|
|
14
15
|
# Bump the version using bump-my-version.
|
|
15
16
|
# This command should update version files, commit the changes, and create a Git tag.
|
|
16
|
-
bump-my-version bump $VERSION_TYPE --verbose
|
|
17
|
+
bump-my-version bump "$VERSION_TYPE" --verbose
|
|
17
18
|
|
|
18
19
|
# Push commits and tags to the remote repository.
|
|
19
20
|
echo "Pushing changes to remote repository..."
|
|
@@ -32,6 +33,6 @@ python -m build
|
|
|
32
33
|
NEW_TAG=$(git describe --tags --abbrev=0)
|
|
33
34
|
echo "New version tag: $NEW_TAG"
|
|
34
35
|
|
|
35
|
-
twine upload dist/*${NEW_TAG}
|
|
36
|
+
twine upload dist/*${NEW_TAG/v/}
|
|
36
37
|
|
|
37
38
|
echo "Release process complete!"
|
|
@@ -1,12 +1,15 @@
|
|
|
1
|
-
"""Ths module
|
|
1
|
+
"""Ths module contains the definition of the DatabaseModel class. The role of this class is to provide an nterface between the BDBag representation
|
|
2
2
|
of a dataset and a sqllite database in which the contents of the bag are stored.
|
|
3
3
|
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
4
7
|
import logging
|
|
5
8
|
import sqlite3
|
|
6
9
|
|
|
7
10
|
from csv import reader
|
|
8
11
|
from pathlib import Path
|
|
9
|
-
from typing import Any, Optional
|
|
12
|
+
from typing import Any, Optional, Generator
|
|
10
13
|
from urllib.parse import urlparse
|
|
11
14
|
|
|
12
15
|
from deriva.core.ermrest_model import Model
|
|
@@ -20,7 +23,7 @@ from .dataset_bag import DatasetBag
|
|
|
20
23
|
class DatabaseModelMeta(type):
|
|
21
24
|
"""Use metaclass to ensure that there is onl one instance per path"""
|
|
22
25
|
|
|
23
|
-
_paths_loaded: dict[Path
|
|
26
|
+
_paths_loaded: dict[Path, "DatabaseModel"] = {}
|
|
24
27
|
|
|
25
28
|
def __call__(cls, *args, **kwargs):
|
|
26
29
|
logger = logging.getLogger("deriva_ml")
|
|
@@ -47,7 +50,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
|
|
|
47
50
|
Because of nested datasets, it's possible that more than one dataset rid is in a bag, or that a dataset rid might
|
|
48
51
|
appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
|
|
49
52
|
into DatabaseModels, is kept in the class variable `_rid_map`.
|
|
50
|
-
|
|
53
|
+
|
|
51
54
|
Because you can load diffent versions of a dataset simultaniously, the dataset RID and version number are tracked, and a new
|
|
52
55
|
sqllite instance is created for every new dataset version present.
|
|
53
56
|
|
|
@@ -81,7 +84,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
|
|
|
81
84
|
except KeyError:
|
|
82
85
|
raise DerivaMLException(f"Dataset {dataset_rid} not found")
|
|
83
86
|
|
|
84
|
-
def __init__(self, minid: DatasetMinid, bag_path: Path):
|
|
87
|
+
def __init__(self, minid: DatasetMinid, bag_path: Path, dbase_path: Path):
|
|
85
88
|
"""Create a new DatabaseModel.
|
|
86
89
|
|
|
87
90
|
Args:
|
|
@@ -92,8 +95,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
|
|
|
92
95
|
self.bag_path = bag_path
|
|
93
96
|
self.minid = minid
|
|
94
97
|
self.dataset_rid = minid.dataset_rid
|
|
95
|
-
|
|
96
|
-
self.dbase_file = dir_path / f"{minid.version_rid}.db"
|
|
98
|
+
self.dbase_file = dbase_path / f"{minid.version_rid}.db"
|
|
97
99
|
self.dbase = sqlite3.connect(self.dbase_file)
|
|
98
100
|
|
|
99
101
|
super().__init__(
|
|
@@ -315,6 +317,26 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
|
|
|
315
317
|
)
|
|
316
318
|
return datasets
|
|
317
319
|
|
|
320
|
+
def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
|
|
321
|
+
"""Retrieve the contents of the specified table as a dictionary.
|
|
322
|
+
|
|
323
|
+
Args:
|
|
324
|
+
table: Table to retrieve data from. f schema is not provided as part of the table name,
|
|
325
|
+
the method will attempt to locate the schema for the table.
|
|
326
|
+
|
|
327
|
+
Returns:
|
|
328
|
+
A generator producing dictionaries containing the contents of the specified table as name/value pairs.
|
|
329
|
+
"""
|
|
330
|
+
table_name = self.normalize_table_name(table)
|
|
331
|
+
with self.dbase as dbase:
|
|
332
|
+
col_names = [
|
|
333
|
+
c[1]
|
|
334
|
+
for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()
|
|
335
|
+
]
|
|
336
|
+
result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
|
|
337
|
+
while row := result.fetchone():
|
|
338
|
+
yield dict(zip(col_names, row))
|
|
339
|
+
|
|
318
340
|
def normalize_table_name(self, table: str) -> str:
|
|
319
341
|
"""Attempt to insert the schema into a table name if it's not provided.
|
|
320
342
|
|
|
@@ -67,11 +67,12 @@ class Dataset:
|
|
|
67
67
|
|
|
68
68
|
_Logger = logging.getLogger("deriva_ml")
|
|
69
69
|
|
|
70
|
-
def __init__(self, model: DerivaModel, cache_dir: Path):
|
|
70
|
+
def __init__(self, model: DerivaModel, cache_dir: Path, working_dir: Path):
|
|
71
71
|
self._model = model
|
|
72
72
|
self._ml_schema = ML_SCHEMA
|
|
73
73
|
self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
|
|
74
74
|
self._cache_dir = cache_dir
|
|
75
|
+
self._working_dir = working_dir
|
|
75
76
|
self._logger = logging.getLogger("deriva_ml")
|
|
76
77
|
|
|
77
78
|
def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
|
|
@@ -92,7 +93,7 @@ class Dataset:
|
|
|
92
93
|
dataset_list: list[DatasetSpec],
|
|
93
94
|
description: Optional[str] = "",
|
|
94
95
|
execution_rid: Optional[RID] = None,
|
|
95
|
-
) ->
|
|
96
|
+
) -> list[dict[str, Any]]:
|
|
96
97
|
schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
|
|
97
98
|
|
|
98
99
|
# Construct version records for insert
|
|
@@ -245,7 +246,7 @@ class Dataset:
|
|
|
245
246
|
DerivaMLException: if provided RID is not to a dataset_table.
|
|
246
247
|
"""
|
|
247
248
|
|
|
248
|
-
# Find all
|
|
249
|
+
# Find all the datasets that are reachable from this dataset and determine their new version numbers.
|
|
249
250
|
related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
|
|
250
251
|
version_update_list = [
|
|
251
252
|
DatasetSpec(
|
|
@@ -254,7 +255,7 @@ class Dataset:
|
|
|
254
255
|
)
|
|
255
256
|
for ds_rid in related_datasets
|
|
256
257
|
]
|
|
257
|
-
|
|
258
|
+
self._insert_dataset_versions(
|
|
258
259
|
version_update_list, description=description, execution_rid=execution_rid
|
|
259
260
|
)
|
|
260
261
|
return [d.version for d in version_update_list if d.rid == dataset_rid][0]
|
|
@@ -751,9 +752,10 @@ class Dataset:
|
|
|
751
752
|
]
|
|
752
753
|
|
|
753
754
|
def _table_paths(
|
|
754
|
-
self,
|
|
755
|
+
self,
|
|
756
|
+
dataset: Optional[DatasetSpec] = None,
|
|
757
|
+
snapshot_catalog: Optional[DerivaML] = None,
|
|
755
758
|
) -> Iterator[tuple[str, str, Table]]:
|
|
756
|
-
|
|
757
759
|
paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
|
|
758
760
|
|
|
759
761
|
def source_path(path: tuple[Table, ...]):
|
|
@@ -779,17 +781,19 @@ class Dataset:
|
|
|
779
781
|
def _collect_paths(
|
|
780
782
|
self,
|
|
781
783
|
dataset_rid: Optional[RID] = None,
|
|
782
|
-
|
|
784
|
+
snapshot: Optional[Dataset] = None,
|
|
783
785
|
dataset_nesting_depth: Optional[int] = None,
|
|
784
786
|
) -> set[tuple[Table, ...]]:
|
|
787
|
+
snapshot_catalog = snapshot if snapshot else self
|
|
785
788
|
|
|
786
|
-
snapshot_catalog = snapshot_catalog or self
|
|
787
789
|
dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
|
|
788
790
|
"Dataset"
|
|
789
791
|
]
|
|
790
792
|
dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
|
|
791
793
|
"Dataset_Dataset"
|
|
792
794
|
]
|
|
795
|
+
|
|
796
|
+
# Figure out what types of elements the dataset contains.
|
|
793
797
|
dataset_associations = [
|
|
794
798
|
a
|
|
795
799
|
for a in self.dataset_table.find_associations()
|
|
@@ -812,7 +816,8 @@ class Dataset:
|
|
|
812
816
|
]
|
|
813
817
|
else:
|
|
814
818
|
included_associations = dataset_associations
|
|
815
|
-
|
|
819
|
+
|
|
820
|
+
# Get the paths through the schema and filter out all the dataset paths not used by this dataset.
|
|
816
821
|
paths = {
|
|
817
822
|
tuple(p)
|
|
818
823
|
for p in snapshot_catalog._model._schema_to_paths()
|
|
@@ -826,9 +831,7 @@ class Dataset:
|
|
|
826
831
|
nested_paths = set()
|
|
827
832
|
if dataset_rid:
|
|
828
833
|
for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
|
|
829
|
-
nested_paths |= self._collect_paths(
|
|
830
|
-
c, snapshot_catalog=snapshot_catalog
|
|
831
|
-
)
|
|
834
|
+
nested_paths |= self._collect_paths(c, snapshot=snapshot_catalog)
|
|
832
835
|
else:
|
|
833
836
|
# Initialize nesting depth if not already provided.
|
|
834
837
|
dataset_nesting_depth = (
|
|
@@ -974,7 +977,7 @@ class Dataset:
|
|
|
974
977
|
if dataset.materialize
|
|
975
978
|
else self._download_dataset_minid(minid)
|
|
976
979
|
)
|
|
977
|
-
return DatabaseModel(minid, bag_path).get_dataset()
|
|
980
|
+
return DatabaseModel(minid, bag_path, self._working_dir).get_dataset()
|
|
978
981
|
|
|
979
982
|
def _version_snapshot(self, dataset: DatasetSpec) -> str:
|
|
980
983
|
"""Return a catalog with snapshot for the specified dataset version"""
|
|
@@ -109,7 +109,7 @@ class DatasetBag:
|
|
|
109
109
|
for ts, on in paths:
|
|
110
110
|
tables = " JOIN ".join(ts)
|
|
111
111
|
on_expression = " and ".join(
|
|
112
|
-
[f"{column_name(
|
|
112
|
+
[f"{column_name(left)}={column_name(right)}" for left, right in on]
|
|
113
113
|
)
|
|
114
114
|
sql.append(
|
|
115
115
|
f"SELECT {select_args} FROM {tables} ON {on_expression} WHERE {dataset_table_name}.RID IN ({datasets})"
|
|
@@ -5,6 +5,7 @@ import logging
|
|
|
5
5
|
from random import random, randint
|
|
6
6
|
import tempfile
|
|
7
7
|
from tempfile import TemporaryDirectory
|
|
8
|
+
from typing import Optional
|
|
8
9
|
import itertools
|
|
9
10
|
|
|
10
11
|
from deriva.config.acl_config import AclConfig
|
|
@@ -18,7 +19,6 @@ from requests import HTTPError
|
|
|
18
19
|
from deriva_ml import (
|
|
19
20
|
DerivaML,
|
|
20
21
|
ExecutionConfiguration,
|
|
21
|
-
Workflow,
|
|
22
22
|
MLVocab,
|
|
23
23
|
BuiltinTypes,
|
|
24
24
|
ColumnDefinition,
|
|
@@ -169,12 +169,9 @@ def create_demo_features(ml_instance):
|
|
|
169
169
|
description="Model for our API workflow",
|
|
170
170
|
)
|
|
171
171
|
|
|
172
|
-
api_workflow = ml_instance.
|
|
173
|
-
Workflow
|
|
174
|
-
|
|
175
|
-
url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml",
|
|
176
|
-
workflow_type="API Workflow",
|
|
177
|
-
)
|
|
172
|
+
api_workflow = ml_instance.create_workflow(
|
|
173
|
+
name="API Workflow",
|
|
174
|
+
workflow_type="API Workflow",
|
|
178
175
|
)
|
|
179
176
|
|
|
180
177
|
api_execution = ml_instance.create_execution(
|
|
@@ -322,7 +319,11 @@ def create_demo_catalog(
|
|
|
322
319
|
|
|
323
320
|
class DemoML(DerivaML):
|
|
324
321
|
def __init__(
|
|
325
|
-
self,
|
|
322
|
+
self,
|
|
323
|
+
hostname,
|
|
324
|
+
catalog_id,
|
|
325
|
+
cache_dir: Optional[str] = None,
|
|
326
|
+
working_dir: Optional[str] = None,
|
|
326
327
|
):
|
|
327
328
|
super().__init__(
|
|
328
329
|
hostname=hostname,
|
|
@@ -8,7 +8,7 @@ from enum import Enum
|
|
|
8
8
|
from typing import Any, Iterable, Optional, Annotated
|
|
9
9
|
|
|
10
10
|
import deriva.core.ermrest_model as em
|
|
11
|
-
from urllib.parse import urlparse
|
|
11
|
+
from urllib.parse import urlparse
|
|
12
12
|
from deriva.core.ermrest_model import builtin_types
|
|
13
13
|
from pydantic import (
|
|
14
14
|
BaseModel,
|
|
@@ -139,13 +139,18 @@ class FileSpec(BaseModel):
|
|
|
139
139
|
if url_parts.scheme == "tag":
|
|
140
140
|
return v
|
|
141
141
|
elif not url_parts.scheme:
|
|
142
|
-
return f
|
|
142
|
+
return f"tag://{gethostname()},{date.today()}:file://{v}"
|
|
143
143
|
else:
|
|
144
144
|
raise ValidationError("url is not a file URL")
|
|
145
145
|
|
|
146
146
|
@model_serializer()
|
|
147
147
|
def serialize_filespec(self):
|
|
148
|
-
return {
|
|
148
|
+
return {
|
|
149
|
+
"URL": self.url,
|
|
150
|
+
"Description": self.description,
|
|
151
|
+
"MD5": self.md5,
|
|
152
|
+
"Length": self.length,
|
|
153
|
+
}
|
|
149
154
|
|
|
150
155
|
|
|
151
156
|
class VocabularyTerm(BaseModel):
|
|
@@ -32,6 +32,7 @@ from deriva.core.deriva_server import DerivaServer
|
|
|
32
32
|
from deriva.core.ermrest_catalog import ResolveRidResult
|
|
33
33
|
from deriva.core.ermrest_model import Key, Table
|
|
34
34
|
from deriva.core.hatrac_store import HatracStore
|
|
35
|
+
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
|
|
35
36
|
from pydantic import validate_call, ConfigDict
|
|
36
37
|
from requests import RequestException
|
|
37
38
|
|
|
@@ -70,17 +71,29 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
|
70
71
|
try:
|
|
71
72
|
from IPython import get_ipython
|
|
72
73
|
except ImportError: # Graceful fallback if IPython isn't installed.
|
|
73
|
-
|
|
74
|
+
|
|
75
|
+
def get_ipython():
|
|
76
|
+
"""Dummy routine in case you are not running in IPython."""
|
|
77
|
+
return None
|
|
78
|
+
|
|
74
79
|
|
|
75
80
|
try:
|
|
76
81
|
from jupyter_server.serverapp import list_running_servers
|
|
77
82
|
except ImportError:
|
|
78
|
-
|
|
83
|
+
|
|
84
|
+
def list_running_servers():
|
|
85
|
+
"""Dummy routine in case you are not running in Jupyter."""
|
|
86
|
+
return []
|
|
87
|
+
|
|
79
88
|
|
|
80
89
|
try:
|
|
81
90
|
from ipykernel import get_connection_file
|
|
82
91
|
except ImportError:
|
|
83
|
-
|
|
92
|
+
|
|
93
|
+
def get_connection_file():
|
|
94
|
+
"""Dummy routine in case you are not running in Jupyter."""
|
|
95
|
+
return ""
|
|
96
|
+
|
|
84
97
|
|
|
85
98
|
if TYPE_CHECKING:
|
|
86
99
|
from .execution import Execution
|
|
@@ -102,8 +115,8 @@ class DerivaML(Dataset):
|
|
|
102
115
|
self,
|
|
103
116
|
hostname: str,
|
|
104
117
|
catalog_id: str | int,
|
|
105
|
-
domain_schema: str = None,
|
|
106
|
-
project_name: str = None,
|
|
118
|
+
domain_schema: Optional[str] = None,
|
|
119
|
+
project_name: Optional[str] = None,
|
|
107
120
|
cache_dir: Optional[str] = None,
|
|
108
121
|
working_dir: Optional[str] = None,
|
|
109
122
|
model_version: str = "1",
|
|
@@ -150,7 +163,7 @@ class DerivaML(Dataset):
|
|
|
150
163
|
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
151
164
|
|
|
152
165
|
# Initialize dataset class.
|
|
153
|
-
super().__init__(self.model, self.cache_dir)
|
|
166
|
+
super().__init__(self.model, self.cache_dir, self.working_dir)
|
|
154
167
|
self._logger = logging.getLogger("deriva_ml")
|
|
155
168
|
self._logger.setLevel(logging_level)
|
|
156
169
|
|
|
@@ -205,9 +218,8 @@ class DerivaML(Dataset):
|
|
|
205
218
|
except subprocess.CalledProcessError:
|
|
206
219
|
self._logger.error("nbstripout is not found.")
|
|
207
220
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
|
|
221
|
+
@staticmethod
|
|
222
|
+
def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
|
|
211
223
|
"""Return the absolute path of the current notebook."""
|
|
212
224
|
# Get the kernel's connection file and extract the kernel ID
|
|
213
225
|
try:
|
|
@@ -245,7 +257,7 @@ class DerivaML(Dataset):
|
|
|
245
257
|
def _get_notebook_path(self) -> Path | None:
|
|
246
258
|
"""Return the absolute path of the current notebook."""
|
|
247
259
|
|
|
248
|
-
server, session =
|
|
260
|
+
server, session = DerivaML._get_notebook_session()
|
|
249
261
|
if server and session:
|
|
250
262
|
self._check_nbstrip_status()
|
|
251
263
|
relative_path = session["notebook"]["path"]
|
|
@@ -267,7 +279,7 @@ class DerivaML(Dataset):
|
|
|
267
279
|
) # Get the caller's filename, which is two up the stack from here.
|
|
268
280
|
else:
|
|
269
281
|
raise DerivaMLException(
|
|
270
|
-
|
|
282
|
+
"Looking for caller failed"
|
|
271
283
|
) # Stack is too shallow
|
|
272
284
|
return filename, is_notebook
|
|
273
285
|
|
|
@@ -335,7 +347,7 @@ class DerivaML(Dataset):
|
|
|
335
347
|
)
|
|
336
348
|
|
|
337
349
|
def asset_dir(
|
|
338
|
-
self, table: str | Table, prefix: str | Path = None
|
|
350
|
+
self, table: str | Table, prefix: Optional[str | Path] = None
|
|
339
351
|
) -> UploadAssetDirectory:
|
|
340
352
|
"""Return a local file path in which to place a files for an asset table. T
|
|
341
353
|
|
|
@@ -369,6 +381,29 @@ class DerivaML(Dataset):
|
|
|
369
381
|
"""
|
|
370
382
|
return self.cache_dir if cached else self.working_dir
|
|
371
383
|
|
|
384
|
+
@staticmethod
|
|
385
|
+
def globus_login(host: str) -> None:
|
|
386
|
+
"""Log into the specified host using Globus.
|
|
387
|
+
|
|
388
|
+
Args:
|
|
389
|
+
host:
|
|
390
|
+
|
|
391
|
+
Returns:
|
|
392
|
+
|
|
393
|
+
"""
|
|
394
|
+
gnl = GlobusNativeLogin(host=host)
|
|
395
|
+
if gnl.is_logged_in([host]):
|
|
396
|
+
print("You are already logged in.")
|
|
397
|
+
else:
|
|
398
|
+
gnl.login(
|
|
399
|
+
[host],
|
|
400
|
+
no_local_server=True,
|
|
401
|
+
no_browser=True,
|
|
402
|
+
refresh_tokens=True,
|
|
403
|
+
update_bdbag_keychain=True,
|
|
404
|
+
)
|
|
405
|
+
print("Login Successful")
|
|
406
|
+
|
|
372
407
|
def chaise_url(self, table: RID | Table) -> str:
|
|
373
408
|
"""Return a Chaise URL to the specified table.
|
|
374
409
|
|
|
@@ -379,15 +414,15 @@ class DerivaML(Dataset):
|
|
|
379
414
|
Returns:
|
|
380
415
|
URL to the table in Chaise format.
|
|
381
416
|
"""
|
|
417
|
+
table_obj = self.model.name_to_table(table)
|
|
382
418
|
try:
|
|
383
|
-
table = self.model.name_to_table(table)
|
|
384
419
|
uri = self.catalog.get_server_uri().replace(
|
|
385
420
|
"ermrest/catalog/", "chaise/recordset/#"
|
|
386
421
|
)
|
|
387
422
|
except DerivaMLException:
|
|
388
423
|
# Perhaps we have a RID....
|
|
389
424
|
uri = self.cite(table)
|
|
390
|
-
return f"{uri}/{urlquote(
|
|
425
|
+
return f"{uri}/{urlquote(table_obj.schema.name)}:{urlquote(table_obj.name)}"
|
|
391
426
|
|
|
392
427
|
def cite(self, entity: dict | str) -> str:
|
|
393
428
|
"""Return a citation URL for the provided entity.
|
|
@@ -401,7 +436,9 @@ class DerivaML(Dataset):
|
|
|
401
436
|
Raises:
|
|
402
437
|
DerivaMLException: if provided RID does not exist.
|
|
403
438
|
"""
|
|
404
|
-
if entity.startswith(
|
|
439
|
+
if isinstance(entity, str) and entity.startswith(
|
|
440
|
+
f"https://{self.host_name}/id/{self.catalog_id}/"
|
|
441
|
+
):
|
|
405
442
|
# Already got a citation...
|
|
406
443
|
return entity
|
|
407
444
|
try:
|
|
@@ -498,9 +535,9 @@ class DerivaML(Dataset):
|
|
|
498
535
|
def create_asset(
|
|
499
536
|
self,
|
|
500
537
|
asset_name: str,
|
|
501
|
-
column_defs: Iterable[ColumnDefinition] = None,
|
|
538
|
+
column_defs: Optional[Iterable[ColumnDefinition]] = None,
|
|
502
539
|
comment: str = "",
|
|
503
|
-
schema: str = None,
|
|
540
|
+
schema: Optional[str] = None,
|
|
504
541
|
) -> Table:
|
|
505
542
|
"""Create an asset table with the given asset name.
|
|
506
543
|
|
|
@@ -532,9 +569,9 @@ class DerivaML(Dataset):
|
|
|
532
569
|
self,
|
|
533
570
|
target_table: Table | str,
|
|
534
571
|
feature_name: str,
|
|
535
|
-
terms: list[Table | str] = None,
|
|
536
|
-
assets: list[Table | str] = None,
|
|
537
|
-
metadata: Iterable[ColumnDefinition | Table | Key | str] = None,
|
|
572
|
+
terms: Optional[list[Table | str]] = None,
|
|
573
|
+
assets: Optional[list[Table | str]] = None,
|
|
574
|
+
metadata: Optional[Iterable[ColumnDefinition | Table | Key | str]] = None,
|
|
538
575
|
optional: Optional[list[str]] = None,
|
|
539
576
|
comment: str = "",
|
|
540
577
|
) -> type[FeatureRecord]:
|
|
@@ -899,6 +936,7 @@ class DerivaML(Dataset):
|
|
|
899
936
|
"""
|
|
900
937
|
|
|
901
938
|
def path_to_asset(path: str) -> str:
|
|
939
|
+
"""Pull the asset name out of a path to that asset in the filesystem"""
|
|
902
940
|
components = path.split("/")
|
|
903
941
|
return components[
|
|
904
942
|
components.index("asset") + 2
|
|
@@ -963,6 +1001,7 @@ class DerivaML(Dataset):
|
|
|
963
1001
|
)
|
|
964
1002
|
|
|
965
1003
|
def check_file_type(dtype: str) -> bool:
|
|
1004
|
+
"""Make sure that the specified string is either the name or synonym for a file type term."""
|
|
966
1005
|
for term in defined_types:
|
|
967
1006
|
if dtype == term.name or (term.synonyms and file_type in term.synonyms):
|
|
968
1007
|
return True
|
|
@@ -1098,7 +1137,7 @@ class DerivaML(Dataset):
|
|
|
1098
1137
|
|
|
1099
1138
|
def create_workflow(
|
|
1100
1139
|
self, name: str, workflow_type: str, description: str = "", create: bool = True
|
|
1101
|
-
) -> RID:
|
|
1140
|
+
) -> RID | None:
|
|
1102
1141
|
"""Identify current executing program and return a workflow RID for it
|
|
1103
1142
|
|
|
1104
1143
|
Determine the notebook or script that is currently being executed. Assume that this is
|
|
@@ -1166,7 +1205,7 @@ class DerivaML(Dataset):
|
|
|
1166
1205
|
)
|
|
1167
1206
|
github_url = result.stdout.strip().removesuffix(".git")
|
|
1168
1207
|
except subprocess.CalledProcessError:
|
|
1169
|
-
raise DerivaMLException(
|
|
1208
|
+
raise DerivaMLException("No GIT remote found")
|
|
1170
1209
|
|
|
1171
1210
|
# Find the root directory for the repository
|
|
1172
1211
|
repo_root = self._get_git_root()
|
|
@@ -1188,7 +1227,7 @@ class DerivaML(Dataset):
|
|
|
1188
1227
|
|
|
1189
1228
|
"""Get SHA-1 hash of latest commit of the file in the repository"""
|
|
1190
1229
|
result = subprocess.run(
|
|
1191
|
-
["git", "log", "-n", "1", "--pretty=format:%H
|
|
1230
|
+
["git", "log", "-n", "1", "--pretty=format:%H--", self.executable_path],
|
|
1192
1231
|
cwd=self.executable_path.parent,
|
|
1193
1232
|
capture_output=True,
|
|
1194
1233
|
text=True,
|
|
@@ -21,7 +21,7 @@ from .deriva_definitions import (
|
|
|
21
21
|
|
|
22
22
|
from collections import Counter
|
|
23
23
|
from pydantic import validate_call, ConfigDict
|
|
24
|
-
from typing import Iterable
|
|
24
|
+
from typing import Iterable, Optional
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class DerivaModel:
|
|
@@ -267,7 +267,7 @@ class DerivaModel:
|
|
|
267
267
|
def _schema_to_paths(
|
|
268
268
|
self,
|
|
269
269
|
root: Table = None,
|
|
270
|
-
path: list[Table] = None,
|
|
270
|
+
path: Optional[list[Table]] = None,
|
|
271
271
|
) -> list[list[Table]]:
|
|
272
272
|
"""Recursively walk over the domain schema graph and extend the current path.
|
|
273
273
|
|
|
@@ -54,7 +54,9 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
|
54
54
|
try:
|
|
55
55
|
from jupyter_server.serverapp import list_running_servers
|
|
56
56
|
except ImportError:
|
|
57
|
-
|
|
57
|
+
|
|
58
|
+
def list_running_servers():
|
|
59
|
+
return []
|
|
58
60
|
|
|
59
61
|
|
|
60
62
|
class Execution:
|
|
@@ -155,7 +157,6 @@ class Execution:
|
|
|
155
157
|
self._initialize_execution(reload)
|
|
156
158
|
|
|
157
159
|
def _save_runtime_environment(self):
|
|
158
|
-
|
|
159
160
|
runtime_env_path = ExecMetadataVocab.runtime_env.value
|
|
160
161
|
runtime_env_dir = self.execution_metadata_path(runtime_env_path)
|
|
161
162
|
with NamedTemporaryFile(
|
|
@@ -267,7 +268,7 @@ class Execution:
|
|
|
267
268
|
# Execution metadata cannot be in a directory, so map path into filename.
|
|
268
269
|
checkpoint_path = (
|
|
269
270
|
self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
|
|
270
|
-
/ f"{notebook_name.replace('/','_')}.checkpoint"
|
|
271
|
+
/ f"{notebook_name.replace('/', '_')}.checkpoint"
|
|
271
272
|
)
|
|
272
273
|
with open(checkpoint_path, "w", encoding="utf-8") as f:
|
|
273
274
|
json.dump(notebook_content, f)
|
|
@@ -359,7 +360,7 @@ class Execution:
|
|
|
359
360
|
if m := is_feature_asset_dir(p):
|
|
360
361
|
try:
|
|
361
362
|
self.update_status(
|
|
362
|
-
Status.running, f
|
|
363
|
+
Status.running, f"Uploading feature {m['feature_name']}..."
|
|
363
364
|
)
|
|
364
365
|
feature_assets[m["target_table"], m["feature_name"]] = (
|
|
365
366
|
self._ml_object.upload_assets(p)
|