deriva-ml 1.8.11__tar.gz → 1.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. {deriva_ml-1.8.11/src/deriva_ml.egg-info → deriva_ml-1.9.1}/PKG-INFO +11 -2
  2. deriva_ml-1.9.1/README.md +11 -0
  3. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/Notebooks/DerivaML Execution.ipynb +3 -11
  4. deriva_ml-1.9.1/docs/user-guide/execution-configuration.md +26 -0
  5. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/release.sh +3 -2
  6. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/database_model.py +29 -7
  7. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/dataset.py +16 -13
  8. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/dataset_bag.py +1 -1
  9. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/demo_catalog.py +9 -8
  10. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/deriva_definitions.py +8 -3
  11. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/deriva_ml_base.py +62 -23
  12. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/deriva_model.py +2 -2
  13. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/execution.py +5 -4
  14. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/execution_configuration.py +20 -23
  15. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/schema_setup/annotations.py +1 -1
  16. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/schema_setup/create_schema.py +3 -2
  17. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/upload.py +1 -1
  18. {deriva_ml-1.8.11 → deriva_ml-1.9.1/src/deriva_ml.egg-info}/PKG-INFO +11 -2
  19. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml.egg-info/SOURCES.txt +0 -6
  20. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/derivaml_test.py +1 -0
  21. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/test_dataset.py +8 -38
  22. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/test_execution.py +9 -15
  23. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/test_upload.py +5 -7
  24. deriva_ml-1.8.11/README.md +0 -2
  25. deriva_ml-1.8.11/docs/user-guide/execution-configuration.md +0 -14
  26. deriva_ml-1.8.11/src/deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -36
  27. deriva_ml-1.8.11/src/deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -255
  28. deriva_ml-1.8.11/src/deriva_ml/build/lib/schema_setup/create_schema.py +0 -165
  29. deriva_ml-1.8.11/src/deriva_ml/schema_setup/alter_annotation.py +0 -55
  30. deriva_ml-1.8.11/src/deriva_ml/schema_setup/table_comments_utils.py +0 -56
  31. deriva_ml-1.8.11/tests/__init__.py +0 -0
  32. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/.github/workflows/publish-docs.yml +0 -0
  33. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/.gitignore +0 -0
  34. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/LICENSE +0 -0
  35. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/.DS_Store +0 -0
  36. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/Notebooks/DerivaML Create Notes.ipynb +0 -0
  37. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/Notebooks/DerivaML Dataset.ipynb +0 -0
  38. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/Notebooks/DerivaML Features.ipynb +0 -0
  39. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/Notebooks/DerivaML Vocabulary.ipynb +0 -0
  40. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/assets/ERD.png +0 -0
  41. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/assets/Launcher.png +0 -0
  42. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/assets/copy_minid.png +0 -0
  43. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/assets/deriva-logo.png +0 -0
  44. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/assets/deriva-ml.pdf +0 -0
  45. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/assets/sharing-at-home.pdf +0 -0
  46. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/dataset.md +0 -0
  47. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/dataset_aux_classes.md +0 -0
  48. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/dataset_bag.md +0 -0
  49. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/deriva_ml_base.md +0 -0
  50. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/deriva_model.md +0 -0
  51. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/execution.md +0 -0
  52. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/execution_configuration.md +0 -0
  53. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/feature.md +0 -0
  54. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/code-docs/upload.md +0 -0
  55. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/deriva_ml_structure.md +0 -0
  56. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/index.md +0 -0
  57. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/release-notes.md +0 -0
  58. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/user-guide/datasets.md +0 -0
  59. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/user-guide/identifiers.md +0 -0
  60. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/user-guide/install.md +0 -0
  61. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/docs/user-guide/ml_workflow_instruction.md +0 -0
  62. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/mkdocs.yml +0 -0
  63. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/pyproject.toml +0 -0
  64. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/setup.cfg +0 -0
  65. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/__init__.py +0 -0
  66. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/dataset_aux_classes.py +0 -0
  67. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/execution_environment.py +0 -0
  68. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/feature.py +0 -0
  69. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/history.py +0 -0
  70. {deriva_ml-1.8.11/src/deriva_ml/build/lib → deriva_ml-1.9.1/src/deriva_ml}/schema_setup/__init__.py +0 -0
  71. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/schema_setup/policy.json +0 -0
  72. {deriva_ml-1.8.11/src/deriva_ml/build/lib → deriva_ml-1.9.1/src/deriva_ml}/schema_setup/table_comments_utils.py +0 -0
  73. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml/test_functions.py +0 -0
  74. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  75. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml.egg-info/entry_points.txt +0 -0
  76. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml.egg-info/requires.txt +0 -0
  77. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/src/deriva_ml.egg-info/top_level.txt +0 -0
  78. {deriva_ml-1.8.11/src/deriva_ml/schema_setup → deriva_ml-1.9.1/tests}/__init__.py +0 -0
  79. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/runner.py +0 -0
  80. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/test_basic_tables.py +0 -0
  81. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/test_download.py +0 -0
  82. {deriva_ml-1.8.11 → deriva_ml-1.9.1}/tests/test_features.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deriva-ml
3
- Version: 1.8.11
3
+ Version: 1.9.1
4
4
  Summary: Utilities to simplify use of Deriva and Pandas to create reproducible ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
6
  Requires-Python: >=3.10
@@ -16,5 +16,14 @@ Requires-Dist: setuptools-scm<=6.0
16
16
  Requires-Dist: nbstripout
17
17
  Dynamic: license-file
18
18
 
19
- Deriva-ML is a python libary to simplify the process of creating and executing reproducible machine learning workflows
19
+ # DerivaML
20
+ Deriva-ML is a python library to simplify the process of creating and executing reproducible machine learning workflows
20
21
  using a deriva catalog.
22
+
23
+
24
+ ## Installing the GitHub CLI
25
+
26
+ The script release.sh will create a new release tag in GitHub. This script requires the
27
+ GitHub CLI to be installed.
28
+
29
+ See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.
@@ -0,0 +1,11 @@
1
+ # DerivaML
2
+ Deriva-ML is a python library to simplify the process of creating and executing reproducible machine learning workflows
3
+ using a deriva catalog.
4
+
5
+
6
+ ## Installing the GitHub CLI
7
+
8
+ The script release.sh will create a new release tag in GitHub. This script requires the
9
+ GitHub CLI to be installed.
10
+
11
+ See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.
@@ -28,7 +28,7 @@
28
28
  "source": [
29
29
  "import builtins\n",
30
30
  "from deriva.core.utils.globus_auth_utils import GlobusNativeLogin\n",
31
- "from deriva_ml import ExecutionConfiguration, Workflow, MLVocab, DerivaSystemColumns\n",
31
+ "from deriva_ml import ExecutionConfiguration, MLVocab, DerivaSystemColumns\n",
32
32
  "from deriva_ml.demo_catalog import create_demo_catalog, DemoML\n",
33
33
  "from IPython.display import display, Markdown, JSON\n",
34
34
  "import itertools\n",
@@ -166,12 +166,11 @@
166
166
  "metadata": {},
167
167
  "cell_type": "code",
168
168
  "source": [
169
- "ml_instance.add_term(MLVocab.workflow_type, \"Manual Workflow\", description=\"Inital setup of Model File\")\n",
169
+ "ml_instance.add_term(MLVocab.workflow_type, \"Manual Workflow\", description=\"Initial setup of Model File\")\n",
170
170
  "ml_instance.add_term(MLVocab.execution_asset_type, \"API_Model\", description=\"Model for our API workflow\")\n",
171
171
  "\n",
172
- "api_workflow = Workflow(\n",
172
+ "api_workflow = ml_instance.create_workflow(\n",
173
173
  " name=\"Manual Workflow\",\n",
174
- " url='https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Execution.ipynb',\n",
175
174
  " workflow_type=\"Manual Workflow\",\n",
176
175
  " description=\"A manual operation\"\n",
177
176
  ")\n",
@@ -207,13 +206,6 @@
207
206
  "source": [
208
207
  "ml_instance.add_term(MLVocab.workflow_type, \"ML Demo\", description=\"A ML Workflow that uses Deriva ML API\")\n",
209
208
  "\n",
210
- "api_workflow = Workflow(\n",
211
- " name=\"ML Demo\",\n",
212
- " url=\"https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml\",\n",
213
- " workflow_type=\"ML Demo\",\n",
214
- " description=\"A workflow that uses Deriva ML\"\n",
215
- ")\n",
216
- "\n",
217
209
  "config = ExecutionConfiguration(\n",
218
210
  " datasets=[training_dataset_rid, {'rid':testing_dataset_rid, 'materialize':False}],\n",
219
211
  " assets = [training_model_rid],\n",
@@ -0,0 +1,26 @@
1
+ # Configuring an execution
2
+
3
+ One of the essential functions of DerivaML is to help keep track of how ML model results are created so that they can be shared and reproduced.
4
+ Every execution in DerivaML is represented by an Execution object, which keeps track of all of the parameters associated with an execution and
5
+ provides a number of functions that enable a program to help keep track of the configuration and results of a model execution.
6
+
7
+ The first step in creating a DerivaML execution is to create an `ExecutionConfiguration`.
8
+ The `ExecutionConfiguration` class is used to specify the inputs that are to be used by an Execution.
9
+ These inputs include
10
+ * A list of datasets that are used
11
+ * A list of other files (assets) that are to be used. This can include existing models, or any other information that the execution might need.
12
+ * The actual code that is being executed.
13
+
14
+ [`ExecutionConfiguration`][deriva_ml.execution_configuration.ExecutionConfiguration] is a Pydantic dataclass.
15
+ As part of initializing an execution, the assets and datasets in the configuration object are downloaded and cached.
16
+ The datasets are provided as a list of `DatasetSpec` objects, each of which has the form:
17
+ ```DatasetSpec(dataset_rid:RID, version:DatasetVersion, materialize:bool)```
18
+
19
+ It will be common to just want to use the latest version of the dataset, in which case you would use:
20
+ ```
21
+ deriva_ml = DerivaML(...)
22
+ dataset_rid = ...
23
+ datasets = [DatasetSpec(dataset_rid, version=deriva_ml.dataset_version(dataset_rid))]
24
+ ```
25
+
26
+ If a dataset is large, downloading from the catalog might take a significant amount of time.
@@ -9,11 +9,12 @@ fi
9
9
 
10
10
  # Default version bump is patch unless specified (patch, minor, or major)
11
11
  VERSION_TYPE=${1:-patch}
12
+
12
13
  echo "Bumping version: $VERSION_TYPE"
13
14
 
14
15
  # Bump the version using bump-my-version.
15
16
  # This command should update version files, commit the changes, and create a Git tag.
16
- bump-my-version bump $VERSION_TYPE --verbose
17
+ bump-my-version bump "$VERSION_TYPE" --verbose
17
18
 
18
19
  # Push commits and tags to the remote repository.
19
20
  echo "Pushing changes to remote repository..."
@@ -32,6 +33,6 @@ python -m build
32
33
  NEW_TAG=$(git describe --tags --abbrev=0)
33
34
  echo "New version tag: $NEW_TAG"
34
35
 
35
- twine upload dist/*${NEW_TAG}
36
+ twine upload dist/*${NEW_TAG/v/}
36
37
 
37
38
  echo "Release process complete!"
@@ -1,12 +1,15 @@
1
- """Ths module constains the definition of the DatabaseModel class. The role of this class is to provide an nterface between the BDBag representation
1
+ """This module contains the definition of the DatabaseModel class. The role of this class is to provide an interface between the BDBag representation
2
2
  of a dataset and a SQLite database in which the contents of the bag are stored.
3
3
  """
4
+
5
+ from __future__ import annotations
6
+
4
7
  import logging
5
8
  import sqlite3
6
9
 
7
10
  from csv import reader
8
11
  from pathlib import Path
9
- from typing import Any, Optional
12
+ from typing import Any, Optional, Generator
10
13
  from urllib.parse import urlparse
11
14
 
12
15
  from deriva.core.ermrest_model import Model
@@ -20,7 +23,7 @@ from .dataset_bag import DatasetBag
20
23
  class DatabaseModelMeta(type):
21
24
  """Use metaclass to ensure that there is only one instance per path"""
22
25
 
23
- _paths_loaded: dict[Path:"DatabaseModel"] = {}
26
+ _paths_loaded: dict[Path, "DatabaseModel"] = {}
24
27
 
25
28
  def __call__(cls, *args, **kwargs):
26
29
  logger = logging.getLogger("deriva_ml")
@@ -47,7 +50,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
47
50
  Because of nested datasets, it's possible that more than one dataset rid is in a bag, or that a dataset rid might
48
51
  appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
49
52
  into DatabaseModels, is kept in the class variable `_rid_map`.
50
-
53
+
51
54
  Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked, and a new
52
55
  sqllite instance is created for every new dataset version present.
53
56
 
@@ -81,7 +84,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
81
84
  except KeyError:
82
85
  raise DerivaMLException(f"Dataset {dataset_rid} not found")
83
86
 
84
- def __init__(self, minid: DatasetMinid, bag_path: Path):
87
+ def __init__(self, minid: DatasetMinid, bag_path: Path, dbase_path: Path):
85
88
  """Create a new DatabaseModel.
86
89
 
87
90
  Args:
@@ -92,8 +95,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
92
95
  self.bag_path = bag_path
93
96
  self.minid = minid
94
97
  self.dataset_rid = minid.dataset_rid
95
- dir_path = bag_path.parent
96
- self.dbase_file = dir_path / f"{minid.version_rid}.db"
98
+ self.dbase_file = dbase_path / f"{minid.version_rid}.db"
97
99
  self.dbase = sqlite3.connect(self.dbase_file)
98
100
 
99
101
  super().__init__(
@@ -315,6 +317,26 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
315
317
  )
316
318
  return datasets
317
319
 
320
+ def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
321
+ """Retrieve the contents of the specified table as a dictionary.
322
+
323
+ Args:
324
+ table: Table to retrieve data from. If the schema is not provided as part of the table name,
325
+ the method will attempt to locate the schema for the table.
326
+
327
+ Returns:
328
+ A generator producing dictionaries containing the contents of the specified table as name/value pairs.
329
+ """
330
+ table_name = self.normalize_table_name(table)
331
+ with self.dbase as dbase:
332
+ col_names = [
333
+ c[1]
334
+ for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()
335
+ ]
336
+ result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
337
+ while row := result.fetchone():
338
+ yield dict(zip(col_names, row))
339
+
318
340
  def normalize_table_name(self, table: str) -> str:
319
341
  """Attempt to insert the schema into a table name if it's not provided.
320
342
 
@@ -67,11 +67,12 @@ class Dataset:
67
67
 
68
68
  _Logger = logging.getLogger("deriva_ml")
69
69
 
70
- def __init__(self, model: DerivaModel, cache_dir: Path):
70
+ def __init__(self, model: DerivaModel, cache_dir: Path, working_dir: Path):
71
71
  self._model = model
72
72
  self._ml_schema = ML_SCHEMA
73
73
  self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
74
74
  self._cache_dir = cache_dir
75
+ self._working_dir = working_dir
75
76
  self._logger = logging.getLogger("deriva_ml")
76
77
 
77
78
  def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
@@ -92,7 +93,7 @@ class Dataset:
92
93
  dataset_list: list[DatasetSpec],
93
94
  description: Optional[str] = "",
94
95
  execution_rid: Optional[RID] = None,
95
- ) -> RID:
96
+ ) -> list[dict[str, Any]]:
96
97
  schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
97
98
 
98
99
  # Construct version records for insert
@@ -245,7 +246,7 @@ class Dataset:
245
246
  DerivaMLException: if provided RID is not to a dataset_table.
246
247
  """
247
248
 
248
- # Find all of the datasets that are reachable from this dataset and determine their new version numbers.
249
+ # Find all the datasets that are reachable from this dataset and determine their new version numbers.
249
250
  related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
250
251
  version_update_list = [
251
252
  DatasetSpec(
@@ -254,7 +255,7 @@ class Dataset:
254
255
  )
255
256
  for ds_rid in related_datasets
256
257
  ]
257
- updated_versions = self._insert_dataset_versions(
258
+ self._insert_dataset_versions(
258
259
  version_update_list, description=description, execution_rid=execution_rid
259
260
  )
260
261
  return [d.version for d in version_update_list if d.rid == dataset_rid][0]
@@ -751,9 +752,10 @@ class Dataset:
751
752
  ]
752
753
 
753
754
  def _table_paths(
754
- self, dataset: DatasetSpec = None, snapshot_catalog: Optional[DerivaML] = None
755
+ self,
756
+ dataset: Optional[DatasetSpec] = None,
757
+ snapshot_catalog: Optional[DerivaML] = None,
755
758
  ) -> Iterator[tuple[str, str, Table]]:
756
-
757
759
  paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
758
760
 
759
761
  def source_path(path: tuple[Table, ...]):
@@ -779,17 +781,19 @@ class Dataset:
779
781
  def _collect_paths(
780
782
  self,
781
783
  dataset_rid: Optional[RID] = None,
782
- snapshot_catalog: Optional[DerivaML] = None,
784
+ snapshot: Optional[Dataset] = None,
783
785
  dataset_nesting_depth: Optional[int] = None,
784
786
  ) -> set[tuple[Table, ...]]:
787
+ snapshot_catalog = snapshot if snapshot else self
785
788
 
786
- snapshot_catalog = snapshot_catalog or self
787
789
  dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
788
790
  "Dataset"
789
791
  ]
790
792
  dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
791
793
  "Dataset_Dataset"
792
794
  ]
795
+
796
+ # Figure out what types of elements the dataset contains.
793
797
  dataset_associations = [
794
798
  a
795
799
  for a in self.dataset_table.find_associations()
@@ -812,7 +816,8 @@ class Dataset:
812
816
  ]
813
817
  else:
814
818
  included_associations = dataset_associations
815
- # Get the paths through the schema and filter out all of dataset paths not used by this dataset.
819
+
820
+ # Get the paths through the schema and filter out all the dataset paths not used by this dataset.
816
821
  paths = {
817
822
  tuple(p)
818
823
  for p in snapshot_catalog._model._schema_to_paths()
@@ -826,9 +831,7 @@ class Dataset:
826
831
  nested_paths = set()
827
832
  if dataset_rid:
828
833
  for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
829
- nested_paths |= self._collect_paths(
830
- c, snapshot_catalog=snapshot_catalog
831
- )
834
+ nested_paths |= self._collect_paths(c, snapshot=snapshot_catalog)
832
835
  else:
833
836
  # Initialize nesting depth if not already provided.
834
837
  dataset_nesting_depth = (
@@ -974,7 +977,7 @@ class Dataset:
974
977
  if dataset.materialize
975
978
  else self._download_dataset_minid(minid)
976
979
  )
977
- return DatabaseModel(minid, bag_path).get_dataset()
980
+ return DatabaseModel(minid, bag_path, self._working_dir).get_dataset()
978
981
 
979
982
  def _version_snapshot(self, dataset: DatasetSpec) -> str:
980
983
  """Return a catalog with snapshot for the specified dataset version"""
@@ -109,7 +109,7 @@ class DatasetBag:
109
109
  for ts, on in paths:
110
110
  tables = " JOIN ".join(ts)
111
111
  on_expression = " and ".join(
112
- [f"{column_name(l)}={column_name(r)}" for l, r in on]
112
+ [f"{column_name(left)}={column_name(right)}" for left, right in on]
113
113
  )
114
114
  sql.append(
115
115
  f"SELECT {select_args} FROM {tables} ON {on_expression} WHERE {dataset_table_name}.RID IN ({datasets})"
@@ -5,6 +5,7 @@ import logging
5
5
  from random import random, randint
6
6
  import tempfile
7
7
  from tempfile import TemporaryDirectory
8
+ from typing import Optional
8
9
  import itertools
9
10
 
10
11
  from deriva.config.acl_config import AclConfig
@@ -18,7 +19,6 @@ from requests import HTTPError
18
19
  from deriva_ml import (
19
20
  DerivaML,
20
21
  ExecutionConfiguration,
21
- Workflow,
22
22
  MLVocab,
23
23
  BuiltinTypes,
24
24
  ColumnDefinition,
@@ -169,12 +169,9 @@ def create_demo_features(ml_instance):
169
169
  description="Model for our API workflow",
170
170
  )
171
171
 
172
- api_workflow = ml_instance.add_workflow(
173
- Workflow(
174
- name="API Workflow",
175
- url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml",
176
- workflow_type="API Workflow",
177
- )
172
+ api_workflow = ml_instance.create_workflow(
173
+ name="API Workflow",
174
+ workflow_type="API Workflow",
178
175
  )
179
176
 
180
177
  api_execution = ml_instance.create_execution(
@@ -322,7 +319,11 @@ def create_demo_catalog(
322
319
 
323
320
  class DemoML(DerivaML):
324
321
  def __init__(
325
- self, hostname, catalog_id, cache_dir: str = None, working_dir: str = None
322
+ self,
323
+ hostname,
324
+ catalog_id,
325
+ cache_dir: Optional[str] = None,
326
+ working_dir: Optional[str] = None,
326
327
  ):
327
328
  super().__init__(
328
329
  hostname=hostname,
@@ -8,7 +8,7 @@ from enum import Enum
8
8
  from typing import Any, Iterable, Optional, Annotated
9
9
 
10
10
  import deriva.core.ermrest_model as em
11
- from urllib.parse import urlparse, urljoin
11
+ from urllib.parse import urlparse
12
12
  from deriva.core.ermrest_model import builtin_types
13
13
  from pydantic import (
14
14
  BaseModel,
@@ -139,13 +139,18 @@ class FileSpec(BaseModel):
139
139
  if url_parts.scheme == "tag":
140
140
  return v
141
141
  elif not url_parts.scheme:
142
- return f'tag://{gethostname()},{date.today()}:file://{v}'
142
+ return f"tag://{gethostname()},{date.today()}:file://{v}"
143
143
  else:
144
144
  raise ValidationError("url is not a file URL")
145
145
 
146
146
  @model_serializer()
147
147
  def serialize_filespec(self):
148
- return {'URL': self.url, 'Description': self.description, 'MD5': self.md5, 'Length': self.length}
148
+ return {
149
+ "URL": self.url,
150
+ "Description": self.description,
151
+ "MD5": self.md5,
152
+ "Length": self.length,
153
+ }
149
154
 
150
155
 
151
156
  class VocabularyTerm(BaseModel):
@@ -32,6 +32,7 @@ from deriva.core.deriva_server import DerivaServer
32
32
  from deriva.core.ermrest_catalog import ResolveRidResult
33
33
  from deriva.core.ermrest_model import Key, Table
34
34
  from deriva.core.hatrac_store import HatracStore
35
+ from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
35
36
  from pydantic import validate_call, ConfigDict
36
37
  from requests import RequestException
37
38
 
@@ -70,17 +71,29 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
70
71
  try:
71
72
  from IPython import get_ipython
72
73
  except ImportError: # Graceful fallback if IPython isn't installed.
73
- get_ipython = lambda: None
74
+
75
+ def get_ipython():
76
+ """Dummy routine in case you are not running in IPython."""
77
+ return None
78
+
74
79
 
75
80
  try:
76
81
  from jupyter_server.serverapp import list_running_servers
77
82
  except ImportError:
78
- list_running_servers = lambda: []
83
+
84
+ def list_running_servers():
85
+ """Dummy routine in case you are not running in Jupyter."""
86
+ return []
87
+
79
88
 
80
89
  try:
81
90
  from ipykernel import get_connection_file
82
91
  except ImportError:
83
- get_connection_file = lambda: ""
92
+
93
+ def get_connection_file():
94
+ """Dummy routine in case you are not running in Jupyter."""
95
+ return ""
96
+
84
97
 
85
98
  if TYPE_CHECKING:
86
99
  from .execution import Execution
@@ -102,8 +115,8 @@ class DerivaML(Dataset):
102
115
  self,
103
116
  hostname: str,
104
117
  catalog_id: str | int,
105
- domain_schema: str = None,
106
- project_name: str = None,
118
+ domain_schema: Optional[str] = None,
119
+ project_name: Optional[str] = None,
107
120
  cache_dir: Optional[str] = None,
108
121
  working_dir: Optional[str] = None,
109
122
  model_version: str = "1",
@@ -150,7 +163,7 @@ class DerivaML(Dataset):
150
163
  self.cache_dir.mkdir(parents=True, exist_ok=True)
151
164
 
152
165
  # Initialize dataset class.
153
- super().__init__(self.model, self.cache_dir)
166
+ super().__init__(self.model, self.cache_dir, self.working_dir)
154
167
  self._logger = logging.getLogger("deriva_ml")
155
168
  self._logger.setLevel(logging_level)
156
169
 
@@ -205,9 +218,8 @@ class DerivaML(Dataset):
205
218
  except subprocess.CalledProcessError:
206
219
  self._logger.error("nbstripout is not found.")
207
220
 
208
- def _get_notebook_session(
209
- self,
210
- ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
221
+ @staticmethod
222
+ def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
211
223
  """Return the absolute path of the current notebook."""
212
224
  # Get the kernel's connection file and extract the kernel ID
213
225
  try:
@@ -245,7 +257,7 @@ class DerivaML(Dataset):
245
257
  def _get_notebook_path(self) -> Path | None:
246
258
  """Return the absolute path of the current notebook."""
247
259
 
248
- server, session = self._get_notebook_session()
260
+ server, session = DerivaML._get_notebook_session()
249
261
  if server and session:
250
262
  self._check_nbstrip_status()
251
263
  relative_path = session["notebook"]["path"]
@@ -267,7 +279,7 @@ class DerivaML(Dataset):
267
279
  ) # Get the caller's filename, which is two up the stack from here.
268
280
  else:
269
281
  raise DerivaMLException(
270
- f"Looking for caller failed"
282
+ "Looking for caller failed"
271
283
  ) # Stack is too shallow
272
284
  return filename, is_notebook
273
285
 
@@ -335,7 +347,7 @@ class DerivaML(Dataset):
335
347
  )
336
348
 
337
349
  def asset_dir(
338
- self, table: str | Table, prefix: str | Path = None
350
+ self, table: str | Table, prefix: Optional[str | Path] = None
339
351
  ) -> UploadAssetDirectory:
340
352
  """Return a local file path in which to place a files for an asset table. T
341
353
 
@@ -369,6 +381,29 @@ class DerivaML(Dataset):
369
381
  """
370
382
  return self.cache_dir if cached else self.working_dir
371
383
 
384
+ @staticmethod
385
+ def globus_login(host: str) -> None:
386
+ """Log into the specified host using Globus.
387
+
388
+ Args:
389
+ host:
390
+
391
+ Returns:
392
+
393
+ """
394
+ gnl = GlobusNativeLogin(host=host)
395
+ if gnl.is_logged_in([host]):
396
+ print("You are already logged in.")
397
+ else:
398
+ gnl.login(
399
+ [host],
400
+ no_local_server=True,
401
+ no_browser=True,
402
+ refresh_tokens=True,
403
+ update_bdbag_keychain=True,
404
+ )
405
+ print("Login Successful")
406
+
372
407
  def chaise_url(self, table: RID | Table) -> str:
373
408
  """Return a Chaise URL to the specified table.
374
409
 
@@ -379,15 +414,15 @@ class DerivaML(Dataset):
379
414
  Returns:
380
415
  URL to the table in Chaise format.
381
416
  """
417
+ table_obj = self.model.name_to_table(table)
382
418
  try:
383
- table = self.model.name_to_table(table)
384
419
  uri = self.catalog.get_server_uri().replace(
385
420
  "ermrest/catalog/", "chaise/recordset/#"
386
421
  )
387
422
  except DerivaMLException:
388
423
  # Perhaps we have a RID....
389
424
  uri = self.cite(table)
390
- return f"{uri}/{urlquote(table.schema.name)}:{urlquote(table.name)}"
425
+ return f"{uri}/{urlquote(table_obj.schema.name)}:{urlquote(table_obj.name)}"
391
426
 
392
427
  def cite(self, entity: dict | str) -> str:
393
428
  """Return a citation URL for the provided entity.
@@ -401,7 +436,9 @@ class DerivaML(Dataset):
401
436
  Raises:
402
437
  DerivaMLException: if provided RID does not exist.
403
438
  """
404
- if entity.startswith(f"https://{self.host_name}/id/{self.catalog_id}/"):
439
+ if isinstance(entity, str) and entity.startswith(
440
+ f"https://{self.host_name}/id/{self.catalog_id}/"
441
+ ):
405
442
  # Already got a citation...
406
443
  return entity
407
444
  try:
@@ -498,9 +535,9 @@ class DerivaML(Dataset):
498
535
  def create_asset(
499
536
  self,
500
537
  asset_name: str,
501
- column_defs: Iterable[ColumnDefinition] = None,
538
+ column_defs: Optional[Iterable[ColumnDefinition]] = None,
502
539
  comment: str = "",
503
- schema: str = None,
540
+ schema: Optional[str] = None,
504
541
  ) -> Table:
505
542
  """Create an asset table with the given asset name.
506
543
 
@@ -532,9 +569,9 @@ class DerivaML(Dataset):
532
569
  self,
533
570
  target_table: Table | str,
534
571
  feature_name: str,
535
- terms: list[Table | str] = None,
536
- assets: list[Table | str] = None,
537
- metadata: Iterable[ColumnDefinition | Table | Key | str] = None,
572
+ terms: Optional[list[Table | str]] = None,
573
+ assets: Optional[list[Table | str]] = None,
574
+ metadata: Optional[Iterable[ColumnDefinition | Table | Key | str]] = None,
538
575
  optional: Optional[list[str]] = None,
539
576
  comment: str = "",
540
577
  ) -> type[FeatureRecord]:
@@ -899,6 +936,7 @@ class DerivaML(Dataset):
899
936
  """
900
937
 
901
938
  def path_to_asset(path: str) -> str:
939
+ """Pull the asset name out of a path to that asset in the filesystem"""
902
940
  components = path.split("/")
903
941
  return components[
904
942
  components.index("asset") + 2
@@ -963,6 +1001,7 @@ class DerivaML(Dataset):
963
1001
  )
964
1002
 
965
1003
  def check_file_type(dtype: str) -> bool:
1004
+ """Make sure that the specified string is either the name or synonym for a file type term."""
966
1005
  for term in defined_types:
967
1006
  if dtype == term.name or (term.synonyms and file_type in term.synonyms):
968
1007
  return True
@@ -1098,7 +1137,7 @@ class DerivaML(Dataset):
1098
1137
 
1099
1138
  def create_workflow(
1100
1139
  self, name: str, workflow_type: str, description: str = "", create: bool = True
1101
- ) -> RID:
1140
+ ) -> RID | None:
1102
1141
  """Identify current executing program and return a workflow RID for it
1103
1142
 
1104
1143
  Determine the notebook or script that is currently being executed. Assume that this is
@@ -1166,7 +1205,7 @@ class DerivaML(Dataset):
1166
1205
  )
1167
1206
  github_url = result.stdout.strip().removesuffix(".git")
1168
1207
  except subprocess.CalledProcessError:
1169
- raise DerivaMLException(f"No GIT remote found")
1208
+ raise DerivaMLException("No GIT remote found")
1170
1209
 
1171
1210
  # Find the root directory for the repository
1172
1211
  repo_root = self._get_git_root()
@@ -1188,7 +1227,7 @@ class DerivaML(Dataset):
1188
1227
 
1189
1228
  """Get SHA-1 hash of latest commit of the file in the repository"""
1190
1229
  result = subprocess.run(
1191
- ["git", "log", "-n", "1", "--pretty=format:%H" "--", self.executable_path],
1230
+ ["git", "log", "-n", "1", "--pretty=format:%H--", self.executable_path],
1192
1231
  cwd=self.executable_path.parent,
1193
1232
  capture_output=True,
1194
1233
  text=True,
@@ -21,7 +21,7 @@ from .deriva_definitions import (
21
21
 
22
22
  from collections import Counter
23
23
  from pydantic import validate_call, ConfigDict
24
- from typing import Iterable
24
+ from typing import Iterable, Optional
25
25
 
26
26
 
27
27
  class DerivaModel:
@@ -267,7 +267,7 @@ class DerivaModel:
267
267
  def _schema_to_paths(
268
268
  self,
269
269
  root: Table = None,
270
- path: list[Table] = None,
270
+ path: Optional[list[Table]] = None,
271
271
  ) -> list[list[Table]]:
272
272
  """Recursively walk over the domain schema graph and extend the current path.
273
273
 
@@ -54,7 +54,9 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
54
54
  try:
55
55
  from jupyter_server.serverapp import list_running_servers
56
56
  except ImportError:
57
- list_running_servers = lambda: []
57
+
58
+ def list_running_servers():
59
+ return []
58
60
 
59
61
 
60
62
  class Execution:
@@ -155,7 +157,6 @@ class Execution:
155
157
  self._initialize_execution(reload)
156
158
 
157
159
  def _save_runtime_environment(self):
158
-
159
160
  runtime_env_path = ExecMetadataVocab.runtime_env.value
160
161
  runtime_env_dir = self.execution_metadata_path(runtime_env_path)
161
162
  with NamedTemporaryFile(
@@ -267,7 +268,7 @@ class Execution:
267
268
  # Execution metadata cannot be in a directory, so map path into filename.
268
269
  checkpoint_path = (
269
270
  self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
270
- / f"{notebook_name.replace('/','_')}.checkpoint"
271
+ / f"{notebook_name.replace('/', '_')}.checkpoint"
271
272
  )
272
273
  with open(checkpoint_path, "w", encoding="utf-8") as f:
273
274
  json.dump(notebook_content, f)
@@ -359,7 +360,7 @@ class Execution:
359
360
  if m := is_feature_asset_dir(p):
360
361
  try:
361
362
  self.update_status(
362
- Status.running, f'Uploading feature {m["feature_name"]}...'
363
+ Status.running, f"Uploading feature {m['feature_name']}..."
363
364
  )
364
365
  feature_assets[m["target_table"], m["feature_name"]] = (
365
366
  self._ml_object.upload_assets(p)