deriva-ml 1.8.11__tar.gz → 1.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. {deriva_ml-1.8.11/src/deriva_ml.egg-info → deriva_ml-1.9.0}/PKG-INFO +1 -1
  2. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/Notebooks/DerivaML Execution.ipynb +3 -11
  3. deriva_ml-1.9.0/docs/user-guide/execution-configuration.md +26 -0
  4. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/release.sh +2 -2
  5. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/database_model.py +27 -4
  6. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/dataset.py +14 -9
  7. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/dataset_bag.py +1 -1
  8. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/demo_catalog.py +9 -8
  9. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/deriva_definitions.py +8 -3
  10. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/deriva_ml_base.py +60 -21
  11. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/deriva_model.py +2 -2
  12. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/execution.py +5 -4
  13. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/execution_configuration.py +20 -23
  14. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/schema_setup/annotations.py +1 -1
  15. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/schema_setup/create_schema.py +3 -2
  16. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/upload.py +1 -1
  17. {deriva_ml-1.8.11 → deriva_ml-1.9.0/src/deriva_ml.egg-info}/PKG-INFO +1 -1
  18. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml.egg-info/SOURCES.txt +0 -6
  19. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/derivaml_test.py +1 -0
  20. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/test_dataset.py +8 -38
  21. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/test_execution.py +9 -15
  22. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/test_upload.py +5 -7
  23. deriva_ml-1.8.11/docs/user-guide/execution-configuration.md +0 -14
  24. deriva_ml-1.8.11/src/deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -36
  25. deriva_ml-1.8.11/src/deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -255
  26. deriva_ml-1.8.11/src/deriva_ml/build/lib/schema_setup/create_schema.py +0 -165
  27. deriva_ml-1.8.11/src/deriva_ml/schema_setup/alter_annotation.py +0 -55
  28. deriva_ml-1.8.11/src/deriva_ml/schema_setup/table_comments_utils.py +0 -56
  29. deriva_ml-1.8.11/tests/__init__.py +0 -0
  30. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/.github/workflows/publish-docs.yml +0 -0
  31. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/.gitignore +0 -0
  32. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/LICENSE +0 -0
  33. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/README.md +0 -0
  34. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/.DS_Store +0 -0
  35. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/Notebooks/DerivaML Create Notes.ipynb +0 -0
  36. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/Notebooks/DerivaML Dataset.ipynb +0 -0
  37. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/Notebooks/DerivaML Features.ipynb +0 -0
  38. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/Notebooks/DerivaML Vocabulary.ipynb +0 -0
  39. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/assets/ERD.png +0 -0
  40. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/assets/Launcher.png +0 -0
  41. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/assets/copy_minid.png +0 -0
  42. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/assets/deriva-logo.png +0 -0
  43. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/assets/deriva-ml.pdf +0 -0
  44. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/assets/sharing-at-home.pdf +0 -0
  45. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/dataset.md +0 -0
  46. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/dataset_aux_classes.md +0 -0
  47. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/dataset_bag.md +0 -0
  48. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/deriva_ml_base.md +0 -0
  49. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/deriva_model.md +0 -0
  50. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/execution.md +0 -0
  51. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/execution_configuration.md +0 -0
  52. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/feature.md +0 -0
  53. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/code-docs/upload.md +0 -0
  54. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/deriva_ml_structure.md +0 -0
  55. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/index.md +0 -0
  56. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/release-notes.md +0 -0
  57. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/user-guide/datasets.md +0 -0
  58. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/user-guide/identifiers.md +0 -0
  59. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/user-guide/install.md +0 -0
  60. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/docs/user-guide/ml_workflow_instruction.md +0 -0
  61. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/mkdocs.yml +0 -0
  62. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/pyproject.toml +0 -0
  63. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/setup.cfg +0 -0
  64. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/__init__.py +0 -0
  65. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/dataset_aux_classes.py +0 -0
  66. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/execution_environment.py +0 -0
  67. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/feature.py +0 -0
  68. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/history.py +0 -0
  69. {deriva_ml-1.8.11/src/deriva_ml/build/lib → deriva_ml-1.9.0/src/deriva_ml}/schema_setup/__init__.py +0 -0
  70. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/schema_setup/policy.json +0 -0
  71. {deriva_ml-1.8.11/src/deriva_ml/build/lib → deriva_ml-1.9.0/src/deriva_ml}/schema_setup/table_comments_utils.py +0 -0
  72. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml/test_functions.py +0 -0
  73. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  74. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml.egg-info/entry_points.txt +0 -0
  75. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml.egg-info/requires.txt +0 -0
  76. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/src/deriva_ml.egg-info/top_level.txt +0 -0
  77. {deriva_ml-1.8.11/src/deriva_ml/schema_setup → deriva_ml-1.9.0/tests}/__init__.py +0 -0
  78. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/runner.py +0 -0
  79. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/test_basic_tables.py +0 -0
  80. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/test_download.py +0 -0
  81. {deriva_ml-1.8.11 → deriva_ml-1.9.0}/tests/test_features.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deriva-ml
3
- Version: 1.8.11
3
+ Version: 1.9.0
4
4
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
5
5
  Author-email: ISRD <isrd-dev@isi.edu>
6
6
  Requires-Python: >=3.10
@@ -28,7 +28,7 @@
28
28
  "source": [
29
29
  "import builtins\n",
30
30
  "from deriva.core.utils.globus_auth_utils import GlobusNativeLogin\n",
31
- "from deriva_ml import ExecutionConfiguration, Workflow, MLVocab, DerivaSystemColumns\n",
31
+ "from deriva_ml import ExecutionConfiguration, MLVocab, DerivaSystemColumns\n",
32
32
  "from deriva_ml.demo_catalog import create_demo_catalog, DemoML\n",
33
33
  "from IPython.display import display, Markdown, JSON\n",
34
34
  "import itertools\n",
@@ -166,12 +166,11 @@
166
166
  "metadata": {},
167
167
  "cell_type": "code",
168
168
  "source": [
169
- "ml_instance.add_term(MLVocab.workflow_type, \"Manual Workflow\", description=\"Inital setup of Model File\")\n",
169
+ "ml_instance.add_term(MLVocab.workflow_type, \"Manual Workflow\", description=\"Initial setup of Model File\")\n",
170
170
  "ml_instance.add_term(MLVocab.execution_asset_type, \"API_Model\", description=\"Model for our API workflow\")\n",
171
171
  "\n",
172
- "api_workflow = Workflow(\n",
172
+ "api_workflow = ml_instance.create_workflow(\n",
173
173
  " name=\"Manual Workflow\",\n",
174
- " url='https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Execution.ipynb',\n",
175
174
  " workflow_type=\"Manual Workflow\",\n",
176
175
  " description=\"A manual operation\"\n",
177
176
  ")\n",
@@ -207,13 +206,6 @@
207
206
  "source": [
208
207
  "ml_instance.add_term(MLVocab.workflow_type, \"ML Demo\", description=\"A ML Workflow that uses Deriva ML API\")\n",
209
208
  "\n",
210
- "api_workflow = Workflow(\n",
211
- " name=\"ML Demo\",\n",
212
- " url=\"https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml\",\n",
213
- " workflow_type=\"ML Demo\",\n",
214
- " description=\"A workflow that uses Deriva ML\"\n",
215
- ")\n",
216
- "\n",
217
209
  "config = ExecutionConfiguration(\n",
218
210
  " datasets=[training_dataset_rid, {'rid':testing_dataset_rid, 'materialize':False}],\n",
219
211
  " assets = [training_model_rid],\n",
@@ -0,0 +1,26 @@
1
+ # Configuring an execution
2
+
3
+ One of the essential functions of DerivaML is to help keep track of how ML model results are created so that they can be shared and reproduced.
4
+ Every execution in DerivaML is represented by an Execution object, which keeps track of all of the parameters associated with an execution and
5
+ provides a number of functions that enable a program to help keep track of the configuration and results of a model execution.
6
+
7
+ The first step in creating a DerivaML execution is to create an `ExecutionConfiguration`.
8
+ The `ExecutionConfiguration` class is used to specify the inputs that are to be used by an Execution.
9
+ These inputs include
10
+ * A list of datasets that are used
11
+ * A list of other files (assets) that are to be used. This can include existing models, or any other information that the execution might need.
12
+ * The actual code that is being executed.
13
+
14
+ [`ExecutionConfiguration`][deriva_ml.execution_configuration.ExecutionConfiguration] is a Pydantic dataclass.
15
+ As part of initializing an execution, the assets and datasets in the configuration object are downloaded and cached.
16
+ The datasets are provided as a list of `DatasetSpec` objects, which have the form:
17
+ ```DatasetSpec(dataset_rid:RID, version:DatasetVersion, materialize:bool)```
18
+
19
+ It will be common to just want to use the latest version of the dataset, in which case you would use:
20
+ ```
21
+ deriva_ml = DerivaML(...)
22
+ dataset_rid = ...
23
+ datasets = [DatasetSpec(dataset_rid, version=deriva_ml.dataset_version(dataset_rid))]
24
+ ```
25
+
26
+ If a dataset is large, downloading from the catalog might take a significant amount of time.
@@ -13,7 +13,7 @@ echo "Bumping version: $VERSION_TYPE"
13
13
 
14
14
  # Bump the version using bump-my-version.
15
15
  # This command should update version files, commit the changes, and create a Git tag.
16
- bump-my-version bump $VERSION_TYPE --verbose
16
+ bump-my-version bump "$VERSION_TYPE" --verbose
17
17
 
18
18
  # Push commits and tags to the remote repository.
19
19
  echo "Pushing changes to remote repository..."
@@ -32,6 +32,6 @@ python -m build
32
32
  NEW_TAG=$(git describe --tags --abbrev=0)
33
33
  echo "New version tag: $NEW_TAG"
34
34
 
35
- twine upload dist/*${NEW_TAG}
35
+ twine upload "dist/*${NEW_TAG/v/}"
36
36
 
37
37
  echo "Release process complete!"
@@ -1,12 +1,15 @@
1
- """Ths module constains the definition of the DatabaseModel class. The role of this class is to provide an nterface between the BDBag representation
1
+ """Ths module contains the definition of the DatabaseModel class. The role of this class is to provide an nterface between the BDBag representation
2
2
  of a dataset and a sqllite database in which the contents of the bag are stored.
3
3
  """
4
+
5
+ from __future__ import annotations
6
+
4
7
  import logging
5
8
  import sqlite3
6
9
 
7
10
  from csv import reader
8
11
  from pathlib import Path
9
- from typing import Any, Optional
12
+ from typing import Any, Optional, Generator
10
13
  from urllib.parse import urlparse
11
14
 
12
15
  from deriva.core.ermrest_model import Model
@@ -20,7 +23,7 @@ from .dataset_bag import DatasetBag
20
23
  class DatabaseModelMeta(type):
21
24
  """Use metaclass to ensure that there is onl one instance per path"""
22
25
 
23
- _paths_loaded: dict[Path:"DatabaseModel"] = {}
26
+ _paths_loaded: dict[Path, "DatabaseModel"] = {}
24
27
 
25
28
  def __call__(cls, *args, **kwargs):
26
29
  logger = logging.getLogger("deriva_ml")
@@ -47,7 +50,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
47
50
  Because of nested datasets, it's possible that more than one dataset rid is in a bag, or that a dataset rid might
48
51
  appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
49
52
  into DatabaseModels, is kept in the class variable `_rid_map`.
50
-
53
+
51
54
  Because you can load diffent versions of a dataset simultaniously, the dataset RID and version number are tracked, and a new
52
55
  sqllite instance is created for every new dataset version present.
53
56
 
@@ -315,6 +318,26 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
315
318
  )
316
319
  return datasets
317
320
 
321
+ def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
322
+ """Retrieve the contents of the specified table as a dictionary.
323
+
324
+ Args:
325
+ table: Table to retrieve data from. f schema is not provided as part of the table name,
326
+ the method will attempt to locate the schema for the table.
327
+
328
+ Returns:
329
+ A generator producing dictionaries containing the contents of the specified table as name/value pairs.
330
+ """
331
+ table_name = self.normalize_table_name(table)
332
+ with self.dbase as dbase:
333
+ col_names = [
334
+ c[1]
335
+ for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()
336
+ ]
337
+ result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
338
+ while row := result.fetchone():
339
+ yield dict(zip(col_names, row))
340
+
318
341
  def normalize_table_name(self, table: str) -> str:
319
342
  """Attempt to insert the schema into a table name if it's not provided.
320
343
 
@@ -92,7 +92,7 @@ class Dataset:
92
92
  dataset_list: list[DatasetSpec],
93
93
  description: Optional[str] = "",
94
94
  execution_rid: Optional[RID] = None,
95
- ) -> RID:
95
+ ) -> list[dict[str, Any]]:
96
96
  schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
97
97
 
98
98
  # Construct version records for insert
@@ -245,7 +245,7 @@ class Dataset:
245
245
  DerivaMLException: if provided RID is not to a dataset_table.
246
246
  """
247
247
 
248
- # Find all of the datasets that are reachable from this dataset and determine their new version numbers.
248
+ # Find all the datasets that are reachable from this dataset and determine their new version numbers.
249
249
  related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
250
250
  version_update_list = [
251
251
  DatasetSpec(
@@ -254,7 +254,7 @@ class Dataset:
254
254
  )
255
255
  for ds_rid in related_datasets
256
256
  ]
257
- updated_versions = self._insert_dataset_versions(
257
+ self._insert_dataset_versions(
258
258
  version_update_list, description=description, execution_rid=execution_rid
259
259
  )
260
260
  return [d.version for d in version_update_list if d.rid == dataset_rid][0]
@@ -751,9 +751,10 @@ class Dataset:
751
751
  ]
752
752
 
753
753
  def _table_paths(
754
- self, dataset: DatasetSpec = None, snapshot_catalog: Optional[DerivaML] = None
754
+ self,
755
+ dataset: Optional[DatasetSpec] = None,
756
+ snapshot_catalog: Optional[DerivaML] = None,
755
757
  ) -> Iterator[tuple[str, str, Table]]:
756
-
757
758
  paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
758
759
 
759
760
  def source_path(path: tuple[Table, ...]):
@@ -779,17 +780,20 @@ class Dataset:
779
780
  def _collect_paths(
780
781
  self,
781
782
  dataset_rid: Optional[RID] = None,
782
- snapshot_catalog: Optional[DerivaML] = None,
783
+ snapshot: Optional[Dataset] = None,
783
784
  dataset_nesting_depth: Optional[int] = None,
784
785
  ) -> set[tuple[Table, ...]]:
785
786
 
786
- snapshot_catalog = snapshot_catalog or self
787
+ snapshot_catalog = snapshot if snapshot else self
788
+
787
789
  dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
788
790
  "Dataset"
789
791
  ]
790
792
  dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
791
793
  "Dataset_Dataset"
792
794
  ]
795
+
796
+ # Figure out what types of elements the dataset contains.
793
797
  dataset_associations = [
794
798
  a
795
799
  for a in self.dataset_table.find_associations()
@@ -812,7 +816,8 @@ class Dataset:
812
816
  ]
813
817
  else:
814
818
  included_associations = dataset_associations
815
- # Get the paths through the schema and filter out all of dataset paths not used by this dataset.
819
+
820
+ # Get the paths through the schema and filter out all the dataset paths not used by this dataset.
816
821
  paths = {
817
822
  tuple(p)
818
823
  for p in snapshot_catalog._model._schema_to_paths()
@@ -827,7 +832,7 @@ class Dataset:
827
832
  if dataset_rid:
828
833
  for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
829
834
  nested_paths |= self._collect_paths(
830
- c, snapshot_catalog=snapshot_catalog
835
+ c, snapshot=snapshot_catalog
831
836
  )
832
837
  else:
833
838
  # Initialize nesting depth if not already provided.
@@ -109,7 +109,7 @@ class DatasetBag:
109
109
  for ts, on in paths:
110
110
  tables = " JOIN ".join(ts)
111
111
  on_expression = " and ".join(
112
- [f"{column_name(l)}={column_name(r)}" for l, r in on]
112
+ [f"{column_name(left)}={column_name(right)}" for left, right in on]
113
113
  )
114
114
  sql.append(
115
115
  f"SELECT {select_args} FROM {tables} ON {on_expression} WHERE {dataset_table_name}.RID IN ({datasets})"
@@ -5,6 +5,7 @@ import logging
5
5
  from random import random, randint
6
6
  import tempfile
7
7
  from tempfile import TemporaryDirectory
8
+ from typing import Optional
8
9
  import itertools
9
10
 
10
11
  from deriva.config.acl_config import AclConfig
@@ -18,7 +19,6 @@ from requests import HTTPError
18
19
  from deriva_ml import (
19
20
  DerivaML,
20
21
  ExecutionConfiguration,
21
- Workflow,
22
22
  MLVocab,
23
23
  BuiltinTypes,
24
24
  ColumnDefinition,
@@ -169,12 +169,9 @@ def create_demo_features(ml_instance):
169
169
  description="Model for our API workflow",
170
170
  )
171
171
 
172
- api_workflow = ml_instance.add_workflow(
173
- Workflow(
174
- name="API Workflow",
175
- url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml",
176
- workflow_type="API Workflow",
177
- )
172
+ api_workflow = ml_instance.create_workflow(
173
+ name="API Workflow",
174
+ workflow_type="API Workflow",
178
175
  )
179
176
 
180
177
  api_execution = ml_instance.create_execution(
@@ -322,7 +319,11 @@ def create_demo_catalog(
322
319
 
323
320
  class DemoML(DerivaML):
324
321
  def __init__(
325
- self, hostname, catalog_id, cache_dir: str = None, working_dir: str = None
322
+ self,
323
+ hostname,
324
+ catalog_id,
325
+ cache_dir: Optional[str] = None,
326
+ working_dir: Optional[str] = None,
326
327
  ):
327
328
  super().__init__(
328
329
  hostname=hostname,
@@ -8,7 +8,7 @@ from enum import Enum
8
8
  from typing import Any, Iterable, Optional, Annotated
9
9
 
10
10
  import deriva.core.ermrest_model as em
11
- from urllib.parse import urlparse, urljoin
11
+ from urllib.parse import urlparse
12
12
  from deriva.core.ermrest_model import builtin_types
13
13
  from pydantic import (
14
14
  BaseModel,
@@ -139,13 +139,18 @@ class FileSpec(BaseModel):
139
139
  if url_parts.scheme == "tag":
140
140
  return v
141
141
  elif not url_parts.scheme:
142
- return f'tag://{gethostname()},{date.today()}:file://{v}'
142
+ return f"tag://{gethostname()},{date.today()}:file://{v}"
143
143
  else:
144
144
  raise ValidationError("url is not a file URL")
145
145
 
146
146
  @model_serializer()
147
147
  def serialize_filespec(self):
148
- return {'URL': self.url, 'Description': self.description, 'MD5': self.md5, 'Length': self.length}
148
+ return {
149
+ "URL": self.url,
150
+ "Description": self.description,
151
+ "MD5": self.md5,
152
+ "Length": self.length,
153
+ }
149
154
 
150
155
 
151
156
  class VocabularyTerm(BaseModel):
@@ -32,6 +32,7 @@ from deriva.core.deriva_server import DerivaServer
32
32
  from deriva.core.ermrest_catalog import ResolveRidResult
33
33
  from deriva.core.ermrest_model import Key, Table
34
34
  from deriva.core.hatrac_store import HatracStore
35
+ from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
35
36
  from pydantic import validate_call, ConfigDict
36
37
  from requests import RequestException
37
38
 
@@ -70,17 +71,29 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
70
71
  try:
71
72
  from IPython import get_ipython
72
73
  except ImportError: # Graceful fallback if IPython isn't installed.
73
- get_ipython = lambda: None
74
+
75
+ def get_ipython():
76
+ """Dummy routine in case you are not running in IPython."""
77
+ return None
78
+
74
79
 
75
80
  try:
76
81
  from jupyter_server.serverapp import list_running_servers
77
82
  except ImportError:
78
- list_running_servers = lambda: []
83
+
84
+ def list_running_servers():
85
+ """Dummy routine in case you are not running in Jupyter."""
86
+ return []
87
+
79
88
 
80
89
  try:
81
90
  from ipykernel import get_connection_file
82
91
  except ImportError:
83
- get_connection_file = lambda: ""
92
+
93
+ def get_connection_file():
94
+ """Dummy routine in case you are not running in Jupyter."""
95
+ return ""
96
+
84
97
 
85
98
  if TYPE_CHECKING:
86
99
  from .execution import Execution
@@ -102,8 +115,8 @@ class DerivaML(Dataset):
102
115
  self,
103
116
  hostname: str,
104
117
  catalog_id: str | int,
105
- domain_schema: str = None,
106
- project_name: str = None,
118
+ domain_schema: Optional[str] = None,
119
+ project_name: Optional[str] = None,
107
120
  cache_dir: Optional[str] = None,
108
121
  working_dir: Optional[str] = None,
109
122
  model_version: str = "1",
@@ -205,9 +218,8 @@ class DerivaML(Dataset):
205
218
  except subprocess.CalledProcessError:
206
219
  self._logger.error("nbstripout is not found.")
207
220
 
208
- def _get_notebook_session(
209
- self,
210
- ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
221
+ @staticmethod
222
+ def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
211
223
  """Return the absolute path of the current notebook."""
212
224
  # Get the kernel's connection file and extract the kernel ID
213
225
  try:
@@ -267,7 +279,7 @@ class DerivaML(Dataset):
267
279
  ) # Get the caller's filename, which is two up the stack from here.
268
280
  else:
269
281
  raise DerivaMLException(
270
- f"Looking for caller failed"
282
+ "Looking for caller failed"
271
283
  ) # Stack is too shallow
272
284
  return filename, is_notebook
273
285
 
@@ -335,7 +347,7 @@ class DerivaML(Dataset):
335
347
  )
336
348
 
337
349
  def asset_dir(
338
- self, table: str | Table, prefix: str | Path = None
350
+ self, table: str | Table, prefix: Optional[str | Path] = None
339
351
  ) -> UploadAssetDirectory:
340
352
  """Return a local file path in which to place a files for an asset table. T
341
353
 
@@ -369,6 +381,29 @@ class DerivaML(Dataset):
369
381
  """
370
382
  return self.cache_dir if cached else self.working_dir
371
383
 
384
+ @staticmethod
385
+ def globus_login(host: str) -> None:
386
+ """Log into the specified host using Globus.
387
+
388
+ Args:
389
+ host:
390
+
391
+ Returns:
392
+
393
+ """
394
+ gnl = GlobusNativeLogin(host=host)
395
+ if gnl.is_logged_in([host]):
396
+ print("You are already logged in.")
397
+ else:
398
+ gnl.login(
399
+ [host],
400
+ no_local_server=True,
401
+ no_browser=True,
402
+ refresh_tokens=True,
403
+ update_bdbag_keychain=True,
404
+ )
405
+ print("Login Successful")
406
+
372
407
  def chaise_url(self, table: RID | Table) -> str:
373
408
  """Return a Chaise URL to the specified table.
374
409
 
@@ -379,15 +414,15 @@ class DerivaML(Dataset):
379
414
  Returns:
380
415
  URL to the table in Chaise format.
381
416
  """
417
+ table_obj = self.model.name_to_table(table)
382
418
  try:
383
- table = self.model.name_to_table(table)
384
419
  uri = self.catalog.get_server_uri().replace(
385
420
  "ermrest/catalog/", "chaise/recordset/#"
386
421
  )
387
422
  except DerivaMLException:
388
423
  # Perhaps we have a RID....
389
424
  uri = self.cite(table)
390
- return f"{uri}/{urlquote(table.schema.name)}:{urlquote(table.name)}"
425
+ return f"{uri}/{urlquote(table_obj.schema.name)}:{urlquote(table_obj.name)}"
391
426
 
392
427
  def cite(self, entity: dict | str) -> str:
393
428
  """Return a citation URL for the provided entity.
@@ -401,7 +436,9 @@ class DerivaML(Dataset):
401
436
  Raises:
402
437
  DerivaMLException: if provided RID does not exist.
403
438
  """
404
- if entity.startswith(f"https://{self.host_name}/id/{self.catalog_id}/"):
439
+ if isinstance(entity, str) and entity.startswith(
440
+ f"https://{self.host_name}/id/{self.catalog_id}/"
441
+ ):
405
442
  # Already got a citation...
406
443
  return entity
407
444
  try:
@@ -498,9 +535,9 @@ class DerivaML(Dataset):
498
535
  def create_asset(
499
536
  self,
500
537
  asset_name: str,
501
- column_defs: Iterable[ColumnDefinition] = None,
538
+ column_defs: Optional[Iterable[ColumnDefinition]] = None,
502
539
  comment: str = "",
503
- schema: str = None,
540
+ schema: Optional[str] = None,
504
541
  ) -> Table:
505
542
  """Create an asset table with the given asset name.
506
543
 
@@ -532,9 +569,9 @@ class DerivaML(Dataset):
532
569
  self,
533
570
  target_table: Table | str,
534
571
  feature_name: str,
535
- terms: list[Table | str] = None,
536
- assets: list[Table | str] = None,
537
- metadata: Iterable[ColumnDefinition | Table | Key | str] = None,
572
+ terms: Optional[list[Table | str]] = None,
573
+ assets: Optional[list[Table | str]] = None,
574
+ metadata: Optional[Iterable[ColumnDefinition | Table | Key | str]] = None,
538
575
  optional: Optional[list[str]] = None,
539
576
  comment: str = "",
540
577
  ) -> type[FeatureRecord]:
@@ -899,6 +936,7 @@ class DerivaML(Dataset):
899
936
  """
900
937
 
901
938
  def path_to_asset(path: str) -> str:
939
+ """Pull the asset name out of a path to that asset in the filesystem"""
902
940
  components = path.split("/")
903
941
  return components[
904
942
  components.index("asset") + 2
@@ -963,6 +1001,7 @@ class DerivaML(Dataset):
963
1001
  )
964
1002
 
965
1003
  def check_file_type(dtype: str) -> bool:
1004
+ """Make sure that the specified string is either the name or synonym for a file type term."""
966
1005
  for term in defined_types:
967
1006
  if dtype == term.name or (term.synonyms and file_type in term.synonyms):
968
1007
  return True
@@ -1098,7 +1137,7 @@ class DerivaML(Dataset):
1098
1137
 
1099
1138
  def create_workflow(
1100
1139
  self, name: str, workflow_type: str, description: str = "", create: bool = True
1101
- ) -> RID:
1140
+ ) -> RID | None:
1102
1141
  """Identify current executing program and return a workflow RID for it
1103
1142
 
1104
1143
  Determine the notebook or script that is currently being executed. Assume that this is
@@ -1166,7 +1205,7 @@ class DerivaML(Dataset):
1166
1205
  )
1167
1206
  github_url = result.stdout.strip().removesuffix(".git")
1168
1207
  except subprocess.CalledProcessError:
1169
- raise DerivaMLException(f"No GIT remote found")
1208
+ raise DerivaMLException("No GIT remote found")
1170
1209
 
1171
1210
  # Find the root directory for the repository
1172
1211
  repo_root = self._get_git_root()
@@ -1188,7 +1227,7 @@ class DerivaML(Dataset):
1188
1227
 
1189
1228
  """Get SHA-1 hash of latest commit of the file in the repository"""
1190
1229
  result = subprocess.run(
1191
- ["git", "log", "-n", "1", "--pretty=format:%H" "--", self.executable_path],
1230
+ ["git", "log", "-n", "1", "--pretty=format:%H--", self.executable_path],
1192
1231
  cwd=self.executable_path.parent,
1193
1232
  capture_output=True,
1194
1233
  text=True,
@@ -21,7 +21,7 @@ from .deriva_definitions import (
21
21
 
22
22
  from collections import Counter
23
23
  from pydantic import validate_call, ConfigDict
24
- from typing import Iterable
24
+ from typing import Iterable, Optional
25
25
 
26
26
 
27
27
  class DerivaModel:
@@ -267,7 +267,7 @@ class DerivaModel:
267
267
  def _schema_to_paths(
268
268
  self,
269
269
  root: Table = None,
270
- path: list[Table] = None,
270
+ path: Optional[list[Table]] = None,
271
271
  ) -> list[list[Table]]:
272
272
  """Recursively walk over the domain schema graph and extend the current path.
273
273
 
@@ -54,7 +54,9 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
54
54
  try:
55
55
  from jupyter_server.serverapp import list_running_servers
56
56
  except ImportError:
57
- list_running_servers = lambda: []
57
+
58
+ def list_running_servers():
59
+ return []
58
60
 
59
61
 
60
62
  class Execution:
@@ -155,7 +157,6 @@ class Execution:
155
157
  self._initialize_execution(reload)
156
158
 
157
159
  def _save_runtime_environment(self):
158
-
159
160
  runtime_env_path = ExecMetadataVocab.runtime_env.value
160
161
  runtime_env_dir = self.execution_metadata_path(runtime_env_path)
161
162
  with NamedTemporaryFile(
@@ -267,7 +268,7 @@ class Execution:
267
268
  # Execution metadata cannot be in a directory, so map path into filename.
268
269
  checkpoint_path = (
269
270
  self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
270
- / f"{notebook_name.replace('/','_')}.checkpoint"
271
+ / f"{notebook_name.replace('/', '_')}.checkpoint"
271
272
  )
272
273
  with open(checkpoint_path, "w", encoding="utf-8") as f:
273
274
  json.dump(notebook_content, f)
@@ -359,7 +360,7 @@ class Execution:
359
360
  if m := is_feature_asset_dir(p):
360
361
  try:
361
362
  self.update_status(
362
- Status.running, f'Uploading feature {m["feature_name"]}...'
363
+ Status.running, f"Uploading feature {m['feature_name']}..."
363
364
  )
364
365
  feature_assets[m["target_table"], m["feature_name"]] = (
365
366
  self._ml_object.upload_assets(p)
@@ -1,12 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import json
4
- from typing import Optional, Any
4
+ from typing import Optional
5
5
 
6
6
  from pydantic import (
7
7
  BaseModel,
8
8
  conlist,
9
- ConfigDict, field_validator,
9
+ ConfigDict,
10
10
  )
11
11
  from pathlib import Path
12
12
 
@@ -36,7 +36,6 @@ class Workflow(BaseModel):
36
36
  checksum: Optional[str]
37
37
 
38
38
 
39
-
40
39
  class ExecutionConfiguration(BaseModel):
41
40
  """Define the parameters that are used to configure a specific execution.
42
41
 
@@ -69,23 +68,21 @@ class ExecutionConfiguration(BaseModel):
69
68
  config = json.load(fd)
70
69
  return ExecutionConfiguration.model_validate(config)
71
70
 
72
- def download_execution_configuration(
73
- self, configuration_rid: RID
74
- ) -> ExecutionConfiguration:
75
- """Create an ExecutionConfiguration object from a catalog RID that points to a JSON representation of that
76
- configuration in hatrac
77
-
78
- Args:
79
- configuration_rid: RID that should be to an asset table that refers to an execution configuration
80
-
81
- Returns:
82
- A ExecutionConfiguration object for configured by the parameters in the configuration file.
83
- """
84
- AssertionError("Not Implemented")
85
- return ExecutionConfiguration.load_configuration(configuration_rid)
86
-
87
- # configuration = self.retrieve_rid(configuration_rid)
88
- # with NamedTemporaryFile("w+", delete=False, suffix=".json") as dest_file:
89
- # hs = HatracStore("https", self.host_name, self.credential)
90
- # hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
91
- # return ExecutionConfiguration.load_configuration(Path(dest_file.name))
71
+ # def download_execution_configuration(
72
+ # self, configuration_rid: RID
73
+ # ) -> ExecutionConfiguration:
74
+ # """Create an ExecutionConfiguration object from a catalog RID that points to a JSON representation of that
75
+ # configuration in hatrac
76
+ #
77
+ # Args:
78
+ # configuration_rid: RID that should be to an asset table that refers to an execution configuration
79
+ #
80
+ # Returns:
81
+ # A ExecutionConfiguration object for configured by the parameters in the configuration file.
82
+ # """
83
+ # AssertionError("Not Implemented")
84
+ # configuration = self.retrieve_rid(configuration_rid)
85
+ # with NamedTemporaryFile("w+", delete=False, suffix=".json") as dest_file:
86
+ # hs = HatracStore("https", self.host_name, self.credential)
87
+ # hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
88
+ # return ExecutionConfiguration.load_configuration(Path(dest_file.name))
@@ -240,7 +240,7 @@ def main():
240
240
  parser.add_argument("--catalog_id", type=str, required=True)
241
241
  parser.add_argument("--schema_name", type=str, required=True)
242
242
  args = parser.parse_args()
243
- generate_annotation(args.catalog_id, args.schema_name)
243
+ generate_annotation(args.catalog_id)
244
244
 
245
245
 
246
246
  if __name__ == "__main__":