deriva-ml 1.7.0__tar.gz → 1.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deriva_ml-1.7.0/src/deriva_ml.egg-info → deriva_ml-1.8.1}/PKG-INFO +1 -1
- deriva_ml-1.8.1/src/deriva_ml/VERSION.py +1 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/__init__.py +2 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/dataset.py +19 -25
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/dataset_aux_classes.py +8 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/dataset_bag.py +2 -2
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/demo_catalog.py +2 -2
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/deriva_definitions.py +42 -1
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/deriva_ml_base.py +102 -25
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/execution.py +6 -6
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/execution_configuration.py +2 -2
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/execution_environment.py +2 -2
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/schema_setup/create_schema.py +33 -7
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/test_functions.py +5 -9
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/upload.py +0 -1
- {deriva_ml-1.7.0 → deriva_ml-1.8.1/src/deriva_ml.egg-info}/PKG-INFO +1 -1
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/tests/test_basic_tables.py +1 -1
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/tests/test_dataset.py +16 -5
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/tests/test_execution.py +2 -5
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/tests/test_upload.py +2 -2
- deriva_ml-1.7.0/src/deriva_ml/VERSION.py +0 -1
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/LICENSE +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/README.md +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/pyproject.toml +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/setup.cfg +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/build/lib/schema_setup/__init__.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/build/lib/schema_setup/create_schema.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/build/lib/schema_setup/table_comments_utils.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/database_model.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/deriva_model.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/feature.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/history.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/schema_setup/__init__.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/schema_setup/alter_annotation.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/schema_setup/annotations.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/schema_setup/policy.json +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml/schema_setup/table_comments_utils.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml.egg-info/SOURCES.txt +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml.egg-info/entry_points.txt +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml.egg-info/requires.txt +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/src/deriva_ml.egg-info/top_level.txt +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/tests/test_download.py +0 -0
- {deriva_ml-1.7.0 → deriva_ml-1.8.1}/tests/test_features.py +0 -0
deriva_ml-1.8.1/src/deriva_ml/VERSION.py
@@ -0,0 +1 @@
+__version__ = "1.8.1"
src/deriva_ml/__init__.py
@@ -2,6 +2,7 @@ __all__ = [
     "DerivaML",
     "DerivaMLException",
     "FileUploadState",
+    "FileSpec",
     "ExecutionConfiguration",
     "Workflow",
     "DatasetBag",
@@ -26,6 +27,7 @@ from .deriva_definitions import (
    BuiltinTypes,
    UploadState,
    FileUploadState,
+    FileSpec,
    RID,
    DerivaMLException,
    MLVocab,
src/deriva_ml/dataset.py
@@ -75,9 +75,10 @@ class Dataset:
             rid_info = self._model.catalog.resolve_rid(dataset_rid, self._model.model)
         except KeyError as _e:
             raise DerivaMLException(f"Invalid RID {dataset_rid}")
-
-
-
+        if rid_info.table != self.dataset_table:
+            return False
+        elif deleted:
+            # Got a dataset rid. Now check to see if its deleted or not.
             return True
         else:
             return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]
@@ -293,7 +294,7 @@ class Dataset:
         pb = self._model.catalog.getPathBuilder()
         for ds_type in ds_types:
             if not check_dataset_type(ds_type):
-                raise DerivaMLException(
+                raise DerivaMLException("Dataset type must be a vocabulary term.")
         dataset_table_path = pb.schemas[self.dataset_table.schema.name].tables[
             self.dataset_table.name
         ]
@@ -444,7 +445,7 @@ class Dataset:
         self._model.model.apply()
         return table

-    @validate_call
+    # @validate_call
     def list_dataset_members(
         self, dataset_rid: RID, recurse: bool = False
     ) -> dict[str, list[dict[str, Any]]]:
@@ -469,34 +470,27 @@ class Dataset:
         pb = self._model.catalog.getPathBuilder()
         for assoc_table in self.dataset_table.find_associations():
             other_fkey = assoc_table.other_fkeys.pop()
-            self_fkey = assoc_table.self_fkey
             target_table = other_fkey.pk_table
             member_table = assoc_table.table

+            # Look at domain tables and nested datasets.
             if (
                 target_table.schema.name != self._model.domain_schema
                 and target_table != self.dataset_table
             ):
-                # Look at domain tables and nested datasets.
                 continue
-
-
-
+            member_column = (
+                "Nested_Dataset"
+                if target_table == self.dataset_table
+                else other_fkey.foreign_key_columns[0].name
+            )

             target_path = pb.schemas[target_table.schema.name].tables[target_table.name]
             member_path = pb.schemas[member_table.schema.name].tables[member_table.name]
-
-
-                c.name for c in next(iter(other_fkey.column_map.items()))
-            )
-            path = pb.schemas[member_table.schema.name].tables[member_table.name].path
-            path.filter(member_path.Dataset == dataset_rid)
-            path.link(
+
+            path = member_path.filter(member_path.Dataset == dataset_rid).link(
                 target_path,
-                on=(
-                    member_path.columns[member_link[0]]
-                    == target_path.columns[member_link[1]]
-                ),
+                on=(member_path.columns[member_column] == target_path.columns["RID"]),
             )
             target_entities = list(path.entities().fetch())
             members[target_table.name].extend(target_entities)
@@ -747,9 +741,9 @@ class Dataset:
         p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
         for table in path[1:]:
             if table == dataset_dataset:
-                p.append(
+                p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
             elif table == self.dataset_table:
-                p.append(
+                p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
             elif table.name == "Dataset_Version":
                 p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
             else:
@@ -898,7 +892,7 @@ class Dataset:
             config_file=spec_file,
             output_dir=tmp_dir,
             defer_download=True,
-            timeout=(10,
+            timeout=(10, 610),
             envars={"Dataset_RID": dataset.rid},
         )
         minid_page_url = exporter.export()[0]  # Get the MINID launch page
@@ -1111,7 +1105,7 @@ class Dataset:
         return [
             {
                 "processor": "json",
-                "processor_params": {"query_path":
+                "processor_params": {"query_path": "/schema", "output_path": "schema"},
             }
         ] + self._dataset_specification(writer)

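The rewrite of `list_dataset_members` above replaces the old build-then-mutate path construction with a single chained `filter(...).link(...)` expression joined on an explicit column equality. A minimal sketch of that deriva-py datapath idiom; the host, catalog id, and RID are illustrative:

```python
from deriva.core import ErmrestCatalog, get_credential

catalog = ErmrestCatalog("https", "example.org", "1", get_credential("example.org"))
pb = catalog.getPathBuilder()

member_path = pb.schemas["deriva-ml"].tables["Dataset_Dataset"]
target_path = pb.schemas["deriva-ml"].tables["Dataset"]

# Restrict the association table to one dataset, then join to the target
# table on an explicit column equality, as the new code does.
path = member_path.filter(member_path.Dataset == "1-ABCD").link(
    target_path,
    on=(member_path.columns["Nested_Dataset"] == target_path.columns["RID"]),
)
rows = list(path.entities().fetch())
```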
src/deriva_ml/dataset_aux_classes.py
@@ -187,6 +187,14 @@ class DatasetSpec(BaseModel):

     model_config = ConfigDict(arbitrary_types_allowed=True)

+    @field_validator("version", mode="before")
+    @classmethod
+    def version_field_validator(cls, v: Any) -> Any:
+        if isinstance(v, dict):
+            return DatasetVersion(**v)
+        else:
+            return v
+
     @model_validator(mode="before")
     @classmethod
     def _check_bare_rid(cls, data: Any) -> dict[str, str | bool]:
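This `mode="before"` validator is what lets a `DatasetSpec` be rebuilt from plain JSON, where `version` arrives as a dict instead of a `DatasetVersion` instance. A self-contained sketch of the same pydantic v2 pattern; `Version` and `Spec` here are stand-ins, not deriva-ml classes:

```python
from typing import Any
from pydantic import BaseModel, field_validator

class Version(BaseModel):
    major: int
    minor: int
    patch: int

class Spec(BaseModel):
    version: Version

    @field_validator("version", mode="before")
    @classmethod
    def coerce_version(cls, v: Any) -> Any:
        # Accept a plain dict (e.g., parsed from a JSON config) and
        # promote it to the model before normal validation runs.
        return Version(**v) if isinstance(v, dict) else v

spec = Spec(version={"major": 1, "minor": 0, "patch": 0})
print(spec.version.major)  # 1
```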
src/deriva_ml/dataset_bag.py
@@ -79,7 +79,7 @@ class DatasetBag:
         with self.database as dbase:
             select_args = ",".join(
                 [
-                    f'"{table_name}".{c[1]}'
+                    f'"{table_name}"."{c[1]}"'
                     for c in dbase.execute(
                         f'PRAGMA table_info("{table_name}")'
                     ).fetchall()
@@ -104,7 +104,7 @@ class DatasetBag:
             )

         def column_name(col: Column) -> str:
-            return f'"{self.model.normalize_table_name(col.table.name)}".{col.name}'
+            return f'"{self.model.normalize_table_name(col.table.name)}"."{col.name}"'

         for ts, on in paths:
             tables = " JOIN ".join(ts)
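Both fixes quote the column half of the generated `"table".column` references. That matters as soon as a catalog column name collides with a SQL keyword or contains non-identifier characters; a quick illustration against an in-memory SQLite database (the table and column names are made up):

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.execute('CREATE TABLE "Dataset" ("RID" TEXT, "Order" INTEGER)')  # "Order" is a SQL keyword
con.execute("INSERT INTO \"Dataset\" VALUES ('1-ABC', 1)")

# Fully quoted, as DatasetBag now generates it: works.
print(con.execute('SELECT "Dataset"."Order" FROM "Dataset"').fetchall())

# The old unquoted form fails with: sqlite3.OperationalError: near "Order"
# con.execute('SELECT "Dataset".Order FROM "Dataset"')
```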
src/deriva_ml/demo_catalog.py
@@ -26,7 +26,6 @@ from deriva_ml import (
     RID,
 )

-from deriva_ml.execution import Execution
 from deriva_ml.schema_setup.create_schema import initialize_ml_schema, create_ml_schema
 from deriva_ml.dataset import Dataset

@@ -114,7 +113,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RID]]:

     double_nested_dataset = ml_instance.create_dataset(
         type_rid.name,
-        description=
+        description="Double nested dataset",
         version=DatasetVersion(1, 0, 0),
     )
     ml_instance.add_dataset_members(double_nested_dataset, nested_datasets)
@@ -295,6 +294,7 @@ def create_demo_catalog(
         project_name=project_name,
         logging_level=logging.WARN,
     )
+    working_dir = deriva_ml.working_dir
     dataset_table = deriva_ml.dataset_table
     dataset_table.annotations.update(
         Dataset(
src/deriva_ml/deriva_definitions.py
@@ -3,12 +3,22 @@ Shared definitions that are used in different DerivaML modules.
 """

 import warnings
+from datetime import date
 from enum import Enum
 from typing import Any, Iterable, Optional, Annotated

 import deriva.core.ermrest_model as em
+from urllib.parse import urlparse, urljoin
 from deriva.core.ermrest_model import builtin_types
-from pydantic import
+from pydantic import (
+    BaseModel,
+    model_serializer,
+    Field,
+    computed_field,
+    field_validator,
+    ValidationError,
+)
+from socket import gethostname

 ML_SCHEMA = "deriva-ml"

@@ -109,6 +119,36 @@ class BuiltinTypes(Enum):
     serial8 = builtin_types.serial8.typename


+class FileSpec(BaseModel):
+    """An entry into the File table
+
+    Attributes:
+        url: The File url to the url.
+        description: The description of the file.
+    """
+
+    url: str
+    description: Optional[str] = ""
+    md5: str
+    length: int
+
+    @field_validator("url")
+    @classmethod
+    def validate_file_url(cls, v):
+        url_parts = urlparse(v)
+        if url_parts.scheme == "tag":
+            return v
+        elif not url_parts.scheme:
+            print(v)
+            return f'tag://{gethostname()},{date.today()}:file://{v}'
+        else:
+            raise ValidationError("url is not a file URL")
+
+    @model_serializer()
+    def serialize_filespec(self):
+        return {'URL': self.url, 'Description': self.description, 'MD5': self.md5, 'Length': self.length}
+
+
 class VocabularyTerm(BaseModel):
     """An entry in a vocabulary table.

@@ -144,6 +184,7 @@ class MLVocab(StrEnum):
     workflow_type = "Workflow_Type"
     execution_asset_type = "Execution_Asset_Type"
     execution_metadata_type = "Execution_Metadata_Type"
+    file_type = "File_Type"


 class ExecMetadataVocab(StrEnum):
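Taken together, `FileSpec`, its `url` validator, and the new `File_Type` vocabulary term govern how file metadata is normalized before insertion: a bare filesystem path is rewritten into a `tag:` URI stamped with the local hostname and today's date, and `model_dump()` emits the catalog's column names rather than the Python field names. A hedged sketch of that behavior (the path, md5, and length are placeholders):

```python
from deriva_ml import FileSpec

# A bare path has no URL scheme, so the validator rewrites it as a tag URI;
# the exact hostname and date depend on where and when this runs.
spec = FileSpec(
    url="/data/images/scan-001.png",
    md5="d41d8cd98f00b204e9800998ecf8427e",  # placeholder digest
    length=0,
)
print(spec.url)           # tag://myhost,2025-06-01:file:///data/images/scan-001.png
print(spec.model_dump())  # {'URL': 'tag://...', 'Description': '', 'MD5': '...', 'Length': 0}
```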
src/deriva_ml/deriva_ml_base.py
@@ -8,6 +8,8 @@ relationships that follow a specific data model.

 """

+from __future__ import annotations
+
 import getpass
 import logging
 from datetime import datetime
@@ -51,6 +53,7 @@ from .deriva_definitions import (
     ML_SCHEMA,
     VocabularyTerm,
     MLVocab,
+    FileSpec,
 )

 if TYPE_CHECKING:
@@ -112,10 +115,12 @@ class DerivaML(Dataset):
             if working_dir
             else Path.home() / "deriva-ml"
         ) / default_workdir
+
         self.working_dir.mkdir(parents=True, exist_ok=True)
         self.cache_dir = (
             Path(cache_dir) if cache_dir else Path.home() / "deriva-ml" / "cache"
         )
+
         self.cache_dir.mkdir(parents=True, exist_ok=True)

         # Initialize dataset class.
@@ -151,11 +156,11 @@ class DerivaML(Dataset):
         )

     def __del__(self):
-
-
-        self._execution.update_status(Status.aborted,
-
-
+        try:
+            if self._execution and self._execution.status != Status.completed:
+                self._execution.update_status(Status.aborted, "Execution Aborted")
+        except (AttributeError, requests.HTTPError):
+            pass

     @staticmethod
     def _get_session_config():
@@ -450,9 +455,9 @@ class DerivaML(Dataset):

         # Make sure that the provided assets or terms are actually assets or terms.
         if not all(map(self.model.is_asset, assets)):
-            raise DerivaMLException(
+            raise DerivaMLException("Invalid create_feature asset table.")
         if not all(map(self.model.is_vocabulary, terms)):
-            raise DerivaMLException(
+            raise DerivaMLException("Invalid create_feature asset table.")

         # Get references to the necessary tables and make sure that the
         # provided feature name exists.
@@ -785,7 +790,77 @@ class DerivaML(Dataset):
         ]
     )

-
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def add_files(
+        self,
+        files: Iterable[FileSpec],
+        file_types: str | list[str],
+        execution_rid: Optional[RID] = None,
+    ) -> Iterable[RID]:
+        """Add a new file to the File table in the catalog.
+
+        Args:
+            file_types: One or more file types. Must be a term from the File_Type controlled vocabulary.
+            files: A sequence of file specifications that describe the files to add.
+            execution_rid: Resource Identifier (RID) of the execution to associate with the file.
+
+        Returns:
+            Iterable of the RIDs of the files that were added.
+        """
+        defined_types = self.list_vocabulary_terms(MLVocab.file_type)
+        if execution_rid and self.resolve_rid(execution_rid).table.name != 'Execution':
+            raise DerivaMLException(f'RID {execution_rid} is not for an execution table.')
+
+        def check_file_type(dtype: str) -> bool:
+            for term in defined_types:
+                if dtype == term.name or (term.synonyms and file_type in term.synonyms):
+                    return True
+            return False
+
+        # Create the entry for the new dataset_table and get its RID.
+        file_types = [file_types] if isinstance(file_types, str) else file_types
+        pb = self._model.catalog.getPathBuilder()
+        for file_type in file_types:
+            if not check_file_type(file_type):
+                raise DerivaMLException("File type must be a vocabulary term.")
+        file_table_path = pb.schemas[self.ml_schema].tables["File"]
+        file_rids = [
+            e["RID"] for e in file_table_path.insert([f.model_dump() for f in files])
+        ]
+
+        # Get the name of the association table between file_table and file_type.
+        atable = next(
+            self._model.schemas[self._ml_schema]
+            .tables[MLVocab.file_type]
+            .find_associations()
+        ).name
+        pb.schemas[self._ml_schema].tables[atable].insert(
+            [
+                {"File_Type": file_type, "File": file_rid}
+                for file_rid in file_rids
+                for file_type in file_types
+            ]
+        )
+
+        if execution_rid:
+            # Get the name of the association table between file_table and execution.
+            exec_table = next(
+                self._model.schemas[self._ml_schema]
+                .tables["Execution"]
+                .find_associations()
+            ).name
+            pb.schemas[self._ml_schema].tables[exec_table].insert(
+                [
+                    {"File": file_rid, "Execution": execution_rid}
+                    for file_rid in file_rids
+                ]
+            )
+
+        return file_rids
+
+    def list_files(
+        self, file_types: Optional[list[str]] = None
+    ) -> list[dict[str, Any]]:
         """Return the contents of the file table. Denormalized file types into the file record."""
         atable = next(
             self._model.schemas[self._ml_schema]
@@ -795,26 +870,28 @@ class DerivaML(Dataset):
         ml_path = self.pathBuilder.schemas[self._ml_schema]
         atable_path = ml_path.tables[atable]
         file_path = ml_path.File
+        type_path = ml_path.File_File_Type
+
         # Get a list of all the dataset_type values associated with this dataset_table.
         files = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return
+        path = file_path.link(type_path)
+        path = path.attributes(
+            path.File.RID,
+            path.File.URL,
+            path.File.MD5,
+            path.File.Length,
+            path.File.Description,
+            path.File_File_Type.File_Type,
+        )
+        file_map = {}
+        for f in path.fetch():
+            file_map.setdefault(f['RID'], f | {'File_Types': []})['File_Types'].append(f['File_Type'])
+
+        # Now get rid of the File_Type key and return the result
+        return [ (f, f.pop('File_Type'))[0] for f in file_map.values()]

     def list_workflows(self) -> list[Workflow]:
-        """Return a list of all
+        """Return a list of all the workflows in the catalog."""
         workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
         return [
             Workflow(
@@ -898,7 +975,7 @@ class DerivaML(Dataset):

         if self._execution:
             DerivaMLException(
-
+                "Only one execution can be created for a Deriva ML instance."
             )
         else:
             self._execution = Execution(configuration, self)
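A plausible end-to-end use of the file APIs added above. The host, catalog id, and the "schema" file-type term are illustrative, and the term must exist in the File_Type vocabulary before `add_files` will accept it:

```python
from deriva_ml import DerivaML, FileSpec, MLVocab

ml = DerivaML("example.org", "1")  # illustrative host and catalog id
ml.add_term(MLVocab.file_type, "schema", description="Schema description files")

file_rids = ml.add_files(
    files=[FileSpec(url="/tmp/demo.json", md5="d41d8cd98f00b204e9800998ecf8427e", length=123)],
    file_types="schema",
)

# list_files() folds each file's vocabulary terms into a File_Types list.
for f in ml.list_files():
    print(f["RID"], f["URL"], f["File_Types"])
```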
src/deriva_ml/execution.py
@@ -113,13 +113,13 @@ class Execution:

         if self._ml_object.resolve_rid(configuration.workflow).table.name != "Workflow":
             raise DerivaMLException(
-
+                "Workflow specified in execution configuration is not a Workflow"
             )

         for d in self.configuration.datasets:
             if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
                 raise DerivaMLException(
-
+                    "Dataset specified in execution configuration is not a dataset"
                 )

         for a in self.configuration.assets:
@@ -127,7 +127,7 @@ class Execution:
             self._ml_object.resolve_rid(a).table.name
         ):
             raise DerivaMLException(
-
+                "Asset specified in execution configuration is not a asset table"
             )

         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
@@ -248,7 +248,7 @@ class Execution:

         self.start_time = datetime.now()
         self.uploaded_assets = None
-        self.update_status(Status.initializing,
+        self.update_status(Status.initializing, "Start ML algorithm ...")

     def execution_stop(self) -> None:
         """Finish the execution and update the duration and status of execution."""
@@ -303,7 +303,7 @@ class Execution:
             self.update_status(Status.failed, error)
             raise DerivaMLException(f"Fail to upload execution_assets. Error: {error}")

-        self.update_status(Status.running,
+        self.update_status(Status.running, "Updating features...")

         feature_assets = defaultdict(dict)

@@ -350,7 +350,7 @@ class Execution:
             ],
         )

-        self.update_status(Status.running,
+        self.update_status(Status.running, "Upload assets complete")
         return results

     def upload_execution_outputs(
src/deriva_ml/execution_environment.py
@@ -47,7 +47,7 @@ def get_platform_info():
     for attr in attributes:
         try:
             platform_info[attr] = getattr(platform, attr)()
-        except Exception
+        except Exception:
             # Not all attributes are available on all platforms.
             continue
     return platform_info
@@ -67,7 +67,7 @@ def get_os_info():
     ]:
         try:
             values[func] = getattr(os, "get" + func)()
-        except (OSError, AttributeError)
+        except (OSError, AttributeError):
             pass
     values["umask"] = oct(get_umask())
     values["name"] = os.name
src/deriva_ml/schema_setup/create_schema.py
@@ -1,7 +1,7 @@
 import argparse
 import sys

-from deriva.core import DerivaServer,
+from deriva.core import DerivaServer, get_credential
 from deriva.core.ermrest_model import Model
 from deriva.core.ermrest_model import (
     builtin_types,
@@ -32,7 +32,7 @@ def define_table_workflow(workflow_annotation: dict):
     )


-def define_table_dataset(
+def define_table_dataset(dataset_annotation: dict = None):
     return Table.define(
         tname="Dataset",
         column_defs=[
@@ -43,7 +43,7 @@ def define_table_dataset(sname, dataset_annotation: dict = None):
     )


-def define_table_dataset_version(sname: str
+def define_table_dataset_version(sname: str):
     return Table.define(
         tname="Dataset_Version",
         column_defs=[
@@ -100,6 +100,14 @@ def define_asset_execution_asset(sname: str, execution_asset_annotation: dict):
     return table_def


+def define_table_file(sname):
+    """Define files table structure"""
+    return Table.define_asset(
+        sname=sname,
+        tname="File",
+    )
+
+
 def create_www_schema(model: Model):
     """
     Set up a new schema and tables to hold web-page like content. The tables include a page table, and an asset
@@ -142,15 +150,12 @@ def create_www_schema(model: Model):
             },
         )
     )
-
     return www_schema


 def create_ml_schema(
     model: Model, schema_name: str = "deriva-ml", project_name: str = None
 ):
-    ml_catalog: ErmrestCatalog = model.catalog
-
     if model.schemas.get(schema_name):
         model.schemas[schema_name].drop(cascade=True)
     # get annotations
@@ -195,7 +200,7 @@ def create_ml_schema(
     )

     dataset_table = schema.create_table(
-        define_table_dataset(
+        define_table_dataset(annotations["dataset_annotation"])
     )
     dataset_type = schema.create_table(
         Table.define_vocabulary(MLVocab.dataset_type, f"{project_name}:{{RID}}")
@@ -263,6 +268,27 @@ def create_ml_schema(
         )
     )

+    # File table
+    file_table = schema.create_table(define_table_file(schema_name))
+    file_type = schema.create_table(
+        Table.define_vocabulary(MLVocab.file_type, f"{project_name}:{{RID}}")
+    )
+    schema.create_table(
+        Table.define_association(
+            associates=[
+                ("File", file_table),
+                (MLVocab.file_type, file_type),
+            ]
+        )
+    )
+    schema.create_table(
+        Table.define_association(
+            [
+                ("File", file_table),
+                ("Execution", execution_table),
+            ]
+        )
+    )
     create_www_schema(model)
     initialize_ml_schema(model, schema_name)
src/deriva_ml/test_functions.py
@@ -4,16 +4,13 @@ catalog_id = "eye-ai"
 # source_dataset = '2-7K8W'
 source_dataset = "3R6"
 create_catalog = False
-import
-from deriva_ml.demo_catalog import create_demo_catalog, DemoML, populate_demo_catalog
+from deriva_ml.demo_catalog import create_demo_catalog, DemoML
 from deriva_ml import (
     Workflow,
     ExecutionConfiguration,
     MLVocab as vc,
     DerivaML,
     DatasetSpec,
-    DatasetVersion,
-    RID,
 )


@@ -23,8 +20,7 @@ def setup_demo_ml():
         host, "test-schema", create_features=True, create_datasets=True
     )
     ml_instance = DemoML(host, test_catalog.catalog_id)
-
-    return ml_instance, config
+    return ml_instance


 def setup_dev():
@@ -100,12 +96,12 @@ def execution_test(ml_instance):
         vc.workflow_type, "ML Demo", description="A ML Workflow that uses Deriva ML API"
     )

-    api_workflow = Workflow(
+    api_workflow = ml_instance.add_workflow(Workflow(
         name="Manual Workflow",
         url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Execution.ipynb",
         workflow_type="Manual Workflow",
         description="A manual operation",
-    )
+    ))

     manual_execution = ml_instance.create_execution(
         ExecutionConfiguration(description="Sample Execution", workflow=api_workflow)
@@ -114,7 +110,7 @@ def execution_test(ml_instance):
     # Now lets create model configuration for our program.
     model_file = manual_execution.execution_asset_path("API_Model") / "modelfile.txt"
     with open(model_file, "w") as fp:
-        fp.write(
+        fp.write("My model")

     # Now upload the file and retrieve the RID of the new asset from the returned results.
     uploaded_assets = manual_execution.upload_execution_outputs()
src/deriva_ml/upload.py
@@ -48,7 +48,6 @@ from deriva.core.ermrest_model import Table
 from deriva.core.hatrac_store import HatracStore
 from deriva.core.utils import hash_utils, mime_utils
 from deriva.transfer.upload.deriva_upload import GenericUploader
-import logging
 from pydantic import validate_call, ConfigDict

 from deriva_ml.deriva_definitions import (
tests/test_basic_tables.py
@@ -36,7 +36,7 @@ class TestVocabulary(TestDerivaML):
         self.assertEqual(term.name, self.ml_instance.lookup_term("CV2", "T1").name)

         # Check for redundant terms.
-        with self.assertRaises(DerivaMLException)
+        with self.assertRaises(DerivaMLException):
             self.ml_instance.add_term(
                 "CV2", "T1", description="A vocab", exists_ok=False
             )
tests/test_dataset.py
@@ -121,11 +121,24 @@ class TestDataset(TestDerivaML):
         print(f"datasets {datasets}")
         import pprint

+        print("double_nested_dataset")
         pprint.pprint(
-            self.ml_instance.list_dataset_members(dataset_rid=double_nested_dataset)
-
+            self.ml_instance.list_dataset_members(dataset_rid=double_nested_dataset)
+        )
+
+        print("nested_dataset")
+        pprint.pprint(
+            [
+                self.ml_instance.list_dataset_members(dataset_rid=ds)
+                for ds in nested_datasets
             ]
         )
+
+        print("dataset")
+        pprint.pprint(
+            [self.ml_instance.list_dataset_members(dataset_rid=ds) for ds in datasets]
+        )
+
         print(
             "double nested children",
             self.ml_instance.list_dataset_children(dataset_rid=double_nested_dataset),
@@ -194,9 +207,7 @@ class TestDataset(TestDerivaML):
             "Manual Workflow",
             description="Initial setup of Model File",
         )
-
-            "Dataset_Type", "TestSet", description="A test"
-        )
+        self.ml_instance.add_term("Dataset_Type", "TestSet", description="A test")

         api_workflow = self.ml_instance.add_workflow(
             Workflow(
tests/test_execution.py
@@ -1,12 +1,9 @@
-from idlelib.run import manage_socket
-
 from derivaml_test import TestDerivaML
 from deriva_ml import (
     MLVocab as vc,
     Workflow,
     ExecutionConfiguration,
     DatasetSpec,
-    DerivaML,
 )


@@ -42,7 +39,7 @@ class TestExecution(TestDerivaML):
                 description="Sample Execution", workflow=api_workflow
             )
         )
-        with manual_execution
+        with manual_execution:
             pass
         manual_execution.upload_execution_outputs()

@@ -141,7 +138,7 @@ class TestExecution(TestDerivaML):
             manual_execution.execution_asset_path("API_Model") / "modelfile.txt"
         )
         with open(model_file, "w") as fp:
-            fp.write(
+            fp.write("My model")
         # Now upload the file and retrieve the RID of the new asset from the returned results.
         uploaded_assets = manual_execution.upload_execution_outputs()
         self.ml_instance._execution = None
tests/test_upload.py
@@ -90,10 +90,10 @@ class TestUpload(TestDerivaML):
             manual_execution.execution_asset_path("API_Model") / "modelfile.txt"
         )
         with open(model_file, "w") as fp:
-            fp.write(
+            fp.write("My model")

         # Now upload the file and retrieve the RID of the new asset from the returned results.
-
+        manual_execution.upload_execution_outputs()
         path = self.ml_instance.catalog.getPathBuilder().schemas["deriva-ml"]
         self.assertEqual(1, len(list(path.Execution_Asset.entities().fetch())))

deriva_ml-1.7.0/src/deriva_ml/VERSION.py
@@ -1 +0,0 @@
-__version__ = "1.7.0"