deriva-ml 1.13.2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/database_model.py +5 -11
- deriva_ml/dataset.py +279 -295
- deriva_ml/dataset_aux_classes.py +10 -10
- deriva_ml/demo_catalog.py +90 -67
- deriva_ml/deriva_definitions.py +43 -4
- deriva_ml/deriva_ml_base.py +24 -29
- deriva_ml/deriva_model.py +17 -5
- deriva_ml/execution.py +23 -3
- deriva_ml/history.py +2 -0
- deriva_ml/schema_setup/annotations.py +341 -126
- deriva_ml/schema_setup/create_schema.py +33 -65
- deriva_ml/schema_setup/policy.json +7 -3
- deriva_ml/upload.py +3 -3
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.13.3.dist-info}/METADATA +2 -2
- deriva_ml-1.13.3.dist-info/RECORD +31 -0
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.13.3.dist-info}/WHEEL +1 -1
- deriva_ml-1.13.2.dist-info/RECORD +0 -31
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.13.3.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.13.3.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.13.3.dist-info}/top_level.txt +0 -0
deriva_ml/dataset_aux_classes.py
CHANGED
```diff
@@ -2,8 +2,8 @@
 THis module defines the DataSet class with is used to manipulate n…
 """
 
-from datetime import datetime
 from .deriva_definitions import RID
+
 from enum import Enum
 from pydantic import (
     BaseModel,
@@ -98,7 +98,7 @@ class DatasetHistory(BaseModel):
         version_rid (RID): The RID of the version record for the dataset in the Dataset_Version table.
         minid (str): The URL that represents the handle of the dataset bag. This will be None if a MINID has not
             been created yet.
-
+        snapshot (str): Catalog snapshot ID of when the version record was created.
     """
 
     dataset_version: DatasetVersion
@@ -107,7 +107,7 @@ class DatasetHistory(BaseModel):
     execution_rid: Optional[RID] = None
     description: str = ""
     minid: Optional[str] = None
-
+    snapshot: Optional[str] = None
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -128,12 +128,12 @@ class DatasetMinid(BaseModel):
     """
 
     dataset_version: DatasetVersion
-    metadata: dict[str, str | int]
-    minid: str = Field(alias="compact_uri")
+    metadata: dict[str, str | int] = {}
+    minid: str = Field(alias="compact_uri", default=None)
     bag_url: str = Field(alias="location")
-    identifier: str
-    landing_page: str
-    version_rid: RID = Field(alias="…
+    identifier: Optional[str] = None
+    landing_page: Optional[str] = None
+    version_rid: RID = Field(alias="RID")
     checksum: str = Field(alias="checksums", default="")
 
     @computed_field
@@ -156,8 +156,8 @@ class DatasetMinid(BaseModel):
 
     @field_validator("bag_url", mode="before")
     @classmethod
-    def convert_location_to_str(cls, value: list[str]) -> str:
-        return value[0]
+    def convert_location_to_str(cls, value: list[str] | str) -> str:
+        return value[0] if isinstance(value, list) else value
 
     @field_validator("checksum", mode="before")
     @classmethod
```
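The notable change here is the loosened `DatasetMinid` validation: several fields became optional, and `bag_url` now accepts either a bare string or a one-element list. A minimal sketch of that pydantic v2 pattern, using a hypothetical stand-in model rather than the package's own class:

```python
from pydantic import BaseModel, Field, field_validator


class BagRecord(BaseModel):
    """Hypothetical stand-in for DatasetMinid's bag_url handling."""

    bag_url: str = Field(alias="location")

    @field_validator("bag_url", mode="before")
    @classmethod
    def convert_location_to_str(cls, value: list[str] | str) -> str:
        # MINID metadata sometimes wraps the location in a one-element list.
        return value[0] if isinstance(value, list) else value


print(BagRecord(location=["https://example.org/bag.zip"]).bag_url)
print(BagRecord(location="https://example.org/bag.zip").bag_url)
```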
deriva_ml/demo_catalog.py
CHANGED
```diff
@@ -1,18 +1,19 @@
 import atexit
-from importlib.metadata import version
 from importlib.resources import files
+import itertools
 import logging
 from random import randint, random
 from typing import Optional
-import …
+from tempfile import TemporaryDirectory
 
-from deriva.…
-from deriva.core import …
-from deriva.core import ErmrestCatalog, get_credential
+from deriva.core import DerivaServer, get_credential
+from deriva.core import ErmrestCatalog
 from deriva.core.datapath import DataPathException
 from deriva.core.ermrest_model import builtin_types, Schema, Table, Column
 from requests import HTTPError
+import subprocess
 
+from .schema_setup.annotations import catalog_annotation
 from deriva_ml import (
     DerivaML,
     ExecutionConfiguration,
@@ -23,8 +24,10 @@ from deriva_ml import (
     RID,
 )
 
-from deriva_ml.schema_setup.create_schema import …
-
+from deriva_ml.schema_setup.create_schema import (
+    initialize_ml_schema,
+    create_ml_schema,
+)
 
 TEST_DATASET_SIZE = 4
 
@@ -85,7 +88,7 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RI
 
     type_rid = ml_instance.add_term("Dataset_Type", "TestSet", description="A test")
     training_rid = ml_instance.add_term(
-        "Dataset_Type", "Training", description="A …
+        "Dataset_Type", "Training", description="A training set"
    )
    testing_rid = ml_instance.add_term(
        "Dataset_Type", "Testing", description="A testing set"
@@ -98,32 +101,46 @@ def create_demo_datasets(ml_instance: DerivaML) -> tuple[RID, list[RID], list[RI
     )
     subject_rids = [i["RID"] for i in table_path.entities().fetch()]
 
-    dataset_rids = []
-    for r in subject_rids[0:4]:
-        d = ml_instance.create_dataset(
-            type=[type_rid.name, "Testing"],
-            description=f"Dataset {r}",
-            version=DatasetVersion(1, 0, 0),
-        )
-        ml_instance.add_dataset_members(d, [r])
-        dataset_rids.append(d)
-
-    nested_datasets = []
-    for i in range(0, 4, 2):
-        nested_dataset = ml_instance.create_dataset(
-            type=[type_rid.name, "Training"],
-            description=f"Nested Dataset {i}",
-            version=DatasetVersion(1, 0, 0),
-        )
-        ml_instance.add_dataset_members(nested_dataset, dataset_rids[i : i + 2])
-        nested_datasets.append(nested_dataset)
+    ml_instance.add_term(
+        MLVocab.workflow_type,
+        "Create Dataset Workflow",
+        description="A Workflow that creates a new dataset.",
+    )
+    dataset_workflow = ml_instance.create_workflow(
+        name="API Workflow", workflow_type="Create Dataset Workflow"
+    )
 
-    double_nested_dataset = ml_instance.create_dataset(
-        type=type_rid.name,
-        description="Double nested dataset",
-        version=DatasetVersion(1, 0, 0),
+    dataset_execution = ml_instance.create_execution(
+        ExecutionConfiguration(workflow=dataset_workflow, description="Create Dataset")
     )
-    ml_instance.add_dataset_members(double_nested_dataset, nested_datasets)
+
+    with dataset_execution.execute() as exe:
+        dataset_rids = []
+        for r in subject_rids[0:4]:
+            d = exe.create_dataset(
+                dataset_types=[type_rid.name, "Testing"],
+                description=f"Dataset {r}",
+                version=DatasetVersion(1, 0, 0),
+            )
+            ml_instance.add_dataset_members(d, [r])
+            dataset_rids.append(d)
+
+        nested_datasets = []
+        for i in range(0, 4, 2):
+            nested_dataset = exe.create_dataset(
+                dataset_types=[type_rid.name, "Training"],
+                description=f"Nested Dataset {i}",
+                version=DatasetVersion(1, 0, 0),
+            )
+            exe.add_dataset_members(nested_dataset, dataset_rids[i : i + 2])
+            nested_datasets.append(nested_dataset)
+
+        double_nested_dataset = exe.create_dataset(
+            dataset_types=type_rid.name,
+            description="Double nested dataset",
+            version=DatasetVersion(1, 0, 0),
+        )
+        exe.add_dataset_members(double_nested_dataset, nested_datasets)
     return double_nested_dataset, nested_datasets, dataset_rids
 
 
@@ -251,14 +268,13 @@ def create_domain_schema(ml_instance: DerivaML, sname: str) -> None:
     :return:
     """
 
-    # Make sure that we have a ml schema
     _ = ml_instance.model.schemas["deriva-ml"]
 
     if ml_instance.model.schemas.get(sname):
         # Clean out any old junk....
         ml_instance.model.schemas[sname].drop()
 
-    domain_schema = ml_instance.model.…
+    domain_schema = ml_instance.model.create_schema(
         Schema.define(sname, annotations={"name_style": {"underline_space": True}})
     )
     subject_table = domain_schema.create_table(
@@ -266,6 +282,8 @@ def create_domain_schema(ml_instance: DerivaML, sname: str) -> None:
     )
     ml_instance.create_asset("Image", referenced_tables=[subject_table])
 
+    catalog_annotation(ml_instance.model)
+
 
 def destroy_demo_catalog(catalog):
     catalog.delete_ermrest_catalog(really=True)
```
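The demo's datasets are now created inside an execution context, so each dataset records the workflow and execution that produced it. A hedged sketch of that pattern against a live catalog, assuming the same imports the demo module uses; the hostname, catalog id, and vocabulary terms below are placeholders:

```python
from deriva_ml import DatasetVersion, DerivaML, ExecutionConfiguration, MLVocab

# Placeholder host and catalog id; any reachable Deriva catalog with the
# deriva-ml schema installed would do.
ml = DerivaML(hostname="demo.example.org", catalog_id="1")

ml.add_term(
    MLVocab.workflow_type,
    "Create Dataset Workflow",
    description="A Workflow that creates a new dataset.",
)
workflow = ml.create_workflow(
    name="API Workflow", workflow_type="Create Dataset Workflow"
)
execution = ml.create_execution(
    ExecutionConfiguration(workflow=workflow, description="Create Dataset")
)

with execution.execute() as exe:
    # Datasets created through `exe` are linked to this execution record.
    rid = exe.create_dataset(
        dataset_types="TestSet",
        description="Example dataset",
        version=DatasetVersion(1, 0, 0),
    )
```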
```diff
@@ -280,43 +298,47 @@ def create_demo_catalog(
     create_datasets=False,
     on_exit_delete=True,
 ) -> ErmrestCatalog:
-    credentials = get_credential(hostname)
-    server = DerivaServer("https", hostname, credentials=credentials)
+    credential = get_credential(hostname)
+
+    server = DerivaServer("https", hostname, credentials=credential)
     test_catalog = server.create_ermrest_catalog()
+    model = test_catalog.getCatalogModel()
+    model.configure_baseline_catalog()
+    policy_file = files("deriva_ml.schema_setup").joinpath("policy.json")
+    subprocess.run(
+        [
+            "deriva-acl-config",
+            "--host",
+            test_catalog.deriva_server.server,
+            "--config-file",
+            policy_file,
+            test_catalog.catalog_id,
+        ]
+    )
+
     if on_exit_delete:
         atexit.register(destroy_demo_catalog, test_catalog)
-    model = test_catalog.getCatalogModel()
 
     try:
-        …
-        policy_file = files("deriva_ml.schema_setup").joinpath("policy.json")
-        AclConfig(
-            hostname, test_catalog.catalog_id, policy_file, credentials=credentials
-        )
-        if populate or create_features or create_datasets:
-            populate_demo_catalog(deriva_ml, domain_schema)
-            if create_features:
-                create_demo_features(deriva_ml)
-            if create_datasets:
-                create_demo_datasets(deriva_ml)
+        with TemporaryDirectory() as tmpdir:
+            create_ml_schema(test_catalog, project_name=project_name)
+            deriva_ml = DerivaML(
+                hostname=hostname,
+                catalog_id=test_catalog.catalog_id,
+                project_name=project_name,
+                domain_schema=domain_schema,
+                logging_level=logging.WARN,
+                working_dir=tmpdir,
+                credential=credential,
+            )
+            create_domain_schema(deriva_ml, domain_schema)
+
+            if populate or create_features or create_datasets:
+                populate_demo_catalog(deriva_ml, domain_schema)
+                if create_features:
+                    create_demo_features(deriva_ml)
+                if create_datasets:
+                    create_demo_datasets(deriva_ml)
 
     except Exception:
         # on failure, delete catalog and re-raise exception
@@ -332,6 +354,7 @@ class DemoML(DerivaML):
         catalog_id,
         cache_dir: Optional[str] = None,
         working_dir: Optional[str] = None,
+        use_minid=True,
     ):
         super().__init__(
             hostname=hostname,
@@ -339,5 +362,5 @@ class DemoML(DerivaML):
             project_name="ml-test",
             cache_dir=cache_dir,
             working_dir=working_dir,
-            …
+            use_minid=use_minid,
         )
```
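Catalog ACLs are now bootstrapped by shelling out to the deriva-acl-config CLI instead of calling AclConfig in-process. A hedged sketch of the equivalent call; the host and catalog id are placeholders:

```python
import subprocess
from importlib.resources import files

# The policy file ships inside the deriva_ml.schema_setup package.
policy_file = files("deriva_ml.schema_setup").joinpath("policy.json")

subprocess.run(
    [
        "deriva-acl-config",
        "--host", "demo.example.org",   # placeholder host
        "--config-file", str(policy_file),
        "1",                            # placeholder catalog id
    ],
    check=True,  # raise CalledProcessError if the CLI reports failure
)
```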
deriva_ml/deriva_definitions.py
CHANGED
```diff
@@ -2,12 +2,16 @@
 Shared definitions that are used in different DerivaML modules.
 """
 
+from __future__ import annotations
+
 import warnings
 from datetime import date
 from enum import Enum
-from …
+from pathlib import Path
+from typing import Any, Iterable, Optional, Annotated, Generator
 
 import deriva.core.ermrest_model as em
+import deriva.core.utils.hash_utils as hash_utils
 from urllib.parse import urlparse
 from deriva.core.ermrest_model import builtin_types
 from pydantic import (
@@ -136,11 +140,14 @@ class FileSpec(BaseModel):
     @field_validator("url")
     @classmethod
     def validate_file_url(cls, v):
+        """Examine the provided URL. If it's a local path, convert it into a tag URL."""
         url_parts = urlparse(v)
         if url_parts.scheme == "tag":
+            # Already a tag URL, so just return it.
             return v
-        elif not url_parts.scheme:
-            …
+        elif (not url_parts.scheme) or url_parts.scheme == "file":
+            # There is no scheme part tof the URL, or it is a file URL, so it is a local file path, so convert to a tag URL.
+            return f"tag://{gethostname()},{date.today()}:file://{url_parts.path}"
         else:
             raise ValidationError("url is not a file URL")
 
@@ -153,6 +160,38 @@ class FileSpec(BaseModel):
             "Length": self.length,
         }
 
+    @staticmethod
+    def create_filespecs(
+        path: Path | str, description: str
+    ) -> Generator["FileSpec", None, None]:
+        """Given a file or directory, generate the sequence of corresponding FileSpecs sutable to create a File table
+
+        Arguments:
+            path: Path to the file or directory.
+            description: The description of the file(s)
+
+        Returns:
+            An iterable of FileSpecs for each file in the directory.
+        """
+        path = Path(path)
+
+        def list_all_files(p) -> list[Path]:
+            return (
+                (f for f in Path(p).rglob("*") if f.is_file()) if path.is_dir() else [p]
+            )
+
+        def create_spec(p: Path, description: str) -> FileSpec:
+            hashes = hash_utils.compute_file_hashes(p, hashes=["md5", "sha256"])
+            md5 = hashes["md5"][0]
+            return FileSpec(
+                length=path.stat().st_size,
+                md5=md5,
+                description=description,
+                url=p.as_posix(),
+            )
+
+        return (create_spec(file, description) for file in list_all_files(path))
+
 
 class VocabularyTerm(BaseModel):
     """An entry in a vocabulary table.
```
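A hedged sketch of the URL normalization the updated validate_file_url performs, extracted into a standalone function for illustration (the package does this inside a pydantic validator; the `gethostname` import matches the module's apparent usage):

```python
from datetime import date
from socket import gethostname
from urllib.parse import urlparse


def to_tag_url(v: str) -> str:
    """Normalize a bare path or file:// URL into a tag URL; pass tag URLs through."""
    parts = urlparse(v)
    if parts.scheme == "tag":
        return v
    if (not parts.scheme) or parts.scheme == "file":
        # Tag URLs embed the host and date, making the reference globally unique.
        return f"tag://{gethostname()},{date.today()}:file://{parts.path}"
    raise ValueError("url is not a file URL")


print(to_tag_url("/data/scan1.png"))  # tag://<host>,<date>:file:///data/scan1.png
```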
```diff
@@ -162,7 +201,7 @@ class VocabularyTerm(BaseModel):
         synonyms: List of alternative names for the term
         id: CURI identifier for the term
         uri: Unique URI for the term.
-        description: A description of the meaning
+        description: A description of the term meaning
         rid: Resource identifier assigned to the term
 
     Args:
```
deriva_ml/deriva_ml_base.py
CHANGED
```diff
@@ -51,6 +51,7 @@ from .deriva_definitions import (
     FileSpec,
     TableDefinition,
 )
+from .schema_setup.annotations import asset_annotation
 
 try:
     from icecream import ic
@@ -82,9 +83,10 @@ class DerivaML(Dataset):
         project_name: Optional[str] = None,
         cache_dir: Optional[str] = None,
         working_dir: Optional[str] = None,
-        model_version: str = "1",
         ml_schema: str = ML_SCHEMA,
         logging_level=logging.INFO,
+        credential=None,
+        use_minid=True,
     ):
         """Create and initialize a DerivaML instance.
 
@@ -93,13 +95,14 @@ class DerivaML(Dataset):
 
         Args:
             hostname: Hostname of the Deriva server.
-            catalog_id: Catalog ID. Either …
-            domain_schema: Schema name for domain …
+            catalog_id: Catalog ID. Either an identifier or a catalog name.
+            domain_schema: Schema name for domain-specific tables and relationships.
+            project_name: Project name. Defaults to name of domain schema.
             cache_dir: Directory path for caching data downloaded from the Deriva server as bdbag.
             working_dir: Directory path for storing data used by or generated by any computations.
-            …
+            use_minid: Use the MINID serice when downloading dataset bags.
         """
-        self.credential = get_credential(hostname)
+        self.credential = credential or get_credential(hostname)
         server = DerivaServer(
             "https",
             hostname,
@@ -119,21 +122,20 @@ class DerivaML(Dataset):
         ) / default_workdir
 
         self.working_dir.mkdir(parents=True, exist_ok=True)
-        self.cache_dir = (
-            Path(cache_dir) if cache_dir else Path.home() / "deriva-ml" / "cache"
-        )
+        self.cache_dir = Path(cache_dir) if cache_dir else self.working_dir / "cache"
 
         self.cache_dir.mkdir(parents=True, exist_ok=True)
 
         # Initialize dataset class.
-        super().__init__(…
+        super().__init__(
+            self.model, self.cache_dir, self.working_dir, use_minid=use_minid
+        )
         self._logger = logging.getLogger("deriva_ml")
         self._logger.setLevel(logging_level)
 
         self.host_name = hostname
         self.catalog_id = catalog_id
         self.ml_schema = ml_schema
-        self.version = model_version
         self.configuration = None
         self._execution: Optional[Execution] = None
         self.domain_schema = self.model.domain_schema
@@ -150,11 +152,6 @@ class DerivaML(Dataset):
         deriva_logger = logging.getLogger("deriva")
         deriva_logger.setLevel(logging_level)
 
-        if "dirty" in self.version:
-            logging.info(
-                f"Loading dirty model. Consider commiting and tagging: {self.version}"
-            )
-
     def __del__(self):
         try:
             if self._execution and self._execution.status != Status.completed:
@@ -438,6 +435,8 @@ class DerivaML(Dataset):
             )
         )
         atable.create_reference(self.model.name_to_table("Asset_Role"))
+
+        asset_annotation(asset_table)
         return asset_table
 
     # @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@@ -820,6 +819,8 @@ class DerivaML(Dataset):
     ) -> Iterable[RID]:
         """Add a new file to the File table in the catalog.
 
+        The input is an iterator of FileSpec objects which provide the MD5 checksum, length, and URL.
+
         Args:
             file_types: One or more file types. Must be a term from the File_Type controlled vocabulary.
             files: A sequence of file specifications that describe the files to add.
@@ -841,7 +842,6 @@ class DerivaML(Dataset):
                 return True
             return False
 
-        # Create the entry for the new dataset_table and get its RID.
         file_types = [file_types] if isinstance(file_types, str) else file_types
         pb = self._model.catalog.getPathBuilder()
         for file_type in file_types:
@@ -868,18 +868,12 @@ class DerivaML(Dataset):
 
         if execution_rid:
             # Get the name of the association table between file_table and execution.
-            …
-                self._model.schemas[self._ml_schema]
-                .tables["Execution"]
-                .find_associations()
-            ).name
-            pb.schemas[self._ml_schema].tables[exec_table].insert(
+            pb.schemas[self._ml_schema].File_Execution.insert(
                 [
                     {"File": file_rid, "Execution": execution_rid}
                     for file_rid in file_rids
                 ]
             )
-
         return file_rids
 
     def list_files(
@@ -890,9 +884,10 @@ class DerivaML(Dataset):
         file_path = ml_path.File
         type_path = ml_path.File_File_Type
 
-        …
+        path = file_path.link(
+            type_path, on=file_path.RID == type_path.File, join_type="left"
+        )
+        path = path.File.attributes(
             path.File.RID,
             path.File.URL,
             path.File.MD5,
@@ -902,9 +897,9 @@ class DerivaML(Dataset):
         )
         file_map = {}
         for f in path.fetch():
-            file_map.setdefault(f["RID"], f…
-            …
+            entry = file_map.setdefault(f["RID"], {**f, "File_Types": []})
+            if ft := f.get("File_Type"):  # assign-and-test in one go
+                entry["File_Types"].append(ft)
 
         # Now get rid of the File_Type key and return the result
         return [(f, f.pop("File_Type"))[0] for f in file_map.values()]
```
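Two constructor changes stand out: a pre-fetched credential can now be supplied instead of always calling get_credential(), and MINID creation can be switched off. A hedged sketch; the hostname and catalog id are placeholders:

```python
from deriva.core import get_credential
from deriva_ml import DerivaML

cred = get_credential("demo.example.org")  # placeholder host

ml = DerivaML(
    hostname="demo.example.org",
    catalog_id="1",      # placeholder catalog id
    credential=cred,     # new in 1.13.3: reuse an existing credential
    use_minid=False,     # new in 1.13.3: skip the MINID service for dataset bags
)
```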
deriva_ml/deriva_model.py
CHANGED
```diff
@@ -21,7 +21,7 @@ from .deriva_definitions import (
 
 from collections import Counter
 from pydantic import validate_call, ConfigDict
-from typing import Iterable, Optional
+from typing import Iterable, Optional, Any
 
 
 class DerivaModel:
@@ -61,7 +61,7 @@ class DerivaModel:
         self.schemas = self.model.schemas
 
         self.ml_schema = ml_schema
-        builtin_schemas = ["public", self.ml_schema, "www"]
+        builtin_schemas = ["public", self.ml_schema, "www", "WWW"]
         try:
             self.domain_schema = (
                 domain_schema
@@ -73,6 +73,11 @@ class DerivaModel:
             # No domain schema defined.
             self.domain_schema = domain_schema
 
+    @property
+    def chaise_config(self) -> dict[str, Any]:
+        """Return the chaise configuration."""
+        return self.model.chaise_config
+
     def __getattr__(self, name):
         # Called only if `name` is not found in Manager. Delegate attributes to model class.
         return getattr(self.model, name)
@@ -115,7 +120,12 @@ class DerivaModel:
         return vocab_columns.issubset({c.name.upper() for c in table.columns})
 
     def is_association(
-        self, …
+        self,
+        table_name: str | Table,
+        unqualified: bool = True,
+        pure: bool = True,
+        min_arity: int = 2,
+        max_arity: int = 2,
     ) -> bool | set | int:
         """Check the specified table to see if it is an association table.
 
@@ -130,7 +140,9 @@ class DerivaModel:
 
         """
         table = self.name_to_table(table_name)
-        return table.is_association(…
+        return table.is_association(
+            unqualified=unqualified, pure=pure, min_arity=min_arity, max_arity=max_arity
+        )
 
     def find_association(self, table1: Table | str, table2: Table | str) -> Table:
         """Given two tables, return an association table that connects the two.
@@ -302,7 +314,7 @@ class DerivaModel:
     ) -> list[list[Table]]:
         """Recursively walk over the domain schema graph and extend the current path.
 
-        Walk a schema graph and return a list all …
+        Walk a schema graph and return a list all the paths through the graph.
 
         Args:
             path: Source path so far
```
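is_association() now spells out the keyword arguments it forwards to ermrest's Table.is_association(). A hedged sketch of calling it through a helper; `ml` stands in for a connected DerivaML instance and the table name is a placeholder:

```python
def is_binary_association(ml, table_name: str) -> bool:
    """True if table_name is a pure, binary association table."""
    # pure=True rejects tables with extra payload columns; the arity bounds
    # restrict the check to associations linking exactly two tables.
    return bool(
        ml.model.is_association(
            table_name, unqualified=True, pure=True, min_arity=2, max_arity=2
        )
    )
```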
deriva_ml/execution.py
CHANGED
```diff
@@ -27,6 +27,7 @@ from .deriva_definitions import (
     MLAsset,
     ExecMetadataType,
     ExecAssetType,
+    FileSpec,
     DRY_RUN_RID,
 )
 from .deriva_ml_base import DerivaML, FeatureRecord
@@ -882,19 +883,25 @@ class Execution:
             feature.Execution = self.execution_rid
             file.write(json.dumps(feature.model_dump(mode="json")) + "\n")
 
-    @validate_call
-    def create_dataset(…
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def create_dataset(
+        self,
+        dataset_types: str | list[str],
+        description: str,
+        version: Optional[DatasetVersion] = None,
+    ) -> RID:
         """Create a new dataset with specified types.
 
         Args:
             dataset_types: param description:
             description: Markdown description of the dataset being created.
+            version: Version to assign to the dataset. Defaults to 0.1.0
 
         Returns:
             RID of the newly created dataset.
         """
         return self._ml_object.create_dataset(
-            dataset_types, description, self.execution_rid
+            dataset_types, description, self.execution_rid, version=version
         )
 
     def add_dataset_members(
```
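Execution.create_dataset() now accepts an optional version keyword that is forwarded to DerivaML.create_dataset(). A short sketch, where `exe` stands in for an Execution yielded by execute():

```python
from deriva_ml import DatasetVersion

# `exe` is assumed to be an Execution yielded by dataset_execution.execute().
rid = exe.create_dataset(
    dataset_types=["TestSet", "Training"],
    description="Training split",
    version=DatasetVersion(1, 0, 0),  # omit to use the documented 0.1.0 default
)
```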
```diff
@@ -952,6 +959,19 @@ class Execution:
             execution_rid=self.execution_rid,
         )
 
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def add_files(
+        self,
+        files: Iterable[FileSpec],
+        file_types: str | list[str],
+    ) -> Iterable[RID]:
+        """Add files to the file table"""
+        return self._ml_object.add_files(
+            files=files,
+            file_types=file_types,
+            execution_rid=self.execution_rid,
+        )
+
     def __str__(self):
         items = [
             f"caching_dir: {self._cache_dir}",
```
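The new Execution.add_files() pairs naturally with FileSpec.create_filespecs() from deriva_definitions.py: generate specs for everything under a directory, then register them under the current execution. A hedged sketch; the directory, file type term, and `exe` are placeholders:

```python
from deriva_ml.deriva_definitions import FileSpec

# `exe` is assumed to be an Execution yielded by execute(); "CSV" must already
# be a term in the File_Type controlled vocabulary.
specs = FileSpec.create_filespecs("results/", description="model outputs")
file_rids = exe.add_files(files=specs, file_types="CSV")
```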
deriva_ml/history.py
CHANGED
```diff
@@ -54,6 +54,8 @@ def datetime_epoch_us(dt):
 # -- --------------------------------------------------------------------------------------
 # Take the iso format string (same as RMT) and return the version number
 #
+
+
 def iso_to_snap(iso_datetime):
     rmt = datetime.fromisoformat(iso_datetime)
     return urlb32_encode(datetime_epoch_us(rmt))
```