deriva-ml 1.16.0__py3-none-any.whl → 1.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/.DS_Store +0 -0
- deriva_ml/__init__.py +0 -10
- deriva_ml/core/base.py +18 -6
- deriva_ml/dataset/__init__.py +2 -7
- deriva_ml/dataset/aux_classes.py +2 -10
- deriva_ml/dataset/dataset.py +5 -4
- deriva_ml/dataset/dataset_bag.py +144 -151
- deriva_ml/dataset/upload.py +6 -4
- deriva_ml/demo_catalog.py +16 -2
- deriva_ml/execution/__init__.py +2 -1
- deriva_ml/execution/execution.py +5 -3
- deriva_ml/execution/execution_configuration.py +28 -9
- deriva_ml/execution/workflow.py +8 -0
- deriva_ml/model/catalog.py +55 -50
- deriva_ml/model/database.py +455 -81
- deriva_ml/test.py +94 -0
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.1.dist-info}/METADATA +9 -7
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.1.dist-info}/RECORD +22 -21
- deriva_ml/model/sql_mapper.py +0 -44
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.1.dist-info}/WHEEL +0 -0
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.1.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.1.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.16.0.dist-info → deriva_ml-1.17.1.dist-info}/top_level.txt +0 -0
deriva_ml/dataset/upload.py
CHANGED

@@ -77,11 +77,11 @@ feature_value_regex = feature_table_dir_regex + f"{SEP}(?P=feature_name)[.](?P<e
 feature_asset_dir_regex = feature_table_dir_regex + f"{SEP}asset{SEP}(?P<asset_table>[-\\w]+)"
 feature_asset_regex = feature_asset_dir_regex + f"{SEP}(?P<file>[A-Za-z0-9_-]+)[.](?P<ext>[a-z0-9]*)$"
 
-asset_path_regex = exec_dir_regex +
+asset_path_regex = exec_dir_regex + rf"{SEP}asset{SEP}(?P<schema>[-\w]+){SEP}(?P<asset_table>[-\w]*)"
 
 asset_file_regex = r"(?P<file>[-\w]+)[.](?P<ext>[a-z0-9]*)$"
 
-table_regex = exec_dir_regex +
+table_regex = exec_dir_regex + rf"{SEP}table{SEP}(?P<schema>[-\w]+){SEP}(?P<table>[-\w]+){SEP}(?P=table)[.](csv|json)$"
 
 
 def is_feature_dir(path: Path) -> Optional[re.Match]:
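
For orientation, the rewritten patterns resolve schema, table, and asset names through named groups, and table_regex additionally requires the file stem to match the table directory via the (?P=table) backreference. A minimal sketch of the new behavior, using a hypothetical stand-in for exec_dir_regex (the real definition appears earlier in upload.py and is not part of this diff):

    import re

    SEP = "/"  # assumption: separator used when composing these patterns
    exec_dir_regex = r".*/execution/(?P<execution_rid>[-\w]+)"  # hypothetical stand-in

    table_regex = exec_dir_regex + rf"{SEP}table{SEP}(?P<schema>[-\w]+){SEP}(?P<table>[-\w]+){SEP}(?P=table)[.](csv|json)$"

    m = re.match(table_regex, "/tmp/execution/1-abc/table/demo/Subject/Subject.csv")
    assert m and m["schema"] == "demo" and m["table"] == "Subject"
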
@@ -190,7 +190,9 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
     metadata_columns = model.asset_metadata(asset_table)
     asset_table = model.name_to_table(asset_table)
     schema = model.name_to_table(asset_table).schema.name
-
+
+    # Be careful here as a metadata value might be a string with can contain special characters.
+    metadata_path = "/".join([rf"(?P<{c}>[-:._ \w]+)" for c in metadata_columns])
     asset_path = f"{exec_dir_regex}/asset/{schema}/{asset_table.name}/{metadata_path}/{asset_file_regex}"
     asset_table = model.name_to_table(asset_table)
     schema = model.name_to_table(asset_table).schema.name
@@ -417,7 +419,7 @@ def asset_file_path(
         raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")
 
     for m in asset_metadata:
-        path = path / metadata.get(m, "None")
+        path = path / str(metadata.get(m, "None"))
     path.mkdir(parents=True, exist_ok=True)
     return path / file_name
 
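
The str() coercion is the companion to the demo_catalog.py change below: asset metadata can now carry non-string values such as datetime objects, and pathlib refuses to join those onto a Path directly. A small sketch of the failure mode, with hypothetical metadata values:

    from datetime import datetime
    from pathlib import Path

    metadata = {"Subject": "1-abc", "Acquisition_Time": datetime.now()}  # hypothetical values

    path = Path("/tmp/assets")
    for m in ("Subject", "Acquisition_Time"):
        # Path / datetime raises TypeError; str() keeps the directory build robust.
        path = path / str(metadata.get(m, "None"))
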
deriva_ml/demo_catalog.py
CHANGED

@@ -5,6 +5,7 @@ import itertools
 import logging
 import string
 from collections.abc import Iterator, Sequence
+from datetime import datetime
 from numbers import Integral
 from pathlib import Path
 from random import choice, randint, random
@@ -54,7 +55,13 @@ def populate_demo_catalog(ml_instance: DerivaML) -> None:
     )
     with execution.execute() as e:
         for s in ss:
-            image_file = e.asset_file_path(
+            image_file = e.asset_file_path(
+                "Image",
+                f"test_{s['RID']}.txt",
+                Subject=s["RID"],
+                Acquisition_Time=datetime.now(),
+                Acquisition_Date=datetime.now().date(),
+            )
             with image_file.open("w") as f:
                 f.write(f"Hello there {random()}\n")
     execution.upload_execution_outputs()
@@ -343,7 +350,14 @@ def create_domain_schema(catalog: ErmrestCatalog, sname: str) -> None:
     )
     with TemporaryDirectory() as tmpdir:
         ml_instance = DerivaML(hostname=catalog.deriva_server.server, catalog_id=catalog.catalog_id, working_dir=tmpdir)
-        ml_instance.create_asset(
+        ml_instance.create_asset(
+            "Image",
+            column_defs=[
+                Column.define("Acquisition_Time", builtin_types.timestamp),
+                Column.define("Acquisition_Date", builtin_types.date),
+            ],
+            referenced_tables=[subject_table],
+        )
         catalog_annotation(ml_instance.model)
 
 
deriva_ml/execution/__init__.py
CHANGED

@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING
 
 # Safe imports - no circular dependencies
-from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+from deriva_ml.execution.execution_configuration import ExecutionConfiguration, AssetRIDConfig
 from deriva_ml.execution.workflow import Workflow
 
 if TYPE_CHECKING:

@@ -22,4 +22,5 @@ __all__ = [
     "Execution",  # Lazy-loaded
     "ExecutionConfiguration",
     "Workflow",
+    "AssetRIDConfig"
 ]
deriva_ml/execution/execution.py
CHANGED

@@ -583,7 +583,6 @@ class Execution:
                     asset_rid=status.result["RID"],
                 )
             )
-
         self._update_asset_execution_table(asset_map)
         self.update_status(Status.running, "Updating features...")
 

@@ -805,7 +804,7 @@
         self,
         uploaded_assets: dict[str, list[AssetFilePath]],
         asset_role: str = "Output",
-    ):
+    ) -> None:
         """Add entry to the association table connecting an asset to an execution RID
 
         Args:

@@ -814,6 +813,9 @@
             asset_role: A term or list of terms from the Asset_Role vocabulary.
         """
         # Make sure the asset role is in the controlled vocabulary table.
+        if self._dry_run:
+            # Don't do any updates of we are doing a dry run.
+            return
         self._ml_object.lookup_term(MLVocab.asset_role, asset_role)
 
         pb = self._ml_object.pathBuilder

@@ -1098,7 +1100,7 @@
             description: Description of the files.
 
         Returns:
-            RID: Dataset RID that
+            RID: Dataset RID that identifies newly added files. Will be nested to mirror original directory structure
             of the files.
 
         Raises:
deriva_ml/execution/execution_configuration.py
CHANGED

@@ -22,15 +22,17 @@ Typical usage example:
 
 from __future__ import annotations
 
+from dataclasses import dataclass
 import json
 import sys
 from pathlib import Path
 from typing import Any
 
+from hydra_zen import builds
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 
 from deriva_ml.core.definitions import RID
-from deriva_ml.dataset.aux_classes import
+from deriva_ml.dataset.aux_classes import DatasetSpec
 from deriva_ml.execution.workflow import Workflow
 
 
@@ -64,7 +66,7 @@ class ExecutionConfiguration(BaseModel):
     ... )
     """
 
-    datasets: list[DatasetSpec]
+    datasets: list[DatasetSpec] = []
     assets: list[RID] = []
     workflow: RID | Workflow
     description: str = ""
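
With the new default, datasets becomes optional when constructing a configuration. A brief sketch, assuming a bare RID string is accepted for workflow as the type annotation suggests (the RID itself is made up):

    from deriva_ml.execution import ExecutionConfiguration

    config = ExecutionConfiguration(workflow="2-abcd", description="smoke test")
    assert config.datasets == [] and config.assets == []
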
@@ -72,13 +74,13 @@
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-
-
-
-
-
-
-
+    # @field_validator("datasets", mode="before")
+    # @classmethod
+    # def validate_datasets(cls, value: Any) -> Any:
+    #     if isinstance(value, DatasetList):
+    #         config_list: DatasetList = value
+    #         value = config_list.datasets
+    #     return value
 
     @field_validator("workflow", mode="before")
     @classmethod
@@ -137,3 +139,20 @@ class ExecutionConfiguration(BaseModel):
     # hs = HatracStore("https", self.host_name, self.credential)
     # hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
     # return ExecutionConfiguration.load_configuration(Path(dest_file.name))
+
+
+@dataclass
+class AssetRID(str):
+    rid: str
+    description: str = ""
+
+    def __new__(cls, rid: str, description: str = ""):
+        obj = super().__new__(cls, rid)
+        obj.description = description
+        return obj
+
+
+AssetRIDConfig = builds(AssetRID, populate_full_signature=True)
+
+
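
AssetRID subclasses str, so call sites that expect a bare RID string keep working while a description rides along; AssetRIDConfig is the hydra_zen structured config built from its signature. A hypothetical usage sketch:

    from hydra_zen import instantiate

    from deriva_ml.execution import AssetRIDConfig
    from deriva_ml.execution.execution_configuration import AssetRID

    rid = AssetRID("1-abc2", description="Training image archive")
    assert isinstance(rid, str) and rid == "1-abc2"
    assert rid.description == "Training image archive"

    # instantiate() on the structured config rebuilds the AssetRID.
    cfg = AssetRIDConfig(rid="1-abc2", description="Training image archive")
    assert instantiate(cfg) == "1-abc2"
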
deriva_ml/execution/workflow.py
CHANGED

@@ -9,6 +9,7 @@ from typing import Any
 import requests
 from pydantic import BaseModel, PrivateAttr, model_validator
 from requests import RequestException
+from setuptools_scm import get_version
 
 from deriva_ml.core.definitions import RID
 from deriva_ml.core.exceptions import DerivaMLException

@@ -129,6 +130,13 @@ class Workflow(BaseModel):
         self.url, self.checksum = Workflow.get_url_and_checksum(path)
         self.git_root = Workflow._get_git_root(path)
 
+        self.version = get_version(
+            root=str(self.git_root or Path.cwd()),
+            search_parent_directories=True,
+            # Optional but recommended: provide a safe fallback when tags are absent
+            fallback_version="0.0",
+        )
+
         self._logger = logging.getLogger("deriva_ml")
         return self
 
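
For context, setuptools_scm derives a PEP 440 version from git metadata rather than from a hard-coded string. A minimal sketch of the call the new Workflow code makes; the root and fallback values here are illustrative:

    from setuptools_scm import get_version

    version = get_version(
        root=".",                        # repo root; Workflow passes git_root or cwd
        search_parent_directories=True,  # walk up until a git checkout is found
        fallback_version="0.0",          # used when no tag/VCS metadata is available
    )
    # e.g. "1.17.1" on a tagged commit, "1.17.2.dev3+g1a2b3c4" a few commits past the tag
    print(version)
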
deriva_ml/model/catalog.py
CHANGED

@@ -8,7 +8,7 @@ ML-specific functionality. It handles schema management, feature definitions, an
 from __future__ import annotations
 
 # Standard library imports
-from collections import Counter
+from collections import Counter, defaultdict
 from graphlib import CycleError, TopologicalSorter
 from typing import Any, Callable, Final, Iterable, NewType, TypeAlias
 

@@ -312,7 +312,10 @@
 
         return [t for a in dataset_table.find_associations() if domain_table(t := a.other_fkeys.pop().pk_table)]
 
-    def _prepare_wide_table(self,
+    def _prepare_wide_table(self,
+                            dataset,
+                            dataset_rid: RID,
+                            include_tables: list[str]) -> tuple[dict[str, Any], list[tuple]]:
         """
         Generates details of a wide table from the model
 

@@ -327,7 +330,7 @@
         # Skip over tables that we don't want to include in the denormalized dataset.
         # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
         # table.
-        include_tables = set(include_tables)
+        include_tables = set(include_tables)
         for t in include_tables:
             # Check to make sure the table is in the catalog.
             _ = self.name_to_table(t)

@@ -335,8 +338,11 @@
         table_paths = [
             path
             for path in self._schema_to_paths()
-            if
+            if path[-1].name in include_tables and include_tables.intersection({p.name for p in path})
         ]
+        paths_by_element = defaultdict(list)
+        for p in table_paths:
+            paths_by_element[p[2].name].append(p)
 
         # Get the names of all of the tables that can be dataset elements.
         dataset_element_tables = {

@@ -344,58 +350,57 @@
         }
 
         skip_columns = {"RCT", "RMT", "RCB", "RMB"}
-
-
-
-        for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            (
-
-
+        element_tables = {}
+        for element_table, paths in paths_by_element.items():
+            graph = {}
+            for path in paths:
+                for left, right in zip(path[0:], path[1:]):
+                    graph.setdefault(left.name, set()).add(right.name)
+
+            # New lets remove any cycles that we may have in the graph.
+            # We will use a topological sort to find the order in which we need to join the tables.
+            # If we find a cycle, we will remove the table from the graph and splice in an additional ON clause.
+            # We will then repeat the process until there are no cycles.
+            graph_has_cycles = True
+            element_join_tables = []
+            element_join_conditions = {}
+            while graph_has_cycles:
+                try:
+                    ts = TopologicalSorter(graph)
+                    element_join_tables = list(reversed(list(ts.static_order())))
+                    graph_has_cycles = False
+                except CycleError as e:
+                    cycle_nodes = e.args[1]
+                    if len(cycle_nodes) > 3:
+                        raise DerivaMLException(f"Unexpected cycle found when normalizing dataset {cycle_nodes}")
+                    # Remove cycle from graph and splice in additional ON constraint.
+                    graph[cycle_nodes[1]].remove(cycle_nodes[0])
+
+            # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+            if "Dataset_Version" in element_join_tables:
+                element_join_tables.remove("Dataset_Version")
+
+            for path in paths:
+                for left, right in zip(path[0:], path[1:]):
+                    if right.name == "Dataset_Version":
+                        # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+                        continue
+                    if element_join_tables.index(right.name) < element_join_tables.index(left.name):
+                        continue
+                    table_relationship = self._table_relationship(left, right)
+                    element_join_conditions.setdefault(right.name, set()).add(
+                        (table_relationship[0], table_relationship[1])
+                    )
+            element_tables[element_table] = (element_join_tables, element_join_conditions)
         # Get the list of columns that will appear in the final denormalized dataset.
         denormalized_columns = [
             (table_name, c.name)
-            for table_name in
+            for table_name in include_tables
             if not self.is_association(table_name)  # Don't include association columns in the denormalized view.'
             for c in self.name_to_table(table_name).columns
-            if c.name not in skip_columns
+            if (not include_tables or table_name in include_tables) and (c.name not in skip_columns)
         ]
-
-        # List of dataset ids to include in the denormalized view.
-        dataset_rids = dataset.list_dataset_children(recurse=True)
-        return join_tables, tables, denormalized_columns, dataset_rids, dataset_element_tables
+        return element_tables, denormalized_columns
 
     def _table_relationship(
         self,