deriva-ml 1.16.0__py3-none-any.whl → 1.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -77,11 +77,11 @@ feature_value_regex = feature_table_dir_regex + f"{SEP}(?P=feature_name)[.](?P<e
  feature_asset_dir_regex = feature_table_dir_regex + f"{SEP}asset{SEP}(?P<asset_table>[-\\w]+)"
  feature_asset_regex = feature_asset_dir_regex + f"{SEP}(?P<file>[A-Za-z0-9_-]+)[.](?P<ext>[a-z0-9]*)$"
 
- asset_path_regex = exec_dir_regex + f"{SEP}asset{SEP}(?P<schema>[-\\w]+){SEP}(?P<asset_table>[-\\w]*)"
+ asset_path_regex = exec_dir_regex + rf"{SEP}asset{SEP}(?P<schema>[-\w]+){SEP}(?P<asset_table>[-\w]*)"
 
  asset_file_regex = r"(?P<file>[-\w]+)[.](?P<ext>[a-z0-9]*)$"
 
- table_regex = exec_dir_regex + f"{SEP}table{SEP}(?P<schema>[-\\w]+){SEP}(?P<table>[-\\w]+){SEP}(?P=table)[.](csv|json)$"
+ table_regex = exec_dir_regex + rf"{SEP}table{SEP}(?P<schema>[-\w]+){SEP}(?P<table>[-\w]+){SEP}(?P=table)[.](csv|json)$"
 
 
  def is_feature_dir(path: Path) -> Optional[re.Match]:
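The two changed lines above switch from plain f-strings with doubled backslashes to raw f-strings; both spell the identical pattern text, the raw form simply avoids escaping `\w`. A quick equivalence check, assuming a separator of "/" (the real module defines its own SEP constant):

    import re

    SEP = "/"  # illustrative; the module supplies its own separator constant

    old_style = f"{SEP}asset{SEP}(?P<schema>[-\\w]+)"   # escaped backslash in a plain f-string
    new_style = rf"{SEP}asset{SEP}(?P<schema>[-\w]+)"   # raw f-string, same resulting pattern

    assert old_style == new_style
    assert re.search(new_style, "/asset/demo-schema").group("schema") == "demo-schema"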
@@ -190,7 +190,9 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
  metadata_columns = model.asset_metadata(asset_table)
  asset_table = model.name_to_table(asset_table)
  schema = model.name_to_table(asset_table).schema.name
- metadata_path = "/".join([rf"(?P<{c}>[-\w]+)" for c in metadata_columns])
+
+ # Be careful here, as a metadata value might be a string which can contain special characters.
+ metadata_path = "/".join([rf"(?P<{c}>[-:._ \w]+)" for c in metadata_columns])
  asset_path = f"{exec_dir_regex}/asset/{schema}/{asset_table.name}/{metadata_path}/{asset_file_regex}"
  asset_table = model.name_to_table(asset_table)
  schema = model.name_to_table(asset_table).schema.name
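The widened character class matters because metadata values are no longer restricted to RID-like strings: the old `[-\w]+` would reject values such as timestamps, which contain colons, dots, and spaces. A small sketch of the new pattern at work (the column names here are illustrative, not taken from the package):

    import re

    # Hypothetical metadata columns; the real list comes from model.asset_metadata().
    metadata_columns = ["Subject", "Acquisition_Time"]
    metadata_path = "/".join([rf"(?P<{c}>[-:._ \w]+)" for c in metadata_columns])

    m = re.fullmatch(metadata_path, "1-ABC/2024-05-01 12:30:00.123")
    assert m is not None
    assert m.group("Acquisition_Time") == "2024-05-01 12:30:00.123"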
@@ -417,7 +419,7 @@ def asset_file_path(
  raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")
 
  for m in asset_metadata:
- path = path / metadata.get(m, "None")
+ path = path / str(metadata.get(m, "None"))
  path.mkdir(parents=True, exist_ok=True)
  return path / file_name
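Wrapping the metadata value in `str()` is what lets non-string values (for example the new datetime-valued columns) be used as path components, since `pathlib` only accepts `str` or `os.PathLike` operands for `/`. A minimal illustration, not taken from the package:

    from datetime import datetime
    from pathlib import Path

    root = Path("/tmp/assets")
    value = datetime.now()        # metadata values may now be datetimes, dates, numbers, ...

    try:
        _ = root / value          # raises TypeError: not a str or os.PathLike
    except TypeError:
        pass

    path = root / str(value)      # str() makes any metadata value a valid path segment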
 
deriva_ml/demo_catalog.py CHANGED
@@ -5,6 +5,7 @@ import itertools
  import logging
  import string
  from collections.abc import Iterator, Sequence
+ from datetime import datetime
  from numbers import Integral
  from pathlib import Path
  from random import choice, randint, random
@@ -54,7 +55,13 @@ def populate_demo_catalog(ml_instance: DerivaML) -> None:
  )
  with execution.execute() as e:
  for s in ss:
- image_file = e.asset_file_path("Image", f"test_{s['RID']}.txt", Subject=s["RID"])
+ image_file = e.asset_file_path(
+     "Image",
+     f"test_{s['RID']}.txt",
+     Subject=s["RID"],
+     Acquisition_Time=datetime.now(),
+     Acquisition_Date=datetime.now().date(),
+ )
  with image_file.open("w") as f:
  f.write(f"Hello there {random()}\n")
  execution.upload_execution_outputs()
@@ -343,7 +350,14 @@ def create_domain_schema(catalog: ErmrestCatalog, sname: str) -> None:
  )
  with TemporaryDirectory() as tmpdir:
  ml_instance = DerivaML(hostname=catalog.deriva_server.server, catalog_id=catalog.catalog_id, working_dir=tmpdir)
- ml_instance.create_asset("Image", referenced_tables=[subject_table])
+ ml_instance.create_asset(
+     "Image",
+     column_defs=[
+         Column.define("Acquisition_Time", builtin_types.timestamp),
+         Column.define("Acquisition_Date", builtin_types.date),
+     ],
+     referenced_tables=[subject_table],
+ )
  catalog_annotation(ml_instance.model)
 
 
@@ -1,7 +1,7 @@
  from typing import TYPE_CHECKING
 
  # Safe imports - no circular dependencies
- from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+ from deriva_ml.execution.execution_configuration import ExecutionConfiguration, AssetRIDConfig
  from deriva_ml.execution.workflow import Workflow
 
  if TYPE_CHECKING:
@@ -22,4 +22,5 @@ __all__ = [
  "Execution", # Lazy-loaded
  "ExecutionConfiguration",
  "Workflow",
+ "AssetRIDConfig"
  ]
@@ -583,7 +583,6 @@ class Execution:
  asset_rid=status.result["RID"],
  )
  )
-
  self._update_asset_execution_table(asset_map)
  self.update_status(Status.running, "Updating features...")
 
@@ -805,7 +804,7 @@ class Execution:
  self,
  uploaded_assets: dict[str, list[AssetFilePath]],
  asset_role: str = "Output",
- ):
+ ) -> None:
  """Add entry to the association table connecting an asset to an execution RID
 
  Args:
@@ -814,6 +813,9 @@ class Execution:
  asset_role: A term or list of terms from the Asset_Role vocabulary.
  """
  # Make sure the asset role is in the controlled vocabulary table.
+ if self._dry_run:
+     # Don't do any updates if we are doing a dry run.
+     return
  self._ml_object.lookup_term(MLVocab.asset_role, asset_role)
 
  pb = self._ml_object.pathBuilder
@@ -1098,7 +1100,7 @@ class Execution:
  description: Description of the files.
 
  Returns:
- RID: Dataset RID that identifes newly added files. Will be nested to mirror origioanl directory structure
+ RID: Dataset RID that identifies newly added files. Will be nested to mirror original directory structure
  of the files.
 
  Raises:
@@ -22,15 +22,17 @@ Typical usage example:
 
  from __future__ import annotations
 
+ from dataclasses import dataclass
  import json
  import sys
  from pathlib import Path
  from typing import Any
 
+ from hydra_zen import builds
  from pydantic import BaseModel, ConfigDict, Field, field_validator
 
  from deriva_ml.core.definitions import RID
- from deriva_ml.dataset.aux_classes import DatasetList, DatasetSpec
+ from deriva_ml.dataset.aux_classes import DatasetSpec
  from deriva_ml.execution.workflow import Workflow
 
 
@@ -64,7 +66,7 @@ class ExecutionConfiguration(BaseModel):
  ... )
  """
 
- datasets: list[DatasetSpec] | DatasetList = []
+ datasets: list[DatasetSpec] = []
  assets: list[RID] = []
  workflow: RID | Workflow
  description: str = ""
@@ -72,13 +74,13 @@ class ExecutionConfiguration(BaseModel):
 
  model_config = ConfigDict(arbitrary_types_allowed=True)
 
- @field_validator("datasets", mode="before")
- @classmethod
- def validate_datasets(cls, value: Any) -> Any:
-     if isinstance(value, DatasetList):
-         config_list: DatasetList = value
-         value = config_list.datasets
-     return value
+ # @field_validator("datasets", mode="before")
+ # @classmethod
+ # def validate_datasets(cls, value: Any) -> Any:
+ #     if isinstance(value, DatasetList):
+ #         config_list: DatasetList = value
+ #         value = config_list.datasets
+ #     return value
 
  @field_validator("workflow", mode="before")
  @classmethod
@@ -137,3 +139,20 @@ class ExecutionConfiguration(BaseModel):
  # hs = HatracStore("https", self.host_name, self.credential)
  # hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
  # return ExecutionConfiguration.load_configuration(Path(dest_file.name))
+
+
+ @dataclass
+ class AssetRID(str):
+     rid: str
+     description: str = ""
+
+     def __new__(cls, rid: str, description: str = ""):
+         obj = super().__new__(cls, rid)
+         obj.description = description
+         return obj
+
+ AssetRIDConfig = builds(AssetRID, populate_full_signature=True)
+
+
+
+
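`builds(AssetRID, populate_full_signature=True)` asks hydra_zen to generate a structured config (a dataclass) whose fields mirror AssetRID's signature, so asset RIDs and their descriptions can be declared in Hydra configs and turned back into AssetRID objects. A usage sketch, assuming both names are importable from `deriva_ml.execution.execution_configuration` (the RID value is made up):

    from hydra_zen import instantiate

    from deriva_ml.execution.execution_configuration import AssetRID, AssetRIDConfig

    cfg = AssetRIDConfig(rid="1-ABC", description="training image")
    asset = instantiate(cfg)                     # hydra_zen rebuilds the AssetRID

    assert isinstance(asset, str) and asset == "1-ABC"
    assert asset.description == "training image"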
@@ -9,6 +9,7 @@ from typing import Any
  import requests
  from pydantic import BaseModel, PrivateAttr, model_validator
  from requests import RequestException
+ from setuptools_scm import get_version
 
  from deriva_ml.core.definitions import RID
  from deriva_ml.core.exceptions import DerivaMLException
@@ -129,6 +130,13 @@ class Workflow(BaseModel):
  self.url, self.checksum = Workflow.get_url_and_checksum(path)
  self.git_root = Workflow._get_git_root(path)
 
+ self.version = get_version(
+     root=str(self.git_root or Path.cwd()),
+     search_parent_directories=True,
+     # Optional but recommended: provide a safe fallback when tags are absent
+     fallback_version="0.0",
+ )
+
  self._logger = logging.getLogger("deriva_ml")
  return self
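The new call asks setuptools_scm to derive the workflow's version from the enclosing git checkout at run time, falling back to "0.0" when no version tag can be found. A standalone sketch of the same idea, mirroring the call above (run from inside any git repository):

    from pathlib import Path

    from setuptools_scm import get_version

    version = get_version(
        root=str(Path.cwd()),             # start the search from the current checkout
        search_parent_directories=True,   # walk up until a repository is found
        fallback_version="0.0",           # used when no version tag can be derived
    )
    print(version)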
 
@@ -8,7 +8,7 @@ ML-specific functionality. It handles schema management, feature definitions, an
  from __future__ import annotations
 
  # Standard library imports
- from collections import Counter
+ from collections import Counter, defaultdict
  from graphlib import CycleError, TopologicalSorter
  from typing import Any, Callable, Final, Iterable, NewType, TypeAlias
 
@@ -312,7 +312,10 @@ class DerivaModel:
 
  return [t for a in dataset_table.find_associations() if domain_table(t := a.other_fkeys.pop().pk_table)]
 
- def _prepare_wide_table(self, dataset: DatasetLike, dataset_rid: RID, include_tables: list[str] | None) -> tuple:
+ def _prepare_wide_table(self,
+     dataset,
+     dataset_rid: RID,
+     include_tables: list[str]) -> tuple[dict[str, Any], list[tuple]]:
  """
  Generates details of a wide table from the model
 
@@ -327,7 +330,7 @@ class DerivaModel:
  # Skip over tables that we don't want to include in the denormalized dataset.
  # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
  # table.
- include_tables = set(include_tables) if include_tables else set()
+ include_tables = set(include_tables)
  for t in include_tables:
  # Check to make sure the table is in the catalog.
  _ = self.name_to_table(t)
@@ -335,8 +338,11 @@ class DerivaModel:
  table_paths = [
  path
  for path in self._schema_to_paths()
- if (not include_tables) or include_tables.intersection({p.name for p in path})
+ if path[-1].name in include_tables and include_tables.intersection({p.name for p in path})
  ]
+ paths_by_element = defaultdict(list)
+ for p in table_paths:
+     paths_by_element[p[2].name].append(p)
 
  # Get the names of all of the tables that can be dataset elements.
  dataset_element_tables = {
@@ -344,58 +350,57 @@ class DerivaModel:
  }
 
  skip_columns = {"RCT", "RMT", "RCB", "RMB"}
- tables = {}
- graph = {}
- for path in table_paths:
-     for left, right in zip(path[0:], path[1:]):
-         graph.setdefault(left.name, set()).add(right.name)
-
- # New lets remove any cycles that we may have in the graph.
- # We will use a topological sort to find the order in which we need to join the tables.
- # If we find a cycle, we will remove the table from the graph and splice in an additional ON clause.
- # We will then repeat the process until there are no cycles.
- graph_has_cycles = True
- join_tables = []
- while graph_has_cycles:
-     try:
-         ts = TopologicalSorter(graph)
-         join_tables = list(reversed(list(ts.static_order())))
-         graph_has_cycles = False
-     except CycleError as e:
-         cycle_nodes = e.args[1]
-         if len(cycle_nodes) > 3:
-             raise DerivaMLException(f"Unexpected cycle found when normalizing dataset {cycle_nodes}")
-         # Remove cycle from graph and splice in additional ON constraint.
-         graph[cycle_nodes[1]].remove(cycle_nodes[0])
-
- # The Dataset_Version table is a special case as it points to dataset and dataset to version.
- if "Dataset_Version" in join_tables:
-     join_tables.remove("Dataset_Version")
-
- for path in table_paths:
-     for left, right in zip(path[0:], path[1:]):
-         if right.name == "Dataset_Version":
-             # The Dataset_Version table is a special case as it points to dataset and dataset to version.
-             continue
-         if join_tables.index(right.name) < join_tables.index(left.name):
-             continue
-         table_relationship = self._table_relationship(left, right)
-         tables.setdefault(self.normalize_table_name(right.name), set()).add(
-             (table_relationship[0], table_relationship[1])
-         )
-
+ element_tables = {}
+ for element_table, paths in paths_by_element.items():
+     graph = {}
+     for path in paths:
+         for left, right in zip(path[0:], path[1:]):
+             graph.setdefault(left.name, set()).add(right.name)
+
+     # Now let's remove any cycles that we may have in the graph.
+     # We will use a topological sort to find the order in which we need to join the tables.
+     # If we find a cycle, we will remove the table from the graph and splice in an additional ON clause.
+     # We will then repeat the process until there are no cycles.
+     graph_has_cycles = True
+     element_join_tables = []
+     element_join_conditions = {}
+     while graph_has_cycles:
+         try:
+             ts = TopologicalSorter(graph)
+             element_join_tables = list(reversed(list(ts.static_order())))
+             graph_has_cycles = False
+         except CycleError as e:
+             cycle_nodes = e.args[1]
+             if len(cycle_nodes) > 3:
+                 raise DerivaMLException(f"Unexpected cycle found when normalizing dataset {cycle_nodes}")
+             # Remove cycle from graph and splice in additional ON constraint.
+             graph[cycle_nodes[1]].remove(cycle_nodes[0])
+
+     # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+     if "Dataset_Version" in element_join_tables:
+         element_join_tables.remove("Dataset_Version")
+
+     for path in paths:
+         for left, right in zip(path[0:], path[1:]):
+             if right.name == "Dataset_Version":
+                 # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+                 continue
+             if element_join_tables.index(right.name) < element_join_tables.index(left.name):
+                 continue
+             table_relationship = self._table_relationship(left, right)
+             element_join_conditions.setdefault(right.name, set()).add(
+                 (table_relationship[0], table_relationship[1])
+             )
+     element_tables[element_table] = (element_join_tables, element_join_conditions)
  # Get the list of columns that will appear in the final denormalized dataset.
  denormalized_columns = [
  (table_name, c.name)
- for table_name in join_tables
+ for table_name in include_tables
  if not self.is_association(table_name) # Don't include association columns in the denormalized view.'
  for c in self.name_to_table(table_name).columns
- if c.name not in skip_columns
+ if (not include_tables or table_name in include_tables) and (c.name not in skip_columns)
  ]
-
- # List of dataset ids to include in the denormalized view.
- dataset_rids = dataset.list_dataset_children(recurse=True)
- return join_tables, tables, denormalized_columns, dataset_rids, dataset_element_tables
+ return element_tables, denormalized_columns
 
  def _table_relationship(
  self,