deriva-ml 1.14.0__py3-none-any.whl → 1.14.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. deriva_ml/__init__.py +25 -30
  2. deriva_ml/core/__init__.py +39 -0
  3. deriva_ml/core/base.py +1489 -0
  4. deriva_ml/core/constants.py +36 -0
  5. deriva_ml/core/definitions.py +74 -0
  6. deriva_ml/core/enums.py +222 -0
  7. deriva_ml/core/ermrest.py +288 -0
  8. deriva_ml/core/exceptions.py +28 -0
  9. deriva_ml/core/filespec.py +116 -0
  10. deriva_ml/dataset/__init__.py +4 -0
  11. deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
  12. deriva_ml/{dataset.py → dataset/dataset.py} +406 -428
  13. deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
  14. deriva_ml/{history.py → dataset/history.py} +51 -33
  15. deriva_ml/{upload.py → dataset/upload.py} +48 -70
  16. deriva_ml/demo_catalog.py +233 -183
  17. deriva_ml/execution/environment.py +290 -0
  18. deriva_ml/{execution.py → execution/execution.py} +365 -252
  19. deriva_ml/execution/execution_configuration.py +163 -0
  20. deriva_ml/{execution_configuration.py → execution/workflow.py} +212 -224
  21. deriva_ml/feature.py +83 -46
  22. deriva_ml/model/__init__.py +0 -0
  23. deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
  24. deriva_ml/{database_model.py → model/database.py} +52 -74
  25. deriva_ml/model/sql_mapper.py +44 -0
  26. deriva_ml/run_notebook.py +19 -11
  27. deriva_ml/schema/__init__.py +3 -0
  28. deriva_ml/{schema_setup → schema}/annotations.py +31 -22
  29. deriva_ml/schema/check_schema.py +104 -0
  30. deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
  31. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  32. deriva_ml/schema/table_comments_utils.py +57 -0
  33. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/METADATA +5 -4
  34. deriva_ml-1.14.27.dist-info/RECORD +40 -0
  35. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/entry_points.txt +1 -0
  36. deriva_ml/deriva_definitions.py +0 -391
  37. deriva_ml/deriva_ml_base.py +0 -1046
  38. deriva_ml/execution_environment.py +0 -139
  39. deriva_ml/schema_setup/table_comments_utils.py +0 -56
  40. deriva_ml/test-files/execution-parameters.json +0 -1
  41. deriva_ml/test-files/notebook-parameters.json +0 -5
  42. deriva_ml/test_functions.py +0 -141
  43. deriva_ml/test_notebook.ipynb +0 -197
  44. deriva_ml-1.14.0.dist-info/RECORD +0 -31
  45. /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
  46. /deriva_ml/{schema_setup → schema}/policy.json +0 -0
  47. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/WHEEL +0 -0
  48. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/licenses/LICENSE +0 -0
  49. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.27.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,116 @@
1
+ """
2
+ File-related utility functions for DerivaML.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ from datetime import date
9
+ from pathlib import Path
10
+ from socket import gethostname
11
+ from typing import Callable, Generator
12
+ from urllib.parse import urlparse
13
+
14
+ import deriva.core.utils.hash_utils as hash_utils
15
+ from pydantic import BaseModel, Field, conlist, field_validator, validate_call
16
+
17
+
18
class FileSpec(BaseModel):
    """An entry in the File table.

    Attributes:
        url: The file URL.  Local paths are converted to tag URLs by the validator.
        description: The description of the file.
        md5: The MD5 hash of the file contents.
        length: The length of the file in bytes.
        file_types: A list of file types. Each file_type should be a defined term in the
            MLVocab.file_type vocabulary.
    """

    url: str = Field(alias="URL", validation_alias="url")
    md5: str = Field(alias="MD5", validation_alias="md5")
    length: int = Field(alias="Length", validation_alias="length")
    description: str | None = Field(default="", alias="Description", validation_alias="description")
    file_types: conlist(str) | None = []

    @field_validator("url")
    @classmethod
    def validate_file_url(cls, url: str) -> str:
        """Examine the provided URL. If it's a local path, convert it into a tag URL.

        Args:
            url: The URL to validate and potentially convert

        Returns:
            The validated/converted URL

        Raises:
            ValueError: If the URL is neither a tag URL, a file URL, nor a bare local path.
        """
        url_parts = urlparse(url)
        if url_parts.scheme == "tag":
            # Already a tag URL, so just return it.
            return url
        elif (not url_parts.scheme) or url_parts.scheme == "file":
            # There is no scheme part of the URL, or it is a file URL, so it is a local
            # file path.  Convert to a tag URL.
            return f"tag://{gethostname()},{date.today()}:file://{url_parts.path}"
        else:
            raise ValueError("url is not a file URL")

    @classmethod
    def create_filespecs(
        cls, path: Path | str, description: str, file_types: list[str] | Callable[[Path], list[str]] | None = None
    ) -> Generator[FileSpec, None, None]:
        """Given a file or directory, generate the sequence of corresponding FileSpecs suitable to create a File table.

        Args:
            path: Path to the file or directory.
            description: The description of the file(s)
            file_types: A list of file types or a function that takes a file path and returns a list of file types.

        Returns:
            An iterable of FileSpecs for each file in the directory.
        """

        path = Path(path)
        file_types = file_types or []
        # Normalize to a callable so each file can get its own type list.
        file_types_fn = file_types if callable(file_types) else lambda _x: file_types

        def create_spec(file_path: Path) -> FileSpec:
            # One FileSpec per concrete file, with content hashes computed up front.
            hashes = hash_utils.compute_file_hashes(file_path, hashes=frozenset(["md5", "sha256"]))
            md5 = hashes["md5"][0]
            type_list = file_types_fn(file_path)
            return FileSpec(
                # BUG FIX: size of the file being described, not of the top-level
                # `path` (which may be a directory).
                length=file_path.stat().st_size,
                md5=md5,
                description=description,
                url=file_path.as_posix(),
                # Ensure the mandatory "File" type is always present exactly once.
                file_types=type_list if "File" in type_list else ["File"] + type_list,
            )

        # A single file yields one spec; a directory yields one per file, recursively.
        files = [path] if path.is_file() else [f for f in path.rglob("*") if f.is_file()]
        return (create_spec(file) for file in files)

    @staticmethod
    def read_filespec(path: Path | str) -> Generator[FileSpec, None, None]:
        """Get FileSpecs from a JSON lines file.

        Args:
            path: Path to the .jsonl file (string or Path).

        Yields:
            A FileSpec object for each non-blank line.
        """
        path = Path(path)
        with path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Skip blank lines rather than failing on json.loads("").
                    continue
                yield FileSpec(**json.loads(line))


# Hack around pydantic validate_call and forward references: pull out the raw
# function, wrap it with validate_call, then re-make it a classmethod.
_raw = FileSpec.create_filespecs.__func__
FileSpec.create_filespecs = classmethod(validate_call(_raw))
@@ -0,0 +1,4 @@
1
+ from .aux_classes import DatasetSpec
2
+ from .dataset import Dataset
3
+
4
+ __all__ = ["Dataset", "DatasetSpec"]
@@ -2,21 +2,22 @@
2
2
This module defines the DataSet class, which is used to manipulate datasets.
3
3
  """
4
4
 
5
- from .deriva_definitions import RID
6
-
7
5
  from enum import Enum
6
+ from typing import Any, Optional, SupportsInt
7
+
8
8
  from pydantic import (
9
9
  BaseModel,
10
10
  ConfigDict,
11
- field_validator,
12
11
  Field,
13
12
  computed_field,
14
- model_validator,
13
+ conlist,
15
14
  field_serializer,
15
+ field_validator,
16
+ model_validator,
16
17
  )
17
-
18
18
  from semver import Version
19
- from typing import Optional, Any, SupportsInt
19
+
20
+ from deriva_ml.core.definitions import RID
20
21
 
21
22
 
22
23
  class VersionPart(Enum):
@@ -41,9 +42,7 @@ class DatasetVersion(Version):
41
42
  replace(major, minor, patch): Replace the major and minor versions
42
43
  """
43
44
 
44
- def __init__(
45
- self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0
46
- ):
45
+ def __init__(self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0):
47
46
  """Initialize a DatasetVersion object.
48
47
 
49
48
  Args:
@@ -139,7 +138,8 @@ class DatasetMinid(BaseModel):
139
138
@computed_field
@property
def dataset_rid(self) -> str:
    """Return the dataset RID: the version RID with any "@"-version suffix removed."""
    return self.version_rid.partition("@")[0]
143
143
 
144
144
  @computed_field
145
145
  @property
@@ -177,13 +177,13 @@ class DatasetSpec(BaseModel):
177
177
 
178
178
  Attributes:
179
179
  rid (RID): A dataset_table RID
180
- materialize (bool): If False, do not materialize datasets, only download table data, no assets. Defaults to True
180
+ materialize (bool): If False do not materialize datasets, only download table data, no assets. Defaults to True
181
181
  version (DatasetVersion): The version of the dataset. Should follow semantic versioning.
182
182
  """
183
183
 
184
184
  rid: RID
185
185
  materialize: bool = True
186
- version: DatasetVersion
186
+ version: DatasetVersion | conlist(item_type=int, min_length=3, max_length=3) | tuple[int, int, int] | str
187
187
 
188
188
  model_config = ConfigDict(arbitrary_types_allowed=True)
189
189
 
@@ -192,6 +192,10 @@ class DatasetSpec(BaseModel):
192
192
def version_field_validator(cls, v: Any) -> Any:
    """Coerce the incoming version value into a DatasetVersion where possible.

    Accepts a mapping of version parts, a semver string, or a 3-element
    list/tuple of ints; any other value is passed through unchanged.
    """
    if isinstance(v, dict):
        return DatasetVersion(**v)
    if isinstance(v, str):
        return DatasetVersion.parse(v)
    if isinstance(v, (list, tuple)) and len(v) == 3:
        major, minor, patch = (int(part) for part in v)
        return DatasetVersion(major, minor, patch)
    return v
197
201