deriva-ml 1.13.3__py3-none-any.whl → 1.14.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +408 -416
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +52 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
- deriva_ml-1.14.26.dist-info/RECORD +40 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -372
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.13.3.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from deriva.core import ErmrestCatalog, get_credential
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def update_table_comments(model, schema_name: str, table_name: str, comments_dir: str) -> None:
    """Set the comments of one table and its columns from files on disk.

    Every regular, non-hidden file in
    ``<comments_dir>/<schema_name>/<table_name>/`` is read. The part of the
    file name before the first ``.`` selects the target: when it equals
    ``table_name`` the file content becomes the table comment, otherwise it
    is taken as a column name and the content becomes that column's comment.

    Only the in-memory ``model`` object is modified by this function.

    Args:
        model: Catalog model exposing ``schemas[...].tables[...]``.
        schema_name: Name of the schema that holds the table.
        table_name: Name of the table whose comments are loaded.
        comments_dir: Root directory of the comment files.

    Raises:
        KeyError: If the schema, the table, or a column named by a file
            does not exist in ``model``.
        FileNotFoundError: If the table's comment directory does not exist.
    """
    table = model.schemas[schema_name].tables[table_name]
    table_comments_dir = Path(comments_dir) / schema_name / table_name
    for comment_file in table_comments_dir.iterdir():
        # Hidden files (e.g. macOS ".DS_Store") and sub-directories are not
        # comment files; reading them used to fail with a decode error or a
        # KeyError on the empty column name.
        if comment_file.name.startswith(".") or not comment_file.is_file():
            continue
        comment_str = comment_file.read_text(encoding="utf-8")
        target = comment_file.name.split(".")[0]
        if target == table_name:
            table.comment = comment_str
        else:
            table.columns[target].comment = comment_str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def update_schema_comments(model, schema_name: str, comments_dir: str) -> None:
    """Load comments for every table that has a directory under a schema.

    Each sub-directory of ``<comments_dir>/<schema_name>/`` is treated as a
    table name and handed to ``update_table_comments``.

    Args:
        model: Catalog model passed through to ``update_table_comments``.
        schema_name: Name of the schema to process.
        comments_dir: Root directory of the comment files.
    """
    schema_comments_dir = Path(comments_dir) / schema_name
    if not schema_comments_dir.is_dir():
        # The caller walks every schema of the catalog; a schema without a
        # comment directory simply has nothing to load.
        return
    for entry in schema_comments_dir.iterdir():
        # Only directories name tables. Plain files at this level (such as
        # ".DS_Store") would otherwise be looked up as tables and fail.
        if entry.is_dir() and not entry.name.endswith(".DS_Store"):
            update_table_comments(model, schema_name, entry.name, comments_dir)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def main():
    """Command-line entry point of the table comments utility.

    Takes three positional arguments (``host``, ``catalog_id`` and
    ``comments_dir``), fetches the catalog model from the host over HTTPS
    and runs ``update_schema_comments`` for every schema of the model
    except ``public`` and ``deriva-ml``.

    Returns:
        None.
    """
    cli = argparse.ArgumentParser(description="Update table comments from files")
    for arg_name, arg_help in (
        ("host", "Hostname"),
        ("catalog_id", "Catalog ID"),
        ("comments_dir", "Directory containing comment files"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    credentials = get_credential(options.host)
    catalog = ErmrestCatalog("https", options.host, options.catalog_id, credentials=credentials)
    model = catalog.getCatalogModel()

    # Update comments for all schemas apart from the excluded ones.
    excluded_schemas = ("public", "deriva-ml")
    for schema_name in model.schemas:
        if schema_name in excluded_schemas:
            continue
        update_schema_comments(model, schema_name, options.comments_dir)
    # NOTE(review): nothing visible here sends the modified model back to the
    # catalog (e.g. a model.apply() call) -- confirm the comments are persisted.
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Run the CLI when the module is executed as a script; the return value of
# main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Expected layout: <comments_dir>/<schema-name>/<table-name>/[<table-name>|<column-name>].md
|
|
@@ -1,21 +1,22 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deriva-ml
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.14.26
|
|
4
4
|
Summary: Utilities to simplify use of Deriva and Pandas to create reproducible ML pipelines
|
|
5
5
|
Author-email: ISRD <isrd-dev@isi.edu>
|
|
6
6
|
Requires-Python: >=3.10
|
|
7
7
|
Description-Content-Type: text/markdown
|
|
8
8
|
License-File: LICENSE
|
|
9
|
-
Requires-Dist:
|
|
10
|
-
Requires-Dist:
|
|
9
|
+
Requires-Dist: deriva~=1.7.10
|
|
10
|
+
Requires-Dist: deepdiff
|
|
11
11
|
Requires-Dist: pandas
|
|
12
12
|
Requires-Dist: regex~=2024.7.24
|
|
13
|
-
Requires-Dist: pydantic>=2.
|
|
13
|
+
Requires-Dist: pydantic>=2.11
|
|
14
14
|
Requires-Dist: semver>3.0.0
|
|
15
15
|
Requires-Dist: setuptools>=64
|
|
16
16
|
Requires-Dist: setuptools-scm>=8.0
|
|
17
17
|
Requires-Dist: nbstripout
|
|
18
18
|
Requires-Dist: papermill
|
|
19
|
+
Requires-Dist: pandas-stubs==2.2.3.250527
|
|
19
20
|
Dynamic: license-file
|
|
20
21
|
|
|
21
22
|
# DerivaML
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
deriva_ml/__init__.py,sha256=_aMdxGG4mRTcXodLZLNpXqH8v5uqMbqFUryE9KqNSB8,1158
|
|
2
|
+
deriva_ml/demo_catalog.py,sha256=JjPAIac_hKPh5krEhGJydjXquRnivi7kQoR8W4Khp-s,14928
|
|
3
|
+
deriva_ml/feature.py,sha256=L1XUXLWGnUGCjkxX5KsGu0I8SaUTJG7eDs__yUCWuCY,8445
|
|
4
|
+
deriva_ml/run_notebook.py,sha256=SNjqRMDjy7zYnf6TwC7w0Y7nvqsgoYzW2NMxWD9BzUc,6479
|
|
5
|
+
deriva_ml/core/__init__.py,sha256=V_i90pc5PB1F4UdOO6DZWzpEFaZDTaPRU-EzKXQ19eI,787
|
|
6
|
+
deriva_ml/core/base.py,sha256=Rsa6fExlEJwQkU1PHf5ZK051ZB9oBTHrHx-_Omij__Y,60823
|
|
7
|
+
deriva_ml/core/constants.py,sha256=6wBJ8qMxe-dbCjRGrjUIX-RK0mTWrLDTeUpaVbLFoM8,888
|
|
8
|
+
deriva_ml/core/definitions.py,sha256=uq_8uYFBVBVHS691Ri2kdQsN37z0GNYTaZskJIb_ocM,1385
|
|
9
|
+
deriva_ml/core/enums.py,sha256=sSN4B4OynbB-AXwxRszoFr-KWIWIAfhVa06EzAEHwVc,7194
|
|
10
|
+
deriva_ml/core/ermrest.py,sha256=N0IJ3TE87jElaBChEIo5AFDTr0SIrb6F90yiimRfPr4,10182
|
|
11
|
+
deriva_ml/core/exceptions.py,sha256=4MZNPOyN-UMaGeY9sqJDVwh_iOmz1ntp4usSyCNqVMg,934
|
|
12
|
+
deriva_ml/core/filespec.py,sha256=BQAAcRXfXq1lDcsKlokLOOXCBtEZpPgXxrFOIZYAgLg,4229
|
|
13
|
+
deriva_ml/dataset/__init__.py,sha256=ukl2laJqa9J2AVqb4zlpIYc-3RaAlfRR33NMIQaoNrQ,104
|
|
14
|
+
deriva_ml/dataset/aux_classes.py,sha256=9mZAln7_rrzaRbKhKA6dJOp3xeD6dHOC9NXOtJKROo4,6933
|
|
15
|
+
deriva_ml/dataset/dataset.py,sha256=FnQG98eEr7mnUmo1ySgVRxRoyhbP5u_ZxuexdAcYxJY,64305
|
|
16
|
+
deriva_ml/dataset/dataset_bag.py,sha256=mPIZRX5aTbVRcJbCFtdkmlnexquF8NE-onbVK_8IxVk,14224
|
|
17
|
+
deriva_ml/dataset/history.py,sha256=FK5AYYz11p4E4FWMVg4r7UPWOD4eobrq3b3xMjWF59g,3197
|
|
18
|
+
deriva_ml/dataset/upload.py,sha256=Ad5JDfGvkIvefE-plP8SN9pNAxHzYrBoid5isz_bnNs,16411
|
|
19
|
+
deriva_ml/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
|
+
deriva_ml/execution/environment.py,sha256=B7nywqxFTRUWgyu8n7rFoKcVC9on422kjeFG2FPQfvg,9302
|
|
21
|
+
deriva_ml/execution/execution.py,sha256=tXWkFLDoSre836x6MMkcmhtmr3zP5_VoSioQ72-XmvE,44298
|
|
22
|
+
deriva_ml/execution/execution_configuration.py,sha256=Rw4VWkBCZN9yatvSKdTqEWTfu470lpcVKfHFR0uN0jI,6248
|
|
23
|
+
deriva_ml/execution/workflow.py,sha256=fd8dwCyDSpW_BOD_d0-zFs9yTNrbddwqD-xG3rKdvyU,13215
|
|
24
|
+
deriva_ml/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
+
deriva_ml/model/catalog.py,sha256=dzTBcRlqgEVkPY32AUax_iu75RgFiT4Pu5au7rmrv8k,14068
|
|
26
|
+
deriva_ml/model/database.py,sha256=cTe9rJHMCDo89_dmAT5aJGhzzLiK2DkXvCvZE6fczcM,14805
|
|
27
|
+
deriva_ml/model/sql_mapper.py,sha256=_0QsJEVSgSPtxrWKSgjfPZCQ1aMVcjR_Tk2OxLhWEvY,1696
|
|
28
|
+
deriva_ml/schema/__init__.py,sha256=yV-MfzCF3FA4OOz7mZwMM2q6-x1vgOJ057kUvikFF6E,130
|
|
29
|
+
deriva_ml/schema/annotations.py,sha256=TuQ3vWFnK0160fRmtvsCkHx9qAcRa63MSyERB4x5a98,18197
|
|
30
|
+
deriva_ml/schema/check_schema.py,sha256=6dadLYHPqRex6AYVClmsESI8WhC7-rb-XnGf2G298xw,3609
|
|
31
|
+
deriva_ml/schema/create_schema.py,sha256=0ydJSZEg3C3-m8hWPN6k2MoUvm-RWxAlKFzVChxcx3I,12791
|
|
32
|
+
deriva_ml/schema/deriva-ml-reference.json,sha256=AEOMIgwKO3dNMMWHb0lxaXyamvfAEbUPh8qw0aAtsUQ,242460
|
|
33
|
+
deriva_ml/schema/policy.json,sha256=5ykB8nnZFl-oCHzlAwppCFKJHWJFIkYognUMVEanfY8,1826
|
|
34
|
+
deriva_ml/schema/table_comments_utils.py,sha256=4flCqnZAaqg_uSZ9I18pNUWAZoLfmMCXbmI5uERY5vM,2007
|
|
35
|
+
deriva_ml-1.14.26.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
36
|
+
deriva_ml-1.14.26.dist-info/METADATA,sha256=nhgVSz2mmn24u8xsLlAQRu6wYa37-z4DJZNy3Fi8odM,1034
|
|
37
|
+
deriva_ml-1.14.26.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
38
|
+
deriva_ml-1.14.26.dist-info/entry_points.txt,sha256=dkf_z7E4V6_3_5Xjsm0hcixNg6ASHDw6NfYQuBvF1Wc,363
|
|
39
|
+
deriva_ml-1.14.26.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
|
|
40
|
+
deriva_ml-1.14.26.dist-info/RECORD,,
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
[console_scripts]
|
|
2
2
|
deriva-ml-alter-annotation = deriva_ml.schema_setup.alter_annotation:main
|
|
3
|
+
deriva-ml-check-catalog-schema = deriva_ml.schema.check_schema:main
|
|
3
4
|
deriva-ml-create-schema = deriva_ml.schema_setup.create_schema:main
|
|
4
5
|
deriva-ml-run-notebook = deriva_ml.run_notebook:main
|
|
5
6
|
deriva-ml-table-comments-utils = deriva_ml.schema_setup.table_comments_utils:main
|
deriva_ml/deriva_definitions.py
DELETED
|
@@ -1,372 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Shared definitions that are used in different DerivaML modules.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
from __future__ import annotations
|
|
6
|
-
|
|
7
|
-
import warnings
|
|
8
|
-
from datetime import date
|
|
9
|
-
from enum import Enum
|
|
10
|
-
from pathlib import Path
|
|
11
|
-
from typing import Any, Iterable, Optional, Annotated, Generator
|
|
12
|
-
|
|
13
|
-
import deriva.core.ermrest_model as em
|
|
14
|
-
import deriva.core.utils.hash_utils as hash_utils
|
|
15
|
-
from urllib.parse import urlparse
|
|
16
|
-
from deriva.core.ermrest_model import builtin_types
|
|
17
|
-
from pydantic import (
|
|
18
|
-
BaseModel,
|
|
19
|
-
model_serializer,
|
|
20
|
-
Field,
|
|
21
|
-
computed_field,
|
|
22
|
-
field_validator,
|
|
23
|
-
ValidationError,
|
|
24
|
-
)
|
|
25
|
-
from socket import gethostname
|
|
26
|
-
|
|
27
|
-
# Module-wide string constants.
ML_SCHEMA = "deriva-ml"
DRY_RUN_RID = "0000"

# We are going to use schema as a field name and this collides with method in pydantic base class.
# Both warnings below are raised by pydantic and are silenced for this reason.
for _ignored_message in (
    'Field name "schema"',
    "fields may not start with an underscore",
):
    warnings.filterwarnings(
        "ignore",
        message=_ignored_message,
        category=Warning,
        module="pydantic",
    )

# A RID is one block of 1-4 characters from A-Z/digits, optionally followed by
# "-XXXX" blocks of exactly four such characters. An optional "@<snapshot>"
# suffix uses the same shape.
rid_part = r"(?P<rid>(?:[A-Z\d]{1,4}|[A-Z\d]{1,4}(?:-[A-Z\d]{4})+))"
snapshot_part = r"(?:@(?P<snapshot>(?:[A-Z\d]{1,4}|[A-Z\d]{1,4}(?:-[A-Z\d]{4})+)))?"
rid_regex = f"^{rid_part}{snapshot_part}$"

# String type whose values must match rid_regex.
RID = Annotated[str, Field(pattern=rid_regex)]

DerivaSystemColumns = ["RID", "RCT", "RMT", "RCB", "RMB"]
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
# For some reason, deriva-py doesn't use the proper enum class!!
|
|
51
|
-
class UploadState(Enum):
    """State of file upload.

    Integer-valued enumeration; the members are numbered 0 through 7 in the
    order they are declared.
    """

    success = 0
    failed = 1
    pending = 2
    running = 3
    paused = 4
    aborted = 5
    cancelled = 6
    timeout = 7
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
class StrEnum(str, Enum):
    """Base class for enumerations whose members are also ``str`` instances.

    Members compare equal to their string value and can be used wherever a
    plain string is accepted.
    """
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
class FileUploadState(BaseModel):
    """Outcome of the upload of a single file.

    Attributes:
        state: Upload state as an ``UploadState`` member.
        status: Status text of the upload.
        result: Result object of the upload; when non-empty it is indexed
            with ``"RID"`` by the ``rid`` property.
    """

    state: UploadState
    status: str
    result: Any

    @computed_field
    @property
    def rid(self) -> Optional[RID]:
        """Return ``result["RID"]``, or ``None`` when ``result`` is empty or ``None``."""
        # The former `self.result and self.result["RID"]` handed back the
        # falsy result itself (e.g. {} or ""), contradicting Optional[RID].
        return self.result["RID"] if self.result else None
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
class Status(StrEnum):
    """Enumeration class defining execution status.

    Attributes:
        initializing: String value "Initializing".
        created: String value "Created".
        pending: Execution is pending.
        running: Execution is currently running.
        aborted: String value "Aborted".
        completed: Execution has been completed successfully.
        failed: Execution has failed.

    """

    initializing = "Initializing"
    created = "Created"
    pending = "Pending"
    running = "Running"
    aborted = "Aborted"
    completed = "Completed"
    failed = "Failed"
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
class BuiltinTypes(Enum):
    """Enumeration of the type names offered by deriva's ``builtin_types``.

    Each member carries the ``typename`` of the identically named entry of
    ``builtin_types``, so a member's value can be used to index
    ``builtin_types`` again.
    """

    text = builtin_types.text.typename
    int2 = builtin_types.int2.typename
    # Was bound to builtin_types.json.typename, which turned `json` into a
    # mere alias of `jsonb` and made the value "jsonb" impossible to look up.
    jsonb = builtin_types.jsonb.typename
    float8 = builtin_types.float8.typename
    timestamp = builtin_types.timestamp.typename
    int8 = builtin_types.int8.typename
    boolean = builtin_types.boolean.typename
    json = builtin_types.json.typename
    float4 = builtin_types.float4.typename
    int4 = builtin_types.int4.typename
    timestamptz = builtin_types.timestamptz.typename
    date = builtin_types.date.typename
    ermrest_rid = builtin_types.ermrest_rid.typename
    ermrest_rcb = builtin_types.ermrest_rcb.typename
    ermrest_rmb = builtin_types.ermrest_rmb.typename
    ermrest_rct = builtin_types.ermrest_rct.typename
    ermrest_rmt = builtin_types.ermrest_rmt.typename
    markdown = builtin_types.markdown.typename
    longtext = builtin_types.longtext.typename
    ermrest_curie = builtin_types.ermrest_curie.typename
    ermrest_uri = builtin_types.ermrest_uri.typename
    color_rgb_hex = builtin_types.color_rgb_hex.typename
    serial2 = builtin_types.serial2.typename
    serial4 = builtin_types.serial4.typename
    serial8 = builtin_types.serial8.typename
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
class FileSpec(BaseModel):
    """An entry into the File table.

    Attributes:
        url: Location of the file. Local paths and ``file:`` URLs are
            rewritten into ``tag:`` URLs during validation.
        description: The description of the file.
        md5: MD5 checksum of the file.
        length: Size of the file in bytes.
    """

    url: str
    description: Optional[str] = ""
    md5: str
    length: int

    @field_validator("url")
    @classmethod
    def validate_file_url(cls, v):
        """Examine the provided URL. If it's a local path, convert it into a tag URL.

        Args:
            v: The URL or local path supplied for ``url``.

        Returns:
            ``v`` unchanged when it already is a tag URL, otherwise a tag URL
            built from the host name, today's date and the path part of ``v``.

        Raises:
            ValueError: If the URL has a scheme other than ``tag`` or ``file``.
        """
        url_parts = urlparse(v)
        if url_parts.scheme == "tag":
            # Already a tag URL, so just return it.
            return v
        if (not url_parts.scheme) or url_parts.scheme == "file":
            # There is no scheme part of the URL, or it is a file URL, so it
            # is a local file path: convert it to a tag URL.
            return f"tag://{gethostname()},{date.today()}:file://{url_parts.path}"
        # A validator signals failure with ValueError; pydantic's own
        # ValidationError cannot be constructed from a bare message.
        raise ValueError("url is not a file URL")

    @model_serializer()
    def serialize_filespec(self):
        """Serialize the spec as a dict keyed by ``URL``, ``Description``, ``MD5`` and ``Length``."""
        return {
            "URL": self.url,
            "Description": self.description,
            "MD5": self.md5,
            "Length": self.length,
        }

    @staticmethod
    def create_filespecs(
        path: Path | str, description: str
    ) -> Generator["FileSpec", None, None]:
        """Given a file or directory, generate the sequence of corresponding FileSpecs suitable to create a File table.

        Arguments:
            path: Path to the file or directory.
            description: The description of the file(s)

        Returns:
            An iterable of FileSpecs: one for ``path`` itself when it is not a
            directory, otherwise one for each file found recursively below it.
        """
        path = Path(path)

        def list_all_files(p: Path) -> Iterable[Path]:
            # A directory is walked recursively; anything else is a single file.
            return (f for f in p.rglob("*") if f.is_file()) if p.is_dir() else [p]

        def create_spec(p: Path, description: str) -> FileSpec:
            # NOTE(review): only the md5 entry of the computed hashes is used.
            hashes = hash_utils.compute_file_hashes(p, hashes=["md5", "sha256"])
            md5 = hashes["md5"][0]
            return FileSpec(
                # Size of this file; the outer `path` may be the directory
                # being walked, whose stat size is unrelated to the file.
                length=p.stat().st_size,
                md5=md5,
                description=description,
                url=p.as_posix(),
            )

        return (create_spec(file, description) for file in list_all_files(path))
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
class VocabularyTerm(BaseModel):
    """An entry in a vocabulary table.

    Every field is populated from the capitalized key given as its alias
    (``Name``, ``Synonyms``, ``ID``, ``URI``, ``Description``, ``RID``);
    any other keys in the input are ignored.

    Attributes:
        name: Name of vocabulary term
        synonyms: List of alternative names for the term
        id: CURIE identifier for the term
        uri: Unique URI for the term.
        description: A description of the term meaning
        rid: Resource identifier assigned to the term
    """

    # pydantic v2 configuration; replaces the deprecated nested `class Config`.
    model_config = {"extra": "ignore"}

    name: str = Field(alias="Name")
    synonyms: Optional[list[str]] = Field(alias="Synonyms")
    id: str = Field(alias="ID")
    uri: str = Field(alias="URI")
    description: str = Field(alias="Description")
    rid: str = Field(alias="RID")
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
class MLVocab(StrEnum):
    """Names of controlled vocabulary for various types within DerivaML.

    Each member's value is the vocabulary name as a string.
    """

    dataset_type = "Dataset_Type"
    workflow_type = "Workflow_Type"
    file_type = "File_Type"
    asset_type = "Asset_Type"
    asset_role = "Asset_Role"
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
class MLAsset(StrEnum):
    """String enumeration with the two asset names used by this module."""

    execution_metadata = "Execution_Metadata"
    execution_asset = "Execution_Asset"
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
class ExecMetadataType(StrEnum):
    """
    Predefined execution metadata types.

    Each member's value is the type name as a string.
    """

    execution_config = "Execution_Config"
    runtime_env = "Runtime_Env"
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
class ExecAssetType(StrEnum):
    """
    Predefined execution asset types.

    Each member's value is the type name as a string.
    """

    input_file = "Input_File"
    output_file = "Output_File"
    notebook_output = "Notebook_Output"
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
class ColumnDefinition(BaseModel):
    """Pydantic model for deriva_py Column.define.

    Dumping the model yields the result of ``em.Column.define`` called with
    the stored values.

    Attributes:
        name: Column name.
        type: Column type as a ``BuiltinTypes`` member; a dict with a
            ``typename`` key is accepted as input as well.
        nullok: Passed to ``Column.define`` as ``nullok``.
        default: Passed to ``Column.define`` as ``default``.
        comment: Passed to ``Column.define`` as ``comment``.
        acls: Passed to ``Column.define`` as ``acls``.
        acl_bindings: Passed to ``Column.define`` as ``acl_bindings``.
        annotations: Passed to ``Column.define`` as ``annotations``.
    """

    name: str
    type: BuiltinTypes
    nullok: bool = True
    default: Any = None
    comment: Optional[str] = None
    acls: dict = Field(default_factory=dict)
    acl_bindings: dict = Field(default_factory=dict)
    annotations: dict = Field(default_factory=dict)

    @field_validator("type", mode="before")
    @classmethod
    def extract_type_name(cls, value: Any) -> Any:
        """Turn a dict carrying ``typename`` into a ``BuiltinTypes`` member; pass anything else through."""
        return BuiltinTypes(value["typename"]) if isinstance(value, dict) else value

    @model_serializer()
    def serialize_column_definition(self):
        """Return the ``em.Column.define`` document for this column."""
        # The enum value is the typename, which indexes builtin_types.
        column_type = builtin_types[self.type.value]
        return em.Column.define(
            self.name,
            column_type,
            nullok=self.nullok,
            default=self.default,
            comment=self.comment,
            acls=self.acls,
            acl_bindings=self.acl_bindings,
            annotations=self.annotations,
        )
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
class KeyDefinition(BaseModel):
    """Pydantic model for deriva_py Key.define.

    Dumping the model yields the result of ``em.Key.define`` called with the
    stored values.

    Attributes:
        colnames: Passed to ``Key.define`` as ``colnames``.
        constraint_names: Passed to ``Key.define`` as ``constraint_names``.
        comment: Passed to ``Key.define`` as ``comment``.
        annotations: Passed to ``Key.define`` as ``annotations``.
    """

    colnames: Iterable[str]
    constraint_names: Iterable[str]
    comment: Optional[str] = None
    annotations: dict = Field(default_factory=dict)

    @model_serializer()
    def serialize_key_definition(self):
        """Return the ``em.Key.define`` document for this key."""
        define_kwargs = {
            "colnames": self.colnames,
            "constraint_names": self.constraint_names,
            "comment": self.comment,
            "annotations": self.annotations,
        }
        return em.Key.define(**define_kwargs)
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
class ForeignKeyDefinition(BaseModel):
    """Pydantic model for deriva_py ForeignKey.define.

    Dumping the model yields the result of ``em.ForeignKey.define`` called
    with the stored values.

    Attributes:
        colnames: Passed to ``ForeignKey.define`` as ``fk_colnames``.
        pk_sname: Passed to ``ForeignKey.define`` as ``pk_sname``.
        pk_tname: Passed to ``ForeignKey.define`` as ``pk_tname``.
        pk_colnames: Passed to ``ForeignKey.define`` as ``pk_colnames``.
        constraint_names: Constraint names; defaults to an empty list.
        on_update: Passed as ``on_update``; defaults to "NO ACTION".
        on_delete: Passed as ``on_delete``; defaults to "NO ACTION".
        comment: Optional comment; ``None`` when not given.
        acls: Passed to ``ForeignKey.define`` as ``acls``.
        acl_bindings: Passed to ``ForeignKey.define`` as ``acl_bindings``.
        annotations: Passed to ``ForeignKey.define`` as ``annotations``.
    """

    colnames: Iterable[str]
    pk_sname: str
    pk_tname: str
    pk_colnames: Iterable[str]
    constraint_names: Iterable[str] = Field(default_factory=list)
    on_update: str = "NO ACTION"
    on_delete: str = "NO ACTION"
    # Declared Optional so that an explicit comment=None validates, exactly
    # like the (unvalidated) default does.
    comment: Optional[str] = None
    acls: dict[str, Any] = Field(default_factory=dict)
    acl_bindings: dict[str, Any] = Field(default_factory=dict)
    annotations: dict[str, Any] = Field(default_factory=dict)

    @model_serializer()
    def serialize_fk_definition(self):
        """Return the ``em.ForeignKey.define`` document for this foreign key."""
        # NOTE(review): constraint_names is stored but not forwarded here --
        # confirm whether that is intended.
        return em.ForeignKey.define(
            fk_colnames=self.colnames,
            pk_sname=self.pk_sname,
            pk_tname=self.pk_tname,
            pk_colnames=self.pk_colnames,
            on_update=self.on_update,
            on_delete=self.on_delete,
            comment=self.comment,
            acls=self.acls,
            acl_bindings=self.acl_bindings,
            annotations=self.annotations,
        )
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
class TableDefinition(BaseModel):
    """Pydantic model for deriva_py Table.define.

    Dumping the model yields the result of ``em.Table.define`` called with
    the stored values; column, key and foreign key definitions are dumped
    individually first.

    Attributes:
        name: Passed to ``Table.define`` as ``tname``.
        column_defs: Column definitions of the table.
        key_defs: Key definitions; defaults to an empty list.
        fkey_defs: Foreign key definitions; defaults to an empty list.
        comment: Optional comment; ``None`` when not given.
        acls: Passed to ``Table.define`` as ``acls``.
        acl_bindings: Passed to ``Table.define`` as ``acl_bindings``.
        annotations: Passed to ``Table.define`` as ``annotations``.
    """

    name: str
    column_defs: Iterable[ColumnDefinition]
    key_defs: Iterable[KeyDefinition] = Field(default_factory=list)
    fkey_defs: Iterable[ForeignKeyDefinition] = Field(default_factory=list)
    # Declared Optional so that an explicit comment=None validates, exactly
    # like the (unvalidated) default does.
    comment: Optional[str] = None
    acls: dict = Field(default_factory=dict)
    acl_bindings: dict = Field(default_factory=dict)
    annotations: dict = Field(default_factory=dict)

    @model_serializer()
    def serialize_table_definition(self):
        """Return the ``em.Table.define`` document for this table."""
        return em.Table.define(
            tname=self.name,
            column_defs=[c.model_dump() for c in self.column_defs],
            key_defs=[k.model_dump() for k in self.key_defs],
            fkey_defs=[fk.model_dump() for fk in self.fkey_defs],
            comment=self.comment,
            acls=self.acls,
            acl_bindings=self.acl_bindings,
            annotations=self.annotations,
        )
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
class DerivaMLException(Exception):
    """Exception class specific to DerivaML module.

    The message is forwarded to ``Exception`` and additionally kept on the
    instance as ``_msg``.

    Args:
        msg (str): Optional message for the exception.
    """

    def __init__(self, msg=""):
        self._msg = msg
        super().__init__(msg)
|