deriva-ml 1.17.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/.DS_Store +0 -0
- deriva_ml/__init__.py +79 -0
- deriva_ml/bump_version.py +142 -0
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1527 -0
- deriva_ml/core/config.py +69 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +12 -0
- deriva_ml/dataset/aux_classes.py +225 -0
- deriva_ml/dataset/dataset.py +1519 -0
- deriva_ml/dataset/dataset_bag.py +450 -0
- deriva_ml/dataset/history.py +109 -0
- deriva_ml/dataset/upload.py +439 -0
- deriva_ml/demo_catalog.py +495 -0
- deriva_ml/execution/__init__.py +26 -0
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/execution/execution.py +1180 -0
- deriva_ml/execution/execution_configuration.py +147 -0
- deriva_ml/execution/workflow.py +413 -0
- deriva_ml/feature.py +228 -0
- deriva_ml/install_kernel.py +71 -0
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/model/catalog.py +485 -0
- deriva_ml/model/database.py +719 -0
- deriva_ml/protocols/dataset.py +19 -0
- deriva_ml/run_notebook.py +228 -0
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/schema/annotations.py +473 -0
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/schema/create_schema.py +393 -0
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/policy.json +81 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- deriva_ml/test.py +94 -0
- deriva_ml-1.17.10.dist-info/METADATA +38 -0
- deriva_ml-1.17.10.dist-info/RECORD +45 -0
- deriva_ml-1.17.10.dist-info/WHEEL +5 -0
- deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
- deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
- deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
deriva_ml/dataset/upload.py
@@ -0,0 +1,439 @@
"""
This module provides functions that help structure local directories for uploading to a DerivaML catalog, and
generating an upload specification for those directories.

Here is the directory layout we support:

deriva-ml/
    execution
        <execution_rid>
            execution-asset
                <asset_type>
                    file1, file2, ....            <- Need to update execution_asset association table.
            execution-metadata
                <metadata_type>
            feature
                <schema>
                    <target_table>
                        <feature_name>
                            asset
                                <asset_table>
                                    file1, file2, ...
                            <feature_name>.jsonl  <- needs to have asset_name column remapped before uploading
            table
                <schema>
                    <record_table>
                        record_table.csv
            asset
                <schema>
                    <asset_table>
                        <metadata1>
                            <metadata2>
                                file1, file2, ....
            asset-type
                <schema>
                    file1.jsonl, file2.jsonl
"""

import json
import os
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Optional

import regex as re
from deriva.core import urlquote
from deriva.core.ermrest_model import Table
from deriva.core.hatrac_store import HatracStore
from deriva.core.utils import hash_utils, mime_utils
from deriva.transfer.upload.deriva_upload import GenericUploader
from pydantic import ConfigDict, validate_call

from deriva_ml.core.definitions import (
    RID,
    DerivaSystemColumns,
    FileUploadState,
    UploadState,
)
from deriva_ml.core.exceptions import DerivaMLException
from deriva_ml.model.catalog import DerivaModel

try:
    from icecream import ic
except ImportError:  # Graceful fallback if IceCream isn't installed.
    ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa

# Use os.path.sep for OS-agnostic paths in regex patterns
SEP = re.escape(os.path.sep)
upload_root_regex = f"(?i)^.*{SEP}deriva-ml"

exec_dir_regex = upload_root_regex + f"{SEP}execution{SEP}(?P<execution_rid>[-\\w]+)"

feature_dir_regex = exec_dir_regex + f"{SEP}feature"
feature_table_dir_regex = (
    feature_dir_regex + f"{SEP}(?P<schema>[-\\w]+){SEP}(?P<target_table>[-\\w]+){SEP}(?P<feature_name>[-\\w]+)"
)
feature_value_regex = feature_table_dir_regex + f"{SEP}(?P=feature_name)[.](?P<ext>[(csv|json)]*)$"
feature_asset_dir_regex = feature_table_dir_regex + f"{SEP}asset{SEP}(?P<asset_table>[-\\w]+)"
feature_asset_regex = feature_asset_dir_regex + f"{SEP}(?P<file>[A-Za-z0-9_-]+)[.](?P<ext>[a-z0-9]*)$"

asset_path_regex = exec_dir_regex + rf"{SEP}asset{SEP}(?P<schema>[-\w]+){SEP}(?P<asset_table>[-\w]*)"

asset_file_regex = r"(?P<file>[-\w]+)[.](?P<ext>[a-z0-9]*)$"

table_regex = exec_dir_regex + rf"{SEP}table{SEP}(?P<schema>[-\w]+){SEP}(?P<table>[-\w]+){SEP}(?P=table)[.](csv|json)$"
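

# Editor's example (illustrative sketch, not part of the released module): the patterns
# above bind named groups from staged file paths.  The path below is hypothetical and
# assumes a POSIX path separator (SEP == "/").
def _example_table_regex_match() -> Optional[re.Match]:
    staged = "/tmp/deriva-ml/execution/1-ABC4/table/demo/Subject/Subject.csv"
    # Expected bindings: execution_rid="1-ABC4", schema="demo", table="Subject".
    return re.match(table_regex, staged)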


def is_feature_dir(path: Path) -> Optional[re.Match]:
    """Path matches the pattern for where the table for a feature would go."""
    return re.match(feature_table_dir_regex + "$", path.as_posix())


def normalize_asset_dir(path: str) -> Optional[tuple[str, str]]:
    """Parse a path to an asset file and return the asset table name and file name.

    Args:
        path: Path to the asset file

    Returns:
        Tuple of (schema/table, filename) or None if path doesn't match pattern
    """
    path = Path(path)
    if not (m := re.match(asset_path_regex, str(path))):
        return None
    return f"{m['schema']}/{m['asset_table']}", path.name
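

# Editor's example (illustrative sketch, not part of the released module): given a staged
# asset path (hypothetical, POSIX separators), normalize_asset_dir() yields the
# schema-qualified table name and the file name.
def _example_normalize_asset_dir() -> Optional[tuple[str, str]]:
    staged = "/tmp/deriva-ml/execution/1-ABC4/asset/demo/Image/img01.png"
    # Expected result: ("demo/Image", "img01.png"); None if the path does not match.
    return normalize_asset_dir(staged)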


def upload_root(prefix: Path | str) -> Path:
    """Return the top level directory of where to put files to be uploaded."""
    path = Path(prefix) / "deriva-ml"
    path.mkdir(exist_ok=True, parents=True)
    return path


def execution_rids(prefix: Path | str) -> list[RID]:
    """Return a list of all the execution RIDs that have files waiting to be uploaded."""
    path = upload_root(prefix) / "execution"
    return [d.name for d in path.iterdir()]


def execution_root(prefix: Path | str, exec_rid) -> Path:
    """Path to directory to place execution specific upload files."""
    path = upload_root(prefix) / "execution" / exec_rid
    path.mkdir(exist_ok=True, parents=True)
    return path


def feature_root(prefix: Path | str, exec_rid: str) -> Path:
    """Return the path to the directory in which features for the specified execution should be placed."""
    path = execution_root(prefix, exec_rid) / "feature"
    path.mkdir(parents=True, exist_ok=True)
    return path


def asset_root(prefix: Path | str, exec_rid: str) -> Path:
    """Return the path to the directory in which assets for the specified execution should be placed."""
    path = execution_root(prefix, exec_rid) / "asset"
    path.mkdir(parents=True, exist_ok=True)
    return path


def feature_dir(prefix: Path | str, exec_rid: str, schema: str, target_table: str, feature_name: str) -> Path:
    """Return the path to the directory in which a named feature for an execution should be placed."""
    path = feature_root(prefix, exec_rid) / schema / target_table / feature_name
    path.mkdir(parents=True, exist_ok=True)
    return path


def feature_value_path(prefix: Path | str, exec_rid: str, schema: str, target_table: str, feature_name: str) -> Path:
    """Return the path to a JSON-lines file in which to place feature values that are to be uploaded.

    Args:
        prefix: Location of upload root directory
        exec_rid: RID of the execution to be associated with this feature.
        schema: Domain schema name
        target_table: Target table name for the feature.
        feature_name: Name of the feature.

    Returns:
        Path to the JSON-lines file in which to place feature values
    """
    return feature_dir(prefix, exec_rid, schema, target_table, feature_name) / f"{feature_name}.jsonl"
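

# Editor's example (illustrative sketch, not part of the released module): stage feature
# values for later upload.  The schema, table, feature, and column names are hypothetical.
def _example_stage_feature_values(prefix: Path, exec_rid: RID) -> Path:
    values_path = feature_value_path(prefix, exec_rid, "demo", "Subject", "Quality")
    rows = [
        {"Subject": "1-2345", "Execution": exec_rid, "Quality": "Good"},
        {"Subject": "1-2346", "Execution": exec_rid, "Quality": "Poor"},
    ]
    with values_path.open("w") as f:
        for row in rows:
            f.write(json.dumps(row) + "\n")  # one JSON object per line (.jsonl)
    return values_path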


def table_path(prefix: Path | str, schema: str, table: str) -> Path:
    """Return the path to a CSV file in which to place table values that are to be uploaded.

    Args:
        prefix: Location of upload root directory
        schema: Domain schema
        table: Name of the table to be uploaded.

    Returns:
        Path to the file in which to place table values that are to be uploaded.
    """
    path = upload_root(prefix) / "table" / schema / table
    path.mkdir(parents=True, exist_ok=True)
    return path / f"{table}.csv"
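

# Editor's example (illustrative sketch, not part of the released module): write rows of a
# domain table to the staging area as CSV.  Schema, table, and column names are hypothetical.
def _example_stage_table_rows(prefix: Path) -> Path:
    import csv

    csv_path = table_path(prefix, schema="demo", table="Subject")
    rows = [{"Name": "subject-1", "Age": 42}, {"Name": "subject-2", "Age": 37}]
    with csv_path.open("w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["Name", "Age"])
        writer.writeheader()
        writer.writerows(rows)
    return csv_path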


def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
    """Generate upload specification for an asset table.

    Args:
        model: The DerivaModel instance.
        asset_table: The asset table name or Table object.

    Returns:
        A dictionary containing the upload specification for the asset table.
    """
    metadata_columns = model.asset_metadata(asset_table)
    asset_table = model.name_to_table(asset_table)
    schema = model.name_to_table(asset_table).schema.name

    # Be careful here, as a metadata value might be a string which can contain special characters.
    metadata_path = "/".join([rf"(?P<{c}>[-:._ \w]+)" for c in metadata_columns])
    asset_path = f"{exec_dir_regex}/asset/{schema}/{asset_table.name}/{metadata_path}/{asset_file_regex}"
    asset_table = model.name_to_table(asset_table)
    schema = model.name_to_table(asset_table).schema.name

    # Create upload specification
    spec = {
        # Upload assets into an asset table that has additional metadata columns.
        "column_map": {
            "MD5": "{md5}",
            "URL": "{URI}",
            "Length": "{file_size}",
            "Filename": "{file_name}",
        }
        | {c: f"{{{c}}}" for c in metadata_columns},
        "file_pattern": asset_path,  # Sets schema, asset_table, file
        "asset_type": "file",
        "target_table": [schema, asset_table.name],
        "checksum_types": ["sha256", "md5"],
        "hatrac_options": {"versioned_urls": True},
        "hatrac_templates": {
            "hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}",
            "content-disposition": "filename*=UTF-8''{file_name}",
        },
        "record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}",
    }
    return spec
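

# Editor's note (illustrative, not part of the released module): for a hypothetical asset
# table "Image" in schema "demo" with metadata columns ["Modality", "Anatomy"], the
# file_pattern produced above would resemble
#   .../deriva-ml/execution/(?P<execution_rid>[-\w]+)/asset/demo/Image/
#       (?P<Modality>[-:._ \w]+)/(?P<Anatomy>[-:._ \w]+)/(?P<file>[-\w]+)[.](?P<ext>[a-z0-9]*)$
# so each metadata value is taken from one directory level of the staged path.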


def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
    """Return an upload specification for deriva-ml.

    Arguments:
        model: Model from which to generate the upload configuration
    """
    asset_tables_with_metadata = [
        asset_table_upload_spec(model=model, asset_table=t) for t in model.find_assets() if model.asset_metadata(t)
    ]
    return {
        "asset_mappings": asset_tables_with_metadata
        + [
            {
                # Upload assets into an asset table that has no additional metadata columns.
                "column_map": {
                    "MD5": "{md5}",
                    "URL": "{URI}",
                    "Length": "{file_size}",
                    "Filename": "{file_name}",
                },
                "asset_type": "file",
                "target_table": ["{schema}", "{asset_table}"],
                "file_pattern": asset_path_regex + "/" + asset_file_regex,  # Sets schema, asset_table, name, ext
                "checksum_types": ["sha256", "md5"],
                "hatrac_options": {"versioned_urls": True},
                "hatrac_templates": {
                    "hatrac_uri": "/hatrac/{asset_table}/{md5}.{file_name}",
                    "content-disposition": "filename*=UTF-8''{file_name}",
                },
                "record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}",
            },
            # {
            #     # Upload the records into a table
            #     "asset_type": "skip",
            #     ## "default_columns": ["RID", "RCB", "RMB", "RCT", "RMT"],
            #     "file_pattern": feature_value_regex,  # Sets schema, table,
            #     "ext_pattern": "^.*[.](?P<file_ext>json|csv)$",
            #     "target_table": ["{schema}", "{table}"],
            # },
            {
                # Upload the records into a table
                "asset_type": "table",
                "default_columns": ["RID", "RCB", "RMB", "RCT", "RMT"],
                "file_pattern": table_regex,  # Sets schema, table,
                "ext_pattern": "^.*[.](?P<file_ext>json|csv)$",
                "target_table": ["{schema}", "{table}"],
            },
        ],
        "version_update_url": "https://github.com/informatics-isi-edu/deriva-client",
        "version_compatibility": [[">=1.4.0", "<2.0.0"]],
    }
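

# Editor's example (illustrative sketch, not part of the released module): write the
# generated upload configuration to a JSON file so it can be inspected or reused.
def _example_dump_upload_configuration(model: DerivaModel, out_file: Path) -> None:
    with out_file.open("w") as f:
        json.dump(bulk_upload_configuration(model), f, indent=2)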


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def upload_directory(model: DerivaModel, directory: Path | str) -> dict[Any, FileUploadState] | None:
    """Upload assets from a directory. This routine assumes that the current upload specification includes a
    configuration for the specified directory. Every asset in the specified directory is uploaded.

    Args:
        model: Model to upload assets to.
        directory: Directory containing the assets and tables to upload.

    Returns:
        Results of the upload operation.

    Raises:
        DerivaMLException: If there is an issue with uploading the assets.
    """
    directory = Path(directory)
    if not directory.is_dir():
        raise DerivaMLException("Directory does not exist")

    # Now upload the files by creating an upload spec and then calling the uploader.
    with TemporaryDirectory() as temp_dir:
        spec_file = Path(temp_dir) / "config.json"
        with spec_file.open("w+") as cfile:
            json.dump(bulk_upload_configuration(model), cfile)
        uploader = GenericUploader(
            server={
                "host": model.hostname,
                "protocol": "https",
                "catalog_id": model.catalog.catalog_id,
            },
            config_file=spec_file,
        )
        try:
            uploader.getUpdatedConfig()
            uploader.scanDirectory(directory, purge_state=True)
            results = {
                path: FileUploadState(
                    state=UploadState(result["State"]),
                    status=result["Status"],
                    result=result["Result"],
                )
                for path, result in uploader.uploadFiles().items()
            }
        finally:
            uploader.cleanup()
        return results
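

# Editor's example (illustrative sketch, not part of the released module): stage a file
# under the deriva-ml layout and push everything that is waiting to the catalog.  The
# staging prefix, schema, and table names are hypothetical.
def _example_upload_staged_directory(model: DerivaModel) -> dict[Any, FileUploadState] | None:
    prefix = Path("/tmp/staging")
    csv_path = table_path(prefix, schema="demo", table="Subject")
    csv_path.write_text("Name,Age\nsubject-1,42\n")      # minimal CSV for the demo table
    return upload_directory(model, upload_root(prefix))  # uploads everything under .../deriva-ml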


@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def upload_asset(model: DerivaModel, file: Path | str, table: Table, **kwargs: Any) -> dict:
    """Upload the specified file into Hatrac and update the associated asset table.

    Args:
        file: Path to the file to upload.
        table: The asset table into which the new record is inserted.
        model: Model to upload assets to.
        kwargs: Keyword arguments for values of additional columns to be added to the asset table.

    Returns:
        The newly created record in the asset table.
    """
    if not model.is_asset(table):
        raise DerivaMLException(f"Table {table} is not an asset table.")

    file_path = Path(file)
    file_name = file_path.name
    file_size = file_path.stat().st_size

    hatrac_path = f"/hatrac/{table.name}/"
    hs = HatracStore(
        "https",
        server=model.catalog.deriva_server.server,
        credentials=model.catalog.deriva_server.credentials,
    )
    md5_hashes = hash_utils.compute_file_hashes(file, ["md5"])["md5"]
    sanitized_filename = urlquote(re.sub("[^a-zA-Z0-9_.-]", "_", md5_hashes[0] + "." + file_name))
    hatrac_path = f"{hatrac_path}{sanitized_filename}"

    try:
        # Upload the file to hatrac.
        hatrac_uri = hs.put_obj(
            hatrac_path,
            file,
            md5=md5_hashes[1],
            content_type=mime_utils.guess_content_type(file),
            content_disposition="filename*=UTF-8''" + file_name,
        )
    except Exception as e:
        raise e
    try:
        # Now update the asset table.
        ipath = model.catalog.getPathBuilder().schemas[table.schema.name].tables[table.name]
        return list(
            ipath.insert(
                [
                    {
                        "URL": hatrac_uri,
                        "Filename": file_name,
                        "Length": file_size,
                        "MD5": md5_hashes[0],
                    }
                    | kwargs
                ]
            )
        )[0]
    except Exception as e:
        raise e
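

# Editor's example (illustrative sketch, not part of the released module): upload a single
# file into a hypothetical "Image" asset table, supplying one extra metadata column.
def _example_upload_single_asset(model: DerivaModel) -> dict:
    image_table = model.name_to_table("Image")
    return upload_asset(model, "/tmp/img01.png", image_table, Description="Sample scan")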


def asset_file_path(
    prefix: Path | str,
    exec_rid: RID,
    asset_table: Table,
    file_name: str,
    metadata: dict[str, Any],
) -> Path:
    """Return the path of the file in which an asset of the specified type is to be placed for upload.

    Args:
        prefix: Path prefix to use.
        exec_rid: RID to use.
        asset_table: Table in which to place assets.
        file_name: File name to use.
        metadata: Any additional metadata to add to the asset

    Returns:
        Path to the file in which to place the asset.
    """
    schema = asset_table.schema.name
    asset_name = asset_table.name

    path = execution_root(prefix, exec_rid) / "asset" / schema / asset_name
    metadata = metadata or {}
    asset_columns = {
        "Filename",
        "URL",
        "Length",
        "MD5",
        "Description",
    }.union(set(DerivaSystemColumns))
    asset_metadata = {c.name for c in asset_table.columns} - asset_columns

    if not (asset_metadata >= set(metadata.keys())):
        raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")

    for m in asset_metadata:
        path = path / str(metadata.get(m, "None"))
    path.mkdir(parents=True, exist_ok=True)
    return path / file_name
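

# Editor's example (illustrative sketch, not part of the released module): copy a local
# file into the staging location for a hypothetical "Image" asset table, encoding its
# metadata values as directory levels.  The source file path is hypothetical.
def _example_stage_asset_file(model: DerivaModel, prefix: Path, exec_rid: RID) -> Path:
    import shutil

    image_table = model.name_to_table("Image")
    dest = asset_file_path(
        prefix,
        exec_rid,
        image_table,
        file_name="img01.png",
        metadata={"Modality": "MRI"},
    )
    shutil.copy2("/tmp/img01.png", dest)
    return dest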


def asset_type_path(prefix: Path | str, exec_rid: RID, asset_table: Table) -> Path:
    """Return the path to a JSON line file in which to place asset_type information.

    Args:
        prefix: Location of upload root directory
        exec_rid: Execution RID
        asset_table: Table in which to place assets.

    Returns:
        Path to the file in which to place asset_type values for the named asset.
    """
    path = execution_root(prefix, exec_rid=exec_rid) / "asset-type" / asset_table.schema.name
    path.mkdir(parents=True, exist_ok=True)
    return path / f"{asset_table.name}.jsonl"
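

# Editor's example (illustrative sketch, not part of the released module): append one JSON
# object per staged file to the asset-type file.  The table name and record keys shown
# here are hypothetical.
def _example_record_asset_types(model: DerivaModel, prefix: Path, exec_rid: RID) -> Path:
    image_table = model.name_to_table("Image")
    types_path = asset_type_path(prefix, exec_rid, image_table)
    with types_path.open("a") as f:
        f.write(json.dumps({"Filename": "img01.png", "Asset_Type": "MRI"}) + "\n")
    return types_path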