deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +43 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +21 -0
- deriva_ml/catalog/clone.py +1199 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +817 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +126 -110
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +543 -242
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +223 -34
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -4
- deriva_ml-1.17.11.dist-info/RECORD +77 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +1 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.10.dist-info/RECORD +0 -45
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
deriva_ml/dataset/history.py
CHANGED

@@ -1,15 +1,25 @@
 import base64
 import struct
 from datetime import datetime
+from typing import Any

 from dateutil.parser import isoparse
 from deriva.core import urlquote
+from deriva.core.deriva_server import DerivaServer


 # -- ==============================================================================================
-def get_record_history(
+def get_record_history(
+    server: DerivaServer,
+    cid: str | int,
+    sname: str,
+    tname: str,
+    kvals: list[str],
+    kcols: list[str] | None = None,
+    snap: str | None = None,
+) -> dict[str, dict[str, Any]]:
     """Get the history of a record from the catalog.
-
+
     Args:
         server: The server instance.
         cid: The catalog ID.
@@ -18,13 +28,16 @@ def get_record_history(server, cid, sname, tname, kvals, kcols=["RID"], snap=Non
         kvals: The key values to look up.
         kcols: The key columns. Defaults to ["RID"].
         snap: Optional snapshot ID.
-
+
     Returns:
         The history data for the record.
-
+
     Raises:
         ValueError: If more than one row is returned.
     """
+    if kcols is None:
+        kcols = ["RID"]
+
     parts = {
         "cid": urlquote(cid),
         "sname": urlquote(sname),
@@ -46,7 +59,7 @@ def get_record_history(server, cid, sname, tname, kvals, kcols=["RID"], snap=Non
     path = "/ermrest/catalog/%(cid)s@%(snap)s/entity/%(sname)s:%(tname)s/%(filter)s"

     rows_found = []
-    snap2rows = {}
+    snap2rows: dict[str, dict[str, Any]] = {}
     while True:
         url = path % parts
         # sys.stderr.write("%s\n" % url)
@@ -67,12 +80,12 @@ def get_record_history(server, cid, sname, tname, kvals, kcols=["RID"], snap=Non


 # -- --------------------------------------------------------------------------------------
-def datetime_epoch_us(dt):
+def datetime_epoch_us(dt: datetime) -> int:
     """Convert datetime to epoch microseconds.
-
+
     Args:
         dt: The datetime object to convert.
-
+
     Returns:
         The epoch time in microseconds.
     """
@@ -84,12 +97,12 @@ def datetime_epoch_us(dt):
     #


-def iso_to_snap(iso_datetime):
+def iso_to_snap(iso_datetime: str) -> int:
     """Convert ISO datetime string to snapshot format.
-
+
     Args:
         iso_datetime: The ISO datetime string.
-
+
     Returns:
         The snapshot timestamp.
     """
@@ -97,12 +110,12 @@ def iso_to_snap(iso_datetime):


 # -- --------------------------------------------------------------------------------------
-def urlb32_encode(i):
+def urlb32_encode(i: int) -> str:
     """Encode an integer to URL-safe base32.
-
+
     Args:
         i: The integer to encode.
-
+
     Returns:
         The URL-safe base32 encoded string.
     """
deriva_ml/dataset/upload.py
CHANGED

@@ -39,20 +39,32 @@ import json
 import os
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Any, Optional
+from typing import Any, Callable, Optional

 import regex as re
-
-
-
-
-
+
+# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
+import importlib
+_deriva_core = importlib.import_module("deriva.core")
+_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
+_hatrac_store = importlib.import_module("deriva.core.hatrac_store")
+_hash_utils = importlib.import_module("deriva.core.utils.hash_utils")
+_mime_utils = importlib.import_module("deriva.core.utils.mime_utils")
+_deriva_upload = importlib.import_module("deriva.transfer.upload.deriva_upload")
+
+urlquote = _deriva_core.urlquote
+Table = _ermrest_model.Table
+HatracStore = _hatrac_store.HatracStore
+hash_utils = _hash_utils
+mime_utils = _mime_utils
+GenericUploader = _deriva_upload.GenericUploader
 from pydantic import ConfigDict, validate_call

 from deriva_ml.core.definitions import (
     RID,
     DerivaSystemColumns,
     FileUploadState,
+    UploadProgress,
     UploadState,
 )
 from deriva_ml.core.exceptions import DerivaMLException
@@ -89,7 +101,7 @@ def is_feature_dir(path: Path) -> Optional[re.Match]:
     return re.match(feature_table_dir_regex + "$", path.as_posix())


-def normalize_asset_dir(path: str) -> Optional[tuple[str, str]]:
+def normalize_asset_dir(path: str | Path) -> Optional[tuple[str, str]]:
     """Parse a path to an asset file and return the asset table name and file name.

     Args:
@@ -177,12 +189,16 @@ def table_path(prefix: Path | str, schema: str, table: str) -> Path:
     return path / f"{table}.csv"


-def asset_table_upload_spec(
+def asset_table_upload_spec(
+    model: DerivaModel, asset_table: str | Table, chunk_size: int | None = None
+):
     """Generate upload specification for an asset table.

     Args:
         model: The DerivaModel instance.
         asset_table: The asset table name or Table object.
+        chunk_size: Optional chunk size in bytes for hatrac uploads. If provided,
+            large files will be uploaded in chunks of this size.

     Returns:
         A dictionary containing the upload specification for the asset table.
@@ -197,6 +213,11 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
     asset_table = model.name_to_table(asset_table)
     schema = model.name_to_table(asset_table).schema.name

+    # Build hatrac_options with optional chunk_size
+    hatrac_options = {"versioned_urls": True}
+    if chunk_size is not None:
+        hatrac_options["chunk_size"] = chunk_size
+
     # Create upload specification
     spec = {
         # Upload assets into an asset table of an asset table.
@@ -211,7 +232,7 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
         "asset_type": "file",
         "target_table": [schema, asset_table.name],
         "checksum_types": ["sha256", "md5"],
-        "hatrac_options":
+        "hatrac_options": hatrac_options,
         "hatrac_templates": {
             "hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}",
             "content-disposition": "filename*=UTF-8''{file_name}",
@@ -221,14 +242,27 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
     return spec


-def bulk_upload_configuration(
+def bulk_upload_configuration(
+    model: DerivaModel, chunk_size: int | None = None
+) -> dict[str, Any]:
     """Return an upload specification for deriva-ml
-
-
+
+    Args:
+        model: Model from which to generate the upload configuration.
+        chunk_size: Optional chunk size in bytes for hatrac uploads. If provided,
+            large files will be uploaded in chunks of this size.
     """
     asset_tables_with_metadata = [
-        asset_table_upload_spec(model=model, asset_table=t
+        asset_table_upload_spec(model=model, asset_table=t, chunk_size=chunk_size)
+        for t in model.find_assets()
+        if model.asset_metadata(t)
     ]
+
+    # Build hatrac_options with optional chunk_size for non-metadata assets
+    hatrac_options = {"versioned_urls": True}
+    if chunk_size is not None:
+        hatrac_options["chunk_size"] = chunk_size
+
     return {
         "asset_mappings": asset_tables_with_metadata
         + [
@@ -244,7 +278,7 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
             "target_table": ["{schema}", "{asset_table}"],
             "file_pattern": asset_path_regex + "/" + asset_file_regex,  # Sets schema, asset_table, name, ext
             "checksum_types": ["sha256", "md5"],
-            "hatrac_options":
+            "hatrac_options": hatrac_options,
             "hatrac_templates": {
                 "hatrac_uri": "/hatrac/{asset_table}/{md5}.{file_name}",
                 "content-disposition": "filename*=UTF-8''{file_name}",
@@ -273,14 +307,42 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
     }


+# Default timeout for large file uploads in seconds
+# The requests timeout tuple is (connect_timeout, read_timeout), but this doesn't
+# cover write operations. We also need to set socket.setdefaulttimeout() for writes.
+DEFAULT_UPLOAD_TIMEOUT = (6, 600)
+
+# Socket timeout for write operations (in seconds)
+# This is needed because requests timeout only covers connect and read, not write.
+# For large chunk uploads, the socket write can take significant time.
+DEFAULT_SOCKET_TIMEOUT = 600.0
+
+
 @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-def upload_directory(
+def upload_directory(
+    model: DerivaModel,
+    directory: Path | str,
+    progress_callback: Callable[[UploadProgress], None] | None = None,
+    max_retries: int = 3,
+    retry_delay: float = 5.0,
+    timeout: tuple[int, int] | None = None,
+    chunk_size: int | None = None,
+) -> dict[Any, FileUploadState] | None:
     """Upload assets from a directory. This routine assumes that the current upload specification includes a
     configuration for the specified directory. Every asset in the specified directory is uploaded

     Args:
         model: Model to upload assets to.
         directory: Directory containing the assets and tables to upload.
+        progress_callback: Optional callback function to receive upload progress updates.
+            Called with UploadProgress objects containing file information and progress.
+        max_retries: Maximum number of retry attempts for failed uploads (default: 3).
+        retry_delay: Initial delay in seconds between retries, doubles with each attempt (default: 5.0).
+        timeout: Tuple of (connect_timeout, read_timeout) in seconds. Default is (6, 600)
+            which allows up to 10 minutes for each chunk upload. Increase read_timeout for
+            very large files on slow connections.
+        chunk_size: Optional chunk size in bytes for hatrac uploads. If provided,
+            large files will be uploaded in chunks of this size.

     Returns:
         Results of the upload operation.
@@ -288,37 +350,162 @@ def upload_directory(model: DerivaModel, directory: Path | str) -> dict[Any, Fil
     Raises:
         DerivaMLException: If there is an issue with uploading the assets.
     """
+    import logging
+    import time
+
+    from deriva.core import DEFAULT_SESSION_CONFIG
+
+    logger = logging.getLogger("deriva_ml")
+
     directory = Path(directory)
     if not directory.is_dir():
         raise DerivaMLException("Directory does not exist")

+    # Track upload progress across files
+    # status_callback is called twice per file: once before upload starts, once after it completes
+    upload_state = {"completed_files": 0, "total_files": 0, "status_calls": 0}
+
+    # Count total files to upload
+    for root, dirs, files in os.walk(directory):
+        upload_state["total_files"] += len(files)
+
+    # Create wrapper callbacks for GenericUploader if a progress callback was provided
+    def file_callback(**kwargs) -> bool:
+        """Callback for per-chunk progress updates from GenericUploader.
+
+        The deriva GenericUploader passes kwargs with: completed, total, file_path, host, job_info.
+        Note: This callback is only invoked for large files (> 25MB) that use chunked uploads.
+        Small files are uploaded in a single request and this callback won't be called.
+        """
+        if progress_callback is not None:
+            file_path = kwargs.get("file_path", "")
+            completed_chunks = kwargs.get("completed", 0)
+            total_chunks = kwargs.get("total", 0)
+
+            progress = UploadProgress(
+                file_path=file_path,
+                file_name=Path(file_path).name if file_path else "",
+                bytes_completed=completed_chunks,
+                bytes_total=total_chunks,
+                percent_complete=(completed_chunks / total_chunks * 100) if total_chunks > 0 else 0,
+                phase="uploading_chunks",
+                message=f"Uploading large file: chunk {completed_chunks} of {total_chunks}",
+            )
+            progress_callback(progress)
+        return True  # Continue upload
+
+    def status_callback() -> None:
+        """Callback for per-file status updates from GenericUploader.
+
+        GenericUploader calls this twice per file: once before upload starts (odd calls)
+        and once after upload completes (even calls). We use even calls to track completed files.
+        """
+        if progress_callback is not None:
+            upload_state["status_calls"] += 1
+
+            # Even calls indicate file completion (after upload)
+            if upload_state["status_calls"] % 2 == 0:
+                upload_state["completed_files"] += 1
+
+            # Report progress with current file count
+            current_file = (upload_state["status_calls"] + 1) // 2  # 1-indexed current file
+            progress = UploadProgress(
+                phase="uploading",
+                message=f"Uploading file {current_file} of {upload_state['total_files']}",
+                percent_complete=(upload_state["completed_files"] / upload_state["total_files"] * 100)
+                if upload_state["total_files"] > 0
+                else 0,
+            )
+            progress_callback(progress)
+
+    def do_upload(uploader) -> dict[str, dict]:
+        """Perform the upload and return raw results."""
+        uploader.getUpdatedConfig()
+        uploader.scanDirectory(directory, purge_state=True)
+        return uploader.uploadFiles(
+            file_callback=file_callback if progress_callback else None,
+            status_callback=status_callback if progress_callback else None,
+        )
+
+    # Use provided timeout or default
+    upload_timeout = timeout if timeout is not None else DEFAULT_UPLOAD_TIMEOUT
+
     # Now upload the files by creating an upload spec and then calling the uploader.
     with TemporaryDirectory() as temp_dir:
         spec_file = Path(temp_dir) / "config.json"
         with spec_file.open("w+") as cfile:
-            json.dump(bulk_upload_configuration(model), cfile)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            json.dump(bulk_upload_configuration(model, chunk_size=chunk_size), cfile)
+
+        # Create session config with longer timeout for large file uploads
+        session_config = DEFAULT_SESSION_CONFIG.copy()
+        session_config["timeout"] = upload_timeout
+        logger.debug(f"Upload session config timeout: {session_config['timeout']}")
+
+        all_results = {}
+        attempt = 0
+        current_delay = retry_delay
+
+        while attempt <= max_retries:
+            uploader = GenericUploader(
+                server={
+                    "host": model.hostname,
+                    "protocol": "https",
+                    "catalog_id": model.catalog.catalog_id,
+                    "session": session_config,
+                },
+                config_file=spec_file,
+            )
+            try:
+                raw_results = do_upload(uploader)
+
+                # Process results and check for failures
+                failed_files = []
+                for path, result in raw_results.items():
+                    state = UploadState(result["State"])
+                    if state == UploadState.failed or result["Result"] is None:
+                        failed_files.append((path, result["Status"]))
+                    else:
+                        # Store successful results
+                        all_results[path] = FileUploadState(
+                            state=state,
+                            status=result["Status"],
+                            result=result["Result"],
+                        )
+
+                if not failed_files:
+                    # All uploads successful
+                    break
+
+                attempt += 1
+                if attempt > max_retries:
+                    # Final attempt failed, raise error with details
+                    error_details = "; ".join([f"{path}: {msg}" for path, msg in failed_files])
+                    raise DerivaMLException(
+                        f"Failed to upload {len(failed_files)} file(s) after {max_retries} retries: {error_details}"
+                    )
+
+                # Log retry attempt and wait before retrying
+                logger.warning(
+                    f"Upload failed for {len(failed_files)} file(s), retrying in {current_delay:.1f}s "
+                    f"(attempt {attempt}/{max_retries}): {[p for p, _ in failed_files]}"
                 )
-
-
-
-
-
+                if progress_callback:
+                    progress_callback(UploadProgress(
+                        phase="retrying",
+                        message=f"Retrying {len(failed_files)} failed upload(s) in {current_delay:.1f}s (attempt {attempt}/{max_retries})",
+                        percent_complete=0,
+                    ))
+
+                time.sleep(current_delay)
+                current_delay *= 2  # Exponential backoff
+
+                # Reset upload state for retry
+                upload_state["status_calls"] = 0
+
+            finally:
+                uploader.cleanup()
+
+        return all_results


 @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@@ -347,7 +534,7 @@ def upload_asset(model: DerivaModel, file: Path | str, table: Table, **kwargs: A
         server=model.catalog.deriva_server.server,
         credentials=model.catalog.deriva_server.credentials,
     )
-    md5_hashes = hash_utils.compute_file_hashes(file, ["md5"])["md5"]
+    md5_hashes = hash_utils.compute_file_hashes(file, frozenset(["md5"]))["md5"]
     sanitized_filename = urlquote(re.sub("[^a-zA-Z0-9_.-]", "_", md5_hashes[0] + "." + file_name))
     hatrac_path = f"{hatrac_path}{sanitized_filename}"
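
The reworked upload_directory() above adds progress reporting, retry with exponential backoff, and configurable timeouts and hatrac chunk sizes. A hedged usage sketch follows; the model, staging directory, and parameter values are illustrative and not part of this diff.

# Hypothetical example of driving the new upload_directory() options.
# `model` is assumed to be an existing deriva_ml DerivaModel instance; how it is
# constructed is outside the scope of this diff.
from pathlib import Path

from deriva_ml.core.definitions import UploadProgress
from deriva_ml.dataset.upload import upload_directory


def report_progress(p: UploadProgress) -> None:
    # phase is "uploading", "uploading_chunks", or "retrying", per the callbacks above
    print(f"[{p.phase}] {p.message} ({p.percent_complete:.0f}%)")


def upload_staging_dir(model, staging_dir: str | Path) -> None:
    results = upload_directory(
        model=model,
        directory=Path(staging_dir),
        progress_callback=report_progress,
        max_retries=3,                # up to 3 retry rounds
        retry_delay=5.0,              # 5s, then 10s, then 20s between rounds
        timeout=(6, 1200),            # allow 20 minutes per chunk read on slow links
        chunk_size=25 * 1024 * 1024,  # upload large files in 25 MB hatrac chunks
    )
    for path, state in (results or {}).items():
        print(path, state.state, state.status)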