ml-dash 0.6.2rc1__py3-none-any.whl → 0.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml_dash/__init__.py +36 -64
- ml_dash/auth/token_storage.py +267 -226
- ml_dash/auto_start.py +28 -15
- ml_dash/cli.py +16 -2
- ml_dash/cli_commands/api.py +165 -0
- ml_dash/cli_commands/download.py +757 -667
- ml_dash/cli_commands/list.py +146 -13
- ml_dash/cli_commands/login.py +190 -183
- ml_dash/cli_commands/profile.py +92 -0
- ml_dash/cli_commands/upload.py +1291 -1141
- ml_dash/client.py +79 -6
- ml_dash/config.py +119 -119
- ml_dash/experiment.py +1234 -1034
- ml_dash/files.py +339 -224
- ml_dash/log.py +7 -7
- ml_dash/metric.py +359 -100
- ml_dash/params.py +6 -6
- ml_dash/remote_auto_start.py +20 -17
- ml_dash/run.py +211 -65
- ml_dash/snowflake.py +173 -0
- ml_dash/storage.py +1051 -1081
- {ml_dash-0.6.2rc1.dist-info → ml_dash-0.6.3.dist-info}/METADATA +12 -14
- ml_dash-0.6.3.dist-info/RECORD +33 -0
- {ml_dash-0.6.2rc1.dist-info → ml_dash-0.6.3.dist-info}/WHEEL +1 -1
- ml_dash-0.6.2rc1.dist-info/RECORD +0 -30
- {ml_dash-0.6.2rc1.dist-info → ml_dash-0.6.3.dist-info}/entry_points.txt +0 -0
ml_dash/cli_commands/upload.py
CHANGED
|
@@ -2,20 +2,25 @@
|
|
|
2
2
|
|
|
3
3
|
import argparse
|
|
4
4
|
import json
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import List, Dict, Any, Optional
|
|
7
|
-
from dataclasses import dataclass, field
|
|
8
5
|
import threading
|
|
9
6
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
10
|
|
|
11
11
|
from rich.console import Console
|
|
12
|
-
from rich.progress import
|
|
12
|
+
from rich.progress import (
|
|
13
|
+
BarColumn,
|
|
14
|
+
Progress,
|
|
15
|
+
SpinnerColumn,
|
|
16
|
+
TaskProgressColumn,
|
|
17
|
+
TextColumn,
|
|
18
|
+
)
|
|
13
19
|
from rich.table import Table
|
|
14
|
-
from rich.panel import Panel
|
|
15
20
|
|
|
16
|
-
from ..storage import LocalStorage
|
|
17
21
|
from ..client import RemoteClient
|
|
18
22
|
from ..config import Config
|
|
23
|
+
from ..storage import LocalStorage
|
|
19
24
|
|
|
20
25
|
# Initialize rich console
|
|
21
26
|
console = Console()
|
|
@@ -23,1226 +28,1371 @@ console = Console()
|
|
|
23
28
|
|
|
24
29
|
@dataclass
class ExperimentInfo:
    """Describe one local experiment that is a candidate for upload.

    ``project``, ``experiment`` and ``path`` are required; every other field
    starts out as "nothing found" and is filled in during discovery.
    """

    # Identity of the experiment.
    project: str
    experiment: str

    # Directory on disk that contains the experiment's data.
    path: Path

    # Value of the "prefix" key read from experiment.json, when available.
    prefix: Optional[str] = None

    # Presence flags for logs/logs.jsonl and parameters.json.
    has_logs: bool = False
    has_params: bool = False

    # Names of the metrics found for this experiment (fresh list per instance).
    metric_names: List[str] = field(default_factory=list)

    # Number of regular files under files/ and their combined size in bytes.
    file_count: int = 0
    estimated_size: int = 0  # in bytes
|
|
36
42
|
|
|
37
43
|
|
|
38
44
|
@dataclass
class ValidationResult:
    """Outcome of validating a single experiment before upload."""

    # Overall verdict; starts optimistic and is cleared when validation fails.
    is_valid: bool = True

    # Human-readable messages collected during validation.
    warnings: List[str] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)

    # Parsed payloads that passed validation, keyed by section name
    # (for example "metadata" or "parameters").
    valid_data: Dict[str, Any] = field(default_factory=dict)
|
|
45
52
|
|
|
46
53
|
|
|
47
54
|
@dataclass
class UploadResult:
    """Summary of what happened when one experiment was uploaded."""

    # Label of the experiment this result belongs to.
    experiment: str

    # True once the upload is considered successful.
    success: bool = False

    # Per-category counts of uploaded items, e.g. {"logs": 100, "metrics": 3}.
    uploaded: Dict[str, int] = field(default_factory=dict)

    # Per-category failure messages, e.g. {"files": ["error msg"]}.
    failed: Dict[str, List[str]] = field(default_factory=dict)

    # Error messages not tied to a single category.
    errors: List[str] = field(default_factory=list)

    # Total bytes uploaded.
    bytes_uploaded: int = 0
|
|
56
64
|
|
|
57
65
|
|
|
58
66
|
@dataclass
class UploadState:
    """Tracks upload state for resume functionality.

    The state is persisted as a small JSON document so that an interrupted
    upload can be picked up again later.
    """

    local_path: str
    remote_url: str
    completed_experiments: List[str] = field(default_factory=list)  # ["project/experiment"]
    failed_experiments: List[str] = field(default_factory=list)
    in_progress_experiment: Optional[str] = None
    timestamp: Optional[str] = None  # ISO-8601 string, refreshed by save()

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "local_path": self.local_path,
            "remote_url": self.remote_url,
            "completed_experiments": self.completed_experiments,
            "failed_experiments": self.failed_experiments,
            "in_progress_experiment": self.in_progress_experiment,
            "timestamp": self.timestamp,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "UploadState":
        """Create from dictionary.

        Args:
            data: Mapping as produced by ``to_dict``. ``local_path`` and
                ``remote_url`` are required; the remaining keys are optional.

        Returns:
            A new UploadState.

        Raises:
            KeyError: If a required key is missing.
        """
        return cls(
            local_path=data["local_path"],
            remote_url=data["remote_url"],
            # ``or []`` maps an explicit JSON null to an empty list; ``list``
            # copies so the state never aliases the caller's lists.
            completed_experiments=list(data.get("completed_experiments") or []),
            failed_experiments=list(data.get("failed_experiments") or []),
            in_progress_experiment=data.get("in_progress_experiment"),
            timestamp=data.get("timestamp"),
        )

    def save(self, path: Path) -> None:
        """Save state to file.

        Stamps ``timestamp`` with the current local time before writing.

        Args:
            path: Destination of the JSON state file.
        """
        import datetime

        self.timestamp = datetime.datetime.now().isoformat()
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, path: Path) -> Optional["UploadState"]:
        """Load state from file.

        Args:
            path: Location of the JSON state file.

        Returns:
            The stored state, or None when the file is missing, unreadable,
            not valid JSON, not a JSON object, or lacks a required key.
        """
        path = Path(path)
        if not path.exists():
            return None

        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return None

        # Valid JSON that is not an object (e.g. a list) cannot be a state.
        if not isinstance(data, dict):
            return None

        try:
            return cls.from_dict(data)
        except KeyError:
            return None
|
|
109
121
|
|
|
110
122
|
|
|
111
123
|
def add_parser(subparsers) -> argparse.ArgumentParser:
    """Register the ``upload`` sub-command and return its parser.

    Args:
        subparsers: The object returned by ``ArgumentParser.add_subparsers()``.

    Returns:
        The newly created ``upload`` parser.
    """
    upload = subparsers.add_parser(
        "upload",
        help="Upload local experiments to remote server",
        description="Upload locally-stored ML-Dash experiment data to a remote server.",
    )

    # Source directory (optional positional).
    upload.add_argument(
        "path",
        nargs="?",
        default="./.dash",
        help="Local storage directory to upload from (default: ./.dash)",
    )

    # Remote server and credentials.
    upload.add_argument(
        "--dash-url",
        type=str,
        help="ML-Dash server URL (defaults to config or https://api.dash.ml)",
    )
    upload.add_argument(
        "--api-key",
        type=str,
        help="JWT token for authentication (optional - auto-loads from 'ml-dash login' if not provided)",
    )

    # Author's usage sketch, kept for reference. NOTE(review): it may not match
    # the arguments defined here (e.g. shell-expanded "*" yields several paths
    # while "path" accepts at most one) - confirm before relying on it.
    #
    #   cd .dash/geyang
    #   cd iclr_2026
    #
    #   ml-dash upload -p geyang/new-run *   # uploads all of the folders to geyang/new-run
    #   ml-dash upload --prefix geyang/new-run/local-results ./*   # uploads under the local-results prefix
    #   ml-dash download --prefix geyang/new-run/zehua-results --filter *.mp4 --dryrun --verbose
    #   ml-dash list --prefix geyang/new-run/zehua-results --filter xxx-xxx --verbose
    #   ml-dash list-exp --prefix geyang/new-run/zehua-results --filter xxx-xxx --verbose

    # Scope control. Ge: project should be {owner}/{proj_name}.
    # All spellings are aliases of one option; argparse derives the attribute
    # name from the first long form, so the value lands in ``args.pref``.
    upload.add_argument(
        "-p",
        "--pref",
        "--prefix",
        "--proj",
        "--project",
        type=str,
        help="Filter experiments by prefix pattern (supports glob: 'tom/*/exp*', 'alice/project-?/baseline')",
    )

    # Destination prefix on the server (like an scp destination).
    upload.add_argument(
        "-t",
        "--target",
        type=str,
        help="Target prefix/directory on server where experiments will be uploaded (e.g., 'alice/shared-project'). Similar to 'scp local/ remote-path/'",
    )
    # A dedicated "--experiment" option is currently disabled.

    # Data-filtering and behaviour switches: all plain boolean flags, listed
    # in the order they should appear in --help.
    boolean_flags = (
        (("--skip-logs",), "Don't upload logs"),
        (("--skip-metrics",), "Don't upload metrics"),
        (("--skip-files",), "Don't upload files"),
        (("--skip-params",), "Don't upload parameters"),
        (("--dry-run",), "Show what would be uploaded without uploading"),
        (("--strict",), "Fail on any validation error (default: skip invalid data)"),
        (("-v", "--verbose"), "Show detailed progress"),
    )
    for option_strings, help_text in boolean_flags:
        upload.add_argument(*option_strings, action="store_true", help=help_text)

    upload.add_argument(
        "--batch-size",
        type=int,
        default=100,
        help="Batch size for logs/metrics (default: 100)",
    )
    upload.add_argument(
        "--resume",
        action="store_true",
        help="Resume previous interrupted upload",
    )
    upload.add_argument(
        "--state-file",
        type=str,
        default=".dash-upload-state.json",
        help="Path to state file for resume (default: .dash-upload-state.json)",
    )

    return upload
|
|
118
253
|
|
|
119
|
-
# Positional argument
|
|
120
|
-
parser.add_argument(
|
|
121
|
-
"path",
|
|
122
|
-
nargs="?",
|
|
123
|
-
default="./.ml-dash",
|
|
124
|
-
help="Local storage directory to upload from (default: ./.ml-dash)",
|
|
125
|
-
)
|
|
126
254
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
255
|
+
def discover_experiments(
    local_path: Path,
    project_filter: Optional[str] = None,
    experiment_filter: Optional[str] = None,
) -> List["ExperimentInfo"]:
    """
    Discover experiments in local storage directory.

    Every directory below ``local_path`` that contains an ``experiment.json``
    is treated as an experiment. Supports both flat
    (local_path/project/experiment) and folder-based
    (local_path/folder/project/experiment) hierarchies.

    Args:
        local_path: Root path of local storage
        project_filter: Glob pattern matched against the experiment directory's
            path relative to ``local_path`` (e.g., "tom/*/exp*")
        experiment_filter: Only discover experiments with exactly this name

    Returns:
        List of ExperimentInfo objects (empty when ``local_path`` is missing)
    """
    import fnmatch

    local_path = Path(local_path)

    if not local_path.exists():
        return []

    experiments = []

    # Find all experiment.json files recursively (at least one level deep).
    for exp_json in local_path.rglob("*/experiment.json"):
        exp_dir = exp_json.parent

        # Read prefix from experiment.json first. An unreadable or malformed
        # file is tolerated: discovery falls back to the directory layout.
        prefix = None
        try:
            with open(exp_json, "r") as f:
                metadata = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            metadata = None
        if isinstance(metadata, dict):
            prefix = metadata.get("prefix")
            if not isinstance(prefix, str):
                # A non-string prefix cannot be parsed; treat it as absent.
                prefix = None

        try:
            relative_path = exp_dir.relative_to(local_path)
        except ValueError:
            continue
        full_relative_path = str(relative_path)

        # Extract project and experiment names from PREFIX (not path) when
        # available, which handles nested folders correctly.
        # Prefix format: owner/project/folder.../experiment
        if prefix:
            prefix_parts = prefix.strip("/").split("/")
            if len(prefix_parts) < 3:
                continue  # Need at least owner/project/experiment
            project_name = prefix_parts[1]
            exp_name = prefix_parts[-1]
        else:
            # Fallback to path-based parsing (legacy support)
            parts = relative_path.parts
            if len(parts) < 2:
                continue
            exp_name = parts[-1]
            project_name = parts[-2]

        # Apply filters; the glob is matched on the full relative path.
        if project_filter and not fnmatch.fnmatch(full_relative_path, project_filter):
            continue
        if experiment_filter and exp_name != experiment_filter:
            continue

        exp_info = ExperimentInfo(
            project=project_name,
            experiment=exp_name,
            path=exp_dir,
            prefix=prefix,
        )

        # Check for parameters and logs
        exp_info.has_params = (exp_dir / "parameters.json").exists()
        exp_info.has_logs = (exp_dir / "logs/logs.jsonl").exists()

        # Check for metrics: each sub-directory holding a data.jsonl counts.
        metrics_dir = exp_dir / "metrics"
        if metrics_dir.is_dir():
            for metric_dir in metrics_dir.iterdir():
                if metric_dir.is_dir() and (metric_dir / "data.jsonl").exists():
                    exp_info.metric_names.append(metric_dir.name)

        # Check for files: count them and estimate their total size using a
        # single directory walk.
        files_dir = exp_dir / "files"
        if files_dir.exists():
            try:
                file_paths = [p for p in files_dir.rglob("*") if p.is_file()]
                exp_info.file_count = len(file_paths)
                exp_info.estimated_size = sum(p.stat().st_size for p in file_paths)
            except OSError:
                # Best effort only: these numbers feed summaries/progress, so
                # an unreadable entry must not abort discovery.
                pass

        experiments.append(exp_info)

    return experiments
|
|
172
372
|
|
|
173
|
-
# Behavior control
|
|
174
|
-
parser.add_argument(
|
|
175
|
-
"--dry-run",
|
|
176
|
-
action="store_true",
|
|
177
|
-
help="Show what would be uploaded without uploading",
|
|
178
|
-
)
|
|
179
|
-
parser.add_argument(
|
|
180
|
-
"--strict",
|
|
181
|
-
action="store_true",
|
|
182
|
-
help="Fail on any validation error (default: skip invalid data)",
|
|
183
|
-
)
|
|
184
|
-
parser.add_argument(
|
|
185
|
-
"-v", "--verbose",
|
|
186
|
-
action="store_true",
|
|
187
|
-
help="Show detailed progress",
|
|
188
|
-
)
|
|
189
|
-
parser.add_argument(
|
|
190
|
-
"--batch-size",
|
|
191
|
-
type=int,
|
|
192
|
-
default=100,
|
|
193
|
-
help="Batch size for logs/metrics (default: 100)",
|
|
194
|
-
)
|
|
195
|
-
parser.add_argument(
|
|
196
|
-
"--resume",
|
|
197
|
-
action="store_true",
|
|
198
|
-
help="Resume previous interrupted upload",
|
|
199
|
-
)
|
|
200
|
-
parser.add_argument(
|
|
201
|
-
"--state-file",
|
|
202
|
-
type=str,
|
|
203
|
-
default=".ml-dash-upload-state.json",
|
|
204
|
-
help="Path to state file for resume (default: .ml-dash-upload-state.json)",
|
|
205
|
-
)
|
|
206
373
|
|
|
207
|
-
|
|
374
|
+
class ExperimentValidator:
    """Validates local experiment data before upload.

    Only a missing or malformed ``experiment.json`` makes an experiment
    invalid outright. Problems found in parameters, logs, metrics or files are
    recorded as warnings, which strict mode promotes to errors.
    """

    def __init__(self, strict: bool = False):
        """
        Initialize validator.

        Args:
            strict: If True, fail on any validation error
        """
        self.strict = strict

    def validate_experiment(self, exp_info: "ExperimentInfo") -> "ValidationResult":
        """
        Validate experiment directory structure and data.

        Args:
            exp_info: Experiment information

        Returns:
            ValidationResult with validation status and messages
        """
        result = ValidationResult()

        # 1. Experiment metadata is required; stop early without it.
        if not self._validate_experiment_metadata(exp_info, result):
            result.is_valid = False
            return result

        # 2-5. Parameters, logs, metrics and files are optional sections.
        self._validate_parameters(exp_info, result)
        self._validate_logs(exp_info, result)
        self._validate_metrics(exp_info, result)
        self._validate_files(exp_info, result)

        # In strict mode, any warning becomes an error
        if self.strict and result.warnings:
            result.errors.extend(result.warnings)
            result.warnings = []
            result.is_valid = False

        return result

    @staticmethod
    def _invalid_jsonl_lines(path: Path, required_key: str) -> List[int]:
        """Return 1-based numbers of lines that are not JSON objects containing ``required_key``.

        Raises:
            OSError, UnicodeDecodeError: If the file cannot be read.
        """
        invalid_lines = []
        with open(path, "r") as f:
            for line_num, line in enumerate(f, start=1):
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    invalid_lines.append(line_num)
                    continue
                # A line may hold valid JSON that is not an object (number,
                # string, list); only a dict can carry the required key.
                if not isinstance(entry, dict) or required_key not in entry:
                    invalid_lines.append(line_num)
        return invalid_lines

    def _validate_experiment_metadata(
        self, exp_info: "ExperimentInfo", result: "ValidationResult"
    ) -> bool:
        """Validate experiment.json exists and is valid.

        On success the parsed document is stored under
        ``result.valid_data["metadata"]``; on failure an error is appended.

        Returns:
            True when the metadata is usable, False otherwise.
        """
        exp_json = exp_info.path / "experiment.json"

        if not exp_json.exists():
            result.errors.append("Missing experiment.json")
            return False

        try:
            with open(exp_json, "r") as f:
                metadata = json.load(f)
        except json.JSONDecodeError as e:
            result.errors.append(f"Invalid JSON in experiment.json: {e}")
            return False
        except (OSError, UnicodeDecodeError) as e:
            result.errors.append(f"Cannot read experiment.json: {e}")
            return False

        # Check required fields (a non-object document has none of them).
        if (
            not isinstance(metadata, dict)
            or "name" not in metadata
            or "project" not in metadata
        ):
            result.errors.append("experiment.json missing required fields (name, project)")
            return False

        result.valid_data["metadata"] = metadata
        return True

    def _validate_parameters(self, exp_info: "ExperimentInfo", result: "ValidationResult"):
        """Validate parameters.json format.

        Stores the usable parameter dict under ``result.valid_data["parameters"]``
        (the ``data`` member when the versioned format is used); otherwise
        appends a warning.
        """
        if not exp_info.has_params:
            return

        params_file = exp_info.path / "parameters.json"
        try:
            with open(params_file, "r") as f:
                params = json.load(f)
        except json.JSONDecodeError as e:
            result.warnings.append(f"Invalid JSON in parameters.json: {e} (will skip)")
            return
        except (OSError, UnicodeDecodeError) as e:
            result.warnings.append(f"Cannot read parameters.json: {e} (will skip)")
            return

        if not isinstance(params, dict):
            result.warnings.append("parameters.json is not a dict (will skip)")
            return

        # Flat format: the document itself is the parameter dict.
        if "data" not in params:
            result.valid_data["parameters"] = params
            return

        # Versioned format: the parameters live under the "data" key.
        if not isinstance(params["data"], dict):
            result.warnings.append("parameters.json data is not a dict (will skip)")
            return
        result.valid_data["parameters"] = params["data"]

    def _validate_logs(self, exp_info: "ExperimentInfo", result: "ValidationResult"):
        """Validate logs.jsonl format: every line must be an object with "message"."""
        if not exp_info.has_logs:
            return

        logs_file = exp_info.path / "logs/logs.jsonl"
        try:
            invalid_lines = self._invalid_jsonl_lines(logs_file, "message")
        except (OSError, UnicodeDecodeError) as e:
            result.warnings.append(f"Cannot read logs.jsonl: {e} (will skip logs)")
            return

        if invalid_lines:
            count = len(invalid_lines)
            preview = invalid_lines[:5]
            result.warnings.append(
                f"logs.jsonl has {count} invalid lines (e.g., {preview}...) - will skip these"
            )

    def _validate_metrics(self, exp_info: "ExperimentInfo", result: "ValidationResult"):
        """Validate metrics data: every line must be an object with "data"."""
        if not exp_info.metric_names:
            return

        for metric_name in exp_info.metric_names:
            data_file = exp_info.path / "metrics" / metric_name / "data.jsonl"
            try:
                invalid_lines = self._invalid_jsonl_lines(data_file, "data")
            except (OSError, UnicodeDecodeError) as e:
                result.warnings.append(f"Cannot read metric '{metric_name}': {e} (will skip)")
                continue

            if invalid_lines:
                count = len(invalid_lines)
                preview = invalid_lines[:5]
                result.warnings.append(
                    f"metric '{metric_name}' has {count} invalid lines (e.g., {preview}...) - will skip these"
                )

    def _validate_files(self, exp_info: "ExperimentInfo", result: "ValidationResult"):
        """Validate files existence.

        Compares the entries of ``files/.files_metadata.json`` that are not
        marked deleted against what is on disk and warns about missing ones.
        """
        files_dir = exp_info.path / "files"
        if not files_dir.exists():
            return

        metadata_file = files_dir / ".files_metadata.json"
        if not metadata_file.exists():
            return

        try:
            with open(metadata_file, "r") as f:
                files_metadata = json.load(f)
        except (OSError, ValueError):
            # If we can't read metadata, just skip file validation.
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return

        if not isinstance(files_metadata, dict):
            return  # Unexpected layout; nothing to check against.

        missing_files = []
        for file_id, file_info in files_metadata.items():
            if not isinstance(file_info, dict) or file_info.get("deletedAt") is not None:
                continue
            try:
                # Expected location: files/<prefix>/<file_id>/<filename>
                file_path = (
                    files_dir
                    / file_info.get("prefix", "")
                    / file_id
                    / file_info.get("filename", "")
                )
            except TypeError:
                # Non-string prefix/filename: the file cannot be located.
                missing_files.append(file_id)
                continue
            if not file_path.exists():
                missing_files.append(file_info.get("filename", file_id))

        if missing_files:
            count = len(missing_files)
            preview = missing_files[:3]
            result.warnings.append(
                f"{count} files referenced in metadata but missing on disk (e.g., {preview}...) - will skip these"
            )
|
|
312
578
|
|
|
313
579
|
|
|
314
|
-
class
|
|
315
|
-
|
|
580
|
+
class ExperimentUploader:
|
|
581
|
+
"""Handles uploading a single experiment."""
|
|
582
|
+
|
|
583
|
+
def __init__(
    self,
    local_storage: LocalStorage,
    remote_client: RemoteClient,
    batch_size: int = 100,
    skip_logs: bool = False,
    skip_metrics: bool = False,
    skip_files: bool = False,
    skip_params: bool = False,
    verbose: bool = False,
    progress: Optional[Progress] = None,
    max_concurrent_metrics: int = 5,
    target_prefix: Optional[str] = None,
):
    """
    Set up an uploader; nothing is transferred at construction time.

    Args:
        local_storage: Local storage instance to read from
        remote_client: Remote client instance to upload with
        batch_size: Batch size for logs/metrics
        skip_logs: Do not upload logs
        skip_metrics: Do not upload metrics
        skip_files: Do not upload files
        skip_params: Do not upload parameters
        verbose: Show verbose output
        progress: Optional rich Progress instance for tracking
        max_concurrent_metrics: Maximum concurrent metric uploads (default: 5)
        target_prefix: Target prefix on server (overrides local prefix)
    """
    # Storage endpoints.
    self.local = local_storage
    self.remote = remote_client

    # Switches selecting which kinds of data are left out.
    self.skip_logs = skip_logs
    self.skip_metrics = skip_metrics
    self.skip_files = skip_files
    self.skip_params = skip_params

    # Tuning and reporting options.
    self.batch_size = batch_size
    self.max_concurrent_metrics = max_concurrent_metrics
    self.verbose = verbose
    self.progress = progress
    self.target_prefix = target_prefix

    # Lock for updates to shared state.
    self._lock = threading.Lock()
    # Thread-local storage holding one remote client per thread.
    self._thread_local = threading.local()
|
|
628
|
+
|
|
629
|
+
def _get_remote_client(self) -> RemoteClient:
    """Return the calling thread's own remote client, creating it on first use."""
    try:
        return self._thread_local.client
    except AttributeError:
        pass  # No client has been created for this thread yet.

    # Build the per-thread client from the shared client's settings.
    # graphql_base_url is used (without /api) since RemoteClient.__init__
    # will add /api.
    client = RemoteClient(
        base_url=self.remote.graphql_base_url, api_key=self.remote.api_key
    )
    self._thread_local.client = client
    return client
|
|
638
|
+
|
|
639
|
+
def upload_experiment(
|
|
640
|
+
self, exp_info: ExperimentInfo, validation_result: ValidationResult, task_id=None
|
|
641
|
+
) -> UploadResult:
|
|
642
|
+
"""
|
|
643
|
+
Upload a single experiment with all its data.
|
|
320
644
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
645
|
+
Args:
|
|
646
|
+
exp_info: Experiment information
|
|
647
|
+
validation_result: Validation results
|
|
648
|
+
task_id: Optional progress task ID
|
|
325
649
|
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
650
|
+
Returns:
|
|
651
|
+
UploadResult with upload status
|
|
652
|
+
"""
|
|
653
|
+
result = UploadResult(experiment=f"{exp_info.project}/{exp_info.experiment}")
|
|
654
|
+
|
|
655
|
+
# Calculate total steps for progress tracking
|
|
656
|
+
total_steps = 1 # metadata
|
|
657
|
+
if not self.skip_params and "parameters" in validation_result.valid_data:
|
|
658
|
+
total_steps += 1
|
|
659
|
+
if not self.skip_logs and exp_info.has_logs:
|
|
660
|
+
total_steps += 1
|
|
661
|
+
if not self.skip_metrics and exp_info.metric_names:
|
|
662
|
+
total_steps += len(exp_info.metric_names)
|
|
663
|
+
if not self.skip_files and exp_info.file_count > 0:
|
|
664
|
+
total_steps += exp_info.file_count
|
|
665
|
+
|
|
666
|
+
current_step = 0
|
|
667
|
+
|
|
668
|
+
def update_progress(description: str):
|
|
669
|
+
nonlocal current_step
|
|
670
|
+
current_step += 1
|
|
671
|
+
if self.progress and task_id is not None:
|
|
672
|
+
self.progress.update(
|
|
673
|
+
task_id, completed=current_step, total=total_steps, description=description
|
|
674
|
+
)
|
|
329
675
|
|
|
330
|
-
|
|
331
|
-
|
|
676
|
+
try:
|
|
677
|
+
# 1. Create/update experiment metadata
|
|
678
|
+
update_progress("Creating experiment...")
|
|
679
|
+
if self.verbose:
|
|
680
|
+
console.print(" [dim]Creating experiment...[/dim]")
|
|
681
|
+
|
|
682
|
+
exp_data = validation_result.valid_data
|
|
683
|
+
|
|
684
|
+
# Construct full prefix for server
|
|
685
|
+
# If --target is specified, use it as the base destination prefix
|
|
686
|
+
# Otherwise, preserve the local prefix structure
|
|
687
|
+
if self.target_prefix:
|
|
688
|
+
# User specified a target prefix (like scp destination directory)
|
|
689
|
+
# Append experiment name to it: target_prefix/experiment_name
|
|
690
|
+
full_prefix = f"{self.target_prefix.rstrip('/')}/{exp_info.experiment}"
|
|
691
|
+
|
|
692
|
+
# Extract project from target prefix for API call
|
|
693
|
+
# Target format: owner/project/path...
|
|
694
|
+
target_parts = self.target_prefix.strip("/").split("/")
|
|
695
|
+
if len(target_parts) >= 2:
|
|
696
|
+
target_project = target_parts[1]
|
|
697
|
+
else:
|
|
698
|
+
target_project = exp_info.project # Fallback to original
|
|
699
|
+
elif exp_info.prefix:
|
|
700
|
+
# No target specified, preserve local prefix structure
|
|
701
|
+
full_prefix = f"{exp_info.prefix}/{exp_info.experiment}"
|
|
702
|
+
target_project = exp_info.project
|
|
703
|
+
else:
|
|
704
|
+
full_prefix = exp_info.experiment
|
|
705
|
+
target_project = exp_info.project
|
|
706
|
+
|
|
707
|
+
response = self.remote.create_or_update_experiment(
|
|
708
|
+
project=target_project,
|
|
709
|
+
name=exp_info.experiment,
|
|
710
|
+
description=exp_data.get("description"),
|
|
711
|
+
tags=exp_data.get("tags"),
|
|
712
|
+
bindrs=exp_data.get("bindrs"),
|
|
713
|
+
prefix=full_prefix, # Send full prefix (folder + name) or target prefix
|
|
714
|
+
write_protected=exp_data.get("write_protected", False),
|
|
715
|
+
metadata=exp_data.get("metadata"),
|
|
716
|
+
)
|
|
717
|
+
|
|
718
|
+
# Extract experiment ID from nested response
|
|
719
|
+
experiment_id = response.get("experiment", {}).get("id") or response.get("id")
|
|
720
|
+
if self.verbose:
|
|
721
|
+
console.print(f" [green]✓[/green] Created experiment (id: {experiment_id})")
|
|
722
|
+
|
|
723
|
+
# 2. Upload parameters
|
|
724
|
+
if not self.skip_params and "parameters" in validation_result.valid_data:
|
|
725
|
+
update_progress("Uploading parameters...")
|
|
726
|
+
if self.verbose:
|
|
727
|
+
console.print(" [dim]Uploading parameters...[/dim]")
|
|
332
728
|
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
""
|
|
336
|
-
|
|
337
|
-
result.
|
|
729
|
+
params = validation_result.valid_data["parameters"]
|
|
730
|
+
self.remote.set_parameters(experiment_id, params)
|
|
731
|
+
result.uploaded["params"] = len(params)
|
|
732
|
+
# Track bytes (approximate JSON size)
|
|
733
|
+
result.bytes_uploaded += len(json.dumps(params).encode("utf-8"))
|
|
338
734
|
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
result.is_valid = False
|
|
342
|
-
return result
|
|
735
|
+
if self.verbose:
|
|
736
|
+
console.print(f" [green]✓[/green] Uploaded {len(params)} parameters")
|
|
343
737
|
|
|
344
|
-
|
|
345
|
-
|
|
738
|
+
# 3. Upload logs
|
|
739
|
+
if not self.skip_logs and exp_info.has_logs:
|
|
740
|
+
count = self._upload_logs(
|
|
741
|
+
experiment_id, exp_info, result, task_id, update_progress
|
|
742
|
+
)
|
|
743
|
+
result.uploaded["logs"] = count
|
|
346
744
|
|
|
347
|
-
|
|
348
|
-
|
|
745
|
+
# 4. Upload metrics
|
|
746
|
+
if not self.skip_metrics and exp_info.metric_names:
|
|
747
|
+
count = self._upload_metrics(
|
|
748
|
+
experiment_id, exp_info, result, task_id, update_progress
|
|
749
|
+
)
|
|
750
|
+
result.uploaded["metrics"] = count
|
|
349
751
|
|
|
350
|
-
|
|
351
|
-
|
|
752
|
+
# 5. Upload files
|
|
753
|
+
if not self.skip_files and exp_info.file_count > 0:
|
|
754
|
+
count = self._upload_files(
|
|
755
|
+
experiment_id, exp_info, result, task_id, update_progress
|
|
756
|
+
)
|
|
757
|
+
result.uploaded["files"] = count
|
|
758
|
+
|
|
759
|
+
result.success = True
|
|
760
|
+
|
|
761
|
+
except Exception as e:
|
|
762
|
+
result.success = False
|
|
763
|
+
result.errors.append(str(e))
|
|
764
|
+
if self.verbose:
|
|
765
|
+
console.print(f" [red]✗ Error: {e}[/red]")
|
|
766
|
+
|
|
767
|
+
return result
|
|
768
|
+
|
|
769
|
+
def _upload_logs(
|
|
770
|
+
self,
|
|
771
|
+
experiment_id: str,
|
|
772
|
+
exp_info: ExperimentInfo,
|
|
773
|
+
result: UploadResult,
|
|
774
|
+
task_id=None,
|
|
775
|
+
update_progress=None,
|
|
776
|
+
) -> int:
|
|
777
|
+
"""Upload logs in batches."""
|
|
778
|
+
if update_progress:
|
|
779
|
+
update_progress("Uploading logs...")
|
|
780
|
+
if self.verbose:
|
|
781
|
+
console.print(" [dim]Uploading logs...[/dim]")
|
|
782
|
+
|
|
783
|
+
logs_file = exp_info.path / "logs/logs.jsonl"
|
|
784
|
+
logs_batch = []
|
|
785
|
+
total_uploaded = 0
|
|
786
|
+
skipped = 0
|
|
787
|
+
|
|
788
|
+
try:
|
|
789
|
+
with open(logs_file, "r") as f:
|
|
790
|
+
for line in f:
|
|
791
|
+
try:
|
|
792
|
+
log_entry = json.loads(line)
|
|
793
|
+
|
|
794
|
+
# Validate required fields
|
|
795
|
+
if "message" not in log_entry:
|
|
796
|
+
skipped += 1
|
|
797
|
+
continue
|
|
798
|
+
|
|
799
|
+
# Prepare log entry for API
|
|
800
|
+
api_log = {
|
|
801
|
+
"timestamp": log_entry.get("timestamp"),
|
|
802
|
+
"level": log_entry.get("level", "info"),
|
|
803
|
+
"message": log_entry["message"],
|
|
804
|
+
}
|
|
805
|
+
if "metadata" in log_entry:
|
|
806
|
+
api_log["metadata"] = log_entry["metadata"]
|
|
352
807
|
|
|
353
|
-
|
|
354
|
-
|
|
808
|
+
logs_batch.append(api_log)
|
|
809
|
+
# Track bytes
|
|
810
|
+
result.bytes_uploaded += len(line.encode("utf-8"))
|
|
355
811
|
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
812
|
+
# Upload batch
|
|
813
|
+
if len(logs_batch) >= self.batch_size:
|
|
814
|
+
self.remote.create_log_entries(experiment_id, logs_batch)
|
|
815
|
+
total_uploaded += len(logs_batch)
|
|
816
|
+
logs_batch = []
|
|
361
817
|
|
|
362
|
-
|
|
818
|
+
except json.JSONDecodeError:
|
|
819
|
+
skipped += 1
|
|
820
|
+
continue
|
|
363
821
|
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
822
|
+
# Upload remaining logs
|
|
823
|
+
if logs_batch:
|
|
824
|
+
self.remote.create_log_entries(experiment_id, logs_batch)
|
|
825
|
+
total_uploaded += len(logs_batch)
|
|
367
826
|
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
827
|
+
if self.verbose:
|
|
828
|
+
msg = f" [green]✓[/green] Uploaded {total_uploaded} log entries"
|
|
829
|
+
if skipped > 0:
|
|
830
|
+
msg += f" (skipped {skipped} invalid)"
|
|
831
|
+
console.print(msg)
|
|
371
832
|
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
metadata = json.load(f)
|
|
833
|
+
except IOError as e:
|
|
834
|
+
result.failed.setdefault("logs", []).append(str(e))
|
|
375
835
|
|
|
376
|
-
|
|
377
|
-
if "name" not in metadata or "project" not in metadata:
|
|
378
|
-
result.errors.append("experiment.json missing required fields (name, project)")
|
|
379
|
-
return False
|
|
380
|
-
|
|
381
|
-
result.valid_data["metadata"] = metadata
|
|
382
|
-
return True
|
|
383
|
-
|
|
384
|
-
except json.JSONDecodeError as e:
|
|
385
|
-
result.errors.append(f"Invalid JSON in experiment.json: {e}")
|
|
386
|
-
return False
|
|
387
|
-
except IOError as e:
|
|
388
|
-
result.errors.append(f"Cannot read experiment.json: {e}")
|
|
389
|
-
return False
|
|
390
|
-
|
|
391
|
-
def _validate_parameters(self, exp_info: ExperimentInfo, result: ValidationResult):
|
|
392
|
-
"""Validate parameters.json format."""
|
|
393
|
-
if not exp_info.has_params:
|
|
394
|
-
return
|
|
395
|
-
|
|
396
|
-
params_file = exp_info.path / "parameters.json"
|
|
397
|
-
try:
|
|
398
|
-
with open(params_file, "r") as f:
|
|
399
|
-
params = json.load(f)
|
|
400
|
-
|
|
401
|
-
# Check if it's a dict
|
|
402
|
-
if not isinstance(params, dict):
|
|
403
|
-
result.warnings.append("parameters.json is not a dict (will skip)")
|
|
404
|
-
return
|
|
405
|
-
|
|
406
|
-
# Check for valid data key if using versioned format
|
|
407
|
-
if "data" in params:
|
|
408
|
-
if not isinstance(params["data"], dict):
|
|
409
|
-
result.warnings.append("parameters.json data is not a dict (will skip)")
|
|
410
|
-
return
|
|
411
|
-
result.valid_data["parameters"] = params["data"]
|
|
412
|
-
else:
|
|
413
|
-
result.valid_data["parameters"] = params
|
|
414
|
-
|
|
415
|
-
except json.JSONDecodeError as e:
|
|
416
|
-
result.warnings.append(f"Invalid JSON in parameters.json: {e} (will skip)")
|
|
417
|
-
except IOError as e:
|
|
418
|
-
result.warnings.append(f"Cannot read parameters.json: {e} (will skip)")
|
|
419
|
-
|
|
420
|
-
def _validate_logs(self, exp_info: ExperimentInfo, result: ValidationResult):
|
|
421
|
-
"""Validate logs.jsonl format."""
|
|
422
|
-
if not exp_info.has_logs:
|
|
423
|
-
return
|
|
424
|
-
|
|
425
|
-
logs_file = exp_info.path / "logs" / "logs.jsonl"
|
|
426
|
-
invalid_lines = []
|
|
836
|
+
return total_uploaded
|
|
427
837
|
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
# Check required fields
|
|
434
|
-
if "message" not in log_entry:
|
|
435
|
-
invalid_lines.append(line_num)
|
|
436
|
-
except json.JSONDecodeError:
|
|
437
|
-
invalid_lines.append(line_num)
|
|
438
|
-
|
|
439
|
-
if invalid_lines:
|
|
440
|
-
count = len(invalid_lines)
|
|
441
|
-
preview = invalid_lines[:5]
|
|
442
|
-
result.warnings.append(
|
|
443
|
-
f"logs.jsonl has {count} invalid lines (e.g., {preview}...) - will skip these"
|
|
444
|
-
)
|
|
838
|
+
def _upload_single_metric(
|
|
839
|
+
self, experiment_id: str, metric_name: str, metric_dir: Path, result: UploadResult
|
|
840
|
+
) -> Dict[str, Any]:
|
|
841
|
+
"""
|
|
842
|
+
Upload a single metric (thread-safe helper).
|
|
445
843
|
|
|
446
|
-
|
|
447
|
-
|
|
844
|
+
Returns:
|
|
845
|
+
Dict with 'success', 'uploaded', 'skipped', 'bytes', and 'error' keys
|
|
846
|
+
"""
|
|
847
|
+
data_file = metric_dir / "data.jsonl"
|
|
848
|
+
data_batch = []
|
|
849
|
+
total_uploaded = 0
|
|
850
|
+
skipped = 0
|
|
851
|
+
bytes_uploaded = 0
|
|
852
|
+
|
|
853
|
+
# Get thread-local client for safe concurrent HTTP requests
|
|
854
|
+
remote_client = self._get_remote_client()
|
|
855
|
+
|
|
856
|
+
try:
|
|
857
|
+
with open(data_file, "r") as f:
|
|
858
|
+
for line in f:
|
|
859
|
+
try:
|
|
860
|
+
data_point = json.loads(line)
|
|
861
|
+
|
|
862
|
+
# Validate required fields
|
|
863
|
+
if "data" not in data_point:
|
|
864
|
+
skipped += 1
|
|
865
|
+
continue
|
|
866
|
+
|
|
867
|
+
data_batch.append(data_point["data"])
|
|
868
|
+
bytes_uploaded += len(line.encode("utf-8"))
|
|
869
|
+
|
|
870
|
+
# Upload batch using thread-local client
|
|
871
|
+
if len(data_batch) >= self.batch_size:
|
|
872
|
+
remote_client.append_batch_to_metric(
|
|
873
|
+
experiment_id, metric_name, data_batch
|
|
874
|
+
)
|
|
875
|
+
total_uploaded += len(data_batch)
|
|
876
|
+
data_batch = []
|
|
877
|
+
|
|
878
|
+
except json.JSONDecodeError:
|
|
879
|
+
skipped += 1
|
|
880
|
+
continue
|
|
448
881
|
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
882
|
+
# Upload remaining data points using thread-local client
|
|
883
|
+
if data_batch:
|
|
884
|
+
remote_client.append_batch_to_metric(experiment_id, metric_name, data_batch)
|
|
885
|
+
total_uploaded += len(data_batch)
|
|
886
|
+
|
|
887
|
+
return {
|
|
888
|
+
"success": True,
|
|
889
|
+
"uploaded": total_uploaded,
|
|
890
|
+
"skipped": skipped,
|
|
891
|
+
"bytes": bytes_uploaded,
|
|
892
|
+
"error": None,
|
|
893
|
+
}
|
|
894
|
+
|
|
895
|
+
except Exception as e:
|
|
896
|
+
return {
|
|
897
|
+
"success": False,
|
|
898
|
+
"uploaded": 0,
|
|
899
|
+
"skipped": 0,
|
|
900
|
+
"bytes": 0,
|
|
901
|
+
"error": str(e),
|
|
902
|
+
}
|
|
903
|
+
|
|
904
|
+
def _upload_metrics(
|
|
905
|
+
self,
|
|
906
|
+
experiment_id: str,
|
|
907
|
+
exp_info: ExperimentInfo,
|
|
908
|
+
result: UploadResult,
|
|
909
|
+
task_id=None,
|
|
910
|
+
update_progress=None,
|
|
911
|
+
) -> int:
|
|
912
|
+
"""Upload metrics in parallel with concurrency limit."""
|
|
913
|
+
if not exp_info.metric_names:
|
|
914
|
+
return 0
|
|
915
|
+
|
|
916
|
+
total_metrics = 0
|
|
917
|
+
|
|
918
|
+
# Use ThreadPoolExecutor for parallel uploads
|
|
919
|
+
with ThreadPoolExecutor(max_workers=self.max_concurrent_metrics) as executor:
|
|
920
|
+
# Submit all metric upload tasks
|
|
921
|
+
future_to_metric = {}
|
|
922
|
+
for metric_name in exp_info.metric_names:
|
|
923
|
+
metric_dir = exp_info.path / "metrics" / metric_name
|
|
924
|
+
future = executor.submit(
|
|
925
|
+
self._upload_single_metric, experiment_id, metric_name, metric_dir, result
|
|
926
|
+
)
|
|
927
|
+
future_to_metric[future] = metric_name
|
|
453
928
|
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
929
|
+
# Process completed uploads as they finish
|
|
930
|
+
for future in as_completed(future_to_metric):
|
|
931
|
+
metric_name = future_to_metric[future]
|
|
457
932
|
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
for line_num, line in enumerate(f, start=1):
|
|
462
|
-
try:
|
|
463
|
-
data_point = json.loads(line)
|
|
464
|
-
# Check for data field
|
|
465
|
-
if "data" not in data_point:
|
|
466
|
-
invalid_lines.append(line_num)
|
|
467
|
-
except json.JSONDecodeError:
|
|
468
|
-
invalid_lines.append(line_num)
|
|
469
|
-
|
|
470
|
-
if invalid_lines:
|
|
471
|
-
count = len(invalid_lines)
|
|
472
|
-
preview = invalid_lines[:5]
|
|
473
|
-
result.warnings.append(
|
|
474
|
-
f"metric '{metric_name}' has {count} invalid lines (e.g., {preview}...) - will skip these"
|
|
475
|
-
)
|
|
476
|
-
|
|
477
|
-
except IOError as e:
|
|
478
|
-
result.warnings.append(f"Cannot read metric '{metric_name}': {e} (will skip)")
|
|
479
|
-
|
|
480
|
-
def _validate_files(self, exp_info: ExperimentInfo, result: ValidationResult):
|
|
481
|
-
"""Validate files existence."""
|
|
482
|
-
files_dir = exp_info.path / "files"
|
|
483
|
-
if not files_dir.exists():
|
|
484
|
-
return
|
|
485
|
-
|
|
486
|
-
metadata_file = files_dir / ".files_metadata.json"
|
|
487
|
-
if not metadata_file.exists():
|
|
488
|
-
return
|
|
933
|
+
# Update progress
|
|
934
|
+
if update_progress:
|
|
935
|
+
update_progress(f"Uploading metric '{metric_name}'...")
|
|
489
936
|
|
|
490
937
|
try:
|
|
491
|
-
|
|
492
|
-
files_metadata = json.load(f)
|
|
493
|
-
|
|
494
|
-
missing_files = []
|
|
495
|
-
for file_id, file_info in files_metadata.items():
|
|
496
|
-
if isinstance(file_info, dict) and file_info.get("deletedAt") is None:
|
|
497
|
-
# Check if file exists
|
|
498
|
-
file_path = files_dir / file_info.get("prefix", "") / file_id / file_info.get("filename", "")
|
|
499
|
-
if not file_path.exists():
|
|
500
|
-
missing_files.append(file_info.get("filename", file_id))
|
|
501
|
-
|
|
502
|
-
if missing_files:
|
|
503
|
-
count = len(missing_files)
|
|
504
|
-
preview = missing_files[:3]
|
|
505
|
-
result.warnings.append(
|
|
506
|
-
f"{count} files referenced in metadata but missing on disk (e.g., {preview}...) - will skip these"
|
|
507
|
-
)
|
|
508
|
-
|
|
509
|
-
except (json.JSONDecodeError, IOError):
|
|
510
|
-
pass # If we can't read metadata, just skip file validation
|
|
938
|
+
upload_result = future.result()
|
|
511
939
|
|
|
940
|
+
# Thread-safe update of shared state
|
|
941
|
+
with self._lock:
|
|
942
|
+
result.bytes_uploaded += upload_result["bytes"]
|
|
512
943
|
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
def __init__(
|
|
517
|
-
self,
|
|
518
|
-
local_storage: LocalStorage,
|
|
519
|
-
remote_client: RemoteClient,
|
|
520
|
-
batch_size: int = 100,
|
|
521
|
-
skip_logs: bool = False,
|
|
522
|
-
skip_metrics: bool = False,
|
|
523
|
-
skip_files: bool = False,
|
|
524
|
-
skip_params: bool = False,
|
|
525
|
-
verbose: bool = False,
|
|
526
|
-
progress: Optional[Progress] = None,
|
|
527
|
-
max_concurrent_metrics: int = 5,
|
|
528
|
-
):
|
|
529
|
-
"""
|
|
530
|
-
Initialize uploader.
|
|
531
|
-
|
|
532
|
-
Args:
|
|
533
|
-
local_storage: Local storage instance
|
|
534
|
-
remote_client: Remote client instance
|
|
535
|
-
batch_size: Batch size for logs/metrics
|
|
536
|
-
skip_logs: Skip uploading logs
|
|
537
|
-
skip_metrics: Skip uploading metrics
|
|
538
|
-
skip_files: Skip uploading files
|
|
539
|
-
skip_params: Skip uploading parameters
|
|
540
|
-
verbose: Show verbose output
|
|
541
|
-
progress: Optional rich Progress instance for tracking
|
|
542
|
-
max_concurrent_metrics: Maximum concurrent metric uploads (default: 5)
|
|
543
|
-
"""
|
|
544
|
-
self.local = local_storage
|
|
545
|
-
self.remote = remote_client
|
|
546
|
-
self.batch_size = batch_size
|
|
547
|
-
self.skip_logs = skip_logs
|
|
548
|
-
self.skip_metrics = skip_metrics
|
|
549
|
-
self.skip_files = skip_files
|
|
550
|
-
self.skip_params = skip_params
|
|
551
|
-
self.verbose = verbose
|
|
552
|
-
self.progress = progress
|
|
553
|
-
self.max_concurrent_metrics = max_concurrent_metrics
|
|
554
|
-
# Thread-safe lock for shared state updates
|
|
555
|
-
self._lock = threading.Lock()
|
|
556
|
-
# Thread-local storage for remote clients (for thread-safe HTTP requests)
|
|
557
|
-
self._thread_local = threading.local()
|
|
558
|
-
|
|
559
|
-
def _get_remote_client(self) -> RemoteClient:
|
|
560
|
-
"""Get thread-local remote client for safe concurrent access."""
|
|
561
|
-
if not hasattr(self._thread_local, 'client'):
|
|
562
|
-
# Create a new client for this thread
|
|
563
|
-
self._thread_local.client = RemoteClient(
|
|
564
|
-
base_url=self.remote.base_url,
|
|
565
|
-
api_key=self.remote.api_key
|
|
566
|
-
)
|
|
567
|
-
return self._thread_local.client
|
|
568
|
-
|
|
569
|
-
def upload_experiment(
|
|
570
|
-
self, exp_info: ExperimentInfo, validation_result: ValidationResult, task_id=None
|
|
571
|
-
) -> UploadResult:
|
|
572
|
-
"""
|
|
573
|
-
Upload a single experiment with all its data.
|
|
574
|
-
|
|
575
|
-
Args:
|
|
576
|
-
exp_info: Experiment information
|
|
577
|
-
validation_result: Validation results
|
|
578
|
-
task_id: Optional progress task ID
|
|
579
|
-
|
|
580
|
-
Returns:
|
|
581
|
-
UploadResult with upload status
|
|
582
|
-
"""
|
|
583
|
-
result = UploadResult(experiment=f"{exp_info.project}/{exp_info.experiment}")
|
|
584
|
-
|
|
585
|
-
# Calculate total steps for progress tracking
|
|
586
|
-
total_steps = 1 # metadata
|
|
587
|
-
if not self.skip_params and "parameters" in validation_result.valid_data:
|
|
588
|
-
total_steps += 1
|
|
589
|
-
if not self.skip_logs and exp_info.has_logs:
|
|
590
|
-
total_steps += 1
|
|
591
|
-
if not self.skip_metrics and exp_info.metric_names:
|
|
592
|
-
total_steps += len(exp_info.metric_names)
|
|
593
|
-
if not self.skip_files and exp_info.file_count > 0:
|
|
594
|
-
total_steps += exp_info.file_count
|
|
595
|
-
|
|
596
|
-
current_step = 0
|
|
597
|
-
|
|
598
|
-
def update_progress(description: str):
|
|
599
|
-
nonlocal current_step
|
|
600
|
-
current_step += 1
|
|
601
|
-
if self.progress and task_id is not None:
|
|
602
|
-
self.progress.update(task_id, completed=current_step, total=total_steps, description=description)
|
|
944
|
+
if upload_result["success"]:
|
|
945
|
+
total_metrics += 1
|
|
603
946
|
|
|
604
|
-
|
|
605
|
-
# 1. Create/update experiment metadata
|
|
606
|
-
update_progress("Creating experiment...")
|
|
947
|
+
# Thread-safe console output
|
|
607
948
|
if self.verbose:
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
bindrs=exp_data.get("bindrs"),
|
|
623
|
-
folder=None, # Don't send folder path as folderId (expects Snowflake ID)
|
|
624
|
-
write_protected=exp_data.get("write_protected", False),
|
|
625
|
-
metadata=custom_metadata if custom_metadata else None,
|
|
626
|
-
)
|
|
627
|
-
|
|
628
|
-
# Extract experiment ID from nested response
|
|
629
|
-
experiment_id = response.get("experiment", {}).get("id") or response.get("id")
|
|
630
|
-
if self.verbose:
|
|
631
|
-
console.print(f" [green]✓[/green] Created experiment (id: {experiment_id})")
|
|
632
|
-
|
|
633
|
-
# 2. Upload parameters
|
|
634
|
-
if not self.skip_params and "parameters" in validation_result.valid_data:
|
|
635
|
-
update_progress("Uploading parameters...")
|
|
636
|
-
if self.verbose:
|
|
637
|
-
console.print(f" [dim]Uploading parameters...[/dim]")
|
|
638
|
-
|
|
639
|
-
params = validation_result.valid_data["parameters"]
|
|
640
|
-
self.remote.set_parameters(experiment_id, params)
|
|
641
|
-
result.uploaded["params"] = len(params)
|
|
642
|
-
# Track bytes (approximate JSON size)
|
|
643
|
-
result.bytes_uploaded += len(json.dumps(params).encode('utf-8'))
|
|
644
|
-
|
|
645
|
-
if self.verbose:
|
|
646
|
-
console.print(f" [green]✓[/green] Uploaded {len(params)} parameters")
|
|
647
|
-
|
|
648
|
-
# 3. Upload logs
|
|
649
|
-
if not self.skip_logs and exp_info.has_logs:
|
|
650
|
-
count = self._upload_logs(experiment_id, exp_info, result, task_id, update_progress)
|
|
651
|
-
result.uploaded["logs"] = count
|
|
652
|
-
|
|
653
|
-
# 4. Upload metrics
|
|
654
|
-
if not self.skip_metrics and exp_info.metric_names:
|
|
655
|
-
count = self._upload_metrics(experiment_id, exp_info, result, task_id, update_progress)
|
|
656
|
-
result.uploaded["metrics"] = count
|
|
657
|
-
|
|
658
|
-
# 5. Upload files
|
|
659
|
-
if not self.skip_files and exp_info.file_count > 0:
|
|
660
|
-
count = self._upload_files(experiment_id, exp_info, result, task_id, update_progress)
|
|
661
|
-
result.uploaded["files"] = count
|
|
662
|
-
|
|
663
|
-
result.success = True
|
|
949
|
+
msg = f" [green]✓[/green] Uploaded {upload_result['uploaded']} data points for '{metric_name}'"
|
|
950
|
+
if upload_result["skipped"] > 0:
|
|
951
|
+
msg += f" (skipped {upload_result['skipped']} invalid)"
|
|
952
|
+
with self._lock:
|
|
953
|
+
console.print(msg)
|
|
954
|
+
else:
|
|
955
|
+
# Record failure
|
|
956
|
+
error_msg = f"{metric_name}: {upload_result['error']}"
|
|
957
|
+
with self._lock:
|
|
958
|
+
result.failed.setdefault("metrics", []).append(error_msg)
|
|
959
|
+
if self.verbose:
|
|
960
|
+
console.print(
|
|
961
|
+
f" [red]✗[/red] Failed to upload '{metric_name}': {upload_result['error']}"
|
|
962
|
+
)
|
|
664
963
|
|
|
665
964
|
except Exception as e:
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
return result
|
|
672
|
-
|
|
673
|
-
def _upload_logs(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
|
|
674
|
-
task_id=None, update_progress=None) -> int:
|
|
675
|
-
"""Upload logs in batches."""
|
|
676
|
-
if update_progress:
|
|
677
|
-
update_progress("Uploading logs...")
|
|
678
|
-
if self.verbose:
|
|
679
|
-
console.print(f" [dim]Uploading logs...[/dim]")
|
|
680
|
-
|
|
681
|
-
logs_file = exp_info.path / "logs" / "logs.jsonl"
|
|
682
|
-
logs_batch = []
|
|
683
|
-
total_uploaded = 0
|
|
684
|
-
skipped = 0
|
|
685
|
-
|
|
686
|
-
try:
|
|
687
|
-
with open(logs_file, "r") as f:
|
|
688
|
-
for line in f:
|
|
689
|
-
try:
|
|
690
|
-
log_entry = json.loads(line)
|
|
691
|
-
|
|
692
|
-
# Validate required fields
|
|
693
|
-
if "message" not in log_entry:
|
|
694
|
-
skipped += 1
|
|
695
|
-
continue
|
|
696
|
-
|
|
697
|
-
# Prepare log entry for API
|
|
698
|
-
api_log = {
|
|
699
|
-
"timestamp": log_entry.get("timestamp"),
|
|
700
|
-
"level": log_entry.get("level", "info"),
|
|
701
|
-
"message": log_entry["message"],
|
|
702
|
-
}
|
|
703
|
-
if "metadata" in log_entry:
|
|
704
|
-
api_log["metadata"] = log_entry["metadata"]
|
|
705
|
-
|
|
706
|
-
logs_batch.append(api_log)
|
|
707
|
-
# Track bytes
|
|
708
|
-
result.bytes_uploaded += len(line.encode('utf-8'))
|
|
709
|
-
|
|
710
|
-
# Upload batch
|
|
711
|
-
if len(logs_batch) >= self.batch_size:
|
|
712
|
-
self.remote.create_log_entries(experiment_id, logs_batch)
|
|
713
|
-
total_uploaded += len(logs_batch)
|
|
714
|
-
logs_batch = []
|
|
715
|
-
|
|
716
|
-
except json.JSONDecodeError:
|
|
717
|
-
skipped += 1
|
|
718
|
-
continue
|
|
719
|
-
|
|
720
|
-
# Upload remaining logs
|
|
721
|
-
if logs_batch:
|
|
722
|
-
self.remote.create_log_entries(experiment_id, logs_batch)
|
|
723
|
-
total_uploaded += len(logs_batch)
|
|
724
|
-
|
|
965
|
+
# Handle unexpected errors
|
|
966
|
+
error_msg = f"{metric_name}: {str(e)}"
|
|
967
|
+
with self._lock:
|
|
968
|
+
result.failed.setdefault("metrics", []).append(error_msg)
|
|
725
969
|
if self.verbose:
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
970
|
+
console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {e}")
|
|
971
|
+
|
|
972
|
+
return total_metrics
|
|
973
|
+
|
|
974
|
+
def _upload_files(
|
|
975
|
+
self,
|
|
976
|
+
experiment_id: str,
|
|
977
|
+
exp_info: ExperimentInfo,
|
|
978
|
+
result: UploadResult,
|
|
979
|
+
task_id=None,
|
|
980
|
+
update_progress=None,
|
|
981
|
+
) -> int:
|
|
982
|
+
"""Upload files one by one."""
|
|
983
|
+
files_dir = exp_info.path / "files"
|
|
984
|
+
total_uploaded = 0
|
|
985
|
+
|
|
986
|
+
# Parse prefix to get owner, project, and experiment path
|
|
987
|
+
# Format: owner/project/folder.../experiment
|
|
988
|
+
parts = exp_info.prefix.split("/") if exp_info.prefix else []
|
|
989
|
+
if len(parts) < 3:
|
|
990
|
+
# Invalid prefix format, skip file upload
|
|
991
|
+
return 0
|
|
992
|
+
|
|
993
|
+
owner = parts[0]
|
|
994
|
+
project = parts[1]
|
|
995
|
+
# Note: _get_experiment_dir expects the FULL prefix, not just the experiment part
|
|
996
|
+
# So we pass the full prefix to list_files
|
|
997
|
+
full_prefix = exp_info.prefix
|
|
998
|
+
|
|
999
|
+
# Use LocalStorage to list files
|
|
1000
|
+
try:
|
|
1001
|
+
files_list = self.local.list_files(owner, project, full_prefix)
|
|
1002
|
+
|
|
1003
|
+
# Debug: print file count
|
|
1004
|
+
if self.verbose:
|
|
1005
|
+
print(f"[DEBUG] Found {len(files_list)} files to upload")
|
|
1006
|
+
print(f"[DEBUG] Full prefix: {full_prefix}")
|
|
1007
|
+
|
|
1008
|
+
for file_info in files_list:
|
|
1009
|
+
# Skip deleted files
|
|
1010
|
+
if file_info.get("deletedAt") is not None:
|
|
1011
|
+
continue
|
|
757
1012
|
|
|
758
1013
|
try:
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
1014
|
+
if update_progress:
|
|
1015
|
+
update_progress(f"Uploading {file_info['filename']}...")
|
|
1016
|
+
|
|
1017
|
+
# Get file path directly from storage without copying
|
|
1018
|
+
file_id = file_info["id"]
|
|
1019
|
+
experiment_dir = self.local._get_experiment_dir(
|
|
1020
|
+
owner, project, full_prefix
|
|
1021
|
+
)
|
|
1022
|
+
files_dir = experiment_dir / "files"
|
|
1023
|
+
|
|
1024
|
+
# Construct file path
|
|
1025
|
+
file_prefix = file_info["path"].lstrip("/") if file_info["path"] else ""
|
|
1026
|
+
if file_prefix:
|
|
1027
|
+
file_path = files_dir / file_prefix / file_id / file_info["filename"]
|
|
1028
|
+
else:
|
|
1029
|
+
file_path = files_dir / file_id / file_info["filename"]
|
|
1030
|
+
|
|
1031
|
+
# Upload to remote with correct parameters
|
|
1032
|
+
self.remote.upload_file(
|
|
1033
|
+
experiment_id=experiment_id,
|
|
1034
|
+
file_path=str(file_path),
|
|
1035
|
+
prefix=file_info.get("path", ""),
|
|
1036
|
+
filename=file_info["filename"],
|
|
1037
|
+
description=file_info.get("description"),
|
|
1038
|
+
tags=file_info.get("tags", []),
|
|
1039
|
+
metadata=file_info.get("metadata"),
|
|
1040
|
+
checksum=file_info["checksum"],
|
|
1041
|
+
content_type=file_info["contentType"],
|
|
1042
|
+
size_bytes=file_info["sizeBytes"],
|
|
1043
|
+
)
|
|
1044
|
+
|
|
1045
|
+
total_uploaded += 1
|
|
1046
|
+
# Track bytes
|
|
1047
|
+
result.bytes_uploaded += file_info.get("sizeBytes", 0)
|
|
1048
|
+
|
|
1049
|
+
if self.verbose:
|
|
1050
|
+
size_mb = file_info.get("sizeBytes", 0) / (1024 * 1024)
|
|
1051
|
+
console.print(
|
|
1052
|
+
f" [green]✓[/green] {file_info['filename']} ({size_mb:.1f}MB)"
|
|
1053
|
+
)
|
|
796
1054
|
|
|
797
1055
|
except Exception as e:
|
|
798
|
-
|
|
799
|
-
'success': False,
|
|
800
|
-
'uploaded': 0,
|
|
801
|
-
'skipped': 0,
|
|
802
|
-
'bytes': 0,
|
|
803
|
-
'error': str(e)
|
|
804
|
-
}
|
|
1056
|
+
result.failed.setdefault("files", []).append(f"{file_info['filename']}: {e}")
|
|
805
1057
|
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
"""Upload metrics in parallel with concurrency limit."""
|
|
809
|
-
if not exp_info.metric_names:
|
|
810
|
-
return 0
|
|
811
|
-
|
|
812
|
-
total_metrics = 0
|
|
813
|
-
|
|
814
|
-
# Use ThreadPoolExecutor for parallel uploads
|
|
815
|
-
with ThreadPoolExecutor(max_workers=self.max_concurrent_metrics) as executor:
|
|
816
|
-
# Submit all metric upload tasks
|
|
817
|
-
future_to_metric = {}
|
|
818
|
-
for metric_name in exp_info.metric_names:
|
|
819
|
-
metric_dir = exp_info.path / "metrics" / metric_name
|
|
820
|
-
future = executor.submit(
|
|
821
|
-
self._upload_single_metric,
|
|
822
|
-
experiment_id,
|
|
823
|
-
metric_name,
|
|
824
|
-
metric_dir,
|
|
825
|
-
result
|
|
826
|
-
)
|
|
827
|
-
future_to_metric[future] = metric_name
|
|
828
|
-
|
|
829
|
-
# Process completed uploads as they finish
|
|
830
|
-
for future in as_completed(future_to_metric):
|
|
831
|
-
metric_name = future_to_metric[future]
|
|
832
|
-
|
|
833
|
-
# Update progress
|
|
834
|
-
if update_progress:
|
|
835
|
-
update_progress(f"Uploading metric '{metric_name}'...")
|
|
836
|
-
|
|
837
|
-
try:
|
|
838
|
-
upload_result = future.result()
|
|
839
|
-
|
|
840
|
-
# Thread-safe update of shared state
|
|
841
|
-
with self._lock:
|
|
842
|
-
result.bytes_uploaded += upload_result['bytes']
|
|
843
|
-
|
|
844
|
-
if upload_result['success']:
|
|
845
|
-
total_metrics += 1
|
|
846
|
-
|
|
847
|
-
# Thread-safe console output
|
|
848
|
-
if self.verbose:
|
|
849
|
-
msg = f" [green]✓[/green] Uploaded {upload_result['uploaded']} data points for '{metric_name}'"
|
|
850
|
-
if upload_result['skipped'] > 0:
|
|
851
|
-
msg += f" (skipped {upload_result['skipped']} invalid)"
|
|
852
|
-
with self._lock:
|
|
853
|
-
console.print(msg)
|
|
854
|
-
else:
|
|
855
|
-
# Record failure
|
|
856
|
-
error_msg = f"{metric_name}: {upload_result['error']}"
|
|
857
|
-
with self._lock:
|
|
858
|
-
result.failed.setdefault("metrics", []).append(error_msg)
|
|
859
|
-
if self.verbose:
|
|
860
|
-
console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {upload_result['error']}")
|
|
861
|
-
|
|
862
|
-
except Exception as e:
|
|
863
|
-
# Handle unexpected errors
|
|
864
|
-
error_msg = f"{metric_name}: {str(e)}"
|
|
865
|
-
with self._lock:
|
|
866
|
-
result.failed.setdefault("metrics", []).append(error_msg)
|
|
867
|
-
if self.verbose:
|
|
868
|
-
console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {e}")
|
|
869
|
-
|
|
870
|
-
return total_metrics
|
|
871
|
-
|
|
872
|
-
def _upload_files(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
|
|
873
|
-
task_id=None, update_progress=None) -> int:
|
|
874
|
-
"""Upload files one by one."""
|
|
875
|
-
files_dir = exp_info.path / "files"
|
|
876
|
-
total_uploaded = 0
|
|
877
|
-
|
|
878
|
-
# Use LocalStorage to list files
|
|
879
|
-
try:
|
|
880
|
-
files_list = self.local.list_files(exp_info.project, exp_info.experiment)
|
|
881
|
-
|
|
882
|
-
for file_info in files_list:
|
|
883
|
-
# Skip deleted files
|
|
884
|
-
if file_info.get("deletedAt") is not None:
|
|
885
|
-
continue
|
|
886
|
-
|
|
887
|
-
try:
|
|
888
|
-
if update_progress:
|
|
889
|
-
update_progress(f"Uploading {file_info['filename']}...")
|
|
890
|
-
|
|
891
|
-
# Get file path directly from storage without copying
|
|
892
|
-
file_id = file_info["id"]
|
|
893
|
-
experiment_dir = self.local._get_experiment_dir(exp_info.project, exp_info.experiment)
|
|
894
|
-
files_dir = experiment_dir / "files"
|
|
895
|
-
|
|
896
|
-
# Construct file path
|
|
897
|
-
file_prefix = file_info["path"].lstrip("/") if file_info["path"] else ""
|
|
898
|
-
if file_prefix:
|
|
899
|
-
file_path = files_dir / file_prefix / file_id / file_info["filename"]
|
|
900
|
-
else:
|
|
901
|
-
file_path = files_dir / file_id / file_info["filename"]
|
|
902
|
-
|
|
903
|
-
# Upload to remote with correct parameters
|
|
904
|
-
self.remote.upload_file(
|
|
905
|
-
experiment_id=experiment_id,
|
|
906
|
-
file_path=str(file_path),
|
|
907
|
-
prefix=file_info.get("path", ""),
|
|
908
|
-
filename=file_info["filename"],
|
|
909
|
-
description=file_info.get("description"),
|
|
910
|
-
tags=file_info.get("tags", []),
|
|
911
|
-
metadata=file_info.get("metadata"),
|
|
912
|
-
checksum=file_info["checksum"],
|
|
913
|
-
content_type=file_info["contentType"],
|
|
914
|
-
size_bytes=file_info["sizeBytes"],
|
|
915
|
-
)
|
|
916
|
-
|
|
917
|
-
total_uploaded += 1
|
|
918
|
-
# Track bytes
|
|
919
|
-
result.bytes_uploaded += file_info.get("sizeBytes", 0)
|
|
920
|
-
|
|
921
|
-
if self.verbose:
|
|
922
|
-
size_mb = file_info.get("sizeBytes", 0) / (1024 * 1024)
|
|
923
|
-
console.print(f" [green]✓[/green] {file_info['filename']} ({size_mb:.1f}MB)")
|
|
924
|
-
|
|
925
|
-
except Exception as e:
|
|
926
|
-
result.failed.setdefault("files", []).append(f"{file_info['filename']}: {e}")
|
|
927
|
-
|
|
928
|
-
except Exception as e:
|
|
929
|
-
result.failed.setdefault("files", []).append(str(e))
|
|
1058
|
+
except Exception as e:
|
|
1059
|
+
result.failed.setdefault("files", []).append(str(e))
|
|
930
1060
|
|
|
931
|
-
|
|
932
|
-
|
|
1061
|
+
if self.verbose and not result.failed.get("files"):
|
|
1062
|
+
console.print(f" [green]✓[/green] Uploaded {total_uploaded} files")
|
|
933
1063
|
|
|
934
|
-
|
|
1064
|
+
return total_uploaded
|
|
935
1065
|
|
|
936
1066
|
|
|
937
1067
|
def cmd_upload(args: argparse.Namespace) -> int:
  """
  Execute upload command.

  Discovers experiments under ``args.path``, validates them, uploads the
  valid ones to the remote server and prints a summary. Progress is recorded
  in the state file (``args.state_file``) after every experiment so that an
  interrupted or partially failed run can be continued with ``--resume``.

  Args:
      args: Parsed command-line arguments. The attributes read here are
          ``dash_url``, ``api_key``, ``path``, ``state_file``, ``resume``,
          ``pref``, ``verbose``, ``dry_run``, ``strict``, ``target``,
          ``batch_size`` and the ``skip_logs`` / ``skip_metrics`` /
          ``skip_files`` / ``skip_params`` flags.

  Returns:
      Exit code (0 for success, 1 for error). A dry run, and a resume that
      finds nothing left to upload, both return 0.
  """
  import time

  # Load config
  config = Config()

  # Get remote URL (command line > config)
  remote_url = args.dash_url or config.remote_url
  if not remote_url:
    console.print("[red]Error:[/red] --dash-url is required (or set in config)")
    return 1

  # Get API key (command line > config > auto-load from storage)
  # RemoteClient will auto-load from storage if api_key is None
  api_key = args.api_key or config.api_key

  # Discover experiments
  local_path = Path(args.path)
  if not local_path.exists():
    console.print(f"[red]Error:[/red] Local storage path does not exist: {local_path}")
    return 1

  # Handle state file for resume functionality
  state_file = Path(args.state_file)
  upload_state = None

  if args.resume:
    upload_state = UploadState.load(state_file)
    if upload_state:
      # Validate state matches current upload
      if upload_state.local_path != str(local_path.absolute()):
        console.print(
          "[yellow]Warning:[/yellow] State file local path doesn't match. Starting fresh upload."
        )
        upload_state = None
      elif upload_state.remote_url != remote_url:
        console.print(
          "[yellow]Warning:[/yellow] State file remote URL doesn't match. Starting fresh upload."
        )
        upload_state = None
      else:
        console.print(
          f"[green]Resuming previous upload from {upload_state.timestamp}[/green]"
        )
        console.print(
          f" Already completed: {len(upload_state.completed_experiments)} experiments"
        )
        console.print(f" Failed: {len(upload_state.failed_experiments)} experiments")
    else:
      console.print(
        "[yellow]No previous upload state found. Starting fresh upload.[/yellow]"
      )

  # Create new state if not resuming
  if not upload_state:
    upload_state = UploadState(
      local_path=str(local_path.absolute()),
      remote_url=remote_url,
    )

  console.print(f"[bold]Scanning local storage:[/bold] {local_path.absolute()}")
  experiments = discover_experiments(
    local_path,
    project_filter=args.pref,  # Using --prefix/-p argument
    experiment_filter=None,
  )

  if not experiments:
    if args.pref:
      console.print(f"[yellow]No experiments found matching pattern:[/yellow] {args.pref}")
    else:
      console.print("[yellow]No experiments found in local storage[/yellow]")
    return 1

  # Filter out already completed experiments when resuming
  if args.resume and upload_state.completed_experiments:
    original_count = len(experiments)
    experiments = [
      exp
      for exp in experiments
      if f"{exp.project}/{exp.experiment}" not in upload_state.completed_experiments
    ]
    skipped_count = original_count - len(experiments)
    if skipped_count > 0:
      console.print(
        f"[dim]Skipping {skipped_count} already completed experiment(s)[/dim]"
      )
    if not experiments:
      # Everything discovered was uploaded by an earlier run. Without this
      # guard the empty list falls through to "No valid experiments to
      # upload" and the command exits with an error code.
      console.print("[green]All experiments already uploaded. Nothing to do.[/green]")
      return 0

  console.print(f"[green]Found {len(experiments)} experiment(s) to upload[/green]")

  # Display discovered experiments
  if args.verbose or args.dry_run:
    console.print("\n[bold]Discovered experiments:[/bold]")
    for exp in experiments:
      parts = []
      if exp.has_logs:
        parts.append("logs")
      if exp.has_params:
        parts.append("params")
      if exp.metric_names:
        parts.append(f"{len(exp.metric_names)} metrics")
      if exp.file_count:
        size_mb = exp.estimated_size / (1024 * 1024)
        parts.append(f"{exp.file_count} files ({size_mb:.1f}MB)")

      details = ", ".join(parts) if parts else "metadata only"
      console.print(
        f" [cyan]•[/cyan] {exp.project}/{exp.experiment} [dim]({details})[/dim]"
      )

  # Dry-run mode: stop here
  if args.dry_run:
    console.print("\n[yellow bold]DRY RUN[/yellow bold] - No data will be uploaded")
    console.print("Run without --dry-run to proceed with upload.")
    return 0

  # NOTE: past this point args.dry_run is always false, so state is saved and
  # cleaned up unconditionally.

  # Validate experiments
  console.print("\n[bold]Validating experiments...[/bold]")
  validator = ExperimentValidator(strict=args.strict)
  validation_results = {}
  valid_experiments = []
  invalid_experiments = []

  for exp in experiments:
    exp_key = f"{exp.project}/{exp.experiment}"
    validation = validator.validate_experiment(exp)
    validation_results[exp_key] = validation

    if validation.is_valid:
      valid_experiments.append(exp)
    else:
      invalid_experiments.append(exp)

    # Show warnings and errors (errors are always shown, warnings only in
    # verbose mode)
    if args.verbose or validation.errors:
      if validation.errors:
        console.print(f" [red]✗[/red] {exp_key}:")
        for error in validation.errors:
          console.print(f" [red]{error}[/red]")
      elif validation.warnings:
        console.print(f" [yellow]⚠[/yellow] {exp_key}:")
        for warning in validation.warnings:
          console.print(f" [yellow]{warning}[/yellow]")

  if invalid_experiments:
    console.print(
      f"\n[yellow]{len(invalid_experiments)} experiment(s) failed validation and will be skipped[/yellow]"
    )
    if args.strict:
      console.print("[red]Error: Validation failed in --strict mode[/red]")
      return 1

  if not valid_experiments:
    console.print("[red]Error: No valid experiments to upload[/red]")
    return 1

  console.print(
    f"[green]{len(valid_experiments)} experiment(s) ready to upload[/green]"
  )

  # Initialize remote client and local storage
  remote_client = RemoteClient(base_url=remote_url, api_key=api_key)
  local_storage = LocalStorage(root_path=local_path)

  # Upload experiments with progress tracking
  console.print(f"\n[bold]Uploading to:[/bold] {remote_url}")
  if args.target:
    console.print(f"[bold]Target prefix:[/bold] {args.target}")
  results = []

  # Track upload timing with a monotonic clock so that wall-clock adjustments
  # during a long upload cannot distort the elapsed time or average speed.
  start_time = time.monotonic()

  # Create progress bar for overall upload
  with Progress(
    SpinnerColumn(),
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    TaskProgressColumn(),
    console=console,
    transient=not args.verbose,  # Keep progress visible in verbose mode
  ) as progress:
    # Create uploader with progress tracking
    uploader = ExperimentUploader(
      local_storage=local_storage,
      remote_client=remote_client,
      batch_size=args.batch_size,
      skip_logs=args.skip_logs,
      skip_metrics=args.skip_metrics,
      skip_files=args.skip_files,
      skip_params=args.skip_params,
      verbose=args.verbose,
      progress=progress,
      target_prefix=args.target,
    )

    for i, exp in enumerate(valid_experiments, start=1):
      exp_key = f"{exp.project}/{exp.experiment}"

      # Create task for this experiment
      task_id = progress.add_task(
        f"[{i}/{len(valid_experiments)}] {exp_key}",
        total=100,  # Will be updated with actual steps
      )

      # Update state - mark as in progress
      upload_state.in_progress_experiment = exp_key
      upload_state.save(state_file)

      validation = validation_results[exp_key]
      result = uploader.upload_experiment(exp, validation, task_id=task_id)
      results.append(result)

      # Update state - mark as completed or failed. On a resumed run the
      # loaded state may already list this experiment as failed, so keep each
      # key in exactly one list and never record it twice.
      upload_state.in_progress_experiment = None
      if result.success:
        if exp_key not in upload_state.completed_experiments:
          upload_state.completed_experiments.append(exp_key)
        upload_state.failed_experiments = [
          key for key in upload_state.failed_experiments if key != exp_key
        ]
      elif exp_key not in upload_state.failed_experiments:
        upload_state.failed_experiments.append(exp_key)

      upload_state.save(state_file)

      # Update task to completed
      progress.update(task_id, completed=100, total=100)

      if not args.verbose:
        # Show brief status
        if result.success:
          parts = []
          if result.uploaded.get("params"):
            parts.append(f"{result.uploaded['params']} params")
          if result.uploaded.get("logs"):
            parts.append(f"{result.uploaded['logs']} logs")
          if result.uploaded.get("metrics"):
            parts.append(f"{result.uploaded['metrics']} metrics")
          if result.uploaded.get("files"):
            parts.append(f"{result.uploaded['files']} files")
          status = ", ".join(parts) if parts else "metadata only"
          console.print(f" [green]✓[/green] Uploaded ({status})")
        else:
          console.print(" [red]✗[/red] Failed")
          if result.errors:
            for error in result.errors[:3]:  # Show first 3 errors
              console.print(f" [red]{error}[/red]")

  # Calculate timing
  elapsed_time = time.monotonic() - start_time
  total_bytes = sum(r.bytes_uploaded for r in results)

  # Print summary with rich Table
  console.print()

  successful = [r for r in results if r.success]
  failed = [r for r in results if not r.success]

  # Create summary table
  summary_table = Table(title="Upload Summary", show_header=True, header_style="bold")
  summary_table.add_column("Status", style="cyan")
  summary_table.add_column("Count", justify="right")

  summary_table.add_row(
    "Successful", f"[green]{len(successful)}/{len(results)}[/green]"
  )
  if failed:
    summary_table.add_row("Failed", f"[red]{len(failed)}/{len(results)}[/red]")

  # Add timing information
  summary_table.add_row("Total Time", f"{elapsed_time:.2f}s")

  # Calculate and display upload speed
  if total_bytes > 0 and elapsed_time > 0:
    # Convert to appropriate unit
    if total_bytes < 1024 * 1024:  # Less than 1 MB
      speed_kb = (total_bytes / 1024) / elapsed_time
      summary_table.add_row("Avg Speed", f"{speed_kb:.2f} KB/s")
    else:  # 1 MB or more
      speed_mb = (total_bytes / (1024 * 1024)) / elapsed_time
      summary_table.add_row("Avg Speed", f"{speed_mb:.2f} MB/s")

  console.print(summary_table)

  # Show failed experiments
  if failed:
    console.print("\n[bold red]Failed Experiments:[/bold red]")
    for failed_result in failed:
      console.print(f" [red]✗[/red] {failed_result.experiment}")
      for error in failed_result.errors:
        console.print(f" [dim]{error}[/dim]")

  # Data statistics
  total_logs = sum(r.uploaded.get("logs", 0) for r in results)
  total_metrics = sum(r.uploaded.get("metrics", 0) for r in results)
  total_files = sum(r.uploaded.get("files", 0) for r in results)

  if total_logs or total_metrics or total_files:
    data_table = Table(title="Data Uploaded", show_header=True, header_style="bold")
    data_table.add_column("Type", style="cyan")
    data_table.add_column("Count", justify="right", style="green")

    if total_logs:
      data_table.add_row("Logs", f"{total_logs} entries")
    if total_metrics:
      data_table.add_row("Metrics", f"{total_metrics} metrics")
    if total_files:
      data_table.add_row("Files", f"{total_files} files")

    console.print()
    console.print(data_table)

  # Clean up state file if all uploads succeeded
  if not failed and state_file.exists():
    state_file.unlink()
    console.print("\n[dim]Upload complete. State file removed.[/dim]")
  elif failed:
    console.print(
      f"\n[yellow]State saved to {state_file}. Use --resume to retry failed uploads.[/yellow]"
    )

  # Return exit code
  return 0 if not failed else 1
|