ml-dash 0.6.2__py3-none-any.whl → 0.6.2rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml_dash/__init__.py +64 -36
- ml_dash/auth/token_storage.py +226 -267
- ml_dash/auto_start.py +15 -28
- ml_dash/cli.py +2 -16
- ml_dash/cli_commands/download.py +667 -757
- ml_dash/cli_commands/list.py +13 -146
- ml_dash/cli_commands/login.py +183 -190
- ml_dash/cli_commands/upload.py +1141 -1291
- ml_dash/client.py +6 -79
- ml_dash/config.py +119 -119
- ml_dash/experiment.py +1034 -1234
- ml_dash/files.py +224 -339
- ml_dash/log.py +7 -7
- ml_dash/metric.py +100 -359
- ml_dash/params.py +6 -6
- ml_dash/remote_auto_start.py +17 -20
- ml_dash/run.py +65 -211
- ml_dash/storage.py +1081 -1051
- {ml_dash-0.6.2.dist-info → ml_dash-0.6.2rc1.dist-info}/METADATA +14 -12
- ml_dash-0.6.2rc1.dist-info/RECORD +30 -0
- {ml_dash-0.6.2.dist-info → ml_dash-0.6.2rc1.dist-info}/WHEEL +1 -1
- ml_dash/cli_commands/api.py +0 -165
- ml_dash/cli_commands/profile.py +0 -92
- ml_dash/snowflake.py +0 -173
- ml_dash-0.6.2.dist-info/RECORD +0 -33
- {ml_dash-0.6.2.dist-info → ml_dash-0.6.2rc1.dist-info}/entry_points.txt +0 -0
ml_dash/cli_commands/upload.py
CHANGED
|
@@ -2,25 +2,20 @@
|
|
|
2
2
|
|
|
3
3
|
import argparse
|
|
4
4
|
import json
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import List, Dict, Any, Optional
|
|
7
|
+
from dataclasses import dataclass, field
|
|
5
8
|
import threading
|
|
6
9
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
7
|
-
from dataclasses import dataclass, field
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
from typing import Any, Dict, List, Optional
|
|
10
10
|
|
|
11
11
|
from rich.console import Console
|
|
12
|
-
from rich.progress import
|
|
13
|
-
BarColumn,
|
|
14
|
-
Progress,
|
|
15
|
-
SpinnerColumn,
|
|
16
|
-
TaskProgressColumn,
|
|
17
|
-
TextColumn,
|
|
18
|
-
)
|
|
12
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
|
|
19
13
|
from rich.table import Table
|
|
14
|
+
from rich.panel import Panel
|
|
20
15
|
|
|
16
|
+
from ..storage import LocalStorage
|
|
21
17
|
from ..client import RemoteClient
|
|
22
18
|
from ..config import Config
|
|
23
|
-
from ..storage import LocalStorage
|
|
24
19
|
|
|
25
20
|
# Initialize rich console
|
|
26
21
|
console = Console()
|
|
@@ -28,1371 +23,1226 @@ console = Console()
|
|
|
28
23
|
|
|
29
24
|
@dataclass
|
|
30
25
|
class ExperimentInfo:
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
estimated_size: int = 0 # in bytes
|
|
26
|
+
"""Information about an experiment to upload."""
|
|
27
|
+
project: str
|
|
28
|
+
experiment: str
|
|
29
|
+
path: Path
|
|
30
|
+
folder: Optional[str] = None
|
|
31
|
+
has_logs: bool = False
|
|
32
|
+
has_params: bool = False
|
|
33
|
+
metric_names: List[str] = field(default_factory=list)
|
|
34
|
+
file_count: int = 0
|
|
35
|
+
estimated_size: int = 0 # in bytes
|
|
42
36
|
|
|
43
37
|
|
|
44
38
|
@dataclass
|
|
45
39
|
class ValidationResult:
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
valid_data: Dict[str, Any] = field(default_factory=dict)
|
|
40
|
+
"""Result of experiment validation."""
|
|
41
|
+
is_valid: bool = True
|
|
42
|
+
warnings: List[str] = field(default_factory=list)
|
|
43
|
+
errors: List[str] = field(default_factory=list)
|
|
44
|
+
valid_data: Dict[str, Any] = field(default_factory=dict)
|
|
52
45
|
|
|
53
46
|
|
|
54
47
|
@dataclass
|
|
55
48
|
class UploadResult:
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
bytes_uploaded: int = 0 # Total bytes uploaded
|
|
49
|
+
"""Result of uploading an experiment."""
|
|
50
|
+
experiment: str
|
|
51
|
+
success: bool = False
|
|
52
|
+
uploaded: Dict[str, int] = field(default_factory=dict) # {"logs": 100, "metrics": 3}
|
|
53
|
+
failed: Dict[str, List[str]] = field(default_factory=dict) # {"files": ["error msg"]}
|
|
54
|
+
errors: List[str] = field(default_factory=list)
|
|
55
|
+
bytes_uploaded: int = 0 # Total bytes uploaded
|
|
64
56
|
|
|
65
57
|
|
|
66
58
|
@dataclass
|
|
67
59
|
class UploadState:
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
in_progress_experiment=data.get("in_progress_experiment"),
|
|
99
|
-
timestamp=data.get("timestamp"),
|
|
100
|
-
)
|
|
101
|
-
|
|
102
|
-
def save(self, path: Path):
|
|
103
|
-
"""Save state to file."""
|
|
104
|
-
import datetime
|
|
105
|
-
|
|
106
|
-
self.timestamp = datetime.datetime.now().isoformat()
|
|
107
|
-
with open(path, "w") as f:
|
|
108
|
-
json.dump(self.to_dict(), f, indent=2)
|
|
60
|
+
"""Tracks upload state for resume functionality."""
|
|
61
|
+
local_path: str
|
|
62
|
+
remote_url: str
|
|
63
|
+
completed_experiments: List[str] = field(default_factory=list) # ["project/experiment"]
|
|
64
|
+
failed_experiments: List[str] = field(default_factory=list)
|
|
65
|
+
in_progress_experiment: Optional[str] = None
|
|
66
|
+
timestamp: Optional[str] = None
|
|
67
|
+
|
|
68
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
69
|
+
"""Convert to dictionary for JSON serialization."""
|
|
70
|
+
return {
|
|
71
|
+
"local_path": self.local_path,
|
|
72
|
+
"remote_url": self.remote_url,
|
|
73
|
+
"completed_experiments": self.completed_experiments,
|
|
74
|
+
"failed_experiments": self.failed_experiments,
|
|
75
|
+
"in_progress_experiment": self.in_progress_experiment,
|
|
76
|
+
"timestamp": self.timestamp,
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
@classmethod
|
|
80
|
+
def from_dict(cls, data: Dict[str, Any]) -> "UploadState":
|
|
81
|
+
"""Create from dictionary."""
|
|
82
|
+
return cls(
|
|
83
|
+
local_path=data["local_path"],
|
|
84
|
+
remote_url=data["remote_url"],
|
|
85
|
+
completed_experiments=data.get("completed_experiments", []),
|
|
86
|
+
failed_experiments=data.get("failed_experiments", []),
|
|
87
|
+
in_progress_experiment=data.get("in_progress_experiment"),
|
|
88
|
+
timestamp=data.get("timestamp"),
|
|
89
|
+
)
|
|
109
90
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
91
|
+
def save(self, path: Path):
|
|
92
|
+
"""Save state to file."""
|
|
93
|
+
import datetime
|
|
94
|
+
self.timestamp = datetime.datetime.now().isoformat()
|
|
95
|
+
with open(path, "w") as f:
|
|
96
|
+
json.dump(self.to_dict(), f, indent=2)
|
|
97
|
+
|
|
98
|
+
@classmethod
|
|
99
|
+
def load(cls, path: Path) -> Optional["UploadState"]:
|
|
100
|
+
"""Load state from file."""
|
|
101
|
+
if not path.exists():
|
|
102
|
+
return None
|
|
103
|
+
try:
|
|
104
|
+
with open(path, "r") as f:
|
|
105
|
+
data = json.load(f)
|
|
106
|
+
return cls.from_dict(data)
|
|
107
|
+
except (json.JSONDecodeError, IOError, KeyError):
|
|
108
|
+
return None
|
|
121
109
|
|
|
122
110
|
|
|
123
111
|
def add_parser(subparsers) -> argparse.ArgumentParser:
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
# Positional argument
|
|
132
|
-
parser.add_argument(
|
|
133
|
-
"path",
|
|
134
|
-
nargs="?",
|
|
135
|
-
default="./.dash",
|
|
136
|
-
help="Local storage directory to upload from (default: ./.dash)",
|
|
137
|
-
)
|
|
138
|
-
|
|
139
|
-
# Remote configuration
|
|
140
|
-
parser.add_argument(
|
|
141
|
-
"--dash-url",
|
|
142
|
-
type=str,
|
|
143
|
-
help="ML-Dash server URL (defaults to config or https://api.dash.ml)",
|
|
144
|
-
)
|
|
145
|
-
parser.add_argument(
|
|
146
|
-
"--api-key",
|
|
147
|
-
type=str,
|
|
148
|
-
help="JWT token for authentication (optional - auto-loads from 'ml-dash login' if not provided)",
|
|
149
|
-
)
|
|
150
|
-
|
|
151
|
-
"""
|
|
152
|
-
|
|
153
|
-
cd .dash/geyang
|
|
154
|
-
cd iclr_2026
|
|
155
|
-
|
|
156
|
-
ml-dash upload -p geyang/new-run * # this uploads all of the folders to geyang/new-run.
|
|
157
|
-
|
|
158
|
-
or
|
|
159
|
-
|
|
160
|
-
ml-dash upload --prefix geyang/new-run/local-results ./* # uploads under the local-results prefix.
|
|
161
|
-
|
|
162
|
-
ml-dash download --prefix geyang/new-run/zehua-results --filter *.mp4 --dryrun --verbose
|
|
163
|
-
|
|
164
|
-
mo-dash list --prefix geyang/new-run/zehua-results --filter xxx-xxx --verbose
|
|
165
|
-
|
|
166
|
-
mo-dash list-exp --prefix geyang/new-run/zehua-results --filter xxx-xxx --verbose
|
|
167
|
-
|
|
168
|
-
"""
|
|
169
|
-
|
|
170
|
-
# Scope control
|
|
171
|
-
# Ge: project should be {owner}/{proj_name}
|
|
172
|
-
parser.add_argument(
|
|
173
|
-
"-p",
|
|
174
|
-
"--pref",
|
|
175
|
-
"--prefix",
|
|
176
|
-
"--proj",
|
|
177
|
-
"--project",
|
|
178
|
-
type=str,
|
|
179
|
-
help="Filter experiments by prefix pattern (supports glob: 'tom/*/exp*', 'alice/project-?/baseline')",
|
|
180
|
-
)
|
|
181
|
-
|
|
182
|
-
# Target prefix for server (like scp destination)
|
|
183
|
-
parser.add_argument(
|
|
184
|
-
"-t",
|
|
185
|
-
"--target",
|
|
186
|
-
type=str,
|
|
187
|
-
help="Target prefix/directory on server where experiments will be uploaded (e.g., 'alice/shared-project'). Similar to 'scp local/ remote-path/'",
|
|
188
|
-
)
|
|
189
|
-
# parser.add_argument(
|
|
190
|
-
# "--experiment",
|
|
191
|
-
# type=str,
|
|
192
|
-
# help="Upload only this specific experiment (requires --project)",
|
|
193
|
-
# )
|
|
194
|
-
|
|
195
|
-
# Data filtering
|
|
196
|
-
parser.add_argument(
|
|
197
|
-
"--skip-logs",
|
|
198
|
-
action="store_true",
|
|
199
|
-
help="Don't upload logs",
|
|
200
|
-
)
|
|
201
|
-
parser.add_argument(
|
|
202
|
-
"--skip-metrics",
|
|
203
|
-
action="store_true",
|
|
204
|
-
help="Don't upload metrics",
|
|
205
|
-
)
|
|
206
|
-
parser.add_argument(
|
|
207
|
-
"--skip-files",
|
|
208
|
-
action="store_true",
|
|
209
|
-
help="Don't upload files",
|
|
210
|
-
)
|
|
211
|
-
parser.add_argument(
|
|
212
|
-
"--skip-params",
|
|
213
|
-
action="store_true",
|
|
214
|
-
help="Don't upload parameters",
|
|
215
|
-
)
|
|
216
|
-
|
|
217
|
-
# Behavior control
|
|
218
|
-
parser.add_argument(
|
|
219
|
-
"--dry-run",
|
|
220
|
-
action="store_true",
|
|
221
|
-
help="Show what would be uploaded without uploading",
|
|
222
|
-
)
|
|
223
|
-
parser.add_argument(
|
|
224
|
-
"--strict",
|
|
225
|
-
action="store_true",
|
|
226
|
-
help="Fail on any validation error (default: skip invalid data)",
|
|
227
|
-
)
|
|
228
|
-
parser.add_argument(
|
|
229
|
-
"-v",
|
|
230
|
-
"--verbose",
|
|
231
|
-
action="store_true",
|
|
232
|
-
help="Show detailed progress",
|
|
233
|
-
)
|
|
234
|
-
parser.add_argument(
|
|
235
|
-
"--batch-size",
|
|
236
|
-
type=int,
|
|
237
|
-
default=100,
|
|
238
|
-
help="Batch size for logs/metrics (default: 100)",
|
|
239
|
-
)
|
|
240
|
-
parser.add_argument(
|
|
241
|
-
"--resume",
|
|
242
|
-
action="store_true",
|
|
243
|
-
help="Resume previous interrupted upload",
|
|
244
|
-
)
|
|
245
|
-
parser.add_argument(
|
|
246
|
-
"--state-file",
|
|
247
|
-
type=str,
|
|
248
|
-
default=".dash-upload-state.json",
|
|
249
|
-
help="Path to state file for resume (default: .dash-upload-state.json)",
|
|
250
|
-
)
|
|
251
|
-
|
|
252
|
-
return parser
|
|
112
|
+
"""Add upload command parser."""
|
|
113
|
+
parser = subparsers.add_parser(
|
|
114
|
+
"upload",
|
|
115
|
+
help="Upload local experiments to remote server",
|
|
116
|
+
description="Upload locally-stored ML-Dash experiment data to a remote server.",
|
|
117
|
+
)
|
|
253
118
|
|
|
119
|
+
# Positional argument
|
|
120
|
+
parser.add_argument(
|
|
121
|
+
"path",
|
|
122
|
+
nargs="?",
|
|
123
|
+
default="./.ml-dash",
|
|
124
|
+
help="Local storage directory to upload from (default: ./.ml-dash)",
|
|
125
|
+
)
|
|
254
126
|
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
Args:
|
|
267
|
-
local_path: Root path of local storage
|
|
268
|
-
project_filter: Glob pattern to filter experiments by prefix (e.g., "tom/*/exp*")
|
|
269
|
-
experiment_filter: Only discover this experiment (requires project_filter)
|
|
270
|
-
|
|
271
|
-
Returns:
|
|
272
|
-
List of ExperimentInfo objects
|
|
273
|
-
"""
|
|
274
|
-
import fnmatch
|
|
275
|
-
|
|
276
|
-
local_path = Path(local_path)
|
|
277
|
-
|
|
278
|
-
if not local_path.exists():
|
|
279
|
-
return []
|
|
280
|
-
|
|
281
|
-
experiments = []
|
|
282
|
-
|
|
283
|
-
# Find all experiment.json files recursively
|
|
284
|
-
for exp_json in local_path.rglob("*/experiment.json"):
|
|
285
|
-
exp_dir = exp_json.parent
|
|
286
|
-
|
|
287
|
-
# Read prefix from experiment.json first
|
|
288
|
-
prefix = None
|
|
289
|
-
try:
|
|
290
|
-
with open(exp_json, "r") as f:
|
|
291
|
-
metadata = json.load(f)
|
|
292
|
-
prefix = metadata.get("prefix")
|
|
293
|
-
except:
|
|
294
|
-
pass
|
|
295
|
-
|
|
296
|
-
# Extract project and experiment names from PREFIX (not path)
|
|
297
|
-
# This handles nested folders correctly
|
|
298
|
-
# Prefix format: owner/project/folder.../experiment
|
|
299
|
-
try:
|
|
300
|
-
relative_path = exp_dir.relative_to(local_path)
|
|
301
|
-
full_relative_path = str(relative_path)
|
|
302
|
-
|
|
303
|
-
if prefix:
|
|
304
|
-
# Parse from prefix for accuracy
|
|
305
|
-
prefix_parts = prefix.strip("/").split("/")
|
|
306
|
-
if len(prefix_parts) < 3:
|
|
307
|
-
continue # Need at least owner/project/experiment
|
|
308
|
-
|
|
309
|
-
# owner = prefix_parts[0]
|
|
310
|
-
project_name = prefix_parts[1]
|
|
311
|
-
exp_name = prefix_parts[-1]
|
|
312
|
-
else:
|
|
313
|
-
# Fallback to path-based parsing (legacy support)
|
|
314
|
-
parts = relative_path.parts
|
|
315
|
-
if len(parts) < 2:
|
|
316
|
-
continue
|
|
317
|
-
exp_name = parts[-1]
|
|
318
|
-
project_name = parts[-2]
|
|
319
|
-
|
|
320
|
-
# Apply filters with glob pattern support
|
|
321
|
-
if project_filter:
|
|
322
|
-
# Support glob pattern matching on the full relative path
|
|
323
|
-
if not fnmatch.fnmatch(full_relative_path, project_filter):
|
|
324
|
-
continue
|
|
325
|
-
if experiment_filter and exp_name != experiment_filter:
|
|
326
|
-
continue
|
|
327
|
-
|
|
328
|
-
# Create experiment info
|
|
329
|
-
exp_info = ExperimentInfo(
|
|
330
|
-
project=project_name,
|
|
331
|
-
experiment=exp_name,
|
|
332
|
-
path=exp_dir,
|
|
333
|
-
prefix=prefix,
|
|
334
|
-
)
|
|
335
|
-
except (ValueError, IndexError):
|
|
336
|
-
continue
|
|
337
|
-
|
|
338
|
-
# Check for parameters
|
|
339
|
-
params_file = exp_dir / "parameters.json"
|
|
340
|
-
exp_info.has_params = params_file.exists()
|
|
341
|
-
|
|
342
|
-
# Check for logs
|
|
343
|
-
logs_file = exp_dir / "logs/logs.jsonl"
|
|
344
|
-
exp_info.has_logs = logs_file.exists()
|
|
345
|
-
|
|
346
|
-
# Check for metrics
|
|
347
|
-
metrics_dir = exp_dir / "metrics"
|
|
348
|
-
if metrics_dir.exists():
|
|
349
|
-
for metric_dir in metrics_dir.iterdir():
|
|
350
|
-
if metric_dir.is_dir():
|
|
351
|
-
data_file = metric_dir / "data.jsonl"
|
|
352
|
-
if data_file.exists():
|
|
353
|
-
exp_info.metric_names.append(metric_dir.name)
|
|
354
|
-
|
|
355
|
-
# Check for files
|
|
356
|
-
files_dir = exp_dir / "files"
|
|
357
|
-
if files_dir.exists():
|
|
358
|
-
try:
|
|
359
|
-
# Count files recursively
|
|
360
|
-
exp_info.file_count = sum(1 for _ in files_dir.rglob("*") if _.is_file())
|
|
361
|
-
|
|
362
|
-
# Estimate size
|
|
363
|
-
exp_info.estimated_size = sum(
|
|
364
|
-
f.stat().st_size for f in files_dir.rglob("*") if f.is_file()
|
|
365
|
-
)
|
|
366
|
-
except (OSError, PermissionError):
|
|
367
|
-
pass
|
|
127
|
+
# Remote configuration
|
|
128
|
+
parser.add_argument(
|
|
129
|
+
"--remote",
|
|
130
|
+
type=str,
|
|
131
|
+
help="Remote server URL (required unless set in config)",
|
|
132
|
+
)
|
|
133
|
+
parser.add_argument(
|
|
134
|
+
"--api-key",
|
|
135
|
+
type=str,
|
|
136
|
+
help="JWT token for authentication (optional - auto-loads from 'ml-dash login' if not provided)",
|
|
137
|
+
)
|
|
368
138
|
|
|
369
|
-
|
|
139
|
+
# Scope control
|
|
140
|
+
parser.add_argument(
|
|
141
|
+
"--project",
|
|
142
|
+
type=str,
|
|
143
|
+
help="Upload only experiments from this project",
|
|
144
|
+
)
|
|
145
|
+
parser.add_argument(
|
|
146
|
+
"--experiment",
|
|
147
|
+
type=str,
|
|
148
|
+
help="Upload only this specific experiment (requires --project)",
|
|
149
|
+
)
|
|
370
150
|
|
|
371
|
-
|
|
151
|
+
# Data filtering
|
|
152
|
+
parser.add_argument(
|
|
153
|
+
"--skip-logs",
|
|
154
|
+
action="store_true",
|
|
155
|
+
help="Don't upload logs",
|
|
156
|
+
)
|
|
157
|
+
parser.add_argument(
|
|
158
|
+
"--skip-metrics",
|
|
159
|
+
action="store_true",
|
|
160
|
+
help="Don't upload metrics",
|
|
161
|
+
)
|
|
162
|
+
parser.add_argument(
|
|
163
|
+
"--skip-files",
|
|
164
|
+
action="store_true",
|
|
165
|
+
help="Don't upload files",
|
|
166
|
+
)
|
|
167
|
+
parser.add_argument(
|
|
168
|
+
"--skip-params",
|
|
169
|
+
action="store_true",
|
|
170
|
+
help="Don't upload parameters",
|
|
171
|
+
)
|
|
372
172
|
|
|
173
|
+
# Behavior control
|
|
174
|
+
parser.add_argument(
|
|
175
|
+
"--dry-run",
|
|
176
|
+
action="store_true",
|
|
177
|
+
help="Show what would be uploaded without uploading",
|
|
178
|
+
)
|
|
179
|
+
parser.add_argument(
|
|
180
|
+
"--strict",
|
|
181
|
+
action="store_true",
|
|
182
|
+
help="Fail on any validation error (default: skip invalid data)",
|
|
183
|
+
)
|
|
184
|
+
parser.add_argument(
|
|
185
|
+
"-v", "--verbose",
|
|
186
|
+
action="store_true",
|
|
187
|
+
help="Show detailed progress",
|
|
188
|
+
)
|
|
189
|
+
parser.add_argument(
|
|
190
|
+
"--batch-size",
|
|
191
|
+
type=int,
|
|
192
|
+
default=100,
|
|
193
|
+
help="Batch size for logs/metrics (default: 100)",
|
|
194
|
+
)
|
|
195
|
+
parser.add_argument(
|
|
196
|
+
"--resume",
|
|
197
|
+
action="store_true",
|
|
198
|
+
help="Resume previous interrupted upload",
|
|
199
|
+
)
|
|
200
|
+
parser.add_argument(
|
|
201
|
+
"--state-file",
|
|
202
|
+
type=str,
|
|
203
|
+
default=".ml-dash-upload-state.json",
|
|
204
|
+
help="Path to state file for resume (default: .ml-dash-upload-state.json)",
|
|
205
|
+
)
|
|
373
206
|
|
|
374
|
-
|
|
375
|
-
"""Validates local experiment data before upload."""
|
|
207
|
+
return parser
|
|
376
208
|
|
|
377
|
-
def __init__(self, strict: bool = False):
|
|
378
|
-
"""
|
|
379
|
-
Initialize validator.
|
|
380
209
|
|
|
381
|
-
|
|
382
|
-
|
|
210
|
+
def discover_experiments(
|
|
211
|
+
local_path: Path,
|
|
212
|
+
project_filter: Optional[str] = None,
|
|
213
|
+
experiment_filter: Optional[str] = None,
|
|
214
|
+
) -> List[ExperimentInfo]:
|
|
383
215
|
"""
|
|
384
|
-
|
|
216
|
+
Discover experiments in local storage directory.
|
|
385
217
|
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
Validate experiment directory structure and data.
|
|
218
|
+
Supports both flat (local_path/project/experiment) and folder-based
|
|
219
|
+
(local_path/folder/project/experiment) hierarchies.
|
|
389
220
|
|
|
390
221
|
Args:
|
|
391
|
-
|
|
222
|
+
local_path: Root path of local storage
|
|
223
|
+
project_filter: Only discover experiments in this project
|
|
224
|
+
experiment_filter: Only discover this experiment (requires project_filter)
|
|
392
225
|
|
|
393
226
|
Returns:
|
|
394
|
-
|
|
227
|
+
List of ExperimentInfo objects
|
|
395
228
|
"""
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
# 3. Validate logs (optional)
|
|
408
|
-
self._validate_logs(exp_info, result)
|
|
409
|
-
|
|
410
|
-
# 4. Validate metrics (optional)
|
|
411
|
-
self._validate_metrics(exp_info, result)
|
|
412
|
-
|
|
413
|
-
# 5. Validate files (optional)
|
|
414
|
-
self._validate_files(exp_info, result)
|
|
415
|
-
|
|
416
|
-
# In strict mode, any warning becomes an error
|
|
417
|
-
if self.strict and result.warnings:
|
|
418
|
-
result.errors.extend(result.warnings)
|
|
419
|
-
result.warnings = []
|
|
420
|
-
result.is_valid = False
|
|
421
|
-
|
|
422
|
-
return result
|
|
423
|
-
|
|
424
|
-
def _validate_experiment_metadata(
|
|
425
|
-
self, exp_info: ExperimentInfo, result: ValidationResult
|
|
426
|
-
) -> bool:
|
|
427
|
-
"""Validate experiment.json exists and is valid."""
|
|
428
|
-
exp_json = exp_info.path / "experiment.json"
|
|
429
|
-
|
|
430
|
-
if not exp_json.exists():
|
|
431
|
-
result.errors.append("Missing experiment.json")
|
|
432
|
-
return False
|
|
433
|
-
|
|
434
|
-
try:
|
|
435
|
-
with open(exp_json, "r") as f:
|
|
436
|
-
metadata = json.load(f)
|
|
437
|
-
|
|
438
|
-
# Check required fields
|
|
439
|
-
if "name" not in metadata or "project" not in metadata:
|
|
440
|
-
result.errors.append("experiment.json missing required fields (name, project)")
|
|
441
|
-
return False
|
|
442
|
-
|
|
443
|
-
result.valid_data["metadata"] = metadata
|
|
444
|
-
return True
|
|
445
|
-
|
|
446
|
-
except json.JSONDecodeError as e:
|
|
447
|
-
result.errors.append(f"Invalid JSON in experiment.json: {e}")
|
|
448
|
-
return False
|
|
449
|
-
except IOError as e:
|
|
450
|
-
result.errors.append(f"Cannot read experiment.json: {e}")
|
|
451
|
-
return False
|
|
452
|
-
|
|
453
|
-
def _validate_parameters(self, exp_info: ExperimentInfo, result: ValidationResult):
|
|
454
|
-
"""Validate parameters.json format."""
|
|
455
|
-
if not exp_info.has_params:
|
|
456
|
-
return
|
|
457
|
-
|
|
458
|
-
params_file = exp_info.path / "parameters.json"
|
|
459
|
-
try:
|
|
460
|
-
with open(params_file, "r") as f:
|
|
461
|
-
params = json.load(f)
|
|
462
|
-
|
|
463
|
-
# Check if it's a dict
|
|
464
|
-
if not isinstance(params, dict):
|
|
465
|
-
result.warnings.append("parameters.json is not a dict (will skip)")
|
|
466
|
-
return
|
|
467
|
-
|
|
468
|
-
# Check for valid data key if using versioned format
|
|
469
|
-
if "data" in params:
|
|
470
|
-
if not isinstance(params["data"], dict):
|
|
471
|
-
result.warnings.append("parameters.json data is not a dict (will skip)")
|
|
472
|
-
return
|
|
473
|
-
result.valid_data["parameters"] = params["data"]
|
|
474
|
-
else:
|
|
475
|
-
result.valid_data["parameters"] = params
|
|
476
|
-
|
|
477
|
-
except json.JSONDecodeError as e:
|
|
478
|
-
result.warnings.append(f"Invalid JSON in parameters.json: {e} (will skip)")
|
|
479
|
-
except IOError as e:
|
|
480
|
-
result.warnings.append(f"Cannot read parameters.json: {e} (will skip)")
|
|
481
|
-
|
|
482
|
-
def _validate_logs(self, exp_info: ExperimentInfo, result: ValidationResult):
|
|
483
|
-
"""Validate logs.jsonl format."""
|
|
484
|
-
if not exp_info.has_logs:
|
|
485
|
-
return
|
|
486
|
-
|
|
487
|
-
logs_file = exp_info.path / "logs/logs.jsonl"
|
|
488
|
-
invalid_lines = []
|
|
489
|
-
|
|
490
|
-
try:
|
|
491
|
-
with open(logs_file, "r") as f:
|
|
492
|
-
for line_num, line in enumerate(f, start=1):
|
|
493
|
-
try:
|
|
494
|
-
log_entry = json.loads(line)
|
|
495
|
-
# Check required fields
|
|
496
|
-
if "message" not in log_entry:
|
|
497
|
-
invalid_lines.append(line_num)
|
|
498
|
-
except json.JSONDecodeError:
|
|
499
|
-
invalid_lines.append(line_num)
|
|
500
|
-
|
|
501
|
-
if invalid_lines:
|
|
502
|
-
count = len(invalid_lines)
|
|
503
|
-
preview = invalid_lines[:5]
|
|
504
|
-
result.warnings.append(
|
|
505
|
-
f"logs.jsonl has {count} invalid lines (e.g., {preview}...) - will skip these"
|
|
506
|
-
)
|
|
229
|
+
local_path = Path(local_path)
|
|
230
|
+
|
|
231
|
+
if not local_path.exists():
|
|
232
|
+
return []
|
|
233
|
+
|
|
234
|
+
experiments = []
|
|
235
|
+
|
|
236
|
+
# Find all experiment.json files recursively
|
|
237
|
+
for exp_json in local_path.rglob("*/experiment.json"):
|
|
238
|
+
exp_dir = exp_json.parent
|
|
507
239
|
|
|
508
|
-
|
|
509
|
-
|
|
240
|
+
# Extract project and experiment names from path
|
|
241
|
+
# Path structure: local_path / [folder] / project / experiment
|
|
242
|
+
try:
|
|
243
|
+
relative_path = exp_dir.relative_to(local_path)
|
|
244
|
+
parts = relative_path.parts
|
|
245
|
+
|
|
246
|
+
if len(parts) < 2:
|
|
247
|
+
continue # Need at least project/experiment
|
|
510
248
|
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
return
|
|
249
|
+
# Last two parts are project/experiment
|
|
250
|
+
exp_name = parts[-1]
|
|
251
|
+
project_name = parts[-2]
|
|
515
252
|
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
253
|
+
# Apply filters
|
|
254
|
+
if project_filter and project_name != project_filter:
|
|
255
|
+
continue
|
|
256
|
+
if experiment_filter and exp_name != experiment_filter:
|
|
257
|
+
continue
|
|
519
258
|
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
with open(data_file, "r") as f:
|
|
523
|
-
for line_num, line in enumerate(f, start=1):
|
|
259
|
+
# Read folder from experiment.json
|
|
260
|
+
folder = None
|
|
524
261
|
try:
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
result.warnings.append(f"Cannot read metric '{metric_name}': {e} (will skip)")
|
|
541
|
-
|
|
542
|
-
def _validate_files(self, exp_info: ExperimentInfo, result: ValidationResult):
|
|
543
|
-
"""Validate files existence."""
|
|
544
|
-
files_dir = exp_info.path / "files"
|
|
545
|
-
if not files_dir.exists():
|
|
546
|
-
return
|
|
547
|
-
|
|
548
|
-
metadata_file = files_dir / ".files_metadata.json"
|
|
549
|
-
if not metadata_file.exists():
|
|
550
|
-
return
|
|
551
|
-
|
|
552
|
-
try:
|
|
553
|
-
with open(metadata_file, "r") as f:
|
|
554
|
-
files_metadata = json.load(f)
|
|
555
|
-
|
|
556
|
-
missing_files = []
|
|
557
|
-
for file_id, file_info in files_metadata.items():
|
|
558
|
-
if isinstance(file_info, dict) and file_info.get("deletedAt") is None:
|
|
559
|
-
# Check if file exists
|
|
560
|
-
file_path = (
|
|
561
|
-
files_dir
|
|
562
|
-
/ file_info.get("prefix", "")
|
|
563
|
-
/ file_id
|
|
564
|
-
/ file_info.get("filename", "")
|
|
565
|
-
)
|
|
566
|
-
if not file_path.exists():
|
|
567
|
-
missing_files.append(file_info.get("filename", file_id))
|
|
568
|
-
|
|
569
|
-
if missing_files:
|
|
570
|
-
count = len(missing_files)
|
|
571
|
-
preview = missing_files[:3]
|
|
572
|
-
result.warnings.append(
|
|
573
|
-
f"{count} files referenced in metadata but missing on disk (e.g., {preview}...) - will skip these"
|
|
574
|
-
)
|
|
262
|
+
with open(exp_json, 'r') as f:
|
|
263
|
+
metadata = json.load(f)
|
|
264
|
+
folder = metadata.get('folder')
|
|
265
|
+
except:
|
|
266
|
+
pass
|
|
267
|
+
|
|
268
|
+
# Create experiment info
|
|
269
|
+
exp_info = ExperimentInfo(
|
|
270
|
+
project=project_name,
|
|
271
|
+
experiment=exp_name,
|
|
272
|
+
path=exp_dir,
|
|
273
|
+
folder=folder,
|
|
274
|
+
)
|
|
275
|
+
except (ValueError, IndexError):
|
|
276
|
+
continue
|
|
575
277
|
|
|
576
|
-
|
|
577
|
-
|
|
278
|
+
# Check for parameters
|
|
279
|
+
params_file = exp_dir / "parameters.json"
|
|
280
|
+
exp_info.has_params = params_file.exists()
|
|
281
|
+
|
|
282
|
+
# Check for logs
|
|
283
|
+
logs_file = exp_dir / "logs" / "logs.jsonl"
|
|
284
|
+
exp_info.has_logs = logs_file.exists()
|
|
285
|
+
|
|
286
|
+
# Check for metrics
|
|
287
|
+
metrics_dir = exp_dir / "metrics"
|
|
288
|
+
if metrics_dir.exists():
|
|
289
|
+
for metric_dir in metrics_dir.iterdir():
|
|
290
|
+
if metric_dir.is_dir():
|
|
291
|
+
data_file = metric_dir / "data.jsonl"
|
|
292
|
+
if data_file.exists():
|
|
293
|
+
exp_info.metric_names.append(metric_dir.name)
|
|
294
|
+
|
|
295
|
+
# Check for files
|
|
296
|
+
files_dir = exp_dir / "files"
|
|
297
|
+
if files_dir.exists():
|
|
298
|
+
try:
|
|
299
|
+
# Count files recursively
|
|
300
|
+
exp_info.file_count = sum(1 for _ in files_dir.rglob("*") if _.is_file())
|
|
578
301
|
|
|
302
|
+
# Estimate size
|
|
303
|
+
exp_info.estimated_size = sum(
|
|
304
|
+
f.stat().st_size for f in files_dir.rglob("*") if f.is_file()
|
|
305
|
+
)
|
|
306
|
+
except (OSError, PermissionError):
|
|
307
|
+
pass
|
|
579
308
|
|
|
580
|
-
|
|
581
|
-
"""Handles uploading a single experiment."""
|
|
582
|
-
|
|
583
|
-
def __init__(
|
|
584
|
-
self,
|
|
585
|
-
local_storage: LocalStorage,
|
|
586
|
-
remote_client: RemoteClient,
|
|
587
|
-
batch_size: int = 100,
|
|
588
|
-
skip_logs: bool = False,
|
|
589
|
-
skip_metrics: bool = False,
|
|
590
|
-
skip_files: bool = False,
|
|
591
|
-
skip_params: bool = False,
|
|
592
|
-
verbose: bool = False,
|
|
593
|
-
progress: Optional[Progress] = None,
|
|
594
|
-
max_concurrent_metrics: int = 5,
|
|
595
|
-
target_prefix: Optional[str] = None,
|
|
596
|
-
):
|
|
597
|
-
"""
|
|
598
|
-
Initialize uploader.
|
|
309
|
+
experiments.append(exp_info)
|
|
599
310
|
|
|
600
|
-
|
|
601
|
-
local_storage: Local storage instance
|
|
602
|
-
remote_client: Remote client instance
|
|
603
|
-
batch_size: Batch size for logs/metrics
|
|
604
|
-
skip_logs: Skip uploading logs
|
|
605
|
-
skip_metrics: Skip uploading metrics
|
|
606
|
-
skip_files: Skip uploading files
|
|
607
|
-
skip_params: Skip uploading parameters
|
|
608
|
-
verbose: Show verbose output
|
|
609
|
-
progress: Optional rich Progress instance for tracking
|
|
610
|
-
max_concurrent_metrics: Maximum concurrent metric uploads (default: 5)
|
|
611
|
-
target_prefix: Target prefix on server (overrides local prefix)
|
|
612
|
-
"""
|
|
613
|
-
self.local = local_storage
|
|
614
|
-
self.remote = remote_client
|
|
615
|
-
self.batch_size = batch_size
|
|
616
|
-
self.skip_logs = skip_logs
|
|
617
|
-
self.skip_metrics = skip_metrics
|
|
618
|
-
self.skip_files = skip_files
|
|
619
|
-
self.skip_params = skip_params
|
|
620
|
-
self.verbose = verbose
|
|
621
|
-
self.progress = progress
|
|
622
|
-
self.max_concurrent_metrics = max_concurrent_metrics
|
|
623
|
-
self.target_prefix = target_prefix
|
|
624
|
-
# Thread-safe lock for shared state updates
|
|
625
|
-
self._lock = threading.Lock()
|
|
626
|
-
# Thread-local storage for remote clients (for thread-safe HTTP requests)
|
|
627
|
-
self._thread_local = threading.local()
|
|
628
|
-
|
|
629
|
-
def _get_remote_client(self) -> RemoteClient:
|
|
630
|
-
"""Get thread-local remote client for safe concurrent access."""
|
|
631
|
-
if not hasattr(self._thread_local, "client"):
|
|
632
|
-
# Create a new client for this thread
|
|
633
|
-
# Use graphql_base_url (without /api) since RemoteClient.__init__ will add /api
|
|
634
|
-
self._thread_local.client = RemoteClient(
|
|
635
|
-
base_url=self.remote.graphql_base_url, api_key=self.remote.api_key
|
|
636
|
-
)
|
|
637
|
-
return self._thread_local.client
|
|
638
|
-
|
|
639
|
-
def upload_experiment(
|
|
640
|
-
self, exp_info: ExperimentInfo, validation_result: ValidationResult, task_id=None
|
|
641
|
-
) -> UploadResult:
|
|
642
|
-
"""
|
|
643
|
-
Upload a single experiment with all its data.
|
|
311
|
+
return experiments
|
|
644
312
|
|
|
645
|
-
Args:
|
|
646
|
-
exp_info: Experiment information
|
|
647
|
-
validation_result: Validation results
|
|
648
|
-
task_id: Optional progress task ID
|
|
649
313
|
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
"""
|
|
653
|
-
result = UploadResult(experiment=f"{exp_info.project}/{exp_info.experiment}")
|
|
654
|
-
|
|
655
|
-
# Calculate total steps for progress tracking
|
|
656
|
-
total_steps = 1 # metadata
|
|
657
|
-
if not self.skip_params and "parameters" in validation_result.valid_data:
|
|
658
|
-
total_steps += 1
|
|
659
|
-
if not self.skip_logs and exp_info.has_logs:
|
|
660
|
-
total_steps += 1
|
|
661
|
-
if not self.skip_metrics and exp_info.metric_names:
|
|
662
|
-
total_steps += len(exp_info.metric_names)
|
|
663
|
-
if not self.skip_files and exp_info.file_count > 0:
|
|
664
|
-
total_steps += exp_info.file_count
|
|
665
|
-
|
|
666
|
-
current_step = 0
|
|
667
|
-
|
|
668
|
-
def update_progress(description: str):
|
|
669
|
-
nonlocal current_step
|
|
670
|
-
current_step += 1
|
|
671
|
-
if self.progress and task_id is not None:
|
|
672
|
-
self.progress.update(
|
|
673
|
-
task_id, completed=current_step, total=total_steps, description=description
|
|
674
|
-
)
|
|
314
|
+
class ExperimentValidator:
    """Validates local experiment data before upload.

    Every ``_validate_*`` helper inspects one artefact below ``exp_info.path``
    and records what it finds on the shared result object: fatal problems are
    appended to ``errors``, recoverable ones to ``warnings``, and payloads that
    are ready for upload are stored in ``valid_data``.
    """

    def __init__(self, strict: bool = False):
        """
        Initialize validator.

        Args:
            strict: If True, fail on any validation error
        """
        self.strict = strict

    def validate_experiment(self, exp_info: ExperimentInfo) -> ValidationResult:
        """
        Validate experiment directory structure and data.

        Args:
            exp_info: Experiment information

        Returns:
            ValidationResult with validation status and messages
        """
        result = ValidationResult()
        result.valid_data = {}

        # 1. Validate experiment metadata (required) - nothing else is checked
        #    when experiment.json is unusable.
        if not self._validate_experiment_metadata(exp_info, result):
            result.is_valid = False
            return result

        # 2.-5. Optional artefacts: problems are recorded as warnings only.
        self._validate_parameters(exp_info, result)
        self._validate_logs(exp_info, result)
        self._validate_metrics(exp_info, result)
        self._validate_files(exp_info, result)

        # In strict mode, any warning becomes an error
        if self.strict and result.warnings:
            result.errors.extend(result.warnings)
            result.warnings = []
            result.is_valid = False

        return result

    @staticmethod
    def _invalid_jsonl_lines(jsonl_path, required_key: str) -> List[int]:
        """Return the 1-based numbers of unusable lines in a JSONL file.

        A line is unusable when it is not valid JSON, when it does not decode
        to a JSON object, or when the object lacks ``required_key``.

        Raises:
            OSError: If the file cannot be opened or read.
            UnicodeDecodeError: If the file cannot be decoded as text.
        """
        invalid_lines = []
        with open(jsonl_path, "r") as f:
            for line_num, line in enumerate(f, start=1):
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    invalid_lines.append(line_num)
                    continue
                # Scalars would make the ``in`` test raise TypeError, and a
                # JSON string could contain the key as a substring - only
                # objects can carry the required field.
                if not isinstance(entry, dict) or required_key not in entry:
                    invalid_lines.append(line_num)
        return invalid_lines

    def _validate_experiment_metadata(self, exp_info: ExperimentInfo, result: ValidationResult) -> bool:
        """Validate experiment.json exists and is valid.

        On success the parsed document is stored in
        ``result.valid_data["metadata"]``.

        Returns:
            True if experiment.json is a JSON object containing both ``name``
            and ``project``; False otherwise (an error is recorded).
        """
        exp_json = exp_info.path / "experiment.json"

        if not exp_json.exists():
            result.errors.append("Missing experiment.json")
            return False

        try:
            with open(exp_json, "r") as f:
                metadata = json.load(f)
        except json.JSONDecodeError as e:
            result.errors.append(f"Invalid JSON in experiment.json: {e}")
            return False
        except (OSError, UnicodeDecodeError) as e:
            # UnicodeDecodeError is a ValueError, not an OSError, so it has to
            # be listed explicitly to be reported instead of propagating.
            result.errors.append(f"Cannot read experiment.json: {e}")
            return False

        # A root of null/number would make the membership tests raise, and a
        # string root could "contain" the field names as substrings.
        if not isinstance(metadata, dict):
            result.errors.append("experiment.json is not a JSON object")
            return False

        # Check required fields
        if "name" not in metadata or "project" not in metadata:
            result.errors.append("experiment.json missing required fields (name, project)")
            return False

        result.valid_data["metadata"] = metadata
        return True

    def _validate_parameters(self, exp_info: ExperimentInfo, result: ValidationResult):
        """Validate parameters.json format.

        Accepts either a plain dict of parameters or a versioned document
        whose ``data`` key holds that dict. The usable dict is stored in
        ``result.valid_data["parameters"]``; anything else produces a warning.
        """
        if not exp_info.has_params:
            return

        params_file = exp_info.path / "parameters.json"
        try:
            with open(params_file, "r") as f:
                params = json.load(f)
        except json.JSONDecodeError as e:
            result.warnings.append(f"Invalid JSON in parameters.json: {e} (will skip)")
            return
        except (OSError, UnicodeDecodeError) as e:
            result.warnings.append(f"Cannot read parameters.json: {e} (will skip)")
            return

        # Check if it's a dict
        if not isinstance(params, dict):
            result.warnings.append("parameters.json is not a dict (will skip)")
            return

        # Check for valid data key if using versioned format
        if "data" in params:
            if not isinstance(params["data"], dict):
                result.warnings.append("parameters.json data is not a dict (will skip)")
                return
            result.valid_data["parameters"] = params["data"]
        else:
            result.valid_data["parameters"] = params

    def _validate_logs(self, exp_info: ExperimentInfo, result: ValidationResult):
        """Validate logs.jsonl format.

        Every line must be a JSON object with a ``message`` field; offending
        line numbers are summarised in a single warning.
        """
        if not exp_info.has_logs:
            return

        logs_file = exp_info.path / "logs" / "logs.jsonl"
        try:
            invalid_lines = self._invalid_jsonl_lines(logs_file, "message")
        except (OSError, UnicodeDecodeError) as e:
            result.warnings.append(f"Cannot read logs.jsonl: {e} (will skip logs)")
            return

        if invalid_lines:
            count = len(invalid_lines)
            preview = invalid_lines[:5]
            result.warnings.append(
                f"logs.jsonl has {count} invalid lines (e.g., {preview}...) - will skip these"
            )

    def _validate_metrics(self, exp_info: ExperimentInfo, result: ValidationResult):
        """Validate metrics data.

        For each metric, every line of ``metrics/<name>/data.jsonl`` must be a
        JSON object with a ``data`` field. One warning is emitted per metric
        that has unusable lines or cannot be read.
        """
        if not exp_info.metric_names:
            return

        for metric_name in exp_info.metric_names:
            data_file = exp_info.path / "metrics" / metric_name / "data.jsonl"
            try:
                invalid_lines = self._invalid_jsonl_lines(data_file, "data")
            except (OSError, UnicodeDecodeError) as e:
                result.warnings.append(f"Cannot read metric '{metric_name}': {e} (will skip)")
                continue

            if invalid_lines:
                count = len(invalid_lines)
                preview = invalid_lines[:5]
                result.warnings.append(
                    f"metric '{metric_name}' has {count} invalid lines (e.g., {preview}...) - will skip these"
                )

    def _validate_files(self, exp_info: ExperimentInfo, result: ValidationResult):
        """Validate files existence.

        Reads ``files/.files_metadata.json`` and warns about every
        non-deleted entry whose payload is not found at
        ``files/<prefix>/<file_id>/<filename>``. The check is best-effort:
        missing or unreadable metadata silently disables it.
        """
        files_dir = exp_info.path / "files"
        if not files_dir.exists():
            return

        metadata_file = files_dir / ".files_metadata.json"
        if not metadata_file.exists():
            return

        try:
            with open(metadata_file, "r") as f:
                files_metadata = json.load(f)
        except (json.JSONDecodeError, OSError, UnicodeDecodeError):
            return  # If we can't read metadata, just skip file validation

        # Only a mapping of file_id -> info can be checked.
        if not isinstance(files_metadata, dict):
            return

        missing_files = []
        for file_id, file_info in files_metadata.items():
            if not isinstance(file_info, dict) or file_info.get("deletedAt") is not None:
                continue

            # A leading "/" would make pathlib discard files_dir and resolve
            # the prefix against the filesystem root; the uploader strips it
            # the same way when it builds the on-disk path.
            prefix = str(file_info.get("prefix") or "").lstrip("/")
            filename = str(file_info.get("filename") or "")
            file_path = files_dir / prefix / file_id / filename
            if not file_path.exists():
                missing_files.append(file_info.get("filename", file_id))

        if missing_files:
            count = len(missing_files)
            preview = missing_files[:3]
            result.warnings.append(
                f"{count} files referenced in metadata but missing on disk (e.g., {preview}...) - will skip these"
            )
|
|
939
511
|
|
|
940
|
-
# Thread-safe update of shared state
|
|
941
|
-
with self._lock:
|
|
942
|
-
result.bytes_uploaded += upload_result["bytes"]
|
|
943
512
|
|
|
944
|
-
|
|
945
|
-
|
|
513
|
+
class ExperimentUploader:
|
|
514
|
+
"""Handles uploading a single experiment."""
|
|
515
|
+
|
|
516
|
+
def __init__(
|
|
517
|
+
self,
|
|
518
|
+
local_storage: LocalStorage,
|
|
519
|
+
remote_client: RemoteClient,
|
|
520
|
+
batch_size: int = 100,
|
|
521
|
+
skip_logs: bool = False,
|
|
522
|
+
skip_metrics: bool = False,
|
|
523
|
+
skip_files: bool = False,
|
|
524
|
+
skip_params: bool = False,
|
|
525
|
+
verbose: bool = False,
|
|
526
|
+
progress: Optional[Progress] = None,
|
|
527
|
+
max_concurrent_metrics: int = 5,
|
|
528
|
+
):
|
|
529
|
+
"""
|
|
530
|
+
Initialize uploader.
|
|
531
|
+
|
|
532
|
+
Args:
|
|
533
|
+
local_storage: Local storage instance
|
|
534
|
+
remote_client: Remote client instance
|
|
535
|
+
batch_size: Batch size for logs/metrics
|
|
536
|
+
skip_logs: Skip uploading logs
|
|
537
|
+
skip_metrics: Skip uploading metrics
|
|
538
|
+
skip_files: Skip uploading files
|
|
539
|
+
skip_params: Skip uploading parameters
|
|
540
|
+
verbose: Show verbose output
|
|
541
|
+
progress: Optional rich Progress instance for tracking
|
|
542
|
+
max_concurrent_metrics: Maximum concurrent metric uploads (default: 5)
|
|
543
|
+
"""
|
|
544
|
+
self.local = local_storage
|
|
545
|
+
self.remote = remote_client
|
|
546
|
+
self.batch_size = batch_size
|
|
547
|
+
self.skip_logs = skip_logs
|
|
548
|
+
self.skip_metrics = skip_metrics
|
|
549
|
+
self.skip_files = skip_files
|
|
550
|
+
self.skip_params = skip_params
|
|
551
|
+
self.verbose = verbose
|
|
552
|
+
self.progress = progress
|
|
553
|
+
self.max_concurrent_metrics = max_concurrent_metrics
|
|
554
|
+
# Thread-safe lock for shared state updates
|
|
555
|
+
self._lock = threading.Lock()
|
|
556
|
+
# Thread-local storage for remote clients (for thread-safe HTTP requests)
|
|
557
|
+
self._thread_local = threading.local()
|
|
558
|
+
|
|
559
|
+
def _get_remote_client(self) -> RemoteClient:
|
|
560
|
+
"""Get thread-local remote client for safe concurrent access."""
|
|
561
|
+
if not hasattr(self._thread_local, 'client'):
|
|
562
|
+
# Create a new client for this thread
|
|
563
|
+
self._thread_local.client = RemoteClient(
|
|
564
|
+
base_url=self.remote.base_url,
|
|
565
|
+
api_key=self.remote.api_key
|
|
566
|
+
)
|
|
567
|
+
return self._thread_local.client
|
|
568
|
+
|
|
569
|
+
def upload_experiment(
    self, exp_info: ExperimentInfo, validation_result: ValidationResult, task_id=None
) -> UploadResult:
    """
    Upload a single experiment with all its data.

    The steps run in order: create/update the experiment record, then
    parameters, logs, metrics and files (each unless skipped or absent).
    Any exception aborts the remaining steps and is recorded on the result
    instead of being raised.

    Args:
        exp_info: Experiment information
        validation_result: Validation results
        task_id: Optional progress task ID

    Returns:
        UploadResult with upload status
    """
    result = UploadResult(experiment=f"{exp_info.project}/{exp_info.experiment}")

    # Calculate total steps for progress tracking
    total_steps = 1  # metadata
    if not self.skip_params and "parameters" in validation_result.valid_data:
        total_steps += 1
    if not self.skip_logs and exp_info.has_logs:
        total_steps += 1
    if not self.skip_metrics and exp_info.metric_names:
        total_steps += len(exp_info.metric_names)
    if not self.skip_files and exp_info.file_count > 0:
        total_steps += exp_info.file_count

    current_step = 0

    def update_progress(description: str):
        nonlocal current_step
        current_step += 1
        if self.progress and task_id is not None:
            self.progress.update(task_id, completed=current_step, total=total_steps, description=description)

    try:
        # 1. Create/update experiment metadata
        update_progress("Creating experiment...")
        if self.verbose:
            console.print(" [dim]Creating experiment...[/dim]")

        exp_data = validation_result.valid_data

        # The validator stores the parsed experiment.json under the
        # "metadata" key, so the descriptive fields live one level down.
        exp_json = exp_data.get("metadata")
        if not isinstance(exp_json, dict):
            exp_json = {}

        def exp_field(key: str, default=None):
            # A key present directly in valid_data wins (previous lookup
            # behavior); otherwise fall back to the experiment.json content.
            if key in exp_data:
                return exp_data[key]
            return exp_json.get(key, default)

        # Store folder path in metadata (not as folderId which expects Snowflake ID).
        # Work on a copy so the caller's validation result is not mutated.
        custom_metadata = dict(exp_json)
        folder = exp_field("folder")
        if folder:
            custom_metadata["folder"] = folder

        response = self.remote.create_or_update_experiment(
            project=exp_info.project,
            name=exp_info.experiment,
            description=exp_field("description"),
            tags=exp_field("tags"),
            bindrs=exp_field("bindrs"),
            folder=None,  # Don't send folder path as folderId (expects Snowflake ID)
            write_protected=exp_field("write_protected", False),
            metadata=custom_metadata if custom_metadata else None,
        )

        # Extract experiment ID from nested response
        experiment_id = response.get("experiment", {}).get("id") or response.get("id")
        if self.verbose:
            console.print(f" [green]✓[/green] Created experiment (id: {experiment_id})")

        # 2. Upload parameters
        if not self.skip_params and "parameters" in validation_result.valid_data:
            update_progress("Uploading parameters...")
            if self.verbose:
                console.print(" [dim]Uploading parameters...[/dim]")

            params = validation_result.valid_data["parameters"]
            self.remote.set_parameters(experiment_id, params)
            result.uploaded["params"] = len(params)
            # Track bytes (approximate JSON size)
            result.bytes_uploaded += len(json.dumps(params).encode('utf-8'))

            if self.verbose:
                console.print(f" [green]✓[/green] Uploaded {len(params)} parameters")

        # 3. Upload logs
        if not self.skip_logs and exp_info.has_logs:
            count = self._upload_logs(experiment_id, exp_info, result, task_id, update_progress)
            result.uploaded["logs"] = count

        # 4. Upload metrics
        if not self.skip_metrics and exp_info.metric_names:
            count = self._upload_metrics(experiment_id, exp_info, result, task_id, update_progress)
            result.uploaded["metrics"] = count

        # 5. Upload files
        if not self.skip_files and exp_info.file_count > 0:
            count = self._upload_files(experiment_id, exp_info, result, task_id, update_progress)
            result.uploaded["files"] = count

        result.success = True

    except Exception as e:
        # Top-level boundary for one experiment: record the failure so the
        # caller can continue with the next experiment.
        result.success = False
        result.errors.append(str(e))
        if self.verbose:
            console.print(f" [red]✗ Error: {e}[/red]")

    return result
|
|
672
|
+
|
|
673
|
+
def _upload_logs(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
|
|
674
|
+
task_id=None, update_progress=None) -> int:
|
|
675
|
+
"""Upload logs in batches."""
|
|
676
|
+
if update_progress:
|
|
677
|
+
update_progress("Uploading logs...")
|
|
678
|
+
if self.verbose:
|
|
679
|
+
console.print(f" [dim]Uploading logs...[/dim]")
|
|
680
|
+
|
|
681
|
+
logs_file = exp_info.path / "logs" / "logs.jsonl"
|
|
682
|
+
logs_batch = []
|
|
683
|
+
total_uploaded = 0
|
|
684
|
+
skipped = 0
|
|
685
|
+
|
|
686
|
+
try:
|
|
687
|
+
with open(logs_file, "r") as f:
|
|
688
|
+
for line in f:
|
|
689
|
+
try:
|
|
690
|
+
log_entry = json.loads(line)
|
|
691
|
+
|
|
692
|
+
# Validate required fields
|
|
693
|
+
if "message" not in log_entry:
|
|
694
|
+
skipped += 1
|
|
695
|
+
continue
|
|
696
|
+
|
|
697
|
+
# Prepare log entry for API
|
|
698
|
+
api_log = {
|
|
699
|
+
"timestamp": log_entry.get("timestamp"),
|
|
700
|
+
"level": log_entry.get("level", "info"),
|
|
701
|
+
"message": log_entry["message"],
|
|
702
|
+
}
|
|
703
|
+
if "metadata" in log_entry:
|
|
704
|
+
api_log["metadata"] = log_entry["metadata"]
|
|
705
|
+
|
|
706
|
+
logs_batch.append(api_log)
|
|
707
|
+
# Track bytes
|
|
708
|
+
result.bytes_uploaded += len(line.encode('utf-8'))
|
|
709
|
+
|
|
710
|
+
# Upload batch
|
|
711
|
+
if len(logs_batch) >= self.batch_size:
|
|
712
|
+
self.remote.create_log_entries(experiment_id, logs_batch)
|
|
713
|
+
total_uploaded += len(logs_batch)
|
|
714
|
+
logs_batch = []
|
|
715
|
+
|
|
716
|
+
except json.JSONDecodeError:
|
|
717
|
+
skipped += 1
|
|
718
|
+
continue
|
|
719
|
+
|
|
720
|
+
# Upload remaining logs
|
|
721
|
+
if logs_batch:
|
|
722
|
+
self.remote.create_log_entries(experiment_id, logs_batch)
|
|
723
|
+
total_uploaded += len(logs_batch)
|
|
724
|
+
|
|
969
725
|
if self.verbose:
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
files_list = self.local.list_files(owner, project, full_prefix)
|
|
1002
|
-
|
|
1003
|
-
# Debug: print file count
|
|
1004
|
-
if self.verbose:
|
|
1005
|
-
print(f"[DEBUG] Found {len(files_list)} files to upload")
|
|
1006
|
-
print(f"[DEBUG] Full prefix: {full_prefix}")
|
|
1007
|
-
|
|
1008
|
-
for file_info in files_list:
|
|
1009
|
-
# Skip deleted files
|
|
1010
|
-
if file_info.get("deletedAt") is not None:
|
|
1011
|
-
continue
|
|
726
|
+
msg = f" [green]✓[/green] Uploaded {total_uploaded} log entries"
|
|
727
|
+
if skipped > 0:
|
|
728
|
+
msg += f" (skipped {skipped} invalid)"
|
|
729
|
+
console.print(msg)
|
|
730
|
+
|
|
731
|
+
except IOError as e:
|
|
732
|
+
result.failed.setdefault("logs", []).append(str(e))
|
|
733
|
+
|
|
734
|
+
return total_uploaded
|
|
735
|
+
|
|
736
|
+
def _upload_single_metric(
|
|
737
|
+
self,
|
|
738
|
+
experiment_id: str,
|
|
739
|
+
metric_name: str,
|
|
740
|
+
metric_dir: Path,
|
|
741
|
+
result: UploadResult
|
|
742
|
+
) -> Dict[str, Any]:
|
|
743
|
+
"""
|
|
744
|
+
Upload a single metric (thread-safe helper).
|
|
745
|
+
|
|
746
|
+
Returns:
|
|
747
|
+
Dict with 'success', 'uploaded', 'skipped', 'bytes', and 'error' keys
|
|
748
|
+
"""
|
|
749
|
+
data_file = metric_dir / "data.jsonl"
|
|
750
|
+
data_batch = []
|
|
751
|
+
total_uploaded = 0
|
|
752
|
+
skipped = 0
|
|
753
|
+
bytes_uploaded = 0
|
|
754
|
+
|
|
755
|
+
# Get thread-local client for safe concurrent HTTP requests
|
|
756
|
+
remote_client = self._get_remote_client()
|
|
1012
757
|
|
|
1013
758
|
try:
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
console.print(
|
|
1052
|
-
f" [green]✓[/green] {file_info['filename']} ({size_mb:.1f}MB)"
|
|
1053
|
-
)
|
|
759
|
+
with open(data_file, "r") as f:
|
|
760
|
+
for line in f:
|
|
761
|
+
try:
|
|
762
|
+
data_point = json.loads(line)
|
|
763
|
+
|
|
764
|
+
# Validate required fields
|
|
765
|
+
if "data" not in data_point:
|
|
766
|
+
skipped += 1
|
|
767
|
+
continue
|
|
768
|
+
|
|
769
|
+
data_batch.append(data_point["data"])
|
|
770
|
+
bytes_uploaded += len(line.encode('utf-8'))
|
|
771
|
+
|
|
772
|
+
# Upload batch using thread-local client
|
|
773
|
+
if len(data_batch) >= self.batch_size:
|
|
774
|
+
remote_client.append_batch_to_metric(
|
|
775
|
+
experiment_id, metric_name, data_batch
|
|
776
|
+
)
|
|
777
|
+
total_uploaded += len(data_batch)
|
|
778
|
+
data_batch = []
|
|
779
|
+
|
|
780
|
+
except json.JSONDecodeError:
|
|
781
|
+
skipped += 1
|
|
782
|
+
continue
|
|
783
|
+
|
|
784
|
+
# Upload remaining data points using thread-local client
|
|
785
|
+
if data_batch:
|
|
786
|
+
remote_client.append_batch_to_metric(experiment_id, metric_name, data_batch)
|
|
787
|
+
total_uploaded += len(data_batch)
|
|
788
|
+
|
|
789
|
+
return {
|
|
790
|
+
'success': True,
|
|
791
|
+
'uploaded': total_uploaded,
|
|
792
|
+
'skipped': skipped,
|
|
793
|
+
'bytes': bytes_uploaded,
|
|
794
|
+
'error': None
|
|
795
|
+
}
|
|
1054
796
|
|
|
1055
797
|
except Exception as e:
|
|
1056
|
-
|
|
798
|
+
return {
|
|
799
|
+
'success': False,
|
|
800
|
+
'uploaded': 0,
|
|
801
|
+
'skipped': 0,
|
|
802
|
+
'bytes': 0,
|
|
803
|
+
'error': str(e)
|
|
804
|
+
}
|
|
1057
805
|
|
|
1058
|
-
|
|
1059
|
-
|
|
806
|
+
def _upload_metrics(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
|
|
807
|
+
task_id=None, update_progress=None) -> int:
|
|
808
|
+
"""Upload metrics in parallel with concurrency limit."""
|
|
809
|
+
if not exp_info.metric_names:
|
|
810
|
+
return 0
|
|
811
|
+
|
|
812
|
+
total_metrics = 0
|
|
813
|
+
|
|
814
|
+
# Use ThreadPoolExecutor for parallel uploads
|
|
815
|
+
with ThreadPoolExecutor(max_workers=self.max_concurrent_metrics) as executor:
|
|
816
|
+
# Submit all metric upload tasks
|
|
817
|
+
future_to_metric = {}
|
|
818
|
+
for metric_name in exp_info.metric_names:
|
|
819
|
+
metric_dir = exp_info.path / "metrics" / metric_name
|
|
820
|
+
future = executor.submit(
|
|
821
|
+
self._upload_single_metric,
|
|
822
|
+
experiment_id,
|
|
823
|
+
metric_name,
|
|
824
|
+
metric_dir,
|
|
825
|
+
result
|
|
826
|
+
)
|
|
827
|
+
future_to_metric[future] = metric_name
|
|
828
|
+
|
|
829
|
+
# Process completed uploads as they finish
|
|
830
|
+
for future in as_completed(future_to_metric):
|
|
831
|
+
metric_name = future_to_metric[future]
|
|
832
|
+
|
|
833
|
+
# Update progress
|
|
834
|
+
if update_progress:
|
|
835
|
+
update_progress(f"Uploading metric '{metric_name}'...")
|
|
836
|
+
|
|
837
|
+
try:
|
|
838
|
+
upload_result = future.result()
|
|
839
|
+
|
|
840
|
+
# Thread-safe update of shared state
|
|
841
|
+
with self._lock:
|
|
842
|
+
result.bytes_uploaded += upload_result['bytes']
|
|
843
|
+
|
|
844
|
+
if upload_result['success']:
|
|
845
|
+
total_metrics += 1
|
|
846
|
+
|
|
847
|
+
# Thread-safe console output
|
|
848
|
+
if self.verbose:
|
|
849
|
+
msg = f" [green]✓[/green] Uploaded {upload_result['uploaded']} data points for '{metric_name}'"
|
|
850
|
+
if upload_result['skipped'] > 0:
|
|
851
|
+
msg += f" (skipped {upload_result['skipped']} invalid)"
|
|
852
|
+
with self._lock:
|
|
853
|
+
console.print(msg)
|
|
854
|
+
else:
|
|
855
|
+
# Record failure
|
|
856
|
+
error_msg = f"{metric_name}: {upload_result['error']}"
|
|
857
|
+
with self._lock:
|
|
858
|
+
result.failed.setdefault("metrics", []).append(error_msg)
|
|
859
|
+
if self.verbose:
|
|
860
|
+
console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {upload_result['error']}")
|
|
861
|
+
|
|
862
|
+
except Exception as e:
|
|
863
|
+
# Handle unexpected errors
|
|
864
|
+
error_msg = f"{metric_name}: {str(e)}"
|
|
865
|
+
with self._lock:
|
|
866
|
+
result.failed.setdefault("metrics", []).append(error_msg)
|
|
867
|
+
if self.verbose:
|
|
868
|
+
console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {e}")
|
|
869
|
+
|
|
870
|
+
return total_metrics
|
|
871
|
+
|
|
872
|
+
def _upload_files(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
                  task_id=None, update_progress=None) -> int:
    """Upload every non-deleted file of an experiment, one at a time.

    Args:
        experiment_id: Remote experiment identifier forwarded to
            ``self.remote.upload_file``.
        exp_info: Local experiment descriptor; its ``project`` and
            ``experiment`` attributes are used to list the files and to
            locate them on disk.
        result: Accumulator mutated in place. ``bytes_uploaded`` grows by the
            ``sizeBytes`` of each uploaded file and failures are appended to
            ``result.failed["files"]``.
        task_id: Not used by this method; kept so the signature stays
            compatible with existing callers.
        update_progress: Optional callable invoked with a status message
            before each file is uploaded.

    Returns:
        Number of files that were uploaded successfully.
    """
    total_uploaded = 0
    # Resolved on first use so an experiment without files never touches the
    # storage layout, and a failing lookup is still reported per file.
    # NOTE(review): assumes _get_experiment_dir is a pure path lookup, so one
    # call per experiment is equivalent to one call per file - confirm.
    files_dir = None

    # Use LocalStorage to list files
    try:
        files_list = self.local.list_files(exp_info.project, exp_info.experiment)

        for file_info in files_list:
            # Skip deleted files
            if file_info.get("deletedAt") is not None:
                continue

            # Looked up defensively: the error handler below must never raise
            # itself, otherwise one malformed entry aborts the whole loop.
            filename = file_info.get("filename", "<unknown>")

            try:
                if update_progress:
                    update_progress(f"Uploading {filename}...")

                # Get file path directly from storage without copying
                file_id = file_info["id"]
                if files_dir is None:
                    experiment_dir = self.local._get_experiment_dir(
                        exp_info.project, exp_info.experiment
                    )
                    files_dir = experiment_dir / "files"

                # Construct file path; a missing or empty "path" means the
                # file lives directly under files/<id>/.
                file_prefix = (file_info.get("path") or "").lstrip("/")
                if file_prefix:
                    file_path = files_dir / file_prefix / file_id / file_info["filename"]
                else:
                    file_path = files_dir / file_id / file_info["filename"]

                size_bytes = file_info["sizeBytes"]

                # Upload to remote with correct parameters
                self.remote.upload_file(
                    experiment_id=experiment_id,
                    file_path=str(file_path),
                    prefix=file_info.get("path", ""),
                    filename=file_info["filename"],
                    description=file_info.get("description"),
                    tags=file_info.get("tags", []),
                    metadata=file_info.get("metadata"),
                    checksum=file_info["checksum"],
                    content_type=file_info["contentType"],
                    size_bytes=size_bytes,
                )

                total_uploaded += 1
                # Track bytes
                result.bytes_uploaded += size_bytes

                if self.verbose:
                    size_mb = size_bytes / (1024 * 1024)
                    console.print(f" [green]✓[/green] {filename} ({size_mb:.1f}MB)")

            except Exception as e:
                # Best effort: record the failure and keep going with the
                # remaining files.
                result.failed.setdefault("files", []).append(f"{filename}: {e}")
                if self.verbose:
                    # Mirrors the failure line printed by the metric uploader.
                    console.print(f" [red]✗[/red] Failed to upload '{filename}': {e}")

    except Exception as e:
        # Listing the files failed (or the listing itself was malformed).
        result.failed.setdefault("files", []).append(str(e))

    if self.verbose and not result.failed.get("files"):
        console.print(f" [green]✓[/green] Uploaded {total_uploaded} files")

    return total_uploaded
|
|
1065
935
|
|
|
1066
936
|
|
|
1067
937
|
def cmd_upload(args: argparse.Namespace) -> int:
    """
    Execute upload command.

    Discovers experiments under ``args.path``, validates them, uploads the
    valid ones to the remote server while saving resumable state to
    ``args.state_file`` after each experiment, and prints a summary.

    Args:
        args: Parsed command-line arguments

    Returns:
        Exit code (0 for success, 1 for error)
    """
    # Local import: this block cannot rely on a module-level ``import time``.
    import time

    # Load config
    config = Config()

    # Get remote URL (command line > config)
    remote_url = args.remote or config.remote_url
    if not remote_url:
        console.print("[red]Error:[/red] --remote URL is required (or set in config)")
        return 1

    # Get API key (command line > config > auto-load from storage)
    # RemoteClient will auto-load from storage if api_key is None
    api_key = args.api_key or config.api_key

    # Validate experiment filter requires project
    if args.experiment and not args.project:
        console.print("[red]Error:[/red] --experiment requires --project")
        return 1

    # Discover experiments
    local_path = Path(args.path)
    if not local_path.exists():
        console.print(f"[red]Error:[/red] Local storage path does not exist: {local_path}")
        return 1

    # Handle state file for resume functionality
    state_file = Path(args.state_file)
    upload_state = None

    if args.resume:
        upload_state = UploadState.load(state_file)
        if upload_state:
            # Validate state matches current upload
            if upload_state.local_path != str(local_path.absolute()):
                console.print("[yellow]Warning:[/yellow] State file local path doesn't match. Starting fresh upload.")
                upload_state = None
            elif upload_state.remote_url != remote_url:
                console.print("[yellow]Warning:[/yellow] State file remote URL doesn't match. Starting fresh upload.")
                upload_state = None
            else:
                console.print(f"[green]Resuming previous upload from {upload_state.timestamp}[/green]")
                console.print(f" Already completed: {len(upload_state.completed_experiments)} experiments")
                console.print(f" Failed: {len(upload_state.failed_experiments)} experiments")
        else:
            console.print("[yellow]No previous upload state found. Starting fresh upload.[/yellow]")

    # Create new state if not resuming
    if not upload_state:
        upload_state = UploadState(
            local_path=str(local_path.absolute()),
            remote_url=remote_url,
        )

    console.print(f"[bold]Scanning local storage:[/bold] {local_path.absolute()}")
    experiments = discover_experiments(
        local_path,
        project_filter=args.project,
        experiment_filter=args.experiment,
    )

    if not experiments:
        if args.project and args.experiment:
            console.print(f"[yellow]No experiment found:[/yellow] {args.project}/{args.experiment}")
        elif args.project:
            console.print(f"[yellow]No experiments found in project:[/yellow] {args.project}")
        else:
            console.print("[yellow]No experiments found in local storage[/yellow]")
        return 1

    # Filter out already completed experiments when resuming
    if args.resume and upload_state.completed_experiments:
        original_count = len(experiments)
        # Set lookup keeps the filter linear in the number of experiments.
        already_done = set(upload_state.completed_experiments)
        experiments = [
            exp for exp in experiments
            if f"{exp.project}/{exp.experiment}" not in already_done
        ]
        skipped_count = original_count - len(experiments)
        if skipped_count > 0:
            console.print(f"[dim]Skipping {skipped_count} already completed experiment(s)[/dim]")

        if not experiments:
            # Everything was uploaded by a previous run: that is success, not
            # the "No valid experiments" error the validation step would raise.
            console.print("[green]All experiments already uploaded. Nothing to do.[/green]")
            return 0

    console.print(f"[green]Found {len(experiments)} experiment(s) to upload[/green]")

    # Display discovered experiments
    if args.verbose or args.dry_run:
        console.print("\n[bold]Discovered experiments:[/bold]")
        for exp in experiments:
            parts = []
            if exp.has_logs:
                parts.append("logs")
            if exp.has_params:
                parts.append("params")
            if exp.metric_names:
                parts.append(f"{len(exp.metric_names)} metrics")
            if exp.file_count:
                size_mb = exp.estimated_size / (1024 * 1024)
                parts.append(f"{exp.file_count} files ({size_mb:.1f}MB)")

            details = ", ".join(parts) if parts else "metadata only"
            console.print(f" [cyan]•[/cyan] {exp.project}/{exp.experiment} [dim]({details})[/dim]")

    # Dry-run mode: stop here. Everything below runs only for real uploads.
    if args.dry_run:
        console.print("\n[yellow bold]DRY RUN[/yellow bold] - No data will be uploaded")
        console.print("Run without --dry-run to proceed with upload.")
        return 0

    # Validate experiments
    console.print("\n[bold]Validating experiments...[/bold]")
    validator = ExperimentValidator(strict=args.strict)
    validation_results = {}
    valid_experiments = []
    invalid_experiments = []

    for exp in experiments:
        exp_key = f"{exp.project}/{exp.experiment}"
        validation = validator.validate_experiment(exp)
        validation_results[exp_key] = validation

        if validation.is_valid:
            valid_experiments.append(exp)
        else:
            invalid_experiments.append(exp)

        # Show warnings and errors
        if args.verbose or validation.errors:
            if validation.errors:
                console.print(f" [red]✗[/red] {exp_key}:")
                for error in validation.errors:
                    console.print(f" [red]{error}[/red]")
            elif validation.warnings:
                console.print(f" [yellow]⚠[/yellow] {exp_key}:")
                for warning in validation.warnings:
                    console.print(f" [yellow]{warning}[/yellow]")

    if invalid_experiments:
        console.print(f"\n[yellow]{len(invalid_experiments)} experiment(s) failed validation and will be skipped[/yellow]")
        if args.strict:
            console.print("[red]Error: Validation failed in --strict mode[/red]")
            return 1

    if not valid_experiments:
        console.print("[red]Error: No valid experiments to upload[/red]")
        return 1

    console.print(f"[green]{len(valid_experiments)} experiment(s) ready to upload[/green]")

    # Initialize remote client and local storage
    remote_client = RemoteClient(base_url=remote_url, api_key=api_key)
    local_storage = LocalStorage(root_path=local_path)

    # Upload experiments with progress tracking
    console.print(f"\n[bold]Uploading to:[/bold] {remote_url}")
    results = []

    # Track upload timing with a monotonic clock so wall-clock adjustments
    # cannot distort the elapsed time or the reported speed.
    start_time = time.perf_counter()

    # Create progress bar for overall upload
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        console=console,
        transient=not args.verbose,  # Keep progress visible in verbose mode
    ) as progress:
        # Create uploader with progress tracking
        uploader = ExperimentUploader(
            local_storage=local_storage,
            remote_client=remote_client,
            batch_size=args.batch_size,
            skip_logs=args.skip_logs,
            skip_metrics=args.skip_metrics,
            skip_files=args.skip_files,
            skip_params=args.skip_params,
            verbose=args.verbose,
            progress=progress,
        )

        for i, exp in enumerate(valid_experiments, start=1):
            exp_key = f"{exp.project}/{exp.experiment}"

            # Create task for this experiment
            task_id = progress.add_task(
                f"[{i}/{len(valid_experiments)}] {exp_key}",
                total=100,  # Will be updated with actual steps
            )

            # Update state - mark as in progress
            upload_state.in_progress_experiment = exp_key
            upload_state.save(state_file)

            validation = validation_results[exp_key]
            result = uploader.upload_experiment(exp, validation, task_id=task_id)
            results.append(result)

            # Update state - mark as completed or failed
            upload_state.in_progress_experiment = None
            if result.success:
                if exp_key not in upload_state.completed_experiments:
                    upload_state.completed_experiments.append(exp_key)
                # A retry that succeeds must clear the failure recorded by an
                # earlier (resumed) run.
                upload_state.failed_experiments = [
                    key for key in upload_state.failed_experiments if key != exp_key
                ]
            elif exp_key not in upload_state.failed_experiments:
                # Avoid duplicate entries when the same experiment fails again
                # on a resumed run.
                upload_state.failed_experiments.append(exp_key)

            upload_state.save(state_file)

            # Update task to completed
            progress.update(task_id, completed=100, total=100)

            if not args.verbose:
                # Show brief status
                if result.success:
                    parts = []
                    if result.uploaded.get("params"):
                        parts.append(f"{result.uploaded['params']} params")
                    if result.uploaded.get("logs"):
                        parts.append(f"{result.uploaded['logs']} logs")
                    if result.uploaded.get("metrics"):
                        parts.append(f"{result.uploaded['metrics']} metrics")
                    if result.uploaded.get("files"):
                        parts.append(f"{result.uploaded['files']} files")
                    status = ", ".join(parts) if parts else "metadata only"
                    console.print(f" [green]✓[/green] Uploaded ({status})")
                else:
                    console.print(" [red]✗[/red] Failed")
                    if result.errors:
                        for error in result.errors[:3]:  # Show first 3 errors
                            console.print(f" [red]{error}[/red]")

    # Calculate timing
    elapsed_time = time.perf_counter() - start_time
    total_bytes = sum(r.bytes_uploaded for r in results)

    # Print summary with rich Table
    console.print()

    successful = [r for r in results if r.success]
    failed = [r for r in results if not r.success]

    # Create summary table
    summary_table = Table(title="Upload Summary", show_header=True, header_style="bold")
    summary_table.add_column("Status", style="cyan")
    summary_table.add_column("Count", justify="right")

    summary_table.add_row("Successful", f"[green]{len(successful)}/{len(results)}[/green]")
    if failed:
        summary_table.add_row("Failed", f"[red]{len(failed)}/{len(results)}[/red]")

    # Add timing information
    summary_table.add_row("Total Time", f"{elapsed_time:.2f}s")

    # Calculate and display upload speed
    if total_bytes > 0 and elapsed_time > 0:
        # Convert to appropriate unit
        if total_bytes < 1024 * 1024:  # Less than 1 MB
            speed_kb = (total_bytes / 1024) / elapsed_time
            summary_table.add_row("Avg Speed", f"{speed_kb:.2f} KB/s")
        else:  # 1 MB or more
            speed_mb = (total_bytes / (1024 * 1024)) / elapsed_time
            summary_table.add_row("Avg Speed", f"{speed_mb:.2f} MB/s")

    console.print(summary_table)

    # Show failed experiments
    if failed:
        console.print("\n[bold red]Failed Experiments:[/bold red]")
        for failed_result in failed:
            console.print(f" [red]✗[/red] {failed_result.experiment}")
            for error in failed_result.errors:
                console.print(f" [dim]{error}[/dim]")

    # Data statistics
    total_logs = sum(r.uploaded.get("logs", 0) for r in results)
    total_metrics = sum(r.uploaded.get("metrics", 0) for r in results)
    total_files = sum(r.uploaded.get("files", 0) for r in results)

    if total_logs or total_metrics or total_files:
        data_table = Table(title="Data Uploaded", show_header=True, header_style="bold")
        data_table.add_column("Type", style="cyan")
        data_table.add_column("Count", justify="right", style="green")

        if total_logs:
            data_table.add_row("Logs", f"{total_logs} entries")
        if total_metrics:
            data_table.add_row("Metrics", f"{total_metrics} metrics")
        if total_files:
            data_table.add_row("Files", f"{total_files} files")

        console.print()
        console.print(data_table)

    # Clean up state file if all uploads succeeded; otherwise point the user
    # at --resume. (Dry runs returned earlier, so no dry-run check is needed.)
    if failed:
        console.print(f"\n[yellow]State saved to {state_file}. Use --resume to retry failed uploads.[/yellow]")
    elif state_file.exists():
        state_file.unlink()
        console.print("\n[dim]Upload complete. State file removed.[/dim]")

    # Return exit code
    return 1 if failed else 0
|