ml-dash 0.6.2__py3-none-any.whl → 0.6.2rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,25 +2,20 @@
2
2
 
3
3
  import argparse
4
4
  import json
5
+ from pathlib import Path
6
+ from typing import List, Dict, Any, Optional
7
+ from dataclasses import dataclass, field
5
8
  import threading
6
9
  from concurrent.futures import ThreadPoolExecutor, as_completed
7
- from dataclasses import dataclass, field
8
- from pathlib import Path
9
- from typing import Any, Dict, List, Optional
10
10
 
11
11
  from rich.console import Console
12
- from rich.progress import (
13
- BarColumn,
14
- Progress,
15
- SpinnerColumn,
16
- TaskProgressColumn,
17
- TextColumn,
18
- )
12
+ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
19
13
  from rich.table import Table
14
+ from rich.panel import Panel
20
15
 
16
+ from ..storage import LocalStorage
21
17
  from ..client import RemoteClient
22
18
  from ..config import Config
23
- from ..storage import LocalStorage
24
19
 
25
20
  # Initialize rich console
26
21
  console = Console()
@@ -28,1371 +23,1226 @@ console = Console()
28
23
 
29
24
  @dataclass
30
25
  class ExperimentInfo:
31
- """Information about an experiment to upload."""
32
-
33
- project: str
34
- experiment: str
35
- path: Path
36
- prefix: Optional[str] = None
37
- has_logs: bool = False
38
- has_params: bool = False
39
- metric_names: List[str] = field(default_factory=list)
40
- file_count: int = 0
41
- estimated_size: int = 0 # in bytes
26
+ """Information about an experiment to upload."""
27
+ project: str
28
+ experiment: str
29
+ path: Path
30
+ folder: Optional[str] = None
31
+ has_logs: bool = False
32
+ has_params: bool = False
33
+ metric_names: List[str] = field(default_factory=list)
34
+ file_count: int = 0
35
+ estimated_size: int = 0 # in bytes
42
36
 
43
37
 
44
38
  @dataclass
45
39
  class ValidationResult:
46
- """Result of experiment validation."""
47
-
48
- is_valid: bool = True
49
- warnings: List[str] = field(default_factory=list)
50
- errors: List[str] = field(default_factory=list)
51
- valid_data: Dict[str, Any] = field(default_factory=dict)
40
+ """Result of experiment validation."""
41
+ is_valid: bool = True
42
+ warnings: List[str] = field(default_factory=list)
43
+ errors: List[str] = field(default_factory=list)
44
+ valid_data: Dict[str, Any] = field(default_factory=dict)
52
45
 
53
46
 
54
47
  @dataclass
55
48
  class UploadResult:
56
- """Result of uploading an experiment."""
57
-
58
- experiment: str
59
- success: bool = False
60
- uploaded: Dict[str, int] = field(default_factory=dict) # {"logs": 100, "metrics": 3}
61
- failed: Dict[str, List[str]] = field(default_factory=dict) # {"files": ["error msg"]}
62
- errors: List[str] = field(default_factory=list)
63
- bytes_uploaded: int = 0 # Total bytes uploaded
49
+ """Result of uploading an experiment."""
50
+ experiment: str
51
+ success: bool = False
52
+ uploaded: Dict[str, int] = field(default_factory=dict) # {"logs": 100, "metrics": 3}
53
+ failed: Dict[str, List[str]] = field(default_factory=dict) # {"files": ["error msg"]}
54
+ errors: List[str] = field(default_factory=list)
55
+ bytes_uploaded: int = 0 # Total bytes uploaded
64
56
 
65
57
 
66
58
  @dataclass
67
59
  class UploadState:
68
- """Tracks upload state for resume functionality."""
69
-
70
- local_path: str
71
- remote_url: str
72
- completed_experiments: List[str] = field(
73
- default_factory=list
74
- ) # ["project/experiment"]
75
- failed_experiments: List[str] = field(default_factory=list)
76
- in_progress_experiment: Optional[str] = None
77
- timestamp: Optional[str] = None
78
-
79
- def to_dict(self) -> Dict[str, Any]:
80
- """Convert to dictionary for JSON serialization."""
81
- return {
82
- "local_path": self.local_path,
83
- "remote_url": self.remote_url,
84
- "completed_experiments": self.completed_experiments,
85
- "failed_experiments": self.failed_experiments,
86
- "in_progress_experiment": self.in_progress_experiment,
87
- "timestamp": self.timestamp,
88
- }
89
-
90
- @classmethod
91
- def from_dict(cls, data: Dict[str, Any]) -> "UploadState":
92
- """Create from dictionary."""
93
- return cls(
94
- local_path=data["local_path"],
95
- remote_url=data["remote_url"],
96
- completed_experiments=data.get("completed_experiments", []),
97
- failed_experiments=data.get("failed_experiments", []),
98
- in_progress_experiment=data.get("in_progress_experiment"),
99
- timestamp=data.get("timestamp"),
100
- )
101
-
102
- def save(self, path: Path):
103
- """Save state to file."""
104
- import datetime
105
-
106
- self.timestamp = datetime.datetime.now().isoformat()
107
- with open(path, "w") as f:
108
- json.dump(self.to_dict(), f, indent=2)
60
+ """Tracks upload state for resume functionality."""
61
+ local_path: str
62
+ remote_url: str
63
+ completed_experiments: List[str] = field(default_factory=list) # ["project/experiment"]
64
+ failed_experiments: List[str] = field(default_factory=list)
65
+ in_progress_experiment: Optional[str] = None
66
+ timestamp: Optional[str] = None
67
+
68
+ def to_dict(self) -> Dict[str, Any]:
69
+ """Convert to dictionary for JSON serialization."""
70
+ return {
71
+ "local_path": self.local_path,
72
+ "remote_url": self.remote_url,
73
+ "completed_experiments": self.completed_experiments,
74
+ "failed_experiments": self.failed_experiments,
75
+ "in_progress_experiment": self.in_progress_experiment,
76
+ "timestamp": self.timestamp,
77
+ }
78
+
79
+ @classmethod
80
+ def from_dict(cls, data: Dict[str, Any]) -> "UploadState":
81
+ """Create from dictionary."""
82
+ return cls(
83
+ local_path=data["local_path"],
84
+ remote_url=data["remote_url"],
85
+ completed_experiments=data.get("completed_experiments", []),
86
+ failed_experiments=data.get("failed_experiments", []),
87
+ in_progress_experiment=data.get("in_progress_experiment"),
88
+ timestamp=data.get("timestamp"),
89
+ )
109
90
 
110
- @classmethod
111
- def load(cls, path: Path) -> Optional["UploadState"]:
112
- """Load state from file."""
113
- if not path.exists():
114
- return None
115
- try:
116
- with open(path, "r") as f:
117
- data = json.load(f)
118
- return cls.from_dict(data)
119
- except (json.JSONDecodeError, IOError, KeyError):
120
- return None
91
+ def save(self, path: Path):
92
+ """Save state to file."""
93
+ import datetime
94
+ self.timestamp = datetime.datetime.now().isoformat()
95
+ with open(path, "w") as f:
96
+ json.dump(self.to_dict(), f, indent=2)
97
+
98
+ @classmethod
99
+ def load(cls, path: Path) -> Optional["UploadState"]:
100
+ """Load state from file."""
101
+ if not path.exists():
102
+ return None
103
+ try:
104
+ with open(path, "r") as f:
105
+ data = json.load(f)
106
+ return cls.from_dict(data)
107
+ except (json.JSONDecodeError, IOError, KeyError):
108
+ return None
121
109
 
122
110
 
123
111
  def add_parser(subparsers) -> argparse.ArgumentParser:
124
- """Add upload command parser."""
125
- parser = subparsers.add_parser(
126
- "upload",
127
- help="Upload local experiments to remote server",
128
- description="Upload locally-stored ML-Dash experiment data to a remote server.",
129
- )
130
-
131
- # Positional argument
132
- parser.add_argument(
133
- "path",
134
- nargs="?",
135
- default="./.dash",
136
- help="Local storage directory to upload from (default: ./.dash)",
137
- )
138
-
139
- # Remote configuration
140
- parser.add_argument(
141
- "--dash-url",
142
- type=str,
143
- help="ML-Dash server URL (defaults to config or https://api.dash.ml)",
144
- )
145
- parser.add_argument(
146
- "--api-key",
147
- type=str,
148
- help="JWT token for authentication (optional - auto-loads from 'ml-dash login' if not provided)",
149
- )
150
-
151
- """
152
-
153
- cd .dash/geyang
154
- cd iclr_2026
155
-
156
- ml-dash upload -p geyang/new-run * # this uploads all of the folders to geyang/new-run.
157
-
158
- or
159
-
160
- ml-dash upload --prefix geyang/new-run/local-results ./* # uploads under the local-results prefix.
161
-
162
- ml-dash download --prefix geyang/new-run/zehua-results --filter *.mp4 --dryrun --verbose
163
-
164
- mo-dash list --prefix geyang/new-run/zehua-results --filter xxx-xxx --verbose
165
-
166
- mo-dash list-exp --prefix geyang/new-run/zehua-results --filter xxx-xxx --verbose
167
-
168
- """
169
-
170
- # Scope control
171
- # Ge: project should be {owner}/{proj_name}
172
- parser.add_argument(
173
- "-p",
174
- "--pref",
175
- "--prefix",
176
- "--proj",
177
- "--project",
178
- type=str,
179
- help="Filter experiments by prefix pattern (supports glob: 'tom/*/exp*', 'alice/project-?/baseline')",
180
- )
181
-
182
- # Target prefix for server (like scp destination)
183
- parser.add_argument(
184
- "-t",
185
- "--target",
186
- type=str,
187
- help="Target prefix/directory on server where experiments will be uploaded (e.g., 'alice/shared-project'). Similar to 'scp local/ remote-path/'",
188
- )
189
- # parser.add_argument(
190
- # "--experiment",
191
- # type=str,
192
- # help="Upload only this specific experiment (requires --project)",
193
- # )
194
-
195
- # Data filtering
196
- parser.add_argument(
197
- "--skip-logs",
198
- action="store_true",
199
- help="Don't upload logs",
200
- )
201
- parser.add_argument(
202
- "--skip-metrics",
203
- action="store_true",
204
- help="Don't upload metrics",
205
- )
206
- parser.add_argument(
207
- "--skip-files",
208
- action="store_true",
209
- help="Don't upload files",
210
- )
211
- parser.add_argument(
212
- "--skip-params",
213
- action="store_true",
214
- help="Don't upload parameters",
215
- )
216
-
217
- # Behavior control
218
- parser.add_argument(
219
- "--dry-run",
220
- action="store_true",
221
- help="Show what would be uploaded without uploading",
222
- )
223
- parser.add_argument(
224
- "--strict",
225
- action="store_true",
226
- help="Fail on any validation error (default: skip invalid data)",
227
- )
228
- parser.add_argument(
229
- "-v",
230
- "--verbose",
231
- action="store_true",
232
- help="Show detailed progress",
233
- )
234
- parser.add_argument(
235
- "--batch-size",
236
- type=int,
237
- default=100,
238
- help="Batch size for logs/metrics (default: 100)",
239
- )
240
- parser.add_argument(
241
- "--resume",
242
- action="store_true",
243
- help="Resume previous interrupted upload",
244
- )
245
- parser.add_argument(
246
- "--state-file",
247
- type=str,
248
- default=".dash-upload-state.json",
249
- help="Path to state file for resume (default: .dash-upload-state.json)",
250
- )
251
-
252
- return parser
112
+ """Add upload command parser."""
113
+ parser = subparsers.add_parser(
114
+ "upload",
115
+ help="Upload local experiments to remote server",
116
+ description="Upload locally-stored ML-Dash experiment data to a remote server.",
117
+ )
253
118
 
119
+ # Positional argument
120
+ parser.add_argument(
121
+ "path",
122
+ nargs="?",
123
+ default="./.ml-dash",
124
+ help="Local storage directory to upload from (default: ./.ml-dash)",
125
+ )
254
126
 
255
- def discover_experiments(
256
- local_path: Path,
257
- project_filter: Optional[str] = None,
258
- experiment_filter: Optional[str] = None,
259
- ) -> List[ExperimentInfo]:
260
- """
261
- Discover experiments in local storage directory.
262
-
263
- Supports both flat (local_path/project/experiment) and folder-based
264
- (local_path/folder/project/experiment) hierarchies.
265
-
266
- Args:
267
- local_path: Root path of local storage
268
- project_filter: Glob pattern to filter experiments by prefix (e.g., "tom/*/exp*")
269
- experiment_filter: Only discover this experiment (requires project_filter)
270
-
271
- Returns:
272
- List of ExperimentInfo objects
273
- """
274
- import fnmatch
275
-
276
- local_path = Path(local_path)
277
-
278
- if not local_path.exists():
279
- return []
280
-
281
- experiments = []
282
-
283
- # Find all experiment.json files recursively
284
- for exp_json in local_path.rglob("*/experiment.json"):
285
- exp_dir = exp_json.parent
286
-
287
- # Read prefix from experiment.json first
288
- prefix = None
289
- try:
290
- with open(exp_json, "r") as f:
291
- metadata = json.load(f)
292
- prefix = metadata.get("prefix")
293
- except:
294
- pass
295
-
296
- # Extract project and experiment names from PREFIX (not path)
297
- # This handles nested folders correctly
298
- # Prefix format: owner/project/folder.../experiment
299
- try:
300
- relative_path = exp_dir.relative_to(local_path)
301
- full_relative_path = str(relative_path)
302
-
303
- if prefix:
304
- # Parse from prefix for accuracy
305
- prefix_parts = prefix.strip("/").split("/")
306
- if len(prefix_parts) < 3:
307
- continue # Need at least owner/project/experiment
308
-
309
- # owner = prefix_parts[0]
310
- project_name = prefix_parts[1]
311
- exp_name = prefix_parts[-1]
312
- else:
313
- # Fallback to path-based parsing (legacy support)
314
- parts = relative_path.parts
315
- if len(parts) < 2:
316
- continue
317
- exp_name = parts[-1]
318
- project_name = parts[-2]
319
-
320
- # Apply filters with glob pattern support
321
- if project_filter:
322
- # Support glob pattern matching on the full relative path
323
- if not fnmatch.fnmatch(full_relative_path, project_filter):
324
- continue
325
- if experiment_filter and exp_name != experiment_filter:
326
- continue
327
-
328
- # Create experiment info
329
- exp_info = ExperimentInfo(
330
- project=project_name,
331
- experiment=exp_name,
332
- path=exp_dir,
333
- prefix=prefix,
334
- )
335
- except (ValueError, IndexError):
336
- continue
337
-
338
- # Check for parameters
339
- params_file = exp_dir / "parameters.json"
340
- exp_info.has_params = params_file.exists()
341
-
342
- # Check for logs
343
- logs_file = exp_dir / "logs/logs.jsonl"
344
- exp_info.has_logs = logs_file.exists()
345
-
346
- # Check for metrics
347
- metrics_dir = exp_dir / "metrics"
348
- if metrics_dir.exists():
349
- for metric_dir in metrics_dir.iterdir():
350
- if metric_dir.is_dir():
351
- data_file = metric_dir / "data.jsonl"
352
- if data_file.exists():
353
- exp_info.metric_names.append(metric_dir.name)
354
-
355
- # Check for files
356
- files_dir = exp_dir / "files"
357
- if files_dir.exists():
358
- try:
359
- # Count files recursively
360
- exp_info.file_count = sum(1 for _ in files_dir.rglob("*") if _.is_file())
361
-
362
- # Estimate size
363
- exp_info.estimated_size = sum(
364
- f.stat().st_size for f in files_dir.rglob("*") if f.is_file()
365
- )
366
- except (OSError, PermissionError):
367
- pass
127
+ # Remote configuration
128
+ parser.add_argument(
129
+ "--remote",
130
+ type=str,
131
+ help="Remote server URL (required unless set in config)",
132
+ )
133
+ parser.add_argument(
134
+ "--api-key",
135
+ type=str,
136
+ help="JWT token for authentication (optional - auto-loads from 'ml-dash login' if not provided)",
137
+ )
368
138
 
369
- experiments.append(exp_info)
139
+ # Scope control
140
+ parser.add_argument(
141
+ "--project",
142
+ type=str,
143
+ help="Upload only experiments from this project",
144
+ )
145
+ parser.add_argument(
146
+ "--experiment",
147
+ type=str,
148
+ help="Upload only this specific experiment (requires --project)",
149
+ )
370
150
 
371
- return experiments
151
+ # Data filtering
152
+ parser.add_argument(
153
+ "--skip-logs",
154
+ action="store_true",
155
+ help="Don't upload logs",
156
+ )
157
+ parser.add_argument(
158
+ "--skip-metrics",
159
+ action="store_true",
160
+ help="Don't upload metrics",
161
+ )
162
+ parser.add_argument(
163
+ "--skip-files",
164
+ action="store_true",
165
+ help="Don't upload files",
166
+ )
167
+ parser.add_argument(
168
+ "--skip-params",
169
+ action="store_true",
170
+ help="Don't upload parameters",
171
+ )
372
172
 
173
+ # Behavior control
174
+ parser.add_argument(
175
+ "--dry-run",
176
+ action="store_true",
177
+ help="Show what would be uploaded without uploading",
178
+ )
179
+ parser.add_argument(
180
+ "--strict",
181
+ action="store_true",
182
+ help="Fail on any validation error (default: skip invalid data)",
183
+ )
184
+ parser.add_argument(
185
+ "-v", "--verbose",
186
+ action="store_true",
187
+ help="Show detailed progress",
188
+ )
189
+ parser.add_argument(
190
+ "--batch-size",
191
+ type=int,
192
+ default=100,
193
+ help="Batch size for logs/metrics (default: 100)",
194
+ )
195
+ parser.add_argument(
196
+ "--resume",
197
+ action="store_true",
198
+ help="Resume previous interrupted upload",
199
+ )
200
+ parser.add_argument(
201
+ "--state-file",
202
+ type=str,
203
+ default=".ml-dash-upload-state.json",
204
+ help="Path to state file for resume (default: .ml-dash-upload-state.json)",
205
+ )
373
206
 
374
- class ExperimentValidator:
375
- """Validates local experiment data before upload."""
207
+ return parser
376
208
 
377
- def __init__(self, strict: bool = False):
378
- """
379
- Initialize validator.
380
209
 
381
- Args:
382
- strict: If True, fail on any validation error
210
+ def discover_experiments(
211
+ local_path: Path,
212
+ project_filter: Optional[str] = None,
213
+ experiment_filter: Optional[str] = None,
214
+ ) -> List[ExperimentInfo]:
383
215
  """
384
- self.strict = strict
216
+ Discover experiments in local storage directory.
385
217
 
386
- def validate_experiment(self, exp_info: ExperimentInfo) -> ValidationResult:
387
- """
388
- Validate experiment directory structure and data.
218
+ Supports both flat (local_path/project/experiment) and folder-based
219
+ (local_path/folder/project/experiment) hierarchies.
389
220
 
390
221
  Args:
391
- exp_info: Experiment information
222
+ local_path: Root path of local storage
223
+ project_filter: Only discover experiments in this project
224
+ experiment_filter: Only discover this experiment (requires project_filter)
392
225
 
393
226
  Returns:
394
- ValidationResult with validation status and messages
227
+ List of ExperimentInfo objects
395
228
  """
396
- result = ValidationResult()
397
- result.valid_data = {}
398
-
399
- # 1. Validate experiment metadata (required)
400
- if not self._validate_experiment_metadata(exp_info, result):
401
- result.is_valid = False
402
- return result
403
-
404
- # 2. Validate parameters (optional)
405
- self._validate_parameters(exp_info, result)
406
-
407
- # 3. Validate logs (optional)
408
- self._validate_logs(exp_info, result)
409
-
410
- # 4. Validate metrics (optional)
411
- self._validate_metrics(exp_info, result)
412
-
413
- # 5. Validate files (optional)
414
- self._validate_files(exp_info, result)
415
-
416
- # In strict mode, any warning becomes an error
417
- if self.strict and result.warnings:
418
- result.errors.extend(result.warnings)
419
- result.warnings = []
420
- result.is_valid = False
421
-
422
- return result
423
-
424
- def _validate_experiment_metadata(
425
- self, exp_info: ExperimentInfo, result: ValidationResult
426
- ) -> bool:
427
- """Validate experiment.json exists and is valid."""
428
- exp_json = exp_info.path / "experiment.json"
429
-
430
- if not exp_json.exists():
431
- result.errors.append("Missing experiment.json")
432
- return False
433
-
434
- try:
435
- with open(exp_json, "r") as f:
436
- metadata = json.load(f)
437
-
438
- # Check required fields
439
- if "name" not in metadata or "project" not in metadata:
440
- result.errors.append("experiment.json missing required fields (name, project)")
441
- return False
442
-
443
- result.valid_data["metadata"] = metadata
444
- return True
445
-
446
- except json.JSONDecodeError as e:
447
- result.errors.append(f"Invalid JSON in experiment.json: {e}")
448
- return False
449
- except IOError as e:
450
- result.errors.append(f"Cannot read experiment.json: {e}")
451
- return False
452
-
453
- def _validate_parameters(self, exp_info: ExperimentInfo, result: ValidationResult):
454
- """Validate parameters.json format."""
455
- if not exp_info.has_params:
456
- return
457
-
458
- params_file = exp_info.path / "parameters.json"
459
- try:
460
- with open(params_file, "r") as f:
461
- params = json.load(f)
462
-
463
- # Check if it's a dict
464
- if not isinstance(params, dict):
465
- result.warnings.append("parameters.json is not a dict (will skip)")
466
- return
467
-
468
- # Check for valid data key if using versioned format
469
- if "data" in params:
470
- if not isinstance(params["data"], dict):
471
- result.warnings.append("parameters.json data is not a dict (will skip)")
472
- return
473
- result.valid_data["parameters"] = params["data"]
474
- else:
475
- result.valid_data["parameters"] = params
476
-
477
- except json.JSONDecodeError as e:
478
- result.warnings.append(f"Invalid JSON in parameters.json: {e} (will skip)")
479
- except IOError as e:
480
- result.warnings.append(f"Cannot read parameters.json: {e} (will skip)")
481
-
482
- def _validate_logs(self, exp_info: ExperimentInfo, result: ValidationResult):
483
- """Validate logs.jsonl format."""
484
- if not exp_info.has_logs:
485
- return
486
-
487
- logs_file = exp_info.path / "logs/logs.jsonl"
488
- invalid_lines = []
489
-
490
- try:
491
- with open(logs_file, "r") as f:
492
- for line_num, line in enumerate(f, start=1):
493
- try:
494
- log_entry = json.loads(line)
495
- # Check required fields
496
- if "message" not in log_entry:
497
- invalid_lines.append(line_num)
498
- except json.JSONDecodeError:
499
- invalid_lines.append(line_num)
500
-
501
- if invalid_lines:
502
- count = len(invalid_lines)
503
- preview = invalid_lines[:5]
504
- result.warnings.append(
505
- f"logs.jsonl has {count} invalid lines (e.g., {preview}...) - will skip these"
506
- )
229
+ local_path = Path(local_path)
230
+
231
+ if not local_path.exists():
232
+ return []
233
+
234
+ experiments = []
235
+
236
+ # Find all experiment.json files recursively
237
+ for exp_json in local_path.rglob("*/experiment.json"):
238
+ exp_dir = exp_json.parent
507
239
 
508
- except IOError as e:
509
- result.warnings.append(f"Cannot read logs.jsonl: {e} (will skip logs)")
240
+ # Extract project and experiment names from path
241
+ # Path structure: local_path / [folder] / project / experiment
242
+ try:
243
+ relative_path = exp_dir.relative_to(local_path)
244
+ parts = relative_path.parts
245
+
246
+ if len(parts) < 2:
247
+ continue # Need at least project/experiment
510
248
 
511
- def _validate_metrics(self, exp_info: ExperimentInfo, result: ValidationResult):
512
- """Validate metrics data."""
513
- if not exp_info.metric_names:
514
- return
249
+ # Last two parts are project/experiment
250
+ exp_name = parts[-1]
251
+ project_name = parts[-2]
515
252
 
516
- for metric_name in exp_info.metric_names:
517
- metric_dir = exp_info.path / "metrics" / metric_name
518
- data_file = metric_dir / "data.jsonl"
253
+ # Apply filters
254
+ if project_filter and project_name != project_filter:
255
+ continue
256
+ if experiment_filter and exp_name != experiment_filter:
257
+ continue
519
258
 
520
- invalid_lines = []
521
- try:
522
- with open(data_file, "r") as f:
523
- for line_num, line in enumerate(f, start=1):
259
+ # Read folder from experiment.json
260
+ folder = None
524
261
  try:
525
- data_point = json.loads(line)
526
- # Check for data field
527
- if "data" not in data_point:
528
- invalid_lines.append(line_num)
529
- except json.JSONDecodeError:
530
- invalid_lines.append(line_num)
531
-
532
- if invalid_lines:
533
- count = len(invalid_lines)
534
- preview = invalid_lines[:5]
535
- result.warnings.append(
536
- f"metric '{metric_name}' has {count} invalid lines (e.g., {preview}...) - will skip these"
537
- )
538
-
539
- except IOError as e:
540
- result.warnings.append(f"Cannot read metric '{metric_name}': {e} (will skip)")
541
-
542
- def _validate_files(self, exp_info: ExperimentInfo, result: ValidationResult):
543
- """Validate files existence."""
544
- files_dir = exp_info.path / "files"
545
- if not files_dir.exists():
546
- return
547
-
548
- metadata_file = files_dir / ".files_metadata.json"
549
- if not metadata_file.exists():
550
- return
551
-
552
- try:
553
- with open(metadata_file, "r") as f:
554
- files_metadata = json.load(f)
555
-
556
- missing_files = []
557
- for file_id, file_info in files_metadata.items():
558
- if isinstance(file_info, dict) and file_info.get("deletedAt") is None:
559
- # Check if file exists
560
- file_path = (
561
- files_dir
562
- / file_info.get("prefix", "")
563
- / file_id
564
- / file_info.get("filename", "")
565
- )
566
- if not file_path.exists():
567
- missing_files.append(file_info.get("filename", file_id))
568
-
569
- if missing_files:
570
- count = len(missing_files)
571
- preview = missing_files[:3]
572
- result.warnings.append(
573
- f"{count} files referenced in metadata but missing on disk (e.g., {preview}...) - will skip these"
574
- )
262
+ with open(exp_json, 'r') as f:
263
+ metadata = json.load(f)
264
+ folder = metadata.get('folder')
265
+ except:
266
+ pass
267
+
268
+ # Create experiment info
269
+ exp_info = ExperimentInfo(
270
+ project=project_name,
271
+ experiment=exp_name,
272
+ path=exp_dir,
273
+ folder=folder,
274
+ )
275
+ except (ValueError, IndexError):
276
+ continue
575
277
 
576
- except (json.JSONDecodeError, IOError):
577
- pass # If we can't read metadata, just skip file validation
278
+ # Check for parameters
279
+ params_file = exp_dir / "parameters.json"
280
+ exp_info.has_params = params_file.exists()
281
+
282
+ # Check for logs
283
+ logs_file = exp_dir / "logs" / "logs.jsonl"
284
+ exp_info.has_logs = logs_file.exists()
285
+
286
+ # Check for metrics
287
+ metrics_dir = exp_dir / "metrics"
288
+ if metrics_dir.exists():
289
+ for metric_dir in metrics_dir.iterdir():
290
+ if metric_dir.is_dir():
291
+ data_file = metric_dir / "data.jsonl"
292
+ if data_file.exists():
293
+ exp_info.metric_names.append(metric_dir.name)
294
+
295
+ # Check for files
296
+ files_dir = exp_dir / "files"
297
+ if files_dir.exists():
298
+ try:
299
+ # Count files recursively
300
+ exp_info.file_count = sum(1 for _ in files_dir.rglob("*") if _.is_file())
578
301
 
302
+ # Estimate size
303
+ exp_info.estimated_size = sum(
304
+ f.stat().st_size for f in files_dir.rglob("*") if f.is_file()
305
+ )
306
+ except (OSError, PermissionError):
307
+ pass
579
308
 
580
- class ExperimentUploader:
581
- """Handles uploading a single experiment."""
582
-
583
- def __init__(
584
- self,
585
- local_storage: LocalStorage,
586
- remote_client: RemoteClient,
587
- batch_size: int = 100,
588
- skip_logs: bool = False,
589
- skip_metrics: bool = False,
590
- skip_files: bool = False,
591
- skip_params: bool = False,
592
- verbose: bool = False,
593
- progress: Optional[Progress] = None,
594
- max_concurrent_metrics: int = 5,
595
- target_prefix: Optional[str] = None,
596
- ):
597
- """
598
- Initialize uploader.
309
+ experiments.append(exp_info)
599
310
 
600
- Args:
601
- local_storage: Local storage instance
602
- remote_client: Remote client instance
603
- batch_size: Batch size for logs/metrics
604
- skip_logs: Skip uploading logs
605
- skip_metrics: Skip uploading metrics
606
- skip_files: Skip uploading files
607
- skip_params: Skip uploading parameters
608
- verbose: Show verbose output
609
- progress: Optional rich Progress instance for tracking
610
- max_concurrent_metrics: Maximum concurrent metric uploads (default: 5)
611
- target_prefix: Target prefix on server (overrides local prefix)
612
- """
613
- self.local = local_storage
614
- self.remote = remote_client
615
- self.batch_size = batch_size
616
- self.skip_logs = skip_logs
617
- self.skip_metrics = skip_metrics
618
- self.skip_files = skip_files
619
- self.skip_params = skip_params
620
- self.verbose = verbose
621
- self.progress = progress
622
- self.max_concurrent_metrics = max_concurrent_metrics
623
- self.target_prefix = target_prefix
624
- # Thread-safe lock for shared state updates
625
- self._lock = threading.Lock()
626
- # Thread-local storage for remote clients (for thread-safe HTTP requests)
627
- self._thread_local = threading.local()
628
-
629
- def _get_remote_client(self) -> RemoteClient:
630
- """Get thread-local remote client for safe concurrent access."""
631
- if not hasattr(self._thread_local, "client"):
632
- # Create a new client for this thread
633
- # Use graphql_base_url (without /api) since RemoteClient.__init__ will add /api
634
- self._thread_local.client = RemoteClient(
635
- base_url=self.remote.graphql_base_url, api_key=self.remote.api_key
636
- )
637
- return self._thread_local.client
638
-
639
- def upload_experiment(
640
- self, exp_info: ExperimentInfo, validation_result: ValidationResult, task_id=None
641
- ) -> UploadResult:
642
- """
643
- Upload a single experiment with all its data.
311
+ return experiments
644
312
 
645
- Args:
646
- exp_info: Experiment information
647
- validation_result: Validation results
648
- task_id: Optional progress task ID
649
313
 
650
- Returns:
651
- UploadResult with upload status
652
- """
653
- result = UploadResult(experiment=f"{exp_info.project}/{exp_info.experiment}")
654
-
655
- # Calculate total steps for progress tracking
656
- total_steps = 1 # metadata
657
- if not self.skip_params and "parameters" in validation_result.valid_data:
658
- total_steps += 1
659
- if not self.skip_logs and exp_info.has_logs:
660
- total_steps += 1
661
- if not self.skip_metrics and exp_info.metric_names:
662
- total_steps += len(exp_info.metric_names)
663
- if not self.skip_files and exp_info.file_count > 0:
664
- total_steps += exp_info.file_count
665
-
666
- current_step = 0
667
-
668
- def update_progress(description: str):
669
- nonlocal current_step
670
- current_step += 1
671
- if self.progress and task_id is not None:
672
- self.progress.update(
673
- task_id, completed=current_step, total=total_steps, description=description
674
- )
314
+ class ExperimentValidator:
315
+ """Validates local experiment data before upload."""
675
316
 
676
- try:
677
- # 1. Create/update experiment metadata
678
- update_progress("Creating experiment...")
679
- if self.verbose:
680
- console.print(" [dim]Creating experiment...[/dim]")
681
-
682
- exp_data = validation_result.valid_data
683
-
684
- # Construct full prefix for server
685
- # If --target is specified, use it as the base destination prefix
686
- # Otherwise, preserve the local prefix structure
687
- if self.target_prefix:
688
- # User specified a target prefix (like scp destination directory)
689
- # Append experiment name to it: target_prefix/experiment_name
690
- full_prefix = f"{self.target_prefix.rstrip('/')}/{exp_info.experiment}"
691
-
692
- # Extract project from target prefix for API call
693
- # Target format: owner/project/path...
694
- target_parts = self.target_prefix.strip("/").split("/")
695
- if len(target_parts) >= 2:
696
- target_project = target_parts[1]
697
- else:
698
- target_project = exp_info.project # Fallback to original
699
- elif exp_info.prefix:
700
- # No target specified, preserve local prefix structure
701
- full_prefix = f"{exp_info.prefix}/{exp_info.experiment}"
702
- target_project = exp_info.project
703
- else:
704
- full_prefix = exp_info.experiment
705
- target_project = exp_info.project
706
-
707
- response = self.remote.create_or_update_experiment(
708
- project=target_project,
709
- name=exp_info.experiment,
710
- description=exp_data.get("description"),
711
- tags=exp_data.get("tags"),
712
- bindrs=exp_data.get("bindrs"),
713
- prefix=full_prefix, # Send full prefix (folder + name) or target prefix
714
- write_protected=exp_data.get("write_protected", False),
715
- metadata=exp_data.get("metadata"),
716
- )
717
-
718
- # Extract experiment ID from nested response
719
- experiment_id = response.get("experiment", {}).get("id") or response.get("id")
720
- if self.verbose:
721
- console.print(f" [green]✓[/green] Created experiment (id: {experiment_id})")
722
-
723
- # 2. Upload parameters
724
- if not self.skip_params and "parameters" in validation_result.valid_data:
725
- update_progress("Uploading parameters...")
726
- if self.verbose:
727
- console.print(" [dim]Uploading parameters...[/dim]")
317
+ def __init__(self, strict: bool = False):
318
+ """
319
+ Initialize validator.
728
320
 
729
- params = validation_result.valid_data["parameters"]
730
- self.remote.set_parameters(experiment_id, params)
731
- result.uploaded["params"] = len(params)
732
- # Track bytes (approximate JSON size)
733
- result.bytes_uploaded += len(json.dumps(params).encode("utf-8"))
321
+ Args:
322
+ strict: If True, fail on any validation error
323
+ """
324
+ self.strict = strict
734
325
 
735
- if self.verbose:
736
- console.print(f" [green]✓[/green] Uploaded {len(params)} parameters")
326
+ def validate_experiment(self, exp_info: ExperimentInfo) -> ValidationResult:
327
+ """
328
+ Validate experiment directory structure and data.
737
329
 
738
- # 3. Upload logs
739
- if not self.skip_logs and exp_info.has_logs:
740
- count = self._upload_logs(
741
- experiment_id, exp_info, result, task_id, update_progress
742
- )
743
- result.uploaded["logs"] = count
330
+ Args:
331
+ exp_info: Experiment information
744
332
 
745
- # 4. Upload metrics
746
- if not self.skip_metrics and exp_info.metric_names:
747
- count = self._upload_metrics(
748
- experiment_id, exp_info, result, task_id, update_progress
749
- )
750
- result.uploaded["metrics"] = count
333
+ Returns:
334
+ ValidationResult with validation status and messages
335
+ """
336
+ result = ValidationResult()
337
+ result.valid_data = {}
751
338
 
752
- # 5. Upload files
753
- if not self.skip_files and exp_info.file_count > 0:
754
- count = self._upload_files(
755
- experiment_id, exp_info, result, task_id, update_progress
756
- )
757
- result.uploaded["files"] = count
758
-
759
- result.success = True
760
-
761
- except Exception as e:
762
- result.success = False
763
- result.errors.append(str(e))
764
- if self.verbose:
765
- console.print(f" [red]✗ Error: {e}[/red]")
766
-
767
- return result
768
-
769
- def _upload_logs(
770
- self,
771
- experiment_id: str,
772
- exp_info: ExperimentInfo,
773
- result: UploadResult,
774
- task_id=None,
775
- update_progress=None,
776
- ) -> int:
777
- """Upload logs in batches."""
778
- if update_progress:
779
- update_progress("Uploading logs...")
780
- if self.verbose:
781
- console.print(" [dim]Uploading logs...[/dim]")
782
-
783
- logs_file = exp_info.path / "logs/logs.jsonl"
784
- logs_batch = []
785
- total_uploaded = 0
786
- skipped = 0
787
-
788
- try:
789
- with open(logs_file, "r") as f:
790
- for line in f:
791
- try:
792
- log_entry = json.loads(line)
793
-
794
- # Validate required fields
795
- if "message" not in log_entry:
796
- skipped += 1
797
- continue
798
-
799
- # Prepare log entry for API
800
- api_log = {
801
- "timestamp": log_entry.get("timestamp"),
802
- "level": log_entry.get("level", "info"),
803
- "message": log_entry["message"],
804
- }
805
- if "metadata" in log_entry:
806
- api_log["metadata"] = log_entry["metadata"]
339
+ # 1. Validate experiment metadata (required)
340
+ if not self._validate_experiment_metadata(exp_info, result):
341
+ result.is_valid = False
342
+ return result
807
343
 
808
- logs_batch.append(api_log)
809
- # Track bytes
810
- result.bytes_uploaded += len(line.encode("utf-8"))
344
+ # 2. Validate parameters (optional)
345
+ self._validate_parameters(exp_info, result)
811
346
 
812
- # Upload batch
813
- if len(logs_batch) >= self.batch_size:
814
- self.remote.create_log_entries(experiment_id, logs_batch)
815
- total_uploaded += len(logs_batch)
816
- logs_batch = []
347
+ # 3. Validate logs (optional)
348
+ self._validate_logs(exp_info, result)
817
349
 
818
- except json.JSONDecodeError:
819
- skipped += 1
820
- continue
350
+ # 4. Validate metrics (optional)
351
+ self._validate_metrics(exp_info, result)
821
352
 
822
- # Upload remaining logs
823
- if logs_batch:
824
- self.remote.create_log_entries(experiment_id, logs_batch)
825
- total_uploaded += len(logs_batch)
353
+ # 5. Validate files (optional)
354
+ self._validate_files(exp_info, result)
826
355
 
827
- if self.verbose:
828
- msg = f" [green]✓[/green] Uploaded {total_uploaded} log entries"
829
- if skipped > 0:
830
- msg += f" (skipped {skipped} invalid)"
831
- console.print(msg)
356
+ # In strict mode, any warning becomes an error
357
+ if self.strict and result.warnings:
358
+ result.errors.extend(result.warnings)
359
+ result.warnings = []
360
+ result.is_valid = False
832
361
 
833
- except IOError as e:
834
- result.failed.setdefault("logs", []).append(str(e))
362
+ return result
835
363
 
836
- return total_uploaded
364
+ def _validate_experiment_metadata(self, exp_info: ExperimentInfo, result: ValidationResult) -> bool:
365
+ """Validate experiment.json exists and is valid."""
366
+ exp_json = exp_info.path / "experiment.json"
837
367
 
838
- def _upload_single_metric(
839
- self, experiment_id: str, metric_name: str, metric_dir: Path, result: UploadResult
840
- ) -> Dict[str, Any]:
841
- """
842
- Upload a single metric (thread-safe helper).
368
+ if not exp_json.exists():
369
+ result.errors.append("Missing experiment.json")
370
+ return False
843
371
 
844
- Returns:
845
- Dict with 'success', 'uploaded', 'skipped', 'bytes', and 'error' keys
846
- """
847
- data_file = metric_dir / "data.jsonl"
848
- data_batch = []
849
- total_uploaded = 0
850
- skipped = 0
851
- bytes_uploaded = 0
852
-
853
- # Get thread-local client for safe concurrent HTTP requests
854
- remote_client = self._get_remote_client()
855
-
856
- try:
857
- with open(data_file, "r") as f:
858
- for line in f:
859
- try:
860
- data_point = json.loads(line)
861
-
862
- # Validate required fields
863
- if "data" not in data_point:
864
- skipped += 1
865
- continue
866
-
867
- data_batch.append(data_point["data"])
868
- bytes_uploaded += len(line.encode("utf-8"))
869
-
870
- # Upload batch using thread-local client
871
- if len(data_batch) >= self.batch_size:
872
- remote_client.append_batch_to_metric(
873
- experiment_id, metric_name, data_batch
874
- )
875
- total_uploaded += len(data_batch)
876
- data_batch = []
877
-
878
- except json.JSONDecodeError:
879
- skipped += 1
880
- continue
372
+ try:
373
+ with open(exp_json, "r") as f:
374
+ metadata = json.load(f)
881
375
 
882
- # Upload remaining data points using thread-local client
883
- if data_batch:
884
- remote_client.append_batch_to_metric(experiment_id, metric_name, data_batch)
885
- total_uploaded += len(data_batch)
886
-
887
- return {
888
- "success": True,
889
- "uploaded": total_uploaded,
890
- "skipped": skipped,
891
- "bytes": bytes_uploaded,
892
- "error": None,
893
- }
894
-
895
- except Exception as e:
896
- return {
897
- "success": False,
898
- "uploaded": 0,
899
- "skipped": 0,
900
- "bytes": 0,
901
- "error": str(e),
902
- }
903
-
904
- def _upload_metrics(
905
- self,
906
- experiment_id: str,
907
- exp_info: ExperimentInfo,
908
- result: UploadResult,
909
- task_id=None,
910
- update_progress=None,
911
- ) -> int:
912
- """Upload metrics in parallel with concurrency limit."""
913
- if not exp_info.metric_names:
914
- return 0
915
-
916
- total_metrics = 0
917
-
918
- # Use ThreadPoolExecutor for parallel uploads
919
- with ThreadPoolExecutor(max_workers=self.max_concurrent_metrics) as executor:
920
- # Submit all metric upload tasks
921
- future_to_metric = {}
922
- for metric_name in exp_info.metric_names:
923
- metric_dir = exp_info.path / "metrics" / metric_name
924
- future = executor.submit(
925
- self._upload_single_metric, experiment_id, metric_name, metric_dir, result
926
- )
927
- future_to_metric[future] = metric_name
376
+ # Check required fields
377
+ if "name" not in metadata or "project" not in metadata:
378
+ result.errors.append("experiment.json missing required fields (name, project)")
379
+ return False
380
+
381
+ result.valid_data["metadata"] = metadata
382
+ return True
383
+
384
+ except json.JSONDecodeError as e:
385
+ result.errors.append(f"Invalid JSON in experiment.json: {e}")
386
+ return False
387
+ except IOError as e:
388
+ result.errors.append(f"Cannot read experiment.json: {e}")
389
+ return False
390
+
391
+ def _validate_parameters(self, exp_info: ExperimentInfo, result: ValidationResult):
392
+ """Validate parameters.json format."""
393
+ if not exp_info.has_params:
394
+ return
395
+
396
+ params_file = exp_info.path / "parameters.json"
397
+ try:
398
+ with open(params_file, "r") as f:
399
+ params = json.load(f)
400
+
401
+ # Check if it's a dict
402
+ if not isinstance(params, dict):
403
+ result.warnings.append("parameters.json is not a dict (will skip)")
404
+ return
405
+
406
+ # Check for valid data key if using versioned format
407
+ if "data" in params:
408
+ if not isinstance(params["data"], dict):
409
+ result.warnings.append("parameters.json data is not a dict (will skip)")
410
+ return
411
+ result.valid_data["parameters"] = params["data"]
412
+ else:
413
+ result.valid_data["parameters"] = params
414
+
415
+ except json.JSONDecodeError as e:
416
+ result.warnings.append(f"Invalid JSON in parameters.json: {e} (will skip)")
417
+ except IOError as e:
418
+ result.warnings.append(f"Cannot read parameters.json: {e} (will skip)")
419
+
420
+ def _validate_logs(self, exp_info: ExperimentInfo, result: ValidationResult):
421
+ """Validate logs.jsonl format."""
422
+ if not exp_info.has_logs:
423
+ return
424
+
425
+ logs_file = exp_info.path / "logs" / "logs.jsonl"
426
+ invalid_lines = []
427
+
428
+ try:
429
+ with open(logs_file, "r") as f:
430
+ for line_num, line in enumerate(f, start=1):
431
+ try:
432
+ log_entry = json.loads(line)
433
+ # Check required fields
434
+ if "message" not in log_entry:
435
+ invalid_lines.append(line_num)
436
+ except json.JSONDecodeError:
437
+ invalid_lines.append(line_num)
438
+
439
+ if invalid_lines:
440
+ count = len(invalid_lines)
441
+ preview = invalid_lines[:5]
442
+ result.warnings.append(
443
+ f"logs.jsonl has {count} invalid lines (e.g., {preview}...) - will skip these"
444
+ )
928
445
 
929
- # Process completed uploads as they finish
930
- for future in as_completed(future_to_metric):
931
- metric_name = future_to_metric[future]
446
+ except IOError as e:
447
+ result.warnings.append(f"Cannot read logs.jsonl: {e} (will skip logs)")
932
448
 
933
- # Update progress
934
- if update_progress:
935
- update_progress(f"Uploading metric '{metric_name}'...")
449
+ def _validate_metrics(self, exp_info: ExperimentInfo, result: ValidationResult):
450
+ """Validate metrics data."""
451
+ if not exp_info.metric_names:
452
+ return
453
+
454
+ for metric_name in exp_info.metric_names:
455
+ metric_dir = exp_info.path / "metrics" / metric_name
456
+ data_file = metric_dir / "data.jsonl"
457
+
458
+ invalid_lines = []
459
+ try:
460
+ with open(data_file, "r") as f:
461
+ for line_num, line in enumerate(f, start=1):
462
+ try:
463
+ data_point = json.loads(line)
464
+ # Check for data field
465
+ if "data" not in data_point:
466
+ invalid_lines.append(line_num)
467
+ except json.JSONDecodeError:
468
+ invalid_lines.append(line_num)
469
+
470
+ if invalid_lines:
471
+ count = len(invalid_lines)
472
+ preview = invalid_lines[:5]
473
+ result.warnings.append(
474
+ f"metric '{metric_name}' has {count} invalid lines (e.g., {preview}...) - will skip these"
475
+ )
476
+
477
+ except IOError as e:
478
+ result.warnings.append(f"Cannot read metric '{metric_name}': {e} (will skip)")
479
+
480
+ def _validate_files(self, exp_info: ExperimentInfo, result: ValidationResult):
481
+ """Validate files existence."""
482
+ files_dir = exp_info.path / "files"
483
+ if not files_dir.exists():
484
+ return
485
+
486
+ metadata_file = files_dir / ".files_metadata.json"
487
+ if not metadata_file.exists():
488
+ return
936
489
 
937
490
  try:
938
- upload_result = future.result()
491
+ with open(metadata_file, "r") as f:
492
+ files_metadata = json.load(f)
493
+
494
+ missing_files = []
495
+ for file_id, file_info in files_metadata.items():
496
+ if isinstance(file_info, dict) and file_info.get("deletedAt") is None:
497
+ # Check if file exists
498
+ file_path = files_dir / file_info.get("prefix", "") / file_id / file_info.get("filename", "")
499
+ if not file_path.exists():
500
+ missing_files.append(file_info.get("filename", file_id))
501
+
502
+ if missing_files:
503
+ count = len(missing_files)
504
+ preview = missing_files[:3]
505
+ result.warnings.append(
506
+ f"{count} files referenced in metadata but missing on disk (e.g., {preview}...) - will skip these"
507
+ )
508
+
509
+ except (json.JSONDecodeError, IOError):
510
+ pass # If we can't read metadata, just skip file validation
939
511
 
940
- # Thread-safe update of shared state
941
- with self._lock:
942
- result.bytes_uploaded += upload_result["bytes"]
943
512
 
944
- if upload_result["success"]:
945
- total_metrics += 1
513
+ class ExperimentUploader:
514
+ """Handles uploading a single experiment."""
515
+
516
+ def __init__(
517
+ self,
518
+ local_storage: LocalStorage,
519
+ remote_client: RemoteClient,
520
+ batch_size: int = 100,
521
+ skip_logs: bool = False,
522
+ skip_metrics: bool = False,
523
+ skip_files: bool = False,
524
+ skip_params: bool = False,
525
+ verbose: bool = False,
526
+ progress: Optional[Progress] = None,
527
+ max_concurrent_metrics: int = 5,
528
+ ):
529
+ """
530
+ Initialize uploader.
531
+
532
+ Args:
533
+ local_storage: Local storage instance
534
+ remote_client: Remote client instance
535
+ batch_size: Batch size for logs/metrics
536
+ skip_logs: Skip uploading logs
537
+ skip_metrics: Skip uploading metrics
538
+ skip_files: Skip uploading files
539
+ skip_params: Skip uploading parameters
540
+ verbose: Show verbose output
541
+ progress: Optional rich Progress instance for tracking
542
+ max_concurrent_metrics: Maximum concurrent metric uploads (default: 5)
543
+ """
544
+ self.local = local_storage
545
+ self.remote = remote_client
546
+ self.batch_size = batch_size
547
+ self.skip_logs = skip_logs
548
+ self.skip_metrics = skip_metrics
549
+ self.skip_files = skip_files
550
+ self.skip_params = skip_params
551
+ self.verbose = verbose
552
+ self.progress = progress
553
+ self.max_concurrent_metrics = max_concurrent_metrics
554
+ # Thread-safe lock for shared state updates
555
+ self._lock = threading.Lock()
556
+ # Thread-local storage for remote clients (for thread-safe HTTP requests)
557
+ self._thread_local = threading.local()
558
+
559
+ def _get_remote_client(self) -> RemoteClient:
560
+ """Get thread-local remote client for safe concurrent access."""
561
+ if not hasattr(self._thread_local, 'client'):
562
+ # Create a new client for this thread
563
+ self._thread_local.client = RemoteClient(
564
+ base_url=self.remote.base_url,
565
+ api_key=self.remote.api_key
566
+ )
567
+ return self._thread_local.client
568
+
569
+ def upload_experiment(
570
+ self, exp_info: ExperimentInfo, validation_result: ValidationResult, task_id=None
571
+ ) -> UploadResult:
572
+ """
573
+ Upload a single experiment with all its data.
574
+
575
+ Args:
576
+ exp_info: Experiment information
577
+ validation_result: Validation results
578
+ task_id: Optional progress task ID
579
+
580
+ Returns:
581
+ UploadResult with upload status
582
+ """
583
+ result = UploadResult(experiment=f"{exp_info.project}/{exp_info.experiment}")
584
+
585
+ # Calculate total steps for progress tracking
586
+ total_steps = 1 # metadata
587
+ if not self.skip_params and "parameters" in validation_result.valid_data:
588
+ total_steps += 1
589
+ if not self.skip_logs and exp_info.has_logs:
590
+ total_steps += 1
591
+ if not self.skip_metrics and exp_info.metric_names:
592
+ total_steps += len(exp_info.metric_names)
593
+ if not self.skip_files and exp_info.file_count > 0:
594
+ total_steps += exp_info.file_count
595
+
596
+ current_step = 0
597
+
598
+ def update_progress(description: str):
599
+ nonlocal current_step
600
+ current_step += 1
601
+ if self.progress and task_id is not None:
602
+ self.progress.update(task_id, completed=current_step, total=total_steps, description=description)
946
603
 
947
- # Thread-safe console output
604
+ try:
605
+ # 1. Create/update experiment metadata
606
+ update_progress("Creating experiment...")
948
607
  if self.verbose:
949
- msg = f" [green][/green] Uploaded {upload_result['uploaded']} data points for '{metric_name}'"
950
- if upload_result["skipped"] > 0:
951
- msg += f" (skipped {upload_result['skipped']} invalid)"
952
- with self._lock:
953
- console.print(msg)
954
- else:
955
- # Record failure
956
- error_msg = f"{metric_name}: {upload_result['error']}"
957
- with self._lock:
958
- result.failed.setdefault("metrics", []).append(error_msg)
959
- if self.verbose:
960
- console.print(
961
- f" [red]✗[/red] Failed to upload '{metric_name}': {upload_result['error']}"
962
- )
608
+ console.print(f" [dim]Creating experiment...[/dim]")
609
+
610
+ exp_data = validation_result.valid_data
611
+
612
+ # Store folder path in metadata (not as folderId which expects Snowflake ID)
613
+ custom_metadata = exp_data.get("metadata") or {}
614
+ if exp_data.get("folder"):
615
+ custom_metadata["folder"] = exp_data["folder"]
616
+
617
+ response = self.remote.create_or_update_experiment(
618
+ project=exp_info.project,
619
+ name=exp_info.experiment,
620
+ description=exp_data.get("description"),
621
+ tags=exp_data.get("tags"),
622
+ bindrs=exp_data.get("bindrs"),
623
+ folder=None, # Don't send folder path as folderId (expects Snowflake ID)
624
+ write_protected=exp_data.get("write_protected", False),
625
+ metadata=custom_metadata if custom_metadata else None,
626
+ )
627
+
628
+ # Extract experiment ID from nested response
629
+ experiment_id = response.get("experiment", {}).get("id") or response.get("id")
630
+ if self.verbose:
631
+ console.print(f" [green]✓[/green] Created experiment (id: {experiment_id})")
632
+
633
+ # 2. Upload parameters
634
+ if not self.skip_params and "parameters" in validation_result.valid_data:
635
+ update_progress("Uploading parameters...")
636
+ if self.verbose:
637
+ console.print(f" [dim]Uploading parameters...[/dim]")
638
+
639
+ params = validation_result.valid_data["parameters"]
640
+ self.remote.set_parameters(experiment_id, params)
641
+ result.uploaded["params"] = len(params)
642
+ # Track bytes (approximate JSON size)
643
+ result.bytes_uploaded += len(json.dumps(params).encode('utf-8'))
644
+
645
+ if self.verbose:
646
+ console.print(f" [green]✓[/green] Uploaded {len(params)} parameters")
647
+
648
+ # 3. Upload logs
649
+ if not self.skip_logs and exp_info.has_logs:
650
+ count = self._upload_logs(experiment_id, exp_info, result, task_id, update_progress)
651
+ result.uploaded["logs"] = count
652
+
653
+ # 4. Upload metrics
654
+ if not self.skip_metrics and exp_info.metric_names:
655
+ count = self._upload_metrics(experiment_id, exp_info, result, task_id, update_progress)
656
+ result.uploaded["metrics"] = count
657
+
658
+ # 5. Upload files
659
+ if not self.skip_files and exp_info.file_count > 0:
660
+ count = self._upload_files(experiment_id, exp_info, result, task_id, update_progress)
661
+ result.uploaded["files"] = count
662
+
663
+ result.success = True
963
664
 
964
665
  except Exception as e:
965
- # Handle unexpected errors
966
- error_msg = f"{metric_name}: {str(e)}"
967
- with self._lock:
968
- result.failed.setdefault("metrics", []).append(error_msg)
666
+ result.success = False
667
+ result.errors.append(str(e))
668
+ if self.verbose:
669
+ console.print(f" [red]✗ Error: {e}[/red]")
670
+
671
+ return result
672
+
673
+ def _upload_logs(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
674
+ task_id=None, update_progress=None) -> int:
675
+ """Upload logs in batches."""
676
+ if update_progress:
677
+ update_progress("Uploading logs...")
678
+ if self.verbose:
679
+ console.print(f" [dim]Uploading logs...[/dim]")
680
+
681
+ logs_file = exp_info.path / "logs" / "logs.jsonl"
682
+ logs_batch = []
683
+ total_uploaded = 0
684
+ skipped = 0
685
+
686
+ try:
687
+ with open(logs_file, "r") as f:
688
+ for line in f:
689
+ try:
690
+ log_entry = json.loads(line)
691
+
692
+ # Validate required fields
693
+ if "message" not in log_entry:
694
+ skipped += 1
695
+ continue
696
+
697
+ # Prepare log entry for API
698
+ api_log = {
699
+ "timestamp": log_entry.get("timestamp"),
700
+ "level": log_entry.get("level", "info"),
701
+ "message": log_entry["message"],
702
+ }
703
+ if "metadata" in log_entry:
704
+ api_log["metadata"] = log_entry["metadata"]
705
+
706
+ logs_batch.append(api_log)
707
+ # Track bytes
708
+ result.bytes_uploaded += len(line.encode('utf-8'))
709
+
710
+ # Upload batch
711
+ if len(logs_batch) >= self.batch_size:
712
+ self.remote.create_log_entries(experiment_id, logs_batch)
713
+ total_uploaded += len(logs_batch)
714
+ logs_batch = []
715
+
716
+ except json.JSONDecodeError:
717
+ skipped += 1
718
+ continue
719
+
720
+ # Upload remaining logs
721
+ if logs_batch:
722
+ self.remote.create_log_entries(experiment_id, logs_batch)
723
+ total_uploaded += len(logs_batch)
724
+
969
725
  if self.verbose:
970
- console.print(f" [red][/red] Failed to upload '{metric_name}': {e}")
971
-
972
- return total_metrics
973
-
974
- def _upload_files(
975
- self,
976
- experiment_id: str,
977
- exp_info: ExperimentInfo,
978
- result: UploadResult,
979
- task_id=None,
980
- update_progress=None,
981
- ) -> int:
982
- """Upload files one by one."""
983
- files_dir = exp_info.path / "files"
984
- total_uploaded = 0
985
-
986
- # Parse prefix to get owner, project, and experiment path
987
- # Format: owner/project/folder.../experiment
988
- parts = exp_info.prefix.split("/") if exp_info.prefix else []
989
- if len(parts) < 3:
990
- # Invalid prefix format, skip file upload
991
- return 0
992
-
993
- owner = parts[0]
994
- project = parts[1]
995
- # Note: _get_experiment_dir expects the FULL prefix, not just the experiment part
996
- # So we pass the full prefix to list_files
997
- full_prefix = exp_info.prefix
998
-
999
- # Use LocalStorage to list files
1000
- try:
1001
- files_list = self.local.list_files(owner, project, full_prefix)
1002
-
1003
- # Debug: print file count
1004
- if self.verbose:
1005
- print(f"[DEBUG] Found {len(files_list)} files to upload")
1006
- print(f"[DEBUG] Full prefix: {full_prefix}")
1007
-
1008
- for file_info in files_list:
1009
- # Skip deleted files
1010
- if file_info.get("deletedAt") is not None:
1011
- continue
726
+ msg = f" [green][/green] Uploaded {total_uploaded} log entries"
727
+ if skipped > 0:
728
+ msg += f" (skipped {skipped} invalid)"
729
+ console.print(msg)
730
+
731
+ except IOError as e:
732
+ result.failed.setdefault("logs", []).append(str(e))
733
+
734
+ return total_uploaded
735
+
736
+ def _upload_single_metric(
737
+ self,
738
+ experiment_id: str,
739
+ metric_name: str,
740
+ metric_dir: Path,
741
+ result: UploadResult
742
+ ) -> Dict[str, Any]:
743
+ """
744
+ Upload a single metric (thread-safe helper).
745
+
746
+ Returns:
747
+ Dict with 'success', 'uploaded', 'skipped', 'bytes', and 'error' keys
748
+ """
749
+ data_file = metric_dir / "data.jsonl"
750
+ data_batch = []
751
+ total_uploaded = 0
752
+ skipped = 0
753
+ bytes_uploaded = 0
754
+
755
+ # Get thread-local client for safe concurrent HTTP requests
756
+ remote_client = self._get_remote_client()
1012
757
 
1013
758
  try:
1014
- if update_progress:
1015
- update_progress(f"Uploading {file_info['filename']}...")
1016
-
1017
- # Get file path directly from storage without copying
1018
- file_id = file_info["id"]
1019
- experiment_dir = self.local._get_experiment_dir(
1020
- owner, project, full_prefix
1021
- )
1022
- files_dir = experiment_dir / "files"
1023
-
1024
- # Construct file path
1025
- file_prefix = file_info["path"].lstrip("/") if file_info["path"] else ""
1026
- if file_prefix:
1027
- file_path = files_dir / file_prefix / file_id / file_info["filename"]
1028
- else:
1029
- file_path = files_dir / file_id / file_info["filename"]
1030
-
1031
- # Upload to remote with correct parameters
1032
- self.remote.upload_file(
1033
- experiment_id=experiment_id,
1034
- file_path=str(file_path),
1035
- prefix=file_info.get("path", ""),
1036
- filename=file_info["filename"],
1037
- description=file_info.get("description"),
1038
- tags=file_info.get("tags", []),
1039
- metadata=file_info.get("metadata"),
1040
- checksum=file_info["checksum"],
1041
- content_type=file_info["contentType"],
1042
- size_bytes=file_info["sizeBytes"],
1043
- )
1044
-
1045
- total_uploaded += 1
1046
- # Track bytes
1047
- result.bytes_uploaded += file_info.get("sizeBytes", 0)
1048
-
1049
- if self.verbose:
1050
- size_mb = file_info.get("sizeBytes", 0) / (1024 * 1024)
1051
- console.print(
1052
- f" [green]✓[/green] {file_info['filename']} ({size_mb:.1f}MB)"
1053
- )
759
+ with open(data_file, "r") as f:
760
+ for line in f:
761
+ try:
762
+ data_point = json.loads(line)
763
+
764
+ # Validate required fields
765
+ if "data" not in data_point:
766
+ skipped += 1
767
+ continue
768
+
769
+ data_batch.append(data_point["data"])
770
+ bytes_uploaded += len(line.encode('utf-8'))
771
+
772
+ # Upload batch using thread-local client
773
+ if len(data_batch) >= self.batch_size:
774
+ remote_client.append_batch_to_metric(
775
+ experiment_id, metric_name, data_batch
776
+ )
777
+ total_uploaded += len(data_batch)
778
+ data_batch = []
779
+
780
+ except json.JSONDecodeError:
781
+ skipped += 1
782
+ continue
783
+
784
+ # Upload remaining data points using thread-local client
785
+ if data_batch:
786
+ remote_client.append_batch_to_metric(experiment_id, metric_name, data_batch)
787
+ total_uploaded += len(data_batch)
788
+
789
+ return {
790
+ 'success': True,
791
+ 'uploaded': total_uploaded,
792
+ 'skipped': skipped,
793
+ 'bytes': bytes_uploaded,
794
+ 'error': None
795
+ }
1054
796
 
1055
797
  except Exception as e:
1056
- result.failed.setdefault("files", []).append(f"{file_info['filename']}: {e}")
798
+ return {
799
+ 'success': False,
800
+ 'uploaded': 0,
801
+ 'skipped': 0,
802
+ 'bytes': 0,
803
+ 'error': str(e)
804
+ }
1057
805
 
1058
- except Exception as e:
1059
- result.failed.setdefault("files", []).append(str(e))
806
+ def _upload_metrics(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
807
+ task_id=None, update_progress=None) -> int:
808
+ """Upload metrics in parallel with concurrency limit."""
809
+ if not exp_info.metric_names:
810
+ return 0
811
+
812
+ total_metrics = 0
813
+
814
+ # Use ThreadPoolExecutor for parallel uploads
815
+ with ThreadPoolExecutor(max_workers=self.max_concurrent_metrics) as executor:
816
+ # Submit all metric upload tasks
817
+ future_to_metric = {}
818
+ for metric_name in exp_info.metric_names:
819
+ metric_dir = exp_info.path / "metrics" / metric_name
820
+ future = executor.submit(
821
+ self._upload_single_metric,
822
+ experiment_id,
823
+ metric_name,
824
+ metric_dir,
825
+ result
826
+ )
827
+ future_to_metric[future] = metric_name
828
+
829
+ # Process completed uploads as they finish
830
+ for future in as_completed(future_to_metric):
831
+ metric_name = future_to_metric[future]
832
+
833
+ # Update progress
834
+ if update_progress:
835
+ update_progress(f"Uploading metric '{metric_name}'...")
836
+
837
+ try:
838
+ upload_result = future.result()
839
+
840
+ # Thread-safe update of shared state
841
+ with self._lock:
842
+ result.bytes_uploaded += upload_result['bytes']
843
+
844
+ if upload_result['success']:
845
+ total_metrics += 1
846
+
847
+ # Thread-safe console output
848
+ if self.verbose:
849
+ msg = f" [green]✓[/green] Uploaded {upload_result['uploaded']} data points for '{metric_name}'"
850
+ if upload_result['skipped'] > 0:
851
+ msg += f" (skipped {upload_result['skipped']} invalid)"
852
+ with self._lock:
853
+ console.print(msg)
854
+ else:
855
+ # Record failure
856
+ error_msg = f"{metric_name}: {upload_result['error']}"
857
+ with self._lock:
858
+ result.failed.setdefault("metrics", []).append(error_msg)
859
+ if self.verbose:
860
+ console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {upload_result['error']}")
861
+
862
+ except Exception as e:
863
+ # Handle unexpected errors
864
+ error_msg = f"{metric_name}: {str(e)}"
865
+ with self._lock:
866
+ result.failed.setdefault("metrics", []).append(error_msg)
867
+ if self.verbose:
868
+ console.print(f" [red]✗[/red] Failed to upload '{metric_name}': {e}")
869
+
870
+ return total_metrics
871
+
872
+ def _upload_files(self, experiment_id: str, exp_info: ExperimentInfo, result: UploadResult,
873
+ task_id=None, update_progress=None) -> int:
874
+ """Upload files one by one."""
875
+ files_dir = exp_info.path / "files"
876
+ total_uploaded = 0
877
+
878
+ # Use LocalStorage to list files
879
+ try:
880
+ files_list = self.local.list_files(exp_info.project, exp_info.experiment)
881
+
882
+ for file_info in files_list:
883
+ # Skip deleted files
884
+ if file_info.get("deletedAt") is not None:
885
+ continue
886
+
887
+ try:
888
+ if update_progress:
889
+ update_progress(f"Uploading {file_info['filename']}...")
890
+
891
+ # Get file path directly from storage without copying
892
+ file_id = file_info["id"]
893
+ experiment_dir = self.local._get_experiment_dir(exp_info.project, exp_info.experiment)
894
+ files_dir = experiment_dir / "files"
895
+
896
+ # Construct file path
897
+ file_prefix = file_info["path"].lstrip("/") if file_info["path"] else ""
898
+ if file_prefix:
899
+ file_path = files_dir / file_prefix / file_id / file_info["filename"]
900
+ else:
901
+ file_path = files_dir / file_id / file_info["filename"]
902
+
903
+ # Upload to remote with correct parameters
904
+ self.remote.upload_file(
905
+ experiment_id=experiment_id,
906
+ file_path=str(file_path),
907
+ prefix=file_info.get("path", ""),
908
+ filename=file_info["filename"],
909
+ description=file_info.get("description"),
910
+ tags=file_info.get("tags", []),
911
+ metadata=file_info.get("metadata"),
912
+ checksum=file_info["checksum"],
913
+ content_type=file_info["contentType"],
914
+ size_bytes=file_info["sizeBytes"],
915
+ )
916
+
917
+ total_uploaded += 1
918
+ # Track bytes
919
+ result.bytes_uploaded += file_info.get("sizeBytes", 0)
920
+
921
+ if self.verbose:
922
+ size_mb = file_info.get("sizeBytes", 0) / (1024 * 1024)
923
+ console.print(f" [green]✓[/green] {file_info['filename']} ({size_mb:.1f}MB)")
924
+
925
+ except Exception as e:
926
+ result.failed.setdefault("files", []).append(f"{file_info['filename']}: {e}")
927
+
928
+ except Exception as e:
929
+ result.failed.setdefault("files", []).append(str(e))
1060
930
 
1061
- if self.verbose and not result.failed.get("files"):
1062
- console.print(f" [green]✓[/green] Uploaded {total_uploaded} files")
931
+ if self.verbose and not result.failed.get("files"):
932
+ console.print(f" [green]✓[/green] Uploaded {total_uploaded} files")
1063
933
 
1064
- return total_uploaded
934
+ return total_uploaded
1065
935
 
1066
936
 
1067
937
  def cmd_upload(args: argparse.Namespace) -> int:
1068
- """
1069
- Execute upload command.
1070
-
1071
- Args:
1072
- args: Parsed command-line arguments
1073
-
1074
- Returns:
1075
- Exit code (0 for success, 1 for error)
1076
- """
1077
- # Load config
1078
- config = Config()
1079
-
1080
- # Get remote URL (command line > config)
1081
- remote_url = args.dash_url or config.remote_url
1082
- if not remote_url:
1083
- console.print("[red]Error:[/red] --dash-url is required (or set in config)")
1084
- return 1
1085
-
1086
- # Get API key (command line > config > auto-load from storage)
1087
- # RemoteClient will auto-load from storage if api_key is None
1088
- api_key = args.api_key or config.api_key
1089
-
1090
- # Discover experiments
1091
- local_path = Path(args.path)
1092
- if not local_path.exists():
1093
- console.print(f"[red]Error:[/red] Local storage path does not exist: {local_path}")
1094
- return 1
1095
-
1096
- # Handle state file for resume functionality
1097
- state_file = Path(args.state_file)
1098
- upload_state = None
1099
-
1100
- if args.resume:
1101
- upload_state = UploadState.load(state_file)
1102
- if upload_state:
1103
- # Validate state matches current upload
1104
- if upload_state.local_path != str(local_path.absolute()):
1105
- console.print(
1106
- "[yellow]Warning:[/yellow] State file local path doesn't match. Starting fresh upload."
1107
- )
1108
- upload_state = None
1109
- elif upload_state.remote_url != remote_url:
1110
- console.print(
1111
- "[yellow]Warning:[/yellow] State file remote URL doesn't match. Starting fresh upload."
1112
- )
1113
- upload_state = None
1114
- else:
1115
- console.print(
1116
- f"[green]Resuming previous upload from {upload_state.timestamp}[/green]"
1117
- )
1118
- console.print(
1119
- f" Already completed: {len(upload_state.completed_experiments)} experiments"
938
+ """
939
+ Execute upload command.
940
+
941
+ Args:
942
+ args: Parsed command-line arguments
943
+
944
+ Returns:
945
+ Exit code (0 for success, 1 for error)
946
+ """
947
+ # Load config
948
+ config = Config()
949
+
950
+ # Get remote URL (command line > config)
951
+ remote_url = args.remote or config.remote_url
952
+ if not remote_url:
953
+ console.print("[red]Error:[/red] --remote URL is required (or set in config)")
954
+ return 1
955
+
956
+ # Get API key (command line > config > auto-load from storage)
957
+ # RemoteClient will auto-load from storage if api_key is None
958
+ api_key = args.api_key or config.api_key
959
+
960
+ # Validate experiment filter requires project
961
+ if args.experiment and not args.project:
962
+ console.print("[red]Error:[/red] --experiment requires --project")
963
+ return 1
964
+
965
+ # Discover experiments
966
+ local_path = Path(args.path)
967
+ if not local_path.exists():
968
+ console.print(f"[red]Error:[/red] Local storage path does not exist: {local_path}")
969
+ return 1
970
+
971
+ # Handle state file for resume functionality
972
+ state_file = Path(args.state_file)
973
+ upload_state = None
974
+
975
+ if args.resume:
976
+ upload_state = UploadState.load(state_file)
977
+ if upload_state:
978
+ # Validate state matches current upload
979
+ if upload_state.local_path != str(local_path.absolute()):
980
+ console.print("[yellow]Warning:[/yellow] State file local path doesn't match. Starting fresh upload.")
981
+ upload_state = None
982
+ elif upload_state.remote_url != remote_url:
983
+ console.print("[yellow]Warning:[/yellow] State file remote URL doesn't match. Starting fresh upload.")
984
+ upload_state = None
985
+ else:
986
+ console.print(f"[green]Resuming previous upload from {upload_state.timestamp}[/green]")
987
+ console.print(f" Already completed: {len(upload_state.completed_experiments)} experiments")
988
+ console.print(f" Failed: {len(upload_state.failed_experiments)} experiments")
989
+ else:
990
+ console.print("[yellow]No previous upload state found. Starting fresh upload.[/yellow]")
991
+
992
+ # Create new state if not resuming
993
+ if not upload_state:
994
+ upload_state = UploadState(
995
+ local_path=str(local_path.absolute()),
996
+ remote_url=remote_url,
1120
997
  )
1121
- console.print(f" Failed: {len(upload_state.failed_experiments)} experiments")
1122
- else:
1123
- console.print(
1124
- "[yellow]No previous upload state found. Starting fresh upload.[/yellow]"
1125
- )
1126
-
1127
- # Create new state if not resuming
1128
- if not upload_state:
1129
- upload_state = UploadState(
1130
- local_path=str(local_path.absolute()),
1131
- remote_url=remote_url,
998
+
999
+ console.print(f"[bold]Scanning local storage:[/bold] {local_path.absolute()}")
1000
+ experiments = discover_experiments(
1001
+ local_path,
1002
+ project_filter=args.project,
1003
+ experiment_filter=args.experiment,
1132
1004
  )
1133
1005
 
1134
- console.print(f"[bold]Scanning local storage:[/bold] {local_path.absolute()}")
1135
- experiments = discover_experiments(
1136
- local_path,
1137
- project_filter=args.pref, # Using --prefix/-p argument
1138
- experiment_filter=None,
1139
- )
1140
-
1141
- if not experiments:
1142
- if args.pref:
1143
- console.print(f"[yellow]No experiments found matching pattern:[/yellow] {args.pref}")
1144
- else:
1145
- console.print("[yellow]No experiments found in local storage[/yellow]")
1146
- return 1
1147
-
1148
- # Filter out already completed experiments when resuming
1149
- if args.resume and upload_state.completed_experiments:
1150
- original_count = len(experiments)
1151
- experiments = [
1152
- exp
1153
- for exp in experiments
1154
- if f"{exp.project}/{exp.experiment}" not in upload_state.completed_experiments
1155
- ]
1156
- skipped_count = original_count - len(experiments)
1157
- if skipped_count > 0:
1158
- console.print(
1159
- f"[dim]Skipping {skipped_count} already completed experiment(s)[/dim]"
1160
- )
1161
-
1162
- console.print(f"[green]Found {len(experiments)} experiment(s) to upload[/green]")
1163
-
1164
- # Display discovered experiments
1165
- if args.verbose or args.dry_run:
1166
- console.print("\n[bold]Discovered experiments:[/bold]")
1006
+ if not experiments:
1007
+ if args.project and args.experiment:
1008
+ console.print(f"[yellow]No experiment found:[/yellow] {args.project}/{args.experiment}")
1009
+ elif args.project:
1010
+ console.print(f"[yellow]No experiments found in project:[/yellow] {args.project}")
1011
+ else:
1012
+ console.print("[yellow]No experiments found in local storage[/yellow]")
1013
+ return 1
1014
+
1015
+ # Filter out already completed experiments when resuming
1016
+ if args.resume and upload_state.completed_experiments:
1017
+ original_count = len(experiments)
1018
+ experiments = [
1019
+ exp for exp in experiments
1020
+ if f"{exp.project}/{exp.experiment}" not in upload_state.completed_experiments
1021
+ ]
1022
+ skipped_count = original_count - len(experiments)
1023
+ if skipped_count > 0:
1024
+ console.print(f"[dim]Skipping {skipped_count} already completed experiment(s)[/dim]")
1025
+
1026
+ console.print(f"[green]Found {len(experiments)} experiment(s) to upload[/green]")
1027
+
1028
+ # Display discovered experiments
1029
+ if args.verbose or args.dry_run:
1030
+ console.print("\n[bold]Discovered experiments:[/bold]")
1031
+ for exp in experiments:
1032
+ parts = []
1033
+ if exp.has_logs:
1034
+ parts.append("logs")
1035
+ if exp.has_params:
1036
+ parts.append("params")
1037
+ if exp.metric_names:
1038
+ parts.append(f"{len(exp.metric_names)} metrics")
1039
+ if exp.file_count:
1040
+ size_mb = exp.estimated_size / (1024 * 1024)
1041
+ parts.append(f"{exp.file_count} files ({size_mb:.1f}MB)")
1042
+
1043
+ details = ", ".join(parts) if parts else "metadata only"
1044
+ console.print(f" [cyan]•[/cyan] {exp.project}/{exp.experiment} [dim]({details})[/dim]")
1045
+
1046
+ # Dry-run mode: stop here
1047
+ if args.dry_run:
1048
+ console.print("\n[yellow bold]DRY RUN[/yellow bold] - No data will be uploaded")
1049
+ console.print("Run without --dry-run to proceed with upload.")
1050
+ return 0
1051
+
1052
+ # Validate experiments
1053
+ console.print("\n[bold]Validating experiments...[/bold]")
1054
+ validator = ExperimentValidator(strict=args.strict)
1055
+ validation_results = {}
1056
+ valid_experiments = []
1057
+ invalid_experiments = []
1058
+
1167
1059
  for exp in experiments:
1168
- parts = []
1169
- if exp.has_logs:
1170
- parts.append("logs")
1171
- if exp.has_params:
1172
- parts.append("params")
1173
- if exp.metric_names:
1174
- parts.append(f"{len(exp.metric_names)} metrics")
1175
- if exp.file_count:
1176
- size_mb = exp.estimated_size / (1024 * 1024)
1177
- parts.append(f"{exp.file_count} files ({size_mb:.1f}MB)")
1178
-
1179
- details = ", ".join(parts) if parts else "metadata only"
1180
- console.print(
1181
- f" [cyan]•[/cyan] {exp.project}/{exp.experiment} [dim]({details})[/dim]"
1182
- )
1183
-
1184
- # Dry-run mode: stop here
1185
- if args.dry_run:
1186
- console.print("\n[yellow bold]DRY RUN[/yellow bold] - No data will be uploaded")
1187
- console.print("Run without --dry-run to proceed with upload.")
1188
- return 0
1189
-
1190
- # Validate experiments
1191
- console.print("\n[bold]Validating experiments...[/bold]")
1192
- validator = ExperimentValidator(strict=args.strict)
1193
- validation_results = {}
1194
- valid_experiments = []
1195
- invalid_experiments = []
1196
-
1197
- for exp in experiments:
1198
- validation = validator.validate_experiment(exp)
1199
- validation_results[f"{exp.project}/{exp.experiment}"] = validation
1200
-
1201
- if validation.is_valid:
1202
- valid_experiments.append(exp)
1203
- else:
1204
- invalid_experiments.append(exp)
1205
-
1206
- # Show warnings and errors
1207
- if args.verbose or validation.errors:
1208
- exp_key = f"{exp.project}/{exp.experiment}"
1209
- if validation.errors:
1210
- console.print(f" [red]✗[/red] {exp_key}:")
1211
- for error in validation.errors:
1212
- console.print(f" [red]{error}[/red]")
1213
- elif validation.warnings:
1214
- console.print(f" [yellow]⚠[/yellow] {exp_key}:")
1215
- for warning in validation.warnings:
1216
- console.print(f" [yellow]{warning}[/yellow]")
1217
-
1218
- if invalid_experiments:
1219
- console.print(
1220
- f"\n[yellow]{len(invalid_experiments)} experiment(s) failed validation and will be skipped[/yellow]"
1221
- )
1222
- if args.strict:
1223
- console.print("[red]Error: Validation failed in --strict mode[/red]")
1224
- return 1
1225
-
1226
- if not valid_experiments:
1227
- console.print("[red]Error: No valid experiments to upload[/red]")
1228
- return 1
1229
-
1230
- console.print(
1231
- f"[green]{len(valid_experiments)} experiment(s) ready to upload[/green]"
1232
- )
1233
-
1234
- # Initialize remote client and local storage
1235
- remote_client = RemoteClient(base_url=remote_url, api_key=api_key)
1236
- local_storage = LocalStorage(root_path=local_path)
1237
-
1238
- # Upload experiments with progress tracking
1239
- console.print(f"\n[bold]Uploading to:[/bold] {remote_url}")
1240
- if args.target:
1241
- console.print(f"[bold]Target prefix:[/bold] {args.target}")
1242
- results = []
1243
-
1244
- # Track upload timing
1245
- import time
1246
-
1247
- start_time = time.time()
1248
-
1249
- # Create progress bar for overall upload
1250
- with Progress(
1251
- SpinnerColumn(),
1252
- TextColumn("[progress.description]{task.description}"),
1253
- BarColumn(),
1254
- TaskProgressColumn(),
1255
- console=console,
1256
- transient=not args.verbose, # Keep progress visible in verbose mode
1257
- ) as progress:
1258
- # Create uploader with progress tracking
1259
- uploader = ExperimentUploader(
1260
- local_storage=local_storage,
1261
- remote_client=remote_client,
1262
- batch_size=args.batch_size,
1263
- skip_logs=args.skip_logs,
1264
- skip_metrics=args.skip_metrics,
1265
- skip_files=args.skip_files,
1266
- skip_params=args.skip_params,
1267
- verbose=args.verbose,
1268
- progress=progress,
1269
- target_prefix=args.target,
1270
- )
1060
+ validation = validator.validate_experiment(exp)
1061
+ validation_results[f"{exp.project}/{exp.experiment}"] = validation
1271
1062
 
1272
- for i, exp in enumerate(valid_experiments, start=1):
1273
- exp_key = f"{exp.project}/{exp.experiment}"
1274
-
1275
- # Create task for this experiment
1276
- task_id = progress.add_task(
1277
- f"[{i}/{len(valid_experiments)}] {exp_key}",
1278
- total=100, # Will be updated with actual steps
1279
- )
1280
-
1281
- # Update state - mark as in progress
1282
- upload_state.in_progress_experiment = exp_key
1283
- if not args.dry_run:
1284
- upload_state.save(state_file)
1285
-
1286
- validation = validation_results[exp_key]
1287
- result = uploader.upload_experiment(exp, validation, task_id=task_id)
1288
- results.append(result)
1289
-
1290
- # Update state - mark as completed or failed
1291
- upload_state.in_progress_experiment = None
1292
- if result.success:
1293
- upload_state.completed_experiments.append(exp_key)
1294
- else:
1295
- upload_state.failed_experiments.append(exp_key)
1296
-
1297
- if not args.dry_run:
1298
- upload_state.save(state_file)
1299
-
1300
- # Update task to completed
1301
- progress.update(task_id, completed=100, total=100)
1302
-
1303
- if not args.verbose:
1304
- # Show brief status
1305
- if result.success:
1306
- parts = []
1307
- if result.uploaded.get("params"):
1308
- parts.append(f"{result.uploaded['params']} params")
1309
- if result.uploaded.get("logs"):
1310
- parts.append(f"{result.uploaded['logs']} logs")
1311
- if result.uploaded.get("metrics"):
1312
- parts.append(f"{result.uploaded['metrics']} metrics")
1313
- if result.uploaded.get("files"):
1314
- parts.append(f"{result.uploaded['files']} files")
1315
- status = ", ".join(parts) if parts else "metadata only"
1316
- console.print(f" [green]✓[/green] Uploaded ({status})")
1063
+ if validation.is_valid:
1064
+ valid_experiments.append(exp)
1317
1065
  else:
1318
- console.print(" [red]✗[/red] Failed")
1319
- if result.errors:
1320
- for error in result.errors[:3]: # Show first 3 errors
1321
- console.print(f" [red]{error}[/red]")
1322
-
1323
- # Calculate timing
1324
- end_time = time.time()
1325
- elapsed_time = end_time - start_time
1326
- total_bytes = sum(r.bytes_uploaded for r in results)
1327
-
1328
- # Print summary with rich Table
1329
- console.print()
1330
-
1331
- successful = [r for r in results if r.success]
1332
- failed = [r for r in results if not r.success]
1333
-
1334
- # Create summary table
1335
- summary_table = Table(title="Upload Summary", show_header=True, header_style="bold")
1336
- summary_table.add_column("Status", style="cyan")
1337
- summary_table.add_column("Count", justify="right")
1338
-
1339
- summary_table.add_row(
1340
- "Successful", f"[green]{len(successful)}/{len(results)}[/green]"
1341
- )
1342
- if failed:
1343
- summary_table.add_row("Failed", f"[red]{len(failed)}/{len(results)}[/red]")
1344
-
1345
- # Add timing information
1346
- summary_table.add_row("Total Time", f"{elapsed_time:.2f}s")
1347
-
1348
- # Calculate and display upload speed
1349
- if total_bytes > 0 and elapsed_time > 0:
1350
- # Convert to appropriate unit
1351
- if total_bytes < 1024 * 1024: # Less than 1 MB
1352
- speed_kb = (total_bytes / 1024) / elapsed_time
1353
- summary_table.add_row("Avg Speed", f"{speed_kb:.2f} KB/s")
1354
- else: # 1 MB or more
1355
- speed_mb = (total_bytes / (1024 * 1024)) / elapsed_time
1356
- summary_table.add_row("Avg Speed", f"{speed_mb:.2f} MB/s")
1357
-
1358
- console.print(summary_table)
1359
-
1360
- # Show failed experiments
1361
- if failed:
1362
- console.print("\n[bold red]Failed Experiments:[/bold red]")
1363
- for result in failed:
1364
- console.print(f" [red]✗[/red] {result.experiment}")
1365
- for error in result.errors:
1366
- console.print(f" [dim]{error}[/dim]")
1367
-
1368
- # Data statistics
1369
- total_logs = sum(r.uploaded.get("logs", 0) for r in results)
1370
- total_metrics = sum(r.uploaded.get("metrics", 0) for r in results)
1371
- total_files = sum(r.uploaded.get("files", 0) for r in results)
1372
-
1373
- if total_logs or total_metrics or total_files:
1374
- data_table = Table(title="Data Uploaded", show_header=True, header_style="bold")
1375
- data_table.add_column("Type", style="cyan")
1376
- data_table.add_column("Count", justify="right", style="green")
1377
-
1378
- if total_logs:
1379
- data_table.add_row("Logs", f"{total_logs} entries")
1380
- if total_metrics:
1381
- data_table.add_row("Metrics", f"{total_metrics} metrics")
1382
- if total_files:
1383
- data_table.add_row("Files", f"{total_files} files")
1066
+ invalid_experiments.append(exp)
1067
+
1068
+ # Show warnings and errors
1069
+ if args.verbose or validation.errors:
1070
+ exp_key = f"{exp.project}/{exp.experiment}"
1071
+ if validation.errors:
1072
+ console.print(f" [red]✗[/red] {exp_key}:")
1073
+ for error in validation.errors:
1074
+ console.print(f" [red]{error}[/red]")
1075
+ elif validation.warnings:
1076
+ console.print(f" [yellow]⚠[/yellow] {exp_key}:")
1077
+ for warning in validation.warnings:
1078
+ console.print(f" [yellow]{warning}[/yellow]")
1079
+
1080
+ if invalid_experiments:
1081
+ console.print(f"\n[yellow]{len(invalid_experiments)} experiment(s) failed validation and will be skipped[/yellow]")
1082
+ if args.strict:
1083
+ console.print("[red]Error: Validation failed in --strict mode[/red]")
1084
+ return 1
1085
+
1086
+ if not valid_experiments:
1087
+ console.print("[red]Error: No valid experiments to upload[/red]")
1088
+ return 1
1089
+
1090
+ console.print(f"[green]{len(valid_experiments)} experiment(s) ready to upload[/green]")
1091
+
1092
+ # Initialize remote client and local storage
1093
+ remote_client = RemoteClient(base_url=remote_url, api_key=api_key)
1094
+ local_storage = LocalStorage(root_path=local_path)
1095
+
1096
+ # Upload experiments with progress tracking
1097
+ console.print(f"\n[bold]Uploading to:[/bold] {remote_url}")
1098
+ results = []
1099
+
1100
+ # Track upload timing
1101
+ import time
1102
+ start_time = time.time()
1103
+
1104
+ # Create progress bar for overall upload
1105
+ with Progress(
1106
+ SpinnerColumn(),
1107
+ TextColumn("[progress.description]{task.description}"),
1108
+ BarColumn(),
1109
+ TaskProgressColumn(),
1110
+ console=console,
1111
+ transient=not args.verbose, # Keep progress visible in verbose mode
1112
+ ) as progress:
1113
+ # Create uploader with progress tracking
1114
+ uploader = ExperimentUploader(
1115
+ local_storage=local_storage,
1116
+ remote_client=remote_client,
1117
+ batch_size=args.batch_size,
1118
+ skip_logs=args.skip_logs,
1119
+ skip_metrics=args.skip_metrics,
1120
+ skip_files=args.skip_files,
1121
+ skip_params=args.skip_params,
1122
+ verbose=args.verbose,
1123
+ progress=progress,
1124
+ )
1384
1125
 
1126
+ for i, exp in enumerate(valid_experiments, start=1):
1127
+ exp_key = f"{exp.project}/{exp.experiment}"
1128
+
1129
+ # Create task for this experiment
1130
+ task_id = progress.add_task(
1131
+ f"[{i}/{len(valid_experiments)}] {exp_key}",
1132
+ total=100, # Will be updated with actual steps
1133
+ )
1134
+
1135
+ # Update state - mark as in progress
1136
+ upload_state.in_progress_experiment = exp_key
1137
+ if not args.dry_run:
1138
+ upload_state.save(state_file)
1139
+
1140
+ validation = validation_results[exp_key]
1141
+ result = uploader.upload_experiment(exp, validation, task_id=task_id)
1142
+ results.append(result)
1143
+
1144
+ # Update state - mark as completed or failed
1145
+ upload_state.in_progress_experiment = None
1146
+ if result.success:
1147
+ upload_state.completed_experiments.append(exp_key)
1148
+ else:
1149
+ upload_state.failed_experiments.append(exp_key)
1150
+
1151
+ if not args.dry_run:
1152
+ upload_state.save(state_file)
1153
+
1154
+ # Update task to completed
1155
+ progress.update(task_id, completed=100, total=100)
1156
+
1157
+ if not args.verbose:
1158
+ # Show brief status
1159
+ if result.success:
1160
+ parts = []
1161
+ if result.uploaded.get("params"):
1162
+ parts.append(f"{result.uploaded['params']} params")
1163
+ if result.uploaded.get("logs"):
1164
+ parts.append(f"{result.uploaded['logs']} logs")
1165
+ if result.uploaded.get("metrics"):
1166
+ parts.append(f"{result.uploaded['metrics']} metrics")
1167
+ if result.uploaded.get("files"):
1168
+ parts.append(f"{result.uploaded['files']} files")
1169
+ status = ", ".join(parts) if parts else "metadata only"
1170
+ console.print(f" [green]✓[/green] Uploaded ({status})")
1171
+ else:
1172
+ console.print(f" [red]✗[/red] Failed")
1173
+ if result.errors:
1174
+ for error in result.errors[:3]: # Show first 3 errors
1175
+ console.print(f" [red]{error}[/red]")
1176
+
1177
+ # Calculate timing
1178
+ end_time = time.time()
1179
+ elapsed_time = end_time - start_time
1180
+ total_bytes = sum(r.bytes_uploaded for r in results)
1181
+
1182
+ # Print summary with rich Table
1385
1183
  console.print()
1386
- console.print(data_table)
1387
-
1388
- # Clean up state file if all uploads succeeded
1389
- if not args.dry_run and len(failed) == 0 and state_file.exists():
1390
- state_file.unlink()
1391
- console.print("\n[dim]Upload complete. State file removed.[/dim]")
1392
- elif not args.dry_run and failed:
1393
- console.print(
1394
- f"\n[yellow]State saved to {state_file}. Use --resume to retry failed uploads.[/yellow]"
1395
- )
1396
1184
 
1397
- # Return exit code
1398
- return 0 if len(failed) == 0 else 1
1185
+ successful = [r for r in results if r.success]
1186
+ failed = [r for r in results if not r.success]
1187
+
1188
+ # Create summary table
1189
+ summary_table = Table(title="Upload Summary", show_header=True, header_style="bold")
1190
+ summary_table.add_column("Status", style="cyan")
1191
+ summary_table.add_column("Count", justify="right")
1192
+
1193
+ summary_table.add_row("Successful", f"[green]{len(successful)}/{len(results)}[/green]")
1194
+ if failed:
1195
+ summary_table.add_row("Failed", f"[red]{len(failed)}/{len(results)}[/red]")
1196
+
1197
+ # Add timing information
1198
+ summary_table.add_row("Total Time", f"{elapsed_time:.2f}s")
1199
+
1200
+ # Calculate and display upload speed
1201
+ if total_bytes > 0 and elapsed_time > 0:
1202
+ # Convert to appropriate unit
1203
+ if total_bytes < 1024 * 1024: # Less than 1 MB
1204
+ speed_kb = (total_bytes / 1024) / elapsed_time
1205
+ summary_table.add_row("Avg Speed", f"{speed_kb:.2f} KB/s")
1206
+ else: # 1 MB or more
1207
+ speed_mb = (total_bytes / (1024 * 1024)) / elapsed_time
1208
+ summary_table.add_row("Avg Speed", f"{speed_mb:.2f} MB/s")
1209
+
1210
+ console.print(summary_table)
1211
+
1212
+ # Show failed experiments
1213
+ if failed:
1214
+ console.print("\n[bold red]Failed Experiments:[/bold red]")
1215
+ for result in failed:
1216
+ console.print(f" [red]✗[/red] {result.experiment}")
1217
+ for error in result.errors:
1218
+ console.print(f" [dim]{error}[/dim]")
1219
+
1220
+ # Data statistics
1221
+ total_logs = sum(r.uploaded.get("logs", 0) for r in results)
1222
+ total_metrics = sum(r.uploaded.get("metrics", 0) for r in results)
1223
+ total_files = sum(r.uploaded.get("files", 0) for r in results)
1224
+
1225
+ if total_logs or total_metrics or total_files:
1226
+ data_table = Table(title="Data Uploaded", show_header=True, header_style="bold")
1227
+ data_table.add_column("Type", style="cyan")
1228
+ data_table.add_column("Count", justify="right", style="green")
1229
+
1230
+ if total_logs:
1231
+ data_table.add_row("Logs", f"{total_logs} entries")
1232
+ if total_metrics:
1233
+ data_table.add_row("Metrics", f"{total_metrics} metrics")
1234
+ if total_files:
1235
+ data_table.add_row("Files", f"{total_files} files")
1236
+
1237
+ console.print()
1238
+ console.print(data_table)
1239
+
1240
+ # Clean up state file if all uploads succeeded
1241
+ if not args.dry_run and len(failed) == 0 and state_file.exists():
1242
+ state_file.unlink()
1243
+ console.print("\n[dim]Upload complete. State file removed.[/dim]")
1244
+ elif not args.dry_run and failed:
1245
+ console.print(f"\n[yellow]State saved to {state_file}. Use --resume to retry failed uploads.[/yellow]")
1246
+
1247
+ # Return exit code
1248
+ return 0 if len(failed) == 0 else 1